mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 21:23:10 +08:00
[tune/rllib] revert removal of queue-trials (#10744)
This commit is contained in:
committed by
Barak Michener
parent
70305267d2
commit
fe23f23680
@@ -12,7 +12,6 @@ import ray
|
||||
from ray.exceptions import GetTimeoutError
|
||||
from ray import ray_constants
|
||||
from ray.resource_spec import ResourceSpec
|
||||
from ray.tune.cluster_info import is_ray_cluster
|
||||
from ray.tune.durable_trainable import DurableTrainable
|
||||
from ray.tune.error import AbortTrialExecution, TuneError
|
||||
from ray.tune.logger import NoopLogger
|
||||
@@ -136,17 +135,10 @@ class RayTrialExecutor(TrialExecutor):
|
||||
"""An implementation of TrialExecutor based on Ray."""
|
||||
|
||||
def __init__(self,
|
||||
queue_trials=None,
|
||||
queue_trials=False,
|
||||
reuse_actors=False,
|
||||
ray_auto_init=None,
|
||||
refresh_period=RESOURCE_REFRESH_PERIOD):
|
||||
if queue_trials is None:
|
||||
if os.environ.get("TUNE_DISABLE_QUEUE_TRIALS") == "1":
|
||||
logger.info("'TUNE_DISABLE_QUEUE_TRIALS=1' detected.")
|
||||
queue_trials = False
|
||||
elif is_ray_cluster():
|
||||
queue_trials = True
|
||||
|
||||
if ray_auto_init is None:
|
||||
if os.environ.get("TUNE_DISABLE_AUTO_INIT") == "1":
|
||||
logger.info("'TUNE_DISABLE_AUTO_INIT=1' detected.")
|
||||
|
||||
@@ -245,19 +245,17 @@ class TrainableFunctionApiTest(unittest.TestCase):
|
||||
register_trainable("B", B)
|
||||
|
||||
def f(cpus, gpus, queue_trials):
|
||||
if not queue_trials:
|
||||
os.environ["TUNE_DISABLE_QUEUE_TRIALS"] = "1"
|
||||
else:
|
||||
os.environ.pop("TUNE_DISABLE_QUEUE_TRIALS", None)
|
||||
return run_experiments({
|
||||
"foo": {
|
||||
"run": "B",
|
||||
"config": {
|
||||
"cpu": cpus,
|
||||
"gpu": gpus,
|
||||
},
|
||||
}
|
||||
})[0]
|
||||
return run_experiments(
|
||||
{
|
||||
"foo": {
|
||||
"run": "B",
|
||||
"config": {
|
||||
"cpu": cpus,
|
||||
"gpu": gpus,
|
||||
},
|
||||
}
|
||||
},
|
||||
queue_trials=queue_trials)[0]
|
||||
|
||||
# Should all succeed
|
||||
self.assertEqual(f(0, 0, False).status, Trial.TERMINATED)
|
||||
|
||||
@@ -3,6 +3,7 @@ import logging
|
||||
|
||||
from ray.tune.trial import Trial, Checkpoint
|
||||
from ray.tune.error import TuneError
|
||||
from ray.tune.cluster_info import is_ray_cluster
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -162,15 +163,22 @@ class TrialExecutor:
|
||||
for trial in trial_runner.get_trials():
|
||||
if trial.status == Trial.PENDING:
|
||||
if not self.has_resources(trial.resources):
|
||||
resource_string = trial.resources.summary_string()
|
||||
trial_resource_help_msg = trial.get_trainable_cls(
|
||||
).resource_help(trial.config)
|
||||
autoscaling_msg = ""
|
||||
if is_ray_cluster():
|
||||
autoscaling_msg = (
|
||||
"Pass `queue_trials=True` in ray.tune.run() or "
|
||||
"on the command line to queue trials until the "
|
||||
"cluster scales up or resources become available. "
|
||||
)
|
||||
raise TuneError(
|
||||
("Insufficient cluster resources to launch trial: "
|
||||
"trial requested {} but the cluster has only {}. "
|
||||
"This error should not occur if running on an "
|
||||
"autoscaling cluster. {}").format(
|
||||
trial.resources.summary_string(),
|
||||
self.resource_string(),
|
||||
trial.get_trainable_cls().resource_help(
|
||||
trial.config)))
|
||||
"Insufficient cluster resources to launch trial: "
|
||||
f"trial requested {resource_string}, but the cluster "
|
||||
f"has only {self.resource_string()}. "
|
||||
f"{autoscaling_msg}"
|
||||
f"{trial_resource_help_msg} ")
|
||||
elif trial.status == Trial.PAUSED:
|
||||
raise TuneError("There are paused trials, but no more pending "
|
||||
"trials with sufficient resources.")
|
||||
|
||||
@@ -95,13 +95,13 @@ def run(
|
||||
restore=None,
|
||||
server_port=None,
|
||||
resume=False,
|
||||
queue_trials=False,
|
||||
reuse_actors=False,
|
||||
trial_executor=None,
|
||||
raise_on_failed_trial=True,
|
||||
# Deprecated args
|
||||
ray_auto_init=None,
|
||||
run_errored_only=None,
|
||||
queue_trials=None,
|
||||
global_checkpoint_period=None,
|
||||
with_server=None,
|
||||
upload_dir=None,
|
||||
@@ -246,6 +246,10 @@ def run(
|
||||
ERRORED trials upon resume - previous trial artifacts will
|
||||
be left untouched. If resume is set but checkpoint does not exist,
|
||||
ValueError will be thrown.
|
||||
queue_trials (bool): Whether to queue trials when the cluster does
|
||||
not currently have enough resources to launch one. This should
|
||||
be set to True when running on an autoscaling cluster to enable
|
||||
automatic scale-up.
|
||||
reuse_actors (bool): Whether to reuse actors between different trials
|
||||
when possible. This can drastically speed up experiments that start
|
||||
and stop actors often (e.g., PBT in time-multiplexing mode). This
|
||||
@@ -264,11 +268,6 @@ def run(
|
||||
if global_checkpoint_period:
|
||||
raise ValueError("global_checkpoint_period is deprecated. Set env var "
|
||||
"'TUNE_GLOBAL_CHECKPOINT_S' instead.")
|
||||
if queue_trials:
|
||||
raise ValueError(
|
||||
"queue_trials is deprecated. "
|
||||
"Set env var 'TUNE_DISABLE_QUEUE_TRIALS=1' instead to "
|
||||
"disable queuing behavior.")
|
||||
if ray_auto_init:
|
||||
raise ValueError("ray_auto_init is deprecated. "
|
||||
"Set env var 'TUNE_DISABLE_AUTO_INIT=1' instead or "
|
||||
@@ -294,7 +293,7 @@ def run(
|
||||
set_sync_periods(sync_config)
|
||||
|
||||
trial_executor = trial_executor or RayTrialExecutor(
|
||||
reuse_actors=reuse_actors)
|
||||
reuse_actors=reuse_actors, queue_trials=queue_trials)
|
||||
if isinstance(run_or_experiment, list):
|
||||
experiments = run_or_experiment
|
||||
else:
|
||||
@@ -443,6 +442,7 @@ def run_experiments(experiments,
|
||||
verbose=2,
|
||||
progress_reporter=None,
|
||||
resume=False,
|
||||
queue_trials=False,
|
||||
reuse_actors=False,
|
||||
trial_executor=None,
|
||||
raise_on_failed_trial=True,
|
||||
@@ -472,6 +472,7 @@ def run_experiments(experiments,
|
||||
verbose=verbose,
|
||||
progress_reporter=progress_reporter,
|
||||
resume=resume,
|
||||
queue_trials=queue_trials,
|
||||
reuse_actors=reuse_actors,
|
||||
trial_executor=trial_executor,
|
||||
raise_on_failed_trial=raise_on_failed_trial,
|
||||
@@ -485,6 +486,7 @@ def run_experiments(experiments,
|
||||
verbose=verbose,
|
||||
progress_reporter=progress_reporter,
|
||||
resume=resume,
|
||||
queue_trials=queue_trials,
|
||||
reuse_actors=reuse_actors,
|
||||
trial_executor=trial_executor,
|
||||
raise_on_failed_trial=raise_on_failed_trial,
|
||||
|
||||
Reference in New Issue
Block a user