[tune/rllib] revert removal of queue-trials (#10744)

This commit is contained in:
Richard Liaw
2020-09-11 14:13:20 -07:00
committed by Barak Michener
parent 70305267d2
commit fe23f23680
5 changed files with 38 additions and 42 deletions
+1 -9
View File
@@ -12,7 +12,6 @@ import ray
from ray.exceptions import GetTimeoutError
from ray import ray_constants
from ray.resource_spec import ResourceSpec
from ray.tune.cluster_info import is_ray_cluster
from ray.tune.durable_trainable import DurableTrainable
from ray.tune.error import AbortTrialExecution, TuneError
from ray.tune.logger import NoopLogger
@@ -136,17 +135,10 @@ class RayTrialExecutor(TrialExecutor):
"""An implementation of TrialExecutor based on Ray."""
def __init__(self,
queue_trials=None,
queue_trials=False,
reuse_actors=False,
ray_auto_init=None,
refresh_period=RESOURCE_REFRESH_PERIOD):
if queue_trials is None:
if os.environ.get("TUNE_DISABLE_QUEUE_TRIALS") == "1":
logger.info("'TUNE_DISABLE_QUEUE_TRIALS=1' detected.")
queue_trials = False
elif is_ray_cluster():
queue_trials = True
if ray_auto_init is None:
if os.environ.get("TUNE_DISABLE_AUTO_INIT") == "1":
logger.info("'TUNE_DISABLE_AUTO_INIT=1' detected.")
+11 -13
View File
@@ -245,19 +245,17 @@ class TrainableFunctionApiTest(unittest.TestCase):
register_trainable("B", B)
def f(cpus, gpus, queue_trials):
if not queue_trials:
os.environ["TUNE_DISABLE_QUEUE_TRIALS"] = "1"
else:
os.environ.pop("TUNE_DISABLE_QUEUE_TRIALS", None)
return run_experiments({
"foo": {
"run": "B",
"config": {
"cpu": cpus,
"gpu": gpus,
},
}
})[0]
return run_experiments(
{
"foo": {
"run": "B",
"config": {
"cpu": cpus,
"gpu": gpus,
},
}
},
queue_trials=queue_trials)[0]
# Should all succeed
self.assertEqual(f(0, 0, False).status, Trial.TERMINATED)
+16 -8
View File
@@ -3,6 +3,7 @@ import logging
from ray.tune.trial import Trial, Checkpoint
from ray.tune.error import TuneError
from ray.tune.cluster_info import is_ray_cluster
logger = logging.getLogger(__name__)
@@ -162,15 +163,22 @@ class TrialExecutor:
for trial in trial_runner.get_trials():
if trial.status == Trial.PENDING:
if not self.has_resources(trial.resources):
resource_string = trial.resources.summary_string()
trial_resource_help_msg = trial.get_trainable_cls(
).resource_help(trial.config)
autoscaling_msg = ""
if is_ray_cluster():
autoscaling_msg = (
"Pass `queue_trials=True` in ray.tune.run() or "
"on the command line to queue trials until the "
"cluster scales up or resources become available. "
)
raise TuneError(
("Insufficient cluster resources to launch trial: "
"trial requested {} but the cluster has only {}. "
"This error should not occur if running on an "
"autoscaling cluster. {}").format(
trial.resources.summary_string(),
self.resource_string(),
trial.get_trainable_cls().resource_help(
trial.config)))
"Insufficient cluster resources to launch trial: "
f"trial requested {resource_string}, but the cluster "
f"has only {self.resource_string()}. "
f"{autoscaling_msg}"
f"{trial_resource_help_msg} ")
elif trial.status == Trial.PAUSED:
raise TuneError("There are paused trials, but no more pending "
"trials with sufficient resources.")
+9 -7
View File
@@ -95,13 +95,13 @@ def run(
restore=None,
server_port=None,
resume=False,
queue_trials=False,
reuse_actors=False,
trial_executor=None,
raise_on_failed_trial=True,
# Deprecated args
ray_auto_init=None,
run_errored_only=None,
queue_trials=None,
global_checkpoint_period=None,
with_server=None,
upload_dir=None,
@@ -246,6 +246,10 @@ def run(
ERRORED trials upon resume - previous trial artifacts will
be left untouched. If resume is set but checkpoint does not exist,
ValueError will be thrown.
queue_trials (bool): Whether to queue trials when the cluster does
not currently have enough resources to launch one. This should
be set to True when running on an autoscaling cluster to enable
automatic scale-up.
reuse_actors (bool): Whether to reuse actors between different trials
when possible. This can drastically speed up experiments that start
and stop actors often (e.g., PBT in time-multiplexing mode). This
@@ -264,11 +268,6 @@ def run(
if global_checkpoint_period:
raise ValueError("global_checkpoint_period is deprecated. Set env var "
"'TUNE_GLOBAL_CHECKPOINT_S' instead.")
if queue_trials:
raise ValueError(
"queue_trials is deprecated. "
"Set env var 'TUNE_DISABLE_QUEUE_TRIALS=1' instead to "
"disable queuing behavior.")
if ray_auto_init:
raise ValueError("ray_auto_init is deprecated. "
"Set env var 'TUNE_DISABLE_AUTO_INIT=1' instead or "
@@ -294,7 +293,7 @@ def run(
set_sync_periods(sync_config)
trial_executor = trial_executor or RayTrialExecutor(
reuse_actors=reuse_actors)
reuse_actors=reuse_actors, queue_trials=queue_trials)
if isinstance(run_or_experiment, list):
experiments = run_or_experiment
else:
@@ -443,6 +442,7 @@ def run_experiments(experiments,
verbose=2,
progress_reporter=None,
resume=False,
queue_trials=False,
reuse_actors=False,
trial_executor=None,
raise_on_failed_trial=True,
@@ -472,6 +472,7 @@ def run_experiments(experiments,
verbose=verbose,
progress_reporter=progress_reporter,
resume=resume,
queue_trials=queue_trials,
reuse_actors=reuse_actors,
trial_executor=trial_executor,
raise_on_failed_trial=raise_on_failed_trial,
@@ -485,6 +486,7 @@ def run_experiments(experiments,
verbose=verbose,
progress_reporter=progress_reporter,
resume=resume,
queue_trials=queue_trials,
reuse_actors=reuse_actors,
trial_executor=trial_executor,
raise_on_failed_trial=raise_on_failed_trial,
+1 -5
View File
@@ -205,15 +205,11 @@ def run(args, parser):
num_gpus=args.ray_num_gpus,
local_mode=args.local_mode)
if not args.queue_trials:
# TODO: this should be eventually removed as an arg
# because it is already autodetected on an autoscaling cluster.
os.environ["TUNE_DISABLE_QUEUE_TRIALS"] = "1"
run_experiments(
experiments,
scheduler=_make_scheduler(args),
resume=args.resume,
queue_trials=args.queue_trials,
verbose=verbose,
concurrent=True)