From 7db1f3be2a23332f2885f6c70badce6be33083bd Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 4 Jan 2019 17:23:19 -0800 Subject: [PATCH] [tune] resume=False by default but print a tip to set resume="prompt" + jenkins fix (#3681) --- doc/source/tune-usage.rst | 2 +- .../agents/a3c/a3c_torch_policy_graph.py | 2 +- python/ray/tune/test/cluster_tests.py | 2 -- python/ray/tune/test/trial_runner_test.py | 6 ----- python/ray/tune/tune.py | 23 +++++++++---------- 5 files changed, 13 insertions(+), 22 deletions(-) diff --git a/doc/source/tune-usage.rst b/doc/source/tune-usage.rst index 0cd6572d4..97112c130 100644 --- a/doc/source/tune-usage.rst +++ b/doc/source/tune-usage.rst @@ -299,7 +299,7 @@ of a trial, you can additionally set the checkpoint_at_end to True. An example i Recovering From Failures (Experimental) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Tune automatically persists the progress of your experiments, so if an experiment crashes or is otherwise cancelled, it can be resumed after prompting. The default setting of `resume=None` will cause Tune to prompt you for whether you want to resume. Prompting can be turned off with ``resume=True``. If ``resume=False``, a new experiment will be created instead. You can always force a new experiment to be created by changing the experiment name. +Tune automatically persists the progress of your experiments, so if an experiment crashes or is otherwise cancelled, it can be resumed with ``resume=True``. The default setting of ``resume=False`` creates a new experiment, and ``resume="prompt"`` will cause Tune to prompt you for whether you want to resume. You can always force a new experiment to be created by changing the experiment name. Note that trials will be restored to their last checkpoint. If trial checkpointing is not enabled, unfinished trials will be restarted from scratch. diff --git a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py index c415aa95a..ed72f653b 100644 --- a/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_torch_policy_graph.py @@ -82,4 +82,4 @@ class A3CTorchPolicyGraph(TorchPolicyGraph): with self.lock: obs = torch.from_numpy(obs).float().unsqueeze(0) _, _, vf, _ = self.model({"obs": obs}, []) - return vf.numpy().squeeze() + return vf.detach().numpy().squeeze() diff --git a/python/ray/tune/test/cluster_tests.py b/python/ray/tune/test/cluster_tests.py index 75b1a4c54..abb1fe394 100644 --- a/python/ray/tune/test/cluster_tests.py +++ b/python/ray/tune/test/cluster_tests.py @@ -62,7 +62,6 @@ def _start_new_cluster(): @pytest.fixture def start_connected_cluster(): # Start the Ray processes. - os.environ["TUNE_RESUME_PROMPT_OFF"] = "True" cluster = _start_new_cluster() yield cluster # The code after the yield will run as teardown code. @@ -74,7 +73,6 @@ def start_connected_cluster(): def start_connected_emptyhead_cluster(): """Starts head with no resources.""" - os.environ["TUNE_RESUME_PROMPT_OFF"] = "True" cluster = Cluster( initialize_head=True, connect=True, diff --git a/python/ray/tune/test/trial_runner_test.py b/python/ray/tune/test/trial_runner_test.py index 9d5f4457f..a92c3b0f1 100644 --- a/python/ray/tune/test/trial_runner_test.py +++ b/python/ray/tune/test/trial_runner_test.py @@ -39,7 +39,6 @@ else: class TrainableFunctionApiTest(unittest.TestCase): def setUp(self): - os.environ["TUNE_RESUME_PROMPT_OFF"] = "True" ray.init(num_cpus=4, num_gpus=0) def tearDown(self): @@ -545,7 +544,6 @@ class TrainableFunctionApiTest(unittest.TestCase): class RunExperimentTest(unittest.TestCase): def setUp(self): - os.environ["TUNE_RESUME_PROMPT_OFF"] = "True" ray.init() def tearDown(self): @@ -759,7 +757,6 @@ class RunExperimentTest(unittest.TestCase): class VariantGeneratorTest(unittest.TestCase): def setUp(self): - os.environ["TUNE_RESUME_PROMPT_OFF"] = "True" ray.init() def tearDown(self): @@ -963,9 +960,6 @@ def create_mock_components(): class TrialRunnerTest(unittest.TestCase): - def setUp(self): - os.environ["TUNE_RESUME_PROMPT_OFF"] = "True" - def tearDown(self): ray.shutdown() _register_all() # re-register the evicted objects diff --git a/python/ray/tune/tune.py b/python/ray/tune/tune.py index e65e10c7b..1714f4c4f 100644 --- a/python/ray/tune/tune.py +++ b/python/ray/tune/tune.py @@ -58,7 +58,7 @@ def run_experiments(experiments, with_server=False, server_port=TuneServer.DEFAULT_PORT, verbose=True, - resume=None, + resume=False, queue_trials=False, trial_executor=None, raise_on_failed_trial=True): @@ -76,8 +76,8 @@ def run_experiments(experiments, using the Client API. server_port (int): Port number for launching TuneServer. verbose (bool): How much output should be printed for each trial. - resume (bool|None): If checkpoint exists, the experiment will - resume from there. If resume is None, Tune will prompt if + resume (bool|"prompt"): If checkpoint exists, the experiment will + resume from there. If resume is "prompt", Tune will prompt if checkpoint detected. queue_trials (bool): Whether to queue trials when the cluster does not currently have enough resources to launch one. This should @@ -116,25 +116,24 @@ def run_experiments(experiments, runner = None restore = False - # TUNE_RESUME_PROMPT_OFF is for testing purposes and defaults - # `resume=False.` - if os.environ.get("TUNE_RESUME_PROMPT_OFF"): - resume = resume or False - if os.path.exists( os.path.join(checkpoint_dir, TrialRunner.CKPT_FILE_NAME)): - if resume: - restore = True - elif resume is None: + if resume == "prompt": msg = ("Found incomplete experiment at {}. " "Would you like to resume it?".format(checkpoint_dir)) - restore = click.confirm(msg, default=True) + restore = click.confirm(msg, default=False) if restore: logger.info("Tip: to always resume, " "pass resume=True to run_experiments()") else: logger.info("Tip: to always start a new experiment, " "pass resume=False to run_experiments()") + elif resume: + restore = True + else: + logger.info( + "Tip: to resume incomplete experiments, " + "pass resume='prompt' or resume=True to run_experiments()") else: logger.info( "Did not find checkpoint file in {}.".format(checkpoint_dir))