[tune] resume=False by default but print a tip to set resume="prompt" + jenkins fix (#3681)

This commit is contained in:
Eric Liang
2019-01-04 17:23:19 -08:00
committed by Richard Liaw
parent 747b117929
commit 7db1f3be2a
5 changed files with 13 additions and 22 deletions
@@ -82,4 +82,4 @@ class A3CTorchPolicyGraph(TorchPolicyGraph):
with self.lock:
obs = torch.from_numpy(obs).float().unsqueeze(0)
_, _, vf, _ = self.model({"obs": obs}, [])
return vf.numpy().squeeze()
return vf.detach().numpy().squeeze()
-2
View File
@@ -62,7 +62,6 @@ def _start_new_cluster():
@pytest.fixture
def start_connected_cluster():
# Start the Ray processes.
os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
cluster = _start_new_cluster()
yield cluster
# The code after the yield will run as teardown code.
@@ -74,7 +73,6 @@ def start_connected_cluster():
def start_connected_emptyhead_cluster():
"""Starts head with no resources."""
os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
cluster = Cluster(
initialize_head=True,
connect=True,
@@ -39,7 +39,6 @@ else:
class TrainableFunctionApiTest(unittest.TestCase):
def setUp(self):
os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
ray.init(num_cpus=4, num_gpus=0)
def tearDown(self):
@@ -545,7 +544,6 @@ class TrainableFunctionApiTest(unittest.TestCase):
class RunExperimentTest(unittest.TestCase):
def setUp(self):
os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
ray.init()
def tearDown(self):
@@ -759,7 +757,6 @@ class RunExperimentTest(unittest.TestCase):
class VariantGeneratorTest(unittest.TestCase):
def setUp(self):
os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
ray.init()
def tearDown(self):
@@ -963,9 +960,6 @@ def create_mock_components():
class TrialRunnerTest(unittest.TestCase):
def setUp(self):
os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
def tearDown(self):
ray.shutdown()
_register_all() # re-register the evicted objects
+11 -12
View File
@@ -58,7 +58,7 @@ def run_experiments(experiments,
with_server=False,
server_port=TuneServer.DEFAULT_PORT,
verbose=True,
resume=None,
resume=False,
queue_trials=False,
trial_executor=None,
raise_on_failed_trial=True):
@@ -76,8 +76,8 @@ def run_experiments(experiments,
using the Client API.
server_port (int): Port number for launching TuneServer.
verbose (bool): How much output should be printed for each trial.
resume (bool|None): If checkpoint exists, the experiment will
resume from there. If resume is None, Tune will prompt if
resume (bool|"prompt"): If True and a checkpoint exists, the experiment
will resume from there. If resume is "prompt", Tune will prompt if
a checkpoint is detected.
queue_trials (bool): Whether to queue trials when the cluster does
not currently have enough resources to launch one. This should
@@ -116,25 +116,24 @@ def run_experiments(experiments,
runner = None
restore = False
# TUNE_RESUME_PROMPT_OFF is for testing purposes: when it is set, an
# unset `resume` falls back to False instead of prompting.
if os.environ.get("TUNE_RESUME_PROMPT_OFF"):
resume = resume or False
if os.path.exists(
os.path.join(checkpoint_dir, TrialRunner.CKPT_FILE_NAME)):
if resume:
restore = True
elif resume is None:
if resume == "prompt":
msg = ("Found incomplete experiment at {}. "
"Would you like to resume it?".format(checkpoint_dir))
restore = click.confirm(msg, default=True)
restore = click.confirm(msg, default=False)
if restore:
logger.info("Tip: to always resume, "
"pass resume=True to run_experiments()")
else:
logger.info("Tip: to always start a new experiment, "
"pass resume=False to run_experiments()")
elif resume:
restore = True
else:
logger.info(
"Tip: to resume incomplete experiments, "
"pass resume='prompt' or resume=True to run_experiments()")
else:
logger.info(
"Did not find checkpoint file in {}.".format(checkpoint_dir))