mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 13:37:39 +08:00
[tune] resume=False by default but print a tip to set resume="prompt" + jenkins fix (#3681)
This commit is contained in:
@@ -82,4 +82,4 @@ class A3CTorchPolicyGraph(TorchPolicyGraph):
|
||||
with self.lock:
|
||||
obs = torch.from_numpy(obs).float().unsqueeze(0)
|
||||
_, _, vf, _ = self.model({"obs": obs}, [])
|
||||
return vf.numpy().squeeze()
|
||||
return vf.detach().numpy().squeeze()
|
||||
|
||||
@@ -62,7 +62,6 @@ def _start_new_cluster():
|
||||
@pytest.fixture
|
||||
def start_connected_cluster():
|
||||
# Start the Ray processes.
|
||||
os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
|
||||
cluster = _start_new_cluster()
|
||||
yield cluster
|
||||
# The code after the yield will run as teardown code.
|
||||
@@ -74,7 +73,6 @@ def start_connected_cluster():
|
||||
def start_connected_emptyhead_cluster():
|
||||
"""Starts head with no resources."""
|
||||
|
||||
os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
|
||||
cluster = Cluster(
|
||||
initialize_head=True,
|
||||
connect=True,
|
||||
|
||||
@@ -39,7 +39,6 @@ else:
|
||||
|
||||
class TrainableFunctionApiTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
|
||||
ray.init(num_cpus=4, num_gpus=0)
|
||||
|
||||
def tearDown(self):
|
||||
@@ -545,7 +544,6 @@ class TrainableFunctionApiTest(unittest.TestCase):
|
||||
|
||||
class RunExperimentTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
|
||||
ray.init()
|
||||
|
||||
def tearDown(self):
|
||||
@@ -759,7 +757,6 @@ class RunExperimentTest(unittest.TestCase):
|
||||
|
||||
class VariantGeneratorTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
|
||||
ray.init()
|
||||
|
||||
def tearDown(self):
|
||||
@@ -963,9 +960,6 @@ def create_mock_components():
|
||||
|
||||
|
||||
class TrialRunnerTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
|
||||
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
_register_all() # re-register the evicted objects
|
||||
|
||||
+11
-12
@@ -58,7 +58,7 @@ def run_experiments(experiments,
|
||||
with_server=False,
|
||||
server_port=TuneServer.DEFAULT_PORT,
|
||||
verbose=True,
|
||||
resume=None,
|
||||
resume=False,
|
||||
queue_trials=False,
|
||||
trial_executor=None,
|
||||
raise_on_failed_trial=True):
|
||||
@@ -76,8 +76,8 @@ def run_experiments(experiments,
|
||||
using the Client API.
|
||||
server_port (int): Port number for launching TuneServer.
|
||||
verbose (bool): How much output should be printed for each trial.
|
||||
resume (bool|None): If checkpoint exists, the experiment will
|
||||
resume from there. If resume is None, Tune will prompt if
|
||||
resume (bool|"prompt"): If checkpoint exists, the experiment will
|
||||
resume from there. If resume is "prompt", Tune will prompt if
|
||||
checkpoint detected.
|
||||
queue_trials (bool): Whether to queue trials when the cluster does
|
||||
not currently have enough resources to launch one. This should
|
||||
@@ -116,25 +116,24 @@ def run_experiments(experiments,
|
||||
runner = None
|
||||
restore = False
|
||||
|
||||
# TUNE_RESUME_PROMPT_OFF is for testing purposes and defaults
|
||||
# `resume=False.`
|
||||
if os.environ.get("TUNE_RESUME_PROMPT_OFF"):
|
||||
resume = resume or False
|
||||
|
||||
if os.path.exists(
|
||||
os.path.join(checkpoint_dir, TrialRunner.CKPT_FILE_NAME)):
|
||||
if resume:
|
||||
restore = True
|
||||
elif resume is None:
|
||||
if resume == "prompt":
|
||||
msg = ("Found incomplete experiment at {}. "
|
||||
"Would you like to resume it?".format(checkpoint_dir))
|
||||
restore = click.confirm(msg, default=True)
|
||||
restore = click.confirm(msg, default=False)
|
||||
if restore:
|
||||
logger.info("Tip: to always resume, "
|
||||
"pass resume=True to run_experiments()")
|
||||
else:
|
||||
logger.info("Tip: to always start a new experiment, "
|
||||
"pass resume=False to run_experiments()")
|
||||
elif resume:
|
||||
restore = True
|
||||
else:
|
||||
logger.info(
|
||||
"Tip: to resume incomplete experiments, "
|
||||
"pass resume='prompt' or resume=True to run_experiments()")
|
||||
else:
|
||||
logger.info(
|
||||
"Did not find checkpoint file in {}.".format(checkpoint_dir))
|
||||
|
||||
Reference in New Issue
Block a user