[autoscaler] Ensure run_init happens in all cases (#10820)

* taking a stab at the horror that is NodeUpdaterThread

* added tests
This commit is contained in:
Ian Rodney
2020-09-17 11:23:56 -07:00
committed by GitHub
parent 47d7d83b6f
commit a159ae72b3
2 changed files with 30 additions and 6 deletions
+7 -6
View File
@@ -294,10 +294,16 @@ class NodeUpdater:
node_tags = self.provider.node_tags(self.node_id)
logger.debug("Node tags: {}".format(str(node_tags)))
if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash:
# When resuming from a stopped instance the runtime_hash may be the
# same, but the container will not be started.
self.cmd_runner.run_init(
as_head=self.is_head_node, file_mounts=self.file_mounts)
# runtime_hash will only change whenever the user restarts
# or updates their cluster with `get_or_create_head_node`
if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and (
self.file_mounts_contents_hash is None
not self.file_mounts_contents_hash
or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) ==
self.file_mounts_contents_hash):
# todo: we lie in the confirmation message since
@@ -310,11 +316,6 @@ class NodeUpdater:
"{}{} already up-to-date, skip to ray start",
self.log_prefix, self.node_id)
# When resuming from a stopped instance the runtime_hash may be the
# same, but the container will not be started.
self.cmd_runner.run_init(
as_head=self.is_head_node, file_mounts=self.file_mounts)
else:
cli_logger.print(
"Updating cluster configuration.",
+23
View File
@@ -1110,7 +1110,10 @@ class AutoscalingTest(unittest.TestCase):
runner.assert_has_call("172.0.0.1", "start_ray_worker")
def testSetupCommandsWithStoppedNodeCaching(self):
file_mount_dir = tempfile.mkdtemp()
config = SMALL_CLUSTER.copy()
config["file_mounts"] = {"/root/test-folder": file_mount_dir}
config["file_mounts_sync_continuously"] = True
config["min_workers"] = 1
config["max_workers"] = 1
config_path = self.write_config(config)
@@ -1133,6 +1136,7 @@ class AutoscalingTest(unittest.TestCase):
runner.assert_has_call("172.0.0.0", "setup_cmd")
runner.assert_has_call("172.0.0.0", "worker_setup_cmd")
runner.assert_has_call("172.0.0.0", "start_ray_worker")
runner.assert_has_call("172.0.0.0", "docker run")
# Check the node was indeed reused
self.provider.terminate_node(0)
@@ -1147,6 +1151,25 @@ class AutoscalingTest(unittest.TestCase):
runner.assert_not_has_call("172.0.0.0", "setup_cmd")
runner.assert_not_has_call("172.0.0.0", "worker_setup_cmd")
runner.assert_has_call("172.0.0.0", "start_ray_worker")
runner.assert_has_call("172.0.0.0", "docker run")
with open(f"{file_mount_dir}/new_file", "w") as f:
f.write("abcdefgh")
# Check that run_init happens when file_mounts have updated
self.provider.terminate_node(0)
autoscaler.update()
self.waitForNodes(1)
runner.clear_history()
self.provider.finish_starting_nodes()
autoscaler.update()
self.waitForNodes(
1, tag_filters={TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE})
runner.assert_not_has_call("172.0.0.0", "init_cmd")
runner.assert_not_has_call("172.0.0.0", "setup_cmd")
runner.assert_not_has_call("172.0.0.0", "worker_setup_cmd")
runner.assert_has_call("172.0.0.0", "start_ray_worker")
runner.assert_has_call("172.0.0.0", "docker run")
runner.clear_history()
autoscaler.update()