From 049b7b2017bbcf58814f139202e834b7df097cd3 Mon Sep 17 00:00:00 2001 From: Ian Rodney Date: Fri, 11 Sep 2020 16:56:26 -0700 Subject: [PATCH] [docker] Revert to rsync & cp instead of file mount for bootstrap config/key (#10734) --- python/ray/autoscaler/command_runner.py | 24 ++++++++++++++++-- python/ray/tests/test_autoscaler.py | 33 ++++++++++++++++++------- python/ray/tests/test_command_runner.py | 4 +-- 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/python/ray/autoscaler/command_runner.py b/python/ray/autoscaler/command_runner.py index b94a73683..74304585f 100644 --- a/python/ray/autoscaler/command_runner.py +++ b/python/ray/autoscaler/command_runner.py @@ -709,6 +709,10 @@ class DockerCommandRunner(CommandRunnerInterface): return string def run_init(self, *, as_head, file_mounts): + BOOTSTRAP_MOUNTS = [ + "~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem" + ] + image = self.docker_config.get("image") image = self.docker_config.get( f"{'head' if as_head else 'worker'}_image", image) @@ -720,8 +724,14 @@ class DockerCommandRunner(CommandRunnerInterface): self.run("docker pull {}".format(image), run_env="host") + # Bootstrap files cannot be bind mounted because docker opens the + # underlying inode. When the file is switched, docker becomes outdated. + cleaned_bind_mounts = file_mounts.copy() + for mnt in BOOTSTRAP_MOUNTS: + cleaned_bind_mounts.pop(mnt, None) + start_command = docker_start_cmds( - self.ssh_command_runner.ssh_user, image, file_mounts, + self.ssh_command_runner.ssh_user, image, cleaned_bind_mounts, self.container_name, self.docker_config.get("run_options", []) + self.docker_config.get( f"{'head' if as_head else 'worker'}_run_options", [])) @@ -746,7 +756,8 @@ class DockerCommandRunner(CommandRunnerInterface): active_remote_mounts = [ mnt["Destination"] for mnt in active_mounts ] - for remote, local in file_mounts.items(): + # Ignore ray bootstrap files. + for remote, local in cleaned_bind_mounts.items(): remote = self._docker_expand_user(remote) if remote not in active_remote_mounts: cli_logger.error( @@ -756,4 +767,13 @@ class DockerCommandRunner(CommandRunnerInterface): cli_logger.verbose( "Unable to check if file_mounts specified in the YAML " "differ from those on the running container.") + + # Explicitly copy in ray bootstrap files. + for mount in BOOTSTRAP_MOUNTS: + if mount in file_mounts: + self.ssh_command_runner.run( + "docker cp {src} {container}:{dst}".format( + src=os.path.join(DOCKER_MOUNT_PREFIX, mount), + container=self.container_name, + dst=self._docker_expand_user(mount))) self.initialized = True diff --git a/python/ray/tests/test_autoscaler.py b/python/ray/tests/test_autoscaler.py index 9e40da822..daf21d738 100644 --- a/python/ray/tests/test_autoscaler.py +++ b/python/ray/tests/test_autoscaler.py @@ -54,15 +54,17 @@ class MockProcessRunner: def check_output(self, cmd): self.check_call(cmd) return_string = "command-output" - key_to_delete = None - for pattern, pair in self.call_response.items(): + key_to_shrink = None + for pattern, response_list in self.call_response.items(): if pattern in str(cmd): - return_string = pair[0] - if pair[1] - 1 == 0: - key_to_delete = pattern + return_string = response_list[0] + key_to_shrink = pattern break - if key_to_delete: - del self.call_response[key_to_delete] + if key_to_shrink: + self.call_response[key_to_shrink] = self.call_response[ + key_to_shrink][1:] + if len(self.call_response[key_to_shrink]) == 0: + del self.call_response[key_to_shrink] return return_string.encode() @@ -108,8 +110,8 @@ class MockProcessRunner: def clear_history(self): self.calls = [] - def respond_to_call(self, pattern, response, num_times=1): - self.call_response[pattern] = (response, num_times) + def respond_to_call(self, pattern, response_list): + self.call_response[pattern] = response_list class MockProvider(NodeProvider): @@ -400,6 +402,10 @@ class AutoscalingTest(unittest.TestCase): config_path = self.write_config(SMALL_CLUSTER) self.provider = MockProvider() runner = MockProcessRunner() + runner.respond_to_call("json .Mounts", ["[]"]) + # Two initial calls to docker cp, one before run, two final calls to cp + runner.respond_to_call(".State.Running", + ["false", "false", "false", "true", "true"]) get_or_create_head_node( SMALL_CLUSTER, config_path, @@ -414,6 +420,15 @@ class AutoscalingTest(unittest.TestCase): runner.assert_has_call("1.2.3.4", "head_setup_cmd") runner.assert_has_call("1.2.3.4", "start_ray_head") self.assertEqual(self.provider.mock_nodes[0].node_type, None) + runner.assert_has_call("1.2.3.4", pattern="docker run") + runner.assert_not_has_call( + "1.2.3.4", pattern="-v /tmp/ray_tmp_mount/~/ray_bootstrap_config") + runner.assert_has_call( + "1.2.3.4", + pattern="docker cp /tmp/ray_tmp_mount/~/ray_bootstrap_key.pem") + runner.assert_has_call( + "1.2.3.4", + pattern="docker cp /tmp/ray_tmp_mount/~/ray_bootstrap_config.yaml") def testScaleUp(self): config_path = self.write_config(SMALL_CLUSTER) diff --git a/python/ray/tests/test_command_runner.py b/python/ray/tests/test_command_runner.py index 5a26aaa09..2e747d882 100644 --- a/python/ray/tests/test_command_runner.py +++ b/python/ray/tests/test_command_runner.py @@ -202,7 +202,7 @@ def test_docker_rsync(): remote_file = "/root/protected-file" remote_host_file = f"{DOCKER_MOUNT_PREFIX}{remote_file}" - process_runner.respond_to_call("docker inspect -f", "true") + process_runner.respond_to_call("docker inspect -f", ["true"]) cmd_runner.run_rsync_up( local_mount, remote_mount, options={"file_mount": True}) @@ -219,7 +219,7 @@ def test_docker_rsync(): process_runner.clear_history() ############################## - process_runner.respond_to_call("docker inspect -f", "true") + process_runner.respond_to_call("docker inspect -f", ["true"]) cmd_runner.run_rsync_up( local_file, remote_file, options={"file_mount": False})