diff --git a/python/ray/autoscaler/_private/command_runner.py b/python/ray/autoscaler/_private/command_runner.py index 5a6354078..3dd0b3d38 100644 --- a/python/ray/autoscaler/_private/command_runner.py +++ b/python/ray/autoscaler/_private/command_runner.py @@ -765,16 +765,20 @@ class DockerCommandRunner(CommandRunnerInterface): "~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem" ] - image = self.docker_config.get("image") - image = self.docker_config.get( - f"{'head' if as_head else 'worker'}_image", image) + specific_image = self.docker_config.get( + f"{'head' if as_head else 'worker'}_image", + self.docker_config.get("image")) self._check_docker_installed() if self.docker_config.get("pull_before_run", True): - assert image, "Image must be included in config if " + \ + assert specific_image, "Image must be included in config if " + \ "pull_before_run is specified" + self.run("docker pull {}".format(specific_image), run_env="host") + else: - self.run("docker pull {}".format(image), run_env="host") + self.run( + f"docker image inspect {specific_image} 1> /dev/null 2>&1 || " + f"docker pull {specific_image}") # Bootstrap files cannot be bind mounted because docker opens the # underlying inode. When the file is switched, docker becomes outdated. @@ -788,14 +792,14 @@ class DockerCommandRunner(CommandRunnerInterface): requires_re_init = False if container_running: requires_re_init = self._check_if_container_restart_is_needed( - image, cleaned_bind_mounts) + specific_image, cleaned_bind_mounts) if requires_re_init: self.run(f"docker stop {self.container_name}", run_env="host") if (not container_running) or requires_re_init: # Get home directory image_env = self.ssh_command_runner.run( - "docker inspect -f '{{json .Config.Env}}' " + image, + "docker inspect -f '{{json .Config.Env}}' " + specific_image, with_output=True).decode().strip() home_directory = "/root" for env_var in json.loads(image_env): @@ -804,8 +808,8 @@ class DockerCommandRunner(CommandRunnerInterface): break start_command = docker_start_cmds( - self.ssh_command_runner.ssh_user, image, cleaned_bind_mounts, - self.container_name, + self.ssh_command_runner.ssh_user, specific_image, + cleaned_bind_mounts, self.container_name, self.docker_config.get( "run_options", []) + self.docker_config.get( f"{'head' if as_head else 'worker'}_run_options", []) + diff --git a/python/ray/tests/test_autoscaler.py b/python/ray/tests/test_autoscaler.py index 111db350f..abec214dc 100644 --- a/python/ray/tests/test_autoscaler.py +++ b/python/ray/tests/test_autoscaler.py @@ -1935,6 +1935,42 @@ MemAvailable: 33000000 kB runner.assert_has_call("172.0.0.0", pattern="--shm-size") runner.assert_has_call("172.0.0.0", pattern="--runtime=nvidia") + def testDockerImageExistsBeforeInspect(self): + config = copy.deepcopy(SMALL_CLUSTER) + config["min_workers"] = 1 + config["max_workers"] = 1 + config["docker"]["pull_before_run"] = False + config_path = self.write_config(config) + self.provider = MockProvider() + runner = MockProcessRunner() + runner.respond_to_call("json .Config.Env", ["[]" for i in range(1)]) + autoscaler = StandardAutoscaler( + config_path, + LoadMetrics(), + max_failures=0, + process_runner=runner, + update_interval_s=0) + autoscaler.update() + autoscaler.update() + self.waitForNodes(1) + self.provider.finish_starting_nodes() + autoscaler.update() + self.waitForNodes( + 1, tag_filters={TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE}) + first_pull = [(i, cmd) + for i, cmd in enumerate(runner.command_history()) + if "docker pull" in cmd] + first_targeted_inspect = [ + (i, cmd) for i, cmd in enumerate(runner.command_history()) + if "docker inspect -f" in cmd + ] + + # This checks for the bug mentioned #13128 where the image is inspected + # before the image is present. + assert min(x[0] + for x in first_pull) < min(x[0] + for x in first_targeted_inspect) + if __name__ == "__main__": import sys