diff --git a/doc/source/cluster/autoscaling.rst b/doc/source/cluster/autoscaling.rst index d8719712b..022cad6d1 100644 --- a/doc/source/cluster/autoscaling.rst +++ b/doc/source/cluster/autoscaling.rst @@ -134,12 +134,10 @@ The ``worker_setup_commands`` field (and also the ``initialization_commands`` fi Docker Support ~~~~~~~~~~~~~~ -The ``worker_image`` and ``pull_before_run`` fields override the correpsonding field in the top level ``docker`` section for the node type. The ``worker_run_options`` field is combined with top level ``docker: run_options`` field to produce the docker run command for the given node_type. The following configuration is for a GPU enabled node type. +The ``worker_image`` and ``pull_before_run`` fields override the corresponding field in the top level ``docker`` section for the node type. The ``worker_run_options`` field is combined with top level ``docker: run_options`` field to produce the docker run command for the given node_type. The following configuration is for a GPU enabled node type. Ray will automatically select the NVIDIA docker runtime if it is available. .. 
code:: pull_before_run: True worker_image: - rayproject/ray-ml:latest-gpu - worker_run_options: - - --runtime=nvidia diff --git a/python/ray/autoscaler/_private/command_runner.py b/python/ray/autoscaler/_private/command_runner.py index 9234e8dd0..5f38ecee0 100644 --- a/python/ray/autoscaler/_private/command_runner.py +++ b/python/ray/autoscaler/_private/command_runner.py @@ -671,7 +671,8 @@ class DockerCommandRunner(CommandRunnerInterface): self.ssh_command_runner.ssh_user, image, cleaned_bind_mounts, self.container_name, self.docker_config.get("run_options", []) + self.docker_config.get( - f"{'head' if as_head else 'worker'}_run_options", [])) + f"{'head' if as_head else 'worker'}_run_options", + []) + self._configure_runtime()) if not self._check_container_status(): self.run(start_command, run_env="host") @@ -714,3 +715,14 @@ class DockerCommandRunner(CommandRunnerInterface): container=self.container_name, dst=self._docker_expand_user(mount))) self.initialized = True + + def _configure_runtime(self): + if self.docker_config.get("disable_automatic_runtime_detection"): + return [] + + runtime_output = self.ssh_command_runner.run( + "docker info -f '{{.Runtimes}}' ", + with_output=True).decode().strip() + if "nvidia-container-runtime" in runtime_output: + return ["--runtime=nvidia"] + return [] diff --git a/python/ray/autoscaler/aws/example-full.yaml b/python/ray/autoscaler/aws/example-full.yaml index 7d74b64bd..2c83ed7e0 100644 --- a/python/ray/autoscaler/aws/example-full.yaml +++ b/python/ray/autoscaler/aws/example-full.yaml @@ -32,8 +32,7 @@ docker: # Example of running a GPU head with CPU workers # head_image: "rayproject/ray:0.8.7-gpu" - # head_run_options: - # - --runtime=nvidia + # Allow Ray to automatically detect GPUs # worker_image: "rayproject/ray:0.8.7" # worker_run_options: [] diff --git a/python/ray/autoscaler/aws/example-gpu-docker.yaml b/python/ray/autoscaler/aws/example-gpu-docker.yaml index 53467b461..07a48d234 100644 --- 
a/python/ray/autoscaler/aws/example-gpu-docker.yaml +++ b/python/ray/autoscaler/aws/example-gpu-docker.yaml @@ -25,16 +25,11 @@ autoscaling_mode: default docker: image: "rayproject/ray:0.8.7-gpu" container_name: "ray-nvidia-docker-test" # e.g. ray_docker - run_options: - - --runtime=nvidia # # Example of running a GPU head with CPU workers # head_image: "rayproject/ray:0.8.7-gpu" - # head_run_options: - # - --runtime=nvidia # worker_image: "rayproject/ray:0.8.7" - # worker_run_options: [] # The autoscaler will scale up the cluster to this target fraction of resource # usage. For example, if a cluster of 10 nodes is 100% busy and diff --git a/python/ray/autoscaler/aws/example-ml.yaml b/python/ray/autoscaler/aws/example-ml.yaml index 9732006fa..0a965bbf5 100644 --- a/python/ray/autoscaler/aws/example-ml.yaml +++ b/python/ray/autoscaler/aws/example-ml.yaml @@ -37,11 +37,8 @@ docker: # Example of running a GPU head with CPU workers # head_image: "rayproject/ray:0.8.7-gpu" - # head_run_options: - # - --runtime=nvidia # worker_image: "rayproject/ray:0.8.7" - # worker_run_options: [] # The autoscaler will scale up the cluster to this target fraction of resource # usage. For example, if a cluster of 10 nodes is 100% busy and diff --git a/python/ray/autoscaler/azure/example-full.yaml b/python/ray/autoscaler/azure/example-full.yaml index c3829ee8e..5f8130178 100644 --- a/python/ray/autoscaler/azure/example-full.yaml +++ b/python/ray/autoscaler/azure/example-full.yaml @@ -32,11 +32,8 @@ docker: # Example of running a GPU head with CPU workers # head_image: "rayproject/ray:0.8.7-gpu" - # head_run_options: - # - --runtime=nvidia # worker_image: "rayproject/ray:0.8.7" - # worker_run_options: [] # The autoscaler will scale up the cluster to this target fraction of resource # usage. 
For example, if a cluster of 10 nodes is 100% busy and diff --git a/python/ray/autoscaler/azure/example-gpu-docker.yaml b/python/ray/autoscaler/azure/example-gpu-docker.yaml index 7f0098b3e..8825f8525 100644 --- a/python/ray/autoscaler/azure/example-gpu-docker.yaml +++ b/python/ray/autoscaler/azure/example-gpu-docker.yaml @@ -25,16 +25,11 @@ autoscaling_mode: default docker: image: "rayproject/ray:0.8.7-gpu" container_name: "ray-nvidia-docker-test" # e.g. ray_docker - run_options: - - --runtime=nvidia # # Example of running a GPU head with CPU workers # head_image: "rayproject/ray:0.8.7-gpu" - # head_run_options: - # - --runtime=nvidia # worker_image: "rayproject/ray:0.8.7" - # worker_run_options: [] # The autoscaler will scale up the cluster to this target fraction of resource # usage. For example, if a cluster of 10 nodes is 100% busy and diff --git a/python/ray/autoscaler/azure/example-gpu.yaml b/python/ray/autoscaler/azure/example-gpu.yaml index 977338dd9..0f204e3a6 100644 --- a/python/ray/autoscaler/azure/example-gpu.yaml +++ b/python/ray/autoscaler/azure/example-gpu.yaml @@ -32,11 +32,8 @@ docker: # Example of running a GPU head with CPU workers # head_image: "rayproject/ray:0.8.7-gpu" - # head_run_options: - # - --runtime=nvidia # worker_image: "rayproject/ray:0.8.7" - # worker_run_options: [] # The autoscaler will scale up the cluster to this target fraction of resource # usage. For example, if a cluster of 10 nodes is 100% busy and diff --git a/python/ray/autoscaler/gcp/example-gpu-docker.yaml b/python/ray/autoscaler/gcp/example-gpu-docker.yaml index 82caee39a..6ad3d916a 100644 --- a/python/ray/autoscaler/gcp/example-gpu-docker.yaml +++ b/python/ray/autoscaler/gcp/example-gpu-docker.yaml @@ -25,16 +25,12 @@ autoscaling_mode: default docker: image: "rayproject/ray:0.8.7-gpu" container_name: "ray-nvidia-docker-test" # e.g. 
ray_docker - run_options: - - --runtime=nvidia # # Example of running a GPU head with CPU workers # head_image: "rayproject/ray:0.8.7-gpu" - # head_run_options: - # - --runtime=nvidia + # worker_image: "rayproject/ray:0.8.7" - # worker_run_options: [] # The autoscaler will scale up the cluster to this target fraction of resource diff --git a/python/ray/autoscaler/ray-schema.json b/python/ray/autoscaler/ray-schema.json index 172936341..9c602332d 100644 --- a/python/ray/autoscaler/ray-schema.json +++ b/python/ray/autoscaler/ray-schema.json @@ -217,6 +217,11 @@ "worker_run_options": { "type": "array", "description": "analogous to head_run_options" + }, + "disable_automatic_runtime_detection" : { + "type": "boolean", + "description": "disable Ray from automatically using the NVIDIA runtime if available", + "default": false } } },