[Docker] Automagically add "runtime=nvidia" (#11125)

2026-06-30 22:37:34 +08:00 · 2020-10-01 17:04:19 -07:00
parent 681c24754a
commit 0d5b09f426
10 changed files with 21 additions and 30 deletions
@@ -671,7 +671,8 @@ class DockerCommandRunner(CommandRunnerInterface):
            self.ssh_command_runner.ssh_user, image, cleaned_bind_mounts,
            self.container_name,
            self.docker_config.get("run_options", []) + self.docker_config.get(
-                f"{'head' if as_head else 'worker'}_run_options", []))
+                f"{'head' if as_head else 'worker'}_run_options",
+                []) + self._configure_runtime())

        if not self._check_container_status():
            self.run(start_command, run_env="host")
@@ -714,3 +715,14 @@ class DockerCommandRunner(CommandRunnerInterface):
                        container=self.container_name,
                        dst=self._docker_expand_user(mount)))
        self.initialized = True
+
+    def _configure_runtime(self):
+        """Return extra ``docker run`` options that select the NVIDIA runtime.
+
+        Returns:
+            ``["--runtime=nvidia"]`` when the output of ``docker info``
+            mentions "nvidia-container-runtime"; otherwise an empty list.
+            An empty list is also returned, without running any command,
+            when the "disable_automatic_runtime_detection" docker config
+            option is truthy.
+        """
+        # Opt-out switch: skip the probe entirely when the user disabled it.
+        if self.docker_config.get("disable_automatic_runtime_detection"):
+            return []
+
+        # Print the Runtimes field of `docker info`. The command is issued
+        # through the SSH command runner (presumably on the node's host,
+        # outside the container -- TODO confirm) and its output is returned
+        # as bytes, hence the decode().
+        runtime_output = self.ssh_command_runner.run(
+            "docker info -f '{{.Runtimes}}' ",
+            with_output=True).decode().strip()
+        # Plain substring match against the printed runtimes.
+        if "nvidia-container-runtime" in runtime_output:
+            return ["--runtime=nvidia"]
+        return []
@@ -32,8 +32,7 @@ docker:

    # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray:0.8.7-gpu"
-    # head_run_options:
-    #     - --runtime=nvidia
+    # Ray adds --runtime=nvidia automatically when the NVIDIA Docker runtime is detected

    # worker_image: "rayproject/ray:0.8.7"
    # worker_run_options: []
@@ -25,16 +25,11 @@ autoscaling_mode: default
 docker:
    image: "rayproject/ray:0.8.7-gpu"
    container_name: "ray-nvidia-docker-test" # e.g. ray_docker
-    run_options:
-      - --runtime=nvidia

    # # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray:0.8.7-gpu"
-    # head_run_options:
-    #     - --runtime=nvidia

    # worker_image: "rayproject/ray:0.8.7"
-    # worker_run_options: []

 # The autoscaler will scale up the cluster to this target fraction of resource
 # usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -37,11 +37,8 @@ docker:

    # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray:0.8.7-gpu"
-    # head_run_options:
-    #     - --runtime=nvidia

    # worker_image: "rayproject/ray:0.8.7"
-    # worker_run_options: []

 # The autoscaler will scale up the cluster to this target fraction of resource
 # usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -32,11 +32,8 @@ docker:

    # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray:0.8.7-gpu"
-    # head_run_options:
-    #     - --runtime=nvidia

    # worker_image: "rayproject/ray:0.8.7"
-    # worker_run_options: []

 # The autoscaler will scale up the cluster to this target fraction of resource
 # usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -25,16 +25,11 @@ autoscaling_mode: default
 docker:
    image: "rayproject/ray:0.8.7-gpu"
    container_name: "ray-nvidia-docker-test" # e.g. ray_docker
-    run_options:
-      - --runtime=nvidia

    # # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray:0.8.7-gpu"
-    # head_run_options:
-    #     - --runtime=nvidia

    # worker_image: "rayproject/ray:0.8.7"
-    # worker_run_options: []

 # The autoscaler will scale up the cluster to this target fraction of resource
 # usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -32,11 +32,8 @@ docker:

    # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray:0.8.7-gpu"
-    # head_run_options:
-    #     - --runtime=nvidia

    # worker_image: "rayproject/ray:0.8.7"
-    # worker_run_options: []

 # The autoscaler will scale up the cluster to this target fraction of resource
 # usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -25,16 +25,12 @@ autoscaling_mode: default
 docker:
    image: "rayproject/ray:0.8.7-gpu"
    container_name: "ray-nvidia-docker-test" # e.g. ray_docker
-    run_options:
-      - --runtime=nvidia

    # # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray:0.8.7-gpu"
-    # head_run_options:
-    #     - --runtime=nvidia
+

    # worker_image: "rayproject/ray:0.8.7"
-    # worker_run_options: []


 # The autoscaler will scale up the cluster to this target fraction of resource
@@ -217,6 +217,11 @@
                "worker_run_options": {
                    "type": "array",
                    "description": "analogous to head_run_options"
+                }, 
+                "disable_automatic_runtime_detection" : {
+                    "type": "boolean",
+                    "description": "Prevent Ray from automatically using the NVIDIA runtime when it is available",
+                    "default": false
                }
            }
        },