[autoscaler] docker run options (#3921)

Adds support for docker options, allowing for use of nvidia-docker. Closes #2657.
2026-07-01 18:04:09 +08:00 · 2019-02-13 12:26:28 -08:00
parent 4347ab644e
commit 729d0b2825
10 changed files with 419 additions and 101 deletions
@@ -90,6 +90,7 @@ CLUSTER_CONFIG_SCHEMA = {
        {
            "image": (str, OPTIONAL),  # e.g. tensorflow/tensorflow:1.5.0-py3
            "container_name": (str, OPTIONAL),  # e.g., ray_docker
+            "run_options": (list, OPTIONAL),
        },
        OPTIONAL),

@@ -102,7 +103,12 @@ CLUSTER_CONFIG_SCHEMA = {
    # Map of remote paths to local paths, e.g. {"/tmp/data": "/my/local/data"}
    "file_mounts": (dict, OPTIONAL),

-    # List of common shell commands to run to initialize nodes.
+    # List of commands that will be run before `setup_commands`. If docker is
+    # enabled, these commands will run outside the container and before docker
+    # is set up.
+    "initialization_commands": (list, OPTIONAL),
+
+    # List of common shell commands to run to setup nodes.
    "setup_commands": (list, OPTIONAL),

    # Commands that will be run on the head node after common setup.
@@ -527,13 +533,16 @@ class StandardAutoscaler(object):
                       "{}: No heartbeat in {}s, "
                       "restarting Ray to recover...".format(node_id, delta))
        updater = NodeUpdaterThread(
-            node_id,
-            self.config["provider"],
-            self.provider,
-            self.config["auth"],
-            self.config["cluster_name"], {},
-            with_head_node_ip(self.config["worker_start_ray_commands"]),
-            self.runtime_hash,
+            node_id=node_id,
+            provider_config=self.config["provider"],
+            provider=self.provider,
+            auth_config=self.config["auth"],
+            cluster_name=self.config["cluster_name"],
+            file_mounts={},
+            initialization_commands=[],
+            setup_commands=with_head_node_ip(
+                self.config["worker_start_ray_commands"]),
+            runtime_hash=self.runtime_hash,
            process_runner=self.process_runner,
            use_internal_ip=True)
        updater.start()
@@ -561,14 +570,16 @@ class StandardAutoscaler(object):

    def spawn_updater(self, node_id, init_commands):
        updater = NodeUpdaterThread(
-            node_id,
-            self.config["provider"],
-            self.provider,
-            self.config["auth"],
-            self.config["cluster_name"],
-            self.config["file_mounts"],
-            with_head_node_ip(init_commands),
-            self.runtime_hash,
+            node_id=node_id,
+            provider_config=self.config["provider"],
+            provider=self.provider,
+            auth_config=self.config["auth"],
+            cluster_name=self.config["cluster_name"],
+            file_mounts=self.config["file_mounts"],
+            initialization_commands=with_head_node_ip(
+                self.config["initialization_commands"]),
+            setup_commands=with_head_node_ip(init_commands),
+            runtime_hash=self.runtime_hash,
            process_runner=self.process_runner,
            use_internal_ip=True)
        updater.start()
@@ -20,6 +20,7 @@ initial_workers: 0
 docker:
    image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
    container_name: "" # e.g. ray_docker
+    run_options: []  # Extra options to pass into "docker run"

 # The autoscaler will scale up the cluster to this target fraction of resource
 # usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -54,7 +55,7 @@ auth:
 # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
 head_node:
    InstanceType: m5.large
-    ImageId: ami-3b6bce43  # Amazon Deep Learning AMI (Ubuntu)
+    ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0

    # You can provision additional disk space with a conf as follows
    BlockDeviceMappings:
@@ -70,7 +71,7 @@ head_node:
 # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
 worker_nodes:
    InstanceType: m5.large
-    ImageId: ami-3b6bce43  # Amazon Deep Learning AMI (Ubuntu)
+    ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0

    # Run workers on spot by default. Comment this out to use on-demand.
    InstanceMarketOptions:
@@ -88,6 +89,11 @@ file_mounts: {
 #    "/path2/on/remote/machine": "/path2/on/local/machine",
 }

+# List of commands that will be run before `setup_commands`. If docker is
+# enabled, these commands will run outside the container and before docker
+# is set up.
+initialization_commands: []
+
 # List of shell commands to run to set up nodes.
 setup_commands:
    # Note: if you're developing Ray, you probably want to create an AMI that
@@ -0,0 +1,114 @@
+# A unique identifier for the head node and workers of this cluster.
+cluster_name: gpu-docker
+
+# The minimum number of worker nodes to launch in addition to the head
+# node. This number should be >= 0.
+min_workers: 0
+
+# The maximum number of worker nodes to launch in addition to the head
+# node. This takes precedence over min_workers.
+max_workers: 2
+
+# The initial number of worker nodes to launch in addition to the head
+# node. When the cluster is first brought up (or when it is refreshed with a
+# subsequent `ray up`) this number of nodes will be started.
+initial_workers: 0
+
+# This executes all commands on all nodes in the docker container,
+# and opens all the necessary ports to support the Ray cluster.
+# Empty string means disabled.
+docker:
+    image: "tensorflow/tensorflow:1.12.0-gpu-py3"
+    container_name: "ray-nvidia-docker-test" # e.g. ray_docker
+    run_options:
+      - --runtime=nvidia
+
+# The autoscaler will scale up the cluster to this target fraction of resource
+# usage. For example, if a cluster of 10 nodes is 100% busy and
+# target_utilization is 0.8, it would resize the cluster to 13. This fraction
+# can be decreased to increase the aggressiveness of upscaling.
+# This value must be less than 1.0 for scaling to happen.
+target_utilization_fraction: 0.8
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Cloud-provider specific configuration.
+provider:
+    type: aws
+    region: us-west-2
+    # Availability zone(s), comma-separated, that nodes may be launched in.
+    # Nodes are currently spread between zones by a round-robin approach,
+    # however this implementation detail should not be relied upon.
+    availability_zone: us-west-2a,us-west-2b
+
+# How Ray will authenticate with newly launched nodes.
+auth:
+    ssh_user: ubuntu
+# By default Ray creates a new private keypair, but you can also use your own.
+# If you do so, make sure to also set "KeyName" in the head and worker node
+# configurations below.
+#    ssh_private_key: /path/to/your/key.pem
+
+# Provider-specific config for the head node, e.g. instance type. By default
+# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+# For more documentation on available fields, see:
+# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+head_node:
+    InstanceType: p2.xlarge
+    ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0
+
+    # You can provision additional disk space with a conf as follows
+    BlockDeviceMappings:
+        - DeviceName: /dev/sda1
+          Ebs:
+              VolumeSize: 100
+
+    # Additional options in the boto docs.
+
+# Provider-specific config for worker nodes, e.g. instance type. By default
+# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+# For more documentation on available fields, see:
+# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+worker_nodes:
+    InstanceType: m5.large
+    ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0
+
+    # Run workers on spot by default. Comment this out to use on-demand.
+    InstanceMarketOptions:
+        MarketType: spot
+        # Additional options can be found in the boto docs, e.g.
+        #   SpotOptions:
+        #       MaxPrice: MAX_HOURLY_PRICE
+
+    # Additional options in the boto docs.
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+#    "/path1/on/remote/machine": "/path1/on/local/machine",
+#    "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+# List of shell commands to run to set up nodes.
+setup_commands:
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp27-cp27mu-manylinux1_x86_64.whl
+    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp35-cp35m-manylinux1_x86_64.whl
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp36-cp36m-manylinux1_x86_64.whl
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands:
+    - pip install boto3==1.4.8  # 1.4.8 adds InstanceMarketOptions
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+head_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -26,6 +26,7 @@ from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_LAUNCH_CONFIG, \
    TAG_RAY_NODE_NAME
 from ray.autoscaler.updater import NodeUpdaterThread
 from ray.autoscaler.log_timer import LogTimer
+from ray.autoscaler.docker import with_docker_exec

 logger = logging.getLogger(__name__)

@@ -130,9 +131,16 @@ def kill_node(config_file, yes, override_cluster_name):
    node = random.choice(nodes)
    logger.info("kill_node: Terminating worker {}".format(node))

-    updater = NodeUpdaterThread(node, config["provider"], provider,
-                                config["auth"], config["cluster_name"],
-                                config["file_mounts"], [], "")
+    updater = NodeUpdaterThread(
+        node_id=node,
+        provider_config=config["provider"],
+        provider=provider,
+        auth_config=config["auth"],
+        cluster_name=config["cluster_name"],
+        file_mounts=config["file_mounts"],
+        initialization_commands=[],
+        setup_commands=[],
+        runtime_hash="")

    _exec(updater, "ray stop", False, False)

@@ -222,14 +230,15 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
                config["head_start_ray_commands"])

        updater = NodeUpdaterThread(
-            head_node,
-            config["provider"],
-            provider,
-            config["auth"],
-            config["cluster_name"],
-            config["file_mounts"],
-            init_commands,
-            runtime_hash,
+            node_id=head_node,
+            provider_config=config["provider"],
+            provider=provider,
+            auth_config=config["auth"],
+            cluster_name=config["cluster_name"],
+            file_mounts=config["file_mounts"],
+            initialization_commands=config["initialization_commands"],
+            setup_commands=init_commands,
+            runtime_hash=runtime_hash,
        )
        updater.start()
        updater.join()
@@ -247,19 +256,16 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
                        provider.external_ip(head_node)))

        monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
-        for s in init_commands:
-            if ("ray start" in s and "docker exec" in s
-                    and "--autoscaling-config" in s):
-                monitor_str = "docker exec {} /bin/sh -c {}".format(
-                    config["docker"]["container_name"], quote(monitor_str))
+        use_docker = bool(config["docker"]["container_name"])
        if override_cluster_name:
            modifiers = " --cluster-name={}".format(
                quote(override_cluster_name))
        else:
            modifiers = ""
        print("To monitor auto-scaling activity, you can run:\n\n"
-              "  ray exec {} {}{}\n".format(config_file, quote(monitor_str),
-                                            modifiers))
+              "  ray exec {} {}{}{}\n".format(
+                  config_file, "--docker " if use_docker else " ",
+                  quote(monitor_str), modifiers))
        print("To open a console on the cluster:\n\n"
              "  ray attach {}{}\n".format(config_file, modifiers))
        print("To ssh manually to the cluster, run:\n\n"
@@ -292,17 +298,18 @@ def attach_cluster(config_file, start, use_tmux, override_cluster_name, new):
        else:
            cmd = "screen -L -xRR"

-    exec_cluster(config_file, cmd, False, False, False, start,
+    exec_cluster(config_file, cmd, False, False, False, False, start,
                 override_cluster_name, None)


-def exec_cluster(config_file, cmd, screen, tmux, stop, start,
+def exec_cluster(config_file, cmd, docker, screen, tmux, stop, start,
                 override_cluster_name, port_forward):
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
+        docker: whether to run command in docker container of config
        screen: whether to run in a screen
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
@@ -316,25 +323,41 @@ def exec_cluster(config_file, cmd, screen, tmux, stop, start,
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)
+
    head_node = _get_head_node(
        config, config_file, override_cluster_name, create_if_needed=start)

    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(
-            head_node,
-            config["provider"],
-            provider,
-            config["auth"],
-            config["cluster_name"],
-            config["file_mounts"],
-            [],
-            "",
+            node_id=head_node,
+            provider_config=config["provider"],
+            provider=provider,
+            auth_config=config["auth"],
+            cluster_name=config["cluster_name"],
+            file_mounts=config["file_mounts"],
+            initialization_commands=[],
+            setup_commands=[],
+            runtime_hash="",
        )
+
+        def wrap_docker(command):
+            container_name = config["docker"]["container_name"]
+            if not container_name:
+                raise ValueError("Docker container not specified in config.")
+            return with_docker_exec(
+                [command], container_name=container_name)[0]
+
+        cmd = wrap_docker(cmd) if docker else cmd
+
        if stop:
-            cmd += (
-                "; ray stop; ray teardown ~/ray_bootstrap_config.yaml --yes "
-                "--workers-only; sudo shutdown -h now")
+            shutdown_cmd = (
+                "ray stop; ray teardown ~/ray_bootstrap_config.yaml "
+                "--yes --workers-only")
+            if docker:
+                shutdown_cmd = wrap_docker(shutdown_cmd)
+            cmd += ("; {}; sudo shutdown -h now".format(shutdown_cmd))
+
        _exec(
            updater,
            cmd,
@@ -378,7 +401,6 @@ def _exec(updater, cmd, screen, tmux, expect_error=False, port_forward=None):
            cmd = " ".join(cmd)
        updater.ssh_cmd(
            cmd,
-            verbose=False,
            allocate_tty=True,
            expect_error=expect_error,
            port_forward=port_forward)
@@ -405,14 +427,15 @@ def rsync(config_file, source, target, override_cluster_name, down):
    provider = get_node_provider(config["provider"], config["cluster_name"])
    try:
        updater = NodeUpdaterThread(
-            head_node,
-            config["provider"],
-            provider,
-            config["auth"],
-            config["cluster_name"],
-            config["file_mounts"],
-            [],
-            "",
+            node_id=head_node,
+            provider_config=config["provider"],
+            provider=provider,
+            auth_config=config["auth"],
+            cluster_name=config["cluster_name"],
+            file_mounts=config["file_mounts"],
+            initialization_commands=[],
+            setup_commands=[],
+            runtime_hash="",
        )
        if down:
            rsync = updater.rsync_down
@@ -17,6 +17,8 @@ def dockerize_if_needed(config):
        return config
    docker_image = config["docker"].get("image")
    cname = config["docker"].get("container_name")
+    run_options = config["docker"].get("run_options", [])
+    ssh_user = config["auth"]["ssh_user"]
    if not docker_image:
        if cname:
            logger.warning(
@@ -26,10 +28,11 @@ def dockerize_if_needed(config):
    else:
        assert cname, "Must provide container name!"
    docker_mounts = {dst: dst for dst in config["file_mounts"]}
+
    config["setup_commands"] = (
-        docker_install_cmds() + docker_start_cmds(
-            config["auth"]["ssh_user"], docker_image, docker_mounts, cname) +
-        with_docker_exec(config["setup_commands"], container_name=cname))
+        docker_start_cmds(ssh_user, docker_image, docker_mounts, cname,
+                          run_options) + with_docker_exec(
+                              config["setup_commands"], container_name=cname))

    config["head_setup_commands"] = with_docker_exec(
        config["head_setup_commands"], container_name=cname)
@@ -58,13 +61,6 @@ def with_docker_exec(cmds, container_name, env_vars=None):
    ]


-def docker_install_cmds():
-    return [
-        aptwait_cmd() + " && sudo apt-get update",
-        aptwait_cmd() + " && sudo apt-get install -y docker.io"
-    ]
-
-
 def aptwait_cmd():
    return ("while sudo fuser"
            " /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock"
@@ -72,13 +68,8 @@ def aptwait_cmd():
            "do echo 'Waiting for release of dpkg/apt locks'; sleep 5; done")


-def docker_start_cmds(user, image, mount, cname):
+def docker_start_cmds(user, image, mount, cname, user_options):
    cmds = []
-    cmds.append("sudo kill -SIGUSR1 $(pidof dockerd) || true")
-    cmds.append("sudo service docker start")
-    cmds.append("sudo usermod -a -G docker {}".format(user))
-    cmds.append("docker rm -f {} || true".format(cname))
-    cmds.append("docker pull {}".format(image))

    # create flags
    # ports for the redis, object manager, and tune client
@@ -94,16 +85,21 @@ def docker_start_cmds(user, image, mount, cname):
    env_flags = " ".join(
        ["-e {name}={val}".format(name=k, val=v) for k, v in env_vars.items()])

+    user_options_str = " ".join(user_options)
    # docker run command
+    docker_check = [
+        "docker", "inspect", "-f", "'{{.State.Running}}'", cname, "||"
+    ]
    docker_run = [
        "docker", "run", "--rm", "--name {}".format(cname), "-d", "-it",
-        port_flags, mount_flags, env_flags, "--net=host", image, "bash"
+        port_flags, mount_flags, env_flags, user_options_str, "--net=host",
+        image, "bash"
+    ]
+    cmds.append(" ".join(docker_check + docker_run))
+    docker_update = [
+        " && ".join(("apt-get -y update", "apt-get -y upgrade",
+                     "apt-get install -y git wget cmake psmisc"))
    ]
-    cmds.append(" ".join(docker_run))
-    docker_update = []
-    docker_update.append("apt-get -y update")
-    docker_update.append("apt-get -y upgrade")
-    docker_update.append("apt-get install -y git wget cmake psmisc")
    cmds.extend(with_docker_exec(docker_update, container_name=cname))
    return cmds

@@ -20,6 +20,7 @@ initial_workers: 0
 docker:
    image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
    container_name: "" # e.g. ray_docker
+    run_options: []  # Extra options to pass into "docker run"


 # The autoscaler will scale up the cluster to this target fraction of resource
@@ -66,9 +67,9 @@ head_node:
    # Additional options can be found in in the compute docs at
    # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert

-    # If the network interface is specified as below in both head and worker 
+    # If the network interface is specified as below in both head and worker
    # nodes, the manual network config is used.  Otherwise an existing subnet is
-    # used.  To use a shared subnet, ask the subnet owner to grant permission 
+    # used.  To use a shared subnet, ask the subnet owner to grant permission
    # for 'compute.subnetworks.use' to the ray autoscaler account...
    # networkInterfaces:
    #   - kind: compute#networkInterface
@@ -100,6 +101,11 @@ file_mounts: {
 #    "/path2/on/remote/machine": "/path2/on/local/machine",
 }

+# List of commands that will be run before `setup_commands`. If docker is
+# enabled, these commands will run outside the container and before docker
+# is set up.
+initialization_commands: []
+
 # List of shell commands to run to set up nodes.
 setup_commands:
    # Note: if you're developing Ray, you probably want to create an AMI that
@@ -107,11 +113,6 @@ setup_commands:
    # below with a git checkout <your_sha> (and possibly a recompile).
    # - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc

-    - >-
-      sudo apt-get update
-      && sudo apt-get install -y
-      psmisc
-
    # Install Anaconda.
    - >-
      wget https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh -O ~/anaconda3.sh
@@ -0,0 +1,159 @@
+# A unique identifier for the head node and workers of this cluster.
+cluster_name: gpu-docker
+
+# The minimum number of worker nodes to launch in addition to the head
+# node. This number should be >= 0.
+min_workers: 0
+
+# The maximum number of worker nodes to launch in addition to the head
+# node. This takes precedence over min_workers.
+max_workers: 2
+
+# The initial number of worker nodes to launch in addition to the head
+# node. When the cluster is first brought up (or when it is refreshed with a
+# subsequent `ray up`) this number of nodes will be started.
+initial_workers: 0
+
+# This executes all commands on all nodes in the docker container,
+# and opens all the necessary ports to support the Ray cluster.
+# Empty string means disabled.
+docker:
+    image: "tensorflow/tensorflow:1.12.0-gpu-py3"
+    container_name: "ray-nvidia-docker-test" # e.g. ray_docker
+    run_options:
+      - --runtime=nvidia
+
+
+# The autoscaler will scale up the cluster to this target fraction of resource
+# usage. For example, if a cluster of 10 nodes is 100% busy and
+# target_utilization is 0.8, it would resize the cluster to 13. This fraction
+# can be decreased to increase the aggressiveness of upscaling.
+# This value must be less than 1.0 for scaling to happen.
+target_utilization_fraction: 0.8
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Cloud-provider specific configuration.
+provider:
+    type: gcp
+    region: us-west1
+    availability_zone: us-west1-b
+    project_id: <project_id> # Globally unique project id
+
+# How Ray will authenticate with newly launched nodes.
+auth:
+    ssh_user: ubuntu
+# By default Ray creates a new private keypair, but you can also use your own.
+# If you do so, make sure to also set "KeyName" in the head and worker node
+# configurations below. This requires that you have added the key into the
+# project wide meta-data.
+#    ssh_private_key: /path/to/your/key.pem
+
+# Provider-specific config for the head node, e.g. instance type. By default
+# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
+# For more documentation on available fields, see:
+# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+head_node:
+    machineType: custom-6-16384
+    disks:
+      - boot: true
+        autoDelete: true
+        type: PERSISTENT
+        initializeParams:
+          diskSizeGb: 50
+          # See https://cloud.google.com/compute/docs/images for more images
+          sourceImage: projects/deeplearning-platform-release/global/images/family/tf-latest-gpu
+    guestAccelerators:
+      - acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
+        acceleratorCount: 1
+    metadata:
+      items:
+        - key: install-nvidia-driver
+          value: "True"
+    scheduling:
+      - onHostMaintenance: TERMINATE
+
+    # Additional options can be found in the compute docs at
+    # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+
+worker_nodes:
+    machineType: n1-standard-2
+    disks:
+      - boot: true
+        autoDelete: true
+        type: PERSISTENT
+        initializeParams:
+          diskSizeGb: 50
+          # See https://cloud.google.com/compute/docs/images for more images
+          sourceImage: projects/deeplearning-platform-release/global/images/family/tf-latest-gpu
+    guestAccelerators:
+      - acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
+        acceleratorCount: 1
+    metadata:
+      items:
+        - key: install-nvidia-driver
+          value: "True"
+    # Run workers on preemptible instances by default.
+    # Comment this out to use on-demand.
+    scheduling:
+      - preemptible: true
+      - onHostMaintenance: TERMINATE
+
+    # Additional options can be found in the compute docs at
+    # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+#    "/path1/on/remote/machine": "/path1/on/local/machine",
+#    "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+initialization_commands:
+    # Wait until nvidia drivers are installed
+    - >-
+      timeout 300 bash -c "
+          command -v nvidia-smi && nvidia-smi
+          until [ \$? -eq 0 ]; do
+              command -v nvidia-smi && nvidia-smi
+          done"
+
+# List of shell commands to run to set up nodes.
+setup_commands:
+    # Note: if you're developing Ray, you probably want to create an AMI that
+    # has your Ray repo pre-cloned. Then, you can replace the pip installs
+    # below with a git checkout <your_sha> (and possibly a recompile).
+    # - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
+
+    # Install ray
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp27-cp27mu-manylinux1_x86_64.whl
+    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp35-cp35m-manylinux1_x86_64.whl
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp36-cp36m-manylinux1_x86_64.whl
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands:
+  - pip install google-api-python-client==1.7.8
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+head_start_ray_commands:
+    - ray stop
+    - >-
+      ulimit -n 65536;
+      ray start
+      --head
+      --redis-port=6379
+      --object-manager-port=8076
+      --autoscaling-config=~/ray_bootstrap_config.yaml
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+    - ray stop
+    - >-
+      ulimit -n 65536;
+      ray start
+      --redis-address=$RAY_HEAD_IP:6379
+      --object-manager-port=8076
@@ -49,7 +49,8 @@ class NodeUpdater(object):
                 auth_config,
                 cluster_name,
                 file_mounts,
-                 setup_cmds,
+                 initialization_commands,
+                 setup_commands,
                 runtime_hash,
                 process_runner=subprocess,
                 use_internal_ip=False):
@@ -66,7 +67,8 @@ class NodeUpdater(object):
            remote: os.path.expanduser(local)
            for remote, local in file_mounts.items()
        }
-        self.setup_cmds = setup_cmds
+        self.initialization_commands = initialization_commands
+        self.setup_commands = setup_commands
        self.runtime_hash = runtime_hash

    def get_caller(self, check_error):
@@ -215,13 +217,15 @@ class NodeUpdater(object):
        self.provider.set_node_tags(self.node_id,
                                    {TAG_RAY_NODE_STATUS: "setting-up"})

+        m = "{}: Initialization commands completed".format(self.node_id)
+        with LogTimer("NodeUpdater: {}".format(m)):
+            for cmd in self.initialization_commands:
+                self.ssh_cmd(cmd, redirect=open("/dev/null", "w"))
+
        m = "{}: Setup commands completed".format(self.node_id)
        with LogTimer("NodeUpdater: {}".format(m)):
-            for cmd in self.setup_cmds:
-                self.ssh_cmd(
-                    cmd,
-                    # verbose=True,
-                    redirect=open("/dev/null", "w"))
+            for cmd in self.setup_commands:
+                self.ssh_cmd(cmd, redirect=open("/dev/null", "w"))

    def rsync_up(self, source, target, redirect=None, check_error=True):
        self.set_ssh_ip_if_required()
@@ -253,7 +257,6 @@ class NodeUpdater(object):
                cmd,
                connect_timeout=120,
                redirect=None,
-                verbose=False,
                allocate_tty=False,
                emulate_interactive=True,
                expect_error=False,
@@ -261,9 +264,8 @@ class NodeUpdater(object):

        self.set_ssh_ip_if_required()

-        if verbose:
-            logger.info("NodeUpdater: "
-                        "Running {} on {}...".format(cmd, self.ssh_ip))
+        logger.info("NodeUpdater: Running {} on {}...".format(
+            cmd, self.ssh_ip))
        ssh = ["ssh"]
        if allocate_tty:
            ssh.append("-tt")
@@ -627,6 +627,11 @@ def submit(cluster_config_file, screen, tmux, stop, start, cluster_name,
@cli.command()
@click.argument("cluster_config_file", required=True, type=str)
@click.argument("cmd", required=True, type=str)
+@click.option(
+    "--docker",
+    is_flag=True,
+    default=False,
+    help="Runs command in the docker container specified in cluster_config.")
@click.option(
    "--stop",
    is_flag=True,
@@ -652,9 +657,9 @@ def submit(cluster_config_file, screen, tmux, stop, start, cluster_name,
    help="Override the configured cluster name.")
@click.option(
    "--port-forward", required=False, type=int, help="Port to forward.")
-def exec_cmd(cluster_config_file, cmd, screen, tmux, stop, start, cluster_name,
-             port_forward):
-    exec_cluster(cluster_config_file, cmd, screen, tmux, stop, start,
+def exec_cmd(cluster_config_file, cmd, docker, screen, tmux, stop, start,
+             cluster_name, port_forward):
+    exec_cluster(cluster_config_file, cmd, docker, screen, tmux, stop, start,
                 cluster_name, port_forward)