[autoscaler] Docker Support (#1505)

2026-06-28 11:53:32 +08:00 · 2018-02-20 00:24:01 -08:00
parent 0f766ae24b
commit 1cd2703cac
6 changed files with 166 additions and 13 deletions
@@ -55,6 +55,13 @@ CLUSTER_CONFIG_SCHEMA = {
    # How Ray will authenticate with newly launched nodes.
    "auth": dict,

+    # Docker configuration. If this is specified, all setup and start commands
+    # will be executed in the container.
+    "docker": {
+        "image": str,  # e.g. tensorflow/tensorflow:1.5.0-py3
+        "container_name": str
+    },
+
    # Provider-specific config for the head node, e.g. instance type.
    "head_node": dict,

@@ -3,11 +3,18 @@ cluster_name: default

 # The minimum number of workers nodes to launch in addition to the head
 # node. This number should be >= 0.
-min_workers: 2
+min_workers: 1

 # The maximum number of workers nodes to launch in addition to the head
 # node. This takes precedence over min_workers.
-max_workers: 4
+max_workers: 2
+
+# This executes all commands on all nodes in the docker container,
+# and opens all the necessary ports to support the Ray cluster.
+# Empty string means disabled.
+docker:
+    image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
+    container_name: "" # e.g. ray_docker

 # The autoscaler will scale up the cluster to this target fraction of resource
 # usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -41,10 +48,10 @@ head_node:
    ImageId: ami-3b6bce43  # Amazon Deep Learning AMI (Ubuntu)

    # You can provision additional disk space with a conf as follows
-    # BlockDeviceMappings:
-    #     - DeviceName: /dev/sda1
-    #       Ebs:
-    #           VolumeSize: 100
+    BlockDeviceMappings:
+        - DeviceName: /dev/sda1
+          Ebs:
+              VolumeSize: 50

    # Additional options in the boto docs.

@@ -77,7 +84,7 @@ setup_commands:
    # Note: if you're developing Ray, you probably want to create an AMI that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
-    - pip install -U ray==0.3.1
+    - most_recent() { echo pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/$(aws s3 ls s3://ray-wheels --recursive | grep $1 | sort -r | head -n 1 | awk '{print $4}'); } && $( most_recent "cp36-cp36m-manylinux1" ) || $( most_recent "cp35-cp35m-manylinux1" )

 # Custom commands that will be run on the head node after common setup.
 head_setup_commands:
@@ -89,9 +96,9 @@ worker_setup_commands: []
 # Command to start ray on the head node. You don't need to change this.
 head_start_ray_commands:
    - ray stop
-    - ray start --head --redis-port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml
+    - ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

 # Command to start ray on worker nodes. You don't need to change this.
 worker_start_ray_commands:
    - ray stop
-    - ray start --redis-address=$RAY_HEAD_IP:6379
+    - ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -9,9 +9,14 @@ import time
 import sys

 import yaml
+try:  # py3
+    from shlex import quote
+except ImportError:  # py2
+    from pipes import quote

 from ray.autoscaler.autoscaler import validate_config, hash_runtime_conf, \
    hash_launch_conf
+from ray.autoscaler.docker import dockerize_if_needed
 from ray.autoscaler.node_provider import get_node_provider, NODE_PROVIDERS
 from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_LAUNCH_CONFIG, \
    TAG_NAME
@@ -23,8 +28,9 @@ def create_or_update_cluster(
    """Create or updates an autoscaling Ray cluster from a config json."""

    config = yaml.load(open(config_file).read())
-
    validate_config(config)
+    dockerize_if_needed(config)
+
    if override_min_workers is not None:
        config["min_workers"] = override_min_workers
    if override_max_workers is not None:
@@ -44,10 +50,11 @@ def teardown_cluster(config_file):
    """Destroys all nodes of a Ray cluster described by a config json."""

    config = yaml.load(open(config_file).read())
+    validate_config(config)
+    dockerize_if_needed(config)

    confirm("This will destroy your cluster")

-    validate_config(config)
    provider = get_node_provider(config["provider"], config["cluster_name"])
    head_node_tags = {
        TAG_RAY_NODE_TYPE: "Head",
@@ -155,12 +162,21 @@ def get_or_create_head_node(config, no_restart):
    print(
        "Head node up-to-date, IP address is: {}".format(
            provider.external_ip(head_node)))
+
+    monitor_str = "tail -f /tmp/raylogs/monitor-*"
+    for s in init_commands:
+        if ("ray start" in s and "docker exec" in s and
+                "--autoscaling-config" in s):
+            monitor_str = "docker exec {} /bin/sh -c {}".format(
+                        config["docker"]["container_name"],
+                        quote(monitor_str))
    print(
        "To monitor auto-scaling activity, you can run:\n\n"
-        "  ssh -i {} {}@{} 'tail -f /tmp/raylogs/monitor-*'\n".format(
+        "  ssh -i {} {}@{} {}\n".format(
            config["auth"]["ssh_private_key"],
            config["auth"]["ssh_user"],
-            provider.external_ip(head_node)))
+            provider.external_ip(head_node),
+            quote(monitor_str)))
    print(
        "To login to the cluster, run:\n\n"
        "  ssh -i {} {}@{}\n".format(
@@ -0,0 +1,109 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+try:  # py3
+    from shlex import quote
+except ImportError:  # py2
+    from pipes import quote
+
+
+def dockerize_if_needed(config):
+    docker_image = config["docker"].get("image")
+    cname = config["docker"].get("container_name")
+    if not docker_image:
+        return config
+    else:
+        assert cname, "Must provide container name!"
+    docker_mounts = {dst: dst for dst in config["file_mounts"]}
+    config["setup_commands"] = (
+        docker_install_cmds() +
+        docker_start_cmds(
+            config["auth"]["ssh_user"], docker_image,
+            docker_mounts, cname) +
+        with_docker_exec(
+            config["setup_commands"], container_name=cname))
+
+    config["head_setup_commands"] = with_docker_exec(
+        config["head_setup_commands"], container_name=cname)
+    config["head_start_ray_commands"] = (
+        docker_autoscaler_setup(cname) +
+        with_docker_exec(
+            config["head_start_ray_commands"], container_name=cname))
+
+    config["worker_setup_commands"] = with_docker_exec(
+        config["worker_setup_commands"], container_name=cname)
+    config["worker_start_ray_commands"] = with_docker_exec(
+        config["worker_start_ray_commands"], container_name=cname,
+        env_vars=["RAY_HEAD_IP"])
+
+    return config
+
+
+def with_docker_exec(cmds, container_name, env_vars=None):
+    env_str = ""
+    if env_vars:
+        env_str = " ".join(
+            ["-e {env}=${env}".format(env=env) for env in env_vars])
+    return ["docker exec {} {} /bin/sh -c {} ".format(
+        env_str, container_name, quote(cmd)) for cmd in cmds]
+
+
+def docker_install_cmds():
+    return [aptwait_cmd() + " && sudo apt-get update",
+            aptwait_cmd() + " && sudo apt-get install -y docker.io"]
+
+
+def aptwait_cmd():
+    return (
+        "while sudo fuser"
+        " /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock"
+        " >/dev/null 2>&1; "
+        "do echo 'Waiting for release of dpkg/apt locks'; sleep 5; done")
+
+
+def docker_start_cmds(user, image, mount, cname):
+    cmds = []
+    cmds.append("sudo kill -SIGUSR1 $(pidof dockerd) || true")
+    cmds.append("sudo service docker start")
+    cmds.append("sudo usermod -a -G docker {}".format(user))
+    cmds.append("docker rm -f {} || true".format(cname))
+    cmds.append("docker pull {}".format(image))
+
+    # create flags
+    # ports for the redis, object manager, and tune client
+    port_flags = " ".join(["-p {port}:{port}".format(port=port)
+                           for port in ["6379", "8076", "4321"]])
+    mount_flags = " ".join(["-v {src}:{dest}".format(src=k, dest=v)
+                            for k, v in mount.items()])
+
+    # for click, used in ray cli
+    env_vars = {"LC_ALL": "C.UTF-8", "LANG": "C.UTF-8"}
+    env_flags = " ".join(
+        ["-e {name}={val}".format(name=k, val=v) for k, v in env_vars.items()])
+
+    # docker run command
+    docker_run = ["docker", "run", "--rm", "--name {}".format(cname),
+                  "-d", "-it", port_flags, mount_flags, env_flags,
+                  "--net=host", image, "bash"]
+    cmds.append(" ".join(docker_run))
+    docker_update = []
+    docker_update.append("apt-get -y update")
+    docker_update.append("apt-get -y upgrade")
+    docker_update.append("apt-get install -y git wget cmake psmisc")
+    cmds.extend(with_docker_exec(docker_update, container_name=cname))
+    return cmds
+
+
+def docker_autoscaler_setup(cname):
+    cmds = []
+    for path in ["~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"]:
+        # needed because docker doesn't allow relative paths
+        base_path = os.path.basename(path)
+        cmds.append("docker cp {path} {cname}:{dpath}".format(
+            path=path, dpath=base_path, cname=cname))
+        cmds.extend(with_docker_exec(
+            ["cp {} {}".format("/" + base_path, path)],
+            container_name=cname))
+    return cmds