From 1cd2703caccddd827e2b116be935a85dab40fa14 Mon Sep 17 00:00:00 2001
From: Richard Liaw <rliaw@berkeley.edu>
Date: Tue, 20 Feb 2018 00:24:01 -0800
Subject: [PATCH] [autoscaler] Docker Support (#1505)

---
 doc/source/autoscaling.rst             |   9 ++
 python/ray/autoscaler/autoscaler.py    |   7 ++
 python/ray/autoscaler/aws/example.yaml |  25 ++++--
 python/ray/autoscaler/commands.py      |  24 +++++-
 python/ray/autoscaler/docker.py        | 109 +++++++++++++++++++++++++
 test/autoscaler_test.py                |   5 ++
 6 files changed, 166 insertions(+), 13 deletions(-)
 create mode 100644 python/ray/autoscaler/docker.py

diff --git a/doc/source/autoscaling.rst b/doc/source/autoscaling.rst
index 6208f36c7..dbe6aba9b 100644
--- a/doc/source/autoscaling.rst
+++ b/doc/source/autoscaling.rst
@@ -104,6 +104,15 @@ The ``example.yaml`` configuration is enough to get started with Ray, but for mo
     head_node:
         InstanceType: p2.8xlarge
 
+**Docker**: Specify docker image. This executes all commands on all nodes in the docker container,
+and opens all the necessary ports to support the Ray cluster.
+
+.. code-block:: yaml
+
+    docker:
+        image: tensorflow/tensorflow:1.5.0-py3
+        container_name: ray_docker
+
 **Mixed GPU and CPU nodes**: for RL applications that require proportionally more
 CPU than GPU resources, you can use additional CPU workers with a GPU head node.
 
diff --git a/python/ray/autoscaler/autoscaler.py b/python/ray/autoscaler/autoscaler.py
index d47857982..813a7f83f 100644
--- a/python/ray/autoscaler/autoscaler.py
+++ b/python/ray/autoscaler/autoscaler.py
@@ -55,6 +55,13 @@ CLUSTER_CONFIG_SCHEMA = {
     # How Ray will authenticate with newly launched nodes.
     "auth": dict,
 
+    # Docker configuration. If this is specified, all setup and start commands
+    # will be executed in the container.
+    "docker": {
+        "image": str,  # e.g. tensorflow/tensorflow:1.5.0-py3
+        "container_name": str
+    },
+
     # Provider-specific config for the head node, e.g. instance type.
     "head_node": dict,
 
diff --git a/python/ray/autoscaler/aws/example.yaml b/python/ray/autoscaler/aws/example.yaml
index a6a024ae4..25fe8052c 100644
--- a/python/ray/autoscaler/aws/example.yaml
+++ b/python/ray/autoscaler/aws/example.yaml
@@ -3,11 +3,18 @@ cluster_name: default
 
 # The minimum number of workers nodes to launch in addition to the head
 # node. This number should be >= 0.
-min_workers: 2
+min_workers: 1
 
 # The maximum number of workers nodes to launch in addition to the head
 # node. This takes precedence over min_workers.
-max_workers: 4
+max_workers: 2
+
+# This executes all commands on all nodes in the docker container,
+# and opens all the necessary ports to support the Ray cluster.
+# Empty string means disabled.
+docker:
+    image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
+    container_name: "" # e.g. ray_docker
 
 # The autoscaler will scale up the cluster to this target fraction of resource
 # usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -41,10 +48,10 @@ head_node:
     ImageId: ami-3b6bce43  # Amazon Deep Learning AMI (Ubuntu)
 
     # You can provision additional disk space with a conf as follows
-    # BlockDeviceMappings:
-    #     - DeviceName: /dev/sda1
-    #       Ebs:
-    #           VolumeSize: 100
+    BlockDeviceMappings:
+        - DeviceName: /dev/sda1
+          Ebs:
+              VolumeSize: 50
 
     # Additional options in the boto docs.
 
@@ -77,7 +84,7 @@ setup_commands:
     # Note: if you're developing Ray, you probably want to create an AMI that
     # has your Ray repo pre-cloned. Then, you can replace the pip installs
     # below with a git checkout <your_sha> (and possibly a recompile).
-    - pip install -U ray==0.3.1
+    - most_recent() { echo pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/$(aws s3 ls s3://ray-wheels --recursive | grep $1 | sort -r | head -n 1 | awk '{print $4}'); } && $( most_recent "cp36-cp36m-manylinux1" ) || $( most_recent "cp35-cp35m-manylinux1" )
 
 # Custom commands that will be run on the head node after common setup.
 head_setup_commands:
@@ -89,9 +96,9 @@ worker_setup_commands: []
 # Command to start ray on the head node. You don't need to change this.
 head_start_ray_commands:
     - ray stop
-    - ray start --head --redis-port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml
+    - ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
 
 # Command to start ray on worker nodes. You don't need to change this.
 worker_start_ray_commands:
     - ray stop
-    - ray start --redis-address=$RAY_HEAD_IP:6379
+    - ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
diff --git a/python/ray/autoscaler/commands.py b/python/ray/autoscaler/commands.py
index c494a1443..9e61759c8 100644
--- a/python/ray/autoscaler/commands.py
+++ b/python/ray/autoscaler/commands.py
@@ -9,9 +9,14 @@ import time
 import sys
 
 import yaml
+try:  # py3
+    from shlex import quote
+except ImportError:  # py2
+    from pipes import quote
 
 from ray.autoscaler.autoscaler import validate_config, hash_runtime_conf, \
     hash_launch_conf
+from ray.autoscaler.docker import dockerize_if_needed
 from ray.autoscaler.node_provider import get_node_provider, NODE_PROVIDERS
 from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_LAUNCH_CONFIG, \
     TAG_NAME
@@ -23,8 +28,9 @@ def create_or_update_cluster(
     """Create or updates an autoscaling Ray cluster from a config json."""
 
     config = yaml.load(open(config_file).read())
-
     validate_config(config)
+    dockerize_if_needed(config)
+
     if override_min_workers is not None:
         config["min_workers"] = override_min_workers
     if override_max_workers is not None:
@@ -44,10 +50,11 @@ def teardown_cluster(config_file):
     """Destroys all nodes of a Ray cluster described by a config json."""
 
     config = yaml.load(open(config_file).read())
+    validate_config(config)
+    dockerize_if_needed(config)
 
     confirm("This will destroy your cluster")
 
-    validate_config(config)
     provider = get_node_provider(config["provider"], config["cluster_name"])
     head_node_tags = {
         TAG_RAY_NODE_TYPE: "Head",
@@ -155,12 +162,21 @@ def get_or_create_head_node(config, no_restart):
     print(
         "Head node up-to-date, IP address is: {}".format(
             provider.external_ip(head_node)))
+
+    monitor_str = "tail -f /tmp/raylogs/monitor-*"
+    for s in init_commands:
+        if ("ray start" in s and "docker exec" in s and
+                "--autoscaling-config" in s):
+            monitor_str = "docker exec {} /bin/sh -c {}".format(
+                        config["docker"]["container_name"],
+                        quote(monitor_str))
     print(
         "To monitor auto-scaling activity, you can run:\n\n"
-        "  ssh -i {} {}@{} 'tail -f /tmp/raylogs/monitor-*'\n".format(
+        "  ssh -i {} {}@{} {}\n".format(
             config["auth"]["ssh_private_key"],
             config["auth"]["ssh_user"],
-            provider.external_ip(head_node)))
+            provider.external_ip(head_node),
+            quote(monitor_str)))
     print(
         "To login to the cluster, run:\n\n"
         "  ssh -i {} {}@{}\n".format(
diff --git a/python/ray/autoscaler/docker.py b/python/ray/autoscaler/docker.py
new file mode 100644
index 000000000..d6b71d729
--- /dev/null
+++ b/python/ray/autoscaler/docker.py
@@ -0,0 +1,109 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+try:  # py3
+    from shlex import quote
+except ImportError:  # py2
+    from pipes import quote
+
+
+def dockerize_if_needed(config):
+    docker_image = config["docker"].get("image")
+    cname = config["docker"].get("container_name")
+    if not docker_image:
+        return config
+    else:
+        assert cname, "Must provide container name!"
+    docker_mounts = {dst: dst for dst in config["file_mounts"]}
+    config["setup_commands"] = (
+        docker_install_cmds() +
+        docker_start_cmds(
+            config["auth"]["ssh_user"], docker_image,
+            docker_mounts, cname) +
+        with_docker_exec(
+            config["setup_commands"], container_name=cname))
+
+    config["head_setup_commands"] = with_docker_exec(
+        config["head_setup_commands"], container_name=cname)
+    config["head_start_ray_commands"] = (
+        docker_autoscaler_setup(cname) +
+        with_docker_exec(
+            config["head_start_ray_commands"], container_name=cname))
+
+    config["worker_setup_commands"] = with_docker_exec(
+        config["worker_setup_commands"], container_name=cname)
+    config["worker_start_ray_commands"] = with_docker_exec(
+        config["worker_start_ray_commands"], container_name=cname,
+        env_vars=["RAY_HEAD_IP"])
+
+    return config
+
+
+def with_docker_exec(cmds, container_name, env_vars=None):
+    env_str = ""
+    if env_vars:
+        env_str = " ".join(
+            ["-e {env}=${env}".format(env=env) for env in env_vars])
+    return ["docker exec {} {} /bin/sh -c {} ".format(
+        env_str, container_name, quote(cmd)) for cmd in cmds]
+
+
+def docker_install_cmds():
+    return [aptwait_cmd() + " && sudo apt-get update",
+            aptwait_cmd() + " && sudo apt-get install -y docker.io"]
+
+
+def aptwait_cmd():
+    return (
+        "while sudo fuser"
+        " /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock"
+        " >/dev/null 2>&1; "
+        "do echo 'Waiting for release of dpkg/apt locks'; sleep 5; done")
+
+
+def docker_start_cmds(user, image, mount, cname):
+    cmds = []
+    cmds.append("sudo kill -SIGUSR1 $(pidof dockerd) || true")
+    cmds.append("sudo service docker start")
+    cmds.append("sudo usermod -a -G docker {}".format(user))
+    cmds.append("docker rm -f {} || true".format(cname))
+    cmds.append("docker pull {}".format(image))
+
+    # create flags
+    # ports for the redis, object manager, and tune client
+    port_flags = " ".join(["-p {port}:{port}".format(port=port)
+                           for port in ["6379", "8076", "4321"]])
+    mount_flags = " ".join(["-v {src}:{dest}".format(src=k, dest=v)
+                            for k, v in mount.items()])
+
+    # for click, used in ray cli
+    env_vars = {"LC_ALL": "C.UTF-8", "LANG": "C.UTF-8"}
+    env_flags = " ".join(
+        ["-e {name}={val}".format(name=k, val=v) for k, v in env_vars.items()])
+
+    # docker run command
+    docker_run = ["docker", "run", "--rm", "--name {}".format(cname),
+                  "-d", "-it", port_flags, mount_flags, env_flags,
+                  "--net=host", image, "bash"]
+    cmds.append(" ".join(docker_run))
+    docker_update = []
+    docker_update.append("apt-get -y update")
+    docker_update.append("apt-get -y upgrade")
+    docker_update.append("apt-get install -y git wget cmake psmisc")
+    cmds.extend(with_docker_exec(docker_update, container_name=cname))
+    return cmds
+
+
+def docker_autoscaler_setup(cname):
+    cmds = []
+    for path in ["~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"]:
+        # needed because docker doesn't allow relative paths
+        base_path = os.path.basename(path)
+        cmds.append("docker cp {path} {cname}:{dpath}".format(
+            path=path, dpath=base_path, cname=cname))
+        cmds.extend(with_docker_exec(
+            ["cp {} {}".format("/" + base_path, path)],
+            container_name=cname))
+    return cmds
diff --git a/test/autoscaler_test.py b/test/autoscaler_test.py
index e2306546a..878a0013b 100644
--- a/test/autoscaler_test.py
+++ b/test/autoscaler_test.py
@@ -96,6 +96,11 @@ SMALL_CLUSTER = {
         "region": "us-east-1",
         "availability_zone": "us-east-1a",
     },
+    "docker": {
+        "image": "example",
+        "container_name": "mock",
+
+    },
     "auth": {
         "ssh_user": "ubuntu",
         "ssh_private_key": "/dev/null",