From 1cd2703caccddd827e2b116be935a85dab40fa14 Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Tue, 20 Feb 2018 00:24:01 -0800 Subject: [PATCH] [autoscaler] Docker Support (#1505) --- doc/source/autoscaling.rst | 9 ++ python/ray/autoscaler/autoscaler.py | 7 ++ python/ray/autoscaler/aws/example.yaml | 25 ++++-- python/ray/autoscaler/commands.py | 24 +++++- python/ray/autoscaler/docker.py | 109 +++++++++++++++++++++++++ test/autoscaler_test.py | 5 ++ 6 files changed, 166 insertions(+), 13 deletions(-) create mode 100644 python/ray/autoscaler/docker.py diff --git a/doc/source/autoscaling.rst b/doc/source/autoscaling.rst index 6208f36c7..dbe6aba9b 100644 --- a/doc/source/autoscaling.rst +++ b/doc/source/autoscaling.rst @@ -104,6 +104,15 @@ The ``example.yaml`` configuration is enough to get started with Ray, but for mo head_node: InstanceType: p2.8xlarge +**Docker**: Specify docker image. This executes all commands on all nodes in the docker container, +and opens all the necessary ports to support the Ray cluster. + +.. code-block:: yaml + + docker: + image: tensorflow/tensorflow:1.5.0-py3 + container_name: ray_docker + **Mixed GPU and CPU nodes**: for RL applications that require proportionally more CPU than GPU resources, you can use additional CPU workers with a GPU head node. diff --git a/python/ray/autoscaler/autoscaler.py b/python/ray/autoscaler/autoscaler.py index d47857982..813a7f83f 100644 --- a/python/ray/autoscaler/autoscaler.py +++ b/python/ray/autoscaler/autoscaler.py @@ -55,6 +55,13 @@ CLUSTER_CONFIG_SCHEMA = { # How Ray will authenticate with newly launched nodes. "auth": dict, + # Docker configuration. If this is specified, all setup and start commands + # will be executed in the container. + "docker": { + "image": str, # e.g. tensorflow/tensorflow:1.5.0-py3 + "container_name": str + }, + # Provider-specific config for the head node, e.g. instance type. "head_node": dict, diff --git a/python/ray/autoscaler/aws/example.yaml b/python/ray/autoscaler/aws/example.yaml index a6a024ae4..25fe8052c 100644 --- a/python/ray/autoscaler/aws/example.yaml +++ b/python/ray/autoscaler/aws/example.yaml @@ -3,11 +3,18 @@ cluster_name: default # The minimum number of workers nodes to launch in addition to the head # node. This number should be >= 0. -min_workers: 2 +min_workers: 1 # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. -max_workers: 4 +max_workers: 2 + +# This executes all commands on all nodes in the docker container, +# and opens all the necessary ports to support the Ray cluster. +# Empty string means disabled. +docker: + image: "" # e.g., tensorflow/tensorflow:1.5.0-py3 + container_name: "" # e.g. ray_docker # The autoscaler will scale up the cluster to this target fraction of resource # usage. For example, if a cluster of 10 nodes is 100% busy and @@ -41,10 +48,10 @@ head_node: ImageId: ami-3b6bce43 # Amazon Deep Learning AMI (Ubuntu) # You can provision additional disk space with a conf as follows - # BlockDeviceMappings: - # - DeviceName: /dev/sda1 - # Ebs: - # VolumeSize: 100 + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 50 # Additional options in the boto docs. @@ -77,7 +84,7 @@ setup_commands: # Note: if you're developing Ray, you probably want to create an AMI that # has your Ray repo pre-cloned. Then, you can replace the pip installs # below with a git checkout (and possibly a recompile). - - pip install -U ray==0.3.1 + - most_recent() { echo pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/$(aws s3 ls s3://ray-wheels --recursive | grep $1 | sort -r | head -n 1 | awk '{print $4}'); } && $( most_recent "cp36-cp36m-manylinux1" ) || $( most_recent "cp35-cp35m-manylinux1" ) # Custom commands that will be run on the head node after common setup. head_setup_commands: @@ -89,9 +96,9 @@ worker_setup_commands: [] # Command to start ray on the head node. You don't need to change this. head_start_ray_commands: - ray stop - - ray start --head --redis-port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml + - ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml # Command to start ray on worker nodes. You don't need to change this. worker_start_ray_commands: - ray stop - - ray start --redis-address=$RAY_HEAD_IP:6379 + - ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/commands.py b/python/ray/autoscaler/commands.py index c494a1443..9e61759c8 100644 --- a/python/ray/autoscaler/commands.py +++ b/python/ray/autoscaler/commands.py @@ -9,9 +9,14 @@ import time import sys import yaml +try: # py3 + from shlex import quote +except ImportError: # py2 + from pipes import quote from ray.autoscaler.autoscaler import validate_config, hash_runtime_conf, \ hash_launch_conf +from ray.autoscaler.docker import dockerize_if_needed from ray.autoscaler.node_provider import get_node_provider, NODE_PROVIDERS from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_LAUNCH_CONFIG, \ TAG_NAME @@ -23,8 +28,9 @@ def create_or_update_cluster( """Create or updates an autoscaling Ray cluster from a config json.""" config = yaml.load(open(config_file).read()) - validate_config(config) + dockerize_if_needed(config) + if override_min_workers is not None: config["min_workers"] = override_min_workers if override_max_workers is not None: @@ -44,10 +50,11 @@ def teardown_cluster(config_file): """Destroys all nodes of a Ray cluster described by a config json.""" config = yaml.load(open(config_file).read()) + validate_config(config) + dockerize_if_needed(config) confirm("This will destroy your cluster") - validate_config(config) provider = get_node_provider(config["provider"], config["cluster_name"]) head_node_tags = { TAG_RAY_NODE_TYPE: "Head", @@ -155,12 +162,21 @@ def get_or_create_head_node(config, no_restart): print( "Head node up-to-date, IP address is: {}".format( provider.external_ip(head_node))) + + monitor_str = "tail -f /tmp/raylogs/monitor-*" + for s in init_commands: + if ("ray start" in s and "docker exec" in s and + "--autoscaling-config" in s): + monitor_str = "docker exec {} /bin/sh -c {}".format( + config["docker"]["container_name"], + quote(monitor_str)) print( "To monitor auto-scaling activity, you can run:\n\n" - " ssh -i {} {}@{} 'tail -f /tmp/raylogs/monitor-*'\n".format( + " ssh -i {} {}@{} {}\n".format( config["auth"]["ssh_private_key"], config["auth"]["ssh_user"], - provider.external_ip(head_node))) + provider.external_ip(head_node), + quote(monitor_str))) print( "To login to the cluster, run:\n\n" " ssh -i {} {}@{}\n".format( diff --git a/python/ray/autoscaler/docker.py b/python/ray/autoscaler/docker.py new file mode 100644 index 000000000..d6b71d729 --- /dev/null +++ b/python/ray/autoscaler/docker.py @@ -0,0 +1,109 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +try: # py3 + from shlex import quote +except ImportError: # py2 + from pipes import quote + + +def dockerize_if_needed(config): + docker_image = config["docker"].get("image") + cname = config["docker"].get("container_name") + if not docker_image: + return config + else: + assert cname, "Must provide container name!" + docker_mounts = {dst: dst for dst in config["file_mounts"]} + config["setup_commands"] = ( + docker_install_cmds() + + docker_start_cmds( + config["auth"]["ssh_user"], docker_image, + docker_mounts, cname) + + with_docker_exec( + config["setup_commands"], container_name=cname)) + + config["head_setup_commands"] = with_docker_exec( + config["head_setup_commands"], container_name=cname) + config["head_start_ray_commands"] = ( + docker_autoscaler_setup(cname) + + with_docker_exec( + config["head_start_ray_commands"], container_name=cname)) + + config["worker_setup_commands"] = with_docker_exec( + config["worker_setup_commands"], container_name=cname) + config["worker_start_ray_commands"] = with_docker_exec( + config["worker_start_ray_commands"], container_name=cname, + env_vars=["RAY_HEAD_IP"]) + + return config + + +def with_docker_exec(cmds, container_name, env_vars=None): + env_str = "" + if env_vars: + env_str = " ".join( + ["-e {env}=${env}".format(env=env) for env in env_vars]) + return ["docker exec {} {} /bin/sh -c {} ".format( + env_str, container_name, quote(cmd)) for cmd in cmds] + + +def docker_install_cmds(): + return [aptwait_cmd() + " && sudo apt-get update", + aptwait_cmd() + " && sudo apt-get install -y docker.io"] + + +def aptwait_cmd(): + return ( + "while sudo fuser" + " /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock" + " >/dev/null 2>&1; " + "do echo 'Waiting for release of dpkg/apt locks'; sleep 5; done") + + +def docker_start_cmds(user, image, mount, cname): + cmds = [] + cmds.append("sudo kill -SIGUSR1 $(pidof dockerd) || true") + cmds.append("sudo service docker start") + cmds.append("sudo usermod -a -G docker {}".format(user)) + cmds.append("docker rm -f {} || true".format(cname)) + cmds.append("docker pull {}".format(image)) + + # create flags + # ports for the redis, object manager, and tune client + port_flags = " ".join(["-p {port}:{port}".format(port=port) + for port in ["6379", "8076", "4321"]]) + mount_flags = " ".join(["-v {src}:{dest}".format(src=k, dest=v) + for k, v in mount.items()]) + + # for click, used in ray cli + env_vars = {"LC_ALL": "C.UTF-8", "LANG": "C.UTF-8"} + env_flags = " ".join( + ["-e {name}={val}".format(name=k, val=v) for k, v in env_vars.items()]) + + # docker run command + docker_run = ["docker", "run", "--rm", "--name {}".format(cname), + "-d", "-it", port_flags, mount_flags, env_flags, + "--net=host", image, "bash"] + cmds.append(" ".join(docker_run)) + docker_update = [] + docker_update.append("apt-get -y update") + docker_update.append("apt-get -y upgrade") + docker_update.append("apt-get install -y git wget cmake psmisc") + cmds.extend(with_docker_exec(docker_update, container_name=cname)) + return cmds + + +def docker_autoscaler_setup(cname): + cmds = [] + for path in ["~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"]: + # needed because docker doesn't allow relative paths + base_path = os.path.basename(path) + cmds.append("docker cp {path} {cname}:{dpath}".format( + path=path, dpath=base_path, cname=cname)) + cmds.extend(with_docker_exec( + ["cp {} {}".format("/" + base_path, path)], + container_name=cname)) + return cmds diff --git a/test/autoscaler_test.py b/test/autoscaler_test.py index e2306546a..878a0013b 100644 --- a/test/autoscaler_test.py +++ b/test/autoscaler_test.py @@ -96,6 +96,11 @@ SMALL_CLUSTER = { "region": "us-east-1", "availability_zone": "us-east-1a", }, + "docker": { + "image": "example", + "container_name": "mock", + + }, "auth": { "ssh_user": "ubuntu", "ssh_private_key": "/dev/null",