[autoscaler] Docker Support (#1505)

This commit is contained in:
Richard Liaw
2018-02-20 00:24:01 -08:00
committed by GitHub
parent 0f766ae24b
commit 1cd2703cac
6 changed files with 166 additions and 13 deletions
+7
View File
@@ -55,6 +55,13 @@ CLUSTER_CONFIG_SCHEMA = {
# How Ray will authenticate with newly launched nodes.
"auth": dict,
# Docker configuration. If this is specified, all setup and start commands
# will be executed in the container.
"docker": {
"image": str, # e.g. tensorflow/tensorflow:1.5.0-py3
"container_name": str
},
# Provider-specific config for the head node, e.g. instance type.
"head_node": dict,
+16 -9
View File
@@ -3,11 +3,18 @@ cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 2
min_workers: 1
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 4
max_workers: 2
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
container_name: "" # e.g. ray_docker
# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -41,10 +48,10 @@ head_node:
ImageId: ami-3b6bce43 # Amazon Deep Learning AMI (Ubuntu)
# You can provision additional disk space with a conf as follows
# BlockDeviceMappings:
# - DeviceName: /dev/sda1
# Ebs:
# VolumeSize: 100
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 50
# Additional options in the boto docs.
@@ -77,7 +84,7 @@ setup_commands:
# Note: if you're developing Ray, you probably want to create an AMI that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
- pip install -U ray==0.3.1
- most_recent() { echo pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/$(aws s3 ls s3://ray-wheels --recursive | grep $1 | sort -r | head -n 1 | awk '{print $4}'); } && $( most_recent "cp36-cp36m-manylinux1" ) || $( most_recent "cp35-cp35m-manylinux1" )
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
@@ -89,9 +96,9 @@ worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ray start --head --redis-port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml
- ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ray start --redis-address=$RAY_HEAD_IP:6379
- ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
+20 -4
View File
@@ -9,9 +9,14 @@ import time
import sys
import yaml
try: # py3
from shlex import quote
except ImportError: # py2
from pipes import quote
from ray.autoscaler.autoscaler import validate_config, hash_runtime_conf, \
hash_launch_conf
from ray.autoscaler.docker import dockerize_if_needed
from ray.autoscaler.node_provider import get_node_provider, NODE_PROVIDERS
from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_LAUNCH_CONFIG, \
TAG_NAME
@@ -23,8 +28,9 @@ def create_or_update_cluster(
"""Create or updates an autoscaling Ray cluster from a config json."""
config = yaml.load(open(config_file).read())
validate_config(config)
dockerize_if_needed(config)
if override_min_workers is not None:
config["min_workers"] = override_min_workers
if override_max_workers is not None:
@@ -44,10 +50,11 @@ def teardown_cluster(config_file):
"""Destroys all nodes of a Ray cluster described by a config json."""
config = yaml.load(open(config_file).read())
validate_config(config)
dockerize_if_needed(config)
confirm("This will destroy your cluster")
validate_config(config)
provider = get_node_provider(config["provider"], config["cluster_name"])
head_node_tags = {
TAG_RAY_NODE_TYPE: "Head",
@@ -155,12 +162,21 @@ def get_or_create_head_node(config, no_restart):
print(
"Head node up-to-date, IP address is: {}".format(
provider.external_ip(head_node)))
monitor_str = "tail -f /tmp/raylogs/monitor-*"
for s in init_commands:
if ("ray start" in s and "docker exec" in s and
"--autoscaling-config" in s):
monitor_str = "docker exec {} /bin/sh -c {}".format(
config["docker"]["container_name"],
quote(monitor_str))
print(
"To monitor auto-scaling activity, you can run:\n\n"
" ssh -i {} {}@{} 'tail -f /tmp/raylogs/monitor-*'\n".format(
" ssh -i {} {}@{} {}\n".format(
config["auth"]["ssh_private_key"],
config["auth"]["ssh_user"],
provider.external_ip(head_node)))
provider.external_ip(head_node),
quote(monitor_str)))
print(
"To login to the cluster, run:\n\n"
" ssh -i {} {}@{}\n".format(
+109
View File
@@ -0,0 +1,109 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
try: # py3
from shlex import quote
except ImportError: # py2
from pipes import quote
def dockerize_if_needed(config):
docker_image = config["docker"].get("image")
cname = config["docker"].get("container_name")
if not docker_image:
return config
else:
assert cname, "Must provide container name!"
docker_mounts = {dst: dst for dst in config["file_mounts"]}
config["setup_commands"] = (
docker_install_cmds() +
docker_start_cmds(
config["auth"]["ssh_user"], docker_image,
docker_mounts, cname) +
with_docker_exec(
config["setup_commands"], container_name=cname))
config["head_setup_commands"] = with_docker_exec(
config["head_setup_commands"], container_name=cname)
config["head_start_ray_commands"] = (
docker_autoscaler_setup(cname) +
with_docker_exec(
config["head_start_ray_commands"], container_name=cname))
config["worker_setup_commands"] = with_docker_exec(
config["worker_setup_commands"], container_name=cname)
config["worker_start_ray_commands"] = with_docker_exec(
config["worker_start_ray_commands"], container_name=cname,
env_vars=["RAY_HEAD_IP"])
return config
def with_docker_exec(cmds, container_name, env_vars=None):
env_str = ""
if env_vars:
env_str = " ".join(
["-e {env}=${env}".format(env=env) for env in env_vars])
return ["docker exec {} {} /bin/sh -c {} ".format(
env_str, container_name, quote(cmd)) for cmd in cmds]
def docker_install_cmds():
return [aptwait_cmd() + " && sudo apt-get update",
aptwait_cmd() + " && sudo apt-get install -y docker.io"]
def aptwait_cmd():
return (
"while sudo fuser"
" /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock"
" >/dev/null 2>&1; "
"do echo 'Waiting for release of dpkg/apt locks'; sleep 5; done")
def docker_start_cmds(user, image, mount, cname):
cmds = []
cmds.append("sudo kill -SIGUSR1 $(pidof dockerd) || true")
cmds.append("sudo service docker start")
cmds.append("sudo usermod -a -G docker {}".format(user))
cmds.append("docker rm -f {} || true".format(cname))
cmds.append("docker pull {}".format(image))
# create flags
# ports for the redis, object manager, and tune client
port_flags = " ".join(["-p {port}:{port}".format(port=port)
for port in ["6379", "8076", "4321"]])
mount_flags = " ".join(["-v {src}:{dest}".format(src=k, dest=v)
for k, v in mount.items()])
# for click, used in ray cli
env_vars = {"LC_ALL": "C.UTF-8", "LANG": "C.UTF-8"}
env_flags = " ".join(
["-e {name}={val}".format(name=k, val=v) for k, v in env_vars.items()])
# docker run command
docker_run = ["docker", "run", "--rm", "--name {}".format(cname),
"-d", "-it", port_flags, mount_flags, env_flags,
"--net=host", image, "bash"]
cmds.append(" ".join(docker_run))
docker_update = []
docker_update.append("apt-get -y update")
docker_update.append("apt-get -y upgrade")
docker_update.append("apt-get install -y git wget cmake psmisc")
cmds.extend(with_docker_exec(docker_update, container_name=cname))
return cmds
def docker_autoscaler_setup(cname):
cmds = []
for path in ["~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"]:
# needed because docker doesn't allow relative paths
base_path = os.path.basename(path)
cmds.append("docker cp {path} {cname}:{dpath}".format(
path=path, dpath=base_path, cname=cname))
cmds.extend(with_docker_exec(
["cp {} {}".format("/" + base_path, path)],
container_name=cname))
return cmds