EC2 cluster setup scripts and initial version of auto-scaler (#1311)

This commit is contained in:
Eric Liang
2017-12-15 23:56:39 -08:00
committed by Robert Nishihara
parent 76b6b4a2d3
commit f5ea44338e
20 changed files with 1665 additions and 16 deletions
+1
View File
@@ -112,6 +112,7 @@ script:
- python test/runtest.py
- python test/array_test.py
- python test/actor_test.py
- python test/autoscaler_test.py
- python test/tensorflow_test.py
- python test/failure_test.py
- python test/microbenchmarks.py
+4 -4
View File
@@ -24,7 +24,7 @@ if [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "linux" ]]; then
wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh -nv
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q numpy cloudpickle==0.5.2 cython cmake funcsigs click colorama psutil redis tensorflow gym flatbuffers opencv-python
pip install -q numpy cloudpickle==0.5.2 cython cmake funcsigs click colorama psutil redis tensorflow gym flatbuffers opencv-python pyyaml
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
sudo apt-get update
sudo apt-get install -y cmake pkg-config python-dev python-numpy build-essential autoconf curl libtool unzip
@@ -32,7 +32,7 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh -nv
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q numpy cloudpickle==0.5.2 cython cmake funcsigs click colorama psutil redis tensorflow gym flatbuffers opencv-python
pip install -q numpy cloudpickle==0.5.2 cython cmake funcsigs click colorama psutil redis tensorflow gym flatbuffers opencv-python pyyaml
elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
# check that brew is installed
which -s brew
@@ -48,7 +48,7 @@ elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
wget https://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O miniconda.sh -nv
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q numpy cloudpickle==0.5.2 cython cmake funcsigs click colorama psutil redis tensorflow gym flatbuffers opencv-python
pip install -q numpy cloudpickle==0.5.2 cython cmake funcsigs click colorama psutil redis tensorflow gym flatbuffers opencv-python pyyaml
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
# check that brew is installed
which -s brew
@@ -64,7 +64,7 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh -nv
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q numpy cloudpickle==0.5.2 cython cmake funcsigs click colorama psutil redis tensorflow gym flatbuffers opencv-python
pip install -q numpy cloudpickle==0.5.2 cython cmake funcsigs click colorama psutil redis tensorflow gym flatbuffers opencv-python pyyaml
elif [[ "$LINT" == "1" ]]; then
sudo apt-get update
sudo apt-get install -y cmake build-essential autoconf curl libtool unzip
+1
View File
@@ -6,6 +6,7 @@ mock
numpy
opencv-python
pyarrow
pyyaml
psutil
recommonmark
redis
+88
View File
@@ -0,0 +1,88 @@
Cluster setup and auto-scaling (Experimental)
=============================================
Quick start
-----------
First, ensure you have configured your AWS credentials in ``~/.aws/credentials``,
as described in `the boto docs <http://boto3.readthedocs.io/en/latest/guide/configuration.html>`__.
Then you're ready to go. The provided `ray/python/ray/autoscaler/aws/example.yaml <https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/aws/example.yaml>`__ cluster config file will create a small cluster with a m4.large
head node (on-demand), and two m4.large `spot workers <https://aws.amazon.com/ec2/spot/>`__.
Try it out with these commands:
.. code-block:: bash
# Create or update the cluster
$ ray create_or_update ray/python/ray/autoscaler/aws/example.yaml
# Resize the cluster without interrupting running jobs
$ ray create_or_update ray/python/ray/autoscaler/aws/example.yaml \
--max-workers=N --sync-only
# Teardown the cluster
$ ray teardown ray/python/ray/autoscaler/aws/example.yaml
Common configurations
---------------------
Note: auto-scaling support is not fully implemented yet (targeted for 0.4.0).
The example configuration above is enough to get started with Ray, but for more
compute intensive workloads you will want to change the instance types to e.g.
use GPU or larger compute instance by editing the yaml file. Here are a few common
configurations:
**GPU single node**: use Ray on a single large GPU instance.
.. code-block:: yaml
max_workers: 0
head_node:
InstanceType: p2.8xlarge
**Mixed GPU and CPU nodes**: for RL applications that require proportionally more
CPU than GPU resources, you can use additional CPU workers with a GPU head node.
.. code-block:: yaml
max_workers: 10
head_node:
InstanceType: p2.8xlarge
worker_nodes:
InstanceType: m4.16xlarge
**Autoscaling CPU cluster**: use a small head node and have Ray auto-scale
workers as needed. This can be a cost-efficient configuration for clusters with
bursty workloads. You can also request spot workers for additional cost savings.
.. code-block:: yaml
min_workers: 0
max_workers: 10
head_node:
InstanceType: m4.large
worker_nodes:
InstanceMarketOptions:
MarketType: spot
InstanceType: m4.16xlarge
**Autoscaling GPU cluster**: similar to the autoscaling CPU cluster, but
with GPU worker nodes instead.
.. code-block:: yaml
min_workers: 0
max_workers: 10
head_node:
InstanceType: m4.large
worker_nodes:
InstanceMarketOptions:
MarketType: spot
InstanceType: p2.8xlarge
Additional Cloud providers
--------------------------
To use Ray autoscaling on other Cloud providers or cluster management systems, you can implement the ``NodeProvider`` interface
(~100 LOC) and register it in `node_provider.py <https://github.com/ray-project/ray/tree/master/python/ray/autoscaler/node_provider.py>`__.
+1
View File
@@ -84,6 +84,7 @@ Example Program
:maxdepth: 1
:caption: Cluster Usage
autoscaling.rst
using-ray-on-a-cluster.rst
using-ray-on-a-large-cluster.rst
using-ray-and-docker-on-a-cluster.md
View File
+330
View File
@@ -0,0 +1,330 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import hashlib
import os
import subprocess
import traceback
from collections import defaultdict
import yaml
from ray.autoscaler.node_provider import get_node_provider
from ray.autoscaler.updater import NodeUpdaterProcess
from ray.autoscaler.tags import TAG_RAY_LAUNCH_CONFIG, \
TAG_RAY_RUNTIME_CONFIG, TAG_RAY_NODE_STATUS, TAG_RAY_NODE_TYPE, TAG_NAME
import ray.services as services
CLUSTER_CONFIG_SCHEMA = {
# An unique identifier for the head node and workers of this cluster.
"cluster_name": str,
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
"min_workers": int,
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
"max_workers": int,
# Cloud-provider specific configuration.
"provider": {
"type": str, # e.g. aws
"region": str, # e.g. us-east-1
},
# How Ray will authenticate with newly launched nodes.
"auth": dict,
# Provider-specific config for the head node, e.g. instance type.
"head_node": dict,
# Provider-specific config for worker nodes. e.g. instance type.
"worker_nodes": dict,
# Map of remote paths to local paths, e.g. {"/tmp/data": "/my/local/data"}
"file_mounts": dict,
# List of shell commands to run to initialize the head node.
"head_init_commands": list,
# List of shell commands to run to initialize workers.
"worker_init_commands": list,
}
# Abort autoscaling if more than this number of errors are encountered. This
# is a safety feature to prevent e.g. runaway node launches.
MAX_NUM_FAILURES = 5
# Max number of nodes to launch at a time.
MAX_CONCURRENT_LAUNCHES = 10
class StandardAutoscaler(object):
"""The autoscaling control loop for a Ray cluster.
There are two ways to start an autoscaling cluster: manually by running
`ray start --head --autoscaling-config=/path/to/config.yaml` on a
instance that has permission to launch other instances, or you can also use
`ray create_or_update /path/to/config.yaml` from your laptop, which will
configure the right AWS/Cloud roles automatically.
StandardAutoscaler's `update` method is periodically called by `monitor.py`
to add and remove nodes as necessary. Currently, load-based autoscaling is
not implemented, so all this class does is try to maintain a constant
cluster size.
StandardAutoscaler is also used to bootstrap clusters (by adding workers
until the target cluster size is met).
"""
def __init__(
self, config_path,
max_concurrent_launches=MAX_CONCURRENT_LAUNCHES,
max_failures=MAX_NUM_FAILURES, process_runner=subprocess,
verbose_updates=False, node_updater_cls=NodeUpdaterProcess):
self.config_path = config_path
self.reload_config(errors_fatal=True)
self.provider = get_node_provider(
self.config["provider"], self.config["cluster_name"])
self.max_failures = max_failures
self.max_concurrent_launches = max_concurrent_launches
self.verbose_updates = verbose_updates
self.process_runner = process_runner
self.node_updater_cls = node_updater_cls
# Map from node_id to NodeUpdater processes
self.updaters = {}
self.num_failed_updates = defaultdict(int)
self.num_failures = 0
for local_path in self.config["file_mounts"].values():
assert os.path.exists(local_path)
print("StandardAutoscaler: {}".format(self.config))
def update(self):
try:
self.reload_config(errors_fatal=False)
self._update()
except Exception as e:
print(
"StandardAutoscaler: Error during autoscaling: {}",
traceback.format_exc())
self.num_failures += 1
if self.num_failures > self.max_failures:
print("*** StandardAutoscaler: Too many errors, abort. ***")
raise e
def _update(self):
nodes = self.workers()
target_num_workers = self.config["max_workers"]
# Terminate nodes while there are too many
while len(nodes) > target_num_workers:
print(
"StandardAutoscaler: Terminating unneeded node: "
"{}".format(nodes[-1]))
self.provider.terminate_node(nodes[-1])
nodes = self.workers()
print(self.debug_string())
if target_num_workers == 0:
return
# Update nodes with out-of-date files
for node_id in nodes:
self.update_if_needed(node_id)
# Launch a new node if needed
if len(nodes) < target_num_workers:
self.launch_new_node(
min(
self.max_concurrent_launches,
target_num_workers - len(nodes)))
print(self.debug_string())
return
else:
# If enough nodes, terminate an out-of-date node.
for node_id in nodes:
if not self.launch_config_ok(node_id):
print(
"StandardAutoscaler: Terminating outdated node: "
"{}".format(node_id))
self.provider.terminate_node(node_id)
print(self.debug_string())
return
# Process any completed updates
completed = []
for node_id, updater in self.updaters.items():
if not updater.is_alive():
completed.append(node_id)
if completed:
for node_id in completed:
if self.updaters[node_id].exitcode != 0:
self.num_failed_updates[node_id] += 1
del self.updaters[node_id]
print(self.debug_string())
def reload_config(self, errors_fatal=False):
try:
with open(self.config_path) as f:
new_config = yaml.load(f.read())
validate_config(new_config)
new_launch_hash = hash_launch_conf(
new_config["worker_nodes"], new_config["auth"])
new_runtime_hash = hash_runtime_conf(
new_config["file_mounts"], new_config["worker_init_commands"])
self.config = new_config
self.launch_hash = new_launch_hash
self.runtime_hash = new_runtime_hash
except Exception as e:
if errors_fatal:
raise e
else:
print(
"StandardAutoscaler: Error parsing config: {}",
traceback.format_exc())
def launch_config_ok(self, node_id):
launch_conf = self.provider.node_tags(node_id).get(
TAG_RAY_LAUNCH_CONFIG)
if self.launch_hash != launch_conf:
return False
return True
def files_up_to_date(self, node_id):
applied = self.provider.node_tags(node_id).get(TAG_RAY_RUNTIME_CONFIG)
if applied != self.runtime_hash:
print(
"StandardAutoscaler: {} has runtime state {}, want {}".format(
node_id, applied, self.runtime_hash))
return False
return True
def update_if_needed(self, node_id):
if not self.provider.is_running(node_id):
return
if not self.launch_config_ok(node_id):
return
if node_id in self.updaters:
return
if self.num_failed_updates.get(node_id, 0) > 0: # TODO(ekl) retry?
return
if self.files_up_to_date(node_id):
return
updater = self.node_updater_cls(
node_id,
self.config["provider"],
self.config["auth"],
self.config["cluster_name"],
self.config["file_mounts"],
with_head_node_ip(self.config["worker_init_commands"]),
self.runtime_hash,
redirect_output=not self.verbose_updates,
process_runner=self.process_runner)
updater.start()
self.updaters[node_id] = updater
def launch_new_node(self, count):
print("StandardAutoscaler: Launching {} new nodes".format(count))
num_before = len(self.workers())
self.provider.create_node(
self.config["worker_nodes"],
{
TAG_NAME: "ray-{}-worker".format(self.config["cluster_name"]),
TAG_RAY_NODE_TYPE: "Worker",
TAG_RAY_NODE_STATUS: "Uninitialized",
TAG_RAY_LAUNCH_CONFIG: self.launch_hash,
},
count)
# TODO(ekl) be less conservative in this check
assert len(self.workers()) > num_before, \
"Num nodes failed to increase after creating a new node"
def workers(self):
return self.provider.nodes(tag_filters={
TAG_RAY_NODE_TYPE: "Worker",
})
def debug_string(self, nodes=None):
if nodes is None:
nodes = self.workers()
target_num_workers = self.config["max_workers"]
suffix = ""
if self.updaters:
suffix += " ({} updating)".format(len(self.updaters))
if self.num_failed_updates:
suffix += " ({} failed to update)".format(
len(self.num_failed_updates))
return "StandardAutoscaler: Have {} / {} target nodes{}".format(
len(nodes), target_num_workers, suffix)
def validate_config(config, schema=CLUSTER_CONFIG_SCHEMA):
if type(config) is not dict:
raise ValueError("Config is not a dictionary")
for k, v in schema.items():
if k not in config:
raise ValueError(
"Missing required config key `{}` of type {}".format(
k, v.__name__))
if isinstance(v, type):
if not isinstance(config[k], v):
raise ValueError(
"Config key `{}` has wrong type {}, expected {}".format(
k, type(config[k]).__name__, v.__name__))
else:
validate_config(config[k], schema[k])
for k in config.keys():
if k not in schema:
raise ValueError(
"Unexpected config key `{}` not in {}".format(
k, schema.keys()))
def with_head_node_ip(cmds):
head_ip = services.get_node_ip_address()
out = []
for cmd in cmds:
out.append("export RAY_HEAD_IP={}; {}".format(head_ip, cmd))
return out
def hash_launch_conf(node_conf, auth):
hasher = hashlib.sha1()
hasher.update(json.dumps([node_conf, auth]).encode("utf-8"))
return hasher.hexdigest()
def hash_runtime_conf(file_mounts, extra_objs):
hasher = hashlib.sha1()
def add_content_hashes(path):
if os.path.isdir(path):
dirs = []
for dirpath, _, filenames in os.walk(path):
dirs.append((dirpath, sorted(filenames)))
for dirpath, filenames in sorted(dirs):
hasher.update(dirpath.encode("utf-8"))
for name in filenames:
hasher.update(name.encode("utf-8"))
with open(os.path.join(dirpath, name), "rb") as f:
hasher.update(f.read())
else:
with open(path, 'r') as f:
hasher.update(f.read().encode("utf-8"))
hasher.update(json.dumps(sorted(file_mounts.items())).encode("utf-8"))
hasher.update(json.dumps(extra_objs).encode("utf-8"))
for local_path in sorted(file_mounts.values()):
add_content_hashes(local_path)
return hasher.hexdigest()
+249
View File
@@ -0,0 +1,249 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import logging
import os
import time
import boto3
RAY = "ray-autoscaler"
DEFAULT_RAY_INSTANCE_PROFILE = RAY
DEFAULT_RAY_IAM_ROLE = RAY
SECURITY_GROUP_TEMPLATE = RAY + "-{}"
def key_pair(i):
"""Returns the ith default (aws_key_pair_name, key_pair_path)."""
if i == 0:
return RAY, os.path.expanduser("~/.ssh/{}.pem".format(RAY))
return (
"{}_{}".format(RAY, i),
os.path.expanduser("~/.ssh/{}_{}.pem".format(RAY, i)))
# Suppress excessive connection dropped logs from boto
logging.getLogger("botocore").setLevel(logging.WARNING)
def bootstrap_aws(config):
# The head node needs to have an IAM role that allows it to create further
# EC2 instances.
config = _configure_iam_role(config)
# Configure SSH access, using an existing key pair if possible.
config = _configure_key_pair(config)
# Pick a reasonable subnet if not specified by the user.
config = _configure_subnet(config)
# Cluster workers should be in a security group that permits traffic within
# the group, and also SSH access from outside.
config = _configure_security_group(config)
return config
def _configure_iam_role(config):
if "IamInstanceProfile" in config["head_node"]:
return config
profile = _get_instance_profile(DEFAULT_RAY_INSTANCE_PROFILE, config)
if profile is None:
print("Creating new instance profile {}".format(
DEFAULT_RAY_INSTANCE_PROFILE))
client = _client("iam", config)
client.create_instance_profile(
InstanceProfileName=DEFAULT_RAY_INSTANCE_PROFILE)
profile = _get_instance_profile(DEFAULT_RAY_INSTANCE_PROFILE, config)
time.sleep(15) # wait for propagation
assert profile is not None, "Failed to create instance profile"
if not profile.roles:
role = _get_role(DEFAULT_RAY_IAM_ROLE, config)
if role is None:
print("Creating new role {}".format(DEFAULT_RAY_IAM_ROLE))
iam = _resource("iam", config)
iam.create_role(
RoleName=DEFAULT_RAY_IAM_ROLE,
AssumeRolePolicyDocument=json.dumps({
"Statement": [
{
"Effect": "Allow",
"Principal": {"Service": "ec2.amazonaws.com"},
"Action": "sts:AssumeRole",
},
],
}))
role = _get_role(DEFAULT_RAY_IAM_ROLE, config)
assert role is not None, "Failed to create role"
role.attach_policy(
PolicyArn="arn:aws:iam::aws:policy/AmazonEC2FullAccess")
profile.add_role(RoleName=role.name)
time.sleep(15) # wait for propagation
print("Role not specified for head node, using {}".format(
profile.arn))
config["head_node"]["IamInstanceProfile"] = {"Arn": profile.arn}
return config
def _configure_key_pair(config):
if "ssh_private_key" in config["auth"]:
assert "KeyName" in config["head_node"]
assert "KeyName" in config["worker_nodes"]
return config
ec2 = _resource("ec2", config)
# Try a few times to get or create a good key pair.
for i in range(10):
key_name, key_path = key_pair(i)
key = _get_key(key_name, config)
# Found a good key.
if key and os.path.exists(key_path):
break
# We can safely create a new key.
if not key and not os.path.exists(key_path):
print("Creating new key pair {}".format(key_name))
key = ec2.create_key_pair(KeyName=key_name)
with open(key_path, "w") as f:
f.write(key.key_material)
os.chmod(key_path, 0o600)
break
assert key, "AWS keypair {} not found for {}".format(key_name, key_path)
assert os.path.exists(key_path), \
"Private key file {} not found for {}".format(key_path, key_name)
print("KeyName not specified for nodes, using {}".format(key_name))
config["auth"]["ssh_private_key"] = key_path
config["head_node"]["KeyName"] = key_name
config["worker_nodes"]["KeyName"] = key_name
return config
def _configure_subnet(config):
ec2 = _resource("ec2", config)
subnets = sorted(
[s for s in ec2.subnets.all() if s.state == "available"],
reverse=True, # sort from Z-A
key=lambda subnet: subnet.availability_zone)
default_subnet = subnets[0]
if "SubnetId" not in config["head_node"]:
config["head_node"]["SubnetId"] = default_subnet.id
print("SubnetId not specified for head node, using {} in {}".format(
default_subnet.id, default_subnet.availability_zone))
if "SubnetId" not in config["worker_nodes"]:
config["worker_nodes"]["SubnetId"] = default_subnet.id
print("SubnetId not specified for workers, using {} in {}".format(
default_subnet.id, default_subnet.availability_zone))
return config
def _configure_security_group(config):
if "SecurityGroupIds" in config["head_node"] and \
"SecurityGroupIds" in config["worker_nodes"]:
return config # have user-defined groups
group_name = SECURITY_GROUP_TEMPLATE.format(config["cluster_name"])
subnet = _get_subnet_or_die(config, config["worker_nodes"]["SubnetId"])
security_group = _get_security_group(config, subnet.vpc_id, group_name)
if security_group is None:
print("Creating new security group {}".format(group_name))
client = _client("ec2", config)
client.create_security_group(
Description="Auto-created security group for Ray workers",
GroupName=group_name,
VpcId=subnet.vpc_id)
security_group = _get_security_group(config, subnet.vpc_id, group_name)
assert security_group, "Failed to create security group"
if not security_group.ip_permissions:
security_group.authorize_ingress(
IpPermissions=[
{"FromPort": -1, "ToPort": -1, "IpProtocol": "-1",
"UserIdGroupPairs": [{"GroupId": security_group.id}]},
{"FromPort": 22, "ToPort": 22, "IpProtocol": "TCP",
"IpRanges": [{"CidrIp": "0.0.0.0/0"}]}])
if "SecurityGroupIds" not in config["head_node"]:
print("SecurityGroupIds not specified for head node, using {}".format(
security_group.group_name))
config["head_node"]["SecurityGroupIds"] = [security_group.id]
if "SecurityGroupIds" not in config["worker_nodes"]:
print("SecurityGroupIds not specified for workers, using {}".format(
security_group.group_name))
config["worker_nodes"]["SecurityGroupIds"] = [security_group.id]
return config
def _get_subnet_or_die(config, subnet_id):
ec2 = _resource("ec2", config)
subnet = list(
ec2.subnets.filter(Filters=[
{"Name": "subnet-id", "Values": [subnet_id]}]))
assert len(subnet) == 1, "Subnet not found"
subnet = subnet[0]
return subnet
def _get_security_group(config, vpc_id, group_name):
ec2 = _resource("ec2", config)
existing_groups = list(
ec2.security_groups.filter(Filters=[
{"Name": "vpc-id", "Values": [vpc_id]}]))
for sg in existing_groups:
if sg.group_name == group_name:
return sg
def _get_role(role_name, config):
iam = _resource("iam", config)
role = iam.Role(role_name)
try:
role.load()
return role
except Exception:
return None
def _get_instance_profile(profile_name, config):
iam = _resource("iam", config)
profile = iam.InstanceProfile(profile_name)
try:
profile.load()
return profile
except Exception:
return None
def _get_key(key_name, config):
ec2 = _resource("ec2", config)
for key in ec2.key_pairs.filter(
Filters=[{"Name": "key-name", "Values": [key_name]}]):
if key.name == key_name:
return key
def _client(name, config):
return boto3.client(name, config["provider"]["region"])
def _resource(name, config):
return boto3.resource(name, config["provider"]["region"])
+72
View File
@@ -0,0 +1,72 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: default
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# Cloud-provider specific configuration.
provider:
type: aws
region: us-east-1
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: m5.large
ImageId: ami-212d465b
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-212d465b
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# List of shell commands to run to initialize the head node.
head_init_commands:
- cd ~/ray; git remote add eric https://github.com/ericl/ray.git || true
- cd ~/ray; git fetch eric && git reset --hard e1e97b3
- yes | ~/anaconda3/bin/conda install boto3=1.4.8 # 1.4.8 adds InstanceMarketOptions
- ~/.local/bin/ray stop
- ~/.local/bin/ray start --head --redis-port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml
# List of shell commands to run to initialize workers.
worker_init_commands:
- cd ~/ray; git remote add eric https://github.com/ericl/ray.git || true
- cd ~/ray; git fetch eric && git reset --hard e1e97b3
- ~/.local/bin/ray stop
- ~/.local/bin/ray start --redis-address=$RAY_HEAD_IP:6379
@@ -0,0 +1,95 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import boto3
from ray.autoscaler.node_provider import NodeProvider
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
class AWSNodeProvider(NodeProvider):
def __init__(self, provider_config, cluster_name):
NodeProvider.__init__(self, provider_config, cluster_name)
self.ec2 = boto3.resource("ec2", region_name=provider_config["region"])
def nodes(self, tag_filters):
filters = [
{
"Name": "instance-state-name",
"Values": ["pending", "running"],
},
{
"Name": "tag:{}".format(TAG_RAY_CLUSTER_NAME),
"Values": [self.cluster_name],
},
]
for k, v in tag_filters.items():
filters.append({
"Name": "tag:{}".format(k),
"Values": [v],
})
instances = list(self.ec2.instances.filter(Filters=filters))
return [i.id for i in instances]
def is_running(self, node_id):
node = self._node(node_id)
return node.state["Name"] == "running"
def is_terminated(self, node_id):
node = self._node(node_id)
state = node.state["Name"]
return state not in ["running", "pending"]
def node_tags(self, node_id):
node = self._node(node_id)
tags = {}
for tag in node.tags:
tags[tag["Key"]] = tag["Value"]
return tags
def external_ip(self, node_id):
node = self._node(node_id)
return node.public_ip_address
def set_node_tags(self, node_id, tags):
node = self._node(node_id)
tag_pairs = []
for k, v in tags.items():
tag_pairs.append({
"Key": k, "Value": v,
})
node.create_tags(Tags=tag_pairs)
def create_node(self, node_config, tags, count):
conf = node_config.copy()
tag_pairs = [{
"Key": TAG_RAY_CLUSTER_NAME,
"Value": self.cluster_name,
}]
for k, v in tags.items():
tag_pairs.append(
{
"Key": k,
"Value": v,
})
conf.update({
"MinCount": 1,
"MaxCount": count,
"TagSpecifications": conf.get("TagSpecifications", []) + [
{
"ResourceType": "instance",
"Tags": tag_pairs,
}
]
})
self.ec2.create_instances(**conf)
def terminate_node(self, node_id):
node = self._node(node_id)
node.terminate()
def _node(self, node_id):
matches = list(self.ec2.instances.filter(InstanceIds=[node_id]))
assert len(matches) == 1, "Invalid instance id {}".format(node_id)
return matches[0]
+146
View File
@@ -0,0 +1,146 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import json
import tempfile
import time
import sys
import yaml
from ray.autoscaler.autoscaler import validate_config, hash_runtime_conf, \
hash_launch_conf
from ray.autoscaler.node_provider import get_node_provider, NODE_PROVIDERS
from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_LAUNCH_CONFIG, \
TAG_RAY_RUNTIME_CONFIG, TAG_NAME
from ray.autoscaler.updater import NodeUpdaterProcess
def create_or_update_cluster(
config_file, override_min_workers, override_max_workers, sync_only):
"""Create or updates an autoscaling Ray cluster from a config json."""
config = yaml.load(open(config_file).read())
validate_config(config)
if override_min_workers is not None:
config["min_workers"] = override_min_workers
if override_max_workers is not None:
config["max_workers"] = override_max_workers
if sync_only:
config["worker_init_commands"] = []
config["head_init_commands"] = []
importer = NODE_PROVIDERS.get(config["provider"]["type"])
if not importer:
raise NotImplementedError(
"Unsupported provider {}".format(config["provider"]))
bootstrap_config, _ = importer()
config = bootstrap_config(config)
get_or_create_head_node(config)
def teardown_cluster(config_file):
"""Destroys all nodes of a Ray cluster described by a config json."""
config = yaml.load(open(config_file).read())
validate_config(config)
provider = get_node_provider(config["provider"], config["cluster_name"])
head_node_tags = {
TAG_RAY_NODE_TYPE: "Head",
}
for node in provider.nodes(head_node_tags):
print("Terminating head node {}".format(node))
provider.terminate_node(node)
nodes = provider.nodes({})
while nodes:
for node in nodes:
print("Terminating worker {}".format(node))
provider.terminate_node(node)
time.sleep(5)
nodes = provider.nodes({})
def get_or_create_head_node(config):
"""Create the cluster head node, which in turn creates the workers."""
provider = get_node_provider(config["provider"], config["cluster_name"])
head_node_tags = {
TAG_RAY_NODE_TYPE: "Head",
}
nodes = provider.nodes(head_node_tags)
if len(nodes) > 0:
head_node = nodes[0]
else:
head_node = None
launch_hash = hash_launch_conf(config["head_node"], config["auth"])
if head_node is None or provider.node_tags(head_node).get(
TAG_RAY_LAUNCH_CONFIG) != launch_hash:
if head_node is not None:
print("Terminating outdated head node {}".format(head_node))
provider.terminate_node(head_node)
print("Launching new head node...")
head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
head_node_tags[TAG_NAME] = "ray-{}-head".format(config["cluster_name"])
provider.create_node(config["head_node"], head_node_tags, 1)
nodes = provider.nodes(head_node_tags)
assert len(nodes) == 1, "Failed to create head node."
head_node = nodes[0]
runtime_hash = hash_runtime_conf(config["file_mounts"], config)
if provider.node_tags(head_node).get(
TAG_RAY_RUNTIME_CONFIG) != runtime_hash:
print("Updating files on head node...")
# Rewrite the auth config so that the head node can update the workers
remote_key_path = "~/ray_bootstrap_key.pem"
remote_config = copy.deepcopy(config)
remote_config["auth"]["ssh_private_key"] = remote_key_path
# Adjust for new file locations
new_mounts = {}
for remote_path in config["file_mounts"]:
new_mounts[remote_path] = remote_path
remote_config["file_mounts"] = new_mounts
# Now inject the rewritten config and SSH key into the head node
remote_config_file = tempfile.NamedTemporaryFile(
"w", prefix="ray-bootstrap-")
remote_config_file.write(json.dumps(remote_config))
remote_config_file.flush()
config["file_mounts"].update({
remote_key_path: config["auth"]["ssh_private_key"],
"~/ray_bootstrap_config.yaml": remote_config_file.name
})
updater = NodeUpdaterProcess(
head_node,
config["provider"],
config["auth"],
config["cluster_name"],
config["file_mounts"],
config["head_init_commands"],
runtime_hash,
redirect_output=False)
updater.start()
updater.join()
if updater.exitcode != 0:
print("Error: updating {} failed".format(
provider.external_ip(head_node)))
sys.exit(1)
print(
"Head node up-to-date, IP address is: {}".format(
provider.external_ip(head_node)))
print(
"To monitor auto-scaling activity, you can run:\n\n"
" ssh -i {} {}@{} 'tail -f /tmp/raylogs/monitor-*'\n".format(
config["auth"]["ssh_private_key"],
config["auth"]["ssh_user"],
provider.external_ip(head_node)))
+83
View File
@@ -0,0 +1,83 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
def import_aws():
from ray.autoscaler.aws.config import bootstrap_aws
from ray.autoscaler.aws.node_provider import AWSNodeProvider
return bootstrap_aws, AWSNodeProvider
NODE_PROVIDERS = {
"aws": import_aws,
"gce": None, # TODO: support more node providers
"azure": None,
"kubernetes": None,
"docker": None,
"local_cluster": None,
}
def get_node_provider(provider_config, cluster_name):
importer = NODE_PROVIDERS.get(provider_config["type"])
if importer is None:
raise NotImplementedError(
"Unsupported node provider: {}".format(provider_config["type"]))
_, provider_cls = importer()
return provider_cls(provider_config, cluster_name)
class NodeProvider(object):
"""Interface for getting and returning nodes from a Cloud.
NodeProviders are namespaced by the `cluster_name` parameter; they only
operate on nodes within that namespace.
Nodes may be in one of three states: {pending, running, terminated}. Nodes
appear immediately once started by `create_node`, and transition
immediately to terminated when `terminate_node` is called.
"""
def __init__(self, provider_config, cluster_name):
self.provider_config = provider_config
self.cluster_name = cluster_name
def nodes(self, tag_filters):
"""Return a list of node ids filtered by the specified tags dict.
This list must not include terminated nodes.
Examples:
>>> provider.nodes({TAG_RAY_NODE_TYPE: "Worker"})
["node-1", "node-2"]
"""
raise NotImplementedError
def is_running(self, node_id):
"""Return whether the specified node is running."""
raise NotImplementedError
def is_terminated(self, node_id):
"""Return whether the specified node is terminated."""
raise NotImplementedError
def node_tags(self, node_id):
"""Returns the tags of the given node (string dict)."""
raise NotImplementedError
def external_ip(self, node_id):
"""Returns the external ip of the given node."""
raise NotImplementedError
def create_node(self, node_config, tags, count):
"""Creates a number of nodes within the namespace."""
raise NotImplementedError
def set_node_tags(self, node_id, tags):
"""Sets the tag values (string dict) for the specified node."""
raise NotImplementedError
def terminate_node(self, node_id):
"""Terminates the specified node."""
raise NotImplementedError
+23
View File
@@ -0,0 +1,23 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""The Ray autoscaler uses tags to associate metadata with instances."""
# Tag for the name of the node
TAG_NAME = "Name"
# Tag uniquely identifying all nodes of a cluster
TAG_RAY_CLUSTER_NAME = "ray:ClusterName"
# Tag for the type of node (e.g. Head, Worker)
TAG_RAY_NODE_TYPE = "ray:NodeType"
# Tag that reports the current state of the node (e.g. Updating, Up-to-date)
TAG_RAY_NODE_STATUS = "ray:NodeStatus"
# Hash of the node launch config, used to identify out-of-date nodes
TAG_RAY_LAUNCH_CONFIG = "ray:LaunchConfig"
# Hash of the node runtime config, used to determine if updates are needed
TAG_RAY_RUNTIME_CONFIG = "ray:RuntimeConfig"
+172
View File
@@ -0,0 +1,172 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import subprocess
import sys
import tempfile
import time
from multiprocessing import Process
from threading import Thread
from ray.autoscaler.node_provider import get_node_provider
from ray.autoscaler.tags import TAG_RAY_NODE_STATUS, TAG_RAY_RUNTIME_CONFIG
# How long to wait for a node to start, in seconds
NODE_START_WAIT_S = 300
class NodeUpdater(object):
"""A process for syncing files and running init commands on a node."""
def __init__(
self, node_id, provider_config, auth_config, cluster_name,
file_mounts, init_cmds, runtime_hash, redirect_output=True,
process_runner=subprocess):
self.daemon = True
self.process_runner = process_runner
self.provider = get_node_provider(provider_config, cluster_name)
self.ssh_private_key = auth_config["ssh_private_key"]
self.ssh_user = auth_config["ssh_user"]
self.ssh_ip = self.provider.external_ip(node_id)
self.node_id = node_id
self.file_mounts = file_mounts
self.init_cmds = init_cmds
self.runtime_hash = runtime_hash
if redirect_output:
self.logfile = tempfile.NamedTemporaryFile(
mode="w", prefix="node-updater-", delete=False)
self.output_name = self.logfile.name
self.stdout = self.logfile
self.stderr = self.logfile
else:
self.logfile = None
self.output_name = "(console)"
self.stdout = sys.stdout
self.stderr = sys.stderr
def run(self):
print("NodeUpdater: Updating {} to {}, logging to {}".format(
self.node_id, self.runtime_hash, self.output_name))
try:
self.do_update()
except Exception as e:
print(
"NodeUpdater: Error updating {}, "
"see {} for remote logs".format(e, self.output_name),
file=self.stdout)
self.provider.set_node_tags(
self.node_id, {TAG_RAY_NODE_STATUS: "UpdateFailed"})
if self.logfile is not None:
print(
"----- BEGIN REMOTE LOGS -----\n" +
open(self.logfile.name).read() +
"\n----- END REMOTE LOGS -----")
raise e
self.provider.set_node_tags(
self.node_id, {
TAG_RAY_NODE_STATUS: "Up-to-date",
TAG_RAY_RUNTIME_CONFIG: self.runtime_hash
})
print(
"NodeUpdater: Applied config {} to node {}".format(
self.runtime_hash, self.node_id),
file=self.stdout)
def do_update(self):
self.provider.set_node_tags(
self.node_id, {TAG_RAY_NODE_STATUS: "WaitingForSSH"})
deadline = time.time() + NODE_START_WAIT_S
# Wait for external IP
while time.time() < deadline and \
not self.provider.is_terminated(self.node_id):
print(
"NodeUpdater: Waiting for IP of {}...".format(self.node_id),
file=self.stdout)
self.ssh_ip = self.provider.external_ip(self.node_id)
if self.ssh_ip is not None:
break
time.sleep(5)
assert self.ssh_ip is not None, "Unable to find IP of node"
# Wait for SSH access
while time.time() < deadline and \
not self.provider.is_terminated(self.node_id):
try:
print(
"NodeUpdater: Waiting for SSH to {}...".format(
self.node_id),
file=self.stdout)
if not self.provider.is_running(self.node_id):
raise Exception()
self.ssh_cmd(
"uptime",
connect_timeout=5, redirect=open("/dev/null", "w"))
except Exception as e:
print(
"NodeUpdater: SSH not up, retrying: {}".format(e),
file=self.stdout)
time.sleep(5)
else:
break
assert not self.provider.is_terminated(self.node_id)
# Rsync file mounts
self.provider.set_node_tags(
self.node_id, {TAG_RAY_NODE_STATUS: "SyncingFiles"})
for remote_path, local_path in self.file_mounts.items():
print(
"NodeUpdater: Syncing {} to {}...".format(
local_path, remote_path),
file=self.stdout)
assert os.path.exists(local_path)
if os.path.isdir(local_path):
if not local_path.endswith("/"):
local_path += "/"
if not remote_path.endswith("/"):
remote_path += "/"
self.ssh_cmd(
"mkdir -p {}".format(os.path.dirname(remote_path)))
self.process_runner.check_call([
"rsync", "-e", "ssh -i {} ".format(self.ssh_private_key) +
"-o ConnectTimeout=60s -o StrictHostKeyChecking=no",
"--delete", "-avz", "{}".format(local_path),
"{}@{}:{}".format(self.ssh_user, self.ssh_ip, remote_path)
], stdout=self.stdout, stderr=self.stderr)
# Run init commands
self.provider.set_node_tags(
self.node_id, {TAG_RAY_NODE_STATUS: "RunningInitCmds"})
for cmd in self.init_cmds:
self.ssh_cmd(cmd, verbose=True)
def ssh_cmd(self, cmd, connect_timeout=60, redirect=None, verbose=False):
if verbose:
print(
"NodeUpdater: running {} on {}...".format(
cmd, self.ssh_ip),
file=self.stdout)
self.process_runner.check_call([
"ssh", "-o", "ConnectTimeout={}s".format(connect_timeout),
"-o", "StrictHostKeyChecking=no",
"-i", self.ssh_private_key,
"{}@{}".format(self.ssh_user, self.ssh_ip),
cmd,
], stdout=redirect or self.stdout, stderr=redirect or self.stderr)
class NodeUpdaterProcess(NodeUpdater, Process):
def __init__(self, *args, **kwargs):
Process.__init__(self)
NodeUpdater.__init__(self, *args, **kwargs)
# Single-threaded version for unit tests
class NodeUpdaterThread(NodeUpdater, Thread):
def __init__(self, *args, **kwargs):
Thread.__init__(self)
NodeUpdater.__init__(self, *args, **kwargs)
self.exitcode = 0
+21 -2
View File
@@ -5,6 +5,7 @@ from __future__ import print_function
import argparse
import json
import logging
import os
import time
from collections import Counter, defaultdict
@@ -15,6 +16,7 @@ import redis
from ray.core.generated.DriverTableMessage import DriverTableMessage
from ray.core.generated.SubscribeToDBClientTableReply import \
SubscribeToDBClientTableReply
from ray.autoscaler.autoscaler import StandardAutoscaler
from ray.core.generated.TaskInfo import TaskInfo
from ray.services import get_ip_address, get_port
from ray.utils import binary_to_hex, binary_to_object_id, hex_to_binary
@@ -75,7 +77,7 @@ class Monitor(object):
managers that were up at one point and have died since then.
"""
def __init__(self, redis_address, redis_port):
def __init__(self, redis_address, redis_port, autoscaling_config):
# Initialize the Redis clients.
self.state = ray.experimental.state.GlobalState()
self.state._initialize_global_state(redis_address, redis_port)
@@ -90,6 +92,10 @@ class Monitor(object):
self.dead_local_schedulers = set()
self.live_plasma_managers = Counter()
self.dead_plasma_managers = set()
if autoscaling_config:
self.autoscaler = StandardAutoscaler(autoscaling_config)
else:
self.autoscaler = None
def subscribe(self, channel):
"""Subscribe to the given channel.
@@ -556,6 +562,9 @@ class Monitor(object):
# Handle messages from the subscription channels.
while True:
# Process autoscaling actions
if self.autoscaler:
self.autoscaler.update()
# Record how many dead local schedulers and plasma managers we had
# at the beginning of this round.
num_dead_local_schedulers = len(self.dead_local_schedulers)
@@ -604,6 +613,11 @@ if __name__ == "__main__":
required=True,
type=str,
help="the address to use for Redis")
parser.add_argument(
"--autoscaling-config",
required=False,
type=str,
help="the path to the autoscaling config file")
args = parser.parse_args()
redis_ip_address = get_ip_address(args.redis_address)
@@ -612,5 +626,10 @@ if __name__ == "__main__":
# Initialize the global state.
ray.global_state._initialize_global_state(redis_ip_address, redis_port)
monitor = Monitor(redis_ip_address, redis_port)
if args.autoscaling_config:
autoscaling_config = os.path.expanduser(args.autoscaling_config)
else:
autoscaling_config = None
monitor = Monitor(redis_ip_address, redis_port, autoscaling_config)
monitor.run()
+33 -2
View File
@@ -7,6 +7,7 @@ import json
import subprocess
import ray.services as services
from ray.autoscaler.commands import create_or_update_cluster, teardown_cluster
def check_no_existing_redis_clients(node_ip_address, redis_client):
@@ -76,10 +77,12 @@ def cli():
help="object store directory for memory mapped files")
@click.option("--huge-pages", is_flag=True, default=False,
help="enable support for huge pages in the object store")
@click.option("--autoscaling-config", required=False, type=str,
help="the file that contains the autoscaling config")
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
redis_max_clients, object_manager_port, num_workers, num_cpus,
num_gpus, resources, head, no_ui, block, plasma_directory,
huge_pages):
huge_pages, autoscaling_config):
# Note that we redirect stdout and stderr to /dev/null because otherwise
# attempts to print may cause exceptions if a process is started inside of
# an SSH connection and the SSH connection dies. TODO(rkn): This is a
@@ -138,7 +141,8 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
redis_max_clients=redis_max_clients,
include_webui=(not no_ui),
plasma_directory=plasma_directory,
huge_pages=huge_pages)
huge_pages=huge_pages,
autoscaling_config=autoscaling_config)
print(address_info)
print("\nStarted Ray on this node. You can add additional nodes to "
"the cluster by calling\n\n"
@@ -238,8 +242,35 @@ def stop():
"awk '{ print $2 }') 2> /dev/null"], shell=True)
@click.command()
@click.argument("cluster_config_file", required=True, type=str)
@click.option(
"--sync-only", is_flag=True, default=False, help=(
"Whether to only perform the file sync stage when updating nodes. "
"This avoids interrupting running jobs. You can use this when "
"resizing the cluster with the min/max_workers flag."))
@click.option(
"--min-workers", required=False, type=int, help=(
"Override the configured min worker node count for the cluster."))
@click.option(
"--max-workers", required=False, type=int, help=(
"Override the configured max worker node count for the cluster."))
def create_or_update(
cluster_config_file, min_workers, max_workers, sync_only):
create_or_update_cluster(
cluster_config_file, min_workers, max_workers, sync_only)
@click.command()
@click.argument("cluster_config_file", required=True, type=str)
def teardown(cluster_config_file):
teardown_cluster(cluster_config_file)
cli.add_command(start)
cli.add_command(stop)
cli.add_command(create_or_update)
cli.add_command(teardown)
def main():
+19 -8
View File
@@ -4,6 +4,7 @@ from __future__ import print_function
import binascii
from collections import namedtuple, OrderedDict
from datetime import datetime
import cloudpickle
import json
import os
@@ -863,7 +864,7 @@ def start_worker(node_ip_address, object_store_name, object_store_manager_name,
def start_monitor(redis_address, node_ip_address, stdout_file=None,
stderr_file=None, cleanup=True):
stderr_file=None, cleanup=True, autoscaling_config=None):
"""Run a process to monitor the other processes.
Args:
@@ -878,12 +879,15 @@ def start_monitor(redis_address, node_ip_address, stdout_file=None,
then this process will be killed by services.cleanup() when the
Python process that imported services exits. This is True by
default.
autoscaling_config: path to autoscaling config file.
"""
monitor_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"monitor.py")
command = [sys.executable,
monitor_path,
"--redis-address=" + str(redis_address)]
if autoscaling_config:
command.append("--autoscaling-config=" + str(autoscaling_config))
p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
if cleanup:
all_processes[PROCESS_TYPE_WORKER].append(p)
@@ -908,7 +912,8 @@ def start_ray_processes(address_info=None,
start_workers_from_local_scheduler=True,
resources=None,
plasma_directory=None,
huge_pages=False):
huge_pages=False,
autoscaling_config=None):
"""Helper method to start Ray processes.
Args:
@@ -956,6 +961,7 @@ def start_ray_processes(address_info=None,
be created.
huge_pages: Boolean flag indicating whether to start the Object
Store with hugetlbfs support. Requires plasma_directory.
autoscaling_config: path to autoscaling config file.
Returns:
A dictionary of the address information for the processes that were
@@ -1006,7 +1012,8 @@ def start_ray_processes(address_info=None,
node_ip_address,
stdout_file=monitor_stdout_file,
stderr_file=monitor_stderr_file,
cleanup=cleanup)
cleanup=cleanup,
autoscaling_config=autoscaling_config)
if redis_shards == []:
# Get redis shards from primary redis instance.
@@ -1221,7 +1228,8 @@ def start_ray_head(address_info=None,
redis_max_clients=None,
include_webui=True,
plasma_directory=None,
huge_pages=False):
huge_pages=False,
autoscaling_config=None):
"""Start Ray in local mode.
Args:
@@ -1263,6 +1271,7 @@ def start_ray_head(address_info=None,
be created.
huge_pages: Boolean flag indicating whether to start the Object
Store with hugetlbfs support. Requires plasma_directory.
autoscaling_config: path to autoscaling config file.
Returns:
A dictionary of the address information for the processes that were
@@ -1287,7 +1296,8 @@ def start_ray_head(address_info=None,
num_redis_shards=num_redis_shards,
redis_max_clients=redis_max_clients,
plasma_directory=plasma_directory,
huge_pages=huge_pages)
huge_pages=huge_pages,
autoscaling_config=autoscaling_config)
def try_to_create_directory(directory_path):
@@ -1333,9 +1343,10 @@ def new_log_files(name, redirect_output):
# Create another directory that will be used by some of the RL algorithms.
try_to_create_directory("/tmp/ray")
log_id = random.randint(0, 1000000000)
log_stdout = "{}/{}-{:010d}.out".format(logs_dir, name, log_id)
log_stderr = "{}/{}-{:010d}.err".format(logs_dir, name, log_id)
log_id = random.randint(0, 10000)
date_str = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
log_stdout = "{}/{}-{}-{:05d}.out".format(logs_dir, name, date_str, log_id)
log_stderr = "{}/{}-{}-{:05d}.err".format(logs_dir, name, date_str, log_id)
log_stdout_file = open(log_stdout, "a")
log_stderr_file = open(log_stderr, "a")
return log_stdout_file, log_stderr_file
+1
View File
@@ -112,6 +112,7 @@ setup(name="ray",
"colorama",
"psutil",
"pytest",
"pyyaml",
"redis",
"cloudpickle == 0.5.2",
# The six module is required by pyarrow.
+326
View File
@@ -0,0 +1,326 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import shutil
import tempfile
import time
import unittest
import yaml
import ray
from ray.autoscaler.autoscaler import StandardAutoscaler
from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_NODE_STATUS
from ray.autoscaler.node_provider import NODE_PROVIDERS, NodeProvider
from ray.autoscaler.updater import NodeUpdaterThread
class MockNode(object):
def __init__(self, node_id, tags):
self.node_id = node_id
self.state = "pending"
self.tags = tags
self.external_ip = "1.2.3.4"
def matches(self, tags):
for k, v in tags.items():
if k not in self.tags or self.tags[k] != v:
return False
return True
class MockProcessRunner(object):
def __init__(self, fail_cmds=[]):
self.calls = []
self.fail_cmds = fail_cmds
def check_call(self, cmd, *args, **kwargs):
for token in self.fail_cmds:
if token in str(cmd):
raise Exception("Failing command on purpose")
self.calls.append(cmd)
class MockProvider(NodeProvider):
def __init__(self):
self.mock_nodes = {}
self.next_id = 0
self.throw = False
self.fail_creates = False
def nodes(self, tag_filters):
if self.throw:
raise Exception("oops")
return [
n.node_id for n in self.mock_nodes.values()
if n.matches(tag_filters) and n.state != "terminated"]
def is_running(self, node_id):
return self.mock_nodes[node_id].state == "running"
def is_terminated(self, node_id):
return self.mock_nodes[node_id].state == "terminated"
def node_tags(self, node_id):
return self.mock_nodes[node_id].tags
def external_ip(self, node_id):
return self.mock_nodes[node_id].external_ip
def create_node(self, node_config, tags, count):
if self.fail_creates:
return
for _ in range(count):
self.mock_nodes[self.next_id] = MockNode(self.next_id, tags)
self.next_id += 1
def set_node_tags(self, node_id, tags):
self.mock_nodes[node_id].tags.update(tags)
def terminate_node(self, node_id):
self.mock_nodes[node_id].state = "terminated"
SMALL_CLUSTER = {
"cluster_name": "default",
"min_workers": 2,
"max_workers": 2,
"provider": {
"type": "mock",
"region": "us-east-1",
},
"auth": {
"ssh_user": "ubuntu",
"ssh_private_key": "/dev/null",
},
"head_node": {
"TestProp": 1,
},
"worker_nodes": {
"TestProp": 2,
},
"file_mounts": {},
"head_init_commands": ["cmd1", "cmd2"],
"worker_init_commands": ["cmd1"],
}
class AutoscalingTest(unittest.TestCase):
def setUp(self):
NODE_PROVIDERS["mock"] = \
lambda: (None, self.create_provider)
self.provider = None
self.tmpdir = tempfile.mkdtemp()
def tearDown(self):
del NODE_PROVIDERS["mock"]
shutil.rmtree(self.tmpdir)
ray.worker.cleanup()
def waitFor(self, condition):
for _ in range(50):
if condition():
return
time.sleep(.1)
raise Exception("Timed out waiting for {}".format(condition))
def create_provider(self, config, cluster_name):
assert self.provider
return self.provider
def write_config(self, config):
path = self.tmpdir + "/simple.yaml"
with open(path, "w") as f:
f.write(yaml.dump(config))
return path
def testInvalidConfig(self):
invalid_config = "/dev/null"
self.assertRaises(
ValueError, lambda: StandardAutoscaler(invalid_config))
def testScaleUp(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
autoscaler = StandardAutoscaler(config_path, max_failures=0)
self.assertEqual(len(self.provider.nodes({})), 0)
autoscaler.update()
self.assertEqual(len(self.provider.nodes({})), 2)
autoscaler.update()
self.assertEqual(len(self.provider.nodes({})), 2)
def testTerminateOutdatedNodesGracefully(self):
config = SMALL_CLUSTER.copy()
config["min_workers"] = 5
config["max_workers"] = 5
config_path = self.write_config(config)
self.provider = MockProvider()
self.provider.create_node({}, {TAG_RAY_NODE_TYPE: "Worker"}, 10)
autoscaler = StandardAutoscaler(config_path, max_failures=0)
self.assertEqual(len(self.provider.nodes({})), 10)
# Gradually scales down to meet target size, never going too low
for _ in range(10):
autoscaler.update()
self.assertLessEqual(len(self.provider.nodes({})), 5)
self.assertGreaterEqual(len(self.provider.nodes({})), 4)
# Eventually reaches steady state
self.assertEqual(len(self.provider.nodes({})), 5)
def testDynamicScaling(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
autoscaler = StandardAutoscaler(
config_path, max_concurrent_launches=5, max_failures=0)
self.assertEqual(len(self.provider.nodes({})), 0)
autoscaler.update()
self.assertEqual(len(self.provider.nodes({})), 2)
# Update the config to reduce the cluster size
new_config = SMALL_CLUSTER.copy()
new_config["max_workers"] = 1
self.write_config(new_config)
autoscaler.update()
self.assertEqual(len(self.provider.nodes({})), 1)
# Update the config to reduce the cluster size
new_config["max_workers"] = 10
self.write_config(new_config)
autoscaler.update()
self.assertEqual(len(self.provider.nodes({})), 6)
autoscaler.update()
self.assertEqual(len(self.provider.nodes({})), 10)
def testLaunchConfigChange(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
autoscaler = StandardAutoscaler(config_path, max_failures=0)
autoscaler.update()
self.assertEqual(len(self.provider.nodes({})), 2)
# Update the config to change the node type
new_config = SMALL_CLUSTER.copy()
new_config["worker_nodes"]["InstanceType"] = "updated"
self.write_config(new_config)
existing_nodes = set(self.provider.nodes({}))
for _ in range(5):
autoscaler.update()
new_nodes = set(self.provider.nodes({}))
self.assertEqual(len(new_nodes), 2)
self.assertEqual(len(new_nodes.intersection(existing_nodes)), 0)
def testIgnoresCorruptedConfig(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
autoscaler = StandardAutoscaler(
config_path, max_concurrent_launches=10, max_failures=0)
autoscaler.update()
# Write a corrupted config
self.write_config("asdf")
for _ in range(10):
autoscaler.update()
self.assertEqual(len(self.provider.nodes({})), 2)
# New a good config again
new_config = SMALL_CLUSTER.copy()
new_config["max_workers"] = 10
self.write_config(new_config)
autoscaler.update()
self.assertEqual(len(self.provider.nodes({})), 10)
def testMaxFailures(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
self.provider.throw = True
autoscaler = StandardAutoscaler(config_path, max_failures=2)
autoscaler.update()
autoscaler.update()
self.assertRaises(Exception, autoscaler.update)
def testAbortOnCreationFailures(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
self.provider.fail_creates = True
autoscaler = StandardAutoscaler(config_path, max_failures=0)
self.assertRaises(AssertionError, autoscaler.update)
def testLaunchNewNodeOnOutOfBandTerminate(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
autoscaler = StandardAutoscaler(config_path, max_failures=0)
autoscaler.update()
autoscaler.update()
self.assertEqual(len(self.provider.nodes({})), 2)
for node in self.provider.mock_nodes.values():
node.state = "terminated"
self.assertEqual(len(self.provider.nodes({})), 0)
autoscaler.update()
self.assertEqual(len(self.provider.nodes({})), 2)
def testConfiguresNewNodes(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
runner = MockProcessRunner()
autoscaler = StandardAutoscaler(
config_path, max_failures=0, process_runner=runner,
verbose_updates=True, node_updater_cls=NodeUpdaterThread)
autoscaler.update()
autoscaler.update()
self.assertEqual(len(self.provider.nodes({})), 2)
for node in self.provider.mock_nodes.values():
node.state = "running"
assert len(self.provider.nodes(
{TAG_RAY_NODE_STATUS: "Uninitialized"})) == 2
autoscaler.update()
self.waitFor(
lambda: len(self.provider.nodes(
{TAG_RAY_NODE_STATUS: "Up-to-date"})) == 2)
def testReportsConfigFailures(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
runner = MockProcessRunner(fail_cmds=["cmd1"])
autoscaler = StandardAutoscaler(
config_path, max_failures=0, process_runner=runner,
verbose_updates=True, node_updater_cls=NodeUpdaterThread)
autoscaler.update()
autoscaler.update()
self.assertEqual(len(self.provider.nodes({})), 2)
for node in self.provider.mock_nodes.values():
node.state = "running"
assert len(self.provider.nodes(
{TAG_RAY_NODE_STATUS: "Uninitialized"})) == 2
autoscaler.update()
self.waitFor(
lambda: len(self.provider.nodes(
{TAG_RAY_NODE_STATUS: "UpdateFailed"})) == 2)
def testConfiguresOutdatedNodes(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
runner = MockProcessRunner()
autoscaler = StandardAutoscaler(
config_path, max_failures=0, process_runner=runner,
verbose_updates=True, node_updater_cls=NodeUpdaterThread)
autoscaler.update()
autoscaler.update()
self.assertEqual(len(self.provider.nodes({})), 2)
for node in self.provider.mock_nodes.values():
node.state = "running"
autoscaler.update()
self.waitFor(
lambda: len(self.provider.nodes(
{TAG_RAY_NODE_STATUS: "Up-to-date"})) == 2)
runner.calls = []
new_config = SMALL_CLUSTER.copy()
new_config["worker_init_commands"] = ["cmdX", "cmdY"]
self.write_config(new_config)
autoscaler.update()
autoscaler.update()
self.waitFor(lambda: len(runner.calls) > 0)
if __name__ == "__main__":
unittest.main(verbosity=2)