[autoscaler/tune] Optional YAML Fields + Fix Pretty Printing for Tune (#1541)

This commit is contained in:
Richard Liaw
2018-03-04 23:35:58 -08:00
committed by GitHub
parent 061e435411
commit 162d063f0d
12 changed files with 199 additions and 70 deletions
+73 -36
View File
@@ -18,77 +18,85 @@ import yaml
from ray.ray_constants import AUTOSCALER_MAX_NUM_FAILURES, \
AUTOSCALER_MAX_CONCURRENT_LAUNCHES, AUTOSCALER_UPDATE_INTERVAL_S, \
AUTOSCALER_HEARTBEAT_TIMEOUT_S
from ray.autoscaler.node_provider import get_node_provider
from ray.autoscaler.node_provider import get_node_provider, \
get_default_config
from ray.autoscaler.updater import NodeUpdaterProcess
from ray.autoscaler.docker import dockerize_if_needed
from ray.autoscaler.tags import TAG_RAY_LAUNCH_CONFIG, \
TAG_RAY_RUNTIME_CONFIG, TAG_RAY_NODE_STATUS, TAG_RAY_NODE_TYPE, TAG_NAME
import ray.services as services
REQUIRED, OPTIONAL = True, False
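# Each schema value is a pair (a, b): a is the expected type (or a nested
# schema dict) and b is REQUIRED or OPTIONAL. If a is a dict, the config may
# not contain any keys beyond the ones listed in it.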
CLUSTER_CONFIG_SCHEMA = {
# A unique identifier for the head node and workers of this cluster.
"cluster_name": str,
"cluster_name": (str, REQUIRED),
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
"min_workers": int,
"min_workers": (int, OPTIONAL),
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
"max_workers": int,
"max_workers": (int, REQUIRED),
# The autoscaler will scale up the cluster to this target fraction of
# resource usage. For example, if a cluster of 8 nodes is 100% busy
# and target_utilization_fraction was 0.8, it would resize the cluster to 10.
"target_utilization_fraction": float,
"target_utilization_fraction": (float, OPTIONAL),
# If a node is idle for this many minutes, it will be removed.
"idle_timeout_minutes": int,
"idle_timeout_minutes": (int, OPTIONAL),
# Cloud-provider specific configuration.
"provider": {
"type": str, # e.g. aws
"region": str, # e.g. us-east-1
"availability_zone": str, # e.g. us-east-1a
},
"provider": ({
"type": (str, REQUIRED), # e.g. aws
"region": (str, REQUIRED), # e.g. us-east-1
"availability_zone": (str, REQUIRED), # e.g. us-east-1a
}, REQUIRED),
# How Ray will authenticate with newly launched nodes.
"auth": dict,
"auth": ({
"ssh_user": (str, REQUIRED), # e.g. ubuntu
"ssh_private_key": (str, OPTIONAL),
}, REQUIRED),
# Docker configuration. If this is specified, all setup and start commands
# will be executed in the container.
"docker": {
"image": str, # e.g. tensorflow/tensorflow:1.5.0-py3
"container_name": str
},
"docker": ({
"image": (str, OPTIONAL), # e.g. tensorflow/tensorflow:1.5.0-py3
"container_name": (str, OPTIONAL), # e.g., ray_docker
}, OPTIONAL),
# Provider-specific config for the head node, e.g. instance type.
"head_node": dict,
"head_node": (dict, OPTIONAL),
# Provider-specific config for worker nodes. e.g. instance type.
"worker_nodes": dict,
"worker_nodes": (dict, OPTIONAL),
# Map of remote paths to local paths, e.g. {"/tmp/data": "/my/local/data"}
"file_mounts": dict,
"file_mounts": (dict, OPTIONAL),
# List of common shell commands to run to initialize nodes.
"setup_commands": list,
"setup_commands": (list, OPTIONAL),
# Commands that will be run on the head node after common setup.
"head_setup_commands": list,
"head_setup_commands": (list, OPTIONAL),
# Commands that will be run on worker nodes after common setup.
"worker_setup_commands": list,
"worker_setup_commands": (list, OPTIONAL),
# Command to start ray on the head node. You shouldn't need to modify this.
"head_start_ray_commands": list,
"head_start_ray_commands": (list, OPTIONAL),
# Command to start ray on worker nodes. You shouldn't need to modify this.
"worker_start_ray_commands": list,
"worker_start_ray_commands": (list, OPTIONAL),
# Whether to avoid restarting the cluster during updates. This field is
# controlled by the ray --no-restart flag and cannot be set by the user.
"no_restart": None,
"no_restart": (None, OPTIONAL),
}
@@ -474,28 +482,57 @@ def typename(v):
return type(v).__name__
def validate_config(config, schema=CLUSTER_CONFIG_SCHEMA):
def check_required(config, schema):
# Check required schema entries
if type(config) is not dict:
raise ValueError("Config is not a dictionary")
for k, v in schema.items():
for k, (v, kreq) in schema.items():
if v is None:
continue # None means we don't validate the field
if k not in config:
if kreq is REQUIRED:
if k not in config:
type_str = typename(v)
raise ValueError(
"Missing required config key `{}` of type {}".format(
k, type_str))
if not isinstance(v, type):
check_required(config[k], v)
def check_extraneous(config, schema):
"""Make sure all items of config are in schema"""
if type(config) is not dict:
raise ValueError("Config is not a dictionary")
for k in config:
if k not in schema:
raise ValueError(
"Missing required config key `{}` of type {}".format(
k, typename(v)))
"Unexpected config key `{}` not in {}".format(
k, list(schema.keys())))
v, kreq = schema[k]
if isinstance(v, type):
if not isinstance(config[k], v):
raise ValueError(
"Config key `{}` has wrong type {}, expected {}".format(
k, type(config[k]).__name__, v.__name__))
else:
validate_config(config[k], schema[k])
for k in config.keys():
if k not in schema:
raise ValueError(
"Unexpected config key `{}` not in {}".format(
k, schema.keys()))
check_extraneous(config[k], v)
def validate_config(config, schema=CLUSTER_CONFIG_SCHEMA):
"""Required Dicts indicate that no extra fields can be introduced."""
if type(config) is not dict:
raise ValueError("Config is not a dictionary")
check_required(config, schema)
check_extraneous(config, schema)
def fillout_defaults(config):
    """Overlay the user's cluster config on the provider's default config.

    The defaults are looked up from the ``provider`` section of ``config``,
    then updated with every top-level key of ``config`` (a shallow
    ``update``, so a key present in ``config`` replaces the default value
    for that key). The merged config is passed through
    ``dockerize_if_needed`` before being returned.

    Args:
        config (dict): The user-supplied cluster config; must contain a
            ``"provider"`` entry.

    Returns:
        The default config object with the user's values applied.
    """
    provider_section = config["provider"]
    merged = get_default_config(provider_section)
    merged.update(config)
    dockerize_if_needed(merged)
    return merged
def with_head_node_ip(cmds):
@@ -3,7 +3,7 @@ cluster_name: default
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 1
min_workers: 0
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
@@ -85,7 +85,10 @@ setup_commands:
# Note: if you're developing Ray, you probably want to create an AMI that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
- source activate tensorflow_p36 && most_recent() { echo pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/$(aws s3 ls s3://ray-wheels --recursive | grep $1 | sort -r | head -n 1 | awk '{print $4}'); } && $( most_recent "cp36-cp36m-manylinux1" ) || $( most_recent "cp35-cp35m-manylinux1" )
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.3.1-cp27-cp27mu-manylinux1_x86_64.whl
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.3.1-cp35-cp35m-manylinux1_x86_64.whl
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.3.1-cp36-cp36m-manylinux1_x86_64.whl
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
@@ -0,0 +1,17 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: minimal
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers. min_workers defaults to 0.
max_workers: 1
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
availability_zone: us-west-2a
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
+3 -4
View File
@@ -16,8 +16,7 @@ except ImportError: # py2
from pipes import quote
from ray.autoscaler.autoscaler import validate_config, hash_runtime_conf, \
hash_launch_conf
from ray.autoscaler.docker import dockerize_if_needed
hash_launch_conf, fillout_defaults
from ray.autoscaler.node_provider import get_node_provider, NODE_PROVIDERS
from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_LAUNCH_CONFIG, \
TAG_NAME
@@ -31,7 +30,7 @@ def create_or_update_cluster(
config = yaml.load(open(config_file).read())
validate_config(config)
dockerize_if_needed(config)
config = fillout_defaults(config)
if override_min_workers is not None:
config["min_workers"] = override_min_workers
@@ -53,7 +52,7 @@ def teardown_cluster(config_file, yes):
config = yaml.load(open(config_file).read())
validate_config(config)
dockerize_if_needed(config)
config = fillout_defaults(config)
confirm("This will destroy your cluster", yes)
+2
View File
@@ -13,6 +13,8 @@ def dockerize_if_needed(config):
docker_image = config["docker"].get("image")
cname = config["docker"].get("container_name")
if not docker_image:
if cname:
print("Container name given but no Docker image - continuing...")
return config
else:
assert cname, "Must provide container name!"
+30
View File
@@ -2,6 +2,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import yaml
def import_aws():
from ray.autoscaler.aws.config import bootstrap_aws
@@ -9,6 +12,12 @@ def import_aws():
return bootstrap_aws, AWSNodeProvider
def load_aws_config():
    """Return the path of ``example-full.yaml`` in the AWS autoscaler package.

    The ``ray.autoscaler.aws`` package is imported inside the function and
    the path is built from the directory containing that package's
    ``__file__``.
    """
    import ray.autoscaler.aws as ray_aws

    package_dir = os.path.dirname(ray_aws.__file__)
    return os.path.join(package_dir, "example-full.yaml")
NODE_PROVIDERS = {
"aws": import_aws,
"gce": None, # TODO: support more node providers
@@ -18,6 +27,15 @@ NODE_PROVIDERS = {
"local_cluster": None,
}
# Maps a provider type to a zero-argument function that returns the path of
# that provider's default cluster config YAML. A value of None marks a
# provider type that has no default config yet.
DEFAULT_CONFIGS = {
"aws": load_aws_config,
"gce": None, # TODO: support more node providers
"azure": None,
"kubernetes": None,
"docker": None,
"local_cluster": None,
}
def get_node_provider(provider_config, cluster_name):
importer = NODE_PROVIDERS.get(provider_config["type"])
@@ -28,6 +46,18 @@ def get_node_provider(provider_config, cluster_name):
return provider_cls(provider_config, cluster_name)
def get_default_config(provider_config):
    """Load the default cluster config for the given provider.

    Args:
        provider_config (dict): The ``provider`` section of a cluster
            config; its ``"type"`` entry selects which defaults to load.

    Returns:
        The parsed contents of the provider's default YAML config file.

    Raises:
        NotImplementedError: If no default-config loader is registered for
            the provider type.
    """
    provider_type = provider_config["type"]
    load_config = DEFAULT_CONFIGS.get(provider_type)
    if load_config is None:
        raise NotImplementedError(
            "Unsupported node provider: {}".format(provider_type))
    path_to_default = load_config()
    with open(path_to_default) as f:
        # safe_load only builds plain YAML types. A bare yaml.load() can
        # construct arbitrary Python objects and needs an explicit Loader
        # on newer PyYAML releases.
        defaults = yaml.safe_load(f)
    return defaults
class NodeProvider(object):
"""Interface for getting and returning nodes from a Cloud.
+12
View File
@@ -6,6 +6,7 @@ import csv
import json
import numpy as np
import os
import yaml
from ray.tune.result import TrainingResult
from ray.tune.log_sync import get_syncer
@@ -176,3 +177,14 @@ class _CustomEncoder(json.JSONEncoder):
return float(value)
if np.issubdtype(value, int):
return int(value)
def pretty_print(result):
    """Render a training result as block-style YAML, omitting unset fields.

    Args:
        result: A namedtuple-like result object (it must support
            ``_replace`` and ``_asdict``) with a ``config`` field.

    Returns:
        str: A YAML document containing every field whose value is not
        None, with ``config`` always left out.
    """
    result = result._replace(config=None)  # drop config from pretty print
    out = {k: v for k, v in result._asdict().items() if v is not None}

    # Round-trip through JSON so values handled by _CustomEncoder become
    # plain JSON types before they reach the YAML dumper.
    cleaned = json.dumps(out, cls=_CustomEncoder)
    # safe_dump rather than dump: on Python 2, json.loads returns unicode
    # strings, which yaml.dump would emit with `!!python/unicode` tags.
    return yaml.safe_dump(json.loads(cleaned), default_flow_style=False)
-18
View File
@@ -3,14 +3,8 @@ from __future__ import division
from __future__ import print_function
from collections import namedtuple
import json
import os
try:
import yaml
except ImportError:
print("Could not import YAML module, falling back to JSON pretty-printing")
yaml = None
"""
When using ray.tune with custom training scripts, you must periodically report
@@ -93,16 +87,4 @@ TrainingResult = namedtuple("TrainingResult", [
])
def pretty_print(result):
result = result._replace(config=None) # drop config from pretty print
out = {}
for k, v in result._asdict().items():
if v is not None:
out[k] = v
if yaml:
return yaml.safe_dump(out, default_flow_style=False)
else:
return json.dumps(out) + "\n"
TrainingResult.__new__.__defaults__ = (None,) * len(TrainingResult._fields)
+2 -2
View File
@@ -11,9 +11,9 @@ import ray
import os
from ray.tune import TuneError
from ray.tune.logger import NoopLogger, UnifiedLogger
from ray.tune.logger import NoopLogger, UnifiedLogger, pretty_print
from ray.tune.registry import _default_registry, get_registry, TRAINABLE_CLASS
from ray.tune.result import TrainingResult, DEFAULT_RESULTS_DIR, pretty_print
from ray.tune.result import TrainingResult, DEFAULT_RESULTS_DIR
from ray.utils import random_string, binary_to_hex
DEBUG_PRINT_INTERVAL = 5
+6
View File
@@ -33,6 +33,10 @@ ray_ui_files = [
"ray/core/src/catapult_files/trace_viewer_full.html"
]
ray_autoscaler_files = [
"ray/autoscaler/aws/example-full.yaml"
]
# The UI files are mandatory if the INCLUDE_UI environment variable equals 1.
# Otherwise, they are optional.
if "INCLUDE_UI" in os.environ and os.environ["INCLUDE_UI"] == "1":
@@ -40,6 +44,8 @@ if "INCLUDE_UI" in os.environ and os.environ["INCLUDE_UI"] == "1":
else:
optional_ray_files += ray_ui_files
optional_ray_files += ray_autoscaler_files
extras = {
"rllib": [
"tensorflow", "pyyaml", "gym[atari]", "opencv-python",