mirror of
https://github.com/wassname/ray.git
synced 2026-07-04 15:40:42 +08:00
[autoscaler/tune] Optional YAML Fields + Fix Pretty Printing for Tune (#1541)
This commit is contained in:
@@ -18,77 +18,85 @@ import yaml
|
||||
from ray.ray_constants import AUTOSCALER_MAX_NUM_FAILURES, \
|
||||
AUTOSCALER_MAX_CONCURRENT_LAUNCHES, AUTOSCALER_UPDATE_INTERVAL_S, \
|
||||
AUTOSCALER_HEARTBEAT_TIMEOUT_S
|
||||
from ray.autoscaler.node_provider import get_node_provider
|
||||
from ray.autoscaler.node_provider import get_node_provider, \
|
||||
get_default_config
|
||||
from ray.autoscaler.updater import NodeUpdaterProcess
|
||||
from ray.autoscaler.docker import dockerize_if_needed
|
||||
from ray.autoscaler.tags import TAG_RAY_LAUNCH_CONFIG, \
|
||||
TAG_RAY_RUNTIME_CONFIG, TAG_RAY_NODE_STATUS, TAG_RAY_NODE_TYPE, TAG_NAME
|
||||
import ray.services as services
|
||||
|
||||
REQUIRED, OPTIONAL = True, False
|
||||
|
||||
# For (a, b), if a is a dictionary object, then
|
||||
# no extra fields can be introduced.
|
||||
CLUSTER_CONFIG_SCHEMA = {
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
"cluster_name": str,
|
||||
"cluster_name": (str, REQUIRED),
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
"min_workers": int,
|
||||
"min_workers": (int, OPTIONAL),
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
"max_workers": int,
|
||||
"max_workers": (int, REQUIRED),
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of
|
||||
# resources usage. For example, if a cluster of 8 nodes is 100% busy
|
||||
# and target_utilization was 0.8, it would resize the cluster to 10.
|
||||
"target_utilization_fraction": float,
|
||||
"target_utilization_fraction": (float, OPTIONAL),
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
"idle_timeout_minutes": int,
|
||||
"idle_timeout_minutes": (int, OPTIONAL),
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
"provider": {
|
||||
"type": str, # e.g. aws
|
||||
"region": str, # e.g. us-east-1
|
||||
"availability_zone": str, # e.g. us-east-1a
|
||||
},
|
||||
"provider": ({
|
||||
"type": (str, REQUIRED), # e.g. aws
|
||||
"region": (str, REQUIRED), # e.g. us-east-1
|
||||
"availability_zone": (str, REQUIRED), # e.g. us-east-1a
|
||||
}, REQUIRED),
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
"auth": dict,
|
||||
"auth": ({
|
||||
"ssh_user": (str, REQUIRED), # e.g. ubuntu
|
||||
"ssh_private_key": (str, OPTIONAL),
|
||||
}, REQUIRED),
|
||||
|
||||
# Docker configuration. If this is specified, all setup and start commands
|
||||
# will be executed in the container.
|
||||
"docker": {
|
||||
"image": str, # e.g. tensorflow/tensorflow:1.5.0-py3
|
||||
"container_name": str
|
||||
},
|
||||
"docker": ({
|
||||
"image": (str, OPTIONAL), # e.g. tensorflow/tensorflow:1.5.0-py3
|
||||
"container_name": (str, OPTIONAL), # e.g., ray_docker
|
||||
}, OPTIONAL),
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type.
|
||||
"head_node": dict,
|
||||
"head_node": (dict, OPTIONAL),
|
||||
|
||||
# Provider-specific config for worker nodes. e.g. instance type.
|
||||
"worker_nodes": dict,
|
||||
"worker_nodes": (dict, OPTIONAL),
|
||||
|
||||
# Map of remote paths to local paths, e.g. {"/tmp/data": "/my/local/data"}
|
||||
"file_mounts": dict,
|
||||
"file_mounts": (dict, OPTIONAL),
|
||||
|
||||
# List of common shell commands to run to initialize nodes.
|
||||
"setup_commands": list,
|
||||
"setup_commands": (list, OPTIONAL),
|
||||
|
||||
# Commands that will be run on the head node after common setup.
|
||||
"head_setup_commands": list,
|
||||
"head_setup_commands": (list, OPTIONAL),
|
||||
|
||||
# Commands that will be run on worker nodes after common setup.
|
||||
"worker_setup_commands": list,
|
||||
"worker_setup_commands": (list, OPTIONAL),
|
||||
|
||||
# Command to start ray on the head node. You shouldn't need to modify this.
|
||||
"head_start_ray_commands": list,
|
||||
"head_start_ray_commands": (list, OPTIONAL),
|
||||
|
||||
# Command to start ray on worker nodes. You shouldn't need to modify this.
|
||||
"worker_start_ray_commands": list,
|
||||
"worker_start_ray_commands": (list, OPTIONAL),
|
||||
|
||||
# Whether to avoid restarting the cluster during updates. This field is
|
||||
# controlled by the ray --no-restart flag and cannot be set by the user.
|
||||
"no_restart": None,
|
||||
"no_restart": (None, OPTIONAL),
|
||||
}
|
||||
|
||||
|
||||
@@ -474,28 +482,57 @@ def typename(v):
|
||||
return type(v).__name__
|
||||
|
||||
|
||||
def validate_config(config, schema=CLUSTER_CONFIG_SCHEMA):
|
||||
def check_required(config, schema):
|
||||
# Check required schema entries
|
||||
if type(config) is not dict:
|
||||
raise ValueError("Config is not a dictionary")
|
||||
for k, v in schema.items():
|
||||
|
||||
for k, (v, kreq) in schema.items():
|
||||
if v is None:
|
||||
continue # None means we don't validate the field
|
||||
if k not in config:
|
||||
if kreq is REQUIRED:
|
||||
if k not in config:
|
||||
type_str = typename(v)
|
||||
raise ValueError(
|
||||
"Missing required config key `{}` of type {}".format(
|
||||
k, type_str))
|
||||
if not isinstance(v, type):
|
||||
check_required(config[k], v)
|
||||
|
||||
|
||||
def check_extraneous(config, schema):
|
||||
"""Make sure all items of config are in schema"""
|
||||
if type(config) is not dict:
|
||||
raise ValueError("Config is not a dictionary")
|
||||
for k in config:
|
||||
if k not in schema:
|
||||
raise ValueError(
|
||||
"Missing required config key `{}` of type {}".format(
|
||||
k, typename(v)))
|
||||
"Unexpected config key `{}` not in {}".format(
|
||||
k, list(schema.keys())))
|
||||
v, kreq = schema[k]
|
||||
if isinstance(v, type):
|
||||
if not isinstance(config[k], v):
|
||||
raise ValueError(
|
||||
"Config key `{}` has wrong type {}, expected {}".format(
|
||||
k, type(config[k]).__name__, v.__name__))
|
||||
else:
|
||||
validate_config(config[k], schema[k])
|
||||
for k in config.keys():
|
||||
if k not in schema:
|
||||
raise ValueError(
|
||||
"Unexpected config key `{}` not in {}".format(
|
||||
k, schema.keys()))
|
||||
check_extraneous(config[k], v)
|
||||
|
||||
|
||||
def validate_config(config, schema=CLUSTER_CONFIG_SCHEMA):
|
||||
"""Required Dicts indicate that no extra fields can be introduced."""
|
||||
if type(config) is not dict:
|
||||
raise ValueError("Config is not a dictionary")
|
||||
|
||||
check_required(config, schema)
|
||||
check_extraneous(config, schema)
|
||||
|
||||
|
||||
def fillout_defaults(config):
    """Overlay the user's config on top of the provider's default config.

    The defaults are loaded via ``get_default_config`` for
    ``config["provider"]`` and then updated with ``config`` using a plain
    ``dict.update``, so every top-level key present in ``config`` replaces
    the default value wholesale. The merged dict is passed to
    ``dockerize_if_needed`` and then returned.

    Args:
        config: The user-supplied cluster config dict; must contain a
            ``"provider"`` entry.

    Returns:
        The merged config dict (the defaults object, updated with
        ``config``).
    """
    merged = get_default_config(config["provider"])
    merged.update(config)
    # The return value of dockerize_if_needed is ignored here; presumably it
    # adjusts `merged` in place -- confirm against ray.autoscaler.docker.
    dockerize_if_needed(merged)
    return merged
|
||||
|
||||
|
||||
def with_head_node_ip(cmds):
|
||||
|
||||
+5
-2
@@ -3,7 +3,7 @@ cluster_name: default
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 1
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
@@ -85,7 +85,10 @@ setup_commands:
|
||||
# Note: if you're developing Ray, you probably want to create an AMI that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
- source activate tensorflow_p36 && most_recent() { echo pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/$(aws s3 ls s3://ray-wheels --recursive | grep $1 | sort -r | head -n 1 | awk '{print $4}'); } && $( most_recent "cp36-cp36m-manylinux1" ) || $( most_recent "cp35-cp35m-manylinux1" )
|
||||
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.3.1-cp27-cp27mu-manylinux1_x86_64.whl
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.3.1-cp35-cp35m-manylinux1_x86_64.whl
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.3.1-cp36-cp36m-manylinux1_x86_64.whl
|
||||
# Consider uncommenting these if you also want to run apt-get commands during setup
|
||||
# - sudo pkill -9 apt-get || true
|
||||
# - sudo pkill -9 dpkg || true
|
||||
@@ -0,0 +1,17 @@
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: minimal
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
max_workers: 1
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
availability_zone: us-west-2a
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
|
||||
@@ -16,8 +16,7 @@ except ImportError: # py2
|
||||
from pipes import quote
|
||||
|
||||
from ray.autoscaler.autoscaler import validate_config, hash_runtime_conf, \
|
||||
hash_launch_conf
|
||||
from ray.autoscaler.docker import dockerize_if_needed
|
||||
hash_launch_conf, fillout_defaults
|
||||
from ray.autoscaler.node_provider import get_node_provider, NODE_PROVIDERS
|
||||
from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_LAUNCH_CONFIG, \
|
||||
TAG_NAME
|
||||
@@ -31,7 +30,7 @@ def create_or_update_cluster(
|
||||
|
||||
config = yaml.load(open(config_file).read())
|
||||
validate_config(config)
|
||||
dockerize_if_needed(config)
|
||||
config = fillout_defaults(config)
|
||||
|
||||
if override_min_workers is not None:
|
||||
config["min_workers"] = override_min_workers
|
||||
@@ -53,7 +52,7 @@ def teardown_cluster(config_file, yes):
|
||||
|
||||
config = yaml.load(open(config_file).read())
|
||||
validate_config(config)
|
||||
dockerize_if_needed(config)
|
||||
config = fillout_defaults(config)
|
||||
|
||||
confirm("This will destroy your cluster", yes)
|
||||
|
||||
|
||||
@@ -13,6 +13,8 @@ def dockerize_if_needed(config):
|
||||
docker_image = config["docker"].get("image")
|
||||
cname = config["docker"].get("container_name")
|
||||
if not docker_image:
|
||||
if cname:
|
||||
print("Container name given but no Docker image - continuing...")
|
||||
return config
|
||||
else:
|
||||
assert cname, "Must provide container name!"
|
||||
|
||||
@@ -2,6 +2,9 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import yaml
|
||||
|
||||
|
||||
def import_aws():
|
||||
from ray.autoscaler.aws.config import bootstrap_aws
|
||||
@@ -9,6 +12,12 @@ def import_aws():
|
||||
return bootstrap_aws, AWSNodeProvider
|
||||
|
||||
|
||||
def load_aws_config():
    """Return the path of ``example-full.yaml`` in the AWS autoscaler package.

    The path is built from the directory containing
    ``ray.autoscaler.aws.__file__``; the file itself is not opened here.
    """
    # Imported inside the function, exactly as the original does.
    import ray.autoscaler.aws as ray_aws
    aws_package_dir = os.path.dirname(ray_aws.__file__)
    return os.path.join(aws_package_dir, "example-full.yaml")
|
||||
|
||||
|
||||
NODE_PROVIDERS = {
|
||||
"aws": import_aws,
|
||||
"gce": None, # TODO: support more node providers
|
||||
@@ -18,6 +27,15 @@ NODE_PROVIDERS = {
|
||||
"local_cluster": None,
|
||||
}
|
||||
|
||||
DEFAULT_CONFIGS = {
|
||||
"aws": load_aws_config,
|
||||
"gce": None, # TODO: support more node providers
|
||||
"azure": None,
|
||||
"kubernetes": None,
|
||||
"docker": None,
|
||||
"local_cluster": None,
|
||||
}
|
||||
|
||||
|
||||
def get_node_provider(provider_config, cluster_name):
|
||||
importer = NODE_PROVIDERS.get(provider_config["type"])
|
||||
@@ -28,6 +46,18 @@ def get_node_provider(provider_config, cluster_name):
|
||||
return provider_cls(provider_config, cluster_name)
|
||||
|
||||
|
||||
def get_default_config(provider_config):
    """Load the default cluster config for the given provider.

    ``provider_config["type"]`` is looked up in ``DEFAULT_CONFIGS``. The
    registered entry is a callable that returns the path of a YAML file;
    that file is parsed and its contents are returned.

    Args:
        provider_config: Dict with at least a ``"type"`` key.

    Returns:
        The parsed contents of the provider's default YAML config.

    Raises:
        NotImplementedError: If the provider type has no registered loader
            (missing from ``DEFAULT_CONFIGS`` or mapped to ``None``).
    """
    locate_defaults = DEFAULT_CONFIGS.get(provider_config["type"])
    if locate_defaults is None:
        raise NotImplementedError(
            "Unsupported node provider: {}".format(provider_config["type"]))

    # NOTE(review): yaml.load is applied to the file path returned by the
    # registered loader; consider yaml.safe_load if that path could ever
    # point at untrusted content.
    with open(locate_defaults()) as config_file:
        return yaml.load(config_file)
|
||||
|
||||
|
||||
class NodeProvider(object):
|
||||
"""Interface for getting and returning nodes from a Cloud.
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ import csv
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import yaml
|
||||
|
||||
from ray.tune.result import TrainingResult
|
||||
from ray.tune.log_sync import get_syncer
|
||||
@@ -176,3 +177,14 @@ class _CustomEncoder(json.JSONEncoder):
|
||||
return float(value)
|
||||
if np.issubdtype(value, int):
|
||||
return int(value)
|
||||
|
||||
|
||||
def pretty_print(result):
    """Render a result namedtuple as block-style YAML text.

    The ``config`` field is blanked out and every field whose value is
    ``None`` is omitted. The remaining fields are serialized to JSON with
    ``_CustomEncoder`` and parsed back before being dumped as YAML.

    Args:
        result: A namedtuple with a ``config`` field (it must support
            ``_replace`` and ``_asdict``).

    Returns:
        A YAML string with ``default_flow_style=False``.
    """
    # Drop config from the pretty print.
    without_config = result._replace(config=None)
    populated = {
        field: value
        for field, value in without_config._asdict().items()
        if value is not None
    }

    # Round-trip through JSON so yaml.dump sees only what the JSON decoder
    # produces from _CustomEncoder's output.
    as_json = json.dumps(populated, cls=_CustomEncoder)
    return yaml.dump(json.loads(as_json), default_flow_style=False)
|
||||
|
||||
@@ -3,14 +3,8 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from collections import namedtuple
|
||||
import json
|
||||
import os
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
print("Could not import YAML module, falling back to JSON pretty-printing")
|
||||
yaml = None
|
||||
|
||||
"""
|
||||
When using ray.tune with custom training scripts, you must periodically report
|
||||
@@ -93,16 +87,4 @@ TrainingResult = namedtuple("TrainingResult", [
|
||||
])
|
||||
|
||||
|
||||
def pretty_print(result):
|
||||
result = result._replace(config=None) # drop config from pretty print
|
||||
out = {}
|
||||
for k, v in result._asdict().items():
|
||||
if v is not None:
|
||||
out[k] = v
|
||||
if yaml:
|
||||
return yaml.safe_dump(out, default_flow_style=False)
|
||||
else:
|
||||
return json.dumps(out) + "\n"
|
||||
|
||||
|
||||
TrainingResult.__new__.__defaults__ = (None,) * len(TrainingResult._fields)
|
||||
|
||||
@@ -11,9 +11,9 @@ import ray
|
||||
import os
|
||||
|
||||
from ray.tune import TuneError
|
||||
from ray.tune.logger import NoopLogger, UnifiedLogger
|
||||
from ray.tune.logger import NoopLogger, UnifiedLogger, pretty_print
|
||||
from ray.tune.registry import _default_registry, get_registry, TRAINABLE_CLASS
|
||||
from ray.tune.result import TrainingResult, DEFAULT_RESULTS_DIR, pretty_print
|
||||
from ray.tune.result import TrainingResult, DEFAULT_RESULTS_DIR
|
||||
from ray.utils import random_string, binary_to_hex
|
||||
|
||||
DEBUG_PRINT_INTERVAL = 5
|
||||
|
||||
@@ -33,6 +33,10 @@ ray_ui_files = [
|
||||
"ray/core/src/catapult_files/trace_viewer_full.html"
|
||||
]
|
||||
|
||||
ray_autoscaler_files = [
|
||||
"ray/autoscaler/aws/example-full.yaml"
|
||||
]
|
||||
|
||||
# The UI files are mandatory if the INCLUDE_UI environment variable equals 1.
|
||||
# Otherwise, they are optional.
|
||||
if "INCLUDE_UI" in os.environ and os.environ["INCLUDE_UI"] == "1":
|
||||
@@ -40,6 +44,8 @@ if "INCLUDE_UI" in os.environ and os.environ["INCLUDE_UI"] == "1":
|
||||
else:
|
||||
optional_ray_files += ray_ui_files
|
||||
|
||||
optional_ray_files += ray_autoscaler_files
|
||||
|
||||
extras = {
|
||||
"rllib": [
|
||||
"tensorflow", "pyyaml", "gym[atari]", "opencv-python",
|
||||
|
||||
Reference in New Issue
Block a user