[autoscaler/tune] Optional YAML Fields + Fix Pretty Printing for Tune (#1541)

2026-07-04 15:40:42 +08:00 · 2018-03-04 23:35:58 -08:00
parent 061e435411
commit 162d063f0d
12 changed files with 199 additions and 70 deletions
@@ -18,77 +18,85 @@ import yaml
 from ray.ray_constants import AUTOSCALER_MAX_NUM_FAILURES, \
    AUTOSCALER_MAX_CONCURRENT_LAUNCHES, AUTOSCALER_UPDATE_INTERVAL_S, \
    AUTOSCALER_HEARTBEAT_TIMEOUT_S
-from ray.autoscaler.node_provider import get_node_provider
+from ray.autoscaler.node_provider import get_node_provider, \
+    get_default_config
 from ray.autoscaler.updater import NodeUpdaterProcess
+from ray.autoscaler.docker import dockerize_if_needed
 from ray.autoscaler.tags import TAG_RAY_LAUNCH_CONFIG, \
    TAG_RAY_RUNTIME_CONFIG, TAG_RAY_NODE_STATUS, TAG_RAY_NODE_TYPE, TAG_NAME
 import ray.services as services

+REQUIRED, OPTIONAL = True, False

+# Each schema entry is a (type, REQUIRED | OPTIONAL) tuple. If the type is
+# a dict (a nested schema), keys outside that dict are rejected.
 CLUSTER_CONFIG_SCHEMA = {
    # An unique identifier for the head node and workers of this cluster.
-    "cluster_name": str,
+    "cluster_name": (str, REQUIRED),

    # The minimum number of workers nodes to launch in addition to the head
    # node. This number should be >= 0.
-    "min_workers": int,
+    "min_workers": (int, OPTIONAL),

    # The maximum number of workers nodes to launch in addition to the head
    # node. This takes precedence over min_workers.
-    "max_workers": int,
+    "max_workers": (int, REQUIRED),

    # The autoscaler will scale up the cluster to this target fraction of
    # resources usage. For example, if a cluster of 8 nodes is 100% busy
    # and target_utilization was 0.8, it would resize the cluster to 10.
-    "target_utilization_fraction": float,
+    "target_utilization_fraction": (float, OPTIONAL),

    # If a node is idle for this many minutes, it will be removed.
-    "idle_timeout_minutes": int,
+    "idle_timeout_minutes": (int, OPTIONAL),

    # Cloud-provider specific configuration.
-    "provider": {
-        "type": str,  # e.g. aws
-        "region": str,  # e.g. us-east-1
-        "availability_zone": str,  # e.g. us-east-1a
-    },
+    "provider": ({
+        "type": (str, REQUIRED),  # e.g. aws
+        "region": (str, REQUIRED),  # e.g. us-east-1
+        "availability_zone": (str, REQUIRED),  # e.g. us-east-1a
+    }, REQUIRED),

    # How Ray will authenticate with newly launched nodes.
-    "auth": dict,
+    "auth": ({
+        "ssh_user": (str, REQUIRED),  # e.g. ubuntu
+        "ssh_private_key": (str, OPTIONAL),
+    }, REQUIRED),

    # Docker configuration. If this is specified, all setup and start commands
    # will be executed in the container.
-    "docker": {
-        "image": str,  # e.g. tensorflow/tensorflow:1.5.0-py3
-        "container_name": str
-    },
+    "docker": ({
+        "image": (str, OPTIONAL),  # e.g. tensorflow/tensorflow:1.5.0-py3
+        "container_name": (str, OPTIONAL),  # e.g., ray_docker
+    }, OPTIONAL),

    # Provider-specific config for the head node, e.g. instance type.
-    "head_node": dict,
+    "head_node": (dict, OPTIONAL),

    # Provider-specific config for worker nodes. e.g. instance type.
-    "worker_nodes": dict,
+    "worker_nodes": (dict, OPTIONAL),

    # Map of remote paths to local paths, e.g. {"/tmp/data": "/my/local/data"}
-    "file_mounts": dict,
+    "file_mounts": (dict, OPTIONAL),

    # List of common shell commands to run to initialize nodes.
-    "setup_commands": list,
+    "setup_commands": (list, OPTIONAL),

    # Commands that will be run on the head node after common setup.
-    "head_setup_commands": list,
+    "head_setup_commands": (list, OPTIONAL),

    # Commands that will be run on worker nodes after common setup.
-    "worker_setup_commands": list,
+    "worker_setup_commands": (list, OPTIONAL),

    # Command to start ray on the head node. You shouldn't need to modify this.
-    "head_start_ray_commands": list,
+    "head_start_ray_commands": (list, OPTIONAL),

    # Command to start ray on worker nodes. You shouldn't need to modify this.
-    "worker_start_ray_commands": list,
+    "worker_start_ray_commands": (list, OPTIONAL),

    # Whether to avoid restarting the cluster during updates. This field is
    # controlled by the ray --no-restart flag and cannot be set by the user.
-    "no_restart": None,
+    "no_restart": (None, OPTIONAL),
 }


@@ -474,28 +482,57 @@ def typename(v):
        return type(v).__name__


-def validate_config(config, schema=CLUSTER_CONFIG_SCHEMA):
+def check_required(config, schema):
+    # Check required schema entries
    if type(config) is not dict:
        raise ValueError("Config is not a dictionary")
-    for k, v in schema.items():
+
+    for k, (v, kreq) in schema.items():
        if v is None:
            continue  # None means we don't validate the field
-        if k not in config:
+        if kreq is REQUIRED:
+            if k not in config:
+                type_str = typename(v)
+                raise ValueError(
+                    "Missing required config key `{}` of type {}".format(
+                        k, type_str))
+            if not isinstance(v, type):
+                check_required(config[k], v)
+
+
+def check_extraneous(config, schema):
+    """Make sure all items of config are in schema"""
+    if type(config) is not dict:
+        raise ValueError("Config is not a dictionary")
+    for k in config:
+        if k not in schema:
            raise ValueError(
-                "Missing required config key `{}` of type {}".format(
-                    k, typename(v)))
+                "Unexpected config key `{}` not in {}".format(
+                    k, list(schema.keys())))
+        v, kreq = schema[k]
        if isinstance(v, type):
            if not isinstance(config[k], v):
                raise ValueError(
                    "Config key `{}` has wrong type {}, expected {}".format(
                        k, type(config[k]).__name__, v.__name__))
        else:
-            validate_config(config[k], schema[k])
-    for k in config.keys():
-        if k not in schema:
-            raise ValueError(
-                "Unexpected config key `{}` not in {}".format(
-                    k, schema.keys()))
+            check_extraneous(config[k], v)
+
+
+def validate_config(config, schema=CLUSTER_CONFIG_SCHEMA):
+    """Validate required keys, reject unknown keys, check value types."""
+    if type(config) is not dict:
+        raise ValueError("Config is not a dictionary")
+
+    check_required(config, schema)
+    check_extraneous(config, schema)
+
+
+def fillout_defaults(config):
+    defaults = get_default_config(config["provider"])
+    defaults.update(config)
+    dockerize_if_needed(defaults)
+    return defaults


 def with_head_node_ip(cmds):
@@ -3,7 +3,7 @@ cluster_name: default

 # The minimum number of workers nodes to launch in addition to the head
 # node. This number should be >= 0.
-min_workers: 1
+min_workers: 0

 # The maximum number of workers nodes to launch in addition to the head
 # node. This takes precedence over min_workers.
@@ -85,7 +85,10 @@ setup_commands:
    # Note: if you're developing Ray, you probably want to create an AMI that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
-    - source activate tensorflow_p36 && most_recent() { echo pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/$(aws s3 ls s3://ray-wheels --recursive | grep $1 | sort -r | head -n 1 | awk '{print $4}'); } && $( most_recent "cp36-cp36m-manylinux1" ) || $( most_recent "cp35-cp35m-manylinux1" )
+    - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.3.1-cp27-cp27mu-manylinux1_x86_64.whl
+    # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.3.1-cp35-cp35m-manylinux1_x86_64.whl
+    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.3.1-cp36-cp36m-manylinux1_x86_64.whl
    # Consider uncommenting these if you also want to run apt-get commands during setup
    # - sudo pkill -9 apt-get || true
    # - sudo pkill -9 dpkg || true
@@ -0,0 +1,17 @@
+# A unique identifier for the head node and workers of this cluster.
+cluster_name: minimal
+
+# The maximum number of worker nodes to launch in addition to the head
+# node. This takes precedence over min_workers, which defaults to 0.
+max_workers: 1
+
+# Cloud-provider specific configuration.
+provider:
+    type: aws
+    region: us-west-2
+    availability_zone: us-west-2a
+
+# How Ray will authenticate with newly launched nodes.
+auth:
+    ssh_user: ubuntu
+
@@ -16,8 +16,7 @@ except ImportError:  # py2
    from pipes import quote

 from ray.autoscaler.autoscaler import validate_config, hash_runtime_conf, \
-    hash_launch_conf
-from ray.autoscaler.docker import dockerize_if_needed
+    hash_launch_conf, fillout_defaults
 from ray.autoscaler.node_provider import get_node_provider, NODE_PROVIDERS
 from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_LAUNCH_CONFIG, \
    TAG_NAME
@@ -31,7 +30,7 @@ def create_or_update_cluster(

    config = yaml.load(open(config_file).read())
    validate_config(config)
-    dockerize_if_needed(config)
+    config = fillout_defaults(config)

    if override_min_workers is not None:
        config["min_workers"] = override_min_workers
@@ -53,7 +52,7 @@ def teardown_cluster(config_file, yes):

    config = yaml.load(open(config_file).read())
    validate_config(config)
-    dockerize_if_needed(config)
+    config = fillout_defaults(config)

    confirm("This will destroy your cluster", yes)

@@ -13,6 +13,8 @@ def dockerize_if_needed(config):
    docker_image = config["docker"].get("image")
    cname = config["docker"].get("container_name")
    if not docker_image:
+        if cname:
+            print("Container name given but no Docker image - continuing...")
        return config
    else:
        assert cname, "Must provide container name!"
@@ -2,6 +2,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import os
+import yaml
+

 def import_aws():
    from ray.autoscaler.aws.config import bootstrap_aws
@@ -9,6 +12,12 @@ def import_aws():
    return bootstrap_aws, AWSNodeProvider


+def load_aws_config():
+    import ray.autoscaler.aws as ray_aws
+    return os.path.join(os.path.dirname(
+        ray_aws.__file__), "example-full.yaml")
+
+
 NODE_PROVIDERS = {
    "aws": import_aws,
    "gce": None,  # TODO: support more node providers
@@ -18,6 +27,15 @@ NODE_PROVIDERS = {
    "local_cluster": None,
 }

+DEFAULT_CONFIGS = {
+    "aws": load_aws_config,
+    "gce": None,  # TODO: support more node providers
+    "azure": None,
+    "kubernetes": None,
+    "docker": None,
+    "local_cluster": None,
+}
+

 def get_node_provider(provider_config, cluster_name):
    importer = NODE_PROVIDERS.get(provider_config["type"])
@@ -28,6 +46,18 @@ def get_node_provider(provider_config, cluster_name):
    return provider_cls(provider_config, cluster_name)


+def get_default_config(provider_config):
+    load_config = DEFAULT_CONFIGS.get(provider_config["type"])
+    if load_config is None:
+        raise NotImplementedError(
+            "Unsupported node provider: {}".format(provider_config["type"]))
+    path_to_default = load_config()
+    with open(path_to_default) as f:
+        defaults = yaml.load(f)
+
+    return defaults
+
+
 class NodeProvider(object):
    """Interface for getting and returning nodes from a Cloud.

@@ -6,6 +6,7 @@ import csv
 import json
 import numpy as np
 import os
+import yaml

 from ray.tune.result import TrainingResult
 from ray.tune.log_sync import get_syncer
@@ -176,3 +177,14 @@ class _CustomEncoder(json.JSONEncoder):
            return float(value)
        if np.issubdtype(value, int):
            return int(value)
+
+
+def pretty_print(result):
+    result = result._replace(config=None)  # drop config from pretty print
+    out = {}
+    for k, v in result._asdict().items():
+        if v is not None:
+            out[k] = v
+
+    cleaned = json.dumps(out, cls=_CustomEncoder)
+    return yaml.dump(json.loads(cleaned), default_flow_style=False)
@@ -3,14 +3,8 @@ from __future__ import division
 from __future__ import print_function

 from collections import namedtuple
-import json
 import os

-try:
-    import yaml
-except ImportError:
-    print("Could not import YAML module, falling back to JSON pretty-printing")
-    yaml = None

 """
 When using ray.tune with custom training scripts, you must periodically report
@@ -93,16 +87,4 @@ TrainingResult = namedtuple("TrainingResult", [
 ])


-def pretty_print(result):
-    result = result._replace(config=None)  # drop config from pretty print
-    out = {}
-    for k, v in result._asdict().items():
-        if v is not None:
-            out[k] = v
-    if yaml:
-        return yaml.safe_dump(out, default_flow_style=False)
-    else:
-        return json.dumps(out) + "\n"
-
-
 TrainingResult.__new__.__defaults__ = (None,) * len(TrainingResult._fields)
@@ -11,9 +11,9 @@ import ray
 import os

 from ray.tune import TuneError
-from ray.tune.logger import NoopLogger, UnifiedLogger
+from ray.tune.logger import NoopLogger, UnifiedLogger, pretty_print
 from ray.tune.registry import _default_registry, get_registry, TRAINABLE_CLASS
-from ray.tune.result import TrainingResult, DEFAULT_RESULTS_DIR, pretty_print
+from ray.tune.result import TrainingResult, DEFAULT_RESULTS_DIR
 from ray.utils import random_string, binary_to_hex

 DEBUG_PRINT_INTERVAL = 5
@@ -33,6 +33,10 @@ ray_ui_files = [
    "ray/core/src/catapult_files/trace_viewer_full.html"
 ]

+ray_autoscaler_files = [
+    "ray/autoscaler/aws/example-full.yaml"
+]
+
 # The UI files are mandatory if the INCLUDE_UI environment variable equals 1.
 # Otherwise, they are optional.
 if "INCLUDE_UI" in os.environ and os.environ["INCLUDE_UI"] == "1":
@@ -40,6 +44,8 @@ if "INCLUDE_UI" in os.environ and os.environ["INCLUDE_UI"] == "1":
 else:
    optional_ray_files += ray_ui_files

+optional_ray_files += ray_autoscaler_files
+
 extras = {
    "rllib": [
        "tensorflow", "pyyaml", "gym[atari]", "opencv-python",