[tune,autoscaler] Test yaml, add better distributed docs (#5403)

This commit is contained in:
Richard Liaw
2019-08-08 00:59:23 -07:00
committed by GitHub
parent 1f8ae17f60
commit ed89897a31
10 changed files with 184 additions and 130 deletions
+34
View File
@@ -0,0 +1,34 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import unittest
import yaml
from ray.autoscaler.autoscaler import fillout_defaults, validate_config
from ray.tests.utils import recursive_fnmatch
# Resolves to the parent of the directory that contains this file.
RAY_PATH = os.path.abspath(os.path.join(__file__, "../../"))

# Every *.yaml file found under the autoscaler directory and the tune
# examples directory; these are the configs the test below validates.
CONFIG_PATHS = (
    recursive_fnmatch(os.path.join(RAY_PATH, "autoscaler"), "*.yaml") +
    recursive_fnmatch(os.path.join(RAY_PATH, "tune/examples/"), "*.yaml"))
class AutoscalingConfigTest(unittest.TestCase):
    """Checks that every YAML config listed in CONFIG_PATHS validates."""

    def testValidateDefaultConfig(self):
        """Load each config, fill in defaults, and run validation on it.

        The test fails on the first config that raises during validation,
        and the failure message names the file and the underlying error.
        """
        for config_path in CONFIG_PATHS:
            with open(config_path) as f:
                config = yaml.safe_load(f)
            config = fillout_defaults(config)
            try:
                validate_config(config)
            except Exception as e:
                # Many files are checked in one test, so the message must
                # identify which one failed and why.
                self.fail(
                    "Config {} did not pass validation test: {!r}".format(
                        config_path, e))
# When executed directly as a script, run this module's tests through
# unittest with verbosity level 2.
if __name__ == "__main__":
unittest.main(verbosity=2)
+13
View File
@@ -2,6 +2,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import fnmatch
import os
import subprocess
import sys
@@ -116,3 +117,15 @@ def wait_for_condition(condition_predictor,
time_elapsed += retry_interval_ms
time.sleep(retry_interval_ms / 1000.0)
return False
def recursive_fnmatch(dirpath, pattern):
    """Collect paths under ``dirpath`` whose filename matches ``pattern``.

    Walks the directory subtree with ``os.walk`` and applies
    ``fnmatch.filter`` to the filenames of each visited directory.
    Comparable to ``glob.glob(..., recursive=True)``, but also usable on
    Python 2.7.

    Args:
        dirpath: Root directory of the subtree to search.
        pattern: Filename pattern in ``fnmatch`` syntax, e.g. ``"*.yaml"``.

    Returns:
        A list of matching paths, each formed by joining the filename
        onto the directory it was found in.
    """
    return [
        os.path.join(parent, name)
        for parent, _, names in os.walk(dirpath)
        for name in fnmatch.filter(names, pattern)
    ]
+10 -51
View File
@@ -1,51 +1,10 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: tune-example
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 2
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# How Ray will authenticate with newly launched nodes.
# By default Ray creates a new private keypair, but you can also use your own.
auth:
ssh_user: ubuntu
# Provider-specific config for the head node, e.g. instance type.
head_node:
InstanceType: c5.xlarge
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0
# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
InstanceType: c5.xlarge
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# List of shell commands to run to set up each node.
setup_commands:
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.8.0.dev3-cp36-cp36m-manylinux1_x86_64.whl
- pip install torch torchvision tabulate tensorboard filelock
cluster_name: tune-default
provider: {type: aws, region: us-west-2}
auth: {ssh_user: ubuntu}
min_workers: 3
max_workers: 3
# Deep Learning AMI (Ubuntu) Version 21.0
head_node: {InstanceType: c5.xlarge, ImageId: ami-0b294f219d14e6a82}
worker_nodes: {InstanceType: c5.xlarge, ImageId: ami-0b294f219d14e6a82}
setup_commands: # Set up each node.
- pip install ray torch torchvision tabulate tensorboard
@@ -0,0 +1,11 @@
cluster_name: local-default
provider:
type: local
head_ip: YOUR_HEAD_NODE_HOSTNAME
worker_ips: [WORKER_NODE_1_HOSTNAME, WORKER_NODE_2_HOSTNAME, ... ]
auth: {ssh_user: YOUR_USERNAME, ssh_private_key: ~/.ssh/id_rsa}
## Typically for local clusters, min_workers == max_workers.
min_workers: 3
max_workers: 3
setup_commands: # Set up each node.
- pip install ray torch torchvision tabulate tensorboard
+2 -1
View File
@@ -10,7 +10,8 @@ import unittest
import ray
from ray import tune
from ray.tune.util import recursive_fnmatch, validate_save_restore
from ray.tests.utils import recursive_fnmatch
from ray.tune.util import validate_save_restore
from ray.rllib import _register_all
-14
View File
@@ -4,9 +4,7 @@ from __future__ import print_function
import base64
import copy
import fnmatch
import logging
import os
import threading
import time
from collections import defaultdict
@@ -213,18 +211,6 @@ def _from_pinnable(obj):
return obj[0]
def recursive_fnmatch(dirpath, pattern):
    """Find files in a directory subtree whose names match a pattern.

    Each directory yielded by ``os.walk(dirpath)`` has its filenames
    filtered with ``fnmatch.filter``; every hit is returned joined onto
    that directory. Works like ``glob.glob(..., recursive=True)`` while
    remaining usable on Python 2.7.

    Args:
        dirpath: Directory at the top of the subtree to search.
        pattern: ``fnmatch``-style filename pattern.

    Returns:
        List of paths to the matching files.
    """
    found = []
    for current_dir, _subdirs, files in os.walk(dirpath):
        hits = fnmatch.filter(files, pattern)
        found.extend(os.path.join(current_dir, hit) for hit in hits)
    return found
def validate_save_restore(trainable_cls, config=None, use_object_store=False):
"""Helper method to check if your Trainable class will resume correctly.