mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 08:07:54 +08:00
[tune,autoscaler] Test yaml, add better distributed docs (#5403)
This commit is contained in:
@@ -0,0 +1,34 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import unittest
|
||||
import yaml
|
||||
|
||||
from ray.autoscaler.autoscaler import fillout_defaults, validate_config
|
||||
from ray.tests.utils import recursive_fnmatch
|
||||
|
||||
# Directory two levels above this module's own path; the "autoscaler" and
# "tune/examples/" subtrees searched below are resolved relative to it.
RAY_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Every YAML file found under the two subtrees, autoscaler matches first.
CONFIG_PATHS = (
    recursive_fnmatch(os.path.join(RAY_PATH, "autoscaler"), "*.yaml") +
    recursive_fnmatch(os.path.join(RAY_PATH, "tune/examples/"), "*.yaml"))
|
||||
|
||||
|
||||
class AutoscalingConfigTest(unittest.TestCase):
    """Validates every YAML config listed in CONFIG_PATHS."""

    def testValidateDefaultConfig(self):
        """Load each config, fill in defaults, and run validate_config.

        Every config is checked even after an earlier one fails, and the
        failure message names each offending file together with the
        exception raised, so a broken YAML can be found without re-running
        the validation by hand.
        """
        failures = []
        for config_path in CONFIG_PATHS:
            with open(config_path) as f:
                config = yaml.safe_load(f)
            config = fillout_defaults(config)
            try:
                validate_config(config)
            except Exception as e:
                # Keep the path and the cause instead of discarding them.
                failures.append("{}: {!r}".format(config_path, e))

        if failures:
            self.fail("Config did not pass validation test!\n" +
                      "\n".join(failures))
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main(verbosity=2)
|
||||
@@ -2,6 +2,7 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import fnmatch
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
@@ -116,3 +117,15 @@ def wait_for_condition(condition_predictor,
|
||||
time_elapsed += retry_interval_ms
|
||||
time.sleep(retry_interval_ms / 1000.0)
|
||||
return False
|
||||
|
||||
|
||||
def recursive_fnmatch(dirpath, pattern):
    """Looks at a file directory subtree for a filename pattern.

    Similar to glob.glob(..., recursive=True) but also supports 2.7.

    Args:
        dirpath (str): Root of the directory subtree to walk.
        pattern (str): fnmatch-style pattern matched against each file's
            base name (not against its full path).

    Returns:
        List of paths, each file name joined onto the directory it was
        found in. Directories are visited top-down in sorted order and
        matches within a directory are sorted, so the result is
        deterministic. Empty if nothing matches.
    """
    matches = []
    for root, dirnames, filenames in os.walk(dirpath):
        # os.walk yields entries in arbitrary, filesystem-dependent order.
        # A top-down walk honours in-place edits to dirnames, so sorting it
        # here fixes the order in which subdirectories are visited.
        dirnames.sort()
        for filename in sorted(fnmatch.filter(filenames, pattern)):
            matches.append(os.path.join(root, filename))
    return matches
|
||||
|
||||
@@ -1,51 +1,10 @@
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: tune-example
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 2
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
# Availability zone(s), comma-separated, that nodes may be launched in.
|
||||
# Nodes are currently spread between zones by a round-robin approach,
|
||||
# however this implementation detail should not be relied upon.
|
||||
availability_zone: us-west-2a,us-west-2b
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
# By default Ray creates a new private keypair, but you can also use your own.
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type.
|
||||
head_node:
|
||||
InstanceType: c5.xlarge
|
||||
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type.
|
||||
worker_nodes:
|
||||
InstanceType: c5.xlarge
|
||||
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0
|
||||
|
||||
# Run workers on spot by default. Comment this out to use on-demand.
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
}
|
||||
|
||||
# List of shell commands to run to set up each node.
|
||||
setup_commands:
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.8.0.dev3-cp36-cp36m-manylinux1_x86_64.whl
|
||||
- pip install torch torchvision tabulate tensorboard filelock
|
||||
|
||||
cluster_name: tune-default
|
||||
provider: {type: aws, region: us-west-2}
|
||||
auth: {ssh_user: ubuntu}
|
||||
min_workers: 3
|
||||
max_workers: 3
|
||||
# Deep Learning AMI (Ubuntu) Version 21.0
|
||||
head_node: {InstanceType: c5.xlarge, ImageId: ami-0b294f219d14e6a82}
|
||||
worker_nodes: {InstanceType: c5.xlarge, ImageId: ami-0b294f219d14e6a82}
|
||||
setup_commands: # Set up each node.
|
||||
- pip install ray torch torchvision tabulate tensorboard
|
||||
|
||||
@@ -0,0 +1,11 @@
|
||||
cluster_name: local-default
|
||||
provider:
|
||||
type: local
|
||||
head_ip: YOUR_HEAD_NODE_HOSTNAME
|
||||
worker_ips: [WORKER_NODE_1_HOSTNAME, WORKER_NODE_2_HOSTNAME, ... ]
|
||||
auth: {ssh_user: YOUR_USERNAME, ssh_private_key: ~/.ssh/id_rsa}
|
||||
## Typically for local clusters, min_workers == max_workers.
|
||||
min_workers: 3
|
||||
max_workers: 3
|
||||
setup_commands: # Set up each node.
|
||||
- pip install ray torch torchvision tabulate tensorboard
|
||||
@@ -10,7 +10,8 @@ import unittest
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.tune.util import recursive_fnmatch, validate_save_restore
|
||||
from ray.tests.utils import recursive_fnmatch
|
||||
from ray.tune.util import validate_save_restore
|
||||
from ray.rllib import _register_all
|
||||
|
||||
|
||||
|
||||
@@ -4,9 +4,7 @@ from __future__ import print_function
|
||||
|
||||
import base64
|
||||
import copy
|
||||
import fnmatch
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from collections import defaultdict
|
||||
@@ -213,18 +211,6 @@ def _from_pinnable(obj):
|
||||
return obj[0]
|
||||
|
||||
|
||||
def recursive_fnmatch(dirpath, pattern):
    """Looks at a file directory subtree for a filename pattern.

    Similar to glob.glob(..., recursive=True) but also supports 2.7.

    Args:
        dirpath (str): Root of the directory subtree to walk.
        pattern (str): fnmatch-style pattern matched against each file's
            base name (not against its full path).

    Returns:
        List of paths, each file name joined onto the directory it was
        found in. Directories are visited top-down in sorted order and
        matches within a directory are sorted, so the result is
        deterministic. Empty if nothing matches.
    """
    matches = []
    for root, dirnames, filenames in os.walk(dirpath):
        # os.walk yields entries in arbitrary, filesystem-dependent order.
        # A top-down walk honours in-place edits to dirnames, so sorting it
        # here fixes the order in which subdirectories are visited.
        dirnames.sort()
        for filename in sorted(fnmatch.filter(filenames, pattern)):
            matches.append(os.path.join(root, filename))
    return matches
|
||||
|
||||
|
||||
def validate_save_restore(trainable_cls, config=None, use_object_store=False):
|
||||
"""Helper method to check if your Trainable class will resume correctly.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user