mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 16:46:37 +08:00
deprecate useless fields in the cluster yaml. (#13637)
* prepare for head node * move command runner interface outside _private * remove space * Eric * flake * min_workers in multi node type * fixing edge cases * eric not idle * fix target_workers to consider min_workers of node types * idle timeout * minor * minor fix * test * lint * eric v2 * eric 3 * min_workers constraint before bin packing * Update resource_demand_scheduler.py * Revert "Update resource_demand_scheduler.py" This reverts commit 818a63a2c86d8437b3ef21c5035d701c1d1127b5. * reducing diff * make get_nodes_to_launch return a dict * merge * weird merge fix * auto fill instance types for AWS * Alex/Eric * Update doc/source/cluster/autoscaling.rst * merge autofill and input from user * logger.exception * make the yaml use the default autofill * docs Eric * remove test_autoscaler_yaml from windows tests * lets try changing the test a bit * return test * lets see * edward * Limit max launch concurrency * commenting frac TODO * move to resource demand scheduler * use STATUS UP TO DATE * Eric * make logger of gc freed refs debug instead of info * add cluster name to docker mount prefix directory * grrR * fix tests * moving docker directory to sdk * move the import to prevent circular dependency * smallf fix * ian * fix max launch concurrency bug to assume failing nodes as pending and consider only load_metric's connected nodes as running * small fix * deflake test_joblib * lint * placement groups bypass * remove space * Eric * first ocmmit * lint * exmaple * documentation * hmm * file path fix * fix test * some format issue in docs * modified docs * joblib strikes again on windows * add ability to not start autoscaler/monitor * a * remove worker_default * Remove default pod type from operator * Remove worker_default_node_type from rewrite_legacy_yaml_to_availble_node_types * deprecate useless fields Co-authored-by: Ameer Haj Ali <ameerhajali@ameers-mbp.lan> Co-authored-by: Alex Wu <alex@anyscale.io> Co-authored-by: Alex Wu <itswu.alex@gmail.com> Co-authored-by: 
Eric Liang <ekhliang@gmail.com> Co-authored-by: Ameer Haj Ali <ameerhajali@Ameers-MacBook-Pro.local> Co-authored-by: root <root@ip-172-31-56-188.us-west-2.compute.internal> Co-authored-by: Dmitri Gekhtman <dmitri.m.gekhtman@gmail.com>
This commit is contained in:
@@ -78,10 +78,7 @@ class ReportHead(dashboard_utils.DashboardHeadModule):
|
||||
|
||||
payload = {
|
||||
"min_workers": cfg["min_workers"],
|
||||
"max_workers": cfg["max_workers"],
|
||||
"initial_workers": cfg["initial_workers"],
|
||||
"autoscaling_mode": cfg["autoscaling_mode"],
|
||||
"idle_timeout_minutes": cfg["idle_timeout_minutes"],
|
||||
"max_workers": cfg["max_workers"]
|
||||
}
|
||||
|
||||
try:
|
||||
|
||||
@@ -9,23 +9,6 @@ min_workers: 1
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
|
||||
# The initial number of worker nodes to launch in addition to the head
|
||||
# node. When the cluster is first brought up (or when it is refreshed with a
|
||||
# subsequent `ray up`) this number of nodes will be started.
|
||||
initial_workers: 1
|
||||
|
||||
# Whether or not to autoscale aggressively. If this is enabled, if at any point
|
||||
# we would start more workers, we start at least enough to bring us to
|
||||
# initial_workers.
|
||||
autoscaling_mode: default
|
||||
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of resource
|
||||
# usage. For example, if a cluster of 10 nodes is 100% busy and
|
||||
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
|
||||
# can be decreased to increase the aggressiveness of upscaling.
|
||||
# This value must be less than 1.0 for scaling to happen.
|
||||
target_utilization_fraction: 0.48
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
"type": "string"
|
||||
},
|
||||
"min_workers": {
|
||||
"description": "The minimum number of workers nodes to launch in addition to the head node. This number should be >= 0",
|
||||
"description": "DEPRECATED. Use the per node_type min_workers field instead.",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
@@ -34,17 +34,17 @@
|
||||
"minimum": 0
|
||||
},
|
||||
"initial_workers": {
|
||||
"description": "The number of workers to launch initially, in addition to the head node.",
|
||||
"description": "DEPRECATED.",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"autoscaling_mode": {
|
||||
"description": "The mode of the autoscaler e.g. default, aggressive",
|
||||
"description": "DEPRECATED. Use upscaling_speed instead.",
|
||||
"type": "string",
|
||||
"enum": [ "default", "aggressive" ]
|
||||
},
|
||||
"target_utilization_fraction": {
|
||||
"description": "The autoscaler will scale up the cluster to this target fraction of resources usage. For example, if a cluster of 8 nodes is 100% busy # and target_utilization was 0.8, it would resize the cluster to 10.",
|
||||
"description": "DEPRECATED. Use upscaling_speed instead.",
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
@@ -254,6 +254,10 @@
|
||||
"type": "string",
|
||||
"description": "If using multiple node types, specifies the head node type."
|
||||
},
|
||||
"worker_default_node_type": {
|
||||
"type": "string",
|
||||
"description": "DEPRECATED."
|
||||
},
|
||||
"head_node": {
|
||||
"type": "object",
|
||||
"description": "Provider-specific config for the head node, e.g. instance type."
|
||||
|
||||
@@ -1,13 +1,10 @@
|
||||
cluster_name: default
|
||||
min_workers: 5
|
||||
max_workers: 5
|
||||
initial_workers: 5
|
||||
autoscaling_mode: default
|
||||
docker:
|
||||
image: 'anyscale/ray-ml:latest'
|
||||
container_name: ray_container
|
||||
pull_before_run: true
|
||||
target_utilization_fraction: 0.8
|
||||
idle_timeout_minutes: 5
|
||||
provider:
|
||||
type: aws
|
||||
|
||||
@@ -12,7 +12,6 @@ head_start_ray_commands:
|
||||
- ray stop
|
||||
- ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml
|
||||
idle_timeout_minutes: 5
|
||||
initial_workers: 1
|
||||
initialization_commands:
|
||||
- echo init
|
||||
max_workers: 2
|
||||
@@ -27,7 +26,6 @@ setup_commands:
|
||||
- echo a
|
||||
- echo b
|
||||
- echo ${echo hi}
|
||||
target_utilization_fraction: 0.9
|
||||
worker_nodes:
|
||||
ImageId: latest_dlami
|
||||
InstanceType: t1.micro
|
||||
|
||||
@@ -17,7 +17,6 @@ head_start_ray_commands:
|
||||
- ray stop
|
||||
- ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml
|
||||
idle_timeout_minutes: 5
|
||||
initial_workers: 1
|
||||
initialization_commands:
|
||||
- echo init
|
||||
max_workers: 2
|
||||
@@ -32,7 +31,6 @@ setup_commands:
|
||||
- echo a
|
||||
- echo b
|
||||
- echo ${echo hi}
|
||||
target_utilization_fraction: 0.9
|
||||
worker_nodes:
|
||||
ImageId: latest_dlami
|
||||
InstanceType: t3a.small
|
||||
|
||||
@@ -52,7 +52,6 @@ class OnPremCoordinatorServerTest(unittest.TestCase):
|
||||
"cluster_name": "random_name",
|
||||
"min_workers": 0,
|
||||
"max_workers": 0,
|
||||
"initial_workers": 0,
|
||||
"provider": {
|
||||
"type": "local",
|
||||
"head_ip": "0.0.0.0:2",
|
||||
@@ -154,7 +153,6 @@ class OnPremCoordinatorServerTest(unittest.TestCase):
|
||||
"cluster_name": "random_name",
|
||||
"min_workers": 0,
|
||||
"max_workers": 0,
|
||||
"initial_workers": 0,
|
||||
"provider": {
|
||||
"type": "local",
|
||||
"coordinator_address": self.coordinator_address,
|
||||
|
||||
@@ -4,11 +4,8 @@ cluster_name: sgd-tf
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
min_workers: 3
|
||||
initial_workers: 3
|
||||
max_workers: 3
|
||||
|
||||
target_utilization_fraction: 0.9
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 20
|
||||
# docker:
|
||||
|
||||
@@ -104,7 +104,6 @@ You can specify the number of nodes you want to use with the following configura
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
min_workers: <NUMBER_OF_NODES> # Change this to a custom quantity
|
||||
initial_workers: <NUMBER_OF_NODES> # same as above
|
||||
max_workers: <NUMBER_OF_NODES> # same as above
|
||||
|
||||
You may want to install FP16 support for PyTorch with the following configuration in the YAML file:
|
||||
|
||||
@@ -4,11 +4,8 @@ cluster_name: horovod-pytorch
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
min_workers: 1
|
||||
initial_workers: 1
|
||||
max_workers: 1
|
||||
|
||||
target_utilization_fraction: 0.9
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 50
|
||||
# docker:
|
||||
|
||||
@@ -4,11 +4,8 @@ cluster_name: sgd-pytorch
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
min_workers: 3
|
||||
initial_workers: 3
|
||||
max_workers: 3
|
||||
|
||||
target_utilization_fraction: 0.9
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 20
|
||||
# docker:
|
||||
|
||||
@@ -4,11 +4,8 @@ cluster_name: sgd-pytorch-imagenet
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
min_workers: 1
|
||||
initial_workers: 1
|
||||
max_workers: 1
|
||||
|
||||
target_utilization_fraction: 0.9
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 10
|
||||
# docker:
|
||||
|
||||
@@ -4,10 +4,8 @@ cluster_name: sgd-coco-pytorch
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
min_workers: 1
|
||||
initial_workers: 1
|
||||
max_workers: 1
|
||||
|
||||
target_utilization_fraction: 0.9
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
|
||||
@@ -4,11 +4,8 @@ cluster_name: sgd-pytorch
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
min_workers: 2
|
||||
initial_workers: 2
|
||||
max_workers: 2
|
||||
|
||||
target_utilization_fraction: 0.9
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 10
|
||||
# docker:
|
||||
|
||||
@@ -4,10 +4,8 @@ cluster_name: transformer-cluster
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
min_workers: 3
|
||||
initial_workers: 3
|
||||
max_workers: 3
|
||||
|
||||
target_utilization_fraction: 0.9
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
|
||||
@@ -10,8 +10,6 @@ min_workers: 3
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
max_workers: 3
|
||||
|
||||
target_utilization_fraction: 0.8
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ cluster_name: long-running-distributed-tests
|
||||
min_workers: 3
|
||||
max_workers: 3
|
||||
|
||||
target_utilization_fraction: 0.8
|
||||
idle_timeout_minutes: 15
|
||||
|
||||
docker:
|
||||
|
||||
@@ -3,7 +3,6 @@ cluster_name: ray-rllib-stress-tests
|
||||
min_workers: 9
|
||||
max_workers: 9
|
||||
|
||||
target_utilization_fraction: 0.8
|
||||
idle_timeout_minutes: 15
|
||||
|
||||
docker:
|
||||
|
||||
@@ -13,13 +13,6 @@ min_workers: 100
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 100
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of resource
|
||||
# usage. For example, if a cluster of 10 nodes is 100% busy and
|
||||
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
|
||||
# can be decreased to increase the aggressiveness of upscaling.
|
||||
# This value must be less than 1.0 for scaling to happen.
|
||||
target_utilization_fraction: 0.8
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
|
||||
@@ -13,13 +13,6 @@ min_workers: 100
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 100
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of resource
|
||||
# usage. For example, if a cluster of 10 nodes is 100% busy and
|
||||
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
|
||||
# can be decreased to increase the aggressiveness of upscaling.
|
||||
# This value must be less than 1.0 for scaling to happen.
|
||||
target_utilization_fraction: 0.8
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
|
||||
@@ -2,9 +2,7 @@ cluster_name: ray-tune-scalability-tests
|
||||
|
||||
min_workers: 15
|
||||
max_workers: 15
|
||||
initial_workers: 15
|
||||
|
||||
target_utilization_fraction: 0.8
|
||||
idle_timeout_minutes: 15
|
||||
|
||||
docker:
|
||||
|
||||
@@ -2,9 +2,7 @@ cluster_name: ray-xgboost-release-cpu-moderate
|
||||
|
||||
min_workers: 31
|
||||
max_workers: 31
|
||||
initial_workers: 31
|
||||
|
||||
target_utilization_fraction: 0.8
|
||||
idle_timeout_minutes: 15
|
||||
|
||||
docker:
|
||||
|
||||
@@ -2,9 +2,7 @@ cluster_name: ray-xgboost-release-cpu-small
|
||||
|
||||
min_workers: 3
|
||||
max_workers: 3
|
||||
initial_workers: 3
|
||||
|
||||
target_utilization_fraction: 0.8
|
||||
idle_timeout_minutes: 15
|
||||
|
||||
docker:
|
||||
|
||||
@@ -2,9 +2,7 @@ cluster_name: ray-xgboost-release-gpu-small
|
||||
|
||||
min_workers: 4
|
||||
max_workers: 4
|
||||
initial_workers: 4
|
||||
|
||||
target_utilization_fraction: 0.8
|
||||
idle_timeout_minutes: 15
|
||||
|
||||
docker:
|
||||
|
||||
Reference in New Issue
Block a user