From b7dd7ddb5231bc4bc83ae1e385edc761d5476627 Mon Sep 17 00:00:00 2001 From: Ameer Haj Ali Date: Sat, 23 Jan 2021 22:06:51 +0200 Subject: [PATCH] deprecate useless fields in the cluster yaml. (#13637) * prepare for head node * move command runner interface outside _private * remove space * Eric * flake * min_workers in multi node type * fixing edge cases * eric not idle * fix target_workers to consider min_workers of node types * idle timeout * minor * minor fix * test * lint * eric v2 * eric 3 * min_workers constraint before bin packing * Update resource_demand_scheduler.py * Revert "Update resource_demand_scheduler.py" This reverts commit 818a63a2c86d8437b3ef21c5035d701c1d1127b5. * reducing diff * make get_nodes_to_launch return a dict * merge * weird merge fix * auto fill instance types for AWS * Alex/Eric * Update doc/source/cluster/autoscaling.rst * merge autofill and input from user * logger.exception * make the yaml use the default autofill * docs Eric * remove test_autoscaler_yaml from windows tests * lets try changing the test a bit * return test * lets see * edward * Limit max launch concurrency * commenting frac TODO * move to resource demand scheduler * use STATUS UP TO DATE * Eric * make logger of gc freed refs debug instead of info * add cluster name to docker mount prefix directory * grrR * fix tests * moving docker directory to sdk * move the import to prevent circular dependency * small fix * ian * fix max launch concurrency bug to assume failing nodes as pending and consider only load_metric's connected nodes as running * small fix * deflake test_joblib * lint * placement groups bypass * remove space * Eric * first commit * lint * example * documentation * hmm * file path fix * fix test * some format issue in docs * modified docs * joblib strikes again on windows * add ability to not start autoscaler/monitor * a * remove worker_default * Remove default pod type from operator * Remove worker_default_node_type from 
rewrite_legacy_yaml_to_availble_node_types * deprecate useless fields Co-authored-by: Ameer Haj Ali Co-authored-by: Alex Wu Co-authored-by: Alex Wu Co-authored-by: Eric Liang Co-authored-by: Ameer Haj Ali Co-authored-by: root Co-authored-by: Dmitri Gekhtman --- dashboard/modules/reporter/reporter_head.py | 5 +---- doc/examples/lm/lm-cluster.yaml | 17 ----------------- python/ray/autoscaler/ray-schema.json | 12 ++++++++---- python/ray/serve/benchmarks/cluster.yaml | 3 --- .../test_cli_patterns/test_ray_up_config.yaml | 2 -- .../test_ray_up_docker_config.yaml | 2 -- python/ray/tests/test_coordinator_server.py | 2 -- .../util/sgd/tf/examples/tf-example-sgd.yaml | 3 --- .../sgd/torch/examples/benchmarks/README.rst | 1 - .../examples/benchmarks/horovod-benchmark.yaml | 3 --- .../util/sgd/torch/examples/example-sgd.yaml | 3 --- .../torch/examples/image_models/cluster.yaml | 3 --- .../torch/examples/segmentation/example.yaml | 2 -- .../sgd/torch/examples/sgd-development.yaml | 3 --- .../torch/examples/transformers/cluster.yaml | 2 -- release/horovod_tests/cluster.yaml | 2 -- .../long_running_distributed_tests/cluster.yaml | 1 - release/rllib_tests/stress_tests/cluster.yaml | 1 - release/stress_tests/autoscaler-cluster.yaml | 7 ------- release/stress_tests/cluster.yaml | 7 ------- .../tune_tests/scalability_tests/cluster.yaml | 2 -- release/xgboost_tests/cluster_cpu_moderate.yaml | 2 -- release/xgboost_tests/cluster_cpu_small.yaml | 2 -- release/xgboost_tests/cluster_gpu_small.yaml | 2 -- 24 files changed, 9 insertions(+), 80 deletions(-) diff --git a/dashboard/modules/reporter/reporter_head.py b/dashboard/modules/reporter/reporter_head.py index 2d84c6b65..7d375c8d6 100644 --- a/dashboard/modules/reporter/reporter_head.py +++ b/dashboard/modules/reporter/reporter_head.py @@ -78,10 +78,7 @@ class ReportHead(dashboard_utils.DashboardHeadModule): payload = { "min_workers": cfg["min_workers"], - "max_workers": cfg["max_workers"], - "initial_workers": cfg["initial_workers"], - 
"autoscaling_mode": cfg["autoscaling_mode"], - "idle_timeout_minutes": cfg["idle_timeout_minutes"], + "max_workers": cfg["max_workers"] } try: diff --git a/doc/examples/lm/lm-cluster.yaml b/doc/examples/lm/lm-cluster.yaml index 3590d482a..7ea6641f5 100644 --- a/doc/examples/lm/lm-cluster.yaml +++ b/doc/examples/lm/lm-cluster.yaml @@ -9,23 +9,6 @@ min_workers: 1 # node. This takes precedence over min_workers. max_workers: 2 -# The initial number of worker nodes to launch in addition to the head -# node. When the cluster is first brought up (or when it is refreshed with a -# subsequent `ray up`) this number of nodes will be started. -initial_workers: 1 - -# Whether or not to autoscale aggressively. If this is enabled, if at any point -# we would start more workers, we start at least enough to bring us to -# initial_workers. -autoscaling_mode: default - - -# The autoscaler will scale up the cluster to this target fraction of resource -# usage. For example, if a cluster of 10 nodes is 100% busy and -# target_utilization is 0.8, it would resize the cluster to 13. This fraction -# can be decreased to increase the aggressiveness of upscaling. -# This value must be less than 1.0 for scaling to happen. -target_utilization_fraction: 0.48 # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 5 diff --git a/python/ray/autoscaler/ray-schema.json b/python/ray/autoscaler/ray-schema.json index 22b21b84c..7c7b2a1ed 100644 --- a/python/ray/autoscaler/ray-schema.json +++ b/python/ray/autoscaler/ray-schema.json @@ -24,7 +24,7 @@ "type": "string" }, "min_workers": { - "description": "The minimum number of workers nodes to launch in addition to the head node. This number should be >= 0", + "description": "DEPRECATED. 
Use the per node_type min_workers field instead.", "type": "integer", "minimum": 0 }, @@ -34,17 +34,17 @@ "minimum": 0 }, "initial_workers": { - "description": "The number of workers to launch initially, in addition to the head node.", + "description": "DEPRECATED.", "type": "integer", "minimum": 0 }, "autoscaling_mode": { - "description": "The mode of the autoscaler e.g. default, aggressive", + "description": "DEPRECATED. Use upscaling_speed instead.", "type": "string", "enum": [ "default", "aggressive" ] }, "target_utilization_fraction": { - "description": "The autoscaler will scale up the cluster to this target fraction of resources usage. For example, if a cluster of 8 nodes is 100% busy # and target_utilization was 0.8, it would resize the cluster to 10.", + "description": "DEPRECATED. Use upscaling_speed instead.", "type": "number", "minimum": 0, "maximum": 1 @@ -254,6 +254,10 @@ "type": "string", "description": "If using multiple node types, specifies the head node type." }, + "worker_default_node_type": { + "type": "string", + "description": "DEPRECATED." + }, "head_node": { "type": "object", "description": "Provider-specific config for the head node, e.g. instance type." 
diff --git a/python/ray/serve/benchmarks/cluster.yaml b/python/ray/serve/benchmarks/cluster.yaml index d588dc06a..aad50bf97 100644 --- a/python/ray/serve/benchmarks/cluster.yaml +++ b/python/ray/serve/benchmarks/cluster.yaml @@ -1,13 +1,10 @@ cluster_name: default min_workers: 5 max_workers: 5 -initial_workers: 5 -autoscaling_mode: default docker: image: 'anyscale/ray-ml:latest' container_name: ray_container pull_before_run: true -target_utilization_fraction: 0.8 idle_timeout_minutes: 5 provider: type: aws diff --git a/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml b/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml index 4d6342009..f3d6a03ce 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml +++ b/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml @@ -12,7 +12,6 @@ head_start_ray_commands: - ray stop - ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml idle_timeout_minutes: 5 -initial_workers: 1 initialization_commands: - echo init max_workers: 2 @@ -27,7 +26,6 @@ setup_commands: - echo a - echo b - echo ${echo hi} -target_utilization_fraction: 0.9 worker_nodes: ImageId: latest_dlami InstanceType: t1.micro diff --git a/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml b/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml index 8d898f749..bffd0f53f 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml +++ b/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml @@ -17,7 +17,6 @@ head_start_ray_commands: - ray stop - ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml idle_timeout_minutes: 5 -initial_workers: 1 initialization_commands: - echo init max_workers: 2 @@ -32,7 +31,6 @@ setup_commands: - echo a - echo b - echo ${echo hi} -target_utilization_fraction: 0.9 worker_nodes: ImageId: latest_dlami InstanceType: t3a.small diff --git a/python/ray/tests/test_coordinator_server.py b/python/ray/tests/test_coordinator_server.py 
index 6fb654e3e..0c59b909e 100644 --- a/python/ray/tests/test_coordinator_server.py +++ b/python/ray/tests/test_coordinator_server.py @@ -52,7 +52,6 @@ class OnPremCoordinatorServerTest(unittest.TestCase): "cluster_name": "random_name", "min_workers": 0, "max_workers": 0, - "initial_workers": 0, "provider": { "type": "local", "head_ip": "0.0.0.0:2", @@ -154,7 +153,6 @@ class OnPremCoordinatorServerTest(unittest.TestCase): "cluster_name": "random_name", "min_workers": 0, "max_workers": 0, - "initial_workers": 0, "provider": { "type": "local", "coordinator_address": self.coordinator_address, diff --git a/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml b/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml index 846f5f10c..fcf31354b 100644 --- a/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml +++ b/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml @@ -4,11 +4,8 @@ cluster_name: sgd-tf # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 3 -initial_workers: 3 max_workers: 3 -target_utilization_fraction: 0.9 - # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 20 # docker: diff --git a/python/ray/util/sgd/torch/examples/benchmarks/README.rst b/python/ray/util/sgd/torch/examples/benchmarks/README.rst index 78dd71a15..54b3ce192 100644 --- a/python/ray/util/sgd/torch/examples/benchmarks/README.rst +++ b/python/ray/util/sgd/torch/examples/benchmarks/README.rst @@ -104,7 +104,6 @@ You can specify the number of nodes you want to use with the following configura # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. 
min_workers: # Change this to a custom quantity - initial_workers: # same as above max_workers: # same as above You may want to install FP16 support for PyTorch with the following configuration in the YAML file: diff --git a/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml b/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml index 04cbd520e..7e3db5051 100644 --- a/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml +++ b/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml @@ -4,11 +4,8 @@ cluster_name: horovod-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 1 -initial_workers: 1 max_workers: 1 -target_utilization_fraction: 0.9 - # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 50 # docker: diff --git a/python/ray/util/sgd/torch/examples/example-sgd.yaml b/python/ray/util/sgd/torch/examples/example-sgd.yaml index fe9b18d19..6bbc64423 100644 --- a/python/ray/util/sgd/torch/examples/example-sgd.yaml +++ b/python/ray/util/sgd/torch/examples/example-sgd.yaml @@ -4,11 +4,8 @@ cluster_name: sgd-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 3 -initial_workers: 3 max_workers: 3 -target_utilization_fraction: 0.9 - # If a node is idle for this many minutes, it will be removed. 
idle_timeout_minutes: 20 # docker: diff --git a/python/ray/util/sgd/torch/examples/image_models/cluster.yaml b/python/ray/util/sgd/torch/examples/image_models/cluster.yaml index fccd5f862..7d9ff9be8 100644 --- a/python/ray/util/sgd/torch/examples/image_models/cluster.yaml +++ b/python/ray/util/sgd/torch/examples/image_models/cluster.yaml @@ -4,11 +4,8 @@ cluster_name: sgd-pytorch-imagenet # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 1 -initial_workers: 1 max_workers: 1 -target_utilization_fraction: 0.9 - # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 10 # docker: diff --git a/python/ray/util/sgd/torch/examples/segmentation/example.yaml b/python/ray/util/sgd/torch/examples/segmentation/example.yaml index 78cd9bcb0..33db0f445 100644 --- a/python/ray/util/sgd/torch/examples/segmentation/example.yaml +++ b/python/ray/util/sgd/torch/examples/segmentation/example.yaml @@ -4,10 +4,8 @@ cluster_name: sgd-coco-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 1 -initial_workers: 1 max_workers: 1 -target_utilization_fraction: 0.9 # Cloud-provider specific configuration. provider: type: aws diff --git a/python/ray/util/sgd/torch/examples/sgd-development.yaml b/python/ray/util/sgd/torch/examples/sgd-development.yaml index 590cb63b0..bc79803ee 100644 --- a/python/ray/util/sgd/torch/examples/sgd-development.yaml +++ b/python/ray/util/sgd/torch/examples/sgd-development.yaml @@ -4,11 +4,8 @@ cluster_name: sgd-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 2 -initial_workers: 2 max_workers: 2 -target_utilization_fraction: 0.9 - # If a node is idle for this many minutes, it will be removed. 
idle_timeout_minutes: 10 # docker: diff --git a/python/ray/util/sgd/torch/examples/transformers/cluster.yaml b/python/ray/util/sgd/torch/examples/transformers/cluster.yaml index 4cecd3bf8..434b48d30 100644 --- a/python/ray/util/sgd/torch/examples/transformers/cluster.yaml +++ b/python/ray/util/sgd/torch/examples/transformers/cluster.yaml @@ -4,10 +4,8 @@ cluster_name: transformer-cluster # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 3 -initial_workers: 3 max_workers: 3 -target_utilization_fraction: 0.9 # Cloud-provider specific configuration. provider: type: aws diff --git a/release/horovod_tests/cluster.yaml b/release/horovod_tests/cluster.yaml index 880ebdba2..5dbc457a7 100644 --- a/release/horovod_tests/cluster.yaml +++ b/release/horovod_tests/cluster.yaml @@ -10,8 +10,6 @@ min_workers: 3 # node. This takes precedence over min_workers. min_workers defaults to 0. max_workers: 3 -target_utilization_fraction: 0.8 - # If a node is idle for this many minutes, it will be removed. 
idle_timeout_minutes: 5 diff --git a/release/long_running_distributed_tests/cluster.yaml b/release/long_running_distributed_tests/cluster.yaml index f8d10549a..4710a47fc 100644 --- a/release/long_running_distributed_tests/cluster.yaml +++ b/release/long_running_distributed_tests/cluster.yaml @@ -3,7 +3,6 @@ cluster_name: long-running-distributed-tests min_workers: 3 max_workers: 3 -target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/rllib_tests/stress_tests/cluster.yaml b/release/rllib_tests/stress_tests/cluster.yaml index 8f20a46af..4c83e27c3 100644 --- a/release/rllib_tests/stress_tests/cluster.yaml +++ b/release/rllib_tests/stress_tests/cluster.yaml @@ -3,7 +3,6 @@ cluster_name: ray-rllib-stress-tests min_workers: 9 max_workers: 9 -target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/stress_tests/autoscaler-cluster.yaml b/release/stress_tests/autoscaler-cluster.yaml index ed5ee2bd5..9c17d303e 100644 --- a/release/stress_tests/autoscaler-cluster.yaml +++ b/release/stress_tests/autoscaler-cluster.yaml @@ -13,13 +13,6 @@ min_workers: 100 # node. This takes precedence over min_workers. max_workers: 100 -# The autoscaler will scale up the cluster to this target fraction of resource -# usage. For example, if a cluster of 10 nodes is 100% busy and -# target_utilization is 0.8, it would resize the cluster to 13. This fraction -# can be decreased to increase the aggressiveness of upscaling. -# This value must be less than 1.0 for scaling to happen. -target_utilization_fraction: 0.8 - # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 5 diff --git a/release/stress_tests/cluster.yaml b/release/stress_tests/cluster.yaml index a513d9764..155ae1329 100644 --- a/release/stress_tests/cluster.yaml +++ b/release/stress_tests/cluster.yaml @@ -13,13 +13,6 @@ min_workers: 100 # node. This takes precedence over min_workers. 
max_workers: 100 -# The autoscaler will scale up the cluster to this target fraction of resource -# usage. For example, if a cluster of 10 nodes is 100% busy and -# target_utilization is 0.8, it would resize the cluster to 13. This fraction -# can be decreased to increase the aggressiveness of upscaling. -# This value must be less than 1.0 for scaling to happen. -target_utilization_fraction: 0.8 - # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 5 diff --git a/release/tune_tests/scalability_tests/cluster.yaml b/release/tune_tests/scalability_tests/cluster.yaml index e279efb37..fd966898b 100644 --- a/release/tune_tests/scalability_tests/cluster.yaml +++ b/release/tune_tests/scalability_tests/cluster.yaml @@ -2,9 +2,7 @@ cluster_name: ray-tune-scalability-tests min_workers: 15 max_workers: 15 -initial_workers: 15 -target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/xgboost_tests/cluster_cpu_moderate.yaml b/release/xgboost_tests/cluster_cpu_moderate.yaml index 18a18dceb..a65c49336 100644 --- a/release/xgboost_tests/cluster_cpu_moderate.yaml +++ b/release/xgboost_tests/cluster_cpu_moderate.yaml @@ -2,9 +2,7 @@ cluster_name: ray-xgboost-release-cpu-moderate min_workers: 31 max_workers: 31 -initial_workers: 31 -target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/xgboost_tests/cluster_cpu_small.yaml b/release/xgboost_tests/cluster_cpu_small.yaml index fe9e997f8..4b97439b9 100644 --- a/release/xgboost_tests/cluster_cpu_small.yaml +++ b/release/xgboost_tests/cluster_cpu_small.yaml @@ -2,9 +2,7 @@ cluster_name: ray-xgboost-release-cpu-small min_workers: 3 max_workers: 3 -initial_workers: 3 -target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/xgboost_tests/cluster_gpu_small.yaml b/release/xgboost_tests/cluster_gpu_small.yaml index 5bea4f19a..535d28490 100644 --- a/release/xgboost_tests/cluster_gpu_small.yaml +++ 
b/release/xgboost_tests/cluster_gpu_small.yaml @@ -2,9 +2,7 @@ cluster_name: ray-xgboost-release-gpu-small min_workers: 4 max_workers: 4 -initial_workers: 4 -target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: