From b7dd7ddb5231bc4bc83ae1e385edc761d5476627 Mon Sep 17 00:00:00 2001
From: Ameer Haj Ali <ameer@anyscale.com>
Date: Sat, 23 Jan 2021 22:06:51 +0200
Subject: [PATCH] deprecate useless fields in the cluster yaml. (#13637)

* prepare for head node

* move command runner interface outside _private

* remove space

* Eric

* flake

* min_workers in multi node type

* fixing edge cases

* eric not idle

* fix target_workers to consider min_workers of node types

* idle timeout

* minor

* minor fix

* test

* lint

* eric v2

* eric 3

* min_workers constraint before bin packing

* Update resource_demand_scheduler.py

* Revert "Update resource_demand_scheduler.py"

This reverts commit 818a63a2c86d8437b3ef21c5035d701c1d1127b5.

* reducing diff

* make get_nodes_to_launch return a dict

* merge

* weird merge fix

* auto fill instance types for AWS

* Alex/Eric

* Update doc/source/cluster/autoscaling.rst

* merge autofill and input from user

* logger.exception

* make the yaml use the default autofill

* docs Eric

* remove test_autoscaler_yaml from windows tests

* lets try changing the test a bit

* return test

* lets see

* edward

* Limit max launch concurrency

* commenting frac TODO

* move to resource demand scheduler

* use STATUS UP TO DATE

* Eric

* make logger of gc freed refs debug instead of info

* add cluster name to docker mount prefix directory

* grrR

* fix tests

* moving docker directory to sdk

* move the import to prevent circular dependency

* small fix

* ian

* fix max launch concurrency bug to assume failing nodes as pending and consider only load_metric's connected nodes as running

* small fix

* deflake test_joblib

* lint

* placement groups bypass

* remove space

* Eric

* first commit

* lint

* example

* documentation

* hmm

* file path fix

* fix test

* some format issue in docs

* modified docs

* joblib strikes again on windows

* add ability to not start autoscaler/monitor

* a

* remove worker_default

* Remove default pod type from operator

* Remove worker_default_node_type from rewrite_legacy_yaml_to_available_node_types

* deprecate useless fields

Co-authored-by: Ameer Haj Ali <ameerhajali@ameers-mbp.lan>
Co-authored-by: Alex Wu <alex@anyscale.io>
Co-authored-by: Alex Wu <itswu.alex@gmail.com>
Co-authored-by: Eric Liang <ekhliang@gmail.com>
Co-authored-by: Ameer Haj Ali <ameerhajali@Ameers-MacBook-Pro.local>
Co-authored-by: root <root@ip-172-31-56-188.us-west-2.compute.internal>
Co-authored-by: Dmitri Gekhtman <dmitri.m.gekhtman@gmail.com>
---
 dashboard/modules/reporter/reporter_head.py     |  5 +----
 doc/examples/lm/lm-cluster.yaml                 | 17 -----------------
 python/ray/autoscaler/ray-schema.json           | 12 ++++++++----
 python/ray/serve/benchmarks/cluster.yaml        |  3 ---
 .../test_cli_patterns/test_ray_up_config.yaml   |  2 --
 .../test_ray_up_docker_config.yaml              |  2 --
 python/ray/tests/test_coordinator_server.py     |  2 --
 .../util/sgd/tf/examples/tf-example-sgd.yaml    |  3 ---
 .../sgd/torch/examples/benchmarks/README.rst    |  1 -
 .../examples/benchmarks/horovod-benchmark.yaml  |  3 ---
 .../util/sgd/torch/examples/example-sgd.yaml    |  3 ---
 .../torch/examples/image_models/cluster.yaml    |  3 ---
 .../torch/examples/segmentation/example.yaml    |  2 --
 .../sgd/torch/examples/sgd-development.yaml     |  3 ---
 .../torch/examples/transformers/cluster.yaml    |  2 --
 release/horovod_tests/cluster.yaml              |  2 --
 .../long_running_distributed_tests/cluster.yaml |  1 -
 release/rllib_tests/stress_tests/cluster.yaml   |  1 -
 release/stress_tests/autoscaler-cluster.yaml    |  7 -------
 release/stress_tests/cluster.yaml               |  7 -------
 .../tune_tests/scalability_tests/cluster.yaml   |  2 --
 release/xgboost_tests/cluster_cpu_moderate.yaml |  2 --
 release/xgboost_tests/cluster_cpu_small.yaml    |  2 --
 release/xgboost_tests/cluster_gpu_small.yaml    |  2 --
 24 files changed, 9 insertions(+), 80 deletions(-)

diff --git a/dashboard/modules/reporter/reporter_head.py b/dashboard/modules/reporter/reporter_head.py
index 2d84c6b65..7d375c8d6 100644
--- a/dashboard/modules/reporter/reporter_head.py
+++ b/dashboard/modules/reporter/reporter_head.py
@@ -78,10 +78,7 @@ class ReportHead(dashboard_utils.DashboardHeadModule):
 
             payload = {
                 "min_workers": cfg["min_workers"],
-                "max_workers": cfg["max_workers"],
-                "initial_workers": cfg["initial_workers"],
-                "autoscaling_mode": cfg["autoscaling_mode"],
-                "idle_timeout_minutes": cfg["idle_timeout_minutes"],
+                "max_workers": cfg["max_workers"]
             }
 
             try:
diff --git a/doc/examples/lm/lm-cluster.yaml b/doc/examples/lm/lm-cluster.yaml
index 3590d482a..7ea6641f5 100644
--- a/doc/examples/lm/lm-cluster.yaml
+++ b/doc/examples/lm/lm-cluster.yaml
@@ -9,23 +9,6 @@ min_workers: 1
 # node. This takes precedence over min_workers.
 max_workers: 2
 
-# The initial number of worker nodes to launch in addition to the head
-# node. When the cluster is first brought up (or when it is refreshed with a
-# subsequent `ray up`) this number of nodes will be started.
-initial_workers: 1
-
-# Whether or not to autoscale aggressively. If this is enabled, if at any point
-#   we would start more workers, we start at least enough to bring us to
-#   initial_workers.
-autoscaling_mode: default
-
-
-# The autoscaler will scale up the cluster to this target fraction of resource
-# usage. For example, if a cluster of 10 nodes is 100% busy and
-# target_utilization is 0.8, it would resize the cluster to 13. This fraction
-# can be decreased to increase the aggressiveness of upscaling.
-# This value must be less than 1.0 for scaling to happen.
-target_utilization_fraction: 0.48
 
 # If a node is idle for this many minutes, it will be removed.
 idle_timeout_minutes: 5
diff --git a/python/ray/autoscaler/ray-schema.json b/python/ray/autoscaler/ray-schema.json
index 22b21b84c..7c7b2a1ed 100644
--- a/python/ray/autoscaler/ray-schema.json
+++ b/python/ray/autoscaler/ray-schema.json
@@ -24,7 +24,7 @@
             "type": "string"
         },
         "min_workers": {
-            "description": "The minimum number of workers nodes to launch in addition to the head node. This number should be >= 0",
+            "description": "DEPRECATED. Use the per node_type min_workers field instead.",
             "type": "integer",
             "minimum": 0
         },
@@ -34,17 +34,17 @@
             "minimum": 0
         },
         "initial_workers": {
-            "description": "The number of workers to launch initially, in addition to the head node.",
+            "description": "DEPRECATED.",
             "type": "integer",
             "minimum": 0
         },
         "autoscaling_mode": {
-            "description": "The mode of the autoscaler e.g. default, aggressive",
+            "description": "DEPRECATED. Use upscaling_speed instead.",
             "type": "string",
             "enum": [ "default", "aggressive" ]
         },
         "target_utilization_fraction": {
-            "description": "The autoscaler will scale up the cluster to this target fraction of resources usage. For example, if a cluster of 8 nodes is 100% busy # and target_utilization was 0.8, it would resize the cluster to 10.",
+            "description": "DEPRECATED. Use upscaling_speed instead.",
             "type": "number",
             "minimum": 0,
             "maximum": 1
@@ -254,6 +254,10 @@
             "type": "string",
             "description": "If using multiple node types, specifies the head node type."
         },
+        "worker_default_node_type": {
+            "type": "string",
+            "description": "DEPRECATED."
+        },
         "head_node": {
             "type": "object",
             "description": "Provider-specific config for the head node, e.g. instance type."
diff --git a/python/ray/serve/benchmarks/cluster.yaml b/python/ray/serve/benchmarks/cluster.yaml
index d588dc06a..aad50bf97 100644
--- a/python/ray/serve/benchmarks/cluster.yaml
+++ b/python/ray/serve/benchmarks/cluster.yaml
@@ -1,13 +1,10 @@
 cluster_name: default
 min_workers: 5
 max_workers: 5
-initial_workers: 5
-autoscaling_mode: default
 docker:
     image: 'anyscale/ray-ml:latest'
     container_name: ray_container
     pull_before_run: true
-target_utilization_fraction: 0.8
 idle_timeout_minutes: 5
 provider:
     type: aws
diff --git a/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml b/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml
index 4d6342009..f3d6a03ce 100644
--- a/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml
+++ b/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml
@@ -12,7 +12,6 @@ head_start_ray_commands:
     - ray stop
     - ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml
 idle_timeout_minutes: 5
-initial_workers: 1
 initialization_commands:
     - echo init
 max_workers: 2
@@ -27,7 +26,6 @@ setup_commands:
     - echo a
     - echo b
     - echo ${echo hi}
-target_utilization_fraction: 0.9
 worker_nodes:
     ImageId: latest_dlami
     InstanceType: t1.micro
diff --git a/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml b/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml
index 8d898f749..bffd0f53f 100644
--- a/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml
+++ b/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml
@@ -17,7 +17,6 @@ head_start_ray_commands:
     - ray stop
     - ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml
 idle_timeout_minutes: 5
-initial_workers: 1
 initialization_commands:
     - echo init
 max_workers: 2
@@ -32,7 +31,6 @@ setup_commands:
     - echo a
     - echo b
     - echo ${echo hi}
-target_utilization_fraction: 0.9
 worker_nodes:
     ImageId: latest_dlami
     InstanceType: t3a.small
diff --git a/python/ray/tests/test_coordinator_server.py b/python/ray/tests/test_coordinator_server.py
index 6fb654e3e..0c59b909e 100644
--- a/python/ray/tests/test_coordinator_server.py
+++ b/python/ray/tests/test_coordinator_server.py
@@ -52,7 +52,6 @@ class OnPremCoordinatorServerTest(unittest.TestCase):
             "cluster_name": "random_name",
             "min_workers": 0,
             "max_workers": 0,
-            "initial_workers": 0,
             "provider": {
                 "type": "local",
                 "head_ip": "0.0.0.0:2",
@@ -154,7 +153,6 @@ class OnPremCoordinatorServerTest(unittest.TestCase):
             "cluster_name": "random_name",
             "min_workers": 0,
             "max_workers": 0,
-            "initial_workers": 0,
             "provider": {
                 "type": "local",
                 "coordinator_address": self.coordinator_address,
diff --git a/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml b/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml
index 846f5f10c..fcf31354b 100644
--- a/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml
+++ b/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml
@@ -4,11 +4,8 @@ cluster_name: sgd-tf
 # The maximum number of workers nodes to launch in addition to the head
 # node. This takes precedence over min_workers. min_workers default to 0.
 min_workers: 3
-initial_workers: 3
 max_workers: 3
 
-target_utilization_fraction: 0.9
-
 # If a node is idle for this many minutes, it will be removed.
 idle_timeout_minutes: 20
 # docker:
diff --git a/python/ray/util/sgd/torch/examples/benchmarks/README.rst b/python/ray/util/sgd/torch/examples/benchmarks/README.rst
index 78dd71a15..54b3ce192 100644
--- a/python/ray/util/sgd/torch/examples/benchmarks/README.rst
+++ b/python/ray/util/sgd/torch/examples/benchmarks/README.rst
@@ -104,7 +104,6 @@ You can specify the number of nodes you want to use with the following configura
     # The maximum number of workers nodes to launch in addition to the head
     # node. This takes precedence over min_workers. min_workers default to 0.
     min_workers: <NUMBER_OF_NODES>  # Change this to a custom quantity
-    initial_workers:  <NUMBER_OF_NODES>  # same as above
     max_workers:  <NUMBER_OF_NODES>  # same as above
 
 You may want to install FP16 support for PyTorch with the following configuration in the YAML file:
diff --git a/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml b/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml
index 04cbd520e..7e3db5051 100644
--- a/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml
+++ b/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml
@@ -4,11 +4,8 @@ cluster_name: horovod-pytorch
 # The maximum number of workers nodes to launch in addition to the head
 # node. This takes precedence over min_workers. min_workers default to 0.
 min_workers: 1
-initial_workers: 1
 max_workers: 1
 
-target_utilization_fraction: 0.9
-
 # If a node is idle for this many minutes, it will be removed.
 idle_timeout_minutes: 50
 # docker:
diff --git a/python/ray/util/sgd/torch/examples/example-sgd.yaml b/python/ray/util/sgd/torch/examples/example-sgd.yaml
index fe9b18d19..6bbc64423 100644
--- a/python/ray/util/sgd/torch/examples/example-sgd.yaml
+++ b/python/ray/util/sgd/torch/examples/example-sgd.yaml
@@ -4,11 +4,8 @@ cluster_name: sgd-pytorch
 # The maximum number of workers nodes to launch in addition to the head
 # node. This takes precedence over min_workers. min_workers default to 0.
 min_workers: 3
-initial_workers: 3
 max_workers: 3
 
-target_utilization_fraction: 0.9
-
 # If a node is idle for this many minutes, it will be removed.
 idle_timeout_minutes: 20
 # docker:
diff --git a/python/ray/util/sgd/torch/examples/image_models/cluster.yaml b/python/ray/util/sgd/torch/examples/image_models/cluster.yaml
index fccd5f862..7d9ff9be8 100644
--- a/python/ray/util/sgd/torch/examples/image_models/cluster.yaml
+++ b/python/ray/util/sgd/torch/examples/image_models/cluster.yaml
@@ -4,11 +4,8 @@ cluster_name: sgd-pytorch-imagenet
 # The maximum number of workers nodes to launch in addition to the head
 # node. This takes precedence over min_workers. min_workers default to 0.
 min_workers: 1
-initial_workers: 1
 max_workers: 1
 
-target_utilization_fraction: 0.9
-
 # If a node is idle for this many minutes, it will be removed.
 idle_timeout_minutes: 10
 # docker:
diff --git a/python/ray/util/sgd/torch/examples/segmentation/example.yaml b/python/ray/util/sgd/torch/examples/segmentation/example.yaml
index 78cd9bcb0..33db0f445 100644
--- a/python/ray/util/sgd/torch/examples/segmentation/example.yaml
+++ b/python/ray/util/sgd/torch/examples/segmentation/example.yaml
@@ -4,10 +4,8 @@ cluster_name: sgd-coco-pytorch
 # The maximum number of workers nodes to launch in addition to the head
 # node. This takes precedence over min_workers. min_workers default to 0.
 min_workers: 1
-initial_workers: 1
 max_workers: 1
 
-target_utilization_fraction: 0.9
 # Cloud-provider specific configuration.
 provider:
     type: aws
diff --git a/python/ray/util/sgd/torch/examples/sgd-development.yaml b/python/ray/util/sgd/torch/examples/sgd-development.yaml
index 590cb63b0..bc79803ee 100644
--- a/python/ray/util/sgd/torch/examples/sgd-development.yaml
+++ b/python/ray/util/sgd/torch/examples/sgd-development.yaml
@@ -4,11 +4,8 @@ cluster_name: sgd-pytorch
 # The maximum number of workers nodes to launch in addition to the head
 # node. This takes precedence over min_workers. min_workers default to 0.
 min_workers: 2
-initial_workers: 2
 max_workers: 2
 
-target_utilization_fraction: 0.9
-
 # If a node is idle for this many minutes, it will be removed.
 idle_timeout_minutes: 10
 # docker:
diff --git a/python/ray/util/sgd/torch/examples/transformers/cluster.yaml b/python/ray/util/sgd/torch/examples/transformers/cluster.yaml
index 4cecd3bf8..434b48d30 100644
--- a/python/ray/util/sgd/torch/examples/transformers/cluster.yaml
+++ b/python/ray/util/sgd/torch/examples/transformers/cluster.yaml
@@ -4,10 +4,8 @@ cluster_name: transformer-cluster
 # The maximum number of workers nodes to launch in addition to the head
 # node. This takes precedence over min_workers. min_workers default to 0.
 min_workers: 3
-initial_workers: 3
 max_workers: 3
 
-target_utilization_fraction: 0.9
 # Cloud-provider specific configuration.
 provider:
     type: aws
diff --git a/release/horovod_tests/cluster.yaml b/release/horovod_tests/cluster.yaml
index 880ebdba2..5dbc457a7 100644
--- a/release/horovod_tests/cluster.yaml
+++ b/release/horovod_tests/cluster.yaml
@@ -10,8 +10,6 @@ min_workers: 3
 # node. This takes precedence over min_workers. min_workers defaults to 0.
 max_workers: 3
 
-target_utilization_fraction: 0.8
-
 # If a node is idle for this many minutes, it will be removed.
 idle_timeout_minutes: 5
 
diff --git a/release/long_running_distributed_tests/cluster.yaml b/release/long_running_distributed_tests/cluster.yaml
index f8d10549a..4710a47fc 100644
--- a/release/long_running_distributed_tests/cluster.yaml
+++ b/release/long_running_distributed_tests/cluster.yaml
@@ -3,7 +3,6 @@ cluster_name: long-running-distributed-tests
 min_workers: 3
 max_workers: 3
 
-target_utilization_fraction: 0.8
 idle_timeout_minutes: 15
 
 docker:
diff --git a/release/rllib_tests/stress_tests/cluster.yaml b/release/rllib_tests/stress_tests/cluster.yaml
index 8f20a46af..4c83e27c3 100644
--- a/release/rllib_tests/stress_tests/cluster.yaml
+++ b/release/rllib_tests/stress_tests/cluster.yaml
@@ -3,7 +3,6 @@ cluster_name: ray-rllib-stress-tests
 min_workers: 9
 max_workers: 9
 
-target_utilization_fraction: 0.8
 idle_timeout_minutes: 15
 
 docker:
diff --git a/release/stress_tests/autoscaler-cluster.yaml b/release/stress_tests/autoscaler-cluster.yaml
index ed5ee2bd5..9c17d303e 100644
--- a/release/stress_tests/autoscaler-cluster.yaml
+++ b/release/stress_tests/autoscaler-cluster.yaml
@@ -13,13 +13,6 @@ min_workers: 100
 # node. This takes precedence over min_workers.
 max_workers: 100
 
-# The autoscaler will scale up the cluster to this target fraction of resource
-# usage. For example, if a cluster of 10 nodes is 100% busy and
-# target_utilization is 0.8, it would resize the cluster to 13. This fraction
-# can be decreased to increase the aggressiveness of upscaling.
-# This value must be less than 1.0 for scaling to happen.
-target_utilization_fraction: 0.8
-
 # If a node is idle for this many minutes, it will be removed.
 idle_timeout_minutes: 5
 
diff --git a/release/stress_tests/cluster.yaml b/release/stress_tests/cluster.yaml
index a513d9764..155ae1329 100644
--- a/release/stress_tests/cluster.yaml
+++ b/release/stress_tests/cluster.yaml
@@ -13,13 +13,6 @@ min_workers: 100
 # node. This takes precedence over min_workers.
 max_workers: 100
 
-# The autoscaler will scale up the cluster to this target fraction of resource
-# usage. For example, if a cluster of 10 nodes is 100% busy and
-# target_utilization is 0.8, it would resize the cluster to 13. This fraction
-# can be decreased to increase the aggressiveness of upscaling.
-# This value must be less than 1.0 for scaling to happen.
-target_utilization_fraction: 0.8
-
 # If a node is idle for this many minutes, it will be removed.
 idle_timeout_minutes: 5
 
diff --git a/release/tune_tests/scalability_tests/cluster.yaml b/release/tune_tests/scalability_tests/cluster.yaml
index e279efb37..fd966898b 100644
--- a/release/tune_tests/scalability_tests/cluster.yaml
+++ b/release/tune_tests/scalability_tests/cluster.yaml
@@ -2,9 +2,7 @@ cluster_name: ray-tune-scalability-tests
 
 min_workers: 15
 max_workers: 15
-initial_workers: 15
 
-target_utilization_fraction: 0.8
 idle_timeout_minutes: 15
 
 docker:
diff --git a/release/xgboost_tests/cluster_cpu_moderate.yaml b/release/xgboost_tests/cluster_cpu_moderate.yaml
index 18a18dceb..a65c49336 100644
--- a/release/xgboost_tests/cluster_cpu_moderate.yaml
+++ b/release/xgboost_tests/cluster_cpu_moderate.yaml
@@ -2,9 +2,7 @@ cluster_name: ray-xgboost-release-cpu-moderate
 
 min_workers: 31
 max_workers: 31
-initial_workers: 31
 
-target_utilization_fraction: 0.8
 idle_timeout_minutes: 15
 
 docker:
diff --git a/release/xgboost_tests/cluster_cpu_small.yaml b/release/xgboost_tests/cluster_cpu_small.yaml
index fe9e997f8..4b97439b9 100644
--- a/release/xgboost_tests/cluster_cpu_small.yaml
+++ b/release/xgboost_tests/cluster_cpu_small.yaml
@@ -2,9 +2,7 @@ cluster_name: ray-xgboost-release-cpu-small
 
 min_workers: 3
 max_workers: 3
-initial_workers: 3
 
-target_utilization_fraction: 0.8
 idle_timeout_minutes: 15
 
 docker:
diff --git a/release/xgboost_tests/cluster_gpu_small.yaml b/release/xgboost_tests/cluster_gpu_small.yaml
index 5bea4f19a..535d28490 100644
--- a/release/xgboost_tests/cluster_gpu_small.yaml
+++ b/release/xgboost_tests/cluster_gpu_small.yaml
@@ -2,9 +2,7 @@ cluster_name: ray-xgboost-release-gpu-small
 
 min_workers: 4
 max_workers: 4
-initial_workers: 4
 
-target_utilization_fraction: 0.8
 idle_timeout_minutes: 15
 
 docker: