From 4d6817c6832f64ae7340fb62989eb28b7c1ff3d1 Mon Sep 17 00:00:00 2001 From: Ameer Haj Ali Date: Fri, 29 Jan 2021 19:41:56 +0200 Subject: [PATCH] [autoscaler] Better validation for min_workers and max_workers (#13779) * prepare for head node * move command runner interface outside _private * remove space * Eric * flake * min_workers in multi node type * fixing edge cases * eric not idle * fix target_workers to consider min_workers of node types * idle timeout * minor * minor fix * test * lint * eric v2 * eric 3 * min_workers constraint before bin packing * Update resource_demand_scheduler.py * Revert "Update resource_demand_scheduler.py" This reverts commit 818a63a2c86d8437b3ef21c5035d701c1d1127b5. * reducing diff * make get_nodes_to_launch return a dict * merge * weird merge fix * auto fill instance types for AWS * Alex/Eric * Update doc/source/cluster/autoscaling.rst * merge autofill and input from user * logger.exception * make the yaml use the default autofill * docs Eric * remove test_autoscaler_yaml from windows tests * lets try changing the test a bit * return test * lets see * edward * Limit max launch concurrency * commenting frac TODO * move to resource demand scheduler * use STATUS UP TO DATE * Eric * make logger of gc freed refs debug instead of info * add cluster name to docker mount prefix directory * grrR * fix tests * moving docker directory to sdk * move the import to prevent circular dependency * smallf fix * ian * fix max launch concurrency bug to assume failing nodes as pending and consider only load_metric's connected nodes as running * small fix * deflake test_joblib * lint * placement groups bypass * remove space * Eric * first ocmmit * lint * exmaple * documentation * hmm * file path fix * fix test * some format issue in docs * modified docs * joblib strikes again on windows * add ability to not start autoscaler/monitor * a * remove worker_default * Remove default pod type from operator * Remove worker_default_node_type from rewrite_legacy_yaml_to_availble_node_types * deprecate useless fields * fix error msg * validate sum min_workers < max_workers * 1 more edge case test * lint Co-authored-by: Ameer Haj Ali Co-authored-by: Alex Wu Co-authored-by: Alex Wu Co-authored-by: Eric Liang Co-authored-by: Ameer Haj Ali Co-authored-by: root Co-authored-by: Dmitri Gekhtman --- python/ray/autoscaler/_private/util.py | 8 ++++++++ python/ray/tests/test_autoscaler_yaml.py | 25 ++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py index 2bd1e13e9..32758dec6 100644 --- a/python/ray/autoscaler/_private/util.py +++ b/python/ray/autoscaler/_private/util.py @@ -86,6 +86,14 @@ def validate_config(config: Dict[str, Any]) -> None: raise ValueError( "`head_node_type` must be one of `available_node_types`.") + sum_min_workers = sum( + config["available_node_types"][node_type].get("min_workers", 0) + for node_type in config["available_node_types"]) + if sum_min_workers > config["max_workers"]: + raise ValueError( + "The specified global `max_workers` is smaller than the " + "sum of `min_workers` of all the available node types.") + def prepare_config(config): with_defaults = fillout_defaults(config) diff --git a/python/ray/tests/test_autoscaler_yaml.py b/python/ray/tests/test_autoscaler_yaml.py index b712c8955..e5220771f 100644 --- a/python/ray/tests/test_autoscaler_yaml.py +++ b/python/ray/tests/test_autoscaler_yaml.py @@ -45,6 +45,31 @@ class AutoscalingConfigTest(unittest.TestCase): except Exception: self.fail("Config did not pass validation test!") + def testValidateDefaultConfigMinMaxWorkers(self): + aws_config_path = os.path.join( + RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml") + with open(aws_config_path) as f: + config = yaml.safe_load(f) + config = prepare_config(config) + for node_type in config["available_node_types"]: + config["available_node_types"][node_type]["resources"] = config[ + "available_node_types"][node_type].get("resources", {}) + try: + validate_config(config) + except Exception: + self.fail("Config did not pass validation test!") + + config["max_workers"] = 0 # the sum of min_workers is 1. + with pytest.raises(ValueError): + validate_config(config) + + # make sure edge case of exactly 1 passes too. + config["max_workers"] = 1 + try: + validate_config(config) + except Exception: + self.fail("Config did not pass validation test!") + @pytest.mark.skipif( sys.platform.startswith("win"), reason="TODO(ameer): fails on Windows.")