mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 17:34:51 +08:00
[autoscaler] Better validation for min_workers and max_workers (#13779)
* prepare for head node * move command runner interface outside _private * remove space * Eric * flake * min_workers in multi node type * fixing edge cases * eric not idle * fix target_workers to consider min_workers of node types * idle timeout * minor * minor fix * test * lint * eric v2 * eric 3 * min_workers constraint before bin packing * Update resource_demand_scheduler.py * Revert "Update resource_demand_scheduler.py" This reverts commit 818a63a2c86d8437b3ef21c5035d701c1d1127b5. * reducing diff * make get_nodes_to_launch return a dict * merge * weird merge fix * auto fill instance types for AWS * Alex/Eric * Update doc/source/cluster/autoscaling.rst * merge autofill and input from user * logger.exception * make the yaml use the default autofill * docs Eric * remove test_autoscaler_yaml from windows tests * lets try changing the test a bit * return test * lets see * edward * Limit max launch concurrency * commenting frac TODO * move to resource demand scheduler * use STATUS UP TO DATE * Eric * make logger of gc freed refs debug instead of info * add cluster name to docker mount prefix directory * grrR * fix tests * moving docker directory to sdk * move the import to prevent circular dependency * smallf fix * ian * fix max launch concurrency bug to assume failing nodes as pending and consider only load_metric's connected nodes as running * small fix * deflake test_joblib * lint * placement groups bypass * remove space * Eric * first ocmmit * lint * exmaple * documentation * hmm * file path fix * fix test * some format issue in docs * modified docs * joblib strikes again on windows * add ability to not start autoscaler/monitor * a * remove worker_default * Remove default pod type from operator * Remove worker_default_node_type from rewrite_legacy_yaml_to_availble_node_types * deprecate useless fields * fix error msg * validate sum min_workers < max_workers * 1 more edge case test * lint Co-authored-by: Ameer Haj Ali <ameerhajali@ameers-mbp.lan> Co-authored-by: Alex Wu <alex@anyscale.io> Co-authored-by: Alex Wu <itswu.alex@gmail.com> Co-authored-by: Eric Liang <ekhliang@gmail.com> Co-authored-by: Ameer Haj Ali <ameerhajali@Ameers-MacBook-Pro.local> Co-authored-by: root <root@ip-172-31-56-188.us-west-2.compute.internal> Co-authored-by: Dmitri Gekhtman <dmitri.m.gekhtman@gmail.com>
This commit is contained in:
@@ -86,6 +86,14 @@ def validate_config(config: Dict[str, Any]) -> None:
|
||||
raise ValueError(
|
||||
"`head_node_type` must be one of `available_node_types`.")
|
||||
|
||||
sum_min_workers = sum(
|
||||
config["available_node_types"][node_type].get("min_workers", 0)
|
||||
for node_type in config["available_node_types"])
|
||||
if sum_min_workers > config["max_workers"]:
|
||||
raise ValueError(
|
||||
"The specified global `max_workers` is smaller than the "
|
||||
"sum of `min_workers` of all the available node types.")
|
||||
|
||||
|
||||
def prepare_config(config):
|
||||
with_defaults = fillout_defaults(config)
|
||||
|
||||
@@ -45,6 +45,31 @@ class AutoscalingConfigTest(unittest.TestCase):
|
||||
except Exception:
|
||||
self.fail("Config did not pass validation test!")
|
||||
|
||||
def testValidateDefaultConfigMinMaxWorkers(self):
|
||||
aws_config_path = os.path.join(
|
||||
RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml")
|
||||
with open(aws_config_path) as f:
|
||||
config = yaml.safe_load(f)
|
||||
config = prepare_config(config)
|
||||
for node_type in config["available_node_types"]:
|
||||
config["available_node_types"][node_type]["resources"] = config[
|
||||
"available_node_types"][node_type].get("resources", {})
|
||||
try:
|
||||
validate_config(config)
|
||||
except Exception:
|
||||
self.fail("Config did not pass validation test!")
|
||||
|
||||
config["max_workers"] = 0 # the sum of min_workers is 1.
|
||||
with pytest.raises(ValueError):
|
||||
validate_config(config)
|
||||
|
||||
# make sure edge case of exactly 1 passes too.
|
||||
config["max_workers"] = 1
|
||||
try:
|
||||
validate_config(config)
|
||||
except Exception:
|
||||
self.fail("Config did not pass validation test!")
|
||||
|
||||
@pytest.mark.skipif(
|
||||
sys.platform.startswith("win"),
|
||||
reason="TODO(ameer): fails on Windows.")
|
||||
|
||||
Reference in New Issue
Block a user