From 1643bc5c4fef64e86d995dc788e65e6f2194e1c4 Mon Sep 17 00:00:00 2001 From: Ameer Haj Ali Date: Mon, 8 Feb 2021 23:19:33 +0200 Subject: [PATCH] Fix autoscaler wrong parameter names (#13966) * prepare for head node * move command runner interface outside _private * remove space * Eric * flake * min_workers in multi node type * fixing edge cases * eric not idle * fix target_workers to consider min_workers of node types * idle timeout * minor * minor fix * test * lint * eric v2 * eric 3 * min_workers constraint before bin packing * Update resource_demand_scheduler.py * Revert "Update resource_demand_scheduler.py" This reverts commit 818a63a2c86d8437b3ef21c5035d701c1d1127b5. * reducing diff * make get_nodes_to_launch return a dict * merge * weird merge fix * auto fill instance types for AWS * Alex/Eric * Update doc/source/cluster/autoscaling.rst * merge autofill and input from user * logger.exception * make the yaml use the default autofill * docs Eric * remove test_autoscaler_yaml from windows tests * lets try changing the test a bit * return test * lets see * edward * Limit max launch concurrency * commenting frac TODO * move to resource demand scheduler * use STATUS UP TO DATE * Eric * make logger of gc freed refs debug instead of info * add cluster name to docker mount prefix directory * grrR * fix tests * moving docker directory to sdk * move the import to prevent circular dependency * smallf fix * ian * fix max launch concurrency bug to assume failing nodes as pending and consider only load_metric's connected nodes as running * small fix * improve code readability * lint Co-authored-by: Ameer Haj Ali Co-authored-by: Alex Wu Co-authored-by: Alex Wu Co-authored-by: Eric Liang Co-authored-by: Ameer Haj Ali --- python/ray/autoscaler/_private/autoscaler.py | 29 ++++++++++---------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/python/ray/autoscaler/_private/autoscaler.py b/python/ray/autoscaler/_private/autoscaler.py index 1166597ed..727c4db2e 100644 --- a/python/ray/autoscaler/_private/autoscaler.py +++ b/python/ray/autoscaler/_private/autoscaler.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # that will be passed into a NodeUpdaterThread. UpdateInstructions = namedtuple( "UpdateInstructions", - ["node_id", "init_commands", "start_ray_commands", "docker_config"]) + ["node_id", "setup_commands", "ray_start_commands", "docker_config"]) AutoscalerSummary = namedtuple( "AutoscalerSummary", @@ -283,7 +283,7 @@ class StandardAutoscaler: # problems. They should at a minimum be spawned as daemon threads. # See https://github.com/ray-project/ray/pull/5903 for more info. T = [] - for node_id, commands, ray_start, docker_config in ( + for node_id, setup_commands, ray_start_commands, docker_config in ( self.should_update(node_id) for node_id in nodes): if node_id is not None: resources = self._node_resources(node_id) @@ -291,8 +291,8 @@ class StandardAutoscaler: T.append( threading.Thread( target=self.spawn_updater, - args=(node_id, commands, ray_start, resources, - docker_config))) + args=(node_id, setup_commands, ray_start_commands, + resources, docker_config))) for t in T: t.start() for t in T: @@ -633,25 +633,25 @@ class StandardAutoscaler: successful_updated = self.num_successful_updates.get(node_id, 0) > 0 if successful_updated and self.config.get("restart_only", False): - init_commands = [] - ray_commands = self.config["worker_start_ray_commands"] + setup_commands = [] + ray_start_commands = self.config["worker_start_ray_commands"] elif successful_updated and self.config.get("no_restart", False): - init_commands = self._get_node_type_specific_fields( + setup_commands = self._get_node_type_specific_fields( node_id, "worker_setup_commands") - ray_commands = [] + ray_start_commands = [] else: - init_commands = self._get_node_type_specific_fields( + setup_commands = self._get_node_type_specific_fields( node_id, "worker_setup_commands") - ray_commands = self.config["worker_start_ray_commands"] + ray_start_commands = self.config["worker_start_ray_commands"] docker_config = self._get_node_specific_docker_config(node_id) return UpdateInstructions( node_id=node_id, - init_commands=init_commands, - start_ray_commands=ray_commands, + setup_commands=setup_commands, + ray_start_commands=ray_start_commands, docker_config=docker_config) - def spawn_updater(self, node_id, init_commands, ray_start_commands, + def spawn_updater(self, node_id, setup_commands, ray_start_commands, node_resources, docker_config): logger.info(f"Creating new (spawn_updater) updater thread for node" f" {node_id}.") @@ -665,7 +665,8 @@ class StandardAutoscaler: initialization_commands=with_head_node_ip( self._get_node_type_specific_fields( node_id, "initialization_commands"), self.head_node_ip), - setup_commands=with_head_node_ip(init_commands, self.head_node_ip), + setup_commands=with_head_node_ip(setup_commands, + self.head_node_ip), ray_start_commands=with_head_node_ip(ray_start_commands, self.head_node_ip), runtime_hash=self.runtime_hash,