Fix autoscaler wrong parameter names (#13966)

* prepare for head node

* move command runner interface outside _private

* remove space

* Eric

* flake

* min_workers in multi node type

* fixing edge cases

* eric not idle

* fix target_workers to consider min_workers of node types

* idle timeout

* minor

* minor fix

* test

* lint

* eric v2

* eric 3

* min_workers constraint before bin packing

* Update resource_demand_scheduler.py

* Revert "Update resource_demand_scheduler.py"

This reverts commit 818a63a2c86d8437b3ef21c5035d701c1d1127b5.

* reducing diff

* make get_nodes_to_launch return a dict

* merge

* weird merge fix

* auto fill instance types for AWS

* Alex/Eric

* Update doc/source/cluster/autoscaling.rst

* merge autofill and input from user

* logger.exception

* make the yaml use the default autofill

* docs Eric

* remove test_autoscaler_yaml from windows tests

* lets try changing the test a bit

* return test

* lets see

* edward

* Limit max launch concurrency

* commenting frac TODO

* move to resource demand scheduler

* use STATUS UP TO DATE

* Eric

* make logger of gc freed refs debug instead of info

* add cluster name to docker mount prefix directory

* grrR

* fix tests

* moving docker directory to sdk

* move the import to prevent circular dependency

* smallf fix

* ian

* fix max launch concurrency bug to assume failing nodes as pending and consider only load_metric's connected nodes as running

* small fix

* improve code readability

* lint

Co-authored-by: Ameer Haj Ali <ameerhajali@ameers-mbp.lan>
Co-authored-by: Alex Wu <alex@anyscale.io>
Co-authored-by: Alex Wu <itswu.alex@gmail.com>
Co-authored-by: Eric Liang <ekhliang@gmail.com>
Co-authored-by: Ameer Haj Ali <ameerhajali@Ameers-MacBook-Pro.local>
This commit is contained in:
Ameer Haj Ali
2021-02-08 23:19:33 +02:00
committed by GitHub
parent 09242e6d31
commit 1643bc5c4f
+15 -14
View File
@@ -43,7 +43,7 @@ logger = logging.getLogger(__name__)
# that will be passed into a NodeUpdaterThread.
UpdateInstructions = namedtuple(
"UpdateInstructions",
["node_id", "init_commands", "start_ray_commands", "docker_config"])
["node_id", "setup_commands", "ray_start_commands", "docker_config"])
AutoscalerSummary = namedtuple(
"AutoscalerSummary",
@@ -283,7 +283,7 @@ class StandardAutoscaler:
# problems. They should at a minimum be spawned as daemon threads.
# See https://github.com/ray-project/ray/pull/5903 for more info.
T = []
for node_id, commands, ray_start, docker_config in (
for node_id, setup_commands, ray_start_commands, docker_config in (
self.should_update(node_id) for node_id in nodes):
if node_id is not None:
resources = self._node_resources(node_id)
@@ -291,8 +291,8 @@ class StandardAutoscaler:
T.append(
threading.Thread(
target=self.spawn_updater,
args=(node_id, commands, ray_start, resources,
docker_config)))
args=(node_id, setup_commands, ray_start_commands,
resources, docker_config)))
for t in T:
t.start()
for t in T:
@@ -633,25 +633,25 @@ class StandardAutoscaler:
successful_updated = self.num_successful_updates.get(node_id, 0) > 0
if successful_updated and self.config.get("restart_only", False):
init_commands = []
ray_commands = self.config["worker_start_ray_commands"]
setup_commands = []
ray_start_commands = self.config["worker_start_ray_commands"]
elif successful_updated and self.config.get("no_restart", False):
init_commands = self._get_node_type_specific_fields(
setup_commands = self._get_node_type_specific_fields(
node_id, "worker_setup_commands")
ray_commands = []
ray_start_commands = []
else:
init_commands = self._get_node_type_specific_fields(
setup_commands = self._get_node_type_specific_fields(
node_id, "worker_setup_commands")
ray_commands = self.config["worker_start_ray_commands"]
ray_start_commands = self.config["worker_start_ray_commands"]
docker_config = self._get_node_specific_docker_config(node_id)
return UpdateInstructions(
node_id=node_id,
init_commands=init_commands,
start_ray_commands=ray_commands,
setup_commands=setup_commands,
ray_start_commands=ray_start_commands,
docker_config=docker_config)
def spawn_updater(self, node_id, init_commands, ray_start_commands,
def spawn_updater(self, node_id, setup_commands, ray_start_commands,
node_resources, docker_config):
logger.info(f"Creating new (spawn_updater) updater thread for node"
f" {node_id}.")
@@ -665,7 +665,8 @@ class StandardAutoscaler:
initialization_commands=with_head_node_ip(
self._get_node_type_specific_fields(
node_id, "initialization_commands"), self.head_node_ip),
setup_commands=with_head_node_ip(init_commands, self.head_node_ip),
setup_commands=with_head_node_ip(setup_commands,
self.head_node_ip),
ray_start_commands=with_head_node_ip(ray_start_commands,
self.head_node_ip),
runtime_hash=self.runtime_hash,