[autoscaler] Add an aggressive_autoscaling flag (#4285)

This commit is contained in:
Daniel Edgecumbe
2019-04-14 02:44:32 +01:00
committed by Robert Nishihara
parent 56a78baf67
commit 3e1adafbce
11 changed files with 113 additions and 17 deletions
+62
View File
@@ -64,6 +64,14 @@ class MockProvider(NodeProvider):
if n.matches(tag_filters) and n.state != "terminated"
]
def non_terminated_node_ips(self, tag_filters):
if self.throw:
raise Exception("oops")
return [
n.internal_ip for n in self.mock_nodes.values()
if n.matches(tag_filters) and n.state != "terminated"
]
def is_running(self, node_id):
return self.mock_nodes[node_id].state == "running"
@@ -99,6 +107,7 @@ SMALL_CLUSTER = {
"min_workers": 2,
"max_workers": 2,
"initial_workers": 0,
"autoscaling_mode": "default",
"target_utilization_fraction": 0.8,
"idle_timeout_minutes": 5,
"provider": {
@@ -337,6 +346,59 @@ class AutoscalingTest(unittest.TestCase):
self.waitForNodes(10)
autoscaler.update()
def testAggressiveAutoscaling(self):
config = SMALL_CLUSTER.copy()
config["min_workers"] = 0
config["max_workers"] = 20
config["initial_workers"] = 10
config["idle_timeout_minutes"] = 0
config["autoscaling_mode"] = "aggressive"
config_path = self.write_config(config)
self.provider = MockProvider()
self.provider.create_node({}, {TAG_RAY_NODE_TYPE: "head"}, 1)
head_ip = self.provider.non_terminated_node_ips(
tag_filters={TAG_RAY_NODE_TYPE: "head"}, )[0]
lm = LoadMetrics()
lm.local_ip = head_ip
autoscaler = StandardAutoscaler(
config_path,
lm,
max_launch_batch=5,
max_concurrent_launches=5,
max_failures=0,
update_interval_s=0)
self.waitForNodes(1)
autoscaler.update()
self.waitForNodes(6) # expected due to batch sizes and concurrency
autoscaler.update()
self.waitForNodes(11)
# Connect the head and workers to end the bringup phase
addrs = self.provider.non_terminated_node_ips(
tag_filters={TAG_RAY_NODE_TYPE: "worker"}, )
addrs += head_ip
for addr in addrs:
lm.update(addr, {"CPU": 2}, {"CPU": 0})
lm.update(addr, {"CPU": 2}, {"CPU": 2})
assert autoscaler.bringup
autoscaler.update()
assert not autoscaler.bringup
autoscaler.update()
self.waitForNodes(1)
# All of the nodes are down. Simulate some load on the head node
lm.update(head_ip, {"CPU": 2}, {"CPU": 0})
autoscaler.update()
self.waitForNodes(6) # expected due to batch sizes and concurrency
autoscaler.update()
self.waitForNodes(11)
def testDelayedLaunch(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()