Take into account queue length in autoscaling (#5684)

This commit is contained in:
Eric Liang
2019-09-11 11:31:35 -07:00
committed by GitHub
parent 9ce6dd9b88
commit 2fdefe19b7
6 changed files with 63 additions and 46 deletions
+13 -15
View File
@@ -10,7 +10,6 @@ import math
import os
import subprocess
import threading
import traceback
import time
from collections import defaultdict
@@ -157,9 +156,11 @@ class LoadMetrics(object):
self.last_heartbeat_time_by_ip = {}
self.static_resources_by_ip = {}
self.dynamic_resources_by_ip = {}
self.resource_load_by_ip = {}
self.local_ip = services.get_node_ip_address()
def update(self, ip, static_resources, dynamic_resources):
def update(self, ip, static_resources, dynamic_resources, resource_load):
self.resource_load_by_ip[ip] = resource_load
self.static_resources_by_ip[ip] = static_resources
# We are not guaranteed to have a corresponding dynamic resource for
@@ -204,6 +205,7 @@ class LoadMetrics(object):
prune(self.last_used_time_by_ip)
prune(self.static_resources_by_ip)
prune(self.dynamic_resources_by_ip)
prune(self.resource_load_by_ip)
prune(self.last_heartbeat_time_by_ip)
def approx_workers_used(self):
@@ -218,7 +220,11 @@ class LoadMetrics(object):
resources_total = {}
for ip, max_resources in self.static_resources_by_ip.items():
avail_resources = self.dynamic_resources_by_ip[ip]
resource_load = self.resource_load_by_ip[ip]
max_frac = 0.0
for resource_id, amount in resource_load.items():
if amount > 0:
max_frac = 1.0 # the resource is saturated
for resource_id, amount in max_resources.items():
used = amount - avail_resources[resource_id]
if resource_id not in resources_used:
@@ -722,19 +728,11 @@ class StandardAutoscaler(object):
def kill_workers(self):
logger.error("StandardAutoscaler: kill_workers triggered")
while True:
try:
nodes = self.workers()
if nodes:
self.provider.terminate_nodes(nodes)
logger.error(
"StandardAutoscaler: terminated {} node(s)".format(
len(nodes)))
except Exception:
traceback.print_exc()
time.sleep(10)
nodes = self.workers()
if nodes:
self.provider.terminate_nodes(nodes)
logger.error("StandardAutoscaler: terminated {} node(s)".format(
len(nodes)))
def typename(v):
+2 -1
View File
@@ -115,7 +115,8 @@ class LogMonitor(object):
log_file_paths = glob.glob("{}/worker*[.out|.err]".format(
self.logs_dir))
# segfaults and other serious errors are logged here
raylet_err_paths = glob.glob("{}/raylet*.err".format(self.logs_dir))
raylet_err_paths = (glob.glob("{}/raylet*.err".format(self.logs_dir)) +
glob.glob("{}/monitor*.err".format(self.logs_dir)))
for file_path in log_file_paths + raylet_err_paths:
if os.path.isfile(
file_path) and file_path not in self.log_filenames:
+5 -1
View File
@@ -108,6 +108,9 @@ class Monitor(object):
message = ray.gcs_utils.HeartbeatBatchTableData.FromString(
heartbeat_data)
for heartbeat_message in message.batch:
resource_load = dict(
zip(heartbeat_message.resource_load_label,
heartbeat_message.resource_load_capacity))
total_resources = dict(
zip(heartbeat_message.resources_total_label,
heartbeat_message.resources_total_capacity))
@@ -122,7 +125,7 @@ class Monitor(object):
ip = self.raylet_id_to_ip_map.get(client_id)
if ip:
self.load_metrics.update(ip, total_resources,
available_resources)
available_resources, resource_load)
else:
logger.warning(
"Monitor: "
@@ -357,6 +360,7 @@ class Monitor(object):
try:
self._run()
except Exception:
logger.exception("Error in monitor loop")
if self.autoscaler:
self.autoscaler.kill_workers()
raise
+33 -22
View File
@@ -142,29 +142,40 @@ SMALL_CLUSTER = {
class LoadMetricsTest(unittest.TestCase):
def testUpdate(self):
lm = LoadMetrics()
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1})
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {})
assert lm.approx_workers_used() == 0.5
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0})
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}, {})
assert lm.approx_workers_used() == 1.0
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 0})
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 0}, {})
assert lm.approx_workers_used() == 2.0
def testLoadMessages(self):
lm = LoadMetrics()
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {})
assert lm.approx_workers_used() == 0.5
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {"CPU": 1})
assert lm.approx_workers_used() == 1.0
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 1}, {})
assert lm.approx_workers_used() == 1.5
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 1}, {"GPU": 1})
assert lm.approx_workers_used() == 2.0
def testPruneByNodeIp(self):
lm = LoadMetrics()
lm.update("1.1.1.1", {"CPU": 1}, {"CPU": 0})
lm.update("2.2.2.2", {"CPU": 1}, {"CPU": 0})
lm.update("1.1.1.1", {"CPU": 1}, {"CPU": 0}, {})
lm.update("2.2.2.2", {"CPU": 1}, {"CPU": 0}, {})
lm.prune_active_ips({"1.1.1.1", "4.4.4.4"})
assert lm.approx_workers_used() == 1.0
def testBottleneckResource(self):
lm = LoadMetrics()
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0})
lm.update("2.2.2.2", {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2})
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}, {})
lm.update("2.2.2.2", {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2}, {})
assert lm.approx_workers_used() == 1.88
def testHeartbeat(self):
lm = LoadMetrics()
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1})
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {})
lm.mark_active("2.2.2.2")
assert "1.1.1.1" in lm.last_heartbeat_time_by_ip
assert "2.2.2.2" in lm.last_heartbeat_time_by_ip
@@ -172,15 +183,15 @@ class LoadMetricsTest(unittest.TestCase):
def testDebugString(self):
lm = LoadMetrics()
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0})
lm.update("2.2.2.2", {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2})
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}, {})
lm.update("2.2.2.2", {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2}, {})
lm.update("3.3.3.3", {
"memory": 20,
"object_store_memory": 40
}, {
"memory": 0,
"object_store_memory": 20
})
}, {})
debug = lm.info_string()
assert ("ResourceUsage=2.0/4.0 CPU, 14.0/16.0 GPU, "
"1.05 GiB/1.05 GiB memory, "
@@ -418,8 +429,8 @@ class AutoscalingTest(unittest.TestCase):
tag_filters={TAG_RAY_NODE_TYPE: "worker"}, )
addrs += head_ip
for addr in addrs:
lm.update(addr, {"CPU": 2}, {"CPU": 0})
lm.update(addr, {"CPU": 2}, {"CPU": 2})
lm.update(addr, {"CPU": 2}, {"CPU": 0}, {})
lm.update(addr, {"CPU": 2}, {"CPU": 2}, {})
assert autoscaler.bringup
autoscaler.update()
@@ -428,7 +439,7 @@ class AutoscalingTest(unittest.TestCase):
self.waitForNodes(1)
# All of the nodes are down. Simulate some load on the head node
lm.update(head_ip, {"CPU": 2}, {"CPU": 0})
lm.update(head_ip, {"CPU": 2}, {"CPU": 0}, {})
autoscaler.update()
self.waitForNodes(6) # expected due to batch sizes and concurrency
@@ -702,17 +713,17 @@ class AutoscalingTest(unittest.TestCase):
# Scales up as nodes are reported as used
local_ip = services.get_node_ip_address()
lm.update(local_ip, {"CPU": 2}, {"CPU": 0}) # head
lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 0}) # worker 1
lm.update(local_ip, {"CPU": 2}, {"CPU": 0}, {}) # head
lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 0}, {}) # worker 1
autoscaler.update()
self.waitForNodes(3)
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 0})
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 0}, {})
autoscaler.update()
self.waitForNodes(5)
# Holds steady when load is removed
lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 2})
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 2})
lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 2}, {})
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 2}, {})
autoscaler.update()
assert autoscaler.num_launches_pending.value == 0
assert len(self.provider.non_terminated_nodes({})) == 5
@@ -746,20 +757,20 @@ class AutoscalingTest(unittest.TestCase):
# Scales up as nodes are reported as used
local_ip = services.get_node_ip_address()
lm.update(local_ip, {"CPU": 2}, {"CPU": 0}) # head
lm.update(local_ip, {"CPU": 2}, {"CPU": 0}, {}) # head
# 1.0 nodes used => target nodes = 2 => target workers = 1
autoscaler.update()
self.waitForNodes(1)
# Make new node idle, and never used.
# Should hold steady as target is still 2.
lm.update("172.0.0.0", {"CPU": 0}, {"CPU": 0})
lm.update("172.0.0.0", {"CPU": 0}, {"CPU": 0}, {})
lm.last_used_time_by_ip["172.0.0.0"] = 0
autoscaler.update()
assert len(self.provider.non_terminated_nodes({})) == 1
# Reduce load on head => target nodes = 1 => target workers = 0
lm.update(local_ip, {"CPU": 2}, {"CPU": 1})
lm.update(local_ip, {"CPU": 2}, {"CPU": 1}, {})
autoscaler.update()
assert len(self.provider.non_terminated_nodes({})) == 0