mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 16:13:54 +08:00
Take into account queue length in autoscaling (#5684)
This commit is contained in:
@@ -10,7 +10,6 @@ import math
|
||||
import os
|
||||
import subprocess
|
||||
import threading
|
||||
import traceback
|
||||
import time
|
||||
from collections import defaultdict
|
||||
|
||||
@@ -157,9 +156,11 @@ class LoadMetrics(object):
|
||||
self.last_heartbeat_time_by_ip = {}
|
||||
self.static_resources_by_ip = {}
|
||||
self.dynamic_resources_by_ip = {}
|
||||
self.resource_load_by_ip = {}
|
||||
self.local_ip = services.get_node_ip_address()
|
||||
|
||||
def update(self, ip, static_resources, dynamic_resources):
|
||||
def update(self, ip, static_resources, dynamic_resources, resource_load):
|
||||
self.resource_load_by_ip[ip] = resource_load
|
||||
self.static_resources_by_ip[ip] = static_resources
|
||||
|
||||
# We are not guaranteed to have a corresponding dynamic resource for
|
||||
@@ -204,6 +205,7 @@ class LoadMetrics(object):
|
||||
prune(self.last_used_time_by_ip)
|
||||
prune(self.static_resources_by_ip)
|
||||
prune(self.dynamic_resources_by_ip)
|
||||
prune(self.resource_load_by_ip)
|
||||
prune(self.last_heartbeat_time_by_ip)
|
||||
|
||||
def approx_workers_used(self):
|
||||
@@ -218,7 +220,11 @@ class LoadMetrics(object):
|
||||
resources_total = {}
|
||||
for ip, max_resources in self.static_resources_by_ip.items():
|
||||
avail_resources = self.dynamic_resources_by_ip[ip]
|
||||
resource_load = self.resource_load_by_ip[ip]
|
||||
max_frac = 0.0
|
||||
for resource_id, amount in resource_load.items():
|
||||
if amount > 0:
|
||||
max_frac = 1.0 # the resource is saturated
|
||||
for resource_id, amount in max_resources.items():
|
||||
used = amount - avail_resources[resource_id]
|
||||
if resource_id not in resources_used:
|
||||
@@ -722,19 +728,11 @@ class StandardAutoscaler(object):
|
||||
|
||||
def kill_workers(self):
|
||||
logger.error("StandardAutoscaler: kill_workers triggered")
|
||||
|
||||
while True:
|
||||
try:
|
||||
nodes = self.workers()
|
||||
if nodes:
|
||||
self.provider.terminate_nodes(nodes)
|
||||
logger.error(
|
||||
"StandardAutoscaler: terminated {} node(s)".format(
|
||||
len(nodes)))
|
||||
except Exception:
|
||||
traceback.print_exc()
|
||||
|
||||
time.sleep(10)
|
||||
nodes = self.workers()
|
||||
if nodes:
|
||||
self.provider.terminate_nodes(nodes)
|
||||
logger.error("StandardAutoscaler: terminated {} node(s)".format(
|
||||
len(nodes)))
|
||||
|
||||
|
||||
def typename(v):
|
||||
|
||||
@@ -115,7 +115,8 @@ class LogMonitor(object):
|
||||
log_file_paths = glob.glob("{}/worker*[.out|.err]".format(
|
||||
self.logs_dir))
|
||||
# segfaults and other serious errors are logged here
|
||||
raylet_err_paths = glob.glob("{}/raylet*.err".format(self.logs_dir))
|
||||
raylet_err_paths = (glob.glob("{}/raylet*.err".format(self.logs_dir)) +
|
||||
glob.glob("{}/monitor*.err".format(self.logs_dir)))
|
||||
for file_path in log_file_paths + raylet_err_paths:
|
||||
if os.path.isfile(
|
||||
file_path) and file_path not in self.log_filenames:
|
||||
|
||||
@@ -108,6 +108,9 @@ class Monitor(object):
|
||||
message = ray.gcs_utils.HeartbeatBatchTableData.FromString(
|
||||
heartbeat_data)
|
||||
for heartbeat_message in message.batch:
|
||||
resource_load = dict(
|
||||
zip(heartbeat_message.resource_load_label,
|
||||
heartbeat_message.resource_load_capacity))
|
||||
total_resources = dict(
|
||||
zip(heartbeat_message.resources_total_label,
|
||||
heartbeat_message.resources_total_capacity))
|
||||
@@ -122,7 +125,7 @@ class Monitor(object):
|
||||
ip = self.raylet_id_to_ip_map.get(client_id)
|
||||
if ip:
|
||||
self.load_metrics.update(ip, total_resources,
|
||||
available_resources)
|
||||
available_resources, resource_load)
|
||||
else:
|
||||
logger.warning(
|
||||
"Monitor: "
|
||||
@@ -357,6 +360,7 @@ class Monitor(object):
|
||||
try:
|
||||
self._run()
|
||||
except Exception:
|
||||
logger.exception("Error in monitor loop")
|
||||
if self.autoscaler:
|
||||
self.autoscaler.kill_workers()
|
||||
raise
|
||||
|
||||
@@ -142,29 +142,40 @@ SMALL_CLUSTER = {
|
||||
class LoadMetricsTest(unittest.TestCase):
|
||||
def testUpdate(self):
|
||||
lm = LoadMetrics()
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1})
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {})
|
||||
assert lm.approx_workers_used() == 0.5
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0})
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}, {})
|
||||
assert lm.approx_workers_used() == 1.0
|
||||
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 0})
|
||||
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 0}, {})
|
||||
assert lm.approx_workers_used() == 2.0
|
||||
|
||||
def testLoadMessages(self):
|
||||
lm = LoadMetrics()
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {})
|
||||
assert lm.approx_workers_used() == 0.5
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {"CPU": 1})
|
||||
assert lm.approx_workers_used() == 1.0
|
||||
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 1}, {})
|
||||
assert lm.approx_workers_used() == 1.5
|
||||
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 1}, {"GPU": 1})
|
||||
assert lm.approx_workers_used() == 2.0
|
||||
|
||||
def testPruneByNodeIp(self):
|
||||
lm = LoadMetrics()
|
||||
lm.update("1.1.1.1", {"CPU": 1}, {"CPU": 0})
|
||||
lm.update("2.2.2.2", {"CPU": 1}, {"CPU": 0})
|
||||
lm.update("1.1.1.1", {"CPU": 1}, {"CPU": 0}, {})
|
||||
lm.update("2.2.2.2", {"CPU": 1}, {"CPU": 0}, {})
|
||||
lm.prune_active_ips({"1.1.1.1", "4.4.4.4"})
|
||||
assert lm.approx_workers_used() == 1.0
|
||||
|
||||
def testBottleneckResource(self):
|
||||
lm = LoadMetrics()
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0})
|
||||
lm.update("2.2.2.2", {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2})
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}, {})
|
||||
lm.update("2.2.2.2", {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2}, {})
|
||||
assert lm.approx_workers_used() == 1.88
|
||||
|
||||
def testHeartbeat(self):
|
||||
lm = LoadMetrics()
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1})
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {})
|
||||
lm.mark_active("2.2.2.2")
|
||||
assert "1.1.1.1" in lm.last_heartbeat_time_by_ip
|
||||
assert "2.2.2.2" in lm.last_heartbeat_time_by_ip
|
||||
@@ -172,15 +183,15 @@ class LoadMetricsTest(unittest.TestCase):
|
||||
|
||||
def testDebugString(self):
|
||||
lm = LoadMetrics()
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0})
|
||||
lm.update("2.2.2.2", {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2})
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}, {})
|
||||
lm.update("2.2.2.2", {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2}, {})
|
||||
lm.update("3.3.3.3", {
|
||||
"memory": 20,
|
||||
"object_store_memory": 40
|
||||
}, {
|
||||
"memory": 0,
|
||||
"object_store_memory": 20
|
||||
})
|
||||
}, {})
|
||||
debug = lm.info_string()
|
||||
assert ("ResourceUsage=2.0/4.0 CPU, 14.0/16.0 GPU, "
|
||||
"1.05 GiB/1.05 GiB memory, "
|
||||
@@ -418,8 +429,8 @@ class AutoscalingTest(unittest.TestCase):
|
||||
tag_filters={TAG_RAY_NODE_TYPE: "worker"}, )
|
||||
addrs += head_ip
|
||||
for addr in addrs:
|
||||
lm.update(addr, {"CPU": 2}, {"CPU": 0})
|
||||
lm.update(addr, {"CPU": 2}, {"CPU": 2})
|
||||
lm.update(addr, {"CPU": 2}, {"CPU": 0}, {})
|
||||
lm.update(addr, {"CPU": 2}, {"CPU": 2}, {})
|
||||
assert autoscaler.bringup
|
||||
autoscaler.update()
|
||||
|
||||
@@ -428,7 +439,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
self.waitForNodes(1)
|
||||
|
||||
# All of the nodes are down. Simulate some load on the head node
|
||||
lm.update(head_ip, {"CPU": 2}, {"CPU": 0})
|
||||
lm.update(head_ip, {"CPU": 2}, {"CPU": 0}, {})
|
||||
|
||||
autoscaler.update()
|
||||
self.waitForNodes(6) # expected due to batch sizes and concurrency
|
||||
@@ -702,17 +713,17 @@ class AutoscalingTest(unittest.TestCase):
|
||||
|
||||
# Scales up as nodes are reported as used
|
||||
local_ip = services.get_node_ip_address()
|
||||
lm.update(local_ip, {"CPU": 2}, {"CPU": 0}) # head
|
||||
lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 0}) # worker 1
|
||||
lm.update(local_ip, {"CPU": 2}, {"CPU": 0}, {}) # head
|
||||
lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 0}, {}) # worker 1
|
||||
autoscaler.update()
|
||||
self.waitForNodes(3)
|
||||
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 0})
|
||||
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 0}, {})
|
||||
autoscaler.update()
|
||||
self.waitForNodes(5)
|
||||
|
||||
# Holds steady when load is removed
|
||||
lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 2})
|
||||
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 2})
|
||||
lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 2}, {})
|
||||
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 2}, {})
|
||||
autoscaler.update()
|
||||
assert autoscaler.num_launches_pending.value == 0
|
||||
assert len(self.provider.non_terminated_nodes({})) == 5
|
||||
@@ -746,20 +757,20 @@ class AutoscalingTest(unittest.TestCase):
|
||||
|
||||
# Scales up as nodes are reported as used
|
||||
local_ip = services.get_node_ip_address()
|
||||
lm.update(local_ip, {"CPU": 2}, {"CPU": 0}) # head
|
||||
lm.update(local_ip, {"CPU": 2}, {"CPU": 0}, {}) # head
|
||||
# 1.0 nodes used => target nodes = 2 => target workers = 1
|
||||
autoscaler.update()
|
||||
self.waitForNodes(1)
|
||||
|
||||
# Make new node idle, and never used.
|
||||
# Should hold steady as target is still 2.
|
||||
lm.update("172.0.0.0", {"CPU": 0}, {"CPU": 0})
|
||||
lm.update("172.0.0.0", {"CPU": 0}, {"CPU": 0}, {})
|
||||
lm.last_used_time_by_ip["172.0.0.0"] = 0
|
||||
autoscaler.update()
|
||||
assert len(self.provider.non_terminated_nodes({})) == 1
|
||||
|
||||
# Reduce load on head => target nodes = 1 => target workers = 0
|
||||
lm.update(local_ip, {"CPU": 2}, {"CPU": 1})
|
||||
lm.update(local_ip, {"CPU": 2}, {"CPU": 1}, {})
|
||||
autoscaler.update()
|
||||
assert len(self.provider.non_terminated_nodes({})) == 0
|
||||
|
||||
|
||||
Reference in New Issue
Block a user