mirror of
https://github.com/wassname/ray.git
synced 2026-06-30 12:55:34 +08:00
[GCS]Use direct getting instead of pub-sub to update load metrics in monitor.py (#11339)
This commit is contained in:
@@ -33,22 +33,13 @@ class LoadMetrics:
|
||||
def update(self,
|
||||
ip: str,
|
||||
static_resources: Dict[str, Dict],
|
||||
update_dynamic_resources: bool,
|
||||
dynamic_resources: Dict[str, Dict],
|
||||
update_resource_load: bool,
|
||||
resource_load: Dict[str, Dict],
|
||||
waiting_bundles: List[Dict[str, float]] = None,
|
||||
infeasible_bundles: List[Dict[str, float]] = None,
|
||||
pending_placement_groups: List[PlacementGroupTableData] = None):
|
||||
# If light heartbeat enabled, only resources changed will be received.
|
||||
# We should update the changed part and compare static_resources with
|
||||
# dynamic_resources using those updated.
|
||||
if ip not in self.static_resources_by_ip or len(static_resources) > 0:
|
||||
self.static_resources_by_ip[ip] = static_resources
|
||||
if ip not in self.dynamic_resources_by_ip or update_dynamic_resources:
|
||||
self.dynamic_resources_by_ip[ip] = dynamic_resources
|
||||
if ip not in self.resource_load_by_ip or update_resource_load:
|
||||
self.resource_load_by_ip[ip] = resource_load
|
||||
self.resource_load_by_ip[ip] = resource_load
|
||||
self.static_resources_by_ip[ip] = static_resources
|
||||
|
||||
if not waiting_bundles:
|
||||
waiting_bundles = []
|
||||
@@ -61,7 +52,7 @@ class LoadMetrics:
|
||||
# for every static resource because dynamic resources are based on
|
||||
# the available resources in the heartbeat, which does not exist
|
||||
# if it is zero. Thus, we have to update dynamic resources here.
|
||||
dynamic_resources_update = self.dynamic_resources_by_ip[ip].copy()
|
||||
dynamic_resources_update = dynamic_resources.copy()
|
||||
for resource_name, capacity in self.static_resources_by_ip[ip].items():
|
||||
if resource_name not in dynamic_resources_update:
|
||||
dynamic_resources_update[resource_name] = 0.0
|
||||
|
||||
@@ -23,6 +23,7 @@ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
|
||||
c_vector[c_string] GetAllProfileInfo()
|
||||
c_vector[c_string] GetAllObjectInfo()
|
||||
unique_ptr[c_string] GetObjectInfo(const CObjectID &object_id)
|
||||
unique_ptr[c_string] GetAllHeartbeat()
|
||||
c_vector[c_string] GetAllActorInfo()
|
||||
unique_ptr[c_string] GetActorInfo(const CActorID &actor_id)
|
||||
c_string GetNodeResourceInfo(const CNodeID &node_id)
|
||||
|
||||
@@ -78,6 +78,15 @@ cdef class GlobalStateAccessor:
|
||||
return c_string(object_info.get().data(), object_info.get().size())
|
||||
return None
|
||||
|
||||
def get_all_heartbeat(self):
|
||||
"""Get newest heartbeat of all nodes from GCS service."""
|
||||
cdef unique_ptr[c_string] result
|
||||
with nogil:
|
||||
result = self.inner.get().GetAllHeartbeat()
|
||||
if result:
|
||||
return c_string(result.get().data(), result.get().size())
|
||||
return None
|
||||
|
||||
def get_actor_table(self):
|
||||
cdef c_vector[c_string] result
|
||||
with nogil:
|
||||
|
||||
+27
-31
@@ -8,11 +8,14 @@ import json
|
||||
import ray
|
||||
from ray.autoscaler._private.autoscaler import StandardAutoscaler
|
||||
from ray.autoscaler._private.commands import teardown_cluster
|
||||
from ray.autoscaler._private.constants import AUTOSCALER_UPDATE_INTERVAL_S
|
||||
from ray.autoscaler._private.load_metrics import LoadMetrics
|
||||
import ray.gcs_utils
|
||||
import ray.utils
|
||||
import ray.ray_constants as ray_constants
|
||||
from ray.utils import binary_to_hex, setup_logger
|
||||
from ray._raylet import GlobalStateAccessor
|
||||
|
||||
import redis
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -75,6 +78,9 @@ class Monitor:
|
||||
redis_address, redis_password=redis_password)
|
||||
self.redis = ray._private.services.create_redis_client(
|
||||
redis_address, password=redis_password)
|
||||
self.global_state_accessor = GlobalStateAccessor(
|
||||
redis_address, redis_password, False)
|
||||
self.global_state_accessor.connect()
|
||||
# Set the redis client and mode so _internal_kv works for autoscaler.
|
||||
worker = ray.worker.global_worker
|
||||
worker.redis_client = self.redis
|
||||
@@ -85,7 +91,6 @@ class Monitor:
|
||||
# Keep a mapping from raylet client ID to IP address to use
|
||||
# for updating the load metrics.
|
||||
self.raylet_id_to_ip_map = {}
|
||||
self.light_heartbeat_enabled = ray._config.light_heartbeat_enabled()
|
||||
self.load_metrics = LoadMetrics()
|
||||
if autoscaling_config:
|
||||
self.autoscaler = StandardAutoscaler(autoscaling_config,
|
||||
@@ -104,6 +109,9 @@ class Monitor:
|
||||
primary_subscribe_client = None
|
||||
if primary_subscribe_client is not None:
|
||||
primary_subscribe_client.close()
|
||||
if self.global_state_accessor is not None:
|
||||
self.global_state_accessor.disconnect()
|
||||
self.global_state_accessor = None
|
||||
|
||||
def subscribe(self, channel):
|
||||
"""Subscribe to the given channel on the primary Redis shard.
|
||||
@@ -127,38 +135,29 @@ class Monitor:
|
||||
"""
|
||||
self.primary_subscribe_client.psubscribe(pattern)
|
||||
|
||||
def xray_heartbeat_batch_handler(self, unused_channel, data):
|
||||
"""Handle an xray heartbeat batch message from Redis."""
|
||||
|
||||
pub_message = ray.gcs_utils.PubSubMessage.FromString(data)
|
||||
heartbeat_data = pub_message.data
|
||||
|
||||
message = ray.gcs_utils.HeartbeatBatchTableData.FromString(
|
||||
heartbeat_data)
|
||||
for heartbeat_message in message.batch:
|
||||
def get_all_heartbeat(self):
|
||||
all_heartbeat = self.global_state_accessor.get_all_heartbeat()
|
||||
heartbeat_batch_data = \
|
||||
ray.gcs_utils.HeartbeatBatchTableData.FromString(all_heartbeat)
|
||||
for heartbeat_message in heartbeat_batch_data.batch:
|
||||
resource_load = dict(heartbeat_message.resource_load)
|
||||
total_resources = dict(heartbeat_message.resources_total)
|
||||
available_resources = dict(heartbeat_message.resources_available)
|
||||
|
||||
waiting_bundles, infeasible_bundles = \
|
||||
parse_resource_demands(message.resource_load_by_shape)
|
||||
waiting_bundles, infeasible_bundles = parse_resource_demands(
|
||||
heartbeat_batch_data.resource_load_by_shape)
|
||||
|
||||
pending_placement_groups = list(
|
||||
message.placement_group_load.placement_group_data)
|
||||
heartbeat_batch_data.placement_group_load.placement_group_data)
|
||||
|
||||
# Update the load metrics for this raylet.
|
||||
client_id = ray.utils.binary_to_hex(heartbeat_message.client_id)
|
||||
ip = self.raylet_id_to_ip_map.get(client_id)
|
||||
if ip:
|
||||
update_available_resources = not self.light_heartbeat_enabled \
|
||||
or heartbeat_message.resources_available_changed()
|
||||
update_resource_load = not self.light_heartbeat_enabled \
|
||||
or heartbeat_message.resource_load_changed()
|
||||
self.load_metrics.update(
|
||||
ip, total_resources, update_available_resources,
|
||||
available_resources, update_resource_load, resource_load,
|
||||
waiting_bundles, infeasible_bundles,
|
||||
pending_placement_groups)
|
||||
self.load_metrics.update(ip, total_resources,
|
||||
available_resources, resource_load,
|
||||
waiting_bundles, infeasible_bundles,
|
||||
pending_placement_groups)
|
||||
else:
|
||||
logger.warning(
|
||||
f"Monitor: could not find ip for client {client_id}")
|
||||
@@ -227,10 +226,7 @@ class Monitor:
|
||||
data = message["data"]
|
||||
|
||||
# Determine the appropriate message handler.
|
||||
if pattern == ray.gcs_utils.XRAY_HEARTBEAT_BATCH_PATTERN:
|
||||
# Similar functionality as raylet info channel
|
||||
message_handler = self.xray_heartbeat_batch_handler
|
||||
elif pattern == ray.gcs_utils.XRAY_JOB_PATTERN:
|
||||
if pattern == ray.gcs_utils.XRAY_JOB_PATTERN:
|
||||
# Handles driver death.
|
||||
message_handler = self.xray_job_notification_handler
|
||||
elif (channel ==
|
||||
@@ -269,8 +265,8 @@ class Monitor:
|
||||
# Initialize the mapping from raylet client ID to IP address.
|
||||
self.update_raylet_map()
|
||||
|
||||
self.get_all_heartbeat()
|
||||
# Initialize the subscription channel.
|
||||
self.psubscribe(ray.gcs_utils.XRAY_HEARTBEAT_BATCH_PATTERN)
|
||||
self.psubscribe(ray.gcs_utils.XRAY_JOB_PATTERN)
|
||||
|
||||
if self.autoscaler:
|
||||
@@ -288,13 +284,13 @@ class Monitor:
|
||||
self.update_raylet_map()
|
||||
self.autoscaler.update()
|
||||
|
||||
self.get_all_heartbeat()
|
||||
# Process a round of messages.
|
||||
self.process_messages()
|
||||
|
||||
# Wait for a heartbeat interval before processing the next round of
|
||||
# messages.
|
||||
time.sleep(
|
||||
ray._config.raylet_heartbeat_timeout_milliseconds() * 1e-3)
|
||||
# Wait for a autoscaler update interval before processing the next
|
||||
# round of messages.
|
||||
time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
|
||||
|
||||
def destroy_autoscaler_workers(self):
|
||||
"""Cleanup the autoscaler, in case of an exception in the run() method.
|
||||
|
||||
@@ -256,101 +256,61 @@ SMALL_CLUSTER = {
|
||||
class LoadMetricsTest(unittest.TestCase):
|
||||
def testUpdate(self):
|
||||
lm = LoadMetrics()
|
||||
lm.update("1.1.1.1", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {})
|
||||
assert lm.approx_workers_used() == 0.5
|
||||
lm.update("1.1.1.1", {"CPU": 2}, True, {"CPU": 0}, True, {})
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}, {})
|
||||
assert lm.approx_workers_used() == 1.0
|
||||
lm.update("2.2.2.2", {"CPU": 2}, True, {"CPU": 0}, True, {})
|
||||
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 0}, {})
|
||||
assert lm.approx_workers_used() == 2.0
|
||||
|
||||
def testLoadMessages(self):
|
||||
lm = LoadMetrics()
|
||||
lm.update("1.1.1.1", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {})
|
||||
self.assertEqual(lm.approx_workers_used(), 0.5)
|
||||
lm.update("1.1.1.1", {"CPU": 2}, True, {"CPU": 1}, True, {"CPU": 1})
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {"CPU": 1})
|
||||
self.assertEqual(lm.approx_workers_used(), 1.0)
|
||||
|
||||
# Both nodes count as busy since there is a queue on one.
|
||||
lm.update("2.2.2.2", {"CPU": 2}, True, {"CPU": 2}, True, {})
|
||||
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 2}, {})
|
||||
self.assertEqual(lm.approx_workers_used(), 2.0)
|
||||
lm.update("2.2.2.2", {"CPU": 2}, True, {"CPU": 0}, True, {})
|
||||
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 0}, {})
|
||||
self.assertEqual(lm.approx_workers_used(), 2.0)
|
||||
lm.update("2.2.2.2", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 1}, {})
|
||||
self.assertEqual(lm.approx_workers_used(), 2.0)
|
||||
|
||||
# No queue anymore, so we're back to exact accounting.
|
||||
lm.update("1.1.1.1", {"CPU": 2}, True, {"CPU": 0}, True, {})
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}, {})
|
||||
self.assertEqual(lm.approx_workers_used(), 1.5)
|
||||
lm.update("2.2.2.2", {"CPU": 2}, True, {"CPU": 1}, True, {"GPU": 1})
|
||||
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 1}, {"GPU": 1})
|
||||
self.assertEqual(lm.approx_workers_used(), 2.0)
|
||||
|
||||
lm.update("3.3.3.3", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("4.3.3.3", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("5.3.3.3", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("6.3.3.3", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("7.3.3.3", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("8.3.3.3", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("3.3.3.3", {"CPU": 2}, {"CPU": 1}, {})
|
||||
lm.update("4.3.3.3", {"CPU": 2}, {"CPU": 1}, {})
|
||||
lm.update("5.3.3.3", {"CPU": 2}, {"CPU": 1}, {})
|
||||
lm.update("6.3.3.3", {"CPU": 2}, {"CPU": 1}, {})
|
||||
lm.update("7.3.3.3", {"CPU": 2}, {"CPU": 1}, {})
|
||||
lm.update("8.3.3.3", {"CPU": 2}, {"CPU": 1}, {})
|
||||
self.assertEqual(lm.approx_workers_used(), 8.0)
|
||||
|
||||
lm.update("2.2.2.2", {"CPU": 2}, True, {"CPU": 1}, True,
|
||||
{}) # no queue anymore
|
||||
self.assertEqual(lm.approx_workers_used(), 4.5)
|
||||
|
||||
def testLoadMessagesWithLightHeartbeat(self):
|
||||
lm = LoadMetrics()
|
||||
lm.update("1.1.1.1", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
self.assertEqual(lm.approx_workers_used(), 0.5)
|
||||
lm.update("1.1.1.1", {}, False, {}, True, {"CPU": 1})
|
||||
self.assertEqual(lm.approx_workers_used(), 1.0)
|
||||
|
||||
# Both nodes count as busy since there is a queue on one.
|
||||
lm.update("2.2.2.2", {"CPU": 2}, True, {"CPU": 2}, True, {})
|
||||
self.assertEqual(lm.approx_workers_used(), 2.0)
|
||||
lm.update("2.2.2.2", {}, True, {"CPU": 0}, False, {})
|
||||
self.assertEqual(lm.approx_workers_used(), 2.0)
|
||||
lm.update("2.2.2.2", {}, True, {"CPU": 1}, False, {})
|
||||
self.assertEqual(lm.approx_workers_used(), 2.0)
|
||||
|
||||
# No queue anymore, so we're back to exact accounting.
|
||||
lm.update("1.1.1.1", {}, True, {"CPU": 0}, True, {})
|
||||
self.assertEqual(lm.approx_workers_used(), 1.5)
|
||||
lm.update("2.2.2.2", {}, False, {}, True, {"GPU": 1})
|
||||
self.assertEqual(lm.approx_workers_used(), 2.0)
|
||||
|
||||
lm.update("3.3.3.3", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("4.3.3.3", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("5.3.3.3", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("6.3.3.3", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("7.3.3.3", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("8.3.3.3", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
self.assertEqual(lm.approx_workers_used(), 8.0)
|
||||
|
||||
lm.update("2.2.2.2", {}, False, {"CPU": 1}, True,
|
||||
{}) # no queue anymore
|
||||
lm.update("2.2.2.2", {"CPU": 2}, {"CPU": 1}, {}) # no queue anymore
|
||||
self.assertEqual(lm.approx_workers_used(), 4.5)
|
||||
|
||||
def testPruneByNodeIp(self):
|
||||
lm = LoadMetrics()
|
||||
lm.update("1.1.1.1", {"CPU": 1}, True, {"CPU": 0}, True, {})
|
||||
lm.update("2.2.2.2", {"CPU": 1}, True, {"CPU": 0}, True, {})
|
||||
lm.update("1.1.1.1", {"CPU": 1}, {"CPU": 0}, {})
|
||||
lm.update("2.2.2.2", {"CPU": 1}, {"CPU": 0}, {})
|
||||
lm.prune_active_ips({"1.1.1.1", "4.4.4.4"})
|
||||
assert lm.approx_workers_used() == 1.0
|
||||
|
||||
def testBottleneckResource(self):
|
||||
lm = LoadMetrics()
|
||||
lm.update("1.1.1.1", {"CPU": 2}, True, {"CPU": 0}, True, {})
|
||||
lm.update("2.2.2.2", {
|
||||
"CPU": 2,
|
||||
"GPU": 16
|
||||
}, True, {
|
||||
"CPU": 2,
|
||||
"GPU": 2
|
||||
}, True, {})
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}, {})
|
||||
lm.update("2.2.2.2", {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2}, {})
|
||||
assert lm.approx_workers_used() == 1.88
|
||||
|
||||
def testHeartbeat(self):
|
||||
lm = LoadMetrics()
|
||||
lm.update("1.1.1.1", {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {})
|
||||
lm.mark_active("2.2.2.2")
|
||||
assert "1.1.1.1" in lm.last_heartbeat_time_by_ip
|
||||
assert "2.2.2.2" in lm.last_heartbeat_time_by_ip
|
||||
@@ -358,21 +318,15 @@ class LoadMetricsTest(unittest.TestCase):
|
||||
|
||||
def testDebugString(self):
|
||||
lm = LoadMetrics()
|
||||
lm.update("1.1.1.1", {"CPU": 2}, True, {"CPU": 0}, True, {})
|
||||
lm.update("2.2.2.2", {
|
||||
"CPU": 2,
|
||||
"GPU": 16
|
||||
}, True, {
|
||||
"CPU": 2,
|
||||
"GPU": 2
|
||||
}, True, {})
|
||||
lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 0}, {})
|
||||
lm.update("2.2.2.2", {"CPU": 2, "GPU": 16}, {"CPU": 2, "GPU": 2}, {})
|
||||
lm.update("3.3.3.3", {
|
||||
"memory": 20,
|
||||
"object_store_memory": 40
|
||||
}, True, {
|
||||
}, {
|
||||
"memory": 0,
|
||||
"object_store_memory": 20
|
||||
}, True, {})
|
||||
}, {})
|
||||
debug = lm.info_string()
|
||||
assert ("ResourceUsage: 2.0/4.0 CPU, 14.0/16.0 GPU, "
|
||||
"1.05 GiB/1.05 GiB memory, "
|
||||
@@ -759,8 +713,8 @@ class AutoscalingTest(unittest.TestCase):
|
||||
tag_filters={TAG_RAY_NODE_KIND: "worker"}, )
|
||||
addrs += head_ip
|
||||
for addr in addrs:
|
||||
lm.update(addr, {"CPU": 2}, True, {"CPU": 0}, True, {})
|
||||
lm.update(addr, {"CPU": 2}, True, {"CPU": 2}, True, {})
|
||||
lm.update(addr, {"CPU": 2}, {"CPU": 0}, {})
|
||||
lm.update(addr, {"CPU": 2}, {"CPU": 2}, {})
|
||||
assert autoscaler.bringup
|
||||
autoscaler.update()
|
||||
|
||||
@@ -769,7 +723,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
self.waitForNodes(1)
|
||||
|
||||
# All of the nodes are down. Simulate some load on the head node
|
||||
lm.update(head_ip, {"CPU": 2}, True, {"CPU": 0}, True, {})
|
||||
lm.update(head_ip, {"CPU": 2}, {"CPU": 0}, {})
|
||||
|
||||
autoscaler.update()
|
||||
self.waitForNodes(6) # expected due to batch sizes and concurrency
|
||||
@@ -812,12 +766,12 @@ class AutoscalingTest(unittest.TestCase):
|
||||
autoscaler.update()
|
||||
self.waitForNodes(2)
|
||||
# This node has num_cpus=0
|
||||
lm.update(head_ip, {"CPU": 1}, True, {"CPU": 0}, True, {})
|
||||
lm.update(unmanaged_ip, {"CPU": 0}, True, {"CPU": 0}, True, {})
|
||||
lm.update(head_ip, {"CPU": 1}, {"CPU": 0}, {})
|
||||
lm.update(unmanaged_ip, {"CPU": 0}, {"CPU": 0}, {})
|
||||
autoscaler.update()
|
||||
self.waitForNodes(2)
|
||||
# 1 CPU task cannot be scheduled.
|
||||
lm.update(unmanaged_ip, {"CPU": 0}, True, {"CPU": 0}, True, {"CPU": 1})
|
||||
lm.update(unmanaged_ip, {"CPU": 0}, {"CPU": 0}, {"CPU": 1})
|
||||
autoscaler.update()
|
||||
self.waitForNodes(3)
|
||||
|
||||
@@ -856,8 +810,8 @@ class AutoscalingTest(unittest.TestCase):
|
||||
process_runner=runner,
|
||||
update_interval_s=0)
|
||||
|
||||
lm.update(head_ip, {"CPU": 1}, True, {"CPU": 0}, True, {"CPU": 1})
|
||||
lm.update(unmanaged_ip, {"CPU": 0}, True, {"CPU": 0}, True, {})
|
||||
lm.update(head_ip, {"CPU": 1}, {"CPU": 0}, {"CPU": 1})
|
||||
lm.update(unmanaged_ip, {"CPU": 0}, {"CPU": 0}, {})
|
||||
|
||||
# Note that we shouldn't autoscale here because the resource demand
|
||||
# vector is not set and target utilization fraction = 1.
|
||||
@@ -1153,18 +1107,17 @@ class AutoscalingTest(unittest.TestCase):
|
||||
|
||||
# Scales up as nodes are reported as used
|
||||
local_ip = services.get_node_ip_address()
|
||||
lm.update(local_ip, {"CPU": 2}, True, {"CPU": 0}, True, {}) # head
|
||||
lm.update("172.0.0.0", {"CPU": 2}, True, {"CPU": 0}, True,
|
||||
{}) # worker 1
|
||||
lm.update(local_ip, {"CPU": 2}, {"CPU": 0}, {}) # head
|
||||
lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 0}, {}) # worker 1
|
||||
autoscaler.update()
|
||||
self.waitForNodes(3)
|
||||
lm.update("172.0.0.1", {"CPU": 2}, True, {"CPU": 0}, True, {})
|
||||
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 0}, {})
|
||||
autoscaler.update()
|
||||
self.waitForNodes(5)
|
||||
|
||||
# Holds steady when load is removed
|
||||
lm.update("172.0.0.0", {"CPU": 2}, True, {"CPU": 2}, True, {})
|
||||
lm.update("172.0.0.1", {"CPU": 2}, True, {"CPU": 2}, True, {})
|
||||
lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 2}, {})
|
||||
lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 2}, {})
|
||||
autoscaler.update()
|
||||
assert autoscaler.pending_launches.value == 0
|
||||
assert len(self.provider.non_terminated_nodes({})) == 5
|
||||
@@ -1203,20 +1156,20 @@ class AutoscalingTest(unittest.TestCase):
|
||||
|
||||
# Scales up as nodes are reported as used
|
||||
local_ip = services.get_node_ip_address()
|
||||
lm.update(local_ip, {"CPU": 2}, True, {"CPU": 0}, True, {}) # head
|
||||
lm.update(local_ip, {"CPU": 2}, {"CPU": 0}, {}) # head
|
||||
# 1.0 nodes used => target nodes = 2 => target workers = 1
|
||||
autoscaler.update()
|
||||
self.waitForNodes(1)
|
||||
|
||||
# Make new node idle, and never used.
|
||||
# Should hold steady as target is still 2.
|
||||
lm.update("172.0.0.0", {"CPU": 0}, True, {"CPU": 0}, True, {})
|
||||
lm.update("172.0.0.0", {"CPU": 0}, {"CPU": 0}, {})
|
||||
lm.last_used_time_by_ip["172.0.0.0"] = 0
|
||||
autoscaler.update()
|
||||
assert len(self.provider.non_terminated_nodes({})) == 1
|
||||
|
||||
# Reduce load on head => target nodes = 1 => target workers = 0
|
||||
lm.update(local_ip, {"CPU": 2}, True, {"CPU": 1}, True, {})
|
||||
lm.update(local_ip, {"CPU": 2}, {"CPU": 1}, {})
|
||||
autoscaler.update()
|
||||
assert len(self.provider.non_terminated_nodes({})) == 0
|
||||
|
||||
|
||||
@@ -9,6 +9,8 @@ import ray
|
||||
import ray.ray_constants
|
||||
import ray.test_utils
|
||||
|
||||
from ray._raylet import GlobalStateAccessor
|
||||
|
||||
|
||||
# TODO(rliaw): The proper way to do this is to have the pytest config setup.
|
||||
@pytest.mark.skipif(
|
||||
@@ -142,11 +144,9 @@ def test_load_report(shutdown_only, max_shapes):
|
||||
_system_config={
|
||||
"max_resource_shapes_per_load_report": max_shapes,
|
||||
})
|
||||
redis = ray._private.services.create_redis_client(
|
||||
cluster["redis_address"],
|
||||
password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
|
||||
client = redis.pubsub(ignore_subscribe_messages=True)
|
||||
client.psubscribe(ray.gcs_utils.XRAY_HEARTBEAT_BATCH_PATTERN)
|
||||
global_state_accessor = GlobalStateAccessor(
|
||||
cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD)
|
||||
global_state_accessor.connect()
|
||||
|
||||
@ray.remote
|
||||
def sleep():
|
||||
@@ -163,22 +163,12 @@ def test_load_report(shutdown_only, max_shapes):
|
||||
self.report = None
|
||||
|
||||
def check_load_report(self):
|
||||
try:
|
||||
message = client.get_message()
|
||||
except redis.exceptions.ConnectionError:
|
||||
pass
|
||||
message = global_state_accessor.get_all_heartbeat()
|
||||
if message is None:
|
||||
return False
|
||||
|
||||
pattern = message["pattern"]
|
||||
data = message["data"]
|
||||
if pattern != ray.gcs_utils.XRAY_HEARTBEAT_BATCH_PATTERN:
|
||||
return False
|
||||
|
||||
pub_message = ray.gcs_utils.PubSubMessage.FromString(data)
|
||||
heartbeat_data = pub_message.data
|
||||
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(
|
||||
heartbeat_data)
|
||||
message)
|
||||
self.report = heartbeat.resource_load_by_shape.resource_demands
|
||||
if max_shapes == 0:
|
||||
return True
|
||||
@@ -212,7 +202,7 @@ def test_load_report(shutdown_only, max_shapes):
|
||||
else:
|
||||
assert demand.num_ready_requests_queued > 0
|
||||
assert demand.num_infeasible_requests_queued == 0
|
||||
client.close()
|
||||
global_state_accessor.disconnect()
|
||||
|
||||
|
||||
def test_placement_group_load_report(ray_start_cluster):
|
||||
@@ -220,12 +210,9 @@ def test_placement_group_load_report(ray_start_cluster):
|
||||
# Add a head node that doesn't have gpu resource.
|
||||
cluster.add_node(num_cpus=4)
|
||||
ray.init(address=cluster.address)
|
||||
redis = ray._private.services.create_redis_client(
|
||||
cluster.address, password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
|
||||
redis = ray._private.services.create_redis_client(
|
||||
cluster.address, password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
|
||||
client = redis.pubsub(ignore_subscribe_messages=True)
|
||||
client.psubscribe(ray.gcs_utils.XRAY_HEARTBEAT_BATCH_PATTERN)
|
||||
global_state_accessor = GlobalStateAccessor(
|
||||
cluster.address, ray.ray_constants.REDIS_DEFAULT_PASSWORD)
|
||||
global_state_accessor.connect()
|
||||
|
||||
class PgLoadChecker:
|
||||
def nothing_is_ready(self):
|
||||
@@ -256,21 +243,12 @@ def test_placement_group_load_report(ray_start_cluster):
|
||||
return False
|
||||
|
||||
def _read_heartbeat(self):
|
||||
try:
|
||||
message = client.get_message()
|
||||
except redis.exceptions.ConnectionError:
|
||||
pass
|
||||
message = global_state_accessor.get_all_heartbeat()
|
||||
if message is None:
|
||||
return None
|
||||
return False
|
||||
|
||||
pattern = message["pattern"]
|
||||
data = message["data"]
|
||||
if pattern != ray.gcs_utils.XRAY_HEARTBEAT_BATCH_PATTERN:
|
||||
return None
|
||||
pub_message = ray.gcs_utils.PubSubMessage.FromString(data)
|
||||
heartbeat_data = pub_message.data
|
||||
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(
|
||||
heartbeat_data)
|
||||
message)
|
||||
return heartbeat
|
||||
|
||||
checker = PgLoadChecker()
|
||||
@@ -292,7 +270,7 @@ def test_placement_group_load_report(ray_start_cluster):
|
||||
_, unready = ray.wait([pg_infeasible_second.ready()], timeout=0)
|
||||
assert len(unready) == 1
|
||||
ray.test_utils.wait_for_condition(checker.two_infeasible_pg)
|
||||
client.close()
|
||||
global_state_accessor.disconnect()
|
||||
|
||||
|
||||
def test_backlog_report(shutdown_only):
|
||||
@@ -300,11 +278,9 @@ def test_backlog_report(shutdown_only):
|
||||
num_cpus=1, _system_config={
|
||||
"report_worker_backlog": True,
|
||||
})
|
||||
redis = ray._private.services.create_redis_client(
|
||||
cluster["redis_address"],
|
||||
password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)
|
||||
client = redis.pubsub(ignore_subscribe_messages=True)
|
||||
client.psubscribe(ray.gcs_utils.XRAY_HEARTBEAT_BATCH_PATTERN)
|
||||
global_state_accessor = GlobalStateAccessor(
|
||||
cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD)
|
||||
global_state_accessor.connect()
|
||||
|
||||
@ray.remote(num_cpus=1)
|
||||
def foo(x):
|
||||
@@ -313,21 +289,13 @@ def test_backlog_report(shutdown_only):
|
||||
return None
|
||||
|
||||
def backlog_size_set():
|
||||
try:
|
||||
raw_message = client.get_message()
|
||||
except Exception:
|
||||
return False
|
||||
if raw_message is None:
|
||||
message = global_state_accessor.get_all_heartbeat()
|
||||
if message is None:
|
||||
return False
|
||||
|
||||
data = raw_message["data"]
|
||||
pub_message = ray.gcs_utils.PubSubMessage.FromString(data)
|
||||
heartbeat_data = pub_message.data
|
||||
|
||||
message = ray.gcs_utils.HeartbeatBatchTableData.FromString(
|
||||
heartbeat_data)
|
||||
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
|
||||
aggregate_resource_load = \
|
||||
message.resource_load_by_shape.resource_demands
|
||||
heartbeat.resource_load_by_shape.resource_demands
|
||||
if len(aggregate_resource_load) == 1:
|
||||
backlog_size = aggregate_resource_load[0].backlog_size
|
||||
print(backlog_size)
|
||||
@@ -349,6 +317,7 @@ def test_backlog_report(shutdown_only):
|
||||
# request is sent to the raylet with backlog=7
|
||||
|
||||
ray.test_utils.wait_for_condition(backlog_size_set, timeout=2)
|
||||
global_state_accessor.disconnect()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -67,14 +67,14 @@ def test_system_config(ray_start_cluster_head):
|
||||
def setup_monitor(address):
|
||||
monitor = Monitor(
|
||||
address, None, redis_password=ray_constants.REDIS_DEFAULT_PASSWORD)
|
||||
monitor.psubscribe(ray.gcs_utils.XRAY_HEARTBEAT_BATCH_PATTERN)
|
||||
monitor.psubscribe(ray.gcs_utils.XRAY_JOB_PATTERN) # TODO: Remove?
|
||||
monitor.update_raylet_map(_append_port=True)
|
||||
monitor.psubscribe(ray.gcs_utils.XRAY_JOB_PATTERN) # TODO: Remove?
|
||||
return monitor
|
||||
|
||||
|
||||
def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30):
|
||||
while True:
|
||||
monitor.get_all_heartbeat()
|
||||
monitor.process_messages()
|
||||
resource_usage = monitor.load_metrics._get_resource_usage()
|
||||
|
||||
|
||||
@@ -614,9 +614,7 @@ class LoadMetricsTest(unittest.TestCase):
|
||||
def testResourceDemandVector(self):
|
||||
lm = LoadMetrics()
|
||||
lm.update(
|
||||
"1.1.1.1", {"CPU": 2},
|
||||
True, {"CPU": 1},
|
||||
True, {},
|
||||
"1.1.1.1", {"CPU": 2}, {"CPU": 1}, {},
|
||||
waiting_bundles=[{
|
||||
"GPU": 1
|
||||
}],
|
||||
@@ -642,9 +640,7 @@ class LoadMetricsTest(unittest.TestCase):
|
||||
bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
|
||||
]
|
||||
lm.update(
|
||||
"1.1.1.1", {},
|
||||
True, {},
|
||||
True, {},
|
||||
"1.1.1.1", {}, {}, {},
|
||||
pending_placement_groups=pending_placement_groups)
|
||||
assert lm.get_pending_placement_groups() == pending_placement_groups
|
||||
|
||||
@@ -773,9 +769,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
"GPU_group_6c2506ac733bc37496295b02c4fad446": 0.0101
|
||||
}]
|
||||
lm.update(
|
||||
head_ip, {"CPU": 16},
|
||||
True, {"CPU": 16},
|
||||
False, {},
|
||||
head_ip, {"CPU": 16}, {"CPU": 16}, {},
|
||||
infeasible_bundles=placement_group_resource_demands,
|
||||
waiting_bundles=[{
|
||||
"GPU": 8
|
||||
@@ -873,16 +867,14 @@ class AutoscalingTest(unittest.TestCase):
|
||||
update_interval_s=0)
|
||||
autoscaler.update()
|
||||
self.waitForNodes(1)
|
||||
lm.update(head_ip, {"CPU": 4, "GPU": 1}, True, {}, True, {})
|
||||
lm.update(head_ip, {"CPU": 4, "GPU": 1}, {}, {})
|
||||
self.waitForNodes(1)
|
||||
|
||||
lm.update(
|
||||
head_ip, {
|
||||
"CPU": 4,
|
||||
"GPU": 1
|
||||
},
|
||||
True, {"GPU": 0},
|
||||
True, {},
|
||||
}, {"GPU": 0}, {},
|
||||
waiting_bundles=[{
|
||||
"GPU": 1
|
||||
}])
|
||||
@@ -1016,9 +1008,7 @@ class AutoscalingTest(unittest.TestCase):
|
||||
self.waitForNodes(0)
|
||||
autoscaler.update()
|
||||
lm.update(
|
||||
"1.2.3.4", {},
|
||||
True, {},
|
||||
True, {},
|
||||
"1.2.3.4", {}, {}, {},
|
||||
waiting_bundles=[{
|
||||
"GPU": 1
|
||||
}],
|
||||
|
||||
Reference in New Issue
Block a user