Mirror of https://github.com/wassname/ray.git (synced 2026-06-29 05:17:38 +08:00)
Split heartbeat message (#12535)
* first
* xxx
* Split heartbeat message
* Only report resource usage when changed
* Fix GetAllResourceUsage
* Fix report resource usage
* Increase default heartbeat interval
* Regularize heartbeat interval in test case
This commit is contained in:
@@ -7,8 +7,8 @@ from ray.core.generated.gcs_pb2 import (
|
||||
JobConfig,
|
||||
ErrorTableData,
|
||||
GcsEntry,
|
||||
HeartbeatBatchTableData,
|
||||
HeartbeatTableData,
|
||||
ResourceUsageBatchData,
|
||||
ResourcesData,
|
||||
ObjectTableData,
|
||||
ProfileTableData,
|
||||
TablePrefix,
|
||||
@@ -33,8 +33,8 @@ __all__ = [
|
||||
"ErrorTableData",
|
||||
"ErrorType",
|
||||
"GcsEntry",
|
||||
"HeartbeatBatchTableData",
|
||||
"HeartbeatTableData",
|
||||
"ResourceUsageBatchData",
|
||||
"ResourcesData",
|
||||
"ObjectTableData",
|
||||
"ProfileTableData",
|
||||
"TablePrefix",
|
||||
@@ -55,8 +55,8 @@ FUNCTION_PREFIX = "RemoteFunction:"
|
||||
LOG_FILE_CHANNEL = "RAY_LOG_CHANNEL"
|
||||
REPORTER_CHANNEL = "RAY_REPORTER"
|
||||
|
||||
# xray heartbeats
|
||||
XRAY_HEARTBEAT_BATCH_PATTERN = "HEARTBEAT_BATCH:".encode("ascii")
|
||||
# xray resource usages
|
||||
XRAY_RESOURCES_BATCH_PATTERN = "RESOURCES_BATCH:".encode("ascii")
|
||||
|
||||
# xray job updates
|
||||
XRAY_JOB_PATTERN = "JOB:*".encode("ascii")
|
||||
|
||||
@@ -23,7 +23,7 @@ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
|
||||
c_vector[c_string] GetAllProfileInfo()
|
||||
c_vector[c_string] GetAllObjectInfo()
|
||||
unique_ptr[c_string] GetObjectInfo(const CObjectID &object_id)
|
||||
unique_ptr[c_string] GetAllHeartbeat()
|
||||
unique_ptr[c_string] GetAllResourceUsage()
|
||||
c_vector[c_string] GetAllActorInfo()
|
||||
unique_ptr[c_string] GetActorInfo(const CActorID &actor_id)
|
||||
c_string GetNodeResourceInfo(const CNodeID &node_id)
|
||||
|
||||
@@ -78,11 +78,11 @@ cdef class GlobalStateAccessor:
|
||||
return c_string(object_info.get().data(), object_info.get().size())
|
||||
return None
|
||||
|
||||
def get_all_heartbeat(self):
|
||||
"""Get newest heartbeat of all nodes from GCS service."""
|
||||
def get_all_resource_usage(self):
|
||||
"""Get newest resource usage of all nodes from GCS service."""
|
||||
cdef unique_ptr[c_string] result
|
||||
with nogil:
|
||||
result = self.inner.get().GetAllHeartbeat()
|
||||
result = self.inner.get().GetAllResourceUsage()
|
||||
if result:
|
||||
return c_string(result.get().data(), result.get().size())
|
||||
return None
|
||||
|
||||
@@ -15,7 +15,7 @@ cdef extern from "ray/common/ray_config.h" nogil:
|
||||
|
||||
int64_t raylet_heartbeat_timeout_milliseconds() const
|
||||
|
||||
c_bool light_heartbeat_enabled() const
|
||||
c_bool light_report_resource_usage_enabled() const
|
||||
|
||||
int64_t debug_dump_period_milliseconds() const
|
||||
|
||||
|
||||
@@ -14,8 +14,8 @@ cdef class Config:
|
||||
return RayConfig.instance().raylet_heartbeat_timeout_milliseconds()
|
||||
|
||||
@staticmethod
|
||||
def light_heartbeat_enabled():
|
||||
return RayConfig.instance().light_heartbeat_enabled()
|
||||
def light_report_resource_usage_enabled():
|
||||
return RayConfig.instance().light_report_resource_usage_enabled()
|
||||
|
||||
@staticmethod
|
||||
def debug_dump_period_milliseconds():
|
||||
|
||||
+11
-11
@@ -139,24 +139,24 @@ class Monitor:
|
||||
self.primary_subscribe_client.subscribe(channel)
|
||||
|
||||
def update_load_metrics(self):
|
||||
"""Fetches heartbeat data from GCS and updates load metrics."""
|
||||
"""Fetches resource usage data from GCS and updates load metrics."""
|
||||
|
||||
all_heartbeat = self.global_state_accessor.get_all_heartbeat()
|
||||
heartbeat_batch_data = \
|
||||
ray.gcs_utils.HeartbeatBatchTableData.FromString(all_heartbeat)
|
||||
for heartbeat_message in heartbeat_batch_data.batch:
|
||||
resource_load = dict(heartbeat_message.resource_load)
|
||||
total_resources = dict(heartbeat_message.resources_total)
|
||||
available_resources = dict(heartbeat_message.resources_available)
|
||||
all_resources = self.global_state_accessor.get_all_resource_usage()
|
||||
resources_batch_data = \
|
||||
ray.gcs_utils.ResourceUsageBatchData.FromString(all_resources)
|
||||
for resource_message in resources_batch_data.batch:
|
||||
resource_load = dict(resource_message.resource_load)
|
||||
total_resources = dict(resource_message.resources_total)
|
||||
available_resources = dict(resource_message.resources_available)
|
||||
|
||||
waiting_bundles, infeasible_bundles = parse_resource_demands(
|
||||
heartbeat_batch_data.resource_load_by_shape)
|
||||
resources_batch_data.resource_load_by_shape)
|
||||
|
||||
pending_placement_groups = list(
|
||||
heartbeat_batch_data.placement_group_load.placement_group_data)
|
||||
resources_batch_data.placement_group_load.placement_group_data)
|
||||
|
||||
# Update the load metrics for this raylet.
|
||||
node_id = ray.utils.binary_to_hex(heartbeat_message.node_id)
|
||||
node_id = ray.utils.binary_to_hex(resource_message.node_id)
|
||||
ip = self.raylet_id_to_ip_map.get(node_id)
|
||||
if ip:
|
||||
self.load_metrics.update(ip, total_resources,
|
||||
|
||||
@@ -1055,11 +1055,11 @@ def test_actor_resource_demand(shutdown_only):
|
||||
ray.get(a.foo.remote())
|
||||
time.sleep(1)
|
||||
|
||||
message = global_state_accessor.get_all_heartbeat()
|
||||
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
|
||||
message = global_state_accessor.get_all_resource_usage()
|
||||
resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)
|
||||
|
||||
# The actor is scheduled so there should be no more demands left.
|
||||
assert len(heartbeat.resource_load_by_shape.resource_demands) == 0
|
||||
assert len(resource_usages.resource_load_by_shape.resource_demands) == 0
|
||||
|
||||
@ray.remote(num_cpus=80)
|
||||
class Actor2:
|
||||
@@ -1070,23 +1070,24 @@ def test_actor_resource_demand(shutdown_only):
|
||||
time.sleep(1)
|
||||
|
||||
# This actor cannot be scheduled.
|
||||
message = global_state_accessor.get_all_heartbeat()
|
||||
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
|
||||
assert len(heartbeat.resource_load_by_shape.resource_demands) == 1
|
||||
assert (heartbeat.resource_load_by_shape.resource_demands[0].shape == {
|
||||
"CPU": 80.0
|
||||
})
|
||||
assert (heartbeat.resource_load_by_shape.resource_demands[0]
|
||||
message = global_state_accessor.get_all_resource_usage()
|
||||
resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)
|
||||
assert len(resource_usages.resource_load_by_shape.resource_demands) == 1
|
||||
assert (
|
||||
resource_usages.resource_load_by_shape.resource_demands[0].shape == {
|
||||
"CPU": 80.0
|
||||
})
|
||||
assert (resource_usages.resource_load_by_shape.resource_demands[0]
|
||||
.num_infeasible_requests_queued == 1)
|
||||
|
||||
actors.append(Actor2.remote())
|
||||
time.sleep(1)
|
||||
|
||||
# Two actors cannot be scheduled.
|
||||
message = global_state_accessor.get_all_heartbeat()
|
||||
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
|
||||
assert len(heartbeat.resource_load_by_shape.resource_demands) == 1
|
||||
assert (heartbeat.resource_load_by_shape.resource_demands[0]
|
||||
message = global_state_accessor.get_all_resource_usage()
|
||||
resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)
|
||||
assert len(resource_usages.resource_load_by_shape.resource_demands) == 1
|
||||
assert (resource_usages.resource_load_by_shape.resource_demands[0]
|
||||
.num_infeasible_requests_queued == 2)
|
||||
|
||||
global_state_accessor.disconnect()
|
||||
|
||||
@@ -237,7 +237,7 @@ def test_actor_multiple_gpus_from_multiple_tasks(ray_start_cluster):
|
||||
cluster.add_node(
|
||||
num_cpus=10 * num_gpus_per_raylet,
|
||||
num_gpus=num_gpus_per_raylet,
|
||||
_system_config={"num_heartbeats_timeout": 1000} if i == 0 else {})
|
||||
_system_config={"num_heartbeats_timeout": 100} if i == 0 else {})
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@ray.remote
|
||||
|
||||
@@ -610,9 +610,10 @@ def test_lease_request_leak(shutdown_only):
|
||||
del obj_ref
|
||||
ray.get(tasks)
|
||||
|
||||
time.sleep(
|
||||
1) # Sleep for an amount longer than the reconstruction timeout.
|
||||
assert len(ray.objects()) == 0, ray.objects()
|
||||
def _no_objects():
|
||||
return len(ray.objects()) == 0
|
||||
|
||||
wait_for_condition(_no_objects, timeout=10)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@@ -143,7 +143,7 @@ def check_components_alive(cluster, component_type, check_component_alive):
|
||||
"num_cpus": 8,
|
||||
"num_nodes": 4,
|
||||
"_system_config": {
|
||||
"num_heartbeats_timeout": 100
|
||||
"num_heartbeats_timeout": 10
|
||||
},
|
||||
}],
|
||||
indirect=True)
|
||||
|
||||
@@ -76,7 +76,7 @@ def test_gcs_server_restart_during_actor_creation(ray_start_regular):
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [
|
||||
generate_system_config_map(
|
||||
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
|
||||
num_heartbeats_timeout=2, ping_gcs_rpc_server_max_retries=60)
|
||||
],
|
||||
indirect=True)
|
||||
def test_node_failure_detector_when_gcs_server_restart(ray_start_cluster_head):
|
||||
|
||||
@@ -173,13 +173,14 @@ def test_load_report(shutdown_only, max_shapes):
|
||||
self.report = None
|
||||
|
||||
def check_load_report(self):
|
||||
message = global_state_accessor.get_all_heartbeat()
|
||||
message = global_state_accessor.get_all_resource_usage()
|
||||
if message is None:
|
||||
return False
|
||||
|
||||
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(
|
||||
resource_usage = ray.gcs_utils.ResourceUsageBatchData.FromString(
|
||||
message)
|
||||
self.report = heartbeat.resource_load_by_shape.resource_demands
|
||||
self.report = \
|
||||
resource_usage.resource_load_by_shape.resource_demands
|
||||
if max_shapes == 0:
|
||||
return True
|
||||
elif max_shapes == 2:
|
||||
@@ -227,40 +228,40 @@ def test_placement_group_load_report(ray_start_cluster):
|
||||
|
||||
class PgLoadChecker:
|
||||
def nothing_is_ready(self):
|
||||
heartbeat = self._read_heartbeat()
|
||||
if not heartbeat:
|
||||
resource_usage = self._read_resource_usage()
|
||||
if not resource_usage:
|
||||
return False
|
||||
if heartbeat.HasField("placement_group_load"):
|
||||
pg_load = heartbeat.placement_group_load
|
||||
if resource_usage.HasField("placement_group_load"):
|
||||
pg_load = resource_usage.placement_group_load
|
||||
return len(pg_load.placement_group_data) == 2
|
||||
return False
|
||||
|
||||
def only_first_one_ready(self):
|
||||
heartbeat = self._read_heartbeat()
|
||||
if not heartbeat:
|
||||
resource_usage = self._read_resource_usage()
|
||||
if not resource_usage:
|
||||
return False
|
||||
if heartbeat.HasField("placement_group_load"):
|
||||
pg_load = heartbeat.placement_group_load
|
||||
if resource_usage.HasField("placement_group_load"):
|
||||
pg_load = resource_usage.placement_group_load
|
||||
return len(pg_load.placement_group_data) == 1
|
||||
return False
|
||||
|
||||
def two_infeasible_pg(self):
|
||||
heartbeat = self._read_heartbeat()
|
||||
if not heartbeat:
|
||||
resource_usage = self._read_resource_usage()
|
||||
if not resource_usage:
|
||||
return False
|
||||
if heartbeat.HasField("placement_group_load"):
|
||||
pg_load = heartbeat.placement_group_load
|
||||
if resource_usage.HasField("placement_group_load"):
|
||||
pg_load = resource_usage.placement_group_load
|
||||
return len(pg_load.placement_group_data) == 2
|
||||
return False
|
||||
|
||||
def _read_heartbeat(self):
|
||||
message = global_state_accessor.get_all_heartbeat()
|
||||
def _read_resource_usage(self):
|
||||
message = global_state_accessor.get_all_resource_usage()
|
||||
if message is None:
|
||||
return False
|
||||
|
||||
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(
|
||||
resource_usage = ray.gcs_utils.ResourceUsageBatchData.FromString(
|
||||
message)
|
||||
return heartbeat
|
||||
return resource_usage
|
||||
|
||||
checker = PgLoadChecker()
|
||||
|
||||
@@ -301,13 +302,14 @@ def test_backlog_report(shutdown_only):
|
||||
return None
|
||||
|
||||
def backlog_size_set():
|
||||
message = global_state_accessor.get_all_heartbeat()
|
||||
message = global_state_accessor.get_all_resource_usage()
|
||||
if message is None:
|
||||
return False
|
||||
|
||||
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
|
||||
resource_usage = ray.gcs_utils.ResourceUsageBatchData.FromString(
|
||||
message)
|
||||
aggregate_resource_load = \
|
||||
heartbeat.resource_load_by_shape.resource_demands
|
||||
resource_usage.resource_load_by_shape.resource_demands
|
||||
if len(aggregate_resource_load) == 1:
|
||||
backlog_size = aggregate_resource_load[0].backlog_size
|
||||
print(backlog_size)
|
||||
|
||||
@@ -34,7 +34,7 @@ def test_shutdown():
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [
|
||||
generate_system_config_map(
|
||||
num_heartbeats_timeout=20, object_timeout_milliseconds=12345)
|
||||
num_heartbeats_timeout=2, object_timeout_milliseconds=12345)
|
||||
],
|
||||
indirect=True)
|
||||
def test_system_config(ray_start_cluster_head):
|
||||
@@ -52,12 +52,12 @@ def test_system_config(ray_start_cluster_head):
|
||||
@ray.remote
|
||||
def f():
|
||||
assert ray._config.object_timeout_milliseconds() == 12345
|
||||
assert ray._config.num_heartbeats_timeout() == 20
|
||||
assert ray._config.num_heartbeats_timeout() == 2
|
||||
|
||||
ray.get([f.remote() for _ in range(5)])
|
||||
|
||||
cluster.remove_node(worker, allow_graceful=False)
|
||||
time.sleep(1)
|
||||
time.sleep(0.9)
|
||||
assert ray.cluster_resources()["CPU"] == 2
|
||||
|
||||
time.sleep(2)
|
||||
|
||||
@@ -1165,7 +1165,7 @@ ray.shutdown()
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [
|
||||
generate_system_config_map(
|
||||
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
|
||||
num_heartbeats_timeout=3, ping_gcs_rpc_server_max_retries=60)
|
||||
],
|
||||
indirect=True)
|
||||
def test_create_placement_group_after_gcs_server_restart(
|
||||
@@ -1203,7 +1203,7 @@ def test_create_placement_group_after_gcs_server_restart(
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [
|
||||
generate_system_config_map(
|
||||
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
|
||||
num_heartbeats_timeout=3, ping_gcs_rpc_server_max_retries=60)
|
||||
],
|
||||
indirect=True)
|
||||
def test_create_actor_with_placement_group_after_gcs_server_restart(
|
||||
@@ -1227,7 +1227,7 @@ def test_create_actor_with_placement_group_after_gcs_server_restart(
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [
|
||||
generate_system_config_map(
|
||||
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
|
||||
num_heartbeats_timeout=3, ping_gcs_rpc_server_max_retries=60)
|
||||
],
|
||||
indirect=True)
|
||||
def test_create_placement_group_during_gcs_server_restart(
|
||||
|
||||
Reference in New Issue
Block a user