Split heartbeat message (#12535)

* first

* xxx

* Split heartbeat message

* only report resource usage when changed

* Fix GetAllResourceUsage

* Fix report resource usage

* Increase default heartbeat interval

* regularize heartbeat interval in test case
This commit is contained in:
Tao Wang
2020-12-11 21:19:57 +08:00
committed by GitHub
parent 867d2a8aa3
commit 295b6e5ce4
58 changed files with 1018 additions and 910 deletions
+6 -6
View File
@@ -7,8 +7,8 @@ from ray.core.generated.gcs_pb2 import (
JobConfig,
ErrorTableData,
GcsEntry,
HeartbeatBatchTableData,
HeartbeatTableData,
ResourceUsageBatchData,
ResourcesData,
ObjectTableData,
ProfileTableData,
TablePrefix,
@@ -33,8 +33,8 @@ __all__ = [
"ErrorTableData",
"ErrorType",
"GcsEntry",
"HeartbeatBatchTableData",
"HeartbeatTableData",
"ResourceUsageBatchData",
"ResourcesData",
"ObjectTableData",
"ProfileTableData",
"TablePrefix",
@@ -55,8 +55,8 @@ FUNCTION_PREFIX = "RemoteFunction:"
LOG_FILE_CHANNEL = "RAY_LOG_CHANNEL"
REPORTER_CHANNEL = "RAY_REPORTER"
# xray heartbeats
XRAY_HEARTBEAT_BATCH_PATTERN = "HEARTBEAT_BATCH:".encode("ascii")
# xray resource usage
XRAY_RESOURCES_BATCH_PATTERN = "RESOURCES_BATCH:".encode("ascii")
# xray job updates
XRAY_JOB_PATTERN = "JOB:*".encode("ascii")
@@ -23,7 +23,7 @@ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
c_vector[c_string] GetAllProfileInfo()
c_vector[c_string] GetAllObjectInfo()
unique_ptr[c_string] GetObjectInfo(const CObjectID &object_id)
unique_ptr[c_string] GetAllHeartbeat()
unique_ptr[c_string] GetAllResourceUsage()
c_vector[c_string] GetAllActorInfo()
unique_ptr[c_string] GetActorInfo(const CActorID &actor_id)
c_string GetNodeResourceInfo(const CNodeID &node_id)
@@ -78,11 +78,11 @@ cdef class GlobalStateAccessor:
return c_string(object_info.get().data(), object_info.get().size())
return None
def get_all_heartbeat(self):
"""Get newest heartbeat of all nodes from GCS service."""
def get_all_resource_usage(self):
"""Get newest resource usage of all nodes from GCS service."""
cdef unique_ptr[c_string] result
with nogil:
result = self.inner.get().GetAllHeartbeat()
result = self.inner.get().GetAllResourceUsage()
if result:
return c_string(result.get().data(), result.get().size())
return None
+1 -1
View File
@@ -15,7 +15,7 @@ cdef extern from "ray/common/ray_config.h" nogil:
int64_t raylet_heartbeat_timeout_milliseconds() const
c_bool light_heartbeat_enabled() const
c_bool light_report_resource_usage_enabled() const
int64_t debug_dump_period_milliseconds() const
+2 -2
View File
@@ -14,8 +14,8 @@ cdef class Config:
return RayConfig.instance().raylet_heartbeat_timeout_milliseconds()
@staticmethod
def light_heartbeat_enabled():
return RayConfig.instance().light_heartbeat_enabled()
def light_report_resource_usage_enabled():
return RayConfig.instance().light_report_resource_usage_enabled()
@staticmethod
def debug_dump_period_milliseconds():
+11 -11
View File
@@ -139,24 +139,24 @@ class Monitor:
self.primary_subscribe_client.subscribe(channel)
def update_load_metrics(self):
"""Fetches heartbeat data from GCS and updates load metrics."""
"""Fetches resource usage data from GCS and updates load metrics."""
all_heartbeat = self.global_state_accessor.get_all_heartbeat()
heartbeat_batch_data = \
ray.gcs_utils.HeartbeatBatchTableData.FromString(all_heartbeat)
for heartbeat_message in heartbeat_batch_data.batch:
resource_load = dict(heartbeat_message.resource_load)
total_resources = dict(heartbeat_message.resources_total)
available_resources = dict(heartbeat_message.resources_available)
all_resources = self.global_state_accessor.get_all_resource_usage()
resources_batch_data = \
ray.gcs_utils.ResourceUsageBatchData.FromString(all_resources)
for resource_message in resources_batch_data.batch:
resource_load = dict(resource_message.resource_load)
total_resources = dict(resource_message.resources_total)
available_resources = dict(resource_message.resources_available)
waiting_bundles, infeasible_bundles = parse_resource_demands(
heartbeat_batch_data.resource_load_by_shape)
resources_batch_data.resource_load_by_shape)
pending_placement_groups = list(
heartbeat_batch_data.placement_group_load.placement_group_data)
resources_batch_data.placement_group_load.placement_group_data)
# Update the load metrics for this raylet.
node_id = ray.utils.binary_to_hex(heartbeat_message.node_id)
node_id = ray.utils.binary_to_hex(resource_message.node_id)
ip = self.raylet_id_to_ip_map.get(node_id)
if ip:
self.load_metrics.update(ip, total_resources,
+15 -14
View File
@@ -1055,11 +1055,11 @@ def test_actor_resource_demand(shutdown_only):
ray.get(a.foo.remote())
time.sleep(1)
message = global_state_accessor.get_all_heartbeat()
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
message = global_state_accessor.get_all_resource_usage()
resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)
# The actor is scheduled so there should be no more demands left.
assert len(heartbeat.resource_load_by_shape.resource_demands) == 0
assert len(resource_usages.resource_load_by_shape.resource_demands) == 0
@ray.remote(num_cpus=80)
class Actor2:
@@ -1070,23 +1070,24 @@ def test_actor_resource_demand(shutdown_only):
time.sleep(1)
# This actor cannot be scheduled.
message = global_state_accessor.get_all_heartbeat()
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
assert len(heartbeat.resource_load_by_shape.resource_demands) == 1
assert (heartbeat.resource_load_by_shape.resource_demands[0].shape == {
"CPU": 80.0
})
assert (heartbeat.resource_load_by_shape.resource_demands[0]
message = global_state_accessor.get_all_resource_usage()
resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)
assert len(resource_usages.resource_load_by_shape.resource_demands) == 1
assert (
resource_usages.resource_load_by_shape.resource_demands[0].shape == {
"CPU": 80.0
})
assert (resource_usages.resource_load_by_shape.resource_demands[0]
.num_infeasible_requests_queued == 1)
actors.append(Actor2.remote())
time.sleep(1)
# Two actors cannot be scheduled.
message = global_state_accessor.get_all_heartbeat()
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
assert len(heartbeat.resource_load_by_shape.resource_demands) == 1
assert (heartbeat.resource_load_by_shape.resource_demands[0]
message = global_state_accessor.get_all_resource_usage()
resource_usages = ray.gcs_utils.ResourceUsageBatchData.FromString(message)
assert len(resource_usages.resource_load_by_shape.resource_demands) == 1
assert (resource_usages.resource_load_by_shape.resource_demands[0]
.num_infeasible_requests_queued == 2)
global_state_accessor.disconnect()
+1 -1
View File
@@ -237,7 +237,7 @@ def test_actor_multiple_gpus_from_multiple_tasks(ray_start_cluster):
cluster.add_node(
num_cpus=10 * num_gpus_per_raylet,
num_gpus=num_gpus_per_raylet,
_system_config={"num_heartbeats_timeout": 1000} if i == 0 else {})
_system_config={"num_heartbeats_timeout": 100} if i == 0 else {})
ray.init(address=cluster.address)
@ray.remote
+4 -3
View File
@@ -610,9 +610,10 @@ def test_lease_request_leak(shutdown_only):
del obj_ref
ray.get(tasks)
time.sleep(
1) # Sleep for an amount longer than the reconstruction timeout.
assert len(ray.objects()) == 0, ray.objects()
def _no_objects():
return len(ray.objects()) == 0
wait_for_condition(_no_objects, timeout=10)
@pytest.mark.parametrize(
@@ -143,7 +143,7 @@ def check_components_alive(cluster, component_type, check_component_alive):
"num_cpus": 8,
"num_nodes": 4,
"_system_config": {
"num_heartbeats_timeout": 100
"num_heartbeats_timeout": 10
},
}],
indirect=True)
+1 -1
View File
@@ -76,7 +76,7 @@ def test_gcs_server_restart_during_actor_creation(ray_start_regular):
@pytest.mark.parametrize(
"ray_start_cluster_head", [
generate_system_config_map(
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
num_heartbeats_timeout=2, ping_gcs_rpc_server_max_retries=60)
],
indirect=True)
def test_node_failure_detector_when_gcs_server_restart(ray_start_cluster_head):
+24 -22
View File
@@ -173,13 +173,14 @@ def test_load_report(shutdown_only, max_shapes):
self.report = None
def check_load_report(self):
message = global_state_accessor.get_all_heartbeat()
message = global_state_accessor.get_all_resource_usage()
if message is None:
return False
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(
resource_usage = ray.gcs_utils.ResourceUsageBatchData.FromString(
message)
self.report = heartbeat.resource_load_by_shape.resource_demands
self.report = \
resource_usage.resource_load_by_shape.resource_demands
if max_shapes == 0:
return True
elif max_shapes == 2:
@@ -227,40 +228,40 @@ def test_placement_group_load_report(ray_start_cluster):
class PgLoadChecker:
def nothing_is_ready(self):
heartbeat = self._read_heartbeat()
if not heartbeat:
resource_usage = self._read_resource_usage()
if not resource_usage:
return False
if heartbeat.HasField("placement_group_load"):
pg_load = heartbeat.placement_group_load
if resource_usage.HasField("placement_group_load"):
pg_load = resource_usage.placement_group_load
return len(pg_load.placement_group_data) == 2
return False
def only_first_one_ready(self):
heartbeat = self._read_heartbeat()
if not heartbeat:
resource_usage = self._read_resource_usage()
if not resource_usage:
return False
if heartbeat.HasField("placement_group_load"):
pg_load = heartbeat.placement_group_load
if resource_usage.HasField("placement_group_load"):
pg_load = resource_usage.placement_group_load
return len(pg_load.placement_group_data) == 1
return False
def two_infeasible_pg(self):
heartbeat = self._read_heartbeat()
if not heartbeat:
resource_usage = self._read_resource_usage()
if not resource_usage:
return False
if heartbeat.HasField("placement_group_load"):
pg_load = heartbeat.placement_group_load
if resource_usage.HasField("placement_group_load"):
pg_load = resource_usage.placement_group_load
return len(pg_load.placement_group_data) == 2
return False
def _read_heartbeat(self):
message = global_state_accessor.get_all_heartbeat()
def _read_resource_usage(self):
message = global_state_accessor.get_all_resource_usage()
if message is None:
return False
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(
resource_usage = ray.gcs_utils.ResourceUsageBatchData.FromString(
message)
return heartbeat
return resource_usage
checker = PgLoadChecker()
@@ -301,13 +302,14 @@ def test_backlog_report(shutdown_only):
return None
def backlog_size_set():
message = global_state_accessor.get_all_heartbeat()
message = global_state_accessor.get_all_resource_usage()
if message is None:
return False
heartbeat = ray.gcs_utils.HeartbeatBatchTableData.FromString(message)
resource_usage = ray.gcs_utils.ResourceUsageBatchData.FromString(
message)
aggregate_resource_load = \
heartbeat.resource_load_by_shape.resource_demands
resource_usage.resource_load_by_shape.resource_demands
if len(aggregate_resource_load) == 1:
backlog_size = aggregate_resource_load[0].backlog_size
print(backlog_size)
+3 -3
View File
@@ -34,7 +34,7 @@ def test_shutdown():
@pytest.mark.parametrize(
"ray_start_cluster_head", [
generate_system_config_map(
num_heartbeats_timeout=20, object_timeout_milliseconds=12345)
num_heartbeats_timeout=2, object_timeout_milliseconds=12345)
],
indirect=True)
def test_system_config(ray_start_cluster_head):
@@ -52,12 +52,12 @@ def test_system_config(ray_start_cluster_head):
@ray.remote
def f():
assert ray._config.object_timeout_milliseconds() == 12345
assert ray._config.num_heartbeats_timeout() == 20
assert ray._config.num_heartbeats_timeout() == 2
ray.get([f.remote() for _ in range(5)])
cluster.remove_node(worker, allow_graceful=False)
time.sleep(1)
time.sleep(0.9)
assert ray.cluster_resources()["CPU"] == 2
time.sleep(2)
+3 -3
View File
@@ -1165,7 +1165,7 @@ ray.shutdown()
@pytest.mark.parametrize(
"ray_start_cluster_head", [
generate_system_config_map(
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
num_heartbeats_timeout=3, ping_gcs_rpc_server_max_retries=60)
],
indirect=True)
def test_create_placement_group_after_gcs_server_restart(
@@ -1203,7 +1203,7 @@ def test_create_placement_group_after_gcs_server_restart(
@pytest.mark.parametrize(
"ray_start_cluster_head", [
generate_system_config_map(
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
num_heartbeats_timeout=3, ping_gcs_rpc_server_max_retries=60)
],
indirect=True)
def test_create_actor_with_placement_group_after_gcs_server_restart(
@@ -1227,7 +1227,7 @@ def test_create_actor_with_placement_group_after_gcs_server_restart(
@pytest.mark.parametrize(
"ray_start_cluster_head", [
generate_system_config_map(
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
num_heartbeats_timeout=3, ping_gcs_rpc_server_max_retries=60)
],
indirect=True)
def test_create_placement_group_during_gcs_server_restart(