[GCS]Use new getting all available resources interface instead of pub-sub … (#10914)

* Use new all available resources getting interface instead of pub-sub in state.py

* add missing server handler and test cases, fix comments

* add fine grained test assert

* per comments

* involve new added function _available_resources_per_node

* change  ClientID to NodeID

* fix compile

* fix client id and lint

* robust tests check

* robust tests
This commit is contained in:
Tao Wang
2020-09-30 00:41:10 +08:00
committed by GitHub
parent 47eb6613b5
commit 1db83764bf
20 changed files with 192 additions and 41 deletions
+18 -33
View File
@@ -760,41 +760,26 @@ class GlobalState:
"""Returns a dictionary mapping node id to avaiable resources."""
available_resources_by_id = {}
subscribe_client = self.redis_client.pubsub(
ignore_subscribe_messages=True)
subscribe_client.psubscribe(gcs_utils.XRAY_HEARTBEAT_PATTERN)
client_ids = self._live_client_ids()
while set(available_resources_by_id.keys()) != client_ids:
# Parse client message
raw_message = subscribe_client.get_message()
if (raw_message is None or raw_message["pattern"] !=
gcs_utils.XRAY_HEARTBEAT_PATTERN):
continue
data = raw_message["data"]
pub_message = gcs_utils.PubSubMessage.FromString(data)
heartbeat_data = pub_message.data
message = gcs_utils.HeartbeatTableData.FromString(heartbeat_data)
# Calculate available resources for this client
all_available_resources = \
self.global_state_accessor.get_all_available_resources()
for available_resource in all_available_resources:
message = ray.gcs_utils.AvailableResources.FromString(
available_resource)
# Calculate available resources for this node.
dynamic_resources = {}
for resource_id, capacity in message.resources_available.items():
for resource_id, capacity in \
message.resources_available.items():
dynamic_resources[resource_id] = capacity
# Update available resources for this node.
node_id = ray.utils.binary_to_hex(message.node_id)
available_resources_by_id[node_id] = dynamic_resources
# Update available resources for this client
client_id = ray.utils.binary_to_hex(message.client_id)
available_resources_by_id[client_id] = dynamic_resources
# Update clients in cluster
client_ids = self._live_client_ids()
# Remove disconnected clients
for client_id in list(available_resources_by_id.keys()):
if client_id not in client_ids:
del available_resources_by_id[client_id]
# Close the pubsub clients to avoid leaking file descriptors.
subscribe_client.close()
# Update nodes in cluster.
node_ids = self._live_client_ids()
# Remove disconnected nodes.
for node_id in available_resources_by_id.keys():
if node_id not in node_ids:
del available_resources_by_id[node_id]
return available_resources_by_id
@@ -814,7 +799,7 @@ class GlobalState:
available_resources_by_id = self._available_resources_per_node()
# Calculate total available resources
# Calculate total available resources.
total_available_resources = defaultdict(int)
for available_resources in available_resources_by_id.values():
for resource_id, num_available in available_resources.items():