mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 23:08:32 +08:00
[GCS] Use the new interface for getting all available resources instead of pub-sub … (#10914)
* Use the new interface for getting all available resources instead of pub-sub in state.py
* Add the missing server handler and test cases; fix comments
* Add a fine-grained test assert
* Address review comments
* Incorporate the newly added function _available_resources_per_node
* Change ClientID to NodeID
* Fix compile
* Fix client id and lint
* Make the test checks more robust
* Make the tests more robust
This commit is contained in:
@@ -3,6 +3,7 @@ from ray.core.generated.gcs_pb2 import (
|
||||
ActorCheckpointIdData,
|
||||
ActorTableData,
|
||||
GcsNodeInfo,
|
||||
AvailableResources,
|
||||
JobTableData,
|
||||
JobConfig,
|
||||
ErrorTableData,
|
||||
@@ -26,6 +27,7 @@ __all__ = [
|
||||
"ActorCheckpointIdData",
|
||||
"ActorTableData",
|
||||
"GcsNodeInfo",
|
||||
"AvailableResources",
|
||||
"JobTableData",
|
||||
"JobConfig",
|
||||
"ErrorTableData",
|
||||
|
||||
@@ -19,6 +19,7 @@ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
|
||||
void Disconnect()
|
||||
c_vector[c_string] GetAllJobInfo()
|
||||
c_vector[c_string] GetAllNodeInfo()
|
||||
c_vector[c_string] GetAllAvailableResources()
|
||||
c_vector[c_string] GetAllProfileInfo()
|
||||
c_vector[c_string] GetAllObjectInfo()
|
||||
unique_ptr[c_string] GetObjectInfo(const CObjectID &object_id)
|
||||
|
||||
@@ -51,6 +51,12 @@ cdef class GlobalStateAccessor:
|
||||
result = self.inner.get().GetAllNodeInfo()
|
||||
return result
|
||||
|
||||
def get_all_available_resources(self):
|
||||
cdef c_vector[c_string] result
|
||||
with nogil:
|
||||
result = self.inner.get().GetAllAvailableResources()
|
||||
return result
|
||||
|
||||
def get_profile_table(self):
|
||||
cdef c_vector[c_string] result
|
||||
with nogil:
|
||||
|
||||
+18
-33
@@ -760,41 +760,26 @@ class GlobalState:
|
||||
"""Returns a dictionary mapping node id to avaiable resources."""
|
||||
available_resources_by_id = {}
|
||||
|
||||
subscribe_client = self.redis_client.pubsub(
|
||||
ignore_subscribe_messages=True)
|
||||
subscribe_client.psubscribe(gcs_utils.XRAY_HEARTBEAT_PATTERN)
|
||||
|
||||
client_ids = self._live_client_ids()
|
||||
|
||||
while set(available_resources_by_id.keys()) != client_ids:
|
||||
# Parse client message
|
||||
raw_message = subscribe_client.get_message()
|
||||
if (raw_message is None or raw_message["pattern"] !=
|
||||
gcs_utils.XRAY_HEARTBEAT_PATTERN):
|
||||
continue
|
||||
data = raw_message["data"]
|
||||
pub_message = gcs_utils.PubSubMessage.FromString(data)
|
||||
heartbeat_data = pub_message.data
|
||||
message = gcs_utils.HeartbeatTableData.FromString(heartbeat_data)
|
||||
# Calculate available resources for this client
|
||||
all_available_resources = \
|
||||
self.global_state_accessor.get_all_available_resources()
|
||||
for available_resource in all_available_resources:
|
||||
message = ray.gcs_utils.AvailableResources.FromString(
|
||||
available_resource)
|
||||
# Calculate available resources for this node.
|
||||
dynamic_resources = {}
|
||||
for resource_id, capacity in message.resources_available.items():
|
||||
for resource_id, capacity in \
|
||||
message.resources_available.items():
|
||||
dynamic_resources[resource_id] = capacity
|
||||
# Update available resources for this node.
|
||||
node_id = ray.utils.binary_to_hex(message.node_id)
|
||||
available_resources_by_id[node_id] = dynamic_resources
|
||||
|
||||
# Update available resources for this client
|
||||
client_id = ray.utils.binary_to_hex(message.client_id)
|
||||
available_resources_by_id[client_id] = dynamic_resources
|
||||
|
||||
# Update clients in cluster
|
||||
client_ids = self._live_client_ids()
|
||||
|
||||
# Remove disconnected clients
|
||||
for client_id in list(available_resources_by_id.keys()):
|
||||
if client_id not in client_ids:
|
||||
del available_resources_by_id[client_id]
|
||||
|
||||
# Close the pubsub clients to avoid leaking file descriptors.
|
||||
subscribe_client.close()
|
||||
# Update nodes in cluster.
|
||||
node_ids = self._live_client_ids()
|
||||
# Remove disconnected nodes.
|
||||
for node_id in available_resources_by_id.keys():
|
||||
if node_id not in node_ids:
|
||||
del available_resources_by_id[node_id]
|
||||
|
||||
return available_resources_by_id
|
||||
|
||||
@@ -814,7 +799,7 @@ class GlobalState:
|
||||
|
||||
available_resources_by_id = self._available_resources_per_node()
|
||||
|
||||
# Calculate total available resources
|
||||
# Calculate total available resources.
|
||||
total_available_resources = defaultdict(int)
|
||||
for available_resources in available_resources_by_id.values():
|
||||
for resource_id, num_available in available_resources.items():
|
||||
|
||||
@@ -695,12 +695,16 @@ def test_accelerator_type_api(shutdown_only):
|
||||
|
||||
@ray.remote(accelerator_type=v100)
|
||||
def decorated_func(quantity):
|
||||
return ray.available_resources()[resource_name] < quantity
|
||||
wait_for_condition(
|
||||
lambda: ray.available_resources()[resource_name] < quantity)
|
||||
return True
|
||||
|
||||
assert ray.get(decorated_func.remote(quantity))
|
||||
|
||||
def via_options_func(quantity):
|
||||
return ray.available_resources()[resource_name] < quantity
|
||||
wait_for_condition(
|
||||
lambda: ray.available_resources()[resource_name] < quantity)
|
||||
return True
|
||||
|
||||
assert ray.get(
|
||||
ray.remote(via_options_func).options(
|
||||
@@ -725,13 +729,15 @@ def test_accelerator_type_api(shutdown_only):
|
||||
# Avoid a race condition where the actor hasn't been initialized and
|
||||
# claimed the resources yet.
|
||||
ray.get(decorated_actor.initialized.remote())
|
||||
assert ray.available_resources()[resource_name] < quantity
|
||||
wait_for_condition(
|
||||
lambda: ray.available_resources()[resource_name] < quantity)
|
||||
|
||||
quantity = ray.available_resources()[resource_name]
|
||||
with_options = ray.remote(ActorWithOptions).options(
|
||||
accelerator_type=v100).remote()
|
||||
ray.get(with_options.initialized.remote())
|
||||
assert ray.available_resources()[resource_name] < quantity
|
||||
wait_for_condition(
|
||||
lambda: ray.available_resources()[resource_name] < quantity)
|
||||
|
||||
|
||||
def test_detect_docker_cpus():
|
||||
|
||||
@@ -219,8 +219,12 @@ def test_many_fractional_resources(shutdown_only):
|
||||
stop_time = time.time() + 10
|
||||
correct_available_resources = False
|
||||
while time.time() < stop_time:
|
||||
if (ray.available_resources()["CPU"] == 2.0
|
||||
available_resources = ray.available_resources()
|
||||
if ("CPU" in available_resources
|
||||
and ray.available_resources()["CPU"] == 2.0
|
||||
and "GPU" in available_resources
|
||||
and ray.available_resources()["GPU"] == 2.0
|
||||
and "Custom" in available_resources
|
||||
and ray.available_resources()["Custom"] == 2.0):
|
||||
correct_available_resources = True
|
||||
break
|
||||
@@ -346,6 +350,9 @@ def test_ray_options(shutdown_only):
|
||||
@ray.remote(
|
||||
num_cpus=2, num_gpus=3, memory=150 * 2**20, resources={"custom1": 1})
|
||||
def foo():
|
||||
import time
|
||||
# Sleep for a heartbeat period to ensure resources changing reported.
|
||||
time.sleep(0.1)
|
||||
return ray.available_resources()
|
||||
|
||||
ray.init(num_cpus=10, num_gpus=10, resources={"custom1": 2})
|
||||
|
||||
@@ -647,7 +647,9 @@ def test_release_cpus_when_actor_creation_task_blocking(shutdown_only):
|
||||
return False
|
||||
|
||||
def assert_available_resources():
|
||||
return 1 == ray.available_resources()["CPU"]
|
||||
available_resources = ray.available_resources()
|
||||
return "CPU" in available_resources and 1 == ray.available_resources(
|
||||
)["CPU"]
|
||||
|
||||
result = wait_until(assert_available_resources, 1000)
|
||||
assert result is True
|
||||
|
||||
Reference in New Issue
Block a user