mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 23:25:24 +08:00
[PlacementGroup]Fix placement group wait api disorder bug (#12827)
* [PlacementGroup]Fix placement group wait api disorder bug * fix review comment * fix review comment * fix review comment * fix review comments * increase num_heartbeats_timeout Co-authored-by: 灵洵 <fengbin.ffb@antgroup.com>
This commit is contained in:
@@ -1207,14 +1207,17 @@ cdef class CoreWorker:
|
||||
|
||||
def wait_placement_group_ready(self,
|
||||
PlacementGroupID placement_group_id,
|
||||
int32_t timeout_ms):
|
||||
int32_t timeout_seconds):
|
||||
cdef CRayStatus status
|
||||
cdef CPlacementGroupID cplacement_group_id = (
|
||||
CPlacementGroupID.FromBinary(placement_group_id.binary()))
|
||||
cdef int ctimeout_ms = timeout_ms
|
||||
cdef int ctimeout_seconds = timeout_seconds
|
||||
with nogil:
|
||||
status = CCoreWorkerProcess.GetCoreWorker() \
|
||||
.WaitPlacementGroupReady(cplacement_group_id, ctimeout_ms)
|
||||
.WaitPlacementGroupReady(cplacement_group_id, ctimeout_seconds)
|
||||
if status.IsNotFound():
|
||||
raise Exception("Placement group {} does not exist.".format(
|
||||
placement_group_id))
|
||||
return status.ok()
|
||||
|
||||
def submit_actor_task(self,
|
||||
|
||||
@@ -1256,5 +1256,37 @@ def test_create_placement_group_during_gcs_server_restart(
|
||||
ray.get(placement_groups[i].ready())
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [
|
||||
generate_system_config_map(
|
||||
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
|
||||
],
|
||||
indirect=True)
|
||||
def test_placement_group_wait_api(ray_start_cluster_head):
|
||||
cluster = ray_start_cluster_head
|
||||
cluster.add_node(num_cpus=2)
|
||||
cluster.add_node(num_cpus=2)
|
||||
cluster.wait_for_nodes()
|
||||
|
||||
# Create placement group 1 successfully.
|
||||
placement_group1 = ray.util.placement_group([{"CPU": 1}, {"CPU": 1}])
|
||||
assert placement_group1.wait(10)
|
||||
|
||||
# Restart gcs server.
|
||||
cluster.head_node.kill_gcs_server()
|
||||
cluster.head_node.start_gcs_server()
|
||||
|
||||
# Create placement group 2 successfully.
|
||||
placement_group2 = ray.util.placement_group([{"CPU": 1}, {"CPU": 1}])
|
||||
assert placement_group2.wait(10)
|
||||
|
||||
# Remove placement group 1.
|
||||
ray.util.remove_placement_group(placement_group1)
|
||||
|
||||
# Wait for placement group 1 after it is removed.
|
||||
with pytest.raises(Exception):
|
||||
placement_group1.wait(10)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -83,6 +83,19 @@ class PlacementGroup:
|
||||
placement_group_bundle_index=bundle_index,
|
||||
resources=resources).remote(self)
|
||||
|
||||
def wait(self, timeout_seconds: int) -> bool:
|
||||
"""Wait for the placement group to be ready within the specified time.
|
||||
Args:
|
||||
timeout_seconds(int): Timeout in seconds.
|
||||
Return:
|
||||
True if the placement group is created. False otherwise.
|
||||
"""
|
||||
worker = ray.worker.global_worker
|
||||
worker.check_connected()
|
||||
|
||||
return worker.core_worker.wait_placement_group_ready(
|
||||
self.id, timeout_seconds)
|
||||
|
||||
@property
|
||||
def bundle_specs(self) -> List[Dict]:
|
||||
"""List[Dict]: Return bundles belonging to this placement group."""
|
||||
|
||||
Reference in New Issue
Block a user