[PlacementGroup]Fix placement group wait api disorder bug (#12827)

* [PlacementGroup]Fix placement group wait api disorder bug

* fix review comment

* fix review comment

* fix review comment

* fix review comments

* increase num_heartbeats_timeout

Co-authored-by: 灵洵 <fengbin.ffb@antgroup.com>
This commit is contained in:
fangfengbin
2020-12-16 18:45:53 +08:00
committed by GitHub
parent 7ff314a5df
commit 91878d18b5
9 changed files with 109 additions and 29 deletions
+6 -3
View File
@@ -1207,14 +1207,17 @@ cdef class CoreWorker:
# Ask the C++ core worker to wait for the given placement group to become
# ready, passing it a timeout, and report the outcome to Python.
# Returns status.ok() (True when the returned status is OK, False otherwise);
# raises Exception when the returned status is NotFound.
# NOTE(review): this diff view interleaves the replaced lines (timeout_ms /
# ctimeout_ms) with their replacements (timeout_seconds / ctimeout_seconds);
# only the *_seconds variants appear to be the post-change code.
def wait_placement_group_ready(self,
PlacementGroupID placement_group_id,
int32_t timeout_ms):
int32_t timeout_seconds):
cdef CRayStatus status
# Rebuild the C++ placement group ID from the Python ID's binary form.
cdef CPlacementGroupID cplacement_group_id = (
CPlacementGroupID.FromBinary(placement_group_id.binary()))
cdef int ctimeout_ms = timeout_ms
cdef int ctimeout_seconds = timeout_seconds
# Release the GIL for the duration of the C++ call.
with nogil:
status = CCoreWorkerProcess.GetCoreWorker() \
.WaitPlacementGroupReady(cplacement_group_id, ctimeout_ms)
.WaitPlacementGroupReady(cplacement_group_id, ctimeout_seconds)
# A NotFound status is surfaced as an exception rather than a False return,
# so callers can tell a missing group apart from one that is not ready.
if status.IsNotFound():
raise Exception("Placement group {} does not exist.".format(
placement_group_id))
return status.ok()
def submit_actor_task(self,
+32
View File
@@ -1256,5 +1256,37 @@ def test_create_placement_group_during_gcs_server_restart(
ray.get(placement_groups[i].ready())
@pytest.mark.parametrize(
    "ray_start_cluster_head",
    [
        generate_system_config_map(
            num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
    ],
    indirect=True)
def test_placement_group_wait_api(ray_start_cluster_head):
    """Check PlacementGroup.wait() before and after a GCS server restart.

    A group created before the restart and one created after it must both
    report ready; waiting on a group that has been removed must raise.
    """
    wait_timeout_s = 10

    cluster = ray_start_cluster_head
    # Two extra 2-CPU nodes so each two-bundle group can be placed.
    for _ in range(2):
        cluster.add_node(num_cpus=2)
    cluster.wait_for_nodes()

    # The first placement group is created while the original GCS is up.
    first_group = ray.util.placement_group([{"CPU": 1}, {"CPU": 1}])
    assert first_group.wait(wait_timeout_s)

    # Bounce the GCS server.
    cluster.head_node.kill_gcs_server()
    cluster.head_node.start_gcs_server()

    # The second placement group is created against the restarted GCS.
    second_group = ray.util.placement_group([{"CPU": 1}, {"CPU": 1}])
    assert second_group.wait(wait_timeout_s)

    # Once the first group is removed, waiting on it must raise.
    ray.util.remove_placement_group(first_group)
    with pytest.raises(Exception):
        first_group.wait(wait_timeout_s)
if __name__ == "__main__":
    # Run this file's tests verbosely and use pytest's result as exit code.
    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)
+13
View File
@@ -83,6 +83,19 @@ class PlacementGroup:
placement_group_bundle_index=bundle_index,
resources=resources).remote(self)
def wait(self, timeout_seconds: int) -> bool:
    """Wait for the placement group to be ready within the specified time.

    Args:
        timeout_seconds (int): Timeout in seconds.

    Returns:
        True if the placement group is created. False otherwise.

    Raises:
        Exception: presumably propagated from the core worker when the
            placement group does not exist (e.g. after it has been
            removed) -- confirm against
            CoreWorker.wait_placement_group_ready.
    """
    current_worker = ray.worker.global_worker
    # The call below needs a connected worker.
    current_worker.check_connected()
    core_worker = current_worker.core_worker
    return core_worker.wait_placement_group_ready(self.id, timeout_seconds)
@property
def bundle_specs(self) -> List[Dict]:
"""List[Dict]: Return bundles belonging to this placement group."""