mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 09:29:05 +08:00
[Placement Group] Capture Child Task Part 1 (#10968)
* In progress. * In progers. * Done. * Addressed code review. * Increase timeout to make a test less flaky. * Addressed code review. * Addressed code review.
This commit is contained in:
@@ -791,6 +791,11 @@ cdef class CoreWorker:
|
||||
return ActorID(
|
||||
CCoreWorkerProcess.GetCoreWorker().GetActorId().Binary())
|
||||
|
||||
def get_placement_group_id(self):
|
||||
return PlacementGroupID(
|
||||
CCoreWorkerProcess.GetCoreWorker()
|
||||
.GetCurrentPlacementGroupId().Binary())
|
||||
|
||||
def set_webui_display(self, key, message):
|
||||
CCoreWorkerProcess.GetCoreWorker().SetWebuiDisplay(key, message)
|
||||
|
||||
|
||||
@@ -122,6 +122,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
CJobID GetCurrentJobId()
|
||||
CTaskID GetCurrentTaskId()
|
||||
CClientID GetCurrentNodeId()
|
||||
CPlacementGroupID GetCurrentPlacementGroupId()
|
||||
const CActorID &GetActorId()
|
||||
void SetActorTitle(const c_string &title)
|
||||
void SetWebuiDisplay(const c_string &key, const c_string &message)
|
||||
|
||||
@@ -45,6 +45,15 @@ class RuntimeContext(object):
|
||||
actor_info = ray.state.actors(self.current_actor_id.hex())
|
||||
return actor_info and actor_info["NumRestarts"] != 0
|
||||
|
||||
@property
|
||||
def current_placement_group_id(self):
|
||||
"""Get the current Placement group ID of this worker.
|
||||
|
||||
Returns:
|
||||
The current placement group id of this worker.
|
||||
"""
|
||||
return self.worker.placement_group_id
|
||||
|
||||
|
||||
_runtime_context = None
|
||||
|
||||
|
||||
+5
-1
@@ -6,6 +6,7 @@ import time
|
||||
|
||||
import ray
|
||||
|
||||
from google.protobuf.json_format import MessageToDict
|
||||
from ray import (
|
||||
gcs_utils,
|
||||
services,
|
||||
@@ -423,7 +424,10 @@ class GlobalState:
|
||||
placement_group_info.placement_group_id),
|
||||
"name": placement_group_info.name,
|
||||
"bundles": {
|
||||
bundle.bundle_id.bundle_index: bundle.unit_resources
|
||||
# The value here is needs to be dictionarified
|
||||
# otherwise, the payload becomes unserializable.
|
||||
bundle.bundle_id.bundle_index:
|
||||
MessageToDict(bundle)["unitResources"]
|
||||
for bundle in placement_group_info.bundles
|
||||
},
|
||||
"strategy": get_strategy(placement_group_info.strategy),
|
||||
|
||||
@@ -11,7 +11,8 @@ import ray
|
||||
from ray.test_utils import get_other_nodes, wait_for_condition
|
||||
import ray.cluster_utils
|
||||
from ray._raylet import PlacementGroupID
|
||||
from ray.util.placement_group import PlacementGroup
|
||||
from ray.util.placement_group import (PlacementGroup,
|
||||
get_current_placement_group)
|
||||
|
||||
|
||||
def test_placement_group_pack(ray_start_cluster):
|
||||
@@ -288,7 +289,7 @@ def test_remove_placement_group(ray_start_cluster):
|
||||
# First try to remove a placement group that doesn't
|
||||
# exist. This should not do anything.
|
||||
random_group_id = PlacementGroupID.from_random()
|
||||
random_placement_group = PlacementGroup(random_group_id, [{"CPU": 1}])
|
||||
random_placement_group = PlacementGroup(random_group_id)
|
||||
for _ in range(3):
|
||||
ray.util.remove_placement_group(random_placement_group)
|
||||
|
||||
@@ -591,7 +592,7 @@ def test_placement_group_wait(ray_start_cluster):
|
||||
assert table["state"] == "CREATED"
|
||||
|
||||
pg = ray.get(placement_group.ready())
|
||||
assert pg.bundles == placement_group.bundles
|
||||
assert pg.bundle_specs == placement_group.bundle_specs
|
||||
assert pg.id.binary() == placement_group.id.binary()
|
||||
|
||||
|
||||
@@ -791,5 +792,59 @@ def test_mini_integration(ray_start_cluster):
|
||||
assert all(ray.get([a.ping.remote() for a in actors]))
|
||||
|
||||
|
||||
def test_capture_child_tasks(ray_start_cluster):
|
||||
cluster = ray_start_cluster
|
||||
total_num_actors = 4
|
||||
for _ in range(2):
|
||||
cluster.add_node(num_cpus=total_num_actors)
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
pg = ray.util.placement_group(
|
||||
[{
|
||||
"CPU": 2
|
||||
}, {
|
||||
"CPU": 2
|
||||
}], strategy="STRICT_PACK")
|
||||
ray.get(pg.ready(), timeout=5)
|
||||
|
||||
# If get_current_placement_group is used when the current worker/driver
|
||||
# doesn't belong to any of placement group, it should return None.
|
||||
assert get_current_placement_group() is None
|
||||
|
||||
@ray.remote(num_cpus=1)
|
||||
class NestedActor:
|
||||
def ready(self):
|
||||
return True
|
||||
|
||||
@ray.remote(num_cpus=1)
|
||||
class Actor:
|
||||
def __init__(self):
|
||||
self.actors = []
|
||||
|
||||
def ready(self):
|
||||
return True
|
||||
|
||||
def schedule_nested_actor(self):
|
||||
actor = NestedActor.options(
|
||||
placement_group=get_current_placement_group()).remote()
|
||||
ray.get(actor.ready.remote())
|
||||
self.actors.append(actor)
|
||||
|
||||
a = Actor.options(placement_group=pg).remote()
|
||||
ray.get(a.ready.remote())
|
||||
# 1 top level actor + 3 children.
|
||||
for _ in range(total_num_actors - 1):
|
||||
ray.get(a.schedule_nested_actor.remote())
|
||||
# Make sure all the actors are scheduled on the same node.
|
||||
# (why? The placement group has STRICT_PACK strategy).
|
||||
node_id_set = set()
|
||||
for actor_info in ray.actors().values():
|
||||
node_id = actor_info["Address"]["NodeID"]
|
||||
node_id_set.add(node_id)
|
||||
|
||||
# Since all node id should be identical, set should be equal to 1.
|
||||
assert len(node_id_set) == 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
from typing import (List, Dict)
|
||||
import time
|
||||
|
||||
from typing import (List, Dict, Optional)
|
||||
|
||||
import ray
|
||||
from ray._raylet import PlacementGroupID, ObjectRef
|
||||
@@ -9,11 +11,11 @@ class PlacementGroup:
|
||||
|
||||
@staticmethod
|
||||
def empty():
|
||||
return PlacementGroup(PlacementGroupID.nil(), [])
|
||||
return PlacementGroup(PlacementGroupID.nil())
|
||||
|
||||
def __init__(self, id: PlacementGroupID, bundles: List[Dict[str, float]]):
|
||||
def __init__(self, id: PlacementGroupID):
|
||||
self.id = id
|
||||
self.bundles = bundles
|
||||
self.bundle_cache = None
|
||||
|
||||
def ready(self) -> ObjectRef:
|
||||
"""Returns an ObjectRef to check ready status.
|
||||
@@ -29,22 +31,22 @@ class PlacementGroup:
|
||||
>>> pg = placement_group([{"CPU": 1}])
|
||||
ray.wait([pg.ready()], timeout=0)
|
||||
"""
|
||||
worker = ray.worker.global_worker
|
||||
worker.check_connected()
|
||||
self._fill_bundle_cache_if_needed()
|
||||
|
||||
@ray.remote(num_cpus=0, max_calls=0)
|
||||
def bundle_reservation_check(placement_group):
|
||||
return placement_group
|
||||
|
||||
assert len(self.bundles) != 0, (
|
||||
assert len(self.bundle_cache) != 0, (
|
||||
"ready() cannot be called on placement group object with a "
|
||||
f"bundle length == 0, current bundle length: {len(self.bundles)}")
|
||||
"bundle length == 0, current bundle length: "
|
||||
f"{len(self.bundle_cache)}")
|
||||
|
||||
# Select the first bundle to schedule a dummy task.
|
||||
# Since the placement group creation will be atomic, it is sufficient
|
||||
# to schedule a single task.
|
||||
bundle_index = 0
|
||||
bundle = self.bundles[bundle_index]
|
||||
bundle = self.bundle_cache[bundle_index]
|
||||
|
||||
resource_name, value = self._get_none_zero_resource(bundle)
|
||||
num_cpus = 0
|
||||
@@ -67,11 +69,13 @@ class PlacementGroup:
|
||||
@property
|
||||
def bundle_specs(self) -> List[Dict]:
|
||||
"""List[Dict]: Return bundles belonging to this placement group."""
|
||||
return self.bundles
|
||||
self._fill_bundle_cache_if_needed()
|
||||
return self.bundle_cache
|
||||
|
||||
@property
|
||||
def bundle_count(self):
|
||||
return len(self.bundles)
|
||||
self._fill_bundle_cache_if_needed()
|
||||
return len(self.bundle_cache)
|
||||
|
||||
def _get_none_zero_resource(self, bundle: List[Dict]):
|
||||
for key, value in bundle.items():
|
||||
@@ -80,6 +84,30 @@ class PlacementGroup:
|
||||
return key, value
|
||||
assert False, "This code should be unreachable."
|
||||
|
||||
def _fill_bundle_cache_if_needed(self):
|
||||
if not self.bundle_cache:
|
||||
# Since creating placement group is async, it is
|
||||
# possible table is not ready yet. To avoid the
|
||||
# problem, we should keep trying with timeout.
|
||||
TIMEOUT_SECOND = 30
|
||||
WAIT_INTERVAL = 0.05
|
||||
timeout_cnt = 0
|
||||
worker = ray.worker.global_worker
|
||||
worker.check_connected()
|
||||
|
||||
while timeout_cnt < int(TIMEOUT_SECOND / WAIT_INTERVAL):
|
||||
pg_info = ray.state.state.placement_group_table(self.id)
|
||||
if pg_info:
|
||||
self.bundle_cache = list(pg_info["bundles"].values())
|
||||
return
|
||||
time.sleep(WAIT_INTERVAL)
|
||||
timeout_cnt += 1
|
||||
|
||||
raise RuntimeError(
|
||||
"Couldn't get the bundle information of placement group id "
|
||||
f"{self.id} in {TIMEOUT_SECOND} seconds. It is likely "
|
||||
"because GCS server is too busy.")
|
||||
|
||||
|
||||
def placement_group(bundles: List[Dict[str, float]],
|
||||
strategy: str = "PACK",
|
||||
@@ -120,7 +148,7 @@ def placement_group(bundles: List[Dict[str, float]],
|
||||
placement_group_id = worker.core_worker.create_placement_group(
|
||||
name, bundles, strategy)
|
||||
|
||||
return PlacementGroup(placement_group_id, bundles)
|
||||
return PlacementGroup(placement_group_id)
|
||||
|
||||
|
||||
def remove_placement_group(placement_group: PlacementGroup):
|
||||
@@ -149,6 +177,41 @@ def placement_group_table(placement_group: PlacementGroup) -> dict:
|
||||
return ray.state.state.placement_group_table(placement_group.id)
|
||||
|
||||
|
||||
def get_current_placement_group() -> Optional[PlacementGroup]:
|
||||
"""Get the current placement group which a task or actor is using.
|
||||
|
||||
It returns None if there's no current placement group for the worker.
|
||||
For example, if you call this method in your driver, it returns None
|
||||
(because drivers never belong to any placement group).
|
||||
|
||||
Examples:
|
||||
|
||||
>>> @ray.remote
|
||||
>>> def f():
|
||||
>>> # This will return the placement group the task f belongs to.
|
||||
>>> # It means this pg will be identical to the pg created below.
|
||||
>>> pg = get_current_placement_group()
|
||||
>>> pg = placement_group([{"CPU": 2}])
|
||||
>>> f.options(placement_group=pg).remote()
|
||||
|
||||
>>> # New script.
|
||||
>>> ray.init()
|
||||
>>> # New script doesn't belong to any placement group,
|
||||
>>> # so it returns None.
|
||||
>>> assert get_current_placement_group() is None
|
||||
|
||||
Return:
|
||||
PlacementGroup: Placement group object.
|
||||
None if the current task or actor wasn't
|
||||
created with any placement group.
|
||||
"""
|
||||
pg_id = ray.runtime_context.get_runtime_context(
|
||||
).current_placement_group_id
|
||||
if pg_id.is_nil():
|
||||
return None
|
||||
return PlacementGroup(pg_id)
|
||||
|
||||
|
||||
def check_placement_group_index(placement_group: PlacementGroup,
|
||||
bundle_index: int):
|
||||
assert placement_group is not None
|
||||
|
||||
@@ -155,6 +155,10 @@ class Worker:
|
||||
def current_task_id(self):
|
||||
return self.core_worker.get_current_task_id()
|
||||
|
||||
@property
|
||||
def placement_group_id(self):
|
||||
return self.core_worker.get_placement_group_id()
|
||||
|
||||
@property
|
||||
def current_session_and_job(self):
|
||||
"""Get the current session index and job id as pair."""
|
||||
|
||||
Reference in New Issue
Block a user