[Placement Group]Add detached support for placement group. (#13582)

This commit is contained in:
DK.Pino
2021-01-27 18:51:26 +08:00
committed by GitHub
parent d2963f4ee1
commit 7f6d326ad8
15 changed files with 209 additions and 21 deletions
+36
View File
@@ -252,6 +252,42 @@ Note that you can anytime remove the placement group to clean up resources.
ray.shutdown()
Placement Group Lifetimes
-------------------------
.. tabs::
.. group-tab:: Python
By default, the lifetimes of placement groups are not detached and will be destroyed
when the driver is terminated (but, if it is created from a detached actor, it is
killed when the detached actor is killed). If you'd like to keep the placement group
alive regardless of its job or detached actor, you should specify
`lifetime="detached"`. For example:
.. code-block:: python
# first_driver.py
pg = placement_group([{"CPU": 2}, {"CPU": 2}], strategy="STRICT_SPREAD", lifetime="detached")
ray.get(pg.ready())
The placement group's lifetime will be independent of the driver now. This means it
is possible to retrieve the placement group from other drivers regardless of when
the current driver exits. Let's see an example:
.. code-block:: python
# second_driver.py
table = ray.util.placement_group_table()
print(len(table))
Note that the lifetime option is decoupled from the name. If we only specified
the name without specifying ``lifetime="detached"``, then the placement group can
only be retrieved as long as the original driver is still running.
.. group-tab:: Java
The lifetime argument is not implemented for Java APIs yet.
Tips for Using Placement Groups
-------------------------------
- Learn the :ref:`lifecycle <ray-placement-group-lifecycle-ref>` of placement groups.
+4 -2
View File
@@ -1184,7 +1184,8 @@ cdef class CoreWorker:
self,
c_string name,
c_vector[unordered_map[c_string, double]] bundles,
c_string strategy):
c_string strategy,
c_bool is_detached):
cdef:
CPlacementGroupID c_placement_group_id
CPlacementStrategy c_strategy
@@ -1208,7 +1209,8 @@ cdef class CoreWorker:
CPlacementGroupCreationOptions(
name,
c_strategy,
bundles
bundles,
is_detached
),
&c_placement_group_id))
+3 -1
View File
@@ -584,7 +584,9 @@ class ActorClass:
elif lifetime == "detached":
detached = True
else:
raise ValueError("lifetime must be either `None` or 'detached'")
raise ValueError(
"actor `lifetime` argument must be either `None` or 'detached'"
)
if placement_group_capture_child_tasks is None:
placement_group_capture_child_tasks = (
+2 -1
View File
@@ -270,7 +270,8 @@ cdef extern from "ray/core_worker/common.h" nogil:
CPlacementGroupCreationOptions(
const c_string &name,
CPlacementStrategy strategy,
const c_vector[unordered_map[c_string, double]] &bundles
const c_vector[unordered_map[c_string, double]] &bundles,
c_bool is_detached
)
cdef extern from "ray/gcs/gcs_client.h" nogil:
+113
View File
@@ -1309,6 +1309,119 @@ def test_schedule_placement_groups_at_the_same_time():
wait_for_condition(is_all_placement_group_removed)
ray.shutdown()
def test_detached_placement_group(ray_start_cluster):
cluster = ray_start_cluster
for _ in range(2):
cluster.add_node(num_cpus=3)
cluster.wait_for_nodes()
info = ray.init(address=cluster.address)
# Make sure detached placement group will alive when job dead.
driver_code = f"""
import ray
ray.init(address="{info["redis_address"]}")
pg = ray.util.placement_group(
[{{"CPU": 1}} for _ in range(2)],
strategy="STRICT_SPREAD", lifetime="detached")
ray.get(pg.ready())
@ray.remote(num_cpus=1)
class Actor:
def ready(self):
return True
for bundle_index in range(2):
actor = Actor.options(lifetime="detached", placement_group=pg,
placement_group_bundle_index=bundle_index).remote()
ray.get(actor.ready.remote())
ray.shutdown()
"""
run_string_as_driver(driver_code)
# Wait until the driver is reported as dead by GCS.
def is_job_done():
jobs = ray.jobs()
for job in jobs:
if "StopTime" in job:
return True
return False
def assert_alive_num_pg(expected_num_pg):
alive_num_pg = 0
for _, placement_group_info in ray.util.placement_group_table().items(
):
if placement_group_info["state"] == "CREATED":
alive_num_pg += 1
return alive_num_pg == expected_num_pg
def assert_alive_num_actor(expected_num_actor):
alive_num_actor = 0
for actor_info in ray.actors().values():
if actor_info["State"] == ray.gcs_utils.ActorTableData.ALIVE:
alive_num_actor += 1
return alive_num_actor == expected_num_actor
wait_for_condition(is_job_done)
assert assert_alive_num_pg(1)
assert assert_alive_num_actor(2)
# Make sure detached placement group will alive when its creator which
# is detached actor dead.
# Test actors first.
@ray.remote(num_cpus=1)
class NestedActor:
def ready(self):
return True
@ray.remote(num_cpus=1)
class Actor:
def __init__(self):
self.actors = []
def ready(self):
return True
def schedule_nested_actor_with_detached_pg(self):
# Create placement group which is detached.
pg = ray.util.placement_group(
[{
"CPU": 1
} for _ in range(2)],
strategy="STRICT_SPREAD",
lifetime="detached",
name="detached_pg")
ray.get(pg.ready())
# Schedule nested actor with the placement group.
for bundle_index in range(2):
actor = NestedActor.options(
placement_group=pg,
placement_group_bundle_index=bundle_index,
lifetime="detached").remote()
ray.get(actor.ready.remote())
self.actors.append(actor)
a = Actor.options(lifetime="detached").remote()
ray.get(a.ready.remote())
# 1 parent actor and 2 children actor.
ray.get(a.schedule_nested_actor_with_detached_pg.remote())
# Kill an actor and wait until it is killed.
ray.kill(a)
with pytest.raises(ray.exceptions.RayActorError):
ray.get(a.ready.remote())
# We should have 2 alive pgs and 4 alive actors.
assert assert_alive_num_pg(2)
assert assert_alive_num_actor(4)
if __name__ == "__main__":
sys.exit(pytest.main(["-v", __file__]))
+15 -2
View File
@@ -145,7 +145,8 @@ class PlacementGroup:
def placement_group(bundles: List[Dict[str, float]],
strategy: str = "PACK",
name: str = "unnamed_group") -> PlacementGroup:
name: str = "unnamed_group",
lifetime=None) -> PlacementGroup:
"""Asynchronously creates a PlacementGroup.
Args:
@@ -160,6 +161,10 @@ def placement_group(bundles: List[Dict[str, float]],
- "STRICT_SPREAD": Packs Bundles across distinct nodes.
name(str): The name of the placement group.
lifetime(str): Either `None`, which defaults to the placement group
will fate share with its creator and will be deleted once its
creator is dead, or "detached", which means the placement group
will live as a global object independent of the creator.
Return:
PlacementGroup: Placement group object.
@@ -179,8 +184,16 @@ def placement_group(bundles: List[Dict[str, float]],
"Bundles cannot be an empty dictionary or "
f"resources with only 0 values. Bundles: {bundles}")
if lifetime is None:
detached = False
elif lifetime == "detached":
detached = True
else:
raise ValueError("placement group `lifetime` argument must be either"
" `None` or 'detached'")
placement_group_id = worker.core_worker.create_placement_group(
name, bundles, strategy)
name, bundles, strategy, detached)
return PlacementGroup(placement_group_id)
+4 -2
View File
@@ -67,8 +67,9 @@ class PlacementGroupSpecBuilder {
PlacementGroupSpecBuilder &SetPlacementGroupSpec(
const PlacementGroupID &placement_group_id, std::string name,
const std::vector<std::unordered_map<std::string, double>> &bundles,
const rpc::PlacementStrategy strategy, const JobID &creator_job_id,
const ActorID &creator_actor_id, bool is_creator_detached_actor) {
const rpc::PlacementStrategy strategy, const bool is_detached,
const JobID &creator_job_id, const ActorID &creator_actor_id,
bool is_creator_detached_actor) {
message_->set_placement_group_id(placement_group_id.Binary());
message_->set_name(name);
message_->set_strategy(strategy);
@@ -82,6 +83,7 @@ class PlacementGroupSpecBuilder {
message_->set_creator_job_dead(is_creator_detached_actor);
message_->set_creator_actor_id(creator_actor_id.Binary());
message_->set_creator_actor_dead(creator_actor_id.IsNil());
message_->set_is_detached(is_detached);
for (size_t i = 0; i < bundles.size(); i++) {
auto resources = bundles[i];
+7 -2
View File
@@ -144,8 +144,11 @@ using PlacementStrategy = rpc::PlacementStrategy;
struct PlacementGroupCreationOptions {
PlacementGroupCreationOptions(
std::string name, PlacementStrategy strategy,
std::vector<std::unordered_map<std::string, double>> bundles)
: name(std::move(name)), strategy(strategy), bundles(std::move(bundles)) {}
std::vector<std::unordered_map<std::string, double>> bundles, bool is_detached)
: name(std::move(name)),
strategy(strategy),
bundles(std::move(bundles)),
is_detached(is_detached) {}
/// The name of the placement group.
const std::string name;
@@ -153,6 +156,8 @@ struct PlacementGroupCreationOptions {
const PlacementStrategy strategy = rpc::PACK;
/// The resource bundles in this placement group.
const std::vector<std::unordered_map<std::string, double>> bundles;
/// Whether to keep the placement group persistent after its creator dead.
const bool is_detached = false;
};
} // namespace ray
+2 -2
View File
@@ -1463,8 +1463,8 @@ Status CoreWorker::CreatePlacementGroup(
builder.SetPlacementGroupSpec(
placement_group_id, placement_group_creation_options.name,
placement_group_creation_options.bundles, placement_group_creation_options.strategy,
worker_context_.GetCurrentJobID(), worker_context_.GetCurrentActorID(),
worker_context_.CurrentActorDetached());
placement_group_creation_options.is_detached, worker_context_.GetCurrentJobID(),
worker_context_.GetCurrentActorID(), worker_context_.CurrentActorDetached());
PlacementGroupSpecification placement_group_spec = builder.Build();
*return_placement_group_id = placement_group_id;
RAY_LOG(INFO) << "Submitting Placement Group creation to GCS: " << placement_group_id;
@@ -201,7 +201,8 @@ inline ray::PlacementGroupCreationOptions ToPlacementGroupCreationOptions(
});
});
return ray::PlacementGroupCreationOptions(JavaStringToNativeString(env, name),
ConvertStrategy(java_strategy), bundles);
ConvertStrategy(java_strategy), bundles,
/*is_detached=*/false);
}
#ifdef __cplusplus
@@ -96,11 +96,15 @@ void GcsPlacementGroup::MarkCreatorActorDead() {
placement_group_table_data_.set_creator_actor_dead(true);
}
bool GcsPlacementGroup::IsPlacementGroupRemovable() const {
return placement_group_table_data_.creator_job_dead() &&
bool GcsPlacementGroup::IsPlacementGroupLifetimeDone() const {
return !IsDetached() && placement_group_table_data_.creator_job_dead() &&
placement_group_table_data_.creator_actor_dead();
}
bool GcsPlacementGroup::IsDetached() const {
return placement_group_table_data_.is_detached();
}
/////////////////////////////////////////////////////////////////////////////////////////
GcsPlacementGroupManager::GcsPlacementGroupManager(
@@ -495,7 +499,7 @@ void GcsPlacementGroupManager::CleanPlacementGroupIfNeededWhenJobDead(
continue;
}
placement_group->MarkCreatorJobDead();
if (placement_group->IsPlacementGroupRemovable()) {
if (placement_group->IsPlacementGroupLifetimeDone()) {
RemovePlacementGroup(placement_group->GetPlacementGroupID(), [](Status status) {});
}
}
@@ -509,7 +513,7 @@ void GcsPlacementGroupManager::CleanPlacementGroupIfNeededWhenActorDead(
continue;
}
placement_group->MarkCreatorActorDead();
if (placement_group->IsPlacementGroupRemovable()) {
if (placement_group->IsPlacementGroupLifetimeDone()) {
RemovePlacementGroup(placement_group->GetPlacementGroupID(), [](Status status) {});
}
}
@@ -61,6 +61,7 @@ class GcsPlacementGroup {
placement_group_spec.creator_job_dead());
placement_group_table_data_.set_creator_actor_dead(
placement_group_spec.creator_actor_dead());
placement_group_table_data_.set_is_detached(placement_group_spec.is_detached());
}
/// Get the immutable PlacementGroupTableData of this placement group.
@@ -107,8 +108,11 @@ class GcsPlacementGroup {
/// Mark that the creator actor of this placement group is dead.
void MarkCreatorActorDead();
/// Return True if the placement group is removable. False otherwise.
bool IsPlacementGroupRemovable() const;
/// Return True if the placement group lifetime is done. False otherwise.
bool IsPlacementGroupLifetimeDone() const;
/// Returns whether or not this is a detached placement group.
bool IsDetached() const;
private:
/// The placement_group meta data which contains the task specification as well as the
+3 -2
View File
@@ -101,8 +101,9 @@ struct Mocker {
PlacementGroupSpecBuilder builder;
auto placement_group_id = PlacementGroupID::FromRandom();
builder.SetPlacementGroupSpec(placement_group_id, name, bundles, strategy, job_id,
actor_id, /* is_creator_detached */ false);
builder.SetPlacementGroupSpec(placement_group_id, name, bundles, strategy,
/* is_detached */ false, job_id, actor_id,
/* is_creator_detached */ false);
return builder.Build();
}
+2
View File
@@ -233,6 +233,8 @@ message PlacementGroupSpec {
bool creator_job_dead = 7;
// Whether or not if the creator actor is dead.
bool creator_actor_dead = 8;
// Whether the placement group is persistent.
bool is_detached = 9;
}
message ObjectReference {
+2
View File
@@ -191,6 +191,8 @@ message PlacementGroupTableData {
bool creator_job_dead = 8;
// Whether or not if the creator actor is dead.
bool creator_actor_dead = 9;
// Whether the placement group is persistent.
bool is_detached = 10;
}
message ScheduleData {