mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 09:29:05 +08:00
[Placement Group] Placement group automatic cleanup. (#11546)
* In progress. Done with all placement group manager code. * It is working with job. * Finished detached actor implementation. * Fix minor issue. * In progress. * Addressed code review. * Addressed code review. * Addressed code reivew. * Fix a build error.
This commit is contained in:
@@ -12,6 +12,7 @@ from ray.test_utils import (get_other_nodes, wait_for_condition,
|
||||
get_error_message)
|
||||
import ray.cluster_utils
|
||||
from ray._raylet import PlacementGroupID
|
||||
from ray.test_utils import run_string_as_driver
|
||||
from ray.util.placement_group import (PlacementGroup,
|
||||
get_current_placement_group)
|
||||
|
||||
@@ -995,5 +996,165 @@ def test_ready_warning_suppressed(ray_start_regular, error_pubsub):
|
||||
assert len(errors) == 0
|
||||
|
||||
|
||||
def test_automatic_cleanup_job(ray_start_cluster):
|
||||
# Make sure the placement groups created by a
|
||||
# job, actor, and task are cleaned when the job is done.
|
||||
cluster = ray_start_cluster
|
||||
num_nodes = 3
|
||||
num_cpu_per_node = 4
|
||||
# Create 3 nodes cluster.
|
||||
for _ in range(num_nodes):
|
||||
cluster.add_node(num_cpus=num_cpu_per_node)
|
||||
|
||||
info = ray.init(address=cluster.address)
|
||||
available_cpus = ray.available_resources()["CPU"]
|
||||
assert available_cpus == num_nodes * num_cpu_per_node
|
||||
|
||||
driver_code = f"""
|
||||
import ray
|
||||
|
||||
ray.init(address="{info["redis_address"]}")
|
||||
|
||||
def create_pg():
|
||||
pg = ray.util.placement_group(
|
||||
[{{"CPU": 1}} for _ in range(3)],
|
||||
strategy="STRICT_SPREAD")
|
||||
ray.get(pg.ready())
|
||||
return pg
|
||||
|
||||
@ray.remote(num_cpus=0)
|
||||
def f():
|
||||
create_pg()
|
||||
|
||||
@ray.remote(num_cpus=0)
|
||||
class A:
|
||||
def create_pg(self):
|
||||
create_pg()
|
||||
|
||||
ray.get(f.remote())
|
||||
a = A.remote()
|
||||
ray.get(a.create_pg.remote())
|
||||
# Create 2 pgs to make sure multiple placement groups that belong
|
||||
# to a single job will be properly cleaned.
|
||||
create_pg()
|
||||
create_pg()
|
||||
|
||||
ray.shutdown()
|
||||
"""
|
||||
|
||||
run_string_as_driver(driver_code)
|
||||
|
||||
# Wait until the driver is reported as dead by GCS.
|
||||
def is_job_done():
|
||||
jobs = ray.jobs()
|
||||
for job in jobs:
|
||||
if "StopTime" in job:
|
||||
return True
|
||||
return False
|
||||
|
||||
def assert_num_cpus(expected_num_cpus):
|
||||
if expected_num_cpus == 0:
|
||||
return "CPU" not in ray.available_resources()
|
||||
return ray.available_resources()["CPU"] == expected_num_cpus
|
||||
|
||||
wait_for_condition(is_job_done)
|
||||
available_cpus = ray.available_resources()["CPU"]
|
||||
wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
|
||||
|
||||
|
||||
def test_automatic_cleanup_detached_actors(ray_start_cluster):
|
||||
# Make sure the placement groups created by a
|
||||
# detached actors are cleaned properly.
|
||||
cluster = ray_start_cluster
|
||||
num_nodes = 3
|
||||
num_cpu_per_node = 2
|
||||
# Create 3 nodes cluster.
|
||||
for _ in range(num_nodes):
|
||||
cluster.add_node(num_cpus=num_cpu_per_node)
|
||||
|
||||
info = ray.init(address=cluster.address)
|
||||
available_cpus = ray.available_resources()["CPU"]
|
||||
assert available_cpus == num_nodes * num_cpu_per_node
|
||||
|
||||
driver_code = f"""
|
||||
import ray
|
||||
|
||||
ray.init(address="{info["redis_address"]}")
|
||||
|
||||
def create_pg():
|
||||
pg = ray.util.placement_group(
|
||||
[{{"CPU": 1}} for _ in range(3)],
|
||||
strategy="STRICT_SPREAD")
|
||||
ray.get(pg.ready())
|
||||
return pg
|
||||
|
||||
# TODO(sang): Placement groups created by tasks launched by detached actor
|
||||
# is not cleaned with the current protocol.
|
||||
# @ray.remote(num_cpus=0)
|
||||
# def f():
|
||||
# create_pg()
|
||||
|
||||
@ray.remote(num_cpus=0, max_restarts=1)
|
||||
class A:
|
||||
def create_pg(self):
|
||||
create_pg()
|
||||
def create_child_pg(self):
|
||||
self.a = A.options(name="B").remote()
|
||||
ray.get(self.a.create_pg.remote())
|
||||
def kill_child_actor(self):
|
||||
ray.kill(self.a)
|
||||
try:
|
||||
ray.get(self.a.create_pg.remote())
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
a = A.options(lifetime="detached", name="A").remote()
|
||||
ray.get(a.create_pg.remote())
|
||||
# TODO(sang): Currently, child tasks are cleaned when a detached actor
|
||||
# is dead. We cannot test this scenario until it is fixed.
|
||||
# ray.get(a.create_child_pg.remote())
|
||||
|
||||
ray.shutdown()
|
||||
"""
|
||||
|
||||
run_string_as_driver(driver_code)
|
||||
|
||||
# Wait until the driver is reported as dead by GCS.
|
||||
def is_job_done():
|
||||
jobs = ray.jobs()
|
||||
for job in jobs:
|
||||
if "StopTime" in job:
|
||||
return True
|
||||
return False
|
||||
|
||||
def assert_num_cpus(expected_num_cpus):
|
||||
if expected_num_cpus == 0:
|
||||
return "CPU" not in ray.available_resources()
|
||||
return ray.available_resources()["CPU"] == expected_num_cpus
|
||||
|
||||
wait_for_condition(is_job_done)
|
||||
assert assert_num_cpus(num_nodes)
|
||||
# Make sure when a child actor spawned by a detached actor
|
||||
# is killed, the placement group is removed.
|
||||
a = ray.get_actor("A")
|
||||
# TODO(sang): child of detached actors
|
||||
# seem to be killed when jobs are done. We should fix this before
|
||||
# testing this scenario.
|
||||
# ray.get(a.kill_child_actor.remote())
|
||||
# assert assert_num_cpus(num_nodes)
|
||||
|
||||
# Make sure placement groups are cleaned when detached actors are killed.
|
||||
ray.kill(a, no_restart=False)
|
||||
wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
|
||||
# The detached actor a should've been restarted.
|
||||
# Recreate a placement group.
|
||||
ray.get(a.create_pg.remote())
|
||||
wait_for_condition(lambda: assert_num_cpus(num_nodes))
|
||||
# Kill it again and make sure the placement group
|
||||
# that is created is deleted again.
|
||||
ray.kill(a, no_restart=False)
|
||||
wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
Reference in New Issue
Block a user