mirror of
https://github.com/wassname/ray.git
synced 2026-07-04 00:17:00 +08:00
[Core] Clean up detached actors (#8759)
This commit is contained in:
@@ -668,7 +668,9 @@ def test_detached_actor(ray_start_regular):
|
||||
ValueError, match="Actor name cannot be an empty string"):
|
||||
DetachedActor._remote(name="")
|
||||
|
||||
DetachedActor._remote(name="d_actor")
|
||||
d = DetachedActor._remote(name="d_actor")
|
||||
assert ray.get(d.ping.remote()) == "pong"
|
||||
|
||||
with pytest.raises(ValueError, match="Please use a different name"):
|
||||
DetachedActor._remote(name="d_actor")
|
||||
|
||||
@@ -697,6 +699,141 @@ ray.get(actor.ping.remote())
|
||||
assert ray.get(detached_actor.ping.remote()) == "pong"
|
||||
|
||||
|
||||
def test_detached_actor_cleanup(ray_start_regular):
|
||||
@ray.remote
|
||||
class DetachedActor:
|
||||
def ping(self):
|
||||
return "pong"
|
||||
|
||||
dup_actor_name = "actor"
|
||||
|
||||
def create_and_kill_actor(actor_name):
|
||||
# Make sure same name is creatable after killing it.
|
||||
detached_actor = DetachedActor.options(name=actor_name).remote()
|
||||
# Wait for detached actor creation.
|
||||
assert ray.get(detached_actor.ping.remote()) == "pong"
|
||||
ray.kill(detached_actor)
|
||||
# Wait until actor dies.
|
||||
actor_status = ray.actors(actor_id=detached_actor._actor_id.hex())
|
||||
max_wait_time = 10
|
||||
wait_time = 0
|
||||
while actor_status["State"] != 3:
|
||||
actor_status = ray.actors(actor_id=detached_actor._actor_id.hex())
|
||||
time.sleep(1.0)
|
||||
wait_time += 1
|
||||
if wait_time >= max_wait_time:
|
||||
assert None, (
|
||||
"It took too much time to kill an actor: {}".format(
|
||||
detached_actor._actor_id))
|
||||
|
||||
create_and_kill_actor(dup_actor_name)
|
||||
|
||||
# This shouldn't be broken because actor
|
||||
# name should have been cleaned up from GCS.
|
||||
create_and_kill_actor(dup_actor_name)
|
||||
|
||||
redis_address = ray_start_regular["redis_address"]
|
||||
driver_script = """
|
||||
import ray
|
||||
import time
|
||||
ray.init(address="{}")
|
||||
|
||||
@ray.remote
|
||||
class DetachedActor:
|
||||
def ping(self):
|
||||
return "pong"
|
||||
|
||||
# Make sure same name is creatable after killing it.
|
||||
detached_actor = DetachedActor.options(name="{}").remote()
|
||||
assert ray.get(detached_actor.ping.remote()) == "pong"
|
||||
ray.kill(detached_actor)
|
||||
# Wait until actor dies.
|
||||
actor_status = ray.actors(actor_id=detached_actor._actor_id.hex())
|
||||
max_wait_time = 10
|
||||
wait_time = 0
|
||||
while actor_status["State"] != 3:
|
||||
actor_status = ray.actors(actor_id=detached_actor._actor_id.hex())
|
||||
time.sleep(1.0)
|
||||
wait_time += 1
|
||||
if wait_time >= max_wait_time:
|
||||
assert None, (
|
||||
"It took too much time to kill an actor")
|
||||
""".format(redis_address, dup_actor_name)
|
||||
|
||||
run_string_as_driver(driver_script)
|
||||
# Make sure we can create a detached actor created/killed
|
||||
# at other scripts.
|
||||
create_and_kill_actor(dup_actor_name)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster", [{
|
||||
"num_cpus": 3,
|
||||
"num_nodes": 1,
|
||||
"resources": {
|
||||
"first_node": 5
|
||||
}
|
||||
}],
|
||||
indirect=True)
|
||||
def test_detached_actor_cleanup_due_to_failure(ray_start_cluster):
|
||||
cluster = ray_start_cluster
|
||||
node = cluster.add_node(resources={"second_node": 1})
|
||||
cluster.wait_for_nodes()
|
||||
|
||||
@ray.remote
|
||||
class DetachedActor:
|
||||
def ping(self):
|
||||
return "pong"
|
||||
|
||||
def kill_itself(self):
|
||||
# kill itself.
|
||||
os._exit(0)
|
||||
|
||||
worker_failure_actor_name = "worker_failure_actor_name"
|
||||
node_failure_actor_name = "node_failure_actor_name"
|
||||
|
||||
def wait_until_actor_dead(handle):
|
||||
actor_status = ray.actors(actor_id=handle._actor_id.hex())
|
||||
max_wait_time = 10
|
||||
wait_time = 0
|
||||
while actor_status["State"] != 3:
|
||||
actor_status = ray.actors(actor_id=handle._actor_id.hex())
|
||||
time.sleep(1.0)
|
||||
wait_time += 1
|
||||
if wait_time >= max_wait_time:
|
||||
assert None, (
|
||||
"It took too much time to kill an actor: {}".format(
|
||||
handle._actor_id))
|
||||
|
||||
def create_detached_actor_blocking(actor_name,
|
||||
schedule_in_second_node=False):
|
||||
resources = {"second_node": 1}\
|
||||
if schedule_in_second_node\
|
||||
else {"first_node": 1}
|
||||
actor_handle = DetachedActor.options(
|
||||
name=actor_name, resources=resources).remote()
|
||||
# Wait for detached actor creation.
|
||||
assert ray.get(actor_handle.ping.remote()) == "pong"
|
||||
return actor_handle
|
||||
|
||||
# Name should be cleaned when workers fail
|
||||
deatched_actor = create_detached_actor_blocking(worker_failure_actor_name)
|
||||
deatched_actor.kill_itself.remote()
|
||||
wait_until_actor_dead(deatched_actor)
|
||||
# Name should be available now.
|
||||
deatched_actor = create_detached_actor_blocking(worker_failure_actor_name)
|
||||
assert ray.get(deatched_actor.ping.remote()) == "pong"
|
||||
|
||||
# Name should be cleaned when nodes fail.
|
||||
deatched_actor = create_detached_actor_blocking(
|
||||
node_failure_actor_name, schedule_in_second_node=True)
|
||||
cluster.remove_node(node)
|
||||
wait_until_actor_dead(deatched_actor)
|
||||
# Name should be available now.
|
||||
deatched_actor = create_detached_actor_blocking(node_failure_actor_name)
|
||||
assert ray.get(deatched_actor.ping.remote()) == "pong"
|
||||
|
||||
|
||||
def test_kill(ray_start_regular):
|
||||
@ray.remote
|
||||
class Actor:
|
||||
|
||||
@@ -3,6 +3,7 @@ import logging
|
||||
import ray
|
||||
import ray.cloudpickle as pickle
|
||||
from ray.experimental.internal_kv import _internal_kv_get, _internal_kv_put
|
||||
from ray.gcs_utils import ActorTableData
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -30,7 +31,14 @@ def _get_actor(name):
|
||||
raise ValueError(
|
||||
"The actor with name={} doesn't exist".format(name))
|
||||
handle = pickle.loads(pickled_state)
|
||||
|
||||
# If the actor state is dead, that means that this name is reusable.
|
||||
# We don't delete the name entry from key value store when
|
||||
# the actor is killed because ray.kill is asynchronous,
|
||||
# and it can cause worker leaks.
|
||||
actor_info = ray.actors(actor_id=handle._actor_id.hex())
|
||||
actor_state = actor_info.get("State", None)
|
||||
if actor_state and actor_state == ActorTableData.DEAD:
|
||||
raise ValueError("The actor with name={} is dead.".format(name))
|
||||
return handle
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user