mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 20:22:39 +08:00
[Core]Fix ray.kill doesn't cancel pending actor bug (#14025)
This commit is contained in:
@@ -1093,6 +1093,90 @@ def test_actor_resource_demand(shutdown_only):
|
||||
global_state_accessor.disconnect()
|
||||
|
||||
|
||||
def test_kill_pending_actor_with_no_restart_true():
    """Killing a pending actor with `no_restart=True` clears its demand.

    The actor requests a custom "WORKER" resource that the `ray.init()`
    call below does not declare, so the actor is expected to stay pending.
    After `ray.kill(actor, no_restart=True)` the reported resource demands
    must become empty.

    Cleanup runs in `finally` blocks so that a failed or timed-out wait
    does not leave the Ray runtime or the accessor connection alive for
    tests that run afterwards in the same process.
    """
    cluster = ray.init()
    try:
        global_state_accessor = GlobalStateAccessor(
            cluster["redis_address"],
            ray.ray_constants.REDIS_DEFAULT_PASSWORD)
        global_state_accessor.connect()
        try:

            @ray.remote(resources={"WORKER": 1.0})
            class PendingActor:
                pass

            def no_resource_demand():
                # True once no resource demand shape is reported.
                message = global_state_accessor.get_all_resource_usage()
                resource_usages = (
                    ray.gcs_utils.ResourceUsageBatchData.FromString(message))
                demands = (
                    resource_usages.resource_load_by_shape.resource_demands)
                return len(demands) == 0

            # Kill actor with `no_restart=True`.
            actor = PendingActor.remote()
            # TODO(ffbin): The raylet doesn't guarantee the order when
            # dealing with RequestWorkerLease and CancelWorkerLease. If we
            # kill the actor immediately after creating the actor, we may
            # not be able to clean up the request cached by the raylet.
            # See https://github.com/ray-project/ray/issues/13545 for
            # details.
            time.sleep(1)
            ray.kill(actor, no_restart=True)

            # Actor is dead, so the infeasible task queue length is 0.
            wait_for_condition(no_resource_demand, timeout=10)
        finally:
            global_state_accessor.disconnect()
    finally:
        ray.shutdown()
|
||||
|
||||
|
||||
def test_kill_pending_actor_with_no_restart_false():
    """Killing a pending actor with `no_restart=False` honours restarts.

    The actor is declared with `max_restarts=1` and requests a custom
    "WORKER" resource that the `ray.init()` call below does not declare,
    so it is expected to stay pending. The first `ray.kill` lets the actor
    restart, so a resource demand must still be reported. The second
    `ray.kill` leaves it dead, so the reported demands must become empty.

    Cleanup runs in `finally` blocks so that a failed or timed-out wait
    does not leave the Ray runtime or the accessor connection alive for
    tests that run afterwards in the same process.
    """
    cluster = ray.init()
    try:
        global_state_accessor = GlobalStateAccessor(
            cluster["redis_address"],
            ray.ray_constants.REDIS_DEFAULT_PASSWORD)
        global_state_accessor.connect()
        try:

            @ray.remote(resources={"WORKER": 1.0}, max_restarts=1)
            class PendingActor:
                pass

            def has_resource_demand():
                # True while at least one resource demand shape is reported.
                message = global_state_accessor.get_all_resource_usage()
                resource_usages = (
                    ray.gcs_utils.ResourceUsageBatchData.FromString(message))
                demands = (
                    resource_usages.resource_load_by_shape.resource_demands)
                return len(demands) != 0

            def no_resource_demand():
                return not has_resource_demand()

            # Kill actor with `no_restart=False`.
            actor = PendingActor.remote()
            # TODO(ffbin): The raylet doesn't guarantee the order when
            # dealing with RequestWorkerLease and CancelWorkerLease. If we
            # kill the actor immediately after creating the actor, we may
            # not be able to clean up the request cached by the raylet.
            # See https://github.com/ray-project/ray/issues/13545 for
            # details.
            time.sleep(1)
            ray.kill(actor, no_restart=False)

            # Actor restarts, so the infeasible task queue length is 1.
            wait_for_condition(has_resource_demand, timeout=10)

            # Kill actor again and actor is dead,
            # so the infeasible task queue length is 0.
            ray.kill(actor, no_restart=False)
            wait_for_condition(no_resource_demand, timeout=10)
        finally:
            global_state_accessor.disconnect()
    finally:
        ray.shutdown()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
# Test suite is timing out. Disable on windows for now.
|
||||
|
||||
@@ -754,12 +754,15 @@ def test_warning_for_too_many_actors(shutdown_only):
|
||||
def __init__(self):
|
||||
time.sleep(1000)
|
||||
|
||||
[Foo.remote() for _ in range(num_cpus * 3)]
|
||||
# NOTE: We should save actor, otherwise it will be out of scope.
|
||||
actors = [Foo.remote() for _ in range(num_cpus * 3)]
|
||||
assert len(actors) == num_cpus * 3
|
||||
errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
|
||||
|
||||
[Foo.remote() for _ in range(num_cpus)]
|
||||
actors = [Foo.remote() for _ in range(num_cpus)]
|
||||
assert len(actors) == num_cpus
|
||||
errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
|
||||
|
||||
@@ -902,8 +902,10 @@ def test_capture_child_actors(ray_start_cluster):
|
||||
|
||||
# Kill an actor and wait until it is killed.
|
||||
ray.kill(a)
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
try:
|
||||
ray.get(a.ready.remote())
|
||||
except ray.exceptions.RayActorError:
|
||||
pass
|
||||
|
||||
# Now create an actor, but do not capture the current tasks
|
||||
a = Actor.options(
|
||||
@@ -925,8 +927,10 @@ def test_capture_child_actors(ray_start_cluster):
|
||||
|
||||
# Kill an actor and wait until it is killed.
|
||||
ray.kill(a)
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
try:
|
||||
ray.get(a.ready.remote())
|
||||
except ray.exceptions.RayActorError:
|
||||
pass
|
||||
|
||||
# Lastly, make sure when None is specified, actors are not scheduled
|
||||
# on the same placement group.
|
||||
@@ -1416,8 +1420,10 @@ ray.shutdown()
|
||||
|
||||
# Kill an actor and wait until it is killed.
|
||||
ray.kill(a)
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
try:
|
||||
ray.get(a.ready.remote())
|
||||
except ray.exceptions.RayActorError:
|
||||
pass
|
||||
|
||||
# We should have 2 alive pgs and 4 alive actors.
|
||||
assert assert_alive_num_pg(2)
|
||||
|
||||
@@ -199,17 +199,19 @@ def test_custom_resources(ray_start_regular_shared):
|
||||
assert current_resources["CPU"] == 1.0
|
||||
|
||||
# By default an actor should not reserve any resources.
|
||||
Queue()
|
||||
q = Queue()
|
||||
current_resources = ray.available_resources()
|
||||
assert current_resources["CPU"] == 1.0
|
||||
q.shutdown()
|
||||
|
||||
# Specify resource requirement. The queue should now reserve 1 CPU.
|
||||
Queue(actor_options={"num_cpus": 1})
|
||||
q = Queue(actor_options={"num_cpus": 1})
|
||||
|
||||
def no_cpu_in_resources():
|
||||
return "CPU" not in ray.available_resources()
|
||||
|
||||
wait_for_condition(no_cpu_in_resources)
|
||||
q.shutdown()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -470,8 +470,10 @@ def test_actor_holding_serialized_reference(one_worker_100MiB, use_ray_put,
|
||||
# Test that the actor exiting stops the reference from being pinned.
|
||||
ray.kill(actor)
|
||||
# Wait for the actor to exit.
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
try:
|
||||
ray.get(actor.delete_ref1.remote())
|
||||
except ray.exceptions.RayActorError:
|
||||
pass
|
||||
else:
|
||||
# Test that deleting the second reference stops it from being pinned.
|
||||
ray.get(actor.delete_ref2.remote())
|
||||
|
||||
Reference in New Issue
Block a user