mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 02:46:49 +08:00
Clean up actor state from the GCS (#8261)
* parametrize test * Regression test and logging * Test no restart after actor deletion * Unit tests * Refactor to subscribe to and lookup from worker failure table * Refactor ActorManager to remove dependencies * Revert "Regression test and logging" This reverts commit 835e1a9091b51ca8efb00392d4cc4a665145de24. * Revert "parametrize test" This reverts commit f31272082831ba1a494816dd5511d87b24eca4c9. * Revert "Test no restart after actor deletion" This reverts commit 114a83de14329aa6ab787c80cd5757cf074a9072. * doc * merge * Revert "Refactor to subscribe to and lookup from worker failure table" This reverts commit 6aa13a05178d0b9aa1db9dee5c978c911b74fa3a. * Revert "Revert "Test no restart after actor deletion"" This reverts commit 1bd92d09172aa8ab42632551cf9c56463f9598fe. * Revert "Revert "parametrize test"" This reverts commit 639ba4d3b02167fb2b05e9878f9aa600bcec95b3. * Revert "Revert "Regression test and logging"" This reverts commit f18b5f0db699a23cbccde32789e3639425e99ca4. * Clean up actors that have gone out of scope * Use actor ID instead of shared_ptr * Clean up actors owned by dead workers * Use actor ID instead of shared_ptr * TODO and lint * Fix unit tests * Add unit tests for supervision and docs * xx * Fix tests * Fix tests * fix build
This commit is contained in:
@@ -699,6 +699,32 @@ def test_use_actor_within_actor(ray_start_10_cpus):
|
||||
assert ray.get(actor2.get_values.remote(5)) == (3, 4)
|
||||
|
||||
|
||||
def test_use_actor_twice(ray_start_10_cpus):
    """Call one actor twice through a handle passed into another actor.

    The same ``Actor1`` handle is handed to ``Actor2.inc`` on two separate
    remote calls; the counter held by ``Actor1`` must advance on each call,
    showing that both calls reached the same actor.
    """

    @ray.remote
    class Actor1:
        # Stateful counter; `inc` returns the value after incrementing.
        def __init__(self):
            self.count = 0

        def inc(self):
            self.count += 1
            return self.count

    @ray.remote
    class Actor2:
        # Stateless forwarder: invokes `inc` on whatever handle it is given
        # and blocks until the result is available.
        def __init__(self):
            pass

        def inc(self, handle):
            return ray.get(handle.inc.remote())

    counter = Actor1.remote()
    forwarder = Actor2.remote()

    # Two sequential round trips through the forwarder; the counter should
    # read 1 after the first and 2 after the second.
    for expected in (1, 2):
        assert ray.get(forwarder.inc.remote(counter)) == expected
|
||||
|
||||
|
||||
def test_define_actor_within_remote_function(ray_start_10_cpus):
|
||||
# Make sure we can define actors within remote functions.
|
||||
|
||||
|
||||
@@ -110,6 +110,47 @@ def test_actor_lifetime_load_balancing(ray_start_cluster):
|
||||
ray.get([actor.ping.remote() for actor in actors])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "ray_start_regular",
    [{"resources": {"actor": 1}, "num_cpus": 2}],
    indirect=True)
def test_deleted_actor_no_restart(ray_start_regular):
    """Check that an actor whose handles are all gone is not restarted.

    The fixture is parametrized with a single unit of the custom "actor"
    resource, and the actor class below requests that whole unit while
    allowing up to three reconstructions. After the last reference to the
    actor disappears, its process must exit, and a second actor with the
    same resource requirement must still be able to start.
    """

    @ray.remote(resources={"actor": 1}, max_reconstructions=3)
    class Actor:
        def method(self):
            return 1

        def getpid(self):
            return os.getpid()

    @ray.remote
    def f(actor, signal):
        # Block until the driver sends the signal, then call the actor.
        ray.get(signal.wait.remote())
        return ray.get(actor.method.remote())

    signal = ray.test_utils.SignalActor.remote()
    first_actor = Actor.remote()
    first_pid = ray.get(first_actor.getpid.remote())

    # Hand the actor handle to a task that is parked on the signal, so the
    # task holds a reference while it waits.
    pending_result = f.remote(first_actor, signal)

    # Drop the driver's own handle. The actor should not get killed yet
    # because the pending task still refers to it.
    del first_actor

    # Release the task; once it finishes, the actor process should get
    # killed.
    ray.get(signal.send.remote())
    assert ray.get(pending_result) == 1
    ray.test_utils.wait_for_pid_to_exit(first_pid)

    # Start another actor with the same resource requirement and call it, to
    # make sure the old one was not restarted.
    second_actor = Actor.remote()
    second_pid = ray.get(second_actor.getpid.remote())
|
||||
|
||||
|
||||
def test_exception_raised_when_actor_node_dies(ray_start_cluster_head):
|
||||
cluster = ray_start_cluster_head
|
||||
remote_node = cluster.add_node()
|
||||
|
||||
@@ -1051,7 +1051,10 @@ def test_serialized_id(ray_start_cluster):
|
||||
ray.get(get.remote([obj], True))
|
||||
|
||||
|
||||
def test_fate_sharing(ray_start_cluster):
|
||||
@pytest.mark.parametrize("use_actors,node_failure",
|
||||
[(False, False), (False, True), (True, False),
|
||||
(True, True)])
|
||||
def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
|
||||
config = json.dumps({
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
@@ -1074,6 +1077,9 @@ def test_fate_sharing(ray_start_cluster):
|
||||
def probe():
|
||||
return
|
||||
|
||||
# TODO(swang): This test does not pass if max_reconstructions > 0 for the
|
||||
# raylet codepath. Add this parameter once the GCS actor service is enabled
|
||||
# by default.
|
||||
@ray.remote
|
||||
class Actor(object):
|
||||
def __init__(self):
|
||||
@@ -1121,10 +1127,20 @@ def test_fate_sharing(ray_start_cluster):
|
||||
assert wait_for_condition(child_resource_available)
|
||||
return node_to_kill
|
||||
|
||||
test_process_failure(use_actors=True)
|
||||
test_process_failure(use_actors=False)
|
||||
node_to_kill = test_node_failure(node_to_kill, use_actors=True)
|
||||
node_to_kill = test_node_failure(node_to_kill, use_actors=False)
|
||||
if node_failure:
|
||||
test_node_failure(node_to_kill, use_actors)
|
||||
else:
|
||||
test_process_failure(use_actors)
|
||||
|
||||
ray.state.state._check_connected()
|
||||
keys = [
|
||||
key for r in ray.state.state.redis_clients
|
||||
for key in r.keys("WORKER_FAILURE*")
|
||||
]
|
||||
if node_failure:
|
||||
assert len(keys) <= 1, len(keys)
|
||||
else:
|
||||
assert len(keys) <= 2, len(keys)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user