Clean up actor state from the GCS (#8261)

* parametrize test

* Regression test and logging

* Test no restart after actor deletion

* Unit tests

* Refactor to subscribe to and lookup from worker failure table

* Refactor ActorManager to remove dependencies

* Revert "Regression test and logging"

This reverts commit 835e1a9091b51ca8efb00392d4cc4a665145de24.

* Revert "parametrize test"

This reverts commit f31272082831ba1a494816dd5511d87b24eca4c9.

* Revert "Test no restart after actor deletion"

This reverts commit 114a83de14329aa6ab787c80cd5757cf074a9072.

* doc

* merge

* Revert "Refactor to subscribe to and lookup from worker failure table"

This reverts commit 6aa13a05178d0b9aa1db9dee5c978c911b74fa3a.

* Revert "Revert "Test no restart after actor deletion""

This reverts commit 1bd92d09172aa8ab42632551cf9c56463f9598fe.

* Revert "Revert "parametrize test""

This reverts commit 639ba4d3b02167fb2b05e9878f9aa600bcec95b3.

* Revert "Revert "Regression test and logging""

This reverts commit f18b5f0db699a23cbccde32789e3639425e99ca4.

* Clean up actors that have gone out of scope

* Use actor ID instead of shared_ptr

* Clean up actors owned by dead workers

* Use actor ID instead of shared_ptr

* TODO and lint

* Fix unit tests

* Add unit tests for supervision and docs

* xx

* Fix tests

* Fix tests

* fix build
This commit is contained in:
Stephanie Wang
2020-05-09 18:43:49 -07:00
committed by GitHub
parent 4421f3a000
commit 3a25f5f5b4
18 changed files with 672 additions and 109 deletions
+26
View File
@@ -699,6 +699,32 @@ def test_use_actor_within_actor(ray_start_10_cpus):
assert ray.get(actor2.get_values.remote(5)) == (3, 4)
def test_use_actor_twice(ray_start_10_cpus):
    """Call the same actor twice through handles passed to another actor.

    A counter actor's handle is sent as an argument to two separate method
    calls on a second actor. Both calls must reach the same counter
    instance, so the returned count goes 1, then 2.
    """

    @ray.remote
    class Actor1:
        def __init__(self):
            self.count = 0

        def inc(self):
            self.count += 1
            return self.count

    @ray.remote
    class Actor2:
        def __init__(self):
            pass

        def inc(self, handle):
            # Forward the increment to the actor behind ``handle`` and block
            # until its result is available.
            pending = handle.inc.remote()
            return ray.get(pending)

    counter = Actor1.remote()
    forwarder = Actor2.remote()

    # Each call passes the counter handle as an argument again; the count
    # must keep increasing across the calls.
    for expected_count in (1, 2):
        assert ray.get(forwarder.inc.remote(counter)) == expected_count
def test_define_actor_within_remote_function(ray_start_10_cpus):
# Make sure we can define actors within remote functions.
+41
View File
@@ -110,6 +110,47 @@ def test_actor_lifetime_load_balancing(ray_start_cluster):
ray.get([actor.ping.remote() for actor in actors])
@pytest.mark.parametrize(
    "ray_start_regular",
    [{"resources": {"actor": 1}, "num_cpus": 2}],
    indirect=True)
def test_deleted_actor_no_restart(ray_start_regular):
    """Check that an actor whose handles were all dropped is not restarted.

    The cluster offers a single unit of the custom "actor" resource, which
    the actor class requires. If the first actor were restarted after being
    cleaned up, it would hold that resource and the second actor created at
    the end of the test could not answer.
    """

    @ray.remote(resources={"actor": 1}, max_reconstructions=3)
    class Actor:
        def method(self):
            return 1

        def getpid(self):
            return os.getpid()

    @ray.remote
    def f(actor, signal):
        # Block until the driver sends the signal, then use the handle.
        ray.get(signal.wait.remote())
        return ray.get(actor.method.remote())

    start_signal = ray.test_utils.SignalActor.remote()
    first_actor = Actor.remote()
    first_pid = ray.get(first_actor.getpid.remote())

    # Pass the handle to another task that cannot run yet.
    result_ref = f.remote(first_actor, start_signal)

    # Delete the original handle. The actor should not get killed yet,
    # because the pending task still holds a handle to it.
    del first_actor

    # Once the task finishes, the actor process should get killed.
    ray.get(start_signal.send.remote())
    assert ray.get(result_ref) == 1
    ray.test_utils.wait_for_pid_to_exit(first_pid)

    # Create another actor with the same resource requirement to make sure
    # the old one was not restarted.
    second_actor = Actor.remote()
    ray.get(second_actor.getpid.remote())
def test_exception_raised_when_actor_node_dies(ray_start_cluster_head):
cluster = ray_start_cluster_head
remote_node = cluster.add_node()
+21 -5
View File
@@ -1051,7 +1051,10 @@ def test_serialized_id(ray_start_cluster):
ray.get(get.remote([obj], True))
def test_fate_sharing(ray_start_cluster):
@pytest.mark.parametrize("use_actors,node_failure",
[(False, False), (False, True), (True, False),
(True, True)])
def test_fate_sharing(ray_start_cluster, use_actors, node_failure):
config = json.dumps({
"num_heartbeats_timeout": 10,
"raylet_heartbeat_timeout_milliseconds": 100,
@@ -1074,6 +1077,9 @@ def test_fate_sharing(ray_start_cluster):
def probe():
return
# TODO(swang): This test does not pass if max_reconstructions > 0 for the
# raylet codepath. Add this parameter once the GCS actor service is enabled
# by default.
@ray.remote
class Actor(object):
def __init__(self):
@@ -1121,10 +1127,20 @@ def test_fate_sharing(ray_start_cluster):
assert wait_for_condition(child_resource_available)
return node_to_kill
test_process_failure(use_actors=True)
test_process_failure(use_actors=False)
node_to_kill = test_node_failure(node_to_kill, use_actors=True)
node_to_kill = test_node_failure(node_to_kill, use_actors=False)
if node_failure:
test_node_failure(node_to_kill, use_actors)
else:
test_process_failure(use_actors)
ray.state.state._check_connected()
keys = [
key for r in ray.state.state.redis_clients
for key in r.keys("WORKER_FAILURE*")
]
if node_failure:
assert len(keys) <= 1, len(keys)
else:
assert len(keys) <= 2, len(keys)
if __name__ == "__main__":