Signal actor failure (#4196)

This commit is contained in:
Ion
2019-03-21 15:17:42 -07:00
committed by Philipp Moritz
parent c36d03874b
commit 59079a799c
4 changed files with 86 additions and 8 deletions
+19
View File
@@ -161,3 +161,22 @@ def call_ray_start(request):
ray.shutdown()
# Kill the Ray cluster.
subprocess.Popen(["ray", "stop"]).wait()
@pytest.fixture()
def two_node_cluster():
    """Start a cluster with two single-CPU nodes and connect the driver.

    Yields:
        A ``(cluster, remote_node)`` tuple, where ``remote_node`` is the
        node returned by the second ``add_node`` call.
    """
    # The same internal config overrides are handed to every node.
    # NOTE(review): these short timeouts presumably make node failures get
    # detected quickly -- confirm against Ray's internal config options.
    config_json = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 200,
        "num_heartbeats_timeout": 10,
    })
    head_args = {"_internal_config": config_json}
    cluster = ray.tests.cluster_utils.Cluster(head_node_args=head_args)

    # Add two identical nodes; only the handle of the second one is kept.
    node_options = {"num_cpus": 1, "_internal_config": config_json}
    cluster.add_node(**node_options)
    remote_node = cluster.add_node(**node_options)

    ray.init(redis_address=cluster.redis_address)
    yield cluster, remote_node

    # Teardown: pytest resumes the fixture here once the test has finished.
    ray.shutdown()
    cluster.shutdown()
+28
View File
@@ -274,6 +274,34 @@ def test_forget(ray_start_regular):
assert len(result_list) == count
def test_signal_on_node_failure(two_node_cluster):
    """Test that ActorDiedSignal is received after the actor's node is removed.

    An actor is placed on the remote node of the two-node cluster, the node
    is then removed from the cluster, and the test expects exactly one
    signal of type ``signal.ActorDiedSignal`` for that actor.

    Args:
        two_node_cluster: Fixture value, a ``(cluster, remote_node)`` tuple.
    """

    class ActorSignal(object):
        """Minimal actor that reports which plasma store its worker uses."""

        def __init__(self):
            pass

        def local_plasma(self):
            """Return the socket name of this worker's plasma store."""
            return ray.worker.global_worker.plasma_client.store_socket_name

    cluster, remote_node = two_node_cluster
    # NOTE(review): max_reconstructions=0 presumably prevents the actor from
    # being restarted after the failure -- confirm against the ray.remote docs.
    actor_cls = ray.remote(max_reconstructions=0)(ActorSignal)
    actor = actor_cls.remote()
    # Keep creating actors until one is placed on the remote node, which is
    # identified by comparing plasma store socket names.
    while (ray.get(actor.local_plasma.remote()) !=
           remote_node.plasma_store_socket_name):
        actor = actor_cls.remote()

    # Remove the node that hosts the actor.
    cluster.remove_node(remote_node)

    # Wait on signal from the actor on the failed node.
    result_list = signal.receive([actor], timeout=10)
    assert len(result_list) == 1
    # Each result is indexed as a pair; element 1 is the signal object.
    assert isinstance(result_list[0][1], signal.ActorDiedSignal)
def test_send_signal_from_two_tasks_to_driver(ray_start_regular):
# Define a remote function that sends a user-defined signal.
@ray.remote