[Core] Actor Retries Out of Order Tasks on Restart (#12338)

This commit is contained in:
Simon Mo
2020-12-01 09:35:54 -08:00
committed by GitHub
parent f6f3cc9af1
commit f596113fc7
8 changed files with 257 additions and 44 deletions
+18 -10
View File
@@ -2,6 +2,7 @@ import sys
import functools
import time
import asyncio
import os
from typing import Dict
import pytest
@@ -38,34 +39,41 @@ def test_host_standalone(serve_instance):
assert "key_2" in result
def test_long_poll_restarts(serve_instance):
    """Check that a pending long-poll call is retried after its actor dies.

    The host actor is created with ``max_restarts=-1`` and
    ``max_task_retries=-1``. One ``listen_for_change`` call is left
    outstanding, the actor is then made to kill itself with ``os._exit(1)``,
    and the outstanding call is expected to complete against the restarted
    actor with a timer value different from the one seen before the crash.
    """

    @ray.remote(
        max_restarts=-1,
        max_task_retries=-1,
    )
    class RestartableLongPollerHost:
        def __init__(self) -> None:
            print("actor started")
            self.host = LongPollerHost()
            # Runs again on every (re)start, so a restarted actor publishes a
            # fresh time.time() value under the "timer" key.
            self.host.notify_changed("timer", time.time())
            # Reset on every (re)start: a restarted actor does not exit again
            # unless set_exit() is called anew.
            self.should_exit = False

        async def listen_for_change(self, key_to_ids):
            print("listening for change ", key_to_ids)
            return await self.host.listen_for_change(key_to_ids)

        async def set_exit(self):
            self.should_exit = True

        async def exit_if_set(self):
            if self.should_exit:
                print("actor exit")
                # Terminate the worker process immediately with status 1.
                os._exit(1)

    host = RestartableLongPollerHost.remote()
    updated_values = ray.get(host.listen_for_change.remote({"timer": -1}))
    timer: UpdatedObject = updated_values["timer"]

    # Submitted with the snapshot id just received and deliberately not
    # awaited yet: this is the call that has to survive the restart.
    on_going_ref = host.listen_for_change.remote({"timer": timer.snapshot_id})
    ray.get(host.set_exit.remote())
    # This task should trigger the actor to exit.
    # But the retried task will not because self.should_exit is false.
    host.exit_if_set.remote()

    # on_going_ref should return successfully with a different value.
    new_timer: UpdatedObject = ray.get(on_going_ref)["timer"]
    # NOTE(review): this only rules out the immediate successor id; confirm
    # that is the intended check rather than a plain inequality.
    assert new_timer.snapshot_id != timer.snapshot_id + 1
    assert new_timer.object_snapshot != timer.object_snapshot
+70
View File
@@ -1289,6 +1289,76 @@ def test_gcs_server_failiure_report(ray_start_regular, log_pubsub):
assert data["pid"] == "gcs_server"
@pytest.mark.parametrize(
    "ray_start_regular",
    [{"_system_config": {"task_retry_delay_ms": 500}}],
    indirect=True,
)
def test_async_actor_task_retries(ray_start_regular):
    """Regression test for https://github.com/ray-project/ray/issues/11683.

    An actor with an async method is created with ``max_restarts=-1`` and
    ``max_task_retries=-1`` and made to kill itself with ``os._exit(0)``.
    Two scenarios are checked: a crash with a single task in flight, and a
    crash while an earlier task is still pending and a later one has already
    finished.
    """
    release_signal = SignalActor.remote()

    @ray.remote
    class DyingActor:
        def __init__(self):
            print("DyingActor init called")
            self.should_exit = False

        def set_should_exit(self):
            print("DyingActor.set_should_exit called")
            self.should_exit = True

        async def get(self, x, wait=False):
            print(f"DyingActor.get called with x={x}, wait={wait}")
            if self.should_exit:
                os._exit(0)
            if wait:
                # Park this call until the test sends the signal.
                await release_signal.wait.remote()
            return x

    retry_options = dict(max_restarts=-1, max_task_retries=-1)

    # Scenario 1: tasks are retried in submission order.
    actor = DyingActor.options(**retry_options).remote()
    assert ray.get(actor.get.remote(1)) == 1
    ray.get(actor.set_should_exit.remote())
    assert ray.get(actor.get.remote(42)) == 42

    # Scenario 2: retries happen out of order.
    #   seqno 0 returns
    #   seqno 1 stays pending and is retried later
    #   seqno 2 returns
    #   seqno 3 crashes the actor and is retried later
    actor = DyingActor.options(**retry_options).remote()

    # seqno 0
    finished_get = actor.get.remote(0)
    assert ray.get(finished_get) == 0

    # seqno 1
    pending_get = actor.get.remote(1, wait=True)

    # seqno 2
    exit_flag_set = actor.set_should_exit.remote()
    assert ray.get(exit_flag_set) is None

    # seqno 3: crashes the actor, because the previous task set should_exit
    # to true.
    crashing_get = actor.get.remote(3)

    # By now the actor should have been restarted. Only the two unfinished
    # tasks (seqno 1 and 3) should be retried, not the completed ones (seqno
    # 0 and 2). Critically, if seqno 2 were retried, seqno 3 could never
    # return.
    ray.get(release_signal.send.remote())
    assert ray.get(pending_get) == 1
    assert ray.get(crashing_get) == 3
if __name__ == "__main__":
    # Running this file directly hands it to pytest in verbose mode and
    # exits with pytest's status code.
    import pytest

    exit_status = pytest.main(["-v", __file__])
    sys.exit(exit_status)