mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 21:23:10 +08:00
[Core] Actor Retries Out of Order Tasks on Restart (#12338)
This commit is contained in:
@@ -2,6 +2,7 @@ import sys
|
||||
import functools
|
||||
import time
|
||||
import asyncio
|
||||
import os
|
||||
from typing import Dict
|
||||
|
||||
import pytest
|
||||
@@ -38,34 +39,41 @@ def test_host_standalone(serve_instance):
|
||||
assert "key_2" in result
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
"Skip until https://github.com/ray-project/ray/issues/11683 fixed "
|
||||
"since async actor retries is broken.")
|
||||
def test_long_pull_restarts(serve_instance):
|
||||
def test_long_poll_restarts(serve_instance):
|
||||
@ray.remote(
|
||||
max_restarts=-1,
|
||||
# max_task_retries=-1,
|
||||
max_task_retries=-1,
|
||||
)
|
||||
class RestartableLongPollerHost:
|
||||
def __init__(self) -> None:
|
||||
print("actor started")
|
||||
self.host = LongPollerHost()
|
||||
self.host.notify_changed("timer", time.time())
|
||||
self.should_exit = False
|
||||
|
||||
async def listen_for_change(self, key_to_ids):
|
||||
await asyncio.sleep(0.5)
|
||||
print("listening for change ", key_to_ids)
|
||||
return await self.host.listen_for_change(key_to_ids)
|
||||
|
||||
async def exit(self):
|
||||
sys.exit(1)
|
||||
async def set_exit(self):
|
||||
self.should_exit = True
|
||||
|
||||
async def exit_if_set(self):
|
||||
if self.should_exit:
|
||||
print("actor exit")
|
||||
os._exit(1)
|
||||
|
||||
host = RestartableLongPollerHost.remote()
|
||||
updated_values = ray.get(host.listen_for_change.remote({"timer": -1}))
|
||||
timer: UpdatedObject = updated_values["timer"]
|
||||
|
||||
on_going_ref = host.listen_for_change.remote({"timer": timer.snapshot_id})
|
||||
host.exit.remote()
|
||||
on_going_ref = host.listen_for_change.remote({"timer": timer.snapshot_id})
|
||||
ray.get(host.set_exit.remote())
|
||||
# This task should trigger the actor to exit.
|
||||
# But the retried task will not, because self.should_exit is reset to False when the actor restarts.
|
||||
host.exit_if_set.remote()
|
||||
|
||||
# on_going_ref should return successfully with a different value.
|
||||
new_timer: UpdatedObject = ray.get(on_going_ref)["timer"]
|
||||
assert new_timer.snapshot_id != timer.snapshot_id + 1
|
||||
assert new_timer.object_snapshot != timer.object_snapshot
|
||||
|
||||
@@ -1289,6 +1289,76 @@ def test_gcs_server_failiure_report(ray_start_regular, log_pubsub):
|
||||
assert data["pid"] == "gcs_server"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "ray_start_regular",
    [{"_system_config": {"task_retry_delay_ms": 500}}],
    indirect=True)
def test_async_actor_task_retries(ray_start_regular):
    """Exercise task retries on an async actor that is killed and restarted.

    See https://github.com/ray-project/ray/issues/11683.

    Two scenarios are covered: a plain in-order retry, and a restart that
    happens while an earlier task is still pending, so that the retried
    tasks are no longer contiguous in submission order.
    """
    signal = SignalActor.remote()

    @ray.remote
    class DyingActor:
        """Async actor that kills its own process once told to exit."""

        def __init__(self):
            print("DyingActor init called")
            # A fresh (or restarted) actor always starts with the flag cleared.
            self.should_exit = False

        def set_should_exit(self):
            print("DyingActor.set_should_exit called")
            self.should_exit = True

        async def get(self, x, wait=False):
            print(f"DyingActor.get called with x={x}, wait={wait}")
            if self.should_exit:
                # Hard-kill the worker process so the actor has to restart.
                os._exit(0)
            if wait:
                # Stay pending until the test sends the signal.
                await signal.wait.remote()
            return x

    # Both actors restart and retry their tasks without limit.
    retry_forever = {"max_restarts": -1, "max_task_retries": -1}

    # Scenario 1: ordinary in-order retries.
    in_order_actor = DyingActor.options(**retry_forever).remote()

    assert ray.get(in_order_actor.get.remote(1)) == 1
    ray.get(in_order_actor.set_should_exit.remote())
    # This call kills the actor; its retry runs on the restarted actor.
    assert ray.get(in_order_actor.get.remote(42)) == 42

    # Scenario 2: out-of-order retries.
    #   seqno 0 returns immediately
    #   seqno 1 stays pending and is retried later
    #   seqno 2 returns immediately
    #   seqno 3 crashes the actor and is retried later
    out_of_order_actor = DyingActor.options(**retry_forever).remote()

    # seqno 0
    finished_first = out_of_order_actor.get.remote(0)
    assert ray.get(finished_first) == 0
    # seqno 1
    blocked_on_signal = out_of_order_actor.get.remote(1, wait=True)
    # seqno 2
    exit_flag_set = out_of_order_actor.set_should_exit.remote()
    assert ray.get(exit_flag_set) is None
    # seqno 3: crashes the actor, because the previous task set should_exit
    # to True.
    crashes_actor = out_of_order_actor.get.remote(3)

    # By now the actor should have been restarted. Only the two unfinished
    # tasks (seqno 1 and 3) should be retried, not the completed ones
    # (seqno 0 and 2). Critically, if seqno 2 were retried, seqno 3 could
    # never return.
    ray.get(signal.send.remote())
    assert ray.get(blocked_on_signal) == 1
    assert ray.get(crashes_actor) == 3
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this test module directly; exit with pytest's status.
    import pytest

    exit_status = pytest.main(["-v", __file__])
    sys.exit(exit_status)
|
||||
|
||||
Reference in New Issue
Block a user