mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 08:40:02 +08:00
Reconstruct failed actors without sending tasks. (#5161)
* fast reconstruct dead actors * add test * fix typos * remove debug print * small fix * fix typos * Update test_actor.py
This commit is contained in:
@@ -11,6 +11,7 @@ import pytest
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
from pyarrow import plasma
|
||||
|
||||
import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
@@ -19,6 +20,7 @@ import ray.tests.cluster_utils
|
||||
from ray.tests.conftest import generate_internal_config_map
|
||||
from ray.tests.utils import (
|
||||
relevant_errors,
|
||||
wait_for_condition,
|
||||
wait_for_errors,
|
||||
)
|
||||
|
||||
@@ -2162,6 +2164,39 @@ def test_actor_reconstruction(ray_start_regular):
|
||||
ray.get(actor.increase.remote())
|
||||
|
||||
|
||||
def test_actor_reconstruction_without_task(ray_start_regular):
    """Test a dead actor can be reconstructed without sending task to it.

    The actor's constructor records each (re)construction by putting an
    object under the first not-yet-used ID from ``obj_ids``. After the actor
    process is killed, the test never calls the actor again and only waits
    for the second ID to show up in the plasma store.
    """

    def object_exists(obj_id):
        """Check whether an object exists in plasma store."""
        plasma_client = ray.worker.global_worker.plasma_client
        plasma_id = plasma.ObjectID(obj_id.binary())
        # ``ObjectNotAvailable`` is a sentinel, so compare by identity:
        # ``!=`` would dispatch to the stored object's own ``__ne__``.
        return plasma_client.get(
            plasma_id, timeout_ms=0) is not plasma.ObjectNotAvailable

    @ray.remote(max_reconstructions=1)
    class ReconstructableActor(object):
        def __init__(self, obj_ids):
            for obj_id in obj_ids:
                # Every time the actor gets constructed,
                # put a new object in plasma store.
                if not object_exists(obj_id):
                    ray.worker.global_worker.put_object(obj_id, 1)
                    break

        def get_pid(self):
            return os.getpid()

    obj_ids = [ray.ObjectID.from_random() for _ in range(2)]
    actor = ReconstructableActor.remote(obj_ids)
    # Kill the actor.
    pid = ray.get(actor.get_pid.remote())
    os.kill(pid, signal.SIGKILL)
    # Wait until the actor is reconstructed. No further task is submitted
    # to the actor; only the plasma store is polled.
    assert wait_for_condition(
        lambda: object_exists(obj_ids[1]), timeout_ms=5000)
|
||||
|
||||
|
||||
def test_actor_reconstruction_on_node_failure(ray_start_cluster_head):
|
||||
"""Test actor reconstruction when node dies unexpectedly."""
|
||||
cluster = ray_start_cluster_head
|
||||
|
||||
@@ -94,3 +94,25 @@ def wait_for_errors(error_type, num_errors, timeout=10):
|
||||
return
|
||||
time.sleep(0.1)
|
||||
raise Exception("Timing out of wait.")
|
||||
|
||||
|
||||
def wait_for_condition(condition_predictor,
                       timeout_ms=1000,
                       retry_interval_ms=100):
    """A helper function that waits until a condition is met.

    The predicate is always evaluated at least once, and once more when the
    deadline is reached. The timeout is measured against a real clock, so
    time spent inside ``condition_predictor`` counts towards it.

    Args:
        condition_predictor: A function that predicts the condition.
        timeout_ms: Maximum timeout in milliseconds.
        retry_interval_ms: Retry interval in milliseconds.

    Return:
        Whether the condition is met within the timeout.
    """
    # Prefer a monotonic clock; fall back to time.time() on interpreters
    # that do not provide time.monotonic.
    clock = getattr(time, "monotonic", time.time)
    deadline = clock() + timeout_ms / 1000.0
    retry_interval_s = retry_interval_ms / 1000.0
    while True:
        if condition_predictor():
            return True
        remaining_s = deadline - clock()
        if remaining_s <= 0:
            return False
        # Never sleep past the deadline, so the final check happens on time.
        time.sleep(min(retry_interval_s, remaining_s))
|
||||
|
||||
Reference in New Issue
Block a user