mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 01:27:43 +08:00
Option to retry failed actor tasks (#8330)
* Python * Consolidate state in the direct actor transport, set the caller starts at * todo * Remove unused * Update and unit tests * Doc * Remove unused * doc * Remove debug * Update src/ray/core_worker/transport/direct_actor_transport.h Co-authored-by: Eric Liang <ekhliang@gmail.com> * Update src/ray/core_worker/transport/direct_actor_transport.cc Co-authored-by: Eric Liang <ekhliang@gmail.com> * lint and fix build * Update * Fix build * Fix tests * Unit test for max_task_retries=0 * Fix java? * Fix bad test * Cross language fix * fix java Co-authored-by: Eric Liang <ekhliang@gmail.com>
This commit is contained in:
@@ -903,6 +903,7 @@ cdef class CoreWorker:
|
||||
FunctionDescriptor function_descriptor,
|
||||
args,
|
||||
int64_t max_restarts,
|
||||
int64_t max_task_retries,
|
||||
resources,
|
||||
placement_resources,
|
||||
int32_t max_concurrency,
|
||||
@@ -929,7 +930,7 @@ cdef class CoreWorker:
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().CreateActor(
|
||||
ray_function, args_vector,
|
||||
CActorCreationOptions(
|
||||
max_restarts, max_concurrency,
|
||||
max_restarts, max_task_retries, max_concurrency,
|
||||
c_resources, c_placement_resources,
|
||||
dynamic_worker_options, is_detached, name, is_asyncio),
|
||||
extension_data,
|
||||
|
||||
+21
-9
@@ -244,7 +244,8 @@ class ActorClassMetadata:
|
||||
|
||||
def __init__(self, language, modified_class,
|
||||
actor_creation_function_descriptor, class_id, max_restarts,
|
||||
num_cpus, num_gpus, memory, object_store_memory, resources):
|
||||
max_task_retries, num_cpus, num_gpus, memory,
|
||||
object_store_memory, resources):
|
||||
self.language = language
|
||||
self.modified_class = modified_class
|
||||
self.actor_creation_function_descriptor = \
|
||||
@@ -253,6 +254,7 @@ class ActorClassMetadata:
|
||||
self.is_cross_language = language != Language.PYTHON
|
||||
self.class_id = class_id
|
||||
self.max_restarts = max_restarts
|
||||
self.max_task_retries = max_task_retries
|
||||
self.num_cpus = num_cpus
|
||||
self.num_gpus = num_gpus
|
||||
self.memory = memory
|
||||
@@ -314,7 +316,7 @@ class ActorClass:
|
||||
|
||||
@classmethod
|
||||
def _ray_from_modified_class(cls, modified_class, class_id, max_restarts,
|
||||
num_cpus, num_gpus, memory,
|
||||
max_task_retries, num_cpus, num_gpus, memory,
|
||||
object_store_memory, resources):
|
||||
for attribute in [
|
||||
"remote", "_remote", "_ray_from_modified_class",
|
||||
@@ -344,20 +346,22 @@ class ActorClass:
|
||||
self.__ray_metadata__ = ActorClassMetadata(
|
||||
Language.PYTHON, modified_class,
|
||||
actor_creation_function_descriptor, class_id, max_restarts,
|
||||
num_cpus, num_gpus, memory, object_store_memory, resources)
|
||||
max_task_retries, num_cpus, num_gpus, memory, object_store_memory,
|
||||
resources)
|
||||
|
||||
return self
|
||||
|
||||
@classmethod
|
||||
def _ray_from_function_descriptor(
|
||||
cls, language, actor_creation_function_descriptor, max_restarts,
|
||||
num_cpus, num_gpus, memory, object_store_memory, resources):
|
||||
max_task_retries, num_cpus, num_gpus, memory, object_store_memory,
|
||||
resources):
|
||||
self = ActorClass.__new__(ActorClass)
|
||||
|
||||
self.__ray_metadata__ = ActorClassMetadata(
|
||||
language, None, actor_creation_function_descriptor, None,
|
||||
max_restarts, num_cpus, num_gpus, memory, object_store_memory,
|
||||
resources)
|
||||
max_restarts, max_task_retries, num_cpus, num_gpus, memory,
|
||||
object_store_memory, resources)
|
||||
|
||||
return self
|
||||
|
||||
@@ -406,6 +410,7 @@ class ActorClass:
|
||||
is_direct_call=None,
|
||||
max_concurrency=None,
|
||||
max_restarts=None,
|
||||
max_task_retries=None,
|
||||
name=None,
|
||||
detached=False):
|
||||
"""Create an actor.
|
||||
@@ -557,6 +562,7 @@ class ActorClass:
|
||||
meta.actor_creation_function_descriptor,
|
||||
creation_args,
|
||||
max_restarts or meta.max_restarts,
|
||||
max_task_retries or meta.max_task_retries,
|
||||
resources,
|
||||
actor_placement_resources,
|
||||
max_concurrency,
|
||||
@@ -891,11 +897,13 @@ def modify_class(cls):
|
||||
|
||||
|
||||
def make_actor(cls, num_cpus, num_gpus, memory, object_store_memory, resources,
|
||||
max_restarts):
|
||||
max_restarts, max_task_retries):
|
||||
Class = modify_class(cls)
|
||||
|
||||
if max_restarts is None:
|
||||
max_restarts = 0
|
||||
if max_task_retries is None:
|
||||
max_task_retries = 0
|
||||
|
||||
infinite_restart = max_restarts == -1
|
||||
if not infinite_restart:
|
||||
@@ -907,9 +915,13 @@ def make_actor(cls, num_cpus, num_gpus, memory, object_store_memory, resources,
|
||||
# an overflow.
|
||||
max_restarts = min(max_restarts, ray_constants.MAX_INT64_VALUE)
|
||||
|
||||
if max_restarts == 0 and max_task_retries != 0:
|
||||
raise ValueError(
|
||||
"max_task_retries cannot be set if max_restarts is 0.")
|
||||
|
||||
return ActorClass._ray_from_modified_class(
|
||||
Class, ActorClassID.from_random(), max_restarts, num_cpus, num_gpus,
|
||||
memory, object_store_memory, resources)
|
||||
Class, ActorClassID.from_random(), max_restarts, max_task_retries,
|
||||
num_cpus, num_gpus, memory, object_store_memory, resources)
|
||||
|
||||
|
||||
def exit_actor():
|
||||
|
||||
@@ -77,6 +77,7 @@ def java_actor_class(class_name):
|
||||
Language.JAVA,
|
||||
JavaFunctionDescriptor(class_name, "<init>", ""),
|
||||
0, # max_restarts,
|
||||
0, # max_task_retries,
|
||||
None, # num_cpus,
|
||||
None, # num_gpus,
|
||||
None, # memory,
|
||||
|
||||
@@ -231,6 +231,7 @@ cdef extern from "ray/core_worker/common.h" nogil:
|
||||
CActorCreationOptions()
|
||||
CActorCreationOptions(
|
||||
int64_t max_restarts,
|
||||
int64_t max_task_retries,
|
||||
int32_t max_concurrency,
|
||||
const unordered_map[c_string, double] &resources,
|
||||
const unordered_map[c_string, double] &placement_resources,
|
||||
|
||||
@@ -51,7 +51,7 @@ py_test(
|
||||
size = "medium",
|
||||
srcs = ["test_actor_failures.py"],
|
||||
# TODO(ekl) enable this once we support actor reconstruction again
|
||||
tags = ["exclusive", "manual"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
|
||||
@@ -3,10 +3,6 @@ import json
|
||||
import numpy as np
|
||||
import os
|
||||
import pytest
|
||||
try:
|
||||
import pytest_timeout
|
||||
except ImportError:
|
||||
pytest_timeout = None
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
@@ -88,9 +84,13 @@ def ray_checkpointable_actor_cls(request):
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_object_store_memory", [150 * 1024 * 1024], indirect=True)
|
||||
def test_actor_eviction(ray_start_object_store_memory):
|
||||
object_store_memory = ray_start_object_store_memory
|
||||
"ray_start_regular", [{
|
||||
"object_store_memory": 150 * 1024 * 1024,
|
||||
"lru_evict": True,
|
||||
}],
|
||||
indirect=True)
|
||||
def test_actor_eviction(ray_start_regular):
|
||||
object_store_memory = 150 * 1024 * 1024
|
||||
|
||||
@ray.remote
|
||||
class Actor:
|
||||
@@ -127,10 +127,14 @@ def test_actor_eviction(ray_start_object_store_memory):
|
||||
assert num_success > 0
|
||||
|
||||
|
||||
def test_actor_restart(ray_start_regular):
|
||||
"""Test actor reconstruction when actor process is killed."""
|
||||
def test_actor_restart():
|
||||
"""Test actor restart when actor process is killed."""
|
||||
ray.init(
|
||||
_internal_config=json.dumps({
|
||||
"task_retry_delay_ms": 100,
|
||||
}), )
|
||||
|
||||
@ray.remote(max_restarts=1)
|
||||
@ray.remote(max_restarts=1, max_task_retries=-1)
|
||||
class RestartableActor:
|
||||
"""An actor that will be restarted at most once."""
|
||||
|
||||
@@ -147,20 +151,48 @@ def test_actor_restart(ray_start_regular):
|
||||
|
||||
actor = RestartableActor.remote()
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
# Call increase 3 times
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
# Call increase again with some delay.
|
||||
result = actor.increase.remote(delay=0.5)
|
||||
# Sleep some time to wait for the above task to start execution.
|
||||
time.sleep(0.2)
|
||||
results = [actor.increase.remote() for _ in range(100)]
|
||||
# Kill actor process, while the above task is still being executed.
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
# Check that the above task didn't fail and the actor is restarted.
|
||||
assert ray.get(result) == 4
|
||||
# Make sure that all tasks were executed in order before the actor's death.
|
||||
res = results.pop(0)
|
||||
i = 1
|
||||
while True:
|
||||
try:
|
||||
r = ray.get(res)
|
||||
if r != i:
|
||||
# Actor restarted without any failed tasks.
|
||||
break
|
||||
res = results.pop(0)
|
||||
i += 1
|
||||
except ray.exceptions.RayActorError:
|
||||
# Actor restarted.
|
||||
break
|
||||
# Find the first task to execute after the actor was restarted.
|
||||
while True:
|
||||
try:
|
||||
r = ray.get(res)
|
||||
break
|
||||
except ray.exceptions.RayActorError:
|
||||
res = results.pop(0)
|
||||
pass
|
||||
# Make sure that all tasks were executed in order after the actor's death.
|
||||
i = 1
|
||||
while True:
|
||||
r = ray.get(res)
|
||||
assert r == i
|
||||
if results:
|
||||
res = results.pop(0)
|
||||
i += 1
|
||||
else:
|
||||
break
|
||||
|
||||
# Check that we can still call the actor.
|
||||
assert ray.get(actor.increase.remote()) == 5
|
||||
result = actor.increase.remote()
|
||||
assert ray.get(result) == r + 1
|
||||
|
||||
# kill actor process one more time.
|
||||
results = [actor.increase.remote() for _ in range(100)]
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
# The actor has exceeded max restarts, and this task should fail.
|
||||
@@ -174,37 +206,154 @@ def test_actor_restart(ray_start_regular):
|
||||
# Check that the actor won't be restarted.
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(actor.increase.remote())
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
def test_actor_restart_with_retry():
|
||||
"""Test actor restart when actor process is killed."""
|
||||
ray.init(
|
||||
_internal_config=json.dumps({
|
||||
"task_retry_delay_ms": 100,
|
||||
}), )
|
||||
|
||||
@ray.remote(max_restarts=1, max_task_retries=-1)
|
||||
class RestartableActor:
|
||||
"""An actor that will be restarted at most once."""
|
||||
|
||||
def __init__(self):
|
||||
self.value = 0
|
||||
|
||||
def increase(self, delay=0):
|
||||
time.sleep(delay)
|
||||
self.value += 1
|
||||
return self.value
|
||||
|
||||
def get_pid(self):
|
||||
return os.getpid()
|
||||
|
||||
actor = RestartableActor.remote()
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
results = [actor.increase.remote() for _ in range(100)]
|
||||
# Kill actor process, while the above task is still being executed.
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
# Check that none of the tasks failed and the actor is restarted.
|
||||
seq = list(range(1, 101))
|
||||
results = ray.get(results)
|
||||
failed_task_index = None
|
||||
# Make sure that all tasks were executed in order before and after the
|
||||
# actor's death.
|
||||
for i, res in enumerate(results):
|
||||
if res != seq[0]:
|
||||
if failed_task_index is None:
|
||||
failed_task_index = i
|
||||
assert res + failed_task_index == seq[0]
|
||||
seq.pop(0)
|
||||
# Check that we can still call the actor.
|
||||
result = actor.increase.remote()
|
||||
assert ray.get(result) == results[-1] + 1
|
||||
|
||||
# kill actor process one more time.
|
||||
results = [actor.increase.remote() for _ in range(100)]
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
# The actor has exceeded max restarts, and this task should fail.
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(actor.increase.remote())
|
||||
|
||||
# Create another actor.
|
||||
actor = RestartableActor.remote()
|
||||
# Intentionally exit the actor
|
||||
actor.__ray_terminate__.remote()
|
||||
# Check that the actor won't be restarted.
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(actor.increase.remote())
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
def test_actor_restart_on_node_failure(ray_start_cluster):
|
||||
config = json.dumps({
|
||||
"num_heartbeats_timeout": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 100,
|
||||
"initial_reconstruction_timeout_milliseconds": 1000,
|
||||
"task_retry_delay_ms": 100,
|
||||
})
|
||||
cluster = ray_start_cluster
|
||||
# Head node with no resources.
|
||||
cluster.add_node(num_cpus=0, _internal_config=config)
|
||||
# Node to place the actor.
|
||||
cluster.add_node(num_cpus=1, _internal_config=config)
|
||||
cluster.wait_for_nodes()
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@ray.remote(num_cpus=1, max_restarts=1, max_task_retries=-1)
|
||||
class RestartableActor:
|
||||
"""An actor that will be reconstructed at most once."""
|
||||
|
||||
def __init__(self):
|
||||
self.value = 0
|
||||
|
||||
def increase(self):
|
||||
self.value += 1
|
||||
return self.value
|
||||
|
||||
def ready(self):
|
||||
return
|
||||
|
||||
actor = RestartableActor.remote()
|
||||
ray.get(actor.ready.remote())
|
||||
results = [actor.increase.remote() for _ in range(100)]
|
||||
# Kill actor node, while the above task is still being executed.
|
||||
cluster.remove_node(cluster.list_all_nodes()[-1])
|
||||
cluster.add_node(num_cpus=1, _internal_config=config)
|
||||
cluster.wait_for_nodes()
|
||||
# Check that none of the tasks failed and the actor is restarted.
|
||||
seq = list(range(1, 101))
|
||||
results = ray.get(results)
|
||||
failed_task_index = None
|
||||
# Make sure that all tasks were executed in order before and after the
|
||||
# actor's death.
|
||||
for i, res in enumerate(results):
|
||||
elm = seq.pop(0)
|
||||
if res != elm:
|
||||
if failed_task_index is None:
|
||||
failed_task_index = i
|
||||
assert res + failed_task_index == elm
|
||||
# Check that we can still call the actor.
|
||||
result = ray.get(actor.increase.remote())
|
||||
assert result == 1 or result == results[-1] + 1
|
||||
|
||||
|
||||
def test_actor_restart_without_task(ray_start_regular):
|
||||
"""Test a dead actor can be restarted without sending task to it."""
|
||||
|
||||
@ray.remote(max_restarts=1)
|
||||
@ray.remote(max_restarts=1, resources={"actor": 1})
|
||||
class RestartableActor:
|
||||
def __init__(self, obj_ids):
|
||||
for obj_id in obj_ids:
|
||||
# Every time the actor gets constructed,
|
||||
# put a new object in plasma store.
|
||||
global_worker = ray.worker.global_worker
|
||||
if not global_worker.core_worker.object_exists(obj_id):
|
||||
global_worker.put_object(1, obj_id)
|
||||
break
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def get_pid(self):
|
||||
return os.getpid()
|
||||
|
||||
obj_ids = [ray.ObjectID.from_random() for _ in range(2)]
|
||||
actor = RestartableActor.remote(obj_ids)
|
||||
@ray.remote(resources={"actor": 1})
|
||||
def probe():
|
||||
return
|
||||
|
||||
# Returns whether the "actor" resource is available.
|
||||
def actor_resource_available():
|
||||
p = probe.remote()
|
||||
ready, _ = ray.wait([p], timeout=1)
|
||||
return len(ready) > 0
|
||||
|
||||
ray.experimental.set_resource("actor", 1)
|
||||
actor = RestartableActor.remote()
|
||||
assert wait_for_condition(lambda: not actor_resource_available())
|
||||
# Kill the actor.
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
|
||||
p = probe.remote()
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
|
||||
# Wait until the actor is reconstructed.
|
||||
def check_restarted():
|
||||
worker = ray.worker.global_worker
|
||||
return worker.core_worker.object_exists(obj_ids[1])
|
||||
|
||||
assert wait_for_condition(check_restarted)
|
||||
ray.get(p)
|
||||
assert wait_for_condition(lambda: not actor_resource_available())
|
||||
|
||||
|
||||
def test_caller_actor_restart(ray_start_regular):
|
||||
@@ -277,66 +426,6 @@ def test_caller_task_reconstruction(ray_start_regular):
|
||||
assert ray.get(RetryableTask.remote(remote_actor)) == 3
|
||||
|
||||
|
||||
def test_actor_restart_on_node_failure(ray_start_cluster_head):
|
||||
"""Test actor reconstruction when node dies unexpectedly."""
|
||||
cluster = ray_start_cluster_head
|
||||
max_restarts = 3
|
||||
# Add a few nodes to the cluster.
|
||||
# Use custom resource to make sure the actor is only created on worker
|
||||
# nodes, not on the head node.
|
||||
for _ in range(max_restarts + 2):
|
||||
cluster.add_node(
|
||||
resources={"a": 1},
|
||||
_internal_config=json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"num_heartbeats_timeout": 10,
|
||||
}),
|
||||
)
|
||||
|
||||
def kill_node(node_id):
|
||||
node_to_remove = None
|
||||
for node in cluster.worker_nodes:
|
||||
if node_id == node.unique_id:
|
||||
node_to_remove = node
|
||||
cluster.remove_node(node_to_remove)
|
||||
|
||||
@ray.remote(max_restarts=max_restarts, resources={"a": 1})
|
||||
class MyActor:
|
||||
def __init__(self):
|
||||
self.value = 0
|
||||
|
||||
def increase(self):
|
||||
self.value += 1
|
||||
return self.value
|
||||
|
||||
def get_object_store_socket(self):
|
||||
return ray.worker.global_worker.node.unique_id
|
||||
|
||||
actor = MyActor.remote()
|
||||
# Call increase 3 times.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
|
||||
for i in range(max_restarts):
|
||||
object_store_socket = ray.get(actor.get_object_store_socket.remote())
|
||||
# Kill actor's node and the actor should be restarted
|
||||
# on a different node.
|
||||
kill_node(object_store_socket)
|
||||
# Call increase again.
|
||||
# Check that the actor is restarted and value is correct.
|
||||
assert ray.get(actor.increase.remote()) == 4 + i
|
||||
# Check that the actor is now on a different node.
|
||||
assert object_store_socket != ray.get(
|
||||
actor.get_object_store_socket.remote())
|
||||
|
||||
# kill the node again.
|
||||
object_store_socket = ray.get(actor.get_object_store_socket.remote())
|
||||
kill_node(object_store_socket)
|
||||
# The actor has exceeded max restarts, and this task should fail.
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(actor.increase.remote())
|
||||
|
||||
|
||||
# NOTE(hchen): we set initial_reconstruction_timeout_milliseconds to 1s for
|
||||
# this test. Because if this value is too small, spurious task reconstruction
|
||||
# may happen and cause the test failure. If the value is too large, this test
|
||||
@@ -365,7 +454,7 @@ def test_multiple_actor_restart(ray_start_cluster_head):
|
||||
})) for _ in range(num_nodes)
|
||||
]
|
||||
|
||||
@ray.remote(max_restarts=-1)
|
||||
@ray.remote(max_restarts=-1, max_task_retries=-1)
|
||||
class SlowCounter:
|
||||
def __init__(self):
|
||||
self.x = 0
|
||||
@@ -407,8 +496,12 @@ def test_multiple_actor_restart(ray_start_cluster_head):
|
||||
|
||||
# Get the results and check that they have the correct values.
|
||||
for _, result_id_list in result_ids.items():
|
||||
results = list(range(1, len(result_id_list) + 1))
|
||||
assert ray.get(result_id_list) == results
|
||||
results = ray.get(result_id_list)
|
||||
for i, result in enumerate(results):
|
||||
if i == 0:
|
||||
assert result == 1
|
||||
else:
|
||||
assert result == results[i - 1] + 1 or result == 1
|
||||
|
||||
|
||||
def kill_actor(actor):
|
||||
@@ -418,6 +511,7 @@ def kill_actor(actor):
|
||||
wait_for_pid_to_exit(pid)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
|
||||
"""Test actor checkpointing and restoring from a checkpoint."""
|
||||
actor = ray.remote(max_restarts=2)(ray_checkpointable_actor_cls).remote()
|
||||
@@ -440,13 +534,14 @@ def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again and check that reconstruction still works after the
|
||||
# Kill actor again and check that restart still works after the
|
||||
# actor resuming from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
|
||||
"""Test checkpointing of a remote actor through method invocation."""
|
||||
|
||||
@@ -487,13 +582,14 @@ def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again and check that reconstruction still works after the
|
||||
# Kill actor again and check that restart still works after the
|
||||
# actor resuming from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor checkpointing on a remote node."""
|
||||
@@ -520,6 +616,7 @@ def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing_save_exception(ray_start_regular,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor can still be recovered if checkpoints fail to complete."""
|
||||
@@ -549,7 +646,7 @@ def test_checkpointing_save_exception(ray_start_regular,
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again, and check that reconstruction still works and the actor
|
||||
# Kill actor again, and check that restart still works and the actor
|
||||
# wasn't resumed from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
@@ -559,6 +656,7 @@ def test_checkpointing_save_exception(ray_start_regular,
|
||||
wait_for_errors(ray_constants.CHECKPOINT_PUSH_ERROR, 1)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="TODO: Actor checkpointing")
|
||||
def test_checkpointing_load_exception(ray_start_regular,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor can still be recovered if checkpoints fail to load."""
|
||||
@@ -589,7 +687,7 @@ def test_checkpointing_load_exception(ray_start_regular,
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again, and check that reconstruction still works and the actor
|
||||
# Kill actor again, and check that restart still works and the actor
|
||||
# wasn't resumed from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
@@ -718,10 +816,6 @@ def test_decorated_method(ray_start_regular):
|
||||
assert ray.get(object_id) == 7 # 2 * 3 + 1
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
pytest_timeout is None,
|
||||
reason="Timeout package not installed; skipping test that may hang.")
|
||||
@pytest.mark.timeout(20)
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster", [{
|
||||
"num_cpus": 1,
|
||||
@@ -747,53 +841,45 @@ def test_ray_wait_dead_actor(ray_start_cluster):
|
||||
actors = [Actor.remote() for _ in range(num_nodes)]
|
||||
ray.get([actor.ping.remote() for actor in actors])
|
||||
|
||||
# Ping the actors and make sure the tasks complete.
|
||||
ping_ids = [actor.ping.remote() for actor in actors]
|
||||
ray.get(ping_ids)
|
||||
# Evict the result from the node that we're about to kill.
|
||||
remote_node = cluster.list_all_nodes()[-1]
|
||||
remote_ping_id = None
|
||||
for i, actor in enumerate(actors):
|
||||
if ray.get(actor.node_id.remote()) == remote_node.unique_id:
|
||||
remote_ping_id = ping_ids[i]
|
||||
ray.internal.free([remote_ping_id], local_only=True)
|
||||
cluster.remove_node(remote_node)
|
||||
def actor_dead():
|
||||
# Ping the actors and make sure the tasks complete.
|
||||
ping_ids = [actor.ping.remote() for actor in actors]
|
||||
unready = ping_ids[:]
|
||||
while unready:
|
||||
_, unready = ray.wait(unready, timeout=0)
|
||||
time.sleep(1)
|
||||
|
||||
# Repeatedly call ray.wait until the exception for the dead actor is
|
||||
# received.
|
||||
unready = ping_ids[:]
|
||||
while unready:
|
||||
_, unready = ray.wait(unready, timeout=0)
|
||||
time.sleep(1)
|
||||
try:
|
||||
ray.get(ping_ids)
|
||||
return False
|
||||
except ray.exceptions.RayActorError:
|
||||
return True
|
||||
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(ping_ids)
|
||||
# Kill a node.
|
||||
cluster.remove_node(cluster.list_all_nodes()[-1])
|
||||
# Repeatedly submit tasks and call ray.wait until the exception for the
|
||||
# dead actor is received.
|
||||
assert wait_for_condition(actor_dead)
|
||||
|
||||
# Evict the result from the dead node.
|
||||
ray.internal.free([remote_ping_id], local_only=True)
|
||||
# Create an actor on the local node that will call ray.wait in a loop.
|
||||
head_node_resource = "HEAD_NODE"
|
||||
ray.experimental.set_resource(head_node_resource, 1)
|
||||
|
||||
@ray.remote(num_cpus=0, resources={head_node_resource: 1})
|
||||
class ParentActor:
|
||||
def __init__(self, ping_ids):
|
||||
self.unready = ping_ids
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def wait(self):
|
||||
_, self.unready = ray.wait(self.unready, timeout=0)
|
||||
return len(self.unready) == 0
|
||||
return actor_dead()
|
||||
|
||||
def ping(self):
|
||||
return
|
||||
|
||||
# Repeatedly call ray.wait through the local actor until the exception for
|
||||
# the dead actor is received.
|
||||
parent_actor = ParentActor.remote(ping_ids)
|
||||
ray.get(parent_actor.ping.remote())
|
||||
failure_detected = False
|
||||
while not failure_detected:
|
||||
failure_detected = ray.get(parent_actor.wait.remote())
|
||||
parent_actor = ParentActor.remote()
|
||||
assert wait_for_condition(lambda: ray.get(parent_actor.wait.remote()))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
+16
-1
@@ -1730,6 +1730,7 @@ def make_decorator(num_return_vals=None,
|
||||
max_calls=None,
|
||||
max_retries=None,
|
||||
max_restarts=None,
|
||||
max_task_retries=None,
|
||||
worker=None):
|
||||
def decorator(function_or_class):
|
||||
if (inspect.isfunction(function_or_class)
|
||||
@@ -1738,6 +1739,9 @@ def make_decorator(num_return_vals=None,
|
||||
if max_restarts is not None:
|
||||
raise ValueError("The keyword 'max_restarts' is not "
|
||||
"allowed for remote functions.")
|
||||
if max_task_retries is not None:
|
||||
raise ValueError("The keyword 'max_task_retries' is not "
|
||||
"allowed for remote functions.")
|
||||
|
||||
return ray.remote_function.RemoteFunction(
|
||||
Language.PYTHON, function_or_class, None, num_cpus, num_gpus,
|
||||
@@ -1754,7 +1758,7 @@ def make_decorator(num_return_vals=None,
|
||||
|
||||
return ray.actor.make_actor(function_or_class, num_cpus, num_gpus,
|
||||
memory, object_store_memory, resources,
|
||||
max_restarts)
|
||||
max_restarts, max_task_retries)
|
||||
|
||||
raise TypeError("The @ray.remote decorator must be applied to "
|
||||
"either a function or to a class.")
|
||||
@@ -1801,6 +1805,14 @@ def remote(*args, **kwargs):
|
||||
unexpectedly. The minimum valid value is 0 (default), which indicates
|
||||
that the actor doesn't need to be restarted. A value of -1
|
||||
indicates that an actor should be restarted indefinitely.
|
||||
* **max_task_retries**: Only for *actors*. How many times to retry an actor
|
||||
task if the task fails due to a system error, e.g., the actor has died.
|
||||
If set to -1, the system will retry the failed task until the task
|
||||
succeeds, or the actor has reached its max_restarts limit. If set to n >
|
||||
0, the system will retry the failed task up to n times, after which the
|
||||
task will throw a `RayActorError` exception upon `ray.get`. Note that
|
||||
Python exceptions are not considered system errors and will not trigger
|
||||
retries.
|
||||
* **max_retries**: Only for *remote functions*. This specifies the maximum
|
||||
number of times that the remote function should be rerun when the worker
|
||||
process executing it crashes unexpectedly. The minimum valid value is 0,
|
||||
@@ -1867,6 +1879,7 @@ def remote(*args, **kwargs):
|
||||
"resources",
|
||||
"max_calls",
|
||||
"max_restarts",
|
||||
"max_task_retries",
|
||||
"max_retries",
|
||||
], error_string
|
||||
|
||||
@@ -1885,6 +1898,7 @@ def remote(*args, **kwargs):
|
||||
num_return_vals = kwargs.get("num_return_vals")
|
||||
max_calls = kwargs.get("max_calls")
|
||||
max_restarts = kwargs.get("max_restarts")
|
||||
max_task_retries = kwargs.get("max_task_retries")
|
||||
memory = kwargs.get("memory")
|
||||
object_store_memory = kwargs.get("object_store_memory")
|
||||
max_retries = kwargs.get("max_retries")
|
||||
@@ -1898,5 +1912,6 @@ def remote(*args, **kwargs):
|
||||
resources=resources,
|
||||
max_calls=max_calls,
|
||||
max_restarts=max_restarts,
|
||||
max_task_retries=max_task_retries,
|
||||
max_retries=max_retries,
|
||||
worker=worker)
|
||||
|
||||
Reference in New Issue
Block a user