Option to retry failed actor tasks (#8330)

* Python

* Consolidate state in the direct actor transport, set the caller starts at

* todo

* Remove unused

* Update and unit tests

* Doc

* Remove unused

* doc

* Remove debug

* Update src/ray/core_worker/transport/direct_actor_transport.h

Co-authored-by: Eric Liang <ekhliang@gmail.com>

* Update src/ray/core_worker/transport/direct_actor_transport.cc

Co-authored-by: Eric Liang <ekhliang@gmail.com>

* lint and fix build

* Update

* Fix build

* Fix tests

* Unit test for max_task_retries=0

* Fix java?

* Fix bad test

* Cross language fix

* fix java

Co-authored-by: Eric Liang <ekhliang@gmail.com>
This commit is contained in:
Stephanie Wang
2020-05-15 20:15:15 -07:00
committed by GitHub
parent 41d8c2bd0a
commit bd169749e0
26 changed files with 873 additions and 564 deletions
+2 -1
View File
@@ -903,6 +903,7 @@ cdef class CoreWorker:
FunctionDescriptor function_descriptor,
args,
int64_t max_restarts,
int64_t max_task_retries,
resources,
placement_resources,
int32_t max_concurrency,
@@ -929,7 +930,7 @@ cdef class CoreWorker:
check_status(CCoreWorkerProcess.GetCoreWorker().CreateActor(
ray_function, args_vector,
CActorCreationOptions(
max_restarts, max_concurrency,
max_restarts, max_task_retries, max_concurrency,
c_resources, c_placement_resources,
dynamic_worker_options, is_detached, name, is_asyncio),
extension_data,
+21 -9
View File
@@ -244,7 +244,8 @@ class ActorClassMetadata:
def __init__(self, language, modified_class,
actor_creation_function_descriptor, class_id, max_restarts,
num_cpus, num_gpus, memory, object_store_memory, resources):
max_task_retries, num_cpus, num_gpus, memory,
object_store_memory, resources):
self.language = language
self.modified_class = modified_class
self.actor_creation_function_descriptor = \
@@ -253,6 +254,7 @@ class ActorClassMetadata:
self.is_cross_language = language != Language.PYTHON
self.class_id = class_id
self.max_restarts = max_restarts
self.max_task_retries = max_task_retries
self.num_cpus = num_cpus
self.num_gpus = num_gpus
self.memory = memory
@@ -314,7 +316,7 @@ class ActorClass:
@classmethod
def _ray_from_modified_class(cls, modified_class, class_id, max_restarts,
num_cpus, num_gpus, memory,
max_task_retries, num_cpus, num_gpus, memory,
object_store_memory, resources):
for attribute in [
"remote", "_remote", "_ray_from_modified_class",
@@ -344,20 +346,22 @@ class ActorClass:
self.__ray_metadata__ = ActorClassMetadata(
Language.PYTHON, modified_class,
actor_creation_function_descriptor, class_id, max_restarts,
num_cpus, num_gpus, memory, object_store_memory, resources)
max_task_retries, num_cpus, num_gpus, memory, object_store_memory,
resources)
return self
@classmethod
def _ray_from_function_descriptor(
cls, language, actor_creation_function_descriptor, max_restarts,
num_cpus, num_gpus, memory, object_store_memory, resources):
max_task_retries, num_cpus, num_gpus, memory, object_store_memory,
resources):
self = ActorClass.__new__(ActorClass)
self.__ray_metadata__ = ActorClassMetadata(
language, None, actor_creation_function_descriptor, None,
max_restarts, num_cpus, num_gpus, memory, object_store_memory,
resources)
max_restarts, max_task_retries, num_cpus, num_gpus, memory,
object_store_memory, resources)
return self
@@ -406,6 +410,7 @@ class ActorClass:
is_direct_call=None,
max_concurrency=None,
max_restarts=None,
max_task_retries=None,
name=None,
detached=False):
"""Create an actor.
@@ -557,6 +562,7 @@ class ActorClass:
meta.actor_creation_function_descriptor,
creation_args,
max_restarts or meta.max_restarts,
max_task_retries or meta.max_task_retries,
resources,
actor_placement_resources,
max_concurrency,
@@ -891,11 +897,13 @@ def modify_class(cls):
def make_actor(cls, num_cpus, num_gpus, memory, object_store_memory, resources,
max_restarts):
max_restarts, max_task_retries):
Class = modify_class(cls)
if max_restarts is None:
max_restarts = 0
if max_task_retries is None:
max_task_retries = 0
infinite_restart = max_restarts == -1
if not infinite_restart:
@@ -907,9 +915,13 @@ def make_actor(cls, num_cpus, num_gpus, memory, object_store_memory, resources,
# an overflow.
max_restarts = min(max_restarts, ray_constants.MAX_INT64_VALUE)
if max_restarts == 0 and max_task_retries != 0:
raise ValueError(
"max_task_retries cannot be set if max_restarts is 0.")
return ActorClass._ray_from_modified_class(
Class, ActorClassID.from_random(), max_restarts, num_cpus, num_gpus,
memory, object_store_memory, resources)
Class, ActorClassID.from_random(), max_restarts, max_task_retries,
num_cpus, num_gpus, memory, object_store_memory, resources)
def exit_actor():
+1
View File
@@ -77,6 +77,7 @@ def java_actor_class(class_name):
Language.JAVA,
JavaFunctionDescriptor(class_name, "<init>", ""),
0, # max_restarts,
0, # max_task_retries,
None, # num_cpus,
None, # num_gpus,
None, # memory,
+1
View File
@@ -231,6 +231,7 @@ cdef extern from "ray/core_worker/common.h" nogil:
CActorCreationOptions()
CActorCreationOptions(
int64_t max_restarts,
int64_t max_task_retries,
int32_t max_concurrency,
const unordered_map[c_string, double] &resources,
const unordered_map[c_string, double] &placement_resources,
+1 -1
View File
@@ -51,7 +51,7 @@ py_test(
size = "medium",
srcs = ["test_actor_failures.py"],
# TODO(ekl) enable this once we support actor reconstruction again
tags = ["exclusive", "manual"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
+225 -139
View File
@@ -3,10 +3,6 @@ import json
import numpy as np
import os
import pytest
try:
import pytest_timeout
except ImportError:
pytest_timeout = None
import signal
import sys
import time
@@ -88,9 +84,13 @@ def ray_checkpointable_actor_cls(request):
@pytest.mark.parametrize(
"ray_start_object_store_memory", [150 * 1024 * 1024], indirect=True)
def test_actor_eviction(ray_start_object_store_memory):
object_store_memory = ray_start_object_store_memory
"ray_start_regular", [{
"object_store_memory": 150 * 1024 * 1024,
"lru_evict": True,
}],
indirect=True)
def test_actor_eviction(ray_start_regular):
object_store_memory = 150 * 1024 * 1024
@ray.remote
class Actor:
@@ -127,10 +127,14 @@ def test_actor_eviction(ray_start_object_store_memory):
assert num_success > 0
def test_actor_restart(ray_start_regular):
"""Test actor reconstruction when actor process is killed."""
def test_actor_restart():
"""Test actor restart when actor process is killed."""
ray.init(
_internal_config=json.dumps({
"task_retry_delay_ms": 100,
}), )
@ray.remote(max_restarts=1)
@ray.remote(max_restarts=1, max_task_retries=-1)
class RestartableActor:
"""An actor that will be restarted at most once."""
@@ -147,20 +151,48 @@ def test_actor_restart(ray_start_regular):
actor = RestartableActor.remote()
pid = ray.get(actor.get_pid.remote())
# Call increase 3 times
for _ in range(3):
ray.get(actor.increase.remote())
# Call increase again with some delay.
result = actor.increase.remote(delay=0.5)
# Sleep some time to wait for the above task to start execution.
time.sleep(0.2)
results = [actor.increase.remote() for _ in range(100)]
# Kill actor process, while the above task is still being executed.
os.kill(pid, signal.SIGKILL)
# Check that the above task didn't fail and the actor is restarted.
assert ray.get(result) == 4
# Make sure that all tasks were executed in order before the actor's death.
res = results.pop(0)
i = 1
while True:
try:
r = ray.get(res)
if r != i:
# Actor restarted without any failed tasks.
break
res = results.pop(0)
i += 1
except ray.exceptions.RayActorError:
# Actor restarted.
break
# Find the first task to execute after the actor was restarted.
while True:
try:
r = ray.get(res)
break
except ray.exceptions.RayActorError:
res = results.pop(0)
pass
# Make sure that all tasks were executed in order after the actor's death.
i = 1
while True:
r = ray.get(res)
assert r == i
if results:
res = results.pop(0)
i += 1
else:
break
# Check that we can still call the actor.
assert ray.get(actor.increase.remote()) == 5
result = actor.increase.remote()
assert ray.get(result) == r + 1
# kill actor process one more time.
results = [actor.increase.remote() for _ in range(100)]
pid = ray.get(actor.get_pid.remote())
os.kill(pid, signal.SIGKILL)
# The actor has exceeded max restarts, and this task should fail.
@@ -174,37 +206,154 @@ def test_actor_restart(ray_start_regular):
# Check that the actor won't be restarted.
with pytest.raises(ray.exceptions.RayActorError):
ray.get(actor.increase.remote())
ray.shutdown()
def test_actor_restart_with_retry():
    """Test that pending actor tasks are retried when the actor is killed.

    The actor is declared with ``max_restarts=1`` and ``max_task_retries=-1``.
    A batch of tasks is submitted and the actor process is killed. Every task
    must still return a value, and the returned counter values must be
    consecutive, restarting from 1 at most once (at the first task that ran
    on the restarted actor). Killing the actor a second time exceeds
    ``max_restarts``, so later calls must raise ``RayActorError``.
    """
    ray.init(
        _internal_config=json.dumps({
            "task_retry_delay_ms": 100,
        }), )
    # This test starts Ray itself instead of using a fixture, so make sure
    # the session is torn down even when an assertion or ray.get() fails.
    # Otherwise the live session would leak into the tests that run next.
    try:

        @ray.remote(max_restarts=1, max_task_retries=-1)
        class RestartableActor:
            """An actor that will be restarted at most once."""

            def __init__(self):
                self.value = 0

            def increase(self, delay=0):
                time.sleep(delay)
                self.value += 1
                return self.value

            def get_pid(self):
                return os.getpid()

        actor = RestartableActor.remote()
        pid = ray.get(actor.get_pid.remote())
        results = [actor.increase.remote() for _ in range(100)]
        # Kill the actor process right after submitting the tasks above.
        os.kill(pid, signal.SIGKILL)
        # Check that none of the tasks failed and the actor is restarted.
        results = ray.get(results)
        failed_task_index = None
        # Make sure that all tasks were executed in order before and after
        # the actor's death. Without a restart the i-th result (1-based) is
        # i; after a restart the counter starts over, so each later result
        # is offset by the index of the first task that ran on the new actor.
        for i, (res, expected) in enumerate(zip(results, range(1, 101))):
            if res != expected:
                if failed_task_index is None:
                    failed_task_index = i
                assert res + failed_task_index == expected
        # Check that we can still call the actor.
        result = actor.increase.remote()
        assert ray.get(result) == results[-1] + 1

        # Kill the actor process one more time. Another batch of tasks is
        # submitted first; their results are not inspected.
        results = [actor.increase.remote() for _ in range(100)]
        pid = ray.get(actor.get_pid.remote())
        os.kill(pid, signal.SIGKILL)
        # The actor has exceeded max restarts, and this task should fail.
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(actor.increase.remote())

        # Create another actor.
        actor = RestartableActor.remote()
        # Intentionally exit the actor.
        actor.__ray_terminate__.remote()
        # Check that the actor won't be restarted.
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(actor.increase.remote())
    finally:
        ray.shutdown()
def test_actor_restart_on_node_failure(ray_start_cluster):
    """Test that actor tasks are retried when the actor's node is removed.

    An actor with ``max_restarts=1`` and ``max_task_retries=-1`` is placed on
    the only node that has a CPU. That node is removed and a replacement is
    added while 100 tasks are pending; all of them must still return, with
    counter values that are consecutive apart from at most one reset to 1.
    """
    internal_config = json.dumps({
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "initial_reconstruction_timeout_milliseconds": 1000,
        "task_retry_delay_ms": 100,
    })
    cluster = ray_start_cluster
    # The head node gets no CPUs, so the actor (num_cpus=1) must be placed
    # on the second node.
    cluster.add_node(num_cpus=0, _internal_config=internal_config)
    cluster.add_node(num_cpus=1, _internal_config=internal_config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1, max_restarts=1, max_task_retries=-1)
    class RestartableActor:
        """A counter actor that may be restarted at most once."""

        def __init__(self):
            self.value = 0

        def increase(self):
            self.value += 1
            return self.value

        def ready(self):
            return

    actor = RestartableActor.remote()
    ray.get(actor.ready.remote())
    pending = [actor.increase.remote() for _ in range(100)]
    # Remove the actor's node right after submitting the tasks, then bring
    # up a replacement node with the same resources.
    cluster.remove_node(cluster.list_all_nodes()[-1])
    cluster.add_node(num_cpus=1, _internal_config=internal_config)
    cluster.wait_for_nodes()

    # None of the tasks may fail: ray.get raises if any of them did.
    values = ray.get(pending)
    # Results must be in order before and after the actor's death. Once the
    # counter has restarted, every later value is offset by the index of the
    # first task that produced an unexpected value.
    restart_index = None
    for position, (value, expected) in enumerate(zip(values, range(1, 101))):
        if value == expected:
            continue
        if restart_index is None:
            restart_index = position
        assert value + restart_index == expected

    # The actor must still be callable afterwards.
    final_value = ray.get(actor.increase.remote())
    assert final_value in (1, values[-1] + 1)
def test_actor_restart_without_task(ray_start_regular):
"""Test a dead actor can be restarted without sending task to it."""
@ray.remote(max_restarts=1)
@ray.remote(max_restarts=1, resources={"actor": 1})
class RestartableActor:
def __init__(self, obj_ids):
for obj_id in obj_ids:
# Every time the actor gets constructed,
# put a new object in plasma store.
global_worker = ray.worker.global_worker
if not global_worker.core_worker.object_exists(obj_id):
global_worker.put_object(1, obj_id)
break
def __init__(self):
pass
def get_pid(self):
return os.getpid()
obj_ids = [ray.ObjectID.from_random() for _ in range(2)]
actor = RestartableActor.remote(obj_ids)
@ray.remote(resources={"actor": 1})
def probe():
return
# Returns whether the "actor" resource is available.
def actor_resource_available():
p = probe.remote()
ready, _ = ray.wait([p], timeout=1)
return len(ready) > 0
ray.experimental.set_resource("actor", 1)
actor = RestartableActor.remote()
assert wait_for_condition(lambda: not actor_resource_available())
# Kill the actor.
pid = ray.get(actor.get_pid.remote())
p = probe.remote()
os.kill(pid, signal.SIGKILL)
# Wait until the actor is reconstructed.
def check_restarted():
worker = ray.worker.global_worker
return worker.core_worker.object_exists(obj_ids[1])
assert wait_for_condition(check_restarted)
ray.get(p)
assert wait_for_condition(lambda: not actor_resource_available())
def test_caller_actor_restart(ray_start_regular):
@@ -277,66 +426,6 @@ def test_caller_task_reconstruction(ray_start_regular):
assert ray.get(RetryableTask.remote(remote_actor)) == 3
def test_actor_restart_on_node_failure(ray_start_cluster_head):
"""Test actor reconstruction when node dies unexpectedly."""
cluster = ray_start_cluster_head
max_restarts = 3
# Add a few nodes to the cluster.
# Use custom resource to make sure the actor is only created on worker
# nodes, not on the head node.
for _ in range(max_restarts + 2):
cluster.add_node(
resources={"a": 1},
_internal_config=json.dumps({
"initial_reconstruction_timeout_milliseconds": 200,
"num_heartbeats_timeout": 10,
}),
)
def kill_node(node_id):
node_to_remove = None
for node in cluster.worker_nodes:
if node_id == node.unique_id:
node_to_remove = node
cluster.remove_node(node_to_remove)
@ray.remote(max_restarts=max_restarts, resources={"a": 1})
class MyActor:
def __init__(self):
self.value = 0
def increase(self):
self.value += 1
return self.value
def get_object_store_socket(self):
return ray.worker.global_worker.node.unique_id
actor = MyActor.remote()
# Call increase 3 times.
for _ in range(3):
ray.get(actor.increase.remote())
for i in range(max_restarts):
object_store_socket = ray.get(actor.get_object_store_socket.remote())
# Kill actor's node and the actor should be restarted
# on a different node.
kill_node(object_store_socket)
# Call increase again.
# Check that the actor is restarted and value is correct.
assert ray.get(actor.increase.remote()) == 4 + i
# Check that the actor is now on a different node.
assert object_store_socket != ray.get(
actor.get_object_store_socket.remote())
# kill the node again.
object_store_socket = ray.get(actor.get_object_store_socket.remote())
kill_node(object_store_socket)
# The actor has exceeded max restarts, and this task should fail.
with pytest.raises(ray.exceptions.RayActorError):
ray.get(actor.increase.remote())
# NOTE(hchen): we set initial_reconstruction_timeout_milliseconds to 1s for
# this test. Because if this value is too small, spurious task reconstruction
# may happen and cause the test failure. If the value is too large, this test
@@ -365,7 +454,7 @@ def test_multiple_actor_restart(ray_start_cluster_head):
})) for _ in range(num_nodes)
]
@ray.remote(max_restarts=-1)
@ray.remote(max_restarts=-1, max_task_retries=-1)
class SlowCounter:
def __init__(self):
self.x = 0
@@ -407,8 +496,12 @@ def test_multiple_actor_restart(ray_start_cluster_head):
# Get the results and check that they have the correct values.
for _, result_id_list in result_ids.items():
results = list(range(1, len(result_id_list) + 1))
assert ray.get(result_id_list) == results
results = ray.get(result_id_list)
for i, result in enumerate(results):
if i == 0:
assert result == 1
else:
assert result == results[i - 1] + 1 or result == 1
def kill_actor(actor):
@@ -418,6 +511,7 @@ def kill_actor(actor):
wait_for_pid_to_exit(pid)
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
"""Test actor checkpointing and restoring from a checkpoint."""
actor = ray.remote(max_restarts=2)(ray_checkpointable_actor_cls).remote()
@@ -440,13 +534,14 @@ def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Kill actor again and check that reconstruction still works after the
# Kill actor again and check that restart still works after the
# actor resuming from a checkpoint.
kill_actor(actor)
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
"""Test checkpointing of a remote actor through method invocation."""
@@ -487,13 +582,14 @@ def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Kill actor again and check that reconstruction still works after the
# Kill actor again and check that restart still works after the
# actor resuming from a checkpoint.
kill_actor(actor)
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
ray_checkpointable_actor_cls):
"""Test actor checkpointing on a remote node."""
@@ -520,6 +616,7 @@ def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_checkpointing_save_exception(ray_start_regular,
ray_checkpointable_actor_cls):
"""Test actor can still be recovered if checkpoints fail to complete."""
@@ -549,7 +646,7 @@ def test_checkpointing_save_exception(ray_start_regular,
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Kill actor again, and check that reconstruction still works and the actor
# Kill actor again, and check that restart still works and the actor
# wasn't resumed from a checkpoint.
kill_actor(actor)
assert ray.get(actor.get.remote()) == expected
@@ -559,6 +656,7 @@ def test_checkpointing_save_exception(ray_start_regular,
wait_for_errors(ray_constants.CHECKPOINT_PUSH_ERROR, 1)
@pytest.mark.skip(reason="TODO: Actor checkpointing")
def test_checkpointing_load_exception(ray_start_regular,
ray_checkpointable_actor_cls):
"""Test actor can still be recovered if checkpoints fail to load."""
@@ -589,7 +687,7 @@ def test_checkpointing_load_exception(ray_start_regular,
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Kill actor again, and check that reconstruction still works and the actor
# Kill actor again, and check that restart still works and the actor
# wasn't resumed from a checkpoint.
kill_actor(actor)
assert ray.get(actor.get.remote()) == expected
@@ -718,10 +816,6 @@ def test_decorated_method(ray_start_regular):
assert ray.get(object_id) == 7 # 2 * 3 + 1
@pytest.mark.skipif(
pytest_timeout is None,
reason="Timeout package not installed; skipping test that may hang.")
@pytest.mark.timeout(20)
@pytest.mark.parametrize(
"ray_start_cluster", [{
"num_cpus": 1,
@@ -747,53 +841,45 @@ def test_ray_wait_dead_actor(ray_start_cluster):
actors = [Actor.remote() for _ in range(num_nodes)]
ray.get([actor.ping.remote() for actor in actors])
# Ping the actors and make sure the tasks complete.
ping_ids = [actor.ping.remote() for actor in actors]
ray.get(ping_ids)
# Evict the result from the node that we're about to kill.
remote_node = cluster.list_all_nodes()[-1]
remote_ping_id = None
for i, actor in enumerate(actors):
if ray.get(actor.node_id.remote()) == remote_node.unique_id:
remote_ping_id = ping_ids[i]
ray.internal.free([remote_ping_id], local_only=True)
cluster.remove_node(remote_node)
def actor_dead():
# Ping the actors and make sure the tasks complete.
ping_ids = [actor.ping.remote() for actor in actors]
unready = ping_ids[:]
while unready:
_, unready = ray.wait(unready, timeout=0)
time.sleep(1)
# Repeatedly call ray.wait until the exception for the dead actor is
# received.
unready = ping_ids[:]
while unready:
_, unready = ray.wait(unready, timeout=0)
time.sleep(1)
try:
ray.get(ping_ids)
return False
except ray.exceptions.RayActorError:
return True
with pytest.raises(ray.exceptions.RayActorError):
ray.get(ping_ids)
# Kill a node.
cluster.remove_node(cluster.list_all_nodes()[-1])
# Repeatedly submit tasks and call ray.wait until the exception for the
# dead actor is received.
assert wait_for_condition(actor_dead)
# Evict the result from the dead node.
ray.internal.free([remote_ping_id], local_only=True)
# Create an actor on the local node that will call ray.wait in a loop.
head_node_resource = "HEAD_NODE"
ray.experimental.set_resource(head_node_resource, 1)
@ray.remote(num_cpus=0, resources={head_node_resource: 1})
class ParentActor:
def __init__(self, ping_ids):
self.unready = ping_ids
def __init__(self):
pass
def wait(self):
_, self.unready = ray.wait(self.unready, timeout=0)
return len(self.unready) == 0
return actor_dead()
def ping(self):
return
# Repeatedly call ray.wait through the local actor until the exception for
# the dead actor is received.
parent_actor = ParentActor.remote(ping_ids)
ray.get(parent_actor.ping.remote())
failure_detected = False
while not failure_detected:
failure_detected = ray.get(parent_actor.wait.remote())
parent_actor = ParentActor.remote()
assert wait_for_condition(lambda: ray.get(parent_actor.wait.remote()))
if __name__ == "__main__":
+16 -1
View File
@@ -1730,6 +1730,7 @@ def make_decorator(num_return_vals=None,
max_calls=None,
max_retries=None,
max_restarts=None,
max_task_retries=None,
worker=None):
def decorator(function_or_class):
if (inspect.isfunction(function_or_class)
@@ -1738,6 +1739,9 @@ def make_decorator(num_return_vals=None,
if max_restarts is not None:
raise ValueError("The keyword 'max_restarts' is not "
"allowed for remote functions.")
if max_task_retries is not None:
raise ValueError("The keyword 'max_task_retries' is not "
"allowed for remote functions.")
return ray.remote_function.RemoteFunction(
Language.PYTHON, function_or_class, None, num_cpus, num_gpus,
@@ -1754,7 +1758,7 @@ def make_decorator(num_return_vals=None,
return ray.actor.make_actor(function_or_class, num_cpus, num_gpus,
memory, object_store_memory, resources,
max_restarts)
max_restarts, max_task_retries)
raise TypeError("The @ray.remote decorator must be applied to "
"either a function or to a class.")
@@ -1801,6 +1805,14 @@ def remote(*args, **kwargs):
unexpectedly. The minimum valid value is 0 (default), which indicates
that the actor doesn't need to be restarted. A value of -1
indicates that an actor should be restarted indefinitely.
* **max_task_retries**: Only for *actors*. How many times to retry an actor
task if the task fails due to a system error, e.g., the actor has died.
If set to -1, the system will retry the failed task until the task
succeeds, or the actor has reached its max_restarts limit. If set to n >
0, the system will retry the failed task up to n times, after which the
task will throw a `RayActorError` exception upon `ray.get`. Note that
Python exceptions are not considered system errors and will not trigger
retries.
* **max_retries**: Only for *remote functions*. This specifies the maximum
number of times that the remote function should be rerun when the worker
process executing it crashes unexpectedly. The minimum valid value is 0,
@@ -1867,6 +1879,7 @@ def remote(*args, **kwargs):
"resources",
"max_calls",
"max_restarts",
"max_task_retries",
"max_retries",
], error_string
@@ -1885,6 +1898,7 @@ def remote(*args, **kwargs):
num_return_vals = kwargs.get("num_return_vals")
max_calls = kwargs.get("max_calls")
max_restarts = kwargs.get("max_restarts")
max_task_retries = kwargs.get("max_task_retries")
memory = kwargs.get("memory")
object_store_memory = kwargs.get("object_store_memory")
max_retries = kwargs.get("max_retries")
@@ -1898,5 +1912,6 @@ def remote(*args, **kwargs):
resources=resources,
max_calls=max_calls,
max_restarts=max_restarts,
max_task_retries=max_task_retries,
max_retries=max_retries,
worker=worker)