mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 12:28:10 +08:00
[direct task] Retry tasks on failure and turn on RAY_FORCE_DIRECT for test_multinode_failures.py (#6306)
* multinode failures direct * Add number of retries allowed for tasks * Retry tasks * Add failing test for object reconstruction * Handle return status and debug * update * Retry task unit test * update * update * todo * Fix max_retries decorator, fix test * Fix test that flaked * lint * comments
This commit is contained in:
@@ -14,6 +14,9 @@ import ray.signature
|
||||
DEFAULT_REMOTE_FUNCTION_CPUS = 1
|
||||
DEFAULT_REMOTE_FUNCTION_NUM_RETURN_VALS = 1
|
||||
DEFAULT_REMOTE_FUNCTION_MAX_CALLS = 0
|
||||
# Normal tasks may be retried on failure this many times.
|
||||
# TODO(swang): Allow this to be set globally for an application.
|
||||
DEFAULT_REMOTE_FUNCTION_NUM_TASK_RETRIES = 3
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -59,7 +62,8 @@ class RemoteFunction(object):
|
||||
"""
|
||||
|
||||
def __init__(self, function, num_cpus, num_gpus, memory,
|
||||
object_store_memory, resources, num_return_vals, max_calls):
|
||||
object_store_memory, resources, num_return_vals, max_calls,
|
||||
max_retries):
|
||||
self._function = function
|
||||
self._function_name = (
|
||||
self._function.__module__ + "." + self._function.__name__)
|
||||
@@ -76,6 +80,8 @@ class RemoteFunction(object):
|
||||
num_return_vals is None else num_return_vals)
|
||||
self._max_calls = (DEFAULT_REMOTE_FUNCTION_MAX_CALLS
|
||||
if max_calls is None else max_calls)
|
||||
self._max_retries = (DEFAULT_REMOTE_FUNCTION_NUM_TASK_RETRIES
|
||||
if max_retries is None else max_retries)
|
||||
self._decorator = getattr(function, "__ray_invocation_decorator__",
|
||||
None)
|
||||
|
||||
@@ -142,7 +148,8 @@ class RemoteFunction(object):
|
||||
num_gpus=None,
|
||||
memory=None,
|
||||
object_store_memory=None,
|
||||
resources=None):
|
||||
resources=None,
|
||||
max_retries=None):
|
||||
"""Submit the remote function for execution."""
|
||||
worker = ray.worker.get_global_worker()
|
||||
worker.check_connected()
|
||||
@@ -176,6 +183,8 @@ class RemoteFunction(object):
|
||||
num_return_vals = self._num_return_vals
|
||||
if is_direct_call is None:
|
||||
is_direct_call = self.direct_call_enabled
|
||||
if max_retries is None:
|
||||
max_retries = self._max_retries
|
||||
|
||||
resources = ray.utils.resources_from_resource_arguments(
|
||||
self._num_cpus, self._num_gpus, self._memory,
|
||||
@@ -196,7 +205,7 @@ class RemoteFunction(object):
|
||||
else:
|
||||
object_ids = worker.core_worker.submit_task(
|
||||
self._function_descriptor_list, list_args, num_return_vals,
|
||||
is_direct_call, resources)
|
||||
is_direct_call, resources, max_retries)
|
||||
|
||||
if len(object_ids) == 1:
|
||||
return object_ids[0]
|
||||
|
||||
Reference in New Issue
Block a user