[direct task] Retry tasks on failure and turn on RAY_FORCE_DIRECT for test_multinode_failures.py (#6306)

* multinode failures direct

* Add number of retries allowed for tasks

* Retry tasks

* Add failing test for object reconstruction

* Handle return status and debug

* update

* Retry task unit test

* update

* update

* todo

* Fix max_retries decorator, fix test

* Fix test that flaked

* lint

* comments
This commit is contained in:
Stephanie Wang
2019-12-02 10:20:57 -08:00
committed by GitHub
parent 0b0a16982a
commit da41180dc0
21 changed files with 284 additions and 63 deletions
+12 -3
View File
@@ -14,6 +14,9 @@ import ray.signature
DEFAULT_REMOTE_FUNCTION_CPUS = 1
DEFAULT_REMOTE_FUNCTION_NUM_RETURN_VALS = 1
DEFAULT_REMOTE_FUNCTION_MAX_CALLS = 0
# Normal tasks may be retried on failure this many times.
# TODO(swang): Allow this to be set globally for an application.
DEFAULT_REMOTE_FUNCTION_NUM_TASK_RETRIES = 3
logger = logging.getLogger(__name__)
@@ -59,7 +62,8 @@ class RemoteFunction(object):
"""
def __init__(self, function, num_cpus, num_gpus, memory,
object_store_memory, resources, num_return_vals, max_calls):
object_store_memory, resources, num_return_vals, max_calls,
max_retries):
self._function = function
self._function_name = (
self._function.__module__ + "." + self._function.__name__)
@@ -76,6 +80,8 @@ class RemoteFunction(object):
num_return_vals is None else num_return_vals)
self._max_calls = (DEFAULT_REMOTE_FUNCTION_MAX_CALLS
if max_calls is None else max_calls)
self._max_retries = (DEFAULT_REMOTE_FUNCTION_NUM_TASK_RETRIES
if max_retries is None else max_retries)
self._decorator = getattr(function, "__ray_invocation_decorator__",
None)
@@ -142,7 +148,8 @@ class RemoteFunction(object):
num_gpus=None,
memory=None,
object_store_memory=None,
resources=None):
resources=None,
max_retries=None):
"""Submit the remote function for execution."""
worker = ray.worker.get_global_worker()
worker.check_connected()
@@ -176,6 +183,8 @@ class RemoteFunction(object):
num_return_vals = self._num_return_vals
if is_direct_call is None:
is_direct_call = self.direct_call_enabled
if max_retries is None:
max_retries = self._max_retries
resources = ray.utils.resources_from_resource_arguments(
self._num_cpus, self._num_gpus, self._memory,
@@ -196,7 +205,7 @@ class RemoteFunction(object):
else:
object_ids = worker.core_worker.submit_task(
self._function_descriptor_list, list_args, num_return_vals,
is_direct_call, resources)
is_direct_call, resources, max_retries)
if len(object_ids) == 1:
return object_ids[0]