[direct task] Retry tasks on failure and turn on RAY_FORCE_DIRECT for test_multinode_failures.py (#6306)

* Turn on RAY_FORCE_DIRECT (direct calls) for the multinode failure tests

* Add number of retries allowed for tasks

* Retry tasks

* Add failing test for object reconstruction

* Handle return status and debug

* update

* Retry task unit test

* update

* update

* todo

* Fix max_retries decorator, fix test

* Fix test that flaked

* lint

* comments
This commit is contained in:
Stephanie Wang
2019-12-02 10:20:57 -08:00
committed by GitHub
parent 0b0a16982a
commit da41180dc0
21 changed files with 284 additions and 63 deletions
+6 -1
View File
@@ -1621,6 +1621,7 @@ def make_decorator(num_return_vals=None,
object_store_memory=None,
resources=None,
max_calls=None,
max_retries=None,
max_reconstructions=None,
worker=None):
def decorator(function_or_class):
@@ -1633,7 +1634,8 @@ def make_decorator(num_return_vals=None,
return ray.remote_function.RemoteFunction(
function_or_class, num_cpus, num_gpus, memory,
object_store_memory, resources, num_return_vals, max_calls)
object_store_memory, resources, num_return_vals, max_calls,
max_retries)
if inspect.isclass(function_or_class):
if num_return_vals is not None:
@@ -1732,6 +1734,7 @@ def remote(*args, **kwargs):
"resources",
"max_calls",
"max_reconstructions",
"max_retries",
], error_string
num_cpus = kwargs["num_cpus"] if "num_cpus" in kwargs else None
@@ -1751,6 +1754,7 @@ def remote(*args, **kwargs):
max_reconstructions = kwargs.get("max_reconstructions")
memory = kwargs.get("memory")
object_store_memory = kwargs.get("object_store_memory")
max_retries = kwargs.get("max_retries")
return make_decorator(
num_return_vals=num_return_vals,
@@ -1761,4 +1765,5 @@ def remote(*args, **kwargs):
resources=resources,
max_calls=max_calls,
max_reconstructions=max_reconstructions,
max_retries=max_retries,
worker=worker)