From 2a204260a85b832c8bf6cfcbfaae1dc786e2a0d8 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 28 Aug 2020 19:57:02 -0700 Subject: [PATCH] [api] Second round of 1.0 API changes: exceptions, num_return_vals (#10377) --- doc/source/actors.rst | 2 +- doc/source/advanced.rst | 2 +- doc/source/fault-tolerance.rst | 4 +- doc/source/walkthrough.rst | 6 +- python/ray/_raylet.pyx | 26 ++++---- python/ray/actor.py | 59 +++++++++---------- python/ray/cross_language.py | 2 +- python/ray/exceptions.py | 50 +++++----------- .../experimental/array/distributed/linalg.py | 10 ++-- .../ray/experimental/array/remote/linalg.py | 14 ++--- python/ray/ray_constants.py | 2 +- python/ray/remote_function.py | 22 +++---- python/ray/serialization.py | 13 ++-- python/ray/serve/tests/test_router.py | 2 +- python/ray/state.py | 2 +- python/ray/tests/test_actor.py | 6 +- python/ray/tests/test_actor_failures.py | 2 +- python/ray/tests/test_basic.py | 10 ++-- python/ray/tests/test_basic_2.py | 6 +- python/ray/tests/test_cancel.py | 18 +++--- python/ray/tests/test_component_failures_2.py | 3 +- .../tests/test_error_ray_not_initialized.py | 2 +- python/ray/tests/test_failure.py | 8 +-- python/ray/tests/test_memory_limits.py | 2 +- python/ray/tests/test_mini.py | 2 +- python/ray/tests/test_multinode_failures.py | 3 +- python/ray/tests/test_placement_group.py | 4 +- python/ray/tests/test_queue.py | 6 +- python/ray/tests/test_reconstruction.py | 17 +++--- python/ray/tests/test_reference_counting.py | 7 +-- python/ray/tests/test_reference_counting_2.py | 10 ++-- python/ray/tests/test_serialization.py | 2 +- python/ray/tests/test_stress_failure.py | 7 +-- .../tests/test_unreconstructable_errors.py | 4 +- python/ray/tune/ray_trial_executor.py | 4 +- python/ray/worker.py | 43 +++++++------- rllib/evaluation/rollout_worker.py | 2 +- 37 files changed, 180 insertions(+), 204 deletions(-) diff --git a/doc/source/actors.rst b/doc/source/actors.rst index 26d43fd24..960148650 100644 --- a/doc/source/actors.rst +++ b/doc/source/actors.rst @@ -54,7 +54,7 @@ Any method of the actor can return multiple object refs with the ``ray.method`` @ray.remote class Foo(object): - @ray.method(num_return_vals=2) + @ray.method(num_returns=2) def bar(self): return 1, 2 diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index fc742fb5e..52ce58bd2 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -47,7 +47,7 @@ And vary the number of return values for tasks (and actor methods too): def f(n): return list(range(n)) - id1, id2 = f.options(num_return_vals=2).remote(2) + id1, id2 = f.options(num_returns=2).remote(2) assert ray.get(id1) == 0 assert ray.get(id2) == 1 diff --git a/doc/source/fault-tolerance.rst b/doc/source/fault-tolerance.rst index fa3abb094..82f77f864 100644 --- a/doc/source/fault-tolerance.rst +++ b/doc/source/fault-tolerance.rst @@ -38,7 +38,7 @@ You can experiment with this behavior by running the following code. # exception. ray.get(potentially_fail.remote(0.5)) print('SUCCESS') - except ray.exceptions.RayWorkerError: + except ray.exceptions.WorkerCrashedError: print('FAILURE') .. _actor-fault-tolerance: @@ -172,7 +172,7 @@ Task outputs over a configurable threshold (default 100KB) may be stored in Ray's distributed object store. Thus, a node failure can cause the loss of a task output. If this occurs, Ray will automatically attempt to recover the value by looking for copies of the same object on other nodes. If there are no -other copies left, an ``UnreconstructableError`` will be raised. +other copies left, an ``ObjectLostError`` will be raised. When there are no copies of an object left, Ray also provides an option to automatically recover the value by re-executing the task that created the diff --git a/doc/source/walkthrough.rst b/doc/source/walkthrough.rst index f73e9d619..7333a77ea 100644 --- a/doc/source/walkthrough.rst +++ b/doc/source/walkthrough.rst @@ -253,7 +253,7 @@ Multiple returns .. code-block:: python - @ray.remote(num_return_vals=3) + @ray.remote(num_returns=3) def return_multiple(): return 1, 2, 3 @@ -341,7 +341,7 @@ If the current node's object store does not contain the object, the object is do assert ray.get([ray.put(i) for i in range(3)]) == [0, 1, 2] # You can also set a timeout to return early from a ``get`` that's blocking for too long. - from ray.exceptions import RayTimeoutError + from ray.exceptions import GetTimeoutError @ray.remote def long_running_function() @@ -350,7 +350,7 @@ If the current node's object store does not contain the object, the object is do obj_ref = long_running_function.remote() try: ray.get(obj_ref, timeout=4) - except RayTimeoutError: + except GetTimeoutError: print("`get` timed out.") .. group-tab:: Java diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 4043e2598..695fe0b93 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -101,11 +101,11 @@ import ray.ray_constants as ray_constants from ray import profiling from ray.exceptions import ( RayError, - RayletError, + RaySystemError, RayTaskError, ObjectStoreFullError, - RayTimeoutError, - RayCancellationError + GetTimeoutError, + TaskCancelledError ) from ray.utils import decode import gc @@ -143,11 +143,11 @@ cdef int check_status(const CRayStatus& status) nogil except -1: elif status.IsInterrupted(): raise KeyboardInterrupt() elif status.IsTimedOut(): - raise RayTimeoutError(message) + raise GetTimeoutError(message) elif status.IsNotFound(): raise ValueError(message) else: - raise RayletError(message) + raise RaySystemError(message) cdef RayObjectsToDataMetadataPairs( const c_vector[shared_ptr[CRayObject]] objects): @@ -481,7 +481,7 @@ cdef execute_task( outputs = function_executor(*args, **kwargs) task_exception = False except KeyboardInterrupt as e: - raise RayCancellationError( + raise TaskCancelledError( core_worker.get_current_task_id()) if c_return_ids.size() == 1: outputs = (outputs,) @@ -489,7 +489,7 @@ cdef execute_task( # was exiting and was raised after the except block. if not check_signals().ok(): task_exception = True - raise RayCancellationError( + raise TaskCancelledError( core_worker.get_current_task_id()) # Store the outputs in the object store. with core_worker.profile_event(b"task:store_outputs"): @@ -976,7 +976,7 @@ cdef class CoreWorker: Language language, FunctionDescriptor function_descriptor, args, - int num_return_vals, + int num_returns, resources, int max_retries, PlacementGroupID placement_group_id, @@ -993,7 +993,7 @@ cdef class CoreWorker: with self.profile_event(b"submit_task"): prepare_resources(resources, &c_resources) task_options = CTaskOptions( - num_return_vals, c_resources) + num_returns, c_resources) ray_function = CRayFunction( language.lang, function_descriptor.descriptor) prepare_args(self, language, args, &args_vector) @@ -1103,7 +1103,7 @@ cdef class CoreWorker: ActorID actor_id, FunctionDescriptor function_descriptor, args, - int num_return_vals, + int num_returns, double num_method_cpus): cdef: @@ -1117,7 +1117,7 @@ cdef class CoreWorker: with self.profile_event(b"submit_task"): if num_method_cpus > 0: c_resources[b"CPU"] = num_method_cpus - task_options = CTaskOptions(num_return_vals, c_resources) + task_options = CTaskOptions(num_returns, c_resources) ray_function = CRayFunction( language.lang, function_descriptor.descriptor) prepare_args(self, language, args, &args_vector) @@ -1209,7 +1209,7 @@ cdef class CoreWorker: return ray.actor.ActorHandle(language, actor_id, method_meta.decorators, method_meta.signatures, - method_meta.num_return_vals, + method_meta.num_returns, actor_method_cpu, actor_creation_function_descriptor, worker.current_session_and_job) @@ -1217,7 +1217,7 @@ cdef class CoreWorker: return ray.actor.ActorHandle(language, actor_id, {}, # method decorators {}, # method signatures - {}, # method num_return_vals + {}, # method num_returns 0, # actor method cpu actor_creation_function_descriptor, worker.current_session_and_job) diff --git a/python/ray/actor.py b/python/ray/actor.py index e0661d93a..7a60ec1ec 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -23,7 +23,7 @@ def method(*args, **kwargs): @ray.remote class Foo: - @ray.method(num_return_vals=2) + @ray.method(num_returns=2) def bar(self): return 1, 2 @@ -32,16 +32,16 @@ def method(*args, **kwargs): _, _ = f.bar.remote() Args: - num_return_vals: The number of object refs that should be returned by + num_returns: The number of object refs that should be returned by invocations of this actor method. """ assert len(args) == 0 assert len(kwargs) == 1 - assert "num_return_vals" in kwargs - num_return_vals = kwargs["num_return_vals"] + assert "num_returns" in kwargs + num_returns = kwargs["num_returns"] def annotate_method(method): - method.__ray_num_return_vals__ = num_return_vals + method.__ray_num_returns__ = num_returns return method return annotate_method @@ -58,7 +58,7 @@ class ActorMethod: Attributes: _actor: A handle to the actor. _method_name: The name of the actor method. - _num_return_vals: The default number of return values that the method + _num_returns: The default number of return values that the method invocation should return. _decorator: An optional decorator that should be applied to the actor method invocation (as opposed to the actor method execution) before @@ -72,12 +72,12 @@ class ActorMethod: def __init__(self, actor, method_name, - num_return_vals, + num_returns, decorator=None, hardref=False): self._actor_ref = weakref.ref(actor) self._method_name = method_name - self._num_return_vals = num_return_vals + self._num_returns = num_returns # This is a decorator that is used to wrap the function invocation (as # opposed to the function execution). The decorator must return a # function that takes in two arguments ("args" and "kwargs"). In most @@ -100,9 +100,9 @@ class ActorMethod: def remote(self, *args, **kwargs): return self._remote(args, kwargs) - def _remote(self, args=None, kwargs=None, num_return_vals=None): - if num_return_vals is None: - num_return_vals = self._num_return_vals + def _remote(self, args=None, kwargs=None, num_returns=None): + if num_returns is None: + num_returns = self._num_returns def invocation(args, kwargs): actor = self._actor_hard_ref or self._actor_ref() @@ -112,7 +112,7 @@ class ActorMethod: self._method_name, args=args, kwargs=kwargs, - num_return_vals=num_return_vals) + num_returns=num_returns) # Apply the decorator if there is one. if self._decorator is not None: @@ -124,7 +124,7 @@ class ActorMethod: return { "actor": self._actor_ref(), "method_name": self._method_name, - "num_return_vals": self._num_return_vals, + "num_returns": self._num_returns, "decorator": self._decorator, } @@ -132,7 +132,7 @@ class ActorMethod: self.__init__( state["actor"], state["method_name"], - state["num_return_vals"], + state["num_returns"], state["decorator"], hardref=True) @@ -147,7 +147,7 @@ class ActorClassMethodMetadata(object): can be set by attaching the attribute "__ray_invocation_decorator__" to the actor method. signatures: The signatures of the methods. - num_return_vals: The default number of return values for + num_returns: The default number of return values for each actor method. """ @@ -182,7 +182,7 @@ class ActorClassMethodMetadata(object): # arguments. self.decorators = {} self.signatures = {} - self.num_return_vals = {} + self.num_returns = {} for method_name, method in actor_methods: # Whether or not this method requires binding of its first # argument. For class and static methods, we do not want to bind @@ -198,11 +198,10 @@ class ActorClassMethodMetadata(object): self.signatures[method_name] = signature.extract_signature( method, ignore_first=not is_bound) # Set the default number of return values for this method. - if hasattr(method, "__ray_num_return_vals__"): - self.num_return_vals[method_name] = ( - method.__ray_num_return_vals__) + if hasattr(method, "__ray_num_returns__"): + self.num_returns[method_name] = (method.__ray_num_returns__) else: - self.num_return_vals[method_name] = ( + self.num_returns[method_name] = ( ray_constants.DEFAULT_ACTOR_METHOD_NUM_RETURN_VALS) if hasattr(method, "__ray_invocation_decorator__"): @@ -589,7 +588,7 @@ class ActorClass: actor_id, meta.method_meta.decorators, meta.method_meta.signatures, - meta.method_meta.num_return_vals, + meta.method_meta.num_returns, actor_method_cpu, meta.actor_creation_function_descriptor, worker.current_session_and_job, @@ -617,7 +616,7 @@ class ActorHandle: invocation side, whereas a regular decorator can be used to change the behavior on the execution side. _ray_method_signatures: The signatures of the actor methods. - _ray_method_num_return_vals: The default number of return values for + _ray_method_num_returns: The default number of return values for each method. _ray_actor_method_cpus: The number of CPUs required by actor methods. _ray_original_handle: True if this is the original actor handle for a @@ -633,7 +632,7 @@ class ActorHandle: actor_id, method_decorators, method_signatures, - method_num_return_vals, + method_num_returns, actor_method_cpus, actor_creation_function_descriptor, session_and_job, @@ -643,7 +642,7 @@ class ActorHandle: self._ray_original_handle = original_handle self._ray_method_decorators = method_decorators self._ray_method_signatures = method_signatures - self._ray_method_num_return_vals = method_num_return_vals + self._ray_method_num_returns = method_num_returns self._ray_actor_method_cpus = actor_method_cpus self._ray_session_and_job = session_and_job self._ray_is_cross_language = language != Language.PYTHON @@ -664,7 +663,7 @@ class ActorHandle: method = ActorMethod( self, method_name, - self._ray_method_num_return_vals[method_name], + self._ray_method_num_returns[method_name], decorator=self._ray_method_decorators.get(method_name)) setattr(self, method_name, method) @@ -680,7 +679,7 @@ class ActorHandle: method_name, args=None, kwargs=None, - num_return_vals=None): + num_returns=None): """Method execution stub for an actor handle. This is the function that executes when @@ -692,7 +691,7 @@ class ActorHandle: method_name: The name of the actor method to execute. args: A list of arguments for the actor method. kwargs: A dictionary of keyword arguments for the actor method. - num_return_vals (int): The number of return values for the method. + num_returns (int): The number of return values for the method. Returns: object_refs: A list of object refs returned by the remote actor @@ -725,7 +724,7 @@ class ActorHandle: object_refs = worker.core_worker.submit_actor_task( self._ray_actor_language, self._ray_actor_id, function_descriptor, - list_args, num_return_vals, self._ray_actor_method_cpus) + list_args, num_returns, self._ray_actor_method_cpus) if len(object_refs) == 1: object_refs = object_refs[0] @@ -795,7 +794,7 @@ class ActorHandle: "actor_id": self._ray_actor_id, "method_decorators": self._ray_method_decorators, "method_signatures": self._ray_method_signatures, - "method_num_return_vals": self._ray_method_num_return_vals, + "method_num_returns": self._ray_method_num_returns, "actor_method_cpus": self._ray_actor_method_cpus, "actor_creation_function_descriptor": self. _ray_actor_creation_function_descriptor, @@ -830,7 +829,7 @@ class ActorHandle: state["actor_id"], state["method_decorators"], state["method_signatures"], - state["method_num_return_vals"], + state["method_num_returns"], state["actor_method_cpus"], state["actor_creation_function_descriptor"], worker.current_session_and_job) diff --git a/python/ray/cross_language.py b/python/ray/cross_language.py index 3f0c04714..8b9dc77da 100644 --- a/python/ray/cross_language.py +++ b/python/ray/cross_language.py @@ -66,7 +66,7 @@ def java_function(class_name, function_name): None, # memory, None, # object_store_memory, None, # resources, - None, # num_return_vals, + None, # num_returns, None, # max_calls, None, # max_retries placement_group=None, diff --git a/python/ray/exceptions.py b/python/ray/exceptions.py index 5b246afda..ec8cb6a88 100644 --- a/python/ray/exceptions.py +++ b/python/ray/exceptions.py @@ -41,12 +41,7 @@ class CrossLanguageError(RayError): ray_exception.formatted_exception_string)) -class RayConnectionError(RayError): - """Raised when ray is not yet connected but needs to be.""" - pass - - -class RayCancellationError(RayError): +class TaskCancelledError(RayError): """Raised when this task is cancelled. Attributes: @@ -143,7 +138,7 @@ class RayTaskError(RayError): return "\n".join(out) -class RayWorkerError(RayError): +class WorkerCrashedError(RayError): """Indicates that the worker died unexpectedly while executing a task.""" def __str__(self): @@ -161,8 +156,8 @@ class RayActorError(RayError): return "The actor died unexpectedly before finishing this task." -class RayletError(RayError): - """Indicates that the Raylet client has errored. +class RaySystemError(RayError): + """Indicates that Ray encountered a system error. This exception can be thrown when the raylet is killed. """ @@ -171,7 +166,7 @@ class RayletError(RayError): self.client_exc = client_exc def __str__(self): - return f"The Raylet died with this message: {self.client_exc}" + return f"System error: {self.client_exc}" class ObjectStoreFullError(RayError): @@ -184,21 +179,13 @@ class ObjectStoreFullError(RayError): def __str__(self): return super(ObjectStoreFullError, self).__str__() + ( "\n" - "The local object store is full of objects that are still in scope" - " and cannot be evicted. Try increasing the object store memory " - "available with ray.init(object_store_memory=). " - "You can also try setting an option to fallback to LRU eviction " - "when the object store is full by calling " - "ray.init(lru_evict=True). See also: " - "https://docs.ray.io/en/latest/memory-management.html.") + "The local object store is full of objects that are still in " + "scope and cannot be evicted. Tip: Use the `ray memory` command " + "to list active objects in the cluster.") -class UnreconstructableError(RayError): - """Indicates that an object is lost and cannot be reconstructed. - - Note, this exception only happens for actor objects. If actor's current - state is after object's creating task, the actor cannot re-run the task to - reconstruct the object. +class ObjectLostError(RayError): + """Indicates that an object has been lost due to node failure. Attributes: object_ref: ID of the object. @@ -208,17 +195,10 @@ class UnreconstructableError(RayError): self.object_ref = object_ref def __str__(self): - return ( - f"Object {self.object_ref.hex()} is lost " - "(either LRU evicted or deleted by user) and " - "cannot be reconstructed. Try increasing the object store " - "memory available with ray.init(object_store_memory=) " - "or setting object store limits with " - "ray.remote(object_store_memory=). " - "See also: https://docs.ray.io/en/latest/memory-management.html") + return (f"Object {self.object_ref.hex()} is lost due to node failure.") -class RayTimeoutError(RayError): +class GetTimeoutError(RayError): """Indicates that a call to the worker timed out.""" pass @@ -232,9 +212,9 @@ RAY_EXCEPTION_TYPES = [ PlasmaObjectNotAvailable, RayError, RayTaskError, - RayWorkerError, + WorkerCrashedError, RayActorError, ObjectStoreFullError, - UnreconstructableError, - RayTimeoutError, + ObjectLostError, + GetTimeoutError, ] diff --git a/python/ray/experimental/array/distributed/linalg.py b/python/ray/experimental/array/distributed/linalg.py index 6317ff676..bde572c85 100644 --- a/python/ray/experimental/array/distributed/linalg.py +++ b/python/ray/experimental/array/distributed/linalg.py @@ -7,7 +7,7 @@ from . import core __all__ = ["tsqr", "modified_lu", "tsqr_hr", "qr"] -@ray.remote(num_return_vals=2) +@ray.remote(num_returns=2) def tsqr(a): """Perform a QR decomposition of a tall-skinny matrix. @@ -83,7 +83,7 @@ def tsqr(a): # TODO(rkn): This is unoptimized, we really want a block version of this. # This is Algorithm 5 from # http://www.eecs.berkeley.edu/Pubs/TechRpts/2013/EECS-2013-175.pdf. -@ray.remote(num_return_vals=3) +@ray.remote(num_returns=3) def modified_lu(q): """Perform a modified LU decomposition of a matrix. @@ -121,7 +121,7 @@ def modified_lu(q): return ray.get(core.numpy_to_dist.remote(ray.put(L))), U, S -@ray.remote(num_return_vals=2) +@ray.remote(num_returns=2) def tsqr_hr_helper1(u, s, y_top_block, b): y_top = y_top_block[:b, :b] s_full = np.diag(s) @@ -137,7 +137,7 @@ def tsqr_hr_helper2(s, r_temp): # This is Algorithm 6 from # http://www.eecs.berkeley.edu/Pubs/TechRpts/2013/EECS-2013-175.pdf. -@ray.remote(num_return_vals=4) +@ray.remote(num_returns=4) def tsqr_hr(a): q, r_temp = tsqr.remote(a) y, u, s = modified_lu.remote(q) @@ -160,7 +160,7 @@ def qr_helper2(y_ri, a_rc): # This is Algorithm 7 from # http://www.eecs.berkeley.edu/Pubs/TechRpts/2013/EECS-2013-175.pdf. -@ray.remote(num_return_vals=2) +@ray.remote(num_returns=2) def qr(a): m, n = a.shape[0], a.shape[1] diff --git a/python/ray/experimental/array/remote/linalg.py b/python/ray/experimental/array/remote/linalg.py index bdde3a17b..47c3b98bf 100644 --- a/python/ray/experimental/array/remote/linalg.py +++ b/python/ray/experimental/array/remote/linalg.py @@ -18,12 +18,12 @@ def solve(a, b): return np.linalg.solve(a, b) -@ray.remote(num_return_vals=2) +@ray.remote(num_returns=2) def tensorsolve(a): raise NotImplementedError -@ray.remote(num_return_vals=2) +@ray.remote(num_returns=2) def tensorinv(a): raise NotImplementedError @@ -63,22 +63,22 @@ def det(a): return np.linalg.det(a) -@ray.remote(num_return_vals=3) +@ray.remote(num_returns=3) def svd(a): return np.linalg.svd(a) -@ray.remote(num_return_vals=2) +@ray.remote(num_returns=2) def eig(a): return np.linalg.eig(a) -@ray.remote(num_return_vals=2) +@ray.remote(num_returns=2) def eigh(a): return np.linalg.eigh(a) -@ray.remote(num_return_vals=4) +@ray.remote(num_returns=4) def lstsq(a, b): return np.linalg.lstsq(a) @@ -88,7 +88,7 @@ def norm(x): return np.linalg.norm(x) -@ray.remote(num_return_vals=2) +@ray.remote(num_returns=2) def qr(a): return np.linalg.qr(a) diff --git a/python/ray/ray_constants.py b/python/ray/ray_constants.py index 47aff0ebe..92aeac05f 100644 --- a/python/ray/ray_constants.py +++ b/python/ray/ray_constants.py @@ -130,7 +130,7 @@ DASHBOARD_DIED_ERROR = "dashboard_died" RAYLET_CONNECTION_ERROR = "raylet_connection_error" # Used in gpu detection -RESOURCE_CONSTRAINT_PREFIX = "GPUType:" +RESOURCE_CONSTRAINT_PREFIX = "gpu_type:" RESOURCES_ENVIRONMENT_VARIABLE = "RAY_OVERRIDE_RESOURCES" diff --git a/python/ray/remote_function.py b/python/ray/remote_function.py index a79eb0163..02a4735a2 100644 --- a/python/ray/remote_function.py +++ b/python/ray/remote_function.py @@ -40,7 +40,7 @@ class RemoteFunction: _object_store_memory: The object store memory request for this task. _resources: The default custom resource requirements for invocations of this remote function. - _num_return_vals: The default number of return values for invocations + _num_returns: The default number of return values for invocations of this remote function. _max_calls: The number of times a worker can execute this function before exiting. @@ -61,8 +61,8 @@ class RemoteFunction: """ def __init__(self, language, function, function_descriptor, num_cpus, - num_gpus, memory, object_store_memory, resources, - num_return_vals, max_calls, max_retries, placement_group, + num_gpus, memory, object_store_memory, resources, num_returns, + max_calls, max_retries, placement_group, placement_group_bundle_index): self._language = language self._function = function @@ -79,8 +79,8 @@ class RemoteFunction: "setting object_store_memory is not implemented for tasks") self._object_store_memory = None self._resources = resources - self._num_return_vals = (DEFAULT_REMOTE_FUNCTION_NUM_RETURN_VALS if - num_return_vals is None else num_return_vals) + self._num_returns = (DEFAULT_REMOTE_FUNCTION_NUM_RETURN_VALS + if num_returns is None else num_returns) self._max_calls = (DEFAULT_REMOTE_FUNCTION_MAX_CALLS if max_calls is None else max_calls) self._max_retries = (DEFAULT_REMOTE_FUNCTION_NUM_TASK_RETRIES @@ -107,7 +107,7 @@ class RemoteFunction: def _submit(self, args=None, kwargs=None, - num_return_vals=None, + num_returns=None, num_cpus=None, num_gpus=None, resources=None): @@ -116,7 +116,7 @@ class RemoteFunction: return self._remote( args=args, kwargs=kwargs, - num_return_vals=num_return_vals, + num_returns=num_returns, num_cpus=num_cpus, num_gpus=num_gpus, resources=resources) @@ -144,7 +144,7 @@ class RemoteFunction: def _remote(self, args=None, kwargs=None, - num_return_vals=None, + num_returns=None, num_cpus=None, num_gpus=None, memory=None, @@ -182,8 +182,8 @@ class RemoteFunction: kwargs = {} if kwargs is None else kwargs args = [] if args is None else args - if num_return_vals is None: - num_return_vals = self._num_return_vals + if num_returns is None: + num_returns = self._num_returns if max_retries is None: max_retries = self._max_retries @@ -213,7 +213,7 @@ class RemoteFunction: "cannot be executed locally." object_refs = worker.core_worker.submit_task( self._language, self._function_descriptor, list_args, - num_return_vals, resources, max_retries, placement_group.id, + num_returns, resources, max_retries, placement_group.id, placement_group_bundle_index) if len(object_refs) == 1: diff --git a/python/ray/serialization.py b/python/ray/serialization.py index faee81971..ae51289c5 100644 --- a/python/ray/serialization.py +++ b/python/ray/serialization.py @@ -13,9 +13,9 @@ from ray.exceptions import ( PlasmaObjectNotAvailable, RayTaskError, RayActorError, - RayCancellationError, - RayWorkerError, - UnreconstructableError, + TaskCancelledError, + WorkerCrashedError, + ObjectLostError, ) from ray._raylet import ( split_buffer, @@ -265,14 +265,13 @@ class SerializationContext: obj = self._deserialize_msgpack_data(data, metadata) return RayError.from_bytes(obj) elif error_type == ErrorType.Value("WORKER_DIED"): - return RayWorkerError() + return WorkerCrashedError() elif error_type == ErrorType.Value("ACTOR_DIED"): return RayActorError() elif error_type == ErrorType.Value("TASK_CANCELLED"): - return RayCancellationError() + return TaskCancelledError() elif error_type == ErrorType.Value("OBJECT_UNRECONSTRUCTABLE"): - return UnreconstructableError( - ray.ObjectRef(object_ref.binary())) + return ObjectLostError(ray.ObjectRef(object_ref.binary())) else: assert error_type != ErrorType.Value("OBJECT_IN_PLASMA"), \ "Tried to get object that has been promoted to plasma." diff --git a/python/ray/serve/tests/test_router.py b/python/ray/serve/tests/test_router.py index 3634b11e7..a7e06a1fd 100644 --- a/python/ray/serve/tests/test_router.py +++ b/python/ray/serve/tests/test_router.py @@ -191,7 +191,7 @@ async def test_router_use_max_concurrency(serve_instance): second_query = q.enqueue_request.remote(RequestMetadata("svc", None), 1) # Neither queries should be available - with pytest.raises(ray.exceptions.RayTimeoutError): + with pytest.raises(ray.exceptions.GetTimeoutError): ray.get([first_query, second_query], timeout=0.2) # Let's retrieve the router internal state diff --git a/python/ray/state.py b/python/ray/state.py index add1b21ae..05044cce4 100644 --- a/python/ray/state.py +++ b/python/ray/state.py @@ -48,7 +48,7 @@ class GlobalState: """ if (self.redis_client is None or self.redis_clients is None or self.global_state_accessor is None): - raise ray.exceptions.RayConnectionError( + raise ray.exceptions.RaySystemError( "Ray has not been started yet. You can start Ray with " "'ray.init()'.") diff --git a/python/ray/tests/test_actor.py b/python/ray/tests/test_actor.py index d5ff824ba..6b7cc662b 100644 --- a/python/ray/tests/test_actor.py +++ b/python/ray/tests/test_actor.py @@ -646,15 +646,15 @@ def test_multiple_return_values(ray_start_regular_shared): def method0(self): return 1 - @ray.method(num_return_vals=1) + @ray.method(num_returns=1) def method1(self): return 1 - @ray.method(num_return_vals=2) + @ray.method(num_returns=2) def method2(self): return 1, 2 - @ray.method(num_return_vals=3) + @ray.method(num_returns=3) def method3(self): return 1, 2, 3 diff --git a/python/ray/tests/test_actor_failures.py b/python/ray/tests/test_actor_failures.py index 63f8f9106..7ddd4b26b 100644 --- a/python/ray/tests/test_actor_failures.py +++ b/python/ray/tests/test_actor_failures.py @@ -63,7 +63,7 @@ def test_actor_eviction(ray_start_regular): val = ray.get(obj) assert isinstance(val, np.ndarray), val num_success += 1 - except ray.exceptions.UnreconstructableError: + except ray.exceptions.ObjectLostError: num_evicted += 1 # Some objects should have been evicted, and some should still be in the # object store. diff --git a/python/ray/tests/test_basic.py b/python/ray/tests/test_basic.py index c076a2f20..3bd048593 100644 --- a/python/ray/tests/test_basic.py +++ b/python/ray/tests/test_basic.py @@ -64,12 +64,12 @@ def test_submit_api(shutdown_only): def g(): return ray.get_gpu_ids() - assert f._remote([0], num_return_vals=0) is None - id1 = f._remote(args=[1], num_return_vals=1) + assert f._remote([0], num_returns=0) is None + id1 = f._remote(args=[1], num_returns=1) assert ray.get(id1) == [0] - id1, id2 = f._remote(args=[2], num_return_vals=2) + id1, id2 = f._remote(args=[2], num_returns=2) assert ray.get([id1, id2]) == [0, 1] - id1, id2, id3 = f._remote(args=[3], num_return_vals=3) + id1, id2, id3 = f._remote(args=[3], num_returns=3) assert ray.get([id1, id2, id3]) == [0, 1, 2] assert ray.get( g._remote(args=[], num_cpus=1, num_gpus=1, @@ -107,7 +107,7 @@ def test_submit_api(shutdown_only): ray.get(a2.method._remote()) id1, id2, id3, id4 = a.method._remote( - args=["test"], kwargs={"b": 2}, num_return_vals=4) + args=["test"], kwargs={"b": 2}, num_returns=4) assert ray.get([id1, id2, id3, id4]) == [0, 1, "test", 2] diff --git a/python/ray/tests/test_basic_2.py b/python/ray/tests/test_basic_2.py index 354eb89cf..216481b56 100644 --- a/python/ray/tests/test_basic_2.py +++ b/python/ray/tests/test_basic_2.py @@ -12,7 +12,7 @@ from unittest.mock import MagicMock, patch import ray import ray.cluster_utils import ray.test_utils -from ray.exceptions import RayTimeoutError +from ray.exceptions import GetTimeoutError logger = logging.getLogger(__name__) @@ -351,7 +351,7 @@ def test_system_config_when_connecting(ray_start_cluster): ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8)) # This would not raise an exception if object pinning was enabled. - with pytest.raises(ray.exceptions.UnreconstructableError): + with pytest.raises(ray.exceptions.ObjectLostError): ray.get(obj_ref) @@ -377,7 +377,7 @@ def test_get_with_timeout(ray_start_regular_shared): # Check that get() raises a TimeoutError after the timeout if the object # is not ready yet. result_id = signal.wait.remote() - with pytest.raises(RayTimeoutError): + with pytest.raises(GetTimeoutError): ray.get(result_id, timeout=0.1) # Check that a subsequent get() returns early. diff --git a/python/ray/tests/test_cancel.py b/python/ray/tests/test_cancel.py index 9d336f192..b4e4aa439 100644 --- a/python/ray/tests/test_cancel.py +++ b/python/ray/tests/test_cancel.py @@ -5,18 +5,18 @@ import time import pytest import ray -from ray.exceptions import RayCancellationError, RayTaskError, \ - RayTimeoutError, RayWorkerError, \ - UnreconstructableError +from ray.exceptions import TaskCancelledError, RayTaskError, \ + GetTimeoutError, WorkerCrashedError, \ + ObjectLostError from ray.test_utils import SignalActor def valid_exceptions(use_force): if use_force: - return (RayTaskError, RayCancellationError, RayWorkerError, - UnreconstructableError) + return (RayTaskError, TaskCancelledError, WorkerCrashedError, + ObjectLostError) else: - return (RayTaskError, RayCancellationError) + return (RayTaskError, TaskCancelledError) @pytest.mark.parametrize("use_force", [True, False]) @@ -50,10 +50,10 @@ def test_cancel_chain(ray_start_regular, use_force): with pytest.raises(valid_exceptions(use_force)): ray.get(ob) - with pytest.raises(RayTimeoutError): + with pytest.raises(GetTimeoutError): ray.get(obj1, timeout=.1) - with pytest.raises(RayTimeoutError): + with pytest.raises(GetTimeoutError): ray.get(obj2, timeout=.1) signaler2.send.remote() @@ -249,7 +249,7 @@ def test_remote_cancel(ray_start_regular, use_force): outer = remote_wait.remote([sig]) inner = ray.get(outer)[0] - with pytest.raises(RayTimeoutError): + with pytest.raises(GetTimeoutError): ray.get(inner, timeout=1) ray.cancel(inner, force=use_force) diff --git a/python/ray/tests/test_component_failures_2.py b/python/ray/tests/test_component_failures_2.py index 4a056ba79..5bfd159cc 100644 --- a/python/ray/tests/test_component_failures_2.py +++ b/python/ray/tests/test_component_failures_2.py @@ -70,7 +70,8 @@ def test_worker_failed(ray_start_workers_separate_multinode): for object_ref in object_refs: try: ray.get(object_ref) - except (ray.exceptions.RayTaskError, ray.exceptions.RayWorkerError): + except (ray.exceptions.RayTaskError, + ray.exceptions.WorkerCrashedError): pass diff --git a/python/ray/tests/test_error_ray_not_initialized.py b/python/ray/tests/test_error_ray_not_initialized.py index 84f1c2ae0..dd006b239 100644 --- a/python/ray/tests/test_error_ray_not_initialized.py +++ b/python/ray/tests/test_error_ray_not_initialized.py @@ -36,7 +36,7 @@ def test_errors_before_initializing_ray(): for api_method in api_methods: print(api_method) with pytest.raises( - ray.exceptions.RayConnectionError, + ray.exceptions.RaySystemError, match="Ray has not been started yet."): api_method() diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index a3667f5aa..b209b7fc6 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -30,7 +30,7 @@ def test_failed_task(ray_start_regular, error_pubsub): def throw_exception_fct2(): raise Exception("Test function 2 intentionally failed.") - @ray.remote(num_return_vals=3) + @ray.remote(num_returns=3) def throw_exception_fct3(x): raise Exception("Test function 3 intentionally failed.") @@ -362,7 +362,7 @@ def test_worker_dying(ray_start_regular, error_pubsub): def f(): eval("exit()") - with pytest.raises(ray.exceptions.RayWorkerError): + with pytest.raises(ray.exceptions.WorkerCrashedError): ray.get(f.remote()) errors = get_error_message(p, 1, ray_constants.WORKER_DIED_PUSH_ERROR) @@ -901,7 +901,7 @@ def test_raylet_crash_when_get(ray_start_regular): thread = threading.Thread(target=sleep_to_kill_raylet) thread.start() - with pytest.raises(ray.exceptions.UnreconstructableError): + with pytest.raises(ray.exceptions.ObjectLostError): ray.get(object_ref) thread.join() @@ -1062,7 +1062,7 @@ def test_eviction(ray_start_cluster): # Evict the object. ray.internal.free([obj]) # ray.get throws an exception. - with pytest.raises(ray.exceptions.UnreconstructableError): + with pytest.raises(ray.exceptions.ObjectLostError): ray.get(obj) @ray.remote diff --git a/python/ray/tests/test_memory_limits.py b/python/ray/tests/test_memory_limits.py index 2fcf36d75..fd7f487fb 100644 --- a/python/ray/tests/test_memory_limits.py +++ b/python/ray/tests/test_memory_limits.py @@ -5,7 +5,7 @@ import ray MB = 1024 * 1024 -OBJECT_EVICTED = ray.exceptions.UnreconstructableError +OBJECT_EVICTED = ray.exceptions.ObjectLostError OBJECT_TOO_LARGE = ray.exceptions.ObjectStoreFullError diff --git a/python/ray/tests/test_mini.py b/python/ray/tests/test_mini.py index 55b623733..dae1e11bd 100644 --- a/python/ray/tests/test_mini.py +++ b/python/ray/tests/test_mini.py @@ -15,7 +15,7 @@ def test_basic_task_api(ray_start_regular): # Test multiple return values. - @ray.remote(num_return_vals=3) + @ray.remote(num_returns=3) def f_multiple_returns(): return 1, 2, 3 diff --git a/python/ray/tests/test_multinode_failures.py b/python/ray/tests/test_multinode_failures.py index 025ae8073..58f17cc1a 100644 --- a/python/ray/tests/test_multinode_failures.py +++ b/python/ray/tests/test_multinode_failures.py @@ -70,7 +70,8 @@ def test_worker_failed(ray_start_workers_separate_multinode): for object_ref in object_refs: try: ray.get(object_ref) - except (ray.exceptions.RayTaskError, ray.exceptions.RayWorkerError): + except (ray.exceptions.RayTaskError, + ray.exceptions.WorkerCrashedError): pass diff --git a/python/ray/tests/test_placement_group.py b/python/ray/tests/test_placement_group.py index c23420196..0b5fdf67f 100644 --- a/python/ray/tests/test_placement_group.py +++ b/python/ray/tests/test_placement_group.py @@ -355,7 +355,7 @@ def test_remove_placement_group(ray_start_cluster): # That means this request should fail. with pytest.raises(ray.exceptions.RayActorError, match="actor died"): ray.get(a.f.remote(), timeout=3.0) - with pytest.raises(ray.exceptions.RayWorkerError): + with pytest.raises(ray.exceptions.WorkerCrashedError): ray.get(task_ref) @@ -576,7 +576,7 @@ def test_pending_placement_group_wait(ray_start_cluster): assert len(ready) == 0 table = ray.experimental.placement_group_table(placement_group) assert table["state"] == "PENDING" - with pytest.raises(ray.exceptions.RayTimeoutError): + with pytest.raises(ray.exceptions.GetTimeoutError): ray.get(placement_group.ready(), timeout=0.1) diff --git a/python/ray/tests/test_queue.py b/python/ray/tests/test_queue.py index 72bbb821c..cfb254884 100644 --- a/python/ray/tests/test_queue.py +++ b/python/ray/tests/test_queue.py @@ -1,7 +1,7 @@ import pytest import ray -from ray.exceptions import RayTimeoutError +from ray.exceptions import GetTimeoutError from ray.experimental.queue import Queue, Empty, Full @@ -80,7 +80,7 @@ def test_async_get(ray_start_regular): with pytest.raises(Empty): q.get_nowait() - with pytest.raises(RayTimeoutError): + with pytest.raises(GetTimeoutError): ray.get(future, timeout=0.1) # task not canceled on timeout. q.put(1) @@ -95,7 +95,7 @@ def test_async_put(ray_start_regular): with pytest.raises(Full): q.put_nowait(3) - with pytest.raises(RayTimeoutError): + with pytest.raises(GetTimeoutError): ray.get(future, timeout=0.1) # task not canceled on timeout. assert q.get() == 1 diff --git a/python/ray/tests/test_reconstruction.py b/python/ray/tests/test_reconstruction.py index ab4a9b4ef..f6599483d 100644 --- a/python/ray/tests/test_reconstruction.py +++ b/python/ray/tests/test_reconstruction.py @@ -110,7 +110,7 @@ def test_reconstruction_cached_dependency(ray_start_cluster, else: with pytest.raises(ray.exceptions.RayTaskError) as e: ray.get(dependent_task.remote(obj)) - with pytest.raises(ray.exceptions.UnreconstructableError): + with pytest.raises(ray.exceptions.ObjectLostError): raise e.as_instanceof_cause() @@ -159,7 +159,7 @@ def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled): else: with pytest.raises(ray.exceptions.RayTaskError) as e: ray.get(dependent_task.remote(obj)) - with pytest.raises(ray.exceptions.UnreconstructableError): + with pytest.raises(ray.exceptions.ObjectLostError): raise e.as_instanceof_cause() @@ -215,7 +215,7 @@ def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled): # been evicted. try: ray.get(result) - except ray.exceptions.UnreconstructableError: + except ray.exceptions.ObjectLostError: pass @@ -284,7 +284,7 @@ def test_basic_reconstruction_actor_task(ray_start_cluster, else: with pytest.raises(ray.exceptions.RayTaskError) as e: ray.get(dependent_task.remote(obj)) - with pytest.raises(ray.exceptions.UnreconstructableError): + with pytest.raises(ray.exceptions.ObjectLostError): raise e.as_instanceof_cause() # Make sure the actor handle is still usable. @@ -356,8 +356,7 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster, return True except ray.exceptions.RayActorError: return False - except (ray.exceptions.RayTaskError, - ray.exceptions.UnreconstructableError): + except (ray.exceptions.RayTaskError, ray.exceptions.ObjectLostError): return True wait_for_condition(probe) @@ -369,7 +368,7 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster, x = a.dependent_task.remote(obj) print(x) ray.get(x) - with pytest.raises(ray.exceptions.UnreconstructableError): + with pytest.raises(ray.exceptions.ObjectLostError): raise e.as_instanceof_cause() @@ -429,7 +428,7 @@ def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled): dependent_task.options(resources={ "node1": 1 }).remote(obj)) - with pytest.raises(ray.exceptions.UnreconstructableError): + with pytest.raises(ray.exceptions.ObjectLostError): raise e.as_instanceof_cause() @@ -480,7 +479,7 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled): else: with pytest.raises(ray.exceptions.RayTaskError) as e: ray.get(dependent_task.remote(obj)) - with pytest.raises(ray.exceptions.UnreconstructableError): + with pytest.raises(ray.exceptions.ObjectLostError): raise e.as_instanceof_cause() diff --git a/python/ray/tests/test_reference_counting.py b/python/ray/tests/test_reference_counting.py index 787d93144..ba6a4b067 100644 --- a/python/ray/tests/test_reference_counting.py +++ b/python/ray/tests/test_reference_counting.py @@ -354,7 +354,7 @@ def test_basic_serialized_reference(one_worker_100MiB, use_ray_put, failure): try: ray.get(obj_ref) assert not failure - except ray.exceptions.RayWorkerError: + except ray.exceptions.WorkerCrashedError: assert failure # Reference should be gone, check that array gets evicted. @@ -403,7 +403,7 @@ def test_recursive_serialized_reference(one_worker_100MiB, use_ray_put, assert ray.get(tail_oid) is None assert not failure # TODO(edoakes): this should raise WorkerError. - except ray.exceptions.UnreconstructableError: + except ray.exceptions.ObjectLostError: assert failure # Reference should be gone, check that array gets evicted. @@ -501,8 +501,7 @@ def test_worker_holding_serialized_reference(one_worker_100MiB, use_ray_put, try: ray.get(child_return_id) assert not failure - except (ray.exceptions.RayWorkerError, - ray.exceptions.UnreconstructableError): + except (ray.exceptions.WorkerCrashedError, ray.exceptions.ObjectLostError): assert failure del child_return_id diff --git a/python/ray/tests/test_reference_counting_2.py b/python/ray/tests/test_reference_counting_2.py index 092314357..17c578408 100644 --- a/python/ray/tests/test_reference_counting_2.py +++ b/python/ray/tests/test_reference_counting_2.py @@ -91,7 +91,7 @@ def test_recursively_nest_ids(one_worker_100MiB, use_ray_put, failure): ray.get(tail_oid) assert not failure # TODO(edoakes): this should raise WorkerError. - except ray.exceptions.UnreconstructableError: + except ray.exceptions.ObjectLostError: assert failure # Reference should be gone, check that array gets evicted. @@ -130,7 +130,7 @@ def test_return_object_ref(one_worker_100MiB, use_ray_put, failure): # Check that the owner dying unpins the object. This should execute on # the same worker because there is only one started and the other tasks # have finished. - with pytest.raises(ray.exceptions.RayWorkerError): + with pytest.raises(ray.exceptions.WorkerCrashedError): ray.get(exit.remote()) else: # Check that removing the inner ID unpins the object. @@ -173,7 +173,7 @@ def test_pass_returned_object_ref(one_worker_100MiB, use_ray_put, failure): # Should succeed because inner_oid is pinned if no failure. ray.get(pending_oid) assert not failure - except ray.exceptions.RayWorkerError: + except ray.exceptions.WorkerCrashedError: assert failure def ref_not_exists(): @@ -232,7 +232,7 @@ def test_recursively_pass_returned_object_ref(one_worker_100MiB, use_ray_put, _fill_object_store_and_get(inner_oid) assert not failure # TODO(edoakes): this should raise WorkerError. - except ray.exceptions.UnreconstructableError: + except ray.exceptions.ObjectLostError: assert failure inner_oid_bytes = inner_oid.binary() @@ -311,7 +311,7 @@ def test_borrowed_id_failure(one_worker_100MiB, failure): def resolve_ref(self): assert self.ref is not None if failure: - with pytest.raises(ray.exceptions.UnreconstructableError): + with pytest.raises(ray.exceptions.ObjectLostError): ray.get(self.ref) else: ray.get(self.ref) diff --git a/python/ray/tests/test_serialization.py b/python/ray/tests/test_serialization.py index 6928981aa..922c72937 100644 --- a/python/ray/tests/test_serialization.py +++ b/python/ray/tests/test_serialization.py @@ -422,7 +422,7 @@ def test_register_class(ray_start_2_cpus): assert ray.get(h2.remote(10)).value == 10 # Test registering multiple classes with the same name. - @ray.remote(num_return_vals=3) + @ray.remote(num_returns=3) def j(): class Class0: def method0(self): diff --git a/python/ray/tests/test_stress_failure.py b/python/ray/tests/test_stress_failure.py index 88b7fa04b..c81bbe879 100644 --- a/python/ray/tests/test_stress_failure.py +++ b/python/ray/tests/test_stress_failure.py @@ -335,10 +335,9 @@ def test_driver_put_errors(ray_start_object_store_memory, error_pubsub): return len(errors) > 1 errors = wait_for_errors(p, error_check) - assert all( - error.type == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR - or "ray.exceptions.UnreconstructableError" in error.error_messages - for error in errors) + assert all(error.type == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR + or "ray.exceptions.ObjectLostError" in error.error_messages + for error in errors) # NOTE(swang): This test tries to launch 1000 workers and breaks. diff --git a/python/ray/tests/test_unreconstructable_errors.py b/python/ray/tests/test_unreconstructable_errors.py index d1dceca61..a86d50da4 100644 --- a/python/ray/tests/test_unreconstructable_errors.py +++ b/python/ray/tests/test_unreconstructable_errors.py @@ -4,7 +4,7 @@ import unittest import ray -class TestUnreconstructableErrors(unittest.TestCase): +class TestObjectLostErrors(unittest.TestCase): def setUp(self): ray.init( num_cpus=1, @@ -20,7 +20,7 @@ class TestUnreconstructableErrors(unittest.TestCase): ray.get(x_id) for _ in range(20): ray.put(np.zeros(10 * 1024 * 1024)) - self.assertRaises(ray.exceptions.UnreconstructableError, + self.assertRaises(ray.exceptions.ObjectLostError, lambda: ray.get(x_id)) diff --git a/python/ray/tune/ray_trial_executor.py b/python/ray/tune/ray_trial_executor.py index ae2ef163c..3018ea261 100644 --- a/python/ray/tune/ray_trial_executor.py +++ b/python/ray/tune/ray_trial_executor.py @@ -8,7 +8,7 @@ import traceback from contextlib import contextmanager import ray -from ray.exceptions import RayTimeoutError +from ray.exceptions import GetTimeoutError from ray import ray_constants from ray.resource_spec import ResourceSpec from ray.tune.durable_trainable import DurableTrainable @@ -397,7 +397,7 @@ class RayTrialExecutor(TrialExecutor): reset_val = ray.get( trainable.reset.remote(new_config, trial.logdir), timeout=DEFAULT_GET_TIMEOUT) - except RayTimeoutError: + except GetTimeoutError: logger.exception("Trial %s: reset timed out.", trial) return False return reset_val diff --git a/python/ray/worker.py b/python/ray/worker.py index 6f87e373e..3f2e03cee 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -41,7 +41,7 @@ from ray import import_thread from ray import profiling from ray.exceptions import ( - RayConnectionError, + RaySystemError, RayError, RayTaskError, ObjectStoreFullError, @@ -202,8 +202,8 @@ class Worker: Exception: An exception is raised if the worker is not connected. """ if not self.connected: - raise RayConnectionError("Ray has not been started yet. You can " - "start Ray with 'ray.init()'.") + raise RaySystemError("Ray has not been started yet. You can " + "start Ray with 'ray.init()'.") def set_mode(self, mode): """Set the mode of the worker. @@ -568,7 +568,7 @@ def init( the distributed plasma store is lost due to node failure, Ray will attempt to reconstruct the object by re-executing the task that created the object. Arguments to the task will be recursively - reconstructed. If False, then ray.UnreconstructableError will be + reconstructed. If False, then ray.ObjectLostError will be thrown. _redis_max_memory: Redis max memory. _node_ip_address (str): The IP address of the node that we are on. @@ -589,7 +589,7 @@ def init( _java_worker_options: Overwrite the options to start Java workers. _lru_evict (bool): If True, when an object store is full, it will evict objects in LRU order to make more space and when under memory - pressure, ray.UnreconstructableError may be thrown. If False, then + pressure, ray.ObjectLostError may be thrown. If False, then reference counting will be used to decide which objects are safe to evict and when under memory pressure, ray.ObjectStoreFullError may be thrown. @@ -1383,7 +1383,7 @@ def get(object_refs, *, timeout=None): A Python object or a list of Python objects. Raises: - RayTimeoutError: A RayTimeoutError is raised if a timeout is set and + GetTimeoutError: A GetTimeoutError is raised if a timeout is set and the get takes longer than timeout to return. Exception: An exception is raised if the task that created the object or that created one of the objects raised an exception. @@ -1417,7 +1417,7 @@ def get(object_refs, *, timeout=None): for i, value in enumerate(values): if isinstance(value, RayError): last_task_error_raise_time = time.time() - if isinstance(value, ray.exceptions.UnreconstructableError): + if isinstance(value, ray.exceptions.ObjectLostError): worker.core_worker.dump_object_store_memory_usage() if isinstance(value, RayTaskError): raise value.as_instanceof_cause() @@ -1611,7 +1611,7 @@ def cancel(object_ref, *, force=False): Only non-actor tasks can be canceled. Canceled tasks will not be retried (max_retries will not be respected). - Calling ray.get on a canceled task will raise a RayCancellationError. + Calling ray.get on a canceled task will raise a TaskCancelledError. Args: object_ref (ObjectRef): ObjectRef returned by the task @@ -1642,7 +1642,7 @@ def _mode(worker=global_worker): return worker.mode -def make_decorator(num_return_vals=None, +def make_decorator(num_returns=None, num_cpus=None, num_gpus=None, memory=None, @@ -1682,13 +1682,12 @@ def make_decorator(num_return_vals=None, " integer") return ray.remote_function.RemoteFunction( Language.PYTHON, function_or_class, None, num_cpus, num_gpus, - memory, object_store_memory, resources, num_return_vals, - max_calls, max_retries, placement_group, - placement_group_bundle_index) + memory, object_store_memory, resources, num_returns, max_calls, + max_retries, placement_group, placement_group_bundle_index) if inspect.isclass(function_or_class): - if num_return_vals is not None: - raise TypeError("The keyword 'num_return_vals' is not " + if num_returns is not None: + raise TypeError("The keyword 'num_returns' is not " "allowed for actors.") if max_calls is not None: raise TypeError("The keyword 'max_calls' is not " @@ -1732,7 +1731,7 @@ def remote(*args, **kwargs): It can also be used with specific keyword arguments: - * **num_return_vals:** This is only for *remote functions*. It specifies + * **num_returns:** This is only for *remote functions*. It specifies the number of object refs returned by the remote function invocation. * **num_cpus:** The quantity of CPU cores to reserve for this task or for the lifetime of the actor. @@ -1774,7 +1773,7 @@ def remote(*args, **kwargs): .. code-block:: python - @ray.remote(num_gpus=1, max_calls=1, num_return_vals=2) + @ray.remote(num_gpus=1, max_calls=1, num_returns=2) def f(): return 1, 2 @@ -1789,7 +1788,7 @@ def remote(*args, **kwargs): .. code-block:: python - @ray.remote(num_gpus=1, max_calls=1, num_return_vals=2) + @ray.remote(num_gpus=1, max_calls=1, num_returns=2) def f(): return 1, 2 g = f.options(num_gpus=2, max_calls=None) @@ -1815,15 +1814,15 @@ def remote(*args, **kwargs): error_string = ("The @ray.remote decorator must be applied either " "with no arguments and no parentheses, for example " "'@ray.remote', or it must be applied using some of " - "the arguments 'num_return_vals', 'num_cpus', 'num_gpus', " + "the arguments 'num_returns', 'num_cpus', 'num_gpus', " "'memory', 'object_store_memory', 'resources', " "'max_calls', or 'max_restarts', like " - "'@ray.remote(num_return_vals=2, " + "'@ray.remote(num_returns=2, " "resources={\"CustomResource\": 1})'.") assert len(args) == 0 and len(kwargs) > 0, error_string for key in kwargs: assert key in [ - "num_return_vals", + "num_returns", "num_cpus", "num_gpus", "memory", @@ -1848,7 +1847,7 @@ def remote(*args, **kwargs): assert "GPU" not in resources, "Use the 'num_gpus' argument." # Handle other arguments. - num_return_vals = kwargs.get("num_return_vals") + num_returns = kwargs.get("num_returns") max_calls = kwargs.get("max_calls") max_restarts = kwargs.get("max_restarts") max_task_retries = kwargs.get("max_task_retries") @@ -1857,7 +1856,7 @@ def remote(*args, **kwargs): max_retries = kwargs.get("max_retries") return make_decorator( - num_return_vals=num_return_vals, + num_returns=num_returns, num_cpus=num_cpus, num_gpus=num_gpus, memory=memory, diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index 58d6d303d..0cef1b178 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -619,7 +619,7 @@ class RolloutWorker(ParallelIteratorWorker): return batch @DeveloperAPI - @ray.method(num_return_vals=2) + @ray.method(num_returns=2) def sample_with_count(self) -> Tuple[SampleBatchType, int]: """Same as sample() but returns the count as a separate future.""" batch = self.sample()