[api] Second round of 1.0 API changes: exceptions, num_return_vals (#10377)

This commit is contained in:
Eric Liang
2020-08-28 19:57:02 -07:00
committed by GitHub
parent b1f3c9e10e
commit 2a204260a8
37 changed files with 180 additions and 204 deletions
+13 -13
View File
@@ -101,11 +101,11 @@ import ray.ray_constants as ray_constants
from ray import profiling
from ray.exceptions import (
RayError,
RayletError,
RaySystemError,
RayTaskError,
ObjectStoreFullError,
RayTimeoutError,
RayCancellationError
GetTimeoutError,
TaskCancelledError
)
from ray.utils import decode
import gc
@@ -143,11 +143,11 @@ cdef int check_status(const CRayStatus& status) nogil except -1:
elif status.IsInterrupted():
raise KeyboardInterrupt()
elif status.IsTimedOut():
raise RayTimeoutError(message)
raise GetTimeoutError(message)
elif status.IsNotFound():
raise ValueError(message)
else:
raise RayletError(message)
raise RaySystemError(message)
cdef RayObjectsToDataMetadataPairs(
const c_vector[shared_ptr[CRayObject]] objects):
@@ -481,7 +481,7 @@ cdef execute_task(
outputs = function_executor(*args, **kwargs)
task_exception = False
except KeyboardInterrupt as e:
raise RayCancellationError(
raise TaskCancelledError(
core_worker.get_current_task_id())
if c_return_ids.size() == 1:
outputs = (outputs,)
@@ -489,7 +489,7 @@ cdef execute_task(
# was exiting and was raised after the except block.
if not check_signals().ok():
task_exception = True
raise RayCancellationError(
raise TaskCancelledError(
core_worker.get_current_task_id())
# Store the outputs in the object store.
with core_worker.profile_event(b"task:store_outputs"):
@@ -976,7 +976,7 @@ cdef class CoreWorker:
Language language,
FunctionDescriptor function_descriptor,
args,
int num_return_vals,
int num_returns,
resources,
int max_retries,
PlacementGroupID placement_group_id,
@@ -993,7 +993,7 @@ cdef class CoreWorker:
with self.profile_event(b"submit_task"):
prepare_resources(resources, &c_resources)
task_options = CTaskOptions(
num_return_vals, c_resources)
num_returns, c_resources)
ray_function = CRayFunction(
language.lang, function_descriptor.descriptor)
prepare_args(self, language, args, &args_vector)
@@ -1103,7 +1103,7 @@ cdef class CoreWorker:
ActorID actor_id,
FunctionDescriptor function_descriptor,
args,
int num_return_vals,
int num_returns,
double num_method_cpus):
cdef:
@@ -1117,7 +1117,7 @@ cdef class CoreWorker:
with self.profile_event(b"submit_task"):
if num_method_cpus > 0:
c_resources[b"CPU"] = num_method_cpus
task_options = CTaskOptions(num_return_vals, c_resources)
task_options = CTaskOptions(num_returns, c_resources)
ray_function = CRayFunction(
language.lang, function_descriptor.descriptor)
prepare_args(self, language, args, &args_vector)
@@ -1209,7 +1209,7 @@ cdef class CoreWorker:
return ray.actor.ActorHandle(language, actor_id,
method_meta.decorators,
method_meta.signatures,
method_meta.num_return_vals,
method_meta.num_returns,
actor_method_cpu,
actor_creation_function_descriptor,
worker.current_session_and_job)
@@ -1217,7 +1217,7 @@ cdef class CoreWorker:
return ray.actor.ActorHandle(language, actor_id,
{}, # method decorators
{}, # method signatures
{}, # method num_return_vals
{}, # method num_returns
0, # actor method cpu
actor_creation_function_descriptor,
worker.current_session_and_job)
+29 -30
View File
@@ -23,7 +23,7 @@ def method(*args, **kwargs):
@ray.remote
class Foo:
@ray.method(num_return_vals=2)
@ray.method(num_returns=2)
def bar(self):
return 1, 2
@@ -32,16 +32,16 @@ def method(*args, **kwargs):
_, _ = f.bar.remote()
Args:
num_return_vals: The number of object refs that should be returned by
num_returns: The number of object refs that should be returned by
invocations of this actor method.
"""
assert len(args) == 0
assert len(kwargs) == 1
assert "num_return_vals" in kwargs
num_return_vals = kwargs["num_return_vals"]
assert "num_returns" in kwargs
num_returns = kwargs["num_returns"]
def annotate_method(method):
method.__ray_num_return_vals__ = num_return_vals
method.__ray_num_returns__ = num_returns
return method
return annotate_method
@@ -58,7 +58,7 @@ class ActorMethod:
Attributes:
_actor: A handle to the actor.
_method_name: The name of the actor method.
_num_return_vals: The default number of return values that the method
_num_returns: The default number of return values that the method
invocation should return.
_decorator: An optional decorator that should be applied to the actor
method invocation (as opposed to the actor method execution) before
@@ -72,12 +72,12 @@ class ActorMethod:
def __init__(self,
actor,
method_name,
num_return_vals,
num_returns,
decorator=None,
hardref=False):
self._actor_ref = weakref.ref(actor)
self._method_name = method_name
self._num_return_vals = num_return_vals
self._num_returns = num_returns
# This is a decorator that is used to wrap the function invocation (as
# opposed to the function execution). The decorator must return a
# function that takes in two arguments ("args" and "kwargs"). In most
@@ -100,9 +100,9 @@ class ActorMethod:
def remote(self, *args, **kwargs):
return self._remote(args, kwargs)
def _remote(self, args=None, kwargs=None, num_return_vals=None):
if num_return_vals is None:
num_return_vals = self._num_return_vals
def _remote(self, args=None, kwargs=None, num_returns=None):
if num_returns is None:
num_returns = self._num_returns
def invocation(args, kwargs):
actor = self._actor_hard_ref or self._actor_ref()
@@ -112,7 +112,7 @@ class ActorMethod:
self._method_name,
args=args,
kwargs=kwargs,
num_return_vals=num_return_vals)
num_returns=num_returns)
# Apply the decorator if there is one.
if self._decorator is not None:
@@ -124,7 +124,7 @@ class ActorMethod:
return {
"actor": self._actor_ref(),
"method_name": self._method_name,
"num_return_vals": self._num_return_vals,
"num_returns": self._num_returns,
"decorator": self._decorator,
}
@@ -132,7 +132,7 @@ class ActorMethod:
self.__init__(
state["actor"],
state["method_name"],
state["num_return_vals"],
state["num_returns"],
state["decorator"],
hardref=True)
@@ -147,7 +147,7 @@ class ActorClassMethodMetadata(object):
can be set by attaching the attribute
"__ray_invocation_decorator__" to the actor method.
signatures: The signatures of the methods.
num_return_vals: The default number of return values for
num_returns: The default number of return values for
each actor method.
"""
@@ -182,7 +182,7 @@ class ActorClassMethodMetadata(object):
# arguments.
self.decorators = {}
self.signatures = {}
self.num_return_vals = {}
self.num_returns = {}
for method_name, method in actor_methods:
# Whether or not this method requires binding of its first
# argument. For class and static methods, we do not want to bind
@@ -198,11 +198,10 @@ class ActorClassMethodMetadata(object):
self.signatures[method_name] = signature.extract_signature(
method, ignore_first=not is_bound)
# Set the default number of return values for this method.
if hasattr(method, "__ray_num_return_vals__"):
self.num_return_vals[method_name] = (
method.__ray_num_return_vals__)
if hasattr(method, "__ray_num_returns__"):
self.num_returns[method_name] = (method.__ray_num_returns__)
else:
self.num_return_vals[method_name] = (
self.num_returns[method_name] = (
ray_constants.DEFAULT_ACTOR_METHOD_NUM_RETURN_VALS)
if hasattr(method, "__ray_invocation_decorator__"):
@@ -589,7 +588,7 @@ class ActorClass:
actor_id,
meta.method_meta.decorators,
meta.method_meta.signatures,
meta.method_meta.num_return_vals,
meta.method_meta.num_returns,
actor_method_cpu,
meta.actor_creation_function_descriptor,
worker.current_session_and_job,
@@ -617,7 +616,7 @@ class ActorHandle:
invocation side, whereas a regular decorator can be used to change
the behavior on the execution side.
_ray_method_signatures: The signatures of the actor methods.
_ray_method_num_return_vals: The default number of return values for
_ray_method_num_returns: The default number of return values for
each method.
_ray_actor_method_cpus: The number of CPUs required by actor methods.
_ray_original_handle: True if this is the original actor handle for a
@@ -633,7 +632,7 @@ class ActorHandle:
actor_id,
method_decorators,
method_signatures,
method_num_return_vals,
method_num_returns,
actor_method_cpus,
actor_creation_function_descriptor,
session_and_job,
@@ -643,7 +642,7 @@ class ActorHandle:
self._ray_original_handle = original_handle
self._ray_method_decorators = method_decorators
self._ray_method_signatures = method_signatures
self._ray_method_num_return_vals = method_num_return_vals
self._ray_method_num_returns = method_num_returns
self._ray_actor_method_cpus = actor_method_cpus
self._ray_session_and_job = session_and_job
self._ray_is_cross_language = language != Language.PYTHON
@@ -664,7 +663,7 @@ class ActorHandle:
method = ActorMethod(
self,
method_name,
self._ray_method_num_return_vals[method_name],
self._ray_method_num_returns[method_name],
decorator=self._ray_method_decorators.get(method_name))
setattr(self, method_name, method)
@@ -680,7 +679,7 @@ class ActorHandle:
method_name,
args=None,
kwargs=None,
num_return_vals=None):
num_returns=None):
"""Method execution stub for an actor handle.
This is the function that executes when
@@ -692,7 +691,7 @@ class ActorHandle:
method_name: The name of the actor method to execute.
args: A list of arguments for the actor method.
kwargs: A dictionary of keyword arguments for the actor method.
num_return_vals (int): The number of return values for the method.
num_returns (int): The number of return values for the method.
Returns:
object_refs: A list of object refs returned by the remote actor
@@ -725,7 +724,7 @@ class ActorHandle:
object_refs = worker.core_worker.submit_actor_task(
self._ray_actor_language, self._ray_actor_id, function_descriptor,
list_args, num_return_vals, self._ray_actor_method_cpus)
list_args, num_returns, self._ray_actor_method_cpus)
if len(object_refs) == 1:
object_refs = object_refs[0]
@@ -795,7 +794,7 @@ class ActorHandle:
"actor_id": self._ray_actor_id,
"method_decorators": self._ray_method_decorators,
"method_signatures": self._ray_method_signatures,
"method_num_return_vals": self._ray_method_num_return_vals,
"method_num_returns": self._ray_method_num_returns,
"actor_method_cpus": self._ray_actor_method_cpus,
"actor_creation_function_descriptor": self.
_ray_actor_creation_function_descriptor,
@@ -830,7 +829,7 @@ class ActorHandle:
state["actor_id"],
state["method_decorators"],
state["method_signatures"],
state["method_num_return_vals"],
state["method_num_returns"],
state["actor_method_cpus"],
state["actor_creation_function_descriptor"],
worker.current_session_and_job)
+1 -1
View File
@@ -66,7 +66,7 @@ def java_function(class_name, function_name):
None, # memory,
None, # object_store_memory,
None, # resources,
None, # num_return_vals,
None, # num_returns,
None, # max_calls,
None, # max_retries
placement_group=None,
+15 -35
View File
@@ -41,12 +41,7 @@ class CrossLanguageError(RayError):
ray_exception.formatted_exception_string))
class RayConnectionError(RayError):
"""Raised when ray is not yet connected but needs to be."""
pass
class RayCancellationError(RayError):
class TaskCancelledError(RayError):
"""Raised when this task is cancelled.
Attributes:
@@ -143,7 +138,7 @@ class RayTaskError(RayError):
return "\n".join(out)
class RayWorkerError(RayError):
class WorkerCrashedError(RayError):
"""Indicates that the worker died unexpectedly while executing a task."""
def __str__(self):
@@ -161,8 +156,8 @@ class RayActorError(RayError):
return "The actor died unexpectedly before finishing this task."
class RayletError(RayError):
"""Indicates that the Raylet client has errored.
class RaySystemError(RayError):
"""Indicates that Ray encountered a system error.
This exception can be thrown when the raylet is killed.
"""
@@ -171,7 +166,7 @@ class RayletError(RayError):
self.client_exc = client_exc
def __str__(self):
return f"The Raylet died with this message: {self.client_exc}"
return f"System error: {self.client_exc}"
class ObjectStoreFullError(RayError):
@@ -184,21 +179,13 @@ class ObjectStoreFullError(RayError):
def __str__(self):
return super(ObjectStoreFullError, self).__str__() + (
"\n"
"The local object store is full of objects that are still in scope"
" and cannot be evicted. Try increasing the object store memory "
"available with ray.init(object_store_memory=<bytes>). "
"You can also try setting an option to fallback to LRU eviction "
"when the object store is full by calling "
"ray.init(lru_evict=True). See also: "
"https://docs.ray.io/en/latest/memory-management.html.")
"The local object store is full of objects that are still in "
"scope and cannot be evicted. Tip: Use the `ray memory` command "
"to list active objects in the cluster.")
class UnreconstructableError(RayError):
"""Indicates that an object is lost and cannot be reconstructed.
Note, this exception only happens for actor objects. If actor's current
state is after object's creating task, the actor cannot re-run the task to
reconstruct the object.
class ObjectLostError(RayError):
"""Indicates that an object has been lost due to node failure.
Attributes:
object_ref: ID of the object.
@@ -208,17 +195,10 @@ class UnreconstructableError(RayError):
self.object_ref = object_ref
def __str__(self):
return (
f"Object {self.object_ref.hex()} is lost "
"(either LRU evicted or deleted by user) and "
"cannot be reconstructed. Try increasing the object store "
"memory available with ray.init(object_store_memory=<bytes>) "
"or setting object store limits with "
"ray.remote(object_store_memory=<bytes>). "
"See also: https://docs.ray.io/en/latest/memory-management.html")
return (f"Object {self.object_ref.hex()} is lost due to node failure.")
class RayTimeoutError(RayError):
class GetTimeoutError(RayError):
"""Indicates that a call to the worker timed out."""
pass
@@ -232,9 +212,9 @@ RAY_EXCEPTION_TYPES = [
PlasmaObjectNotAvailable,
RayError,
RayTaskError,
RayWorkerError,
WorkerCrashedError,
RayActorError,
ObjectStoreFullError,
UnreconstructableError,
RayTimeoutError,
ObjectLostError,
GetTimeoutError,
]
@@ -7,7 +7,7 @@ from . import core
__all__ = ["tsqr", "modified_lu", "tsqr_hr", "qr"]
@ray.remote(num_return_vals=2)
@ray.remote(num_returns=2)
def tsqr(a):
"""Perform a QR decomposition of a tall-skinny matrix.
@@ -83,7 +83,7 @@ def tsqr(a):
# TODO(rkn): This is unoptimized, we really want a block version of this.
# This is Algorithm 5 from
# http://www.eecs.berkeley.edu/Pubs/TechRpts/2013/EECS-2013-175.pdf.
@ray.remote(num_return_vals=3)
@ray.remote(num_returns=3)
def modified_lu(q):
"""Perform a modified LU decomposition of a matrix.
@@ -121,7 +121,7 @@ def modified_lu(q):
return ray.get(core.numpy_to_dist.remote(ray.put(L))), U, S
@ray.remote(num_return_vals=2)
@ray.remote(num_returns=2)
def tsqr_hr_helper1(u, s, y_top_block, b):
y_top = y_top_block[:b, :b]
s_full = np.diag(s)
@@ -137,7 +137,7 @@ def tsqr_hr_helper2(s, r_temp):
# This is Algorithm 6 from
# http://www.eecs.berkeley.edu/Pubs/TechRpts/2013/EECS-2013-175.pdf.
@ray.remote(num_return_vals=4)
@ray.remote(num_returns=4)
def tsqr_hr(a):
q, r_temp = tsqr.remote(a)
y, u, s = modified_lu.remote(q)
@@ -160,7 +160,7 @@ def qr_helper2(y_ri, a_rc):
# This is Algorithm 7 from
# http://www.eecs.berkeley.edu/Pubs/TechRpts/2013/EECS-2013-175.pdf.
@ray.remote(num_return_vals=2)
@ray.remote(num_returns=2)
def qr(a):
m, n = a.shape[0], a.shape[1]
@@ -18,12 +18,12 @@ def solve(a, b):
return np.linalg.solve(a, b)
@ray.remote(num_return_vals=2)
@ray.remote(num_returns=2)
def tensorsolve(a):
raise NotImplementedError
@ray.remote(num_return_vals=2)
@ray.remote(num_returns=2)
def tensorinv(a):
raise NotImplementedError
@@ -63,22 +63,22 @@ def det(a):
return np.linalg.det(a)
@ray.remote(num_return_vals=3)
@ray.remote(num_returns=3)
def svd(a):
return np.linalg.svd(a)
@ray.remote(num_return_vals=2)
@ray.remote(num_returns=2)
def eig(a):
return np.linalg.eig(a)
@ray.remote(num_return_vals=2)
@ray.remote(num_returns=2)
def eigh(a):
return np.linalg.eigh(a)
@ray.remote(num_return_vals=4)
@ray.remote(num_returns=4)
def lstsq(a, b):
    """Return the least-squares solution of ``a @ x = b``.

    Thin wrapper around ``np.linalg.lstsq``.

    Args:
        a: Coefficient matrix.
        b: Ordinate (right-hand side) values.

    Returns:
        The 4-tuple produced by ``np.linalg.lstsq``: the solution, the
        residuals, the rank of ``a``, and the singular values of ``a``.
    """
    # The right-hand side must be forwarded: np.linalg.lstsq requires it,
    # and omitting it made every call fail with a TypeError.
    return np.linalg.lstsq(a, b)
@@ -88,7 +88,7 @@ def norm(x):
return np.linalg.norm(x)
@ray.remote(num_return_vals=2)
@ray.remote(num_returns=2)
def qr(a):
return np.linalg.qr(a)
+1 -1
View File
@@ -130,7 +130,7 @@ DASHBOARD_DIED_ERROR = "dashboard_died"
RAYLET_CONNECTION_ERROR = "raylet_connection_error"
# Used in gpu detection
RESOURCE_CONSTRAINT_PREFIX = "GPUType:"
RESOURCE_CONSTRAINT_PREFIX = "gpu_type:"
RESOURCES_ENVIRONMENT_VARIABLE = "RAY_OVERRIDE_RESOURCES"
+11 -11
View File
@@ -40,7 +40,7 @@ class RemoteFunction:
_object_store_memory: The object store memory request for this task.
_resources: The default custom resource requirements for invocations of
this remote function.
_num_return_vals: The default number of return values for invocations
_num_returns: The default number of return values for invocations
of this remote function.
_max_calls: The number of times a worker can execute this function
before exiting.
@@ -61,8 +61,8 @@ class RemoteFunction:
"""
def __init__(self, language, function, function_descriptor, num_cpus,
num_gpus, memory, object_store_memory, resources,
num_return_vals, max_calls, max_retries, placement_group,
num_gpus, memory, object_store_memory, resources, num_returns,
max_calls, max_retries, placement_group,
placement_group_bundle_index):
self._language = language
self._function = function
@@ -79,8 +79,8 @@ class RemoteFunction:
"setting object_store_memory is not implemented for tasks")
self._object_store_memory = None
self._resources = resources
self._num_return_vals = (DEFAULT_REMOTE_FUNCTION_NUM_RETURN_VALS if
num_return_vals is None else num_return_vals)
self._num_returns = (DEFAULT_REMOTE_FUNCTION_NUM_RETURN_VALS
if num_returns is None else num_returns)
self._max_calls = (DEFAULT_REMOTE_FUNCTION_MAX_CALLS
if max_calls is None else max_calls)
self._max_retries = (DEFAULT_REMOTE_FUNCTION_NUM_TASK_RETRIES
@@ -107,7 +107,7 @@ class RemoteFunction:
def _submit(self,
args=None,
kwargs=None,
num_return_vals=None,
num_returns=None,
num_cpus=None,
num_gpus=None,
resources=None):
@@ -116,7 +116,7 @@ class RemoteFunction:
return self._remote(
args=args,
kwargs=kwargs,
num_return_vals=num_return_vals,
num_returns=num_returns,
num_cpus=num_cpus,
num_gpus=num_gpus,
resources=resources)
@@ -144,7 +144,7 @@ class RemoteFunction:
def _remote(self,
args=None,
kwargs=None,
num_return_vals=None,
num_returns=None,
num_cpus=None,
num_gpus=None,
memory=None,
@@ -182,8 +182,8 @@ class RemoteFunction:
kwargs = {} if kwargs is None else kwargs
args = [] if args is None else args
if num_return_vals is None:
num_return_vals = self._num_return_vals
if num_returns is None:
num_returns = self._num_returns
if max_retries is None:
max_retries = self._max_retries
@@ -213,7 +213,7 @@ class RemoteFunction:
"cannot be executed locally."
object_refs = worker.core_worker.submit_task(
self._language, self._function_descriptor, list_args,
num_return_vals, resources, max_retries, placement_group.id,
num_returns, resources, max_retries, placement_group.id,
placement_group_bundle_index)
if len(object_refs) == 1:
+6 -7
View File
@@ -13,9 +13,9 @@ from ray.exceptions import (
PlasmaObjectNotAvailable,
RayTaskError,
RayActorError,
RayCancellationError,
RayWorkerError,
UnreconstructableError,
TaskCancelledError,
WorkerCrashedError,
ObjectLostError,
)
from ray._raylet import (
split_buffer,
@@ -265,14 +265,13 @@ class SerializationContext:
obj = self._deserialize_msgpack_data(data, metadata)
return RayError.from_bytes(obj)
elif error_type == ErrorType.Value("WORKER_DIED"):
return RayWorkerError()
return WorkerCrashedError()
elif error_type == ErrorType.Value("ACTOR_DIED"):
return RayActorError()
elif error_type == ErrorType.Value("TASK_CANCELLED"):
return RayCancellationError()
return TaskCancelledError()
elif error_type == ErrorType.Value("OBJECT_UNRECONSTRUCTABLE"):
return UnreconstructableError(
ray.ObjectRef(object_ref.binary()))
return ObjectLostError(ray.ObjectRef(object_ref.binary()))
else:
assert error_type != ErrorType.Value("OBJECT_IN_PLASMA"), \
"Tried to get object that has been promoted to plasma."
+1 -1
View File
@@ -191,7 +191,7 @@ async def test_router_use_max_concurrency(serve_instance):
second_query = q.enqueue_request.remote(RequestMetadata("svc", None), 1)
# Neither query should be available
with pytest.raises(ray.exceptions.RayTimeoutError):
with pytest.raises(ray.exceptions.GetTimeoutError):
ray.get([first_query, second_query], timeout=0.2)
# Let's retrieve the router internal state
+1 -1
View File
@@ -48,7 +48,7 @@ class GlobalState:
"""
if (self.redis_client is None or self.redis_clients is None
or self.global_state_accessor is None):
raise ray.exceptions.RayConnectionError(
raise ray.exceptions.RaySystemError(
"Ray has not been started yet. You can start Ray with "
"'ray.init()'.")
+3 -3
View File
@@ -646,15 +646,15 @@ def test_multiple_return_values(ray_start_regular_shared):
def method0(self):
return 1
@ray.method(num_return_vals=1)
@ray.method(num_returns=1)
def method1(self):
return 1
@ray.method(num_return_vals=2)
@ray.method(num_returns=2)
def method2(self):
return 1, 2
@ray.method(num_return_vals=3)
@ray.method(num_returns=3)
def method3(self):
return 1, 2, 3
+1 -1
View File
@@ -63,7 +63,7 @@ def test_actor_eviction(ray_start_regular):
val = ray.get(obj)
assert isinstance(val, np.ndarray), val
num_success += 1
except ray.exceptions.UnreconstructableError:
except ray.exceptions.ObjectLostError:
num_evicted += 1
# Some objects should have been evicted, and some should still be in the
# object store.
+5 -5
View File
@@ -64,12 +64,12 @@ def test_submit_api(shutdown_only):
def g():
return ray.get_gpu_ids()
assert f._remote([0], num_return_vals=0) is None
id1 = f._remote(args=[1], num_return_vals=1)
assert f._remote([0], num_returns=0) is None
id1 = f._remote(args=[1], num_returns=1)
assert ray.get(id1) == [0]
id1, id2 = f._remote(args=[2], num_return_vals=2)
id1, id2 = f._remote(args=[2], num_returns=2)
assert ray.get([id1, id2]) == [0, 1]
id1, id2, id3 = f._remote(args=[3], num_return_vals=3)
id1, id2, id3 = f._remote(args=[3], num_returns=3)
assert ray.get([id1, id2, id3]) == [0, 1, 2]
assert ray.get(
g._remote(args=[], num_cpus=1, num_gpus=1,
@@ -107,7 +107,7 @@ def test_submit_api(shutdown_only):
ray.get(a2.method._remote())
id1, id2, id3, id4 = a.method._remote(
args=["test"], kwargs={"b": 2}, num_return_vals=4)
args=["test"], kwargs={"b": 2}, num_returns=4)
assert ray.get([id1, id2, id3, id4]) == [0, 1, "test", 2]
+3 -3
View File
@@ -12,7 +12,7 @@ from unittest.mock import MagicMock, patch
import ray
import ray.cluster_utils
import ray.test_utils
from ray.exceptions import RayTimeoutError
from ray.exceptions import GetTimeoutError
logger = logging.getLogger(__name__)
@@ -351,7 +351,7 @@ def test_system_config_when_connecting(ray_start_cluster):
ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
# This would not raise an exception if object pinning was enabled.
with pytest.raises(ray.exceptions.UnreconstructableError):
with pytest.raises(ray.exceptions.ObjectLostError):
ray.get(obj_ref)
@@ -377,7 +377,7 @@ def test_get_with_timeout(ray_start_regular_shared):
# Check that get() raises a TimeoutError after the timeout if the object
# is not ready yet.
result_id = signal.wait.remote()
with pytest.raises(RayTimeoutError):
with pytest.raises(GetTimeoutError):
ray.get(result_id, timeout=0.1)
# Check that a subsequent get() returns early.
+9 -9
View File
@@ -5,18 +5,18 @@ import time
import pytest
import ray
from ray.exceptions import RayCancellationError, RayTaskError, \
RayTimeoutError, RayWorkerError, \
UnreconstructableError
from ray.exceptions import TaskCancelledError, RayTaskError, \
GetTimeoutError, WorkerCrashedError, \
ObjectLostError
from ray.test_utils import SignalActor
def valid_exceptions(use_force):
if use_force:
return (RayTaskError, RayCancellationError, RayWorkerError,
UnreconstructableError)
return (RayTaskError, TaskCancelledError, WorkerCrashedError,
ObjectLostError)
else:
return (RayTaskError, RayCancellationError)
return (RayTaskError, TaskCancelledError)
@pytest.mark.parametrize("use_force", [True, False])
@@ -50,10 +50,10 @@ def test_cancel_chain(ray_start_regular, use_force):
with pytest.raises(valid_exceptions(use_force)):
ray.get(ob)
with pytest.raises(RayTimeoutError):
with pytest.raises(GetTimeoutError):
ray.get(obj1, timeout=.1)
with pytest.raises(RayTimeoutError):
with pytest.raises(GetTimeoutError):
ray.get(obj2, timeout=.1)
signaler2.send.remote()
@@ -249,7 +249,7 @@ def test_remote_cancel(ray_start_regular, use_force):
outer = remote_wait.remote([sig])
inner = ray.get(outer)[0]
with pytest.raises(RayTimeoutError):
with pytest.raises(GetTimeoutError):
ray.get(inner, timeout=1)
ray.cancel(inner, force=use_force)
@@ -70,7 +70,8 @@ def test_worker_failed(ray_start_workers_separate_multinode):
for object_ref in object_refs:
try:
ray.get(object_ref)
except (ray.exceptions.RayTaskError, ray.exceptions.RayWorkerError):
except (ray.exceptions.RayTaskError,
ray.exceptions.WorkerCrashedError):
pass
@@ -36,7 +36,7 @@ def test_errors_before_initializing_ray():
for api_method in api_methods:
print(api_method)
with pytest.raises(
ray.exceptions.RayConnectionError,
ray.exceptions.RaySystemError,
match="Ray has not been started yet."):
api_method()
+4 -4
View File
@@ -30,7 +30,7 @@ def test_failed_task(ray_start_regular, error_pubsub):
def throw_exception_fct2():
raise Exception("Test function 2 intentionally failed.")
@ray.remote(num_return_vals=3)
@ray.remote(num_returns=3)
def throw_exception_fct3(x):
raise Exception("Test function 3 intentionally failed.")
@@ -362,7 +362,7 @@ def test_worker_dying(ray_start_regular, error_pubsub):
def f():
eval("exit()")
with pytest.raises(ray.exceptions.RayWorkerError):
with pytest.raises(ray.exceptions.WorkerCrashedError):
ray.get(f.remote())
errors = get_error_message(p, 1, ray_constants.WORKER_DIED_PUSH_ERROR)
@@ -901,7 +901,7 @@ def test_raylet_crash_when_get(ray_start_regular):
thread = threading.Thread(target=sleep_to_kill_raylet)
thread.start()
with pytest.raises(ray.exceptions.UnreconstructableError):
with pytest.raises(ray.exceptions.ObjectLostError):
ray.get(object_ref)
thread.join()
@@ -1062,7 +1062,7 @@ def test_eviction(ray_start_cluster):
# Evict the object.
ray.internal.free([obj])
# ray.get throws an exception.
with pytest.raises(ray.exceptions.UnreconstructableError):
with pytest.raises(ray.exceptions.ObjectLostError):
ray.get(obj)
@ray.remote
+1 -1
View File
@@ -5,7 +5,7 @@ import ray
MB = 1024 * 1024
OBJECT_EVICTED = ray.exceptions.UnreconstructableError
OBJECT_EVICTED = ray.exceptions.ObjectLostError
OBJECT_TOO_LARGE = ray.exceptions.ObjectStoreFullError
+1 -1
View File
@@ -15,7 +15,7 @@ def test_basic_task_api(ray_start_regular):
# Test multiple return values.
@ray.remote(num_return_vals=3)
@ray.remote(num_returns=3)
def f_multiple_returns():
return 1, 2, 3
+2 -1
View File
@@ -70,7 +70,8 @@ def test_worker_failed(ray_start_workers_separate_multinode):
for object_ref in object_refs:
try:
ray.get(object_ref)
except (ray.exceptions.RayTaskError, ray.exceptions.RayWorkerError):
except (ray.exceptions.RayTaskError,
ray.exceptions.WorkerCrashedError):
pass
+2 -2
View File
@@ -355,7 +355,7 @@ def test_remove_placement_group(ray_start_cluster):
# That means this request should fail.
with pytest.raises(ray.exceptions.RayActorError, match="actor died"):
ray.get(a.f.remote(), timeout=3.0)
with pytest.raises(ray.exceptions.RayWorkerError):
with pytest.raises(ray.exceptions.WorkerCrashedError):
ray.get(task_ref)
@@ -576,7 +576,7 @@ def test_pending_placement_group_wait(ray_start_cluster):
assert len(ready) == 0
table = ray.experimental.placement_group_table(placement_group)
assert table["state"] == "PENDING"
with pytest.raises(ray.exceptions.RayTimeoutError):
with pytest.raises(ray.exceptions.GetTimeoutError):
ray.get(placement_group.ready(), timeout=0.1)
+3 -3
View File
@@ -1,7 +1,7 @@
import pytest
import ray
from ray.exceptions import RayTimeoutError
from ray.exceptions import GetTimeoutError
from ray.experimental.queue import Queue, Empty, Full
@@ -80,7 +80,7 @@ def test_async_get(ray_start_regular):
with pytest.raises(Empty):
q.get_nowait()
with pytest.raises(RayTimeoutError):
with pytest.raises(GetTimeoutError):
ray.get(future, timeout=0.1) # task not canceled on timeout.
q.put(1)
@@ -95,7 +95,7 @@ def test_async_put(ray_start_regular):
with pytest.raises(Full):
q.put_nowait(3)
with pytest.raises(RayTimeoutError):
with pytest.raises(GetTimeoutError):
ray.get(future, timeout=0.1) # task not canceled on timeout.
assert q.get() == 1
+8 -9
View File
@@ -110,7 +110,7 @@ def test_reconstruction_cached_dependency(ray_start_cluster,
else:
with pytest.raises(ray.exceptions.RayTaskError) as e:
ray.get(dependent_task.remote(obj))
with pytest.raises(ray.exceptions.UnreconstructableError):
with pytest.raises(ray.exceptions.ObjectLostError):
raise e.as_instanceof_cause()
@@ -159,7 +159,7 @@ def test_basic_reconstruction(ray_start_cluster, reconstruction_enabled):
else:
with pytest.raises(ray.exceptions.RayTaskError) as e:
ray.get(dependent_task.remote(obj))
with pytest.raises(ray.exceptions.UnreconstructableError):
with pytest.raises(ray.exceptions.ObjectLostError):
raise e.as_instanceof_cause()
@@ -215,7 +215,7 @@ def test_basic_reconstruction_put(ray_start_cluster, reconstruction_enabled):
# been evicted.
try:
ray.get(result)
except ray.exceptions.UnreconstructableError:
except ray.exceptions.ObjectLostError:
pass
@@ -284,7 +284,7 @@ def test_basic_reconstruction_actor_task(ray_start_cluster,
else:
with pytest.raises(ray.exceptions.RayTaskError) as e:
ray.get(dependent_task.remote(obj))
with pytest.raises(ray.exceptions.UnreconstructableError):
with pytest.raises(ray.exceptions.ObjectLostError):
raise e.as_instanceof_cause()
# Make sure the actor handle is still usable.
@@ -356,8 +356,7 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster,
return True
except ray.exceptions.RayActorError:
return False
except (ray.exceptions.RayTaskError,
ray.exceptions.UnreconstructableError):
except (ray.exceptions.RayTaskError, ray.exceptions.ObjectLostError):
return True
wait_for_condition(probe)
@@ -369,7 +368,7 @@ def test_basic_reconstruction_actor_constructor(ray_start_cluster,
x = a.dependent_task.remote(obj)
print(x)
ray.get(x)
with pytest.raises(ray.exceptions.UnreconstructableError):
with pytest.raises(ray.exceptions.ObjectLostError):
raise e.as_instanceof_cause()
@@ -429,7 +428,7 @@ def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled):
dependent_task.options(resources={
"node1": 1
}).remote(obj))
with pytest.raises(ray.exceptions.UnreconstructableError):
with pytest.raises(ray.exceptions.ObjectLostError):
raise e.as_instanceof_cause()
@@ -480,7 +479,7 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled):
else:
with pytest.raises(ray.exceptions.RayTaskError) as e:
ray.get(dependent_task.remote(obj))
with pytest.raises(ray.exceptions.UnreconstructableError):
with pytest.raises(ray.exceptions.ObjectLostError):
raise e.as_instanceof_cause()
+3 -4
View File
@@ -354,7 +354,7 @@ def test_basic_serialized_reference(one_worker_100MiB, use_ray_put, failure):
try:
ray.get(obj_ref)
assert not failure
except ray.exceptions.RayWorkerError:
except ray.exceptions.WorkerCrashedError:
assert failure
# Reference should be gone, check that array gets evicted.
@@ -403,7 +403,7 @@ def test_recursive_serialized_reference(one_worker_100MiB, use_ray_put,
assert ray.get(tail_oid) is None
assert not failure
# TODO(edoakes): this should raise WorkerError.
except ray.exceptions.UnreconstructableError:
except ray.exceptions.ObjectLostError:
assert failure
# Reference should be gone, check that array gets evicted.
@@ -501,8 +501,7 @@ def test_worker_holding_serialized_reference(one_worker_100MiB, use_ray_put,
try:
ray.get(child_return_id)
assert not failure
except (ray.exceptions.RayWorkerError,
ray.exceptions.UnreconstructableError):
except (ray.exceptions.WorkerCrashedError, ray.exceptions.ObjectLostError):
assert failure
del child_return_id
@@ -91,7 +91,7 @@ def test_recursively_nest_ids(one_worker_100MiB, use_ray_put, failure):
ray.get(tail_oid)
assert not failure
# TODO(edoakes): this should raise WorkerError.
except ray.exceptions.UnreconstructableError:
except ray.exceptions.ObjectLostError:
assert failure
# Reference should be gone, check that array gets evicted.
@@ -130,7 +130,7 @@ def test_return_object_ref(one_worker_100MiB, use_ray_put, failure):
# Check that the owner dying unpins the object. This should execute on
# the same worker because there is only one started and the other tasks
# have finished.
with pytest.raises(ray.exceptions.RayWorkerError):
with pytest.raises(ray.exceptions.WorkerCrashedError):
ray.get(exit.remote())
else:
# Check that removing the inner ID unpins the object.
@@ -173,7 +173,7 @@ def test_pass_returned_object_ref(one_worker_100MiB, use_ray_put, failure):
# Should succeed because inner_oid is pinned if no failure.
ray.get(pending_oid)
assert not failure
except ray.exceptions.RayWorkerError:
except ray.exceptions.WorkerCrashedError:
assert failure
def ref_not_exists():
@@ -232,7 +232,7 @@ def test_recursively_pass_returned_object_ref(one_worker_100MiB, use_ray_put,
_fill_object_store_and_get(inner_oid)
assert not failure
# TODO(edoakes): this should raise WorkerError.
except ray.exceptions.UnreconstructableError:
except ray.exceptions.ObjectLostError:
assert failure
inner_oid_bytes = inner_oid.binary()
@@ -311,7 +311,7 @@ def test_borrowed_id_failure(one_worker_100MiB, failure):
def resolve_ref(self):
assert self.ref is not None
if failure:
with pytest.raises(ray.exceptions.UnreconstructableError):
with pytest.raises(ray.exceptions.ObjectLostError):
ray.get(self.ref)
else:
ray.get(self.ref)
+1 -1
View File
@@ -422,7 +422,7 @@ def test_register_class(ray_start_2_cpus):
assert ray.get(h2.remote(10)).value == 10
# Test registering multiple classes with the same name.
@ray.remote(num_return_vals=3)
@ray.remote(num_returns=3)
def j():
class Class0:
def method0(self):
+3 -4
View File
@@ -335,10 +335,9 @@ def test_driver_put_errors(ray_start_object_store_memory, error_pubsub):
return len(errors) > 1
errors = wait_for_errors(p, error_check)
assert all(
error.type == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR
or "ray.exceptions.UnreconstructableError" in error.error_messages
for error in errors)
assert all(error.type == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR
or "ray.exceptions.ObjectLostError" in error.error_messages
for error in errors)
# NOTE(swang): This test tries to launch 1000 workers and breaks.
@@ -4,7 +4,7 @@ import unittest
import ray
class TestUnreconstructableErrors(unittest.TestCase):
class TestObjectLostErrors(unittest.TestCase):
def setUp(self):
ray.init(
num_cpus=1,
@@ -20,7 +20,7 @@ class TestUnreconstructableErrors(unittest.TestCase):
ray.get(x_id)
for _ in range(20):
ray.put(np.zeros(10 * 1024 * 1024))
self.assertRaises(ray.exceptions.UnreconstructableError,
self.assertRaises(ray.exceptions.ObjectLostError,
lambda: ray.get(x_id))
+2 -2
View File
@@ -8,7 +8,7 @@ import traceback
from contextlib import contextmanager
import ray
from ray.exceptions import RayTimeoutError
from ray.exceptions import GetTimeoutError
from ray import ray_constants
from ray.resource_spec import ResourceSpec
from ray.tune.durable_trainable import DurableTrainable
@@ -397,7 +397,7 @@ class RayTrialExecutor(TrialExecutor):
reset_val = ray.get(
trainable.reset.remote(new_config, trial.logdir),
timeout=DEFAULT_GET_TIMEOUT)
except RayTimeoutError:
except GetTimeoutError:
logger.exception("Trial %s: reset timed out.", trial)
return False
return reset_val
+21 -22
View File
@@ -41,7 +41,7 @@ from ray import import_thread
from ray import profiling
from ray.exceptions import (
RayConnectionError,
RaySystemError,
RayError,
RayTaskError,
ObjectStoreFullError,
@@ -202,8 +202,8 @@ class Worker:
Exception: An exception is raised if the worker is not connected.
"""
if not self.connected:
raise RayConnectionError("Ray has not been started yet. You can "
"start Ray with 'ray.init()'.")
raise RaySystemError("Ray has not been started yet. You can "
"start Ray with 'ray.init()'.")
def set_mode(self, mode):
"""Set the mode of the worker.
@@ -568,7 +568,7 @@ def init(
the distributed plasma store is lost due to node failure, Ray will
attempt to reconstruct the object by re-executing the task that
created the object. Arguments to the task will be recursively
reconstructed. If False, then ray.UnreconstructableError will be
reconstructed. If False, then ray.ObjectLostError will be
thrown.
_redis_max_memory: Redis max memory.
_node_ip_address (str): The IP address of the node that we are on.
@@ -589,7 +589,7 @@ def init(
_java_worker_options: Overwrite the options to start Java workers.
_lru_evict (bool): If True, when an object store is full, it will evict
objects in LRU order to make more space and when under memory
pressure, ray.UnreconstructableError may be thrown. If False, then
pressure, ray.ObjectLostError may be thrown. If False, then
reference counting will be used to decide which objects are safe
to evict and when under memory pressure, ray.ObjectStoreFullError
may be thrown.
@@ -1383,7 +1383,7 @@ def get(object_refs, *, timeout=None):
A Python object or a list of Python objects.
Raises:
RayTimeoutError: A RayTimeoutError is raised if a timeout is set and
GetTimeoutError: A GetTimeoutError is raised if a timeout is set and
the get takes longer than timeout to return.
Exception: An exception is raised if the task that created the object
or that created one of the objects raised an exception.
@@ -1417,7 +1417,7 @@ def get(object_refs, *, timeout=None):
for i, value in enumerate(values):
if isinstance(value, RayError):
last_task_error_raise_time = time.time()
if isinstance(value, ray.exceptions.UnreconstructableError):
if isinstance(value, ray.exceptions.ObjectLostError):
worker.core_worker.dump_object_store_memory_usage()
if isinstance(value, RayTaskError):
raise value.as_instanceof_cause()
@@ -1611,7 +1611,7 @@ def cancel(object_ref, *, force=False):
Only non-actor tasks can be canceled. Canceled tasks will not be
retried (max_retries will not be respected).
Calling ray.get on a canceled task will raise a RayCancellationError.
Calling ray.get on a canceled task will raise a TaskCancelledError.
Args:
object_ref (ObjectRef): ObjectRef returned by the task
@@ -1642,7 +1642,7 @@ def _mode(worker=global_worker):
return worker.mode
def make_decorator(num_return_vals=None,
def make_decorator(num_returns=None,
num_cpus=None,
num_gpus=None,
memory=None,
@@ -1682,13 +1682,12 @@ def make_decorator(num_return_vals=None,
" integer")
return ray.remote_function.RemoteFunction(
Language.PYTHON, function_or_class, None, num_cpus, num_gpus,
memory, object_store_memory, resources, num_return_vals,
max_calls, max_retries, placement_group,
placement_group_bundle_index)
memory, object_store_memory, resources, num_returns, max_calls,
max_retries, placement_group, placement_group_bundle_index)
if inspect.isclass(function_or_class):
if num_return_vals is not None:
raise TypeError("The keyword 'num_return_vals' is not "
if num_returns is not None:
raise TypeError("The keyword 'num_returns' is not "
"allowed for actors.")
if max_calls is not None:
raise TypeError("The keyword 'max_calls' is not "
@@ -1732,7 +1731,7 @@ def remote(*args, **kwargs):
It can also be used with specific keyword arguments:
* **num_return_vals:** This is only for *remote functions*. It specifies
* **num_returns:** This is only for *remote functions*. It specifies
the number of object refs returned by the remote function invocation.
* **num_cpus:** The quantity of CPU cores to reserve for this task or for
the lifetime of the actor.
@@ -1774,7 +1773,7 @@ def remote(*args, **kwargs):
.. code-block:: python
@ray.remote(num_gpus=1, max_calls=1, num_return_vals=2)
@ray.remote(num_gpus=1, max_calls=1, num_returns=2)
def f():
return 1, 2
@@ -1789,7 +1788,7 @@ def remote(*args, **kwargs):
.. code-block:: python
@ray.remote(num_gpus=1, max_calls=1, num_return_vals=2)
@ray.remote(num_gpus=1, max_calls=1, num_returns=2)
def f():
return 1, 2
g = f.options(num_gpus=2, max_calls=None)
@@ -1815,15 +1814,15 @@ def remote(*args, **kwargs):
error_string = ("The @ray.remote decorator must be applied either "
"with no arguments and no parentheses, for example "
"'@ray.remote', or it must be applied using some of "
"the arguments 'num_return_vals', 'num_cpus', 'num_gpus', "
"the arguments 'num_returns', 'num_cpus', 'num_gpus', "
"'memory', 'object_store_memory', 'resources', "
"'max_calls', or 'max_restarts', like "
"'@ray.remote(num_return_vals=2, "
"'@ray.remote(num_returns=2, "
"resources={\"CustomResource\": 1})'.")
assert len(args) == 0 and len(kwargs) > 0, error_string
for key in kwargs:
assert key in [
"num_return_vals",
"num_returns",
"num_cpus",
"num_gpus",
"memory",
@@ -1848,7 +1847,7 @@ def remote(*args, **kwargs):
assert "GPU" not in resources, "Use the 'num_gpus' argument."
# Handle other arguments.
num_return_vals = kwargs.get("num_return_vals")
num_returns = kwargs.get("num_returns")
max_calls = kwargs.get("max_calls")
max_restarts = kwargs.get("max_restarts")
max_task_retries = kwargs.get("max_task_retries")
@@ -1857,7 +1856,7 @@ def remote(*args, **kwargs):
max_retries = kwargs.get("max_retries")
return make_decorator(
num_return_vals=num_return_vals,
num_returns=num_returns,
num_cpus=num_cpus,
num_gpus=num_gpus,
memory=memory,