Propagate backend error to worker (#4039)

This commit is contained in:
Hao Chen
2019-02-16 11:39:15 +08:00
committed by GitHub
parent 4be3d0c5d3
commit de17443dc2
21 changed files with 635 additions and 258 deletions
+105
View File
@@ -0,0 +1,105 @@
import os
import colorama
try:
import setproctitle
except ImportError:
setproctitle = None
class RayError(Exception):
    """Common base class from which every Ray exception type derives."""
class RayTaskError(RayError):
    """Indicates that a task threw an exception during execution.

    If a task throws an exception during execution, a RayTaskError is stored in
    the object store for each of the task's outputs. When an object is
    retrieved from the object store, the Python method that retrieved it checks
    to see if the object is a RayTaskError and if it is then an exception is
    thrown propagating the error message.

    Attributes:
        function_name (str): The name of the function that failed and produced
            the RayTaskError.
        traceback_str (str): The traceback from the exception.
        proctitle (str): Title of the process that created this error, or
            "ray_worker" when the setproctitle module is not installed.
        pid (int): ID of the process that created this error.
        host (str): Host name of the machine that created this error.
    """

    def __init__(self, function_name, traceback_str):
        """Initialize a RayTaskError.

        Args:
            function_name (str): Name of the function that failed.
            traceback_str (str): Formatted traceback of the failure. Must not
                be None.

        Raises:
            AssertionError: If traceback_str is None.
        """
        # Check the precondition first so that a rejected call never leaves a
        # half-initialized instance behind.
        assert traceback_str is not None
        if setproctitle:
            self.proctitle = setproctitle.getproctitle()
        else:
            self.proctitle = "ray_worker"
        self.pid = os.getpid()
        self.host = self._get_hostname()
        self.function_name = function_name
        self.traceback_str = traceback_str

    @staticmethod
    def _get_hostname():
        """Return this machine's host name on any platform."""
        uname = getattr(os, "uname", None)
        if uname is not None:
            return uname()[1]
        # os.uname is only available on Unix; fall back to the portable
        # stdlib helper everywhere else instead of raising AttributeError.
        import platform
        return platform.node()

    def __str__(self):
        """Format a RayTaskError as a string.

        Every line starting with "Traceback " is replaced by a header made of
        the (colored) process title, the pid and the host. Every line that
        mentions ray/worker.py or ray/function_manager.py is dropped together
        with the line that follows it. All other lines are kept unchanged.
        """
        lines = self.traceback_str.split("\n")
        out = []
        in_worker = False
        for line in lines:
            if line.startswith("Traceback "):
                out.append("{}{}{} (pid={}, host={})".format(
                    colorama.Fore.CYAN, self.proctitle, colorama.Fore.RESET,
                    self.pid, self.host))
            elif in_worker:
                # This line belongs to the Ray-internal frame reported on the
                # previous line; skip it as well.
                in_worker = False
            elif "ray/worker.py" in line or "ray/function_manager.py" in line:
                in_worker = True
            else:
                out.append(line)
        return "\n".join(out)
class RayWorkerError(RayError):
    """Signals that a worker died unexpectedly in the middle of a task."""

    def __str__(self):
        """Return the fixed, human-readable description of this error."""
        message = "The worker died unexpectedly while executing this task."
        return message
class RayActorError(RayError):
    """Signals that an actor died unexpectedly before a task finished.

    There are two ways to end up here: the actor process died while it was
    executing the task, or the task was submitted to an actor that was already
    dead.
    """

    def __str__(self):
        """Return the fixed, human-readable description of this error."""
        message = "The actor died unexpectedly before finishing this task."
        return message
class UnreconstructableError(RayError):
    """Signals that an object was lost and cannot be rebuilt.

    This only happens for actor objects: once the actor's state has advanced
    past the task that created the object, the actor cannot re-run that task
    to reconstruct the object.

    Attributes:
        object_id: ID of the lost object. Its hex() method is used when the
            error is rendered as a string.
    """

    def __init__(self, object_id):
        """Remember the ID of the object that could not be reconstructed."""
        self.object_id = object_id

    def __str__(self):
        """Describe the lost object, identified by its hex-encoded ID."""
        template = (
            "Object {} is lost (either evicted or explicitly deleted) and "
            "cannot be reconstructed."
        )
        return template.format(self.object_id.hex())
# Every exception class defined in this module, base class first. The worker's
# serialization setup loops over this list and registers a custom serializer
# for each entry.
RAY_EXCEPTION_TYPES = [
RayError,
RayTaskError,
RayWorkerError,
RayActorError,
UnreconstructableError,
]
+86 -87
View File
@@ -4,7 +4,6 @@ from __future__ import print_function
from contextlib import contextmanager
import atexit
import colorama
import faulthandler
import hashlib
import inspect
@@ -28,18 +27,43 @@ import ray.experimental.state as state
import ray.gcs_utils
import ray.memory_monitor as memory_monitor
import ray.node
import ray.parameter
import ray.ray_constants as ray_constants
import ray.remote_function
import ray.serialization as serialization
import ray.services as services
import ray.signature
import ray.ray_constants as ray_constants
from ray import (
ActorHandleID,
ActorID,
ClientID,
DriverID,
ObjectID,
TaskID,
)
from ray import import_thread
from ray import ObjectID, DriverID, ActorID, ActorHandleID, ClientID, TaskID
from ray import profiling
from ray.function_manager import (FunctionActorManager, FunctionDescriptor)
import ray.parameter
from ray.utils import (check_oversized_pickle, is_cython, _random_string,
thread_safe_client, setup_logger)
from ray.core.generated.ErrorType import ErrorType
from ray.exceptions import (
RayActorError,
RayError,
RayTaskError,
RayWorkerError,
UnreconstructableError,
RAY_EXCEPTION_TYPES,
)
from ray.function_manager import (
FunctionActorManager,
FunctionDescriptor,
)
from ray.utils import (
_random_string,
check_oversized_pickle,
is_cython,
setup_logger,
thread_safe_client,
)
SCRIPT_MODE = 0
WORKER_MODE = 1
@@ -68,55 +92,6 @@ except ImportError:
setproctitle = None
class RayTaskError(Exception):
    """An object used internally to represent a task that threw an exception.

    If a task throws an exception during execution, a RayTaskError is stored in
    the object store for each of the task's outputs. When an object is
    retrieved from the object store, the Python method that retrieved it checks
    to see if the object is a RayTaskError and if it is then an exception is
    thrown propagating the error message.

    Attributes:
        function_name (str): The name of the function that failed and produced
            the RayTaskError.
        traceback_str (str): The traceback from the exception.
        proctitle (str): Title of the process that created this error, or
            "ray_worker" when the setproctitle module is not installed.
        pid (int): ID of the process that created this error.
        host (str): Host name of the machine that created this error.
    """

    def __init__(self, function_name, traceback_str):
        """Initialize a RayTaskError.

        Args:
            function_name (str): Name of the function that failed.
            traceback_str (str): Formatted traceback of the failure. Must not
                be None.

        Raises:
            AssertionError: If traceback_str is None.
        """
        # Check the precondition first so that a rejected call never leaves a
        # half-initialized instance behind.
        assert traceback_str is not None
        if setproctitle:
            self.proctitle = setproctitle.getproctitle()
        else:
            self.proctitle = "ray_worker"
        self.pid = os.getpid()
        self.host = self._get_hostname()
        self.function_name = function_name
        self.traceback_str = traceback_str

    @staticmethod
    def _get_hostname():
        """Return this machine's host name on any platform."""
        uname = getattr(os, "uname", None)
        if uname is not None:
            return uname()[1]
        # os.uname is only available on Unix; fall back to the portable
        # stdlib helper everywhere else instead of raising AttributeError.
        import platform
        return platform.node()

    def __str__(self):
        """Format a RayTaskError as a string.

        Every line starting with "Traceback " is replaced by a header made of
        the (colored) process title, the pid and the host. Every line that
        mentions ray/worker.py or ray/function_manager.py is dropped together
        with the line that follows it. All other lines are kept unchanged.
        """
        lines = self.traceback_str.split("\n")
        out = []
        in_worker = False
        for line in lines:
            if line.startswith("Traceback "):
                out.append("{}{}{} (pid={}, host={})".format(
                    colorama.Fore.CYAN, self.proctitle, colorama.Fore.RESET,
                    self.pid, self.host))
            elif in_worker:
                # This line belongs to the Ray-internal frame reported on the
                # previous line; skip it as well.
                in_worker = False
            elif "ray/worker.py" in line or "ray/function_manager.py" in line:
                in_worker = True
            else:
                out.append(line)
        return "\n".join(out)
class ActorCheckpointInfo(object):
"""Information used to maintain actor checkpoints."""
@@ -400,6 +375,8 @@ class Worker(object):
start_time = time.time()
# Only send the warning once.
warning_sent = False
serialization_context = self.get_serialization_context(
self.task_driver_id)
while True:
try:
# We divide very large get requests into smaller get requests
@@ -407,23 +384,23 @@ class Worker(object):
# long time, if the store is blocked, it can block the manager
# as well as a consequence.
results = []
for i in range(0, len(object_ids),
ray._config.worker_get_request_size()):
results += self.plasma_client.get(
object_ids[i:(
i + ray._config.worker_get_request_size())],
batch_size = ray._config.worker_fetch_request_size()
for i in range(0, len(object_ids), batch_size):
metadata_data_pairs = self.plasma_client.get_buffers(
object_ids[i:i + batch_size],
timeout,
self.get_serialization_context(self.task_driver_id))
with_meta=True,
)
for j in range(len(metadata_data_pairs)):
metadata, data = metadata_data_pairs[j]
results.append(
self._deserialize_object_from_arrow(
data,
metadata,
object_ids[i + j],
serialization_context,
))
return results
except pyarrow.lib.ArrowInvalid:
# TODO(ekl): the local scheduler could include relevant
# metadata in the task kill case for a better error message
invalid_error = RayTaskError(
"<unknown>",
"Invalid return value: likely worker died or was killed "
"while executing the task; check previous logs or dmesg "
"for errors.")
return [invalid_error] * len(object_ids)
except pyarrow.DeserializationCallbackError:
# Wait a little bit for the import thread to import the class.
# If we currently have the worker lock, we need to release it
@@ -448,6 +425,30 @@ class Worker(object):
driver_id=self.task_driver_id)
warning_sent = True
def _deserialize_object_from_arrow(self, data, metadata, object_id,
                                   serialization_context):
    """Convert one (metadata, data) pair fetched from plasma into a value.

    Args:
        data: Serialized payload of the object; falsy when there is none.
        metadata: Metadata of the object. When non-empty it is converted
            with int() and compared against the ErrorType codes.
        object_id: ID of the object being processed. It is only used to
            build the UnreconstructableError.
        serialization_context: Context handed to pyarrow.deserialize.

    Returns:
        A RayWorkerError, RayActorError or UnreconstructableError instance
        (returned, not raised) when the metadata carries a known error
        code; the deserialized object when only data is present; and
        plasma.ObjectNotAvailable when both metadata and data are empty.

    Raises:
        AssertionError: If the metadata carries an unrecognized error code.
    """
    if metadata:
        # If metadata is not empty, return an exception object based on
        # the error type.
        error_type = int(metadata)
        if error_type == ErrorType.WORKER_DIED:
            return RayWorkerError()
        elif error_type == ErrorType.ACTOR_DIED:
            return RayActorError()
        elif error_type == ErrorType.OBJECT_UNRECONSTRUCTABLE:
            return UnreconstructableError(ray.ObjectID(object_id.binary()))
        else:
            # Raise explicitly rather than `assert False`: assert statements
            # are removed under `python -O`, which would make this branch
            # fall through and silently return None.
            raise AssertionError(
                "Unrecognized error type " + str(error_type))
    elif data:
        # If data is not empty, deserialize the object.
        # Note, the lock is needed because `serialization_context` isn't
        # thread-safe.
        with self.plasma_client.lock:
            return pyarrow.deserialize(data, serialization_context)
    else:
        # Object isn't available in plasma.
        return plasma.ObjectNotAvailable
def get_object(self, object_ids):
"""Get the value or values in the object store associated with the IDs.
@@ -741,7 +742,7 @@ class Worker(object):
passed by value.
Raises:
RayTaskError: This exception is raised if a task that
RayError: This exception is raised if a task that
created one of the arguments failed.
"""
arguments = []
@@ -749,7 +750,7 @@ class Worker(object):
if isinstance(arg, ObjectID):
# get the object from the local object store
argument = self.get_object([arg])[0]
if isinstance(argument, RayTaskError):
if isinstance(argument, RayError):
raise argument
else:
# pass the argument by value
@@ -831,11 +832,6 @@ class Worker(object):
with profiling.profile("task:deserialize_arguments"):
arguments = self._get_arguments_for_execution(
function_name, args)
except RayTaskError as e:
self._handle_process_task_failure(
function_descriptor, return_object_ids, e,
ray.utils.format_error_message(traceback.format_exc()))
return
except Exception as e:
self._handle_process_task_failure(
function_descriptor, return_object_ids, e,
@@ -1155,12 +1151,15 @@ def _initialize_serialization(driver_id, worker=global_worker):
worker.serialization_context_map[driver_id] = serialization_context
register_custom_serializer(
RayTaskError,
use_dict=True,
local=True,
driver_id=driver_id,
class_id="ray.RayTaskError")
# Register exception types.
for error_cls in RAY_EXCEPTION_TYPES:
register_custom_serializer(
error_cls,
use_dict=True,
local=True,
driver_id=driver_id,
class_id=error_cls.__module__ + ". " + error_cls.__name__,
)
# Tell Ray to serialize lambdas with pickle.
register_custom_serializer(
type(lambda: 0),
@@ -2229,14 +2228,14 @@ def get(object_ids):
if isinstance(object_ids, list):
values = worker.get_object(object_ids)
for i, value in enumerate(values):
if isinstance(value, RayTaskError):
if isinstance(value, RayError):
last_task_error_raise_time = time.time()
raise value
return values
else:
value = worker.get_object([object_ids])[0]
if isinstance(value, RayTaskError):
# If the result is a RayTaskError, then the task that created
if isinstance(value, RayError):
# If the result is a RayError, then the task that created
# this object failed, and we should propagate the error message
# here.
last_task_error_raise_time = time.time()