[core worker] Python core worker task execution (#5783)

Executes tasks via the event loop in the C++ core worker. Also handles signals properly (including KeyboardInterrupt), so Ctrl-C in a Python interactive shell now works when connecting to an existing cluster.
This commit is contained in:
Edward Oakes
2019-10-22 20:15:59 -07:00
committed by GitHub
parent 95241f6686
commit 02931e08f3
38 changed files with 830 additions and 678 deletions
+353 -66
View File
@@ -3,11 +3,21 @@
# cython: embedsignature = True
# cython: language_level = 3
from cpython.exc cimport PyErr_CheckSignals
import numpy
import time
import logging
import os
import sys
from libc.stdint cimport uint8_t, int32_t, int64_t, uint64_t
from libc.stdint cimport (
int32_t,
int64_t,
INT64_MAX,
uint64_t,
uint8_t,
)
from libcpp cimport bool as c_bool
from libcpp.memory cimport (
dynamic_pointer_cast,
@@ -28,6 +38,7 @@ from ray.includes.common cimport (
CRayStatus,
CGcsClientOptions,
CTaskArg,
CTaskType,
CRayFunction,
LocalMemoryBuffer,
move,
@@ -35,6 +46,9 @@ from ray.includes.common cimport (
LANGUAGE_JAVA,
LANGUAGE_PYTHON,
LocalMemoryBuffer,
TASK_TYPE_NORMAL_TASK,
TASK_TYPE_ACTOR_CREATION_TASK,
TASK_TYPE_ACTOR_TASK,
WORKER_TYPE_WORKER,
WORKER_TYPE_DRIVER,
)
@@ -42,10 +56,10 @@ from ray.includes.libraylet cimport (
CRayletClient,
GCSProfileEvent,
GCSProfileTableData,
ResourceMappingType,
WaitResultPair,
)
from ray.includes.unique_ids cimport (
CActorID,
CActorCheckpointID,
CObjectID,
CClientID,
@@ -54,12 +68,22 @@ from ray.includes.libcoreworker cimport (
CActorCreationOptions,
CCoreWorker,
CTaskOptions,
ResourceMappingType,
)
from ray.includes.task cimport CTaskSpec
from ray.includes.ray_config cimport RayConfig
import ray
import ray.experimental.signal as ray_signal
import ray.ray_constants as ray_constants
from ray import profiling
from ray.exceptions import RayletError, ObjectStoreFullError
from ray.exceptions import (
RayError,
RayletError,
RayTaskError,
ObjectStoreFullError
)
from ray.function_manager import FunctionDescriptor
from ray.utils import decode
from ray.ray_constants import (
DEFAULT_PUT_OBJECT_DELAY,
@@ -105,9 +129,30 @@ cdef int check_status(const CRayStatus& status) nogil except -1:
if status.IsObjectStoreFull():
raise ObjectStoreFullError(message)
elif status.IsInterrupted():
raise KeyboardInterrupt()
else:
raise RayletError(message)
cdef RayObjectsToDataMetadataPairs(
        const c_vector[shared_ptr[CRayObject]] objects):
    """Convert a vector of CRayObject pointers to (data, metadata) tuples.

    Returns:
        A list with one tuple per input object, in input order. A null
        pointer yields (None, None). Otherwise the data slot is a Buffer when
        the object has data and the metadata slot is a bytes object when the
        object has metadata; a missing part is None.
    """
    pairs = []
    for idx in range(objects.size()):
        # core_worker will return a nullptr for objects that couldn't be
        # retrieved from the store or if an object was an exception.
        if not objects[idx].get():
            pairs.append((None, None))
            continue

        payload = None
        meta = None
        if objects[idx].get().HasData():
            payload = Buffer.make(objects[idx].get().GetData())
        if objects[idx].get().HasMetadata():
            meta = Buffer.make(
                objects[idx].get().GetMetadata()).to_pybytes()
        pairs.append((payload, meta))
    return pairs
cdef VectorToObjectIDs(const c_vector[CObjectID] &object_ids):
result = []
@@ -327,17 +372,6 @@ cdef class RayletClient:
# initialized before the raylet client.
self.client = &core_worker.core_worker.get().GetRayletClient()
def get_task(self):
# Fetch a task specification through the raylet client and wrap it in a
# TaskSpec.
cdef:
unique_ptr[CTaskSpec] task_spec
# GetTask() is called with the GIL released; its status is handed to
# check_status, whose visible branches raise ObjectStoreFullError,
# KeyboardInterrupt or RayletError.
with nogil:
check_status(self.client.GetTask(&task_spec))
return TaskSpec.make(task_spec)
def task_done(self):
# Calls TaskDone() on the raylet client and passes the returned status to
# check_status.
check_status(self.client.TaskDone())
def fetch_or_reconstruct(self, object_ids,
c_bool fetch_only,
TaskID current_task_id=TaskID.nil()):
@@ -345,27 +379,6 @@ cdef class RayletClient:
check_status(self.client.FetchOrReconstruct(
fetch_ids, fetch_only, current_task_id.native()))
def resource_ids(self):
# Copy the raylet client's resource ID mapping into a Python dict that maps
# each decoded key to a list of (first, second) tuples built from the C++
# pair[int64_t, double] entries.
cdef:
ResourceMappingType resource_mapping = (
self.client.GetResourceIDs())
unordered_map[
c_string, c_vector[pair[int64_t, double]]
].iterator iterator = resource_mapping.begin()
c_vector[pair[int64_t, double]] c_value
resources_dict = {}
# Walk the C++ map manually with an iterator.
while iterator != resource_mapping.end():
key = decode(dereference(iterator).first)
c_value = dereference(iterator).second
ids_and_fractions = []
for i in range(c_value.size()):
ids_and_fractions.append(
(c_value[i].first, c_value[i].second))
resources_dict[key] = ids_and_fractions
postincrement(iterator)
return resources_dict
def push_error(self, JobID job_id, error_type, error_message,
double timestamp):
check_status(self.client.PushError(job_id.native(),
@@ -403,6 +416,272 @@ cdef class RayletClient:
def is_worker(self):
# Returns the result of IsWorker() on the underlying raylet client.
return self.client.IsWorker()
cdef deserialize_args(
        const c_vector[shared_ptr[CRayObject]] &c_args,
        const c_vector[CObjectID] &arg_reference_ids):
    """Rebuild the Python call arguments for a task.

    An argument whose reference ID is nil was passed by value and is decoded
    immediately: if its metadata equals RAW_BUFFER_METADATA the buffer itself
    is used, otherwise the payload is unpickled. Every other argument was
    passed by reference; those are collected and deserialized together
    through the global worker, then written back into their original slots.

    Returns:
        The result of ray.signature.recover_args on the argument list.

    Raises:
        RayError: if any deserialized argument is a RayError instance, the
            first such instance is raised.
    """
    cdef:
        c_vector[shared_ptr[CRayObject]] pending_objects

    args = []
    pending_ids = []
    pending_slots = []
    for i in range(c_args.size()):
        if not arg_reference_ids[i].IsNil():
            # Passed by reference: leave a placeholder that is filled in
            # after the batch deserialization below.
            pending_ids.append(
                ObjectID(arg_reference_ids[i].Binary()))
            pending_slots.append(i)
            pending_objects.push_back(c_args[i])
            args.append(None)
            continue

        # Passed by value.
        data = Buffer.make(c_args[i].get().GetData())
        if (c_args[i].get().HasMetadata()
                and Buffer.make(
                    c_args[i].get().GetMetadata()).to_pybytes()
                == RAW_BUFFER_METADATA):
            args.append(data)
        else:
            args.append(pickle.loads(data.to_pybytes()))

    pending_pairs = RayObjectsToDataMetadataPairs(pending_objects)
    resolved = ray.worker.global_worker.deserialize_objects(
        pending_pairs, pending_ids)
    for position, value in enumerate(resolved):
        args[pending_slots[position]] = value

    # A failed upstream task surfaces as a RayError value; propagate it
    # instead of handing it to the function as an argument.
    for value in args:
        if isinstance(value, RayError):
            raise value

    return ray.signature.recover_args(args)
cdef _check_worker_state(worker, CTaskType task_type, JobID job_id):
    """Assert the worker is ready for a new task and record its job ID.

    Checks that the per-task fields (current task ID, task index, put index)
    hold their reset values. For normal and actor-creation tasks the worker's
    current job ID must be nil and is then set to `job_id`; for any other
    task type it must already equal `job_id`.
    """
    assert worker.current_task_id.is_nil()
    assert worker.task_context.task_index == 0
    assert worker.task_context.put_index == 1

    if (<int>task_type != <int>TASK_TYPE_NORMAL_TASK
            and <int>task_type != <int>TASK_TYPE_ACTOR_CREATION_TASK):
        # If this worker is an actor, current_job_id wasn't reset.
        # Check that current task's driver ID equals the previous
        # one.
        assert worker.current_job_id == job_id
        return

    # If this worker is not an actor, check that `current_job_id`
    # was reset when the worker finished the previous task.
    assert worker.current_job_id.is_nil()
    # Set the driver ID of the current running task. This is
    # needed so that if the task throws an exception, we propagate
    # the error message to the correct driver.
    worker.current_job_id = job_id
cdef _store_task_outputs(worker, return_ids, outputs):
    """Store each task output under its corresponding return ObjectID.

    Args:
        worker: The worker whose `put_object` / `core_worker` are used.
        return_ids: The ObjectIDs to store under, in order.
        outputs: The values to store; `outputs[i]` belongs to
            `return_ids[i]`. It is only indexed, never iterated, so it is
            not touched at all when `return_ids` is empty.

    Raises:
        Exception: If an output is an actor handle.
        RuntimeError: If an output is `NoReturn` but no object exists for
            the corresponding return ID.
    """
    for i, return_id in enumerate(return_ids):
        output = outputs[i]
        if isinstance(output, ray.actor.ActorHandle):
            raise Exception("Returning an actor handle from a remote "
                            "function is not allowed.")
        if output is ray.experimental.no_return.NoReturn:
            # NoReturn means the task already stored this object itself;
            # only verify that it really exists.
            if not worker.core_worker.object_exists(return_id):
                raise RuntimeError(
                    "Attempting to return 'ray.experimental.NoReturn' "
                    "from a remote function, but the corresponding "
                    "ObjectID does not exist in the local object store.")
        else:
            worker.put_object(return_id, output)
cdef execute_task(
CTaskType task_type,
const CRayFunction &ray_function,
const CJobID &c_job_id,
const CActorID &c_actor_id,
const unordered_map[c_string, double] &c_resources,
const c_vector[shared_ptr[CRayObject]] &c_args,
const c_vector[CObjectID] &c_arg_reference_ids,
const c_vector[CObjectID] &c_return_ids,
c_vector[shared_ptr[CRayObject]] *returns):
# Execute one task on the global worker: validate worker state, resolve the
# function (and actor, if any), deserialize the arguments, run the function,
# store its outputs under the return IDs, and reset the worker state.
# An Exception raised along the way is converted to a RayTaskError that is
# stored under every return ID and reported to the driver.
# Note: the `returns` parameter is not referenced in this body.
worker = ray.worker.global_worker
actor_id = ActorID(c_actor_id.Binary())
job_id = JobID(c_job_id.Binary())
task_id = worker.core_worker.get_current_task_id()
# Check that the worker is in the expected state to execute the task.
_check_worker_state(worker, task_type, job_id)
worker.task_context.current_task_id = task_id
# Automatically restrict the GPUs available to this task.
ray.utils.set_cuda_visible_devices(ray.get_gpu_ids())
function_descriptor = FunctionDescriptor.from_bytes_list(
ray_function.GetFunctionDescriptor())
# Actor creation: load the actor class, allocate the instance with __new__,
# and start its checkpoint bookkeeping with the current time in milliseconds.
if <int>task_type == <int>TASK_TYPE_ACTOR_CREATION_TASK:
worker.actor_id = actor_id
actor_class = worker.function_actor_manager.load_actor_class(
job_id, function_descriptor)
worker.actors[actor_id] = actor_class.__new__(actor_class)
worker.actor_checkpoint_info[actor_id] = (
ray.worker.ActorCheckpointInfo(
num_tasks_since_last_checkpoint=0,
last_checkpoint_timestamp=int(1000 * time.time()),
checkpoint_ids=[]))
execution_info = worker.function_actor_manager.get_execution_info(
job_id, function_descriptor)
function_name = execution_info.function_name
# Extra data attached to the "task" profiling event.
extra_data = {"name": function_name, "task_id": task_id.hex()}
# Choose the process titles and the callable to run. Normal tasks call the
# function directly; otherwise the actor instance is looked up and passed as
# the first argument by the function_executor wrapper defined below.
if <int>task_type == <int>TASK_TYPE_NORMAL_TASK:
title = "ray_worker:{}()".format(function_name)
next_title = "ray_worker"
function_executor = execution_info.function
else:
actor = worker.actors[actor_id]
class_name = actor.__class__.__name__
title = "ray_{}:{}()".format(class_name, function_name)
next_title = "ray_{}".format(class_name)
worker_name = "ray_{}_{}".format(class_name, os.getpid())
# Apply the "memory" and "object_store_memory" entries of c_resources, when
# present, after converting them with ray_constants.from_memory_units.
if c_resources.find(b"memory") != c_resources.end():
worker.memory_monitor.set_heap_limit(
worker_name,
ray_constants.from_memory_units(
dereference(c_resources.find(b"memory")).second))
if c_resources.find(b"object_store_memory") != c_resources.end():
worker._set_object_store_client_options(
worker_name,
int(ray_constants.from_memory_units(
dereference(
c_resources.find(b"object_store_memory")).second)))
def function_executor(*arguments, **kwarguments):
return execution_info.function(actor, *arguments, **kwarguments)
return_ids = VectorToObjectIDs(c_return_ids)
with profiling.profile("task", extra_data=extra_data):
try:
# task_exception is True only while the user function itself is running; it
# is forwarded to format_error_message in the except handler.
task_exception = False
# The actor-init-error and low-memory checks are skipped for the
# "__ray_terminate__" actor task.
if not (<int>task_type == <int>TASK_TYPE_ACTOR_TASK
and function_name == "__ray_terminate__"):
worker.reraise_actor_init_error()
worker.memory_monitor.raise_if_low_memory()
with profiling.profile("task:deserialize_arguments"):
args, kwargs = deserialize_args(c_args, c_arg_reference_ids)
# Execute the task.
with ray.worker._changeproctitle(title, next_title):
with profiling.profile("task:execute"):
task_exception = True
outputs = function_executor(*args, **kwargs)
task_exception = False
# With exactly one return ID the result is wrapped in a 1-tuple so that
# _store_task_outputs can index it.
if len(return_ids) == 1:
outputs = (outputs,)
# Store the outputs in the object store.
with profiling.profile("task:store_outputs"):
_store_task_outputs(worker, return_ids, outputs)
except Exception as error:
if (<int>task_type == <int>TASK_TYPE_ACTOR_CREATION_TASK):
worker.mark_actor_init_failed(error)
backtrace = ray.utils.format_error_message(
traceback.format_exc(), task_exception=task_exception)
if isinstance(error, RayTaskError):
# Avoid recursive nesting of RayTaskError.
failure_object = RayTaskError(function_name, backtrace,
error.cause_cls)
else:
failure_object = RayTaskError(function_name, backtrace,
error.__class__)
# The same failure object is stored under every return ID.
_store_task_outputs(
worker, return_ids, [failure_object] * len(return_ids))
ray.utils.push_error_to_driver(
worker,
ray_constants.TASK_PUSH_ERROR,
str(failure_object),
job_id=worker.current_job_id)
# Send signal with the error.
ray_signal.send(ray_signal.ErrorSignal(str(failure_object)))
# Reset the state fields so the next task can run.
worker.task_context.current_task_id = TaskID.nil()
worker.core_worker.set_current_task_id(TaskID.nil())
worker.task_context.task_index = 0
worker.task_context.put_index = 1
# Don't need to reset `current_job_id` if the worker is an
# actor. Because the following tasks should all have the
# same driver id.
if <int>task_type == <int>TASK_TYPE_NORMAL_TASK:
worker.current_job_id = JobID.nil()
worker.core_worker.set_current_job_id(JobID.nil())
# Reset signal counters so that the next task can get
# all past signals.
ray_signal.reset()
# Reset the state of the worker for the next task to execute.
# Increase the task execution counter.
worker.function_actor_manager.increase_task_counter(
job_id, function_descriptor)
# If we've reached the max number of executions for this worker, exit.
reached_max_executions = (
worker.function_actor_manager.get_task_counter(
job_id, function_descriptor) == execution_info.max_calls)
if reached_max_executions:
worker.core_worker.disconnect()
sys.exit(0)
cdef CRayStatus task_execution_handler(
        CTaskType task_type,
        const CRayFunction &ray_function,
        const CJobID &c_job_id,
        const CActorID &c_actor_id,
        const unordered_map[c_string, double] &c_resources,
        const c_vector[shared_ptr[CRayObject]] &c_args,
        const c_vector[CObjectID] &c_arg_reference_ids,
        const c_vector[CObjectID] &c_return_ids,
        c_vector[shared_ptr[CRayObject]] *returns) nogil:
    # Task-execution callback invoked without the GIL. It re-acquires the GIL,
    # forwards every argument unchanged to execute_task, and returns an OK
    # status. If execute_task raises an Exception, the traceback is pushed to
    # the driver as a "worker_crash" error and the process exits with
    # status 1.
    with gil:
        try:
            # The call to execute_task should never raise an exception. If it
            # does, that indicates that there was an unexpected internal error.
            execute_task(task_type, ray_function, c_job_id,
                         c_actor_id, c_resources, c_args,
                         c_arg_reference_ids, c_return_ids, returns)
        except Exception:
            # The trailing space after "was" keeps the two adjacent literals
            # from being concatenated into "wasexecuting".
            traceback_str = traceback.format_exc() + (
                "An unexpected internal error occurred while the worker was "
                "executing a task.")
            ray.utils.push_error_to_driver(
                ray.worker.global_worker,
                "worker_crash",
                traceback_str,
                job_id=None)
            # TODO(rkn): Note that if the worker was in the middle of executing
            # a task, then any worker or driver that is blocking in a get call
            # and waiting for the output of that task will hang. We need to
            # address this.
            sys.exit(1)

    return CRayStatus.OK()
cdef CRayStatus check_signals() nogil:
# Signal-check callback invoked without the GIL. It re-acquires the GIL and
# calls PyErr_CheckSignals(); a KeyboardInterrupt raised there is reported
# as an Interrupted status (with an empty message), otherwise OK is returned.
with gil:
try:
PyErr_CheckSignals()
except KeyboardInterrupt:
return CRayStatus.Interrupted(b"")
return CRayStatus.OK()
cdef class CoreWorker:
cdef unique_ptr[CCoreWorker] core_worker
@@ -419,12 +698,20 @@ cdef class CoreWorker:
LANGUAGE_PYTHON, store_socket.encode("ascii"),
raylet_socket.encode("ascii"), job_id.native(),
gcs_options.native()[0], log_dir.encode("utf-8"),
node_ip_address.encode("utf-8"), NULL, False))
node_ip_address.encode("utf-8"), task_execution_handler,
check_signals, False))
def disconnect(self):
# Calls Disconnect() on the underlying C++ core worker with the GIL
# released.
with nogil:
self.core_worker.get().Disconnect()
def run_task_loop(self):
# Calls Execution().Run() on the underlying C++ core worker with the GIL
# released.
# NOTE(review): presumably this blocks while the C++ event loop dispatches
# tasks to task_execution_handler — confirm against the C++ implementation.
with nogil:
self.core_worker.get().Execution().Run()
def get_current_task_id(self):
# Wraps the binary form of the C++ core worker's current task ID in a
# Python TaskID.
return TaskID(self.core_worker.get().GetCurrentTaskId().Binary())
def set_current_task_id(self, TaskID task_id):
cdef:
CTaskID c_task_id = task_id.native()
@@ -432,15 +719,8 @@ cdef class CoreWorker:
with nogil:
self.core_worker.get().SetCurrentTaskId(c_task_id)
def set_actor_id(self, ActorID actor_id):
# Converts the Python ActorID to its native form while the GIL is held,
# then passes it to SetActorId() on the C++ core worker with the GIL
# released.
cdef:
CActorID c_actor_id = actor_id.native()
with nogil:
self.core_worker.get().SetActorId(c_actor_id)
def get_current_task_id(self):
# Wraps the binary form of the C++ core worker's current task ID in a
# Python TaskID.
return TaskID(self.core_worker.get().GetCurrentTaskId().Binary())
def get_current_job_id(self):
# Wraps the binary form of the C++ core worker's current job ID in a
# Python JobID.
return JobID(self.core_worker.get().GetCurrentJobId().Binary())
def set_current_job_id(self, JobID job_id):
cdef:
@@ -449,7 +729,8 @@ cdef class CoreWorker:
with nogil:
self.core_worker.get().SetCurrentJobId(c_job_id)
def get_objects(self, object_ids, TaskID current_task_id):
def get_objects(self, object_ids, TaskID current_task_id,
int64_t timeout_ms=-1):
cdef:
c_vector[shared_ptr[CRayObject]] results
CTaskID c_task_id = current_task_id.native()
@@ -457,25 +738,9 @@ cdef class CoreWorker:
with nogil:
check_status(self.core_worker.get().Objects().Get(
c_object_ids, -1, &results))
c_object_ids, timeout_ms, &results))
data_metadata_pairs = []
for result in results:
# core_worker will return a nullptr for objects that couldn't be
# retrieved from the store or if an object was an exception.
if not result.get():
data_metadata_pairs.append((None, None))
else:
data = None
metadata = None
if result.get().HasData():
data = Buffer.make(result.get().GetData())
if result.get().HasMetadata():
metadata = Buffer.make(
result.get().GetMetadata()).to_pybytes()
data_metadata_pairs.append((data, metadata))
return data_metadata_pairs
return RayObjectsToDataMetadataPairs(results)
def object_exists(self, ObjectID object_id):
cdef:
@@ -570,7 +835,7 @@ cdef class CoreWorker:
with nogil:
check_status(self.core_worker.get().Objects().Seal(c_object_id))
def wait(self, object_ids, int num_returns, int64_t timeout_milliseconds,
def wait(self, object_ids, int num_returns, int64_t timeout_ms,
TaskID current_task_id):
cdef:
WaitResultPair result
@@ -581,7 +846,7 @@ cdef class CoreWorker:
wait_ids = ObjectIDsToVector(object_ids)
with nogil:
check_status(self.core_worker.get().Objects().Wait(
wait_ids, num_returns, timeout_milliseconds, &results))
wait_ids, num_returns, timeout_ms, &results))
assert len(results) == len(object_ids)
@@ -704,6 +969,28 @@ cdef class CoreWorker:
return VectorToObjectIDs(return_ids)
def resource_ids(self):
    """Return the core worker's resource ID mapping as a Python dict.

    Returns:
        A dict mapping each decoded resource key to a list of
        (int64, double) tuples copied from the C++ pairs for that key.
    """
    cdef:
        ResourceMappingType mapping = (
            self.core_worker.get().GetResourceIDs())
        unordered_map[
            c_string, c_vector[pair[int64_t, double]]
        ].iterator entry = mapping.begin()
        c_vector[pair[int64_t, double]] id_pairs

    result = {}
    # Walk the C++ map manually with an iterator.
    while entry != mapping.end():
        name = decode(dereference(entry).first)
        id_pairs = dereference(entry).second
        result[name] = [
            (id_pairs[i].first, id_pairs[i].second)
            for i in range(id_pairs.size())]
        postincrement(entry)

    return result
def profile_event(self, event_type, dict extra_data):
cdef:
c_string c_event_type = event_type.encode("ascii")
+2 -2
View File
@@ -199,8 +199,8 @@ class PlasmaEventHandler:
del self._waiting_dict[fut.object_id]
def _complete_future(self, fut):
obj = self._worker.retrieve_and_deserialize(
[ray.ObjectID(fut.object_id.binary())], 0)[0]
obj = self._worker.get_objects([ray.ObjectID(
fut.object_id.binary())])[0]
fut.set_result(obj)
def as_future(self, object_id, check_ready=True):
+3 -4
View File
@@ -69,11 +69,10 @@ def send(signal):
Args:
signal: Signal to be sent.
"""
if hasattr(ray.worker.global_worker, "actor_creation_task_id"):
source_key = ray.worker.global_worker.actor_id.hex()
else:
# No actors; this function must have been called from a task
if ray.worker.global_worker.actor_id.is_nil():
source_key = ray.worker.global_worker.current_task_id.hex()
else:
source_key = ray.worker.global_worker.actor_id.hex()
encoded_signal = ray.utils.binary_to_hex(cloudpickle.dumps(signal))
ray.worker.global_worker.redis_client.execute_command(
+1 -1
View File
@@ -763,7 +763,7 @@ class FunctionActorManager(object):
worker's internal state to record the executed method.
"""
def actor_method_executor(dummy_return_id, actor, *args, **kwargs):
def actor_method_executor(actor, *args, **kwargs):
# Update the actor's task counter to reflect the task we're about
# to execute.
self._worker.actor_task_counter += 1
+22 -10
View File
@@ -47,31 +47,34 @@ cdef extern from "ray/common/status.h" namespace "ray" nogil:
CRayStatus OK()
@staticmethod
CRayStatus OutOfMemory()
CRayStatus OutOfMemory(const c_string &msg)
@staticmethod
CRayStatus KeyError()
CRayStatus KeyError(const c_string &msg)
@staticmethod
CRayStatus Invalid()
CRayStatus Invalid(const c_string &msg)
@staticmethod
CRayStatus IOError()
CRayStatus IOError(const c_string &msg)
@staticmethod
CRayStatus TypeError()
CRayStatus TypeError(const c_string &msg)
@staticmethod
CRayStatus UnknownError()
CRayStatus UnknownError(const c_string &msg)
@staticmethod
CRayStatus NotImplemented()
CRayStatus NotImplemented(const c_string &msg)
@staticmethod
CRayStatus RedisError()
CRayStatus ObjectStoreFull(const c_string &msg)
@staticmethod
CRayStatus ObjectStoreFull()
CRayStatus RedisError(const c_string &msg)
@staticmethod
CRayStatus Interrupted(const c_string &msg)
c_bool ok()
c_bool IsOutOfMemory()
@@ -81,8 +84,9 @@ cdef extern from "ray/common/status.h" namespace "ray" nogil:
c_bool IsTypeError()
c_bool IsUnknownError()
c_bool IsNotImplemented()
c_bool IsRedisError()
c_bool IsObjectStoreFull()
c_bool IsRedisError()
c_bool IsInterrupted()
c_string ToString()
c_string CodeAsString()
@@ -92,6 +96,7 @@ cdef extern from "ray/common/status.h" namespace "ray" nogil:
# We can later add more of the common status factory methods as needed
cdef CRayStatus RayStatus_OK "Status::OK"()
cdef CRayStatus RayStatus_Invalid "Status::Invalid"()
cdef CRayStatus RayStatus_NotImplemented "Status::NotImplemented"()
cdef extern from "ray/common/status.h" namespace "ray::StatusCode" nogil:
@@ -117,6 +122,8 @@ cdef extern from "ray/protobuf/common.pb.h" nogil:
pass
cdef cppclass CWorkerType "ray::WorkerType":
pass
cdef cppclass CTaskType "ray::TaskType":
pass
# This is a workaround for C++ enum class since Cython has no corresponding
@@ -130,6 +137,11 @@ cdef extern from "ray/protobuf/common.pb.h" nogil:
cdef CWorkerType WORKER_TYPE_WORKER "ray::WorkerType::WORKER"
cdef CWorkerType WORKER_TYPE_DRIVER "ray::WorkerType::DRIVER"
cdef extern from "ray/protobuf/common.pb.h" nogil:
cdef CTaskType TASK_TYPE_NORMAL_TASK "ray::TaskType::NORMAL_TASK"
cdef CTaskType TASK_TYPE_ACTOR_CREATION_TASK "ray::TaskType::ACTOR_CREATION_TASK" # noqa: E501
cdef CTaskType TASK_TYPE_ACTOR_TASK "ray::TaskType::ACTOR_TASK"
cdef extern from "ray/common/task/scheduling_resources.h" nogil:
cdef cppclass ResourceSet "ray::ResourceSet":
+38 -3
View File
@@ -1,7 +1,13 @@
# cython: profile = False
# distutils: language = c++
# cython: embedsignature = True
from libc.stdint cimport int64_t
from libcpp cimport bool as c_bool
from libcpp.memory cimport shared_ptr, unique_ptr
from libcpp.string cimport string as c_string
from libcpp.unordered_map cimport unordered_map
from libcpp.utility cimport pair
from libcpp.vector cimport vector as c_vector
from ray.includes.unique_ids cimport (
@@ -18,12 +24,30 @@ from ray.includes.common cimport (
CRayStatus,
CTaskArg,
CTaskOptions,
CTaskType,
CWorkerType,
CLanguage,
CGcsClientOptions,
)
from ray.includes.task cimport CTaskSpec
from ray.includes.libraylet cimport CRayletClient
ctypedef unordered_map[c_string, c_vector[pair[int64_t, double]]] \
ResourceMappingType
cdef extern from "ray/core_worker/task_execution.h" namespace "ray" nogil:
cdef cppclass CTaskExecutionInterface "CoreWorkerTaskExecutionInterface":
void Run()
void Stop()
cdef extern from "ray/core_worker/profiling.h" nogil:
cdef cppclass CProfiler "ray::worker::Profiler":
void Start()
cdef cppclass CProfileEvent "ray::worker::ProfileEvent":
CProfileEvent(const shared_ptr[CProfiler] profiler,
const c_string &event_type)
void SetExtraData(const c_string &extra_data)
cdef extern from "ray/core_worker/profiling.h" nogil:
cdef cppclass CProfileEvent "ray::worker::ProfileEvent":
@@ -54,12 +78,23 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
const c_string &raylet_socket, const CJobID &job_id,
const CGcsClientOptions &gcs_options,
const c_string &log_dir, const c_string &node_ip_address,
void* execution_callback,
CRayStatus (
CTaskType task_type,
const CRayFunction &ray_function,
const CJobID &job_id,
const CActorID &actor_id,
const unordered_map[c_string, double] &resources,
const c_vector[shared_ptr[CRayObject]] &args,
const c_vector[CObjectID] &arg_reference_ids,
const c_vector[CObjectID] &return_ids,
c_vector[shared_ptr[CRayObject]] *returns) nogil,
CRayStatus() nogil,
c_bool use_memory_store_)
void Disconnect()
CWorkerType &GetWorkerType()
CLanguage &GetLanguage()
CObjectInterface &Objects()
CTaskExecutionInterface &Execution()
CRayStatus SubmitTask(
const CRayFunction &function, const c_vector[CTaskArg] &args,
@@ -72,7 +107,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
const c_vector[CTaskArg] &args, const CTaskOptions &options,
c_vector[CObjectID] *return_ids)
# CTaskExecutionInterface &Execution()
unique_ptr[CProfileEvent] CreateProfileEvent(
const c_string &event_type)
@@ -81,12 +115,13 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
CRayletClient &GetRayletClient()
# TODO(edoakes): remove these once the Python core worker uses the task
# interfaces
CJobID GetCurrentJobId()
void SetCurrentJobId(const CJobID &job_id)
CTaskID GetCurrentTaskId()
void SetCurrentTaskId(const CTaskID &task_id)
void SetActorId(const CActorID &actor_id)
const CActorID &GetActorId()
CTaskID GetCallerId()
const ResourceMappingType &GetResourceIDs() const
CActorID DeserializeAndRegisterActorHandle(const c_string &bytes)
CRayStatus SerializeActorHandle(const CActorID &actor_id, c_string
*bytes)
-4
View File
@@ -3,7 +3,6 @@ from libcpp cimport bool as c_bool
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string as c_string
from libcpp.utility cimport pair
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector as c_vector
from ray.includes.common cimport (
@@ -38,8 +37,6 @@ cdef extern from "ray/protobuf/gcs.pb.h" nogil:
GCSProfileTableData()
ctypedef unordered_map[c_string, c_vector[pair[int64_t, double]]] \
ResourceMappingType
ctypedef pair[c_vector[CObjectID], c_vector[CObjectID]] WaitResultPair
@@ -78,4 +75,3 @@ cdef extern from "ray/raylet/raylet_client.h" nogil:
CWorkerID GetWorkerID() const
CJobID GetJobID() const
c_bool IsWorker() const
const ResourceMappingType &GetResourceIDs() const
+11 -17
View File
@@ -14,12 +14,6 @@ cdef class TaskSpec:
cdef:
unique_ptr[CTaskSpec] task_spec
@staticmethod
cdef make(unique_ptr[CTaskSpec]& task_spec):
# Builds a TaskSpec via __new__ and takes over the C++ object: release()
# detaches it from the caller's unique_ptr and reset() stores it in the new
# instance, so the caller's pointer no longer owns it afterwards.
cdef TaskSpec self = TaskSpec.__new__(TaskSpec)
self.task_spec.reset(task_spec.release())
return self
@staticmethod
def from_string(const c_string& task_spec_str):
"""Convert a string to a Ray task specification Python object.
@@ -82,23 +76,23 @@ cdef class TaskSpec:
def arguments(self):
"""Return the arguments for the task."""
cdef:
CTaskSpec*task_spec = self.task_spec.get()
int64_t num_args = task_spec.NumArgs()
int32_t lang = <int32_t>task_spec.GetLanguage()
int64_t num_args = self.task_spec.get().NumArgs()
int32_t lang = <int32_t>self.task_spec.get().GetLanguage()
int count
arg_list = []
if lang == <int32_t>LANGUAGE_PYTHON:
for i in range(num_args):
count = task_spec.ArgIdCount(i)
count = self.task_spec.get().ArgIdCount(i)
if count > 0:
assert count == 1
arg_list.append(
ObjectID(task_spec.ArgId(i, 0).Binary()))
ObjectID(self.task_spec.get().ArgId(i, 0).Binary()))
else:
data = task_spec.ArgData(i)[:task_spec.ArgDataSize(i)]
metadata = task_spec.ArgMetadata(i)[
:task_spec.ArgMetadataSize(i)]
data = self.task_spec.get().ArgData(i)[
:self.task_spec.get().ArgDataSize(i)]
metadata = self.task_spec.get().ArgMetadata(i)[
:self.task_spec.get().ArgMetadataSize(i)]
if metadata == RAW_BUFFER_METADATA:
obj = data
else:
@@ -111,10 +105,10 @@ cdef class TaskSpec:
def returns(self):
"""Return the object IDs for the return values of the task."""
cdef CTaskSpec *task_spec = self.task_spec.get()
return_id_list = []
for i in range(task_spec.NumReturns()):
return_id_list.append(ObjectID(task_spec.ReturnId(i).Binary()))
for i in range(self.task_spec.get().NumReturns()):
return_id_list.append(
ObjectID(self.task_spec.get().ReturnId(i).Binary()))
return return_id_list
def required_resources(self):
+5 -1
View File
@@ -505,6 +505,10 @@ class GlobalState(object):
node_ip_address = profile_table_message.node_ip_address
for profile_event_message in profile_table_message.profile_events:
try:
extra_data = json.loads(profile_event_message.extra_data)
except ValueError:
extra_data = {}
profile_event = {
"event_type": profile_event_message.event_type,
"component_id": component_id,
@@ -512,7 +516,7 @@ class GlobalState(object):
"component_type": component_type,
"start_time": profile_event_message.start_time,
"end_time": profile_event_message.end_time,
"extra_data": json.loads(profile_event_message.extra_data),
"extra_data": extra_data
}
profile_events.append(profile_event)
+1 -1
View File
@@ -106,7 +106,7 @@ class Cluster(object):
return node
def remove_node(self, node, allow_graceful=False):
def remove_node(self, node, allow_graceful=True):
"""Kills all processes associated with worker node.
Args:
-1
View File
@@ -47,4 +47,3 @@ def test_raylet_gdb(ray_gdb_start):
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
assert pgrep_command.communicate()[0]
subprocess.call(["pkill", "-f", "gdb.*{}".format(process_name)])
+9 -6
View File
@@ -292,7 +292,7 @@ def test_incorrect_method_calls(ray_start_regular):
def test_worker_raising_exception(ray_start_regular):
@ray.remote
def f():
ray.worker.global_worker._get_next_task_from_raylet = None
ray.worker.global_worker.function_actor_manager = None
# Running this task should cause the worker to raise an exception after
# the task has successfully completed.
@@ -618,12 +618,17 @@ def test_warning_for_too_many_nested_tasks(shutdown_only):
time.sleep(1000)
return 1
@ray.remote
def h():
time.sleep(1)
ray.get(f.remote())
@ray.remote
def g():
# Sleep so that the f tasks all get submitted to the scheduler after
# the g tasks.
time.sleep(1)
ray.get(f.remote())
ray.get(h.remote())
[g.remote() for _ in range(num_cpus * 4)]
wait_for_errors(ray_constants.WORKER_POOL_LARGE_ERROR, 1)
@@ -705,8 +710,6 @@ def test_warning_for_dead_node(ray_start_cluster_2_nodes):
def test_raylet_crash_when_get(ray_start_regular):
nonexistent_id = ray.ObjectID.from_random()
def sleep_to_kill_raylet():
# Don't kill raylet before default workers get connected.
time.sleep(2)
@@ -715,14 +718,14 @@ def test_raylet_crash_when_get(ray_start_regular):
thread = threading.Thread(target=sleep_to_kill_raylet)
thread.start()
with pytest.raises(ray.exceptions.UnreconstructableError):
ray.get(nonexistent_id)
ray.get(ray.ObjectID.from_random())
thread.join()
def test_connect_with_disconnected_node(shutdown_only):
config = json.dumps({
"num_heartbeats_timeout": 50,
"heartbeat_timeout_milliseconds": 10,
"raylet_heartbeat_timeout_milliseconds": 10,
})
cluster = Cluster()
cluster.add_node(num_cpus=0, _internal_config=config)
+1 -1
View File
@@ -52,7 +52,7 @@ def test_internal_config(ray_start_cluster_head):
worker = cluster.add_node()
cluster.wait_for_nodes()
cluster.remove_node(worker)
cluster.remove_node(worker, allow_graceful=False)
time.sleep(1)
assert ray.cluster_resources()["CPU"] == 2
-86
View File
@@ -1,86 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pytest
import ray
import ray.exceptions
import ray.experimental.no_return
import ray.worker
def test_set_single_output(ray_start_regular):
# A task that writes 123 directly to its own return object and then returns
# NoReturn must still yield 123 to the caller.
@ray.remote
def f():
return_object_ids = ray.worker.global_worker._current_task.returns()
ray.worker.global_worker.put_object(return_object_ids[0], 123)
return ray.experimental.no_return.NoReturn
assert ray.get(f.remote()) == 123
def test_set_multiple_outputs(ray_start_regular):
# For each of three return values the task either writes True directly to
# the return object and returns NoReturn for it, or returns False normally.
# Every combination of the three flags is checked.
@ray.remote(num_return_vals=3)
def f(set_out0, set_out1, set_out2):
returns = []
return_object_ids = ray.worker.global_worker._current_task.returns()
for i, set_out in enumerate([set_out0, set_out1, set_out2]):
if set_out:
ray.worker.global_worker.put_object(return_object_ids[i], True)
returns.append(ray.experimental.no_return.NoReturn)
else:
returns.append(False)
return tuple(returns)
# The value fetched for each output equals its flag: True when it was
# written directly, False when it was returned.
for set_out0 in [True, False]:
for set_out1 in [True, False]:
for set_out2 in [True, False]:
result_object_ids = f.remote(set_out0, set_out1, set_out2)
assert ray.get(result_object_ids) == [
set_out0, set_out1, set_out2
]
def test_set_actor_method(ray_start_regular):
# Same check as for a plain task, but from an actor method: the method
# writes 123 to its return object itself and returns NoReturn.
@ray.remote
class Actor(object):
def __init__(self):
pass
def ping(self):
return_object_ids = ray.worker.global_worker._current_task.returns(
)
ray.worker.global_worker.put_object(return_object_ids[0], 123)
return ray.experimental.no_return.NoReturn
actor = Actor.remote()
assert ray.get(actor.ping.remote()) == 123
def test_exception(ray_start_regular):
# A task with two return values writes the first one directly and then
# raises: the first object must hold 123 and fetching the second must raise
# RayTaskError.
@ray.remote(num_return_vals=2)
def f():
return_object_ids = ray.worker.global_worker._current_task.returns()
# The first return value is successfully stored in the object store
ray.worker.global_worker.put_object(return_object_ids[0], 123)
raise Exception("Error")
# The exception is stored at the second return object ID.
# (The return statement below is never reached because of the raise above.)
return ray.experimental.no_return.NoReturn, 456
object_id, exception_id = f.remote()
assert ray.get(object_id) == 123
with pytest.raises(ray.exceptions.RayTaskError):
ray.get(exception_id)
def test_no_set_and_no_return(ray_start_regular):
# Returning NoReturn without having written the return object must fail:
# ray.get raises RayTaskError and the message names the NoReturn misuse.
@ray.remote
def f():
return ray.experimental.no_return.NoReturn
object_id = f.remote()
with pytest.raises(ray.exceptions.RayTaskError) as e:
ray.get(object_id)
assert "Attempting to return 'ray.experimental.NoReturn'" in str(e.value)
+30 -287
View File
@@ -26,7 +26,6 @@ import random
import pyarrow
import pyarrow.plasma as plasma
import ray.cloudpickle as pickle
import ray.experimental.signal as ray_signal
import ray.experimental.no_return
import ray.gcs_utils
import ray.memory_monitor as memory_monitor
@@ -41,7 +40,6 @@ import ray.state
from ray import (
ActorID,
WorkerID,
JobID,
ObjectID,
TaskID,
@@ -60,10 +58,7 @@ from ray.exceptions import (
UnreconstructableError,
RAY_EXCEPTION_TYPES,
)
from ray.function_manager import (
FunctionActorManager,
FunctionDescriptor,
)
from ray.function_manager import FunctionActorManager
from ray.utils import (
_random_string,
check_oversized_pickle,
@@ -156,7 +151,6 @@ class Worker(object):
# Index of the current session. This number will
# increment every time when `ray.shutdown` is called.
self._session_index = 0
self._current_task = None
# Functions to run to process the values returned by ray.get. Each
# postprocessor must take two arguments ("object_ids", and "values").
self._post_get_hooks = []
@@ -473,9 +467,10 @@ class Worker(object):
logger.warning(warning_message)
self.store_and_register(object_id, value)
def retrieve_and_deserialize(self, object_ids, error_timeout=10):
data_metadata_pairs = self.core_worker.get_objects(
object_ids, self.current_task_id)
def deserialize_objects(self,
data_metadata_pairs,
object_ids,
error_timeout=10):
assert len(data_metadata_pairs) == len(object_ids)
start_time = time.time()
@@ -571,9 +566,9 @@ class Worker(object):
if self.mode == LOCAL_MODE:
return self.local_mode_manager.get_objects(object_ids)
results = self.retrieve_and_deserialize(object_ids)
assert len(results) == len(object_ids)
return results
data_metadata_pairs = self.core_worker.get_objects(
object_ids, self.current_task_id)
return self.deserialize_objects(data_metadata_pairs, object_ids)
def run_function_on_all_workers(self, function,
run_on_other_drivers=False):
@@ -679,149 +674,6 @@ class Worker(object):
return ray.signature.recover_args(arguments)
def _store_outputs_in_object_store(self, object_ids, outputs):
    """Store the outputs of a remote function in the local object store.

    Each output is written under the object ID at the same position. An
    output that is ``ray.experimental.no_return.NoReturn`` is not stored;
    instead the corresponding object must already exist in the local
    object store (i.e. the task stored it itself).

    Note:
        The arguments object_ids and outputs should have the same length.
        Extra entries in ``outputs`` are ignored; a shorter ``outputs``
        raises ``IndexError``.

    Args:
        object_ids (List[ObjectID]): The object IDs that were assigned to
            the outputs of the remote function call.
        outputs (Tuple): The value returned by the remote function. If the
            remote function was supposed to only return one value, then its
            output was wrapped in a tuple with one element prior to being
            passed into this function.

    Raises:
        Exception: If an output is an actor handle.
        RuntimeError: If an output is ``NoReturn`` but the corresponding
            object does not exist in the local object store.
    """
    for i, object_id in enumerate(object_ids):
        # Index explicitly rather than zip(): a too-short `outputs` must
        # fail loudly instead of silently leaving return objects unstored.
        output = outputs[i]
        if isinstance(output, ray.actor.ActorHandle):
            raise Exception("Returning an actor handle from a remote "
                            "function is not allowed.")
        if output is ray.experimental.no_return.NoReturn:
            if not self.core_worker.object_exists(object_id):
                raise RuntimeError(
                    "Attempting to return 'ray.experimental.NoReturn' "
                    "from a remote function, but the corresponding "
                    "ObjectID does not exist in the local object store.")
        else:
            self.put_object(object_id, output)
def _process_task(self, task, function_execution_info):
    """Execute a task assigned to this worker.

    The method records the task's job and task IDs on the worker, fetches
    the task's arguments, runs the function, and stores the outputs in the
    local object store. It runs in three stages (argument deserialization,
    execution, output storage). If any stage raises an ``Exception``, the
    failure is handed to ``_handle_process_task_failure`` (which stores
    RayTaskError objects for the return IDs) and, for the first two stages,
    the method returns early.

    Args:
        task: The task to execute.
        function_execution_info: Provides ``function`` (the callable to
            run) and ``function_name``.
    """
    # The previous task must have left the per-task state reset.
    assert self.current_task_id.is_nil()
    assert self.task_context.task_index == 0
    assert self.task_context.put_index == 1
    if not task.is_actor_task():
        # If this worker is not an actor, check that `current_job_id`
        # was reset when the worker finished the previous task.
        assert self.current_job_id.is_nil()
        # Set the driver ID of the current running task. This is
        # needed so that if the task throws an exception, we propagate
        # the error message to the correct driver.
        self.current_job_id = task.job_id()
        self.core_worker.set_current_job_id(task.job_id())
    else:
        # If this worker is an actor, current_job_id wasn't reset.
        # Check that current task's driver ID equals the previous one.
        assert self.current_job_id == task.job_id()
    self.task_context.current_task_id = task.task_id()
    self.core_worker.set_current_task_id(task.task_id())
    function_descriptor = FunctionDescriptor.from_bytes_list(
        task.function_descriptor_list())
    serialized_args = task.arguments()
    return_object_ids = task.returns()
    if task.is_actor_task() or task.is_actor_creation_task():
        # For actor tasks the last return ID is a dummy output. Remove it
        # here so `return_object_ids` only covers the function's own
        # return values; it is passed to the executor below.
        dummy_return_id = return_object_ids.pop()
    function_executor = function_execution_info.function
    function_name = function_execution_info.function_name
    # Get task arguments from the object store.
    try:
        if function_name != "__ray_terminate__":
            self.reraise_actor_init_error()
            self.memory_monitor.raise_if_low_memory()
        with profiling.profile("task:deserialize_arguments"):
            function_args, function_kwargs = (
                self._get_arguments_for_execution(function_name,
                                                  serialized_args))
    except Exception as e:
        self._handle_process_task_failure(
            function_descriptor, return_object_ids, e,
            ray.utils.format_error_message(traceback.format_exc()))
        return
    # Execute the task.
    try:
        # Expose the running task for the duration of the call; it is
        # cleared again in the `finally` clause below.
        self._current_task = task
        with profiling.profile("task:execute"):
            if task.is_normal_task():
                outputs = function_executor(*function_args,
                                            **function_kwargs)
            else:
                # Actor tasks and actor creation tasks look the actor
                # instance up under different IDs.
                if task.is_actor_task():
                    key = task.actor_id()
                else:
                    key = task.actor_creation_id()
                worker_name = "ray_{}_{}".format(
                    self.actors[key].__class__.__name__, os.getpid())
                # Apply the task's memory-related resource requests, if
                # present, before running the method.
                if "memory" in task.required_resources():
                    self.memory_monitor.set_heap_limit(
                        worker_name,
                        ray_constants.from_memory_units(
                            task.required_resources()["memory"]))
                if "object_store_memory" in task.required_resources():
                    self._set_object_store_client_options(
                        worker_name,
                        int(
                            ray_constants.from_memory_units(
                                task.required_resources()[
                                    "object_store_memory"])))
                outputs = function_executor(
                    dummy_return_id, self.actors[key], *function_args,
                    **function_kwargs)
    except Exception as e:
        # Determine whether the exception occurred during a task, not an
        # actor method.
        task_exception = not task.is_actor_task()
        traceback_str = ray.utils.format_error_message(
            traceback.format_exc(), task_exception=task_exception)
        self._handle_process_task_failure(
            function_descriptor, return_object_ids, e, traceback_str)
        return
    finally:
        self._current_task = None
    # Store the outputs in the local object store.
    try:
        with profiling.profile("task:store_outputs"):
            # For actor tasks the dummy return ID was already popped
            # above, so this count only covers the function's own return
            # values. A single return value is wrapped in a tuple so it
            # can be indexed like multiple return values.
            num_returns = len(return_object_ids)
            if num_returns == 1:
                outputs = (outputs, )
            self._store_outputs_in_object_store(return_object_ids, outputs)
    except Exception as e:
        self._handle_process_task_failure(
            function_descriptor, return_object_ids, e,
            ray.utils.format_error_message(traceback.format_exc()))
def _set_object_store_client_options(self, name, object_store_memory):
try:
logger.debug("Setting plasma memory limit to {} for {}".format(
@@ -838,133 +690,15 @@ class Worker(object):
"object store memory status is:\n\n{}".format(
object_store_memory, name, e))
def _handle_process_task_failure(self, function_descriptor,
                                 return_object_ids, error, backtrace):
    """Record a failed task and report the failure.

    A single RayTaskError is stored under every return object ID, the
    error is pushed to the driver of the current job, a failed actor
    ``__init__`` is marked as such, and an error signal is sent.

    Args:
        function_descriptor: Descriptor of the function that failed; only
            its ``function_name`` is used.
        return_object_ids: Object IDs that receive the failure object.
        error: The exception that was raised.
        backtrace: Formatted traceback text for the failure.
    """
    function_name = function_descriptor.function_name

    # Avoid recursive nesting of RayTaskError: reuse the original cause
    # class when the error is already a RayTaskError.
    if isinstance(error, RayTaskError):
        cause_cls = error.cause_cls
    else:
        cause_cls = error.__class__
    failure_object = RayTaskError(function_name, backtrace, cause_cls)

    # Every return slot gets the same failure object.
    failure_objects = [failure_object] * len(return_object_ids)
    self._store_outputs_in_object_store(return_object_ids, failure_objects)

    # Log the error message.
    error_text = str(failure_object)
    ray.utils.push_error_to_driver(
        self,
        ray_constants.TASK_PUSH_ERROR,
        error_text,
        job_id=self.current_job_id)

    # Mark the actor init as failed.
    is_actor = not self.actor_id.is_nil()
    if is_actor and function_name == "__init__":
        self.mark_actor_init_failed(error)

    # Send signal with the error.
    ray_signal.send(ray_signal.ErrorSignal(str(failure_object)))
def _wait_for_and_process_task(self, task):
    """Process a task and reset the per-task worker state afterwards.

    For an actor creation task the actor instance is allocated (without
    running ``__init__``) and its checkpoint bookkeeping is initialised
    before the task is executed. After execution the task/put counters and
    the current task ID are reset, the job ID is reset for non-actor
    workers, signal counters are reset, and the function's execution
    counter is increased. When that counter equals the function's
    ``max_calls`` the worker disconnects and exits the process.

    Args:
        task: The task to execute.
    """
    function_descriptor = FunctionDescriptor.from_bytes_list(
        task.function_descriptor_list())
    job_id = task.job_id()

    # TODO(rkn): It would be preferable for actor creation tasks to share
    # more of the code path with regular task execution.
    if task.is_actor_creation_task():
        # TODO: Remove Worker.actor_id and just use CoreWorker.GetActorId.
        self.actor_id = task.actor_creation_id()
        self.core_worker.set_actor_id(task.actor_creation_id())
        self.actor_creation_task_id = task.task_id()
        actor_class = self.function_actor_manager.load_actor_class(
            job_id, function_descriptor)
        # Allocate the instance only; `__init__` is not called here.
        self.actors[self.actor_id] = actor_class.__new__(actor_class)
        self.actor_checkpoint_info[self.actor_id] = ActorCheckpointInfo(
            num_tasks_since_last_checkpoint=0,
            last_checkpoint_timestamp=int(1000 * time.time()),
            checkpoint_ids=[],
        )

    execution_info = self.function_actor_manager.get_execution_info(
        job_id, function_descriptor)

    # Execute the task.
    function_name = execution_info.function_name
    extra_data = {"name": function_name, "task_id": task.task_id().hex()}

    # `next_title` is the process title outside of the task; while the
    # task runs the title additionally carries the function name.
    if task.is_actor_task():
        actor = self.actors[task.actor_id()]
        next_title = "ray_{}".format(actor.__class__.__name__)
    elif task.is_actor_creation_task():
        actor = self.actors[task.actor_creation_id()]
        next_title = "ray_{}".format(actor.__class__.__name__)
    else:
        next_title = "ray_worker"
    title = "{}:{}()".format(next_title, function_name)

    with profiling.profile("task", extra_data=extra_data):
        with _changeproctitle(title, next_title):
            self._process_task(task, execution_info)
        # Reset the state fields so the next task can run.
        self.task_context.current_task_id = TaskID.nil()
        self.core_worker.set_current_task_id(TaskID.nil())
        self.task_context.task_index = 0
        self.task_context.put_index = 1
        if self.actor_id.is_nil():
            # Don't need to reset `current_job_id` if the worker is an
            # actor. Because the following tasks should all have the
            # same job id.
            # Use a nil JobID (not a WorkerID) so the attribute keeps the
            # same type as `task.job_id()` and as the value handed to the
            # core worker on the next line.
            self.current_job_id = JobID.nil()
            self.core_worker.set_current_job_id(JobID.nil())

        # Reset signal counters so that the next task can get
        # all past signals.
        ray_signal.reset()

    # Increase the task execution counter.
    self.function_actor_manager.increase_task_counter(
        job_id, function_descriptor)

    reached_max_executions = (self.function_actor_manager.get_task_counter(
        job_id, function_descriptor) == execution_info.max_calls)
    if reached_max_executions:
        self.core_worker.disconnect()
        sys.exit(0)
def _get_next_task_from_raylet(self):
    """Fetch the next task from the raylet.

    The call to the raylet is recorded under the "worker_idle" profiling
    event.

    Returns:
        A task from the raylet.
    """
    idle_profile = profiling.profile("worker_idle")
    with idle_profile:
        next_task = self.raylet_client.get_task()

    # Automatically restrict the GPUs available to this task.
    ray.utils.set_cuda_visible_devices(ray.get_gpu_ids())
    return next_task
def main_loop(self):
    """The main loop a worker runs to receive and execute tasks."""

    # NOTE(review): this span defines two SIGTERM handlers and two ways of
    # running tasks (a Python `while True` loop and the core worker's
    # `run_task_loop`). It looks like two revisions of this method merged
    # together - confirm which statements are meant to remain.

    # SIGTERM handler that shuts down and exits with status 0.
    def exit(signum, frame):
        shutdown()
        sys.exit(0)

    # SIGTERM handler that calls shutdown(True) and exits with status 1.
    def sigterm_handler(signum, frame):
        shutdown(True)
        sys.exit(1)

    signal.signal(signal.SIGTERM, exit)
    # Fetch tasks from the raylet and process them one at a time.
    while True:
        task = self._get_next_task_from_raylet()
        self._wait_for_and_process_task(task)
    # NOTE(review): as laid out here the following statements come after
    # an unconditional `while True` loop - verify the intended placement.
    signal.signal(signal.SIGTERM, sigterm_handler)
    self.core_worker.run_task_loop()
def get_gpu_ids():
@@ -982,7 +716,7 @@ def get_gpu_ids():
raise Exception("ray.get_gpu_ids() currently does not work in LOCAL "
"MODE.")
all_resource_ids = global_worker.raylet_client.resource_ids()
all_resource_ids = global_worker.core_worker.resource_ids()
assigned_ids = [
resource_id for resource_id, _ in all_resource_ids.get("GPU", [])
]
@@ -1010,7 +744,7 @@ def get_resource_ids():
"ray.get_resource_ids() currently does not work in LOCAL "
"MODE.")
return global_worker.raylet_client.resource_ids()
return global_worker.core_worker.resource_ids()
def get_webui_url():
@@ -1437,7 +1171,7 @@ def shutdown(exiting_interpreter=False):
# to make sure that log messages finish printing.
time.sleep(0.5)
disconnect()
disconnect(exiting_interpreter)
# Disconnect global state from GCS.
ray.state.state.disconnect()
@@ -1456,6 +1190,13 @@ def shutdown(exiting_interpreter=False):
atexit.register(shutdown, True)
def sigterm_handler(signum, frame):
    """Exit the interpreter with ``signal.SIGTERM`` as the exit status.

    Args:
        signum: Signal number passed by the signal machinery (unused).
        frame: Current stack frame passed by the signal machinery (unused).

    Raises:
        SystemExit: Always, carrying ``signal.SIGTERM`` as its code.
    """
    exit_status = signal.SIGTERM
    raise SystemExit(exit_status)
signal.signal(signal.SIGTERM, sigterm_handler)
# Define a custom excepthook so that if the driver exits with an exception, we
# can push that exception to Redis.
normal_excepthook = sys.excepthook
@@ -1900,7 +1641,7 @@ def connect(node,
worker.cached_functions_to_run = None
def disconnect():
def disconnect(exiting_interpreter=False):
"""Disconnect this worker from the raylet and object store."""
# Reset the list of cached remote functions and actors so that if more
# remote functions or actors are defined and then connect is called again,
@@ -1928,10 +1669,12 @@ def disconnect():
worker.function_actor_manager.reset_cache()
worker.serialization_context_map.clear()
if hasattr(worker, "raylet_client"):
del worker.raylet_client
if hasattr(worker, "core_worker"):
del worker.core_worker
if not exiting_interpreter:
if hasattr(worker, "raylet_client"):
del worker.raylet_client
if hasattr(worker, "core_worker"):
del worker.core_worker
@contextmanager
+1 -27
View File
@@ -3,7 +3,6 @@ from __future__ import division
from __future__ import print_function
import argparse
import traceback
import ray
import ray.actor
@@ -86,30 +85,5 @@ if __name__ == "__main__":
node = ray.node.Node(
ray_params, head=False, shutdown_at_exit=False, connect_only=True)
ray.worker._global_node = node
ray.worker.connect(node, mode=ray.WORKER_MODE)
error_explanation = """
This error is unexpected and should not have happened. Somehow a worker
crashed in an unanticipated way causing the main_loop to throw an exception,
which is being caught in "python/ray/workers/default_worker.py".
"""
try:
# This call to main_loop should never return if things are working.
# Most exceptions that are thrown (e.g., inside the execution of a
# task) should be caught and handled inside of the call to
# main_loop. If an exception is thrown here, then that means that
# there is some error that we didn't anticipate.
ray.worker.global_worker.main_loop()
except Exception:
traceback_str = traceback.format_exc() + error_explanation
ray.utils.push_error_to_driver(
ray.worker.global_worker,
"worker_crash",
traceback_str,
job_id=None)
# TODO(rkn): Note that if the worker was in the middle of executing
# a task, then any worker or driver that is blocking in a get call
# and waiting for the output of that task will hang. We need to
# address this.
ray.worker.global_worker.main_loop()