mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 22:53:20 +08:00
[core worker] Python core worker task execution (#5783)
Executes tasks via the event loop in the C++ core worker. Also properly handles signals (including KeyboardInterrupt), so Ctrl-C in a Python interactive shell now works (when connecting to an existing cluster).
This commit is contained in:
+353
-66
@@ -3,11 +3,21 @@
|
||||
# cython: embedsignature = True
|
||||
# cython: language_level = 3
|
||||
|
||||
from cpython.exc cimport PyErr_CheckSignals
|
||||
|
||||
import numpy
|
||||
import time
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
from libc.stdint cimport uint8_t, int32_t, int64_t, uint64_t
|
||||
from libc.stdint cimport (
|
||||
int32_t,
|
||||
int64_t,
|
||||
INT64_MAX,
|
||||
uint64_t,
|
||||
uint8_t,
|
||||
)
|
||||
from libcpp cimport bool as c_bool
|
||||
from libcpp.memory cimport (
|
||||
dynamic_pointer_cast,
|
||||
@@ -28,6 +38,7 @@ from ray.includes.common cimport (
|
||||
CRayStatus,
|
||||
CGcsClientOptions,
|
||||
CTaskArg,
|
||||
CTaskType,
|
||||
CRayFunction,
|
||||
LocalMemoryBuffer,
|
||||
move,
|
||||
@@ -35,6 +46,9 @@ from ray.includes.common cimport (
|
||||
LANGUAGE_JAVA,
|
||||
LANGUAGE_PYTHON,
|
||||
LocalMemoryBuffer,
|
||||
TASK_TYPE_NORMAL_TASK,
|
||||
TASK_TYPE_ACTOR_CREATION_TASK,
|
||||
TASK_TYPE_ACTOR_TASK,
|
||||
WORKER_TYPE_WORKER,
|
||||
WORKER_TYPE_DRIVER,
|
||||
)
|
||||
@@ -42,10 +56,10 @@ from ray.includes.libraylet cimport (
|
||||
CRayletClient,
|
||||
GCSProfileEvent,
|
||||
GCSProfileTableData,
|
||||
ResourceMappingType,
|
||||
WaitResultPair,
|
||||
)
|
||||
from ray.includes.unique_ids cimport (
|
||||
CActorID,
|
||||
CActorCheckpointID,
|
||||
CObjectID,
|
||||
CClientID,
|
||||
@@ -54,12 +68,22 @@ from ray.includes.libcoreworker cimport (
|
||||
CActorCreationOptions,
|
||||
CCoreWorker,
|
||||
CTaskOptions,
|
||||
ResourceMappingType,
|
||||
)
|
||||
from ray.includes.task cimport CTaskSpec
|
||||
from ray.includes.ray_config cimport RayConfig
|
||||
|
||||
import ray
|
||||
import ray.experimental.signal as ray_signal
|
||||
import ray.ray_constants as ray_constants
|
||||
from ray import profiling
|
||||
from ray.exceptions import RayletError, ObjectStoreFullError
|
||||
from ray.exceptions import (
|
||||
RayError,
|
||||
RayletError,
|
||||
RayTaskError,
|
||||
ObjectStoreFullError
|
||||
)
|
||||
from ray.function_manager import FunctionDescriptor
|
||||
from ray.utils import decode
|
||||
from ray.ray_constants import (
|
||||
DEFAULT_PUT_OBJECT_DELAY,
|
||||
@@ -105,9 +129,30 @@ cdef int check_status(const CRayStatus& status) nogil except -1:
|
||||
|
||||
if status.IsObjectStoreFull():
|
||||
raise ObjectStoreFullError(message)
|
||||
elif status.IsInterrupted():
|
||||
raise KeyboardInterrupt()
|
||||
else:
|
||||
raise RayletError(message)
|
||||
|
||||
cdef RayObjectsToDataMetadataPairs(
        const c_vector[shared_ptr[CRayObject]] objects):
    """Convert C++ ray objects into a list of (data, metadata) tuples.

    Each entry is `(None, None)` when the corresponding pointer is null.
    Otherwise `data` is a `Buffer` (or None when the object has no data)
    and `metadata` is a bytes object (or None when the object has no
    metadata).
    """
    pairs = []
    for idx in range(objects.size()):
        if not objects[idx].get():
            # core_worker will return a nullptr for objects that couldn't
            # be retrieved from the store or if an object was an exception.
            pairs.append((None, None))
            continue

        payload = None
        if objects[idx].get().HasData():
            payload = Buffer.make(objects[idx].get().GetData())

        meta = None
        if objects[idx].get().HasMetadata():
            meta = Buffer.make(
                objects[idx].get().GetMetadata()).to_pybytes()

        pairs.append((payload, meta))
    return pairs
|
||||
|
||||
|
||||
cdef VectorToObjectIDs(const c_vector[CObjectID] &object_ids):
|
||||
result = []
|
||||
@@ -327,17 +372,6 @@ cdef class RayletClient:
|
||||
# initialized before the raylet client.
|
||||
self.client = &core_worker.core_worker.get().GetRayletClient()
|
||||
|
||||
def get_task(self):
|
||||
cdef:
|
||||
unique_ptr[CTaskSpec] task_spec
|
||||
|
||||
with nogil:
|
||||
check_status(self.client.GetTask(&task_spec))
|
||||
return TaskSpec.make(task_spec)
|
||||
|
||||
def task_done(self):
|
||||
check_status(self.client.TaskDone())
|
||||
|
||||
def fetch_or_reconstruct(self, object_ids,
|
||||
c_bool fetch_only,
|
||||
TaskID current_task_id=TaskID.nil()):
|
||||
@@ -345,27 +379,6 @@ cdef class RayletClient:
|
||||
check_status(self.client.FetchOrReconstruct(
|
||||
fetch_ids, fetch_only, current_task_id.native()))
|
||||
|
||||
def resource_ids(self):
|
||||
cdef:
|
||||
ResourceMappingType resource_mapping = (
|
||||
self.client.GetResourceIDs())
|
||||
unordered_map[
|
||||
c_string, c_vector[pair[int64_t, double]]
|
||||
].iterator iterator = resource_mapping.begin()
|
||||
c_vector[pair[int64_t, double]] c_value
|
||||
|
||||
resources_dict = {}
|
||||
while iterator != resource_mapping.end():
|
||||
key = decode(dereference(iterator).first)
|
||||
c_value = dereference(iterator).second
|
||||
ids_and_fractions = []
|
||||
for i in range(c_value.size()):
|
||||
ids_and_fractions.append(
|
||||
(c_value[i].first, c_value[i].second))
|
||||
resources_dict[key] = ids_and_fractions
|
||||
postincrement(iterator)
|
||||
return resources_dict
|
||||
|
||||
def push_error(self, JobID job_id, error_type, error_message,
|
||||
double timestamp):
|
||||
check_status(self.client.PushError(job_id.native(),
|
||||
@@ -403,6 +416,272 @@ cdef class RayletClient:
|
||||
def is_worker(self):
|
||||
return self.client.IsWorker()
|
||||
|
||||
cdef deserialize_args(
        const c_vector[shared_ptr[CRayObject]] &c_args,
        const c_vector[CObjectID] &arg_reference_ids):
    """Turn the C++ task arguments into Python call arguments.

    An argument whose reference ID is nil was passed by value: its data is
    used directly when its metadata equals RAW_BUFFER_METADATA and is
    unpickled otherwise. Every other argument was passed by reference and
    is deserialized in one batch through the global worker.

    Raises the first argument that is a RayError instance. Otherwise
    returns the result of `ray.signature.recover_args` on the positional
    list.
    """
    cdef:
        c_vector[shared_ptr[CRayObject]] by_reference_objects

    args = []
    ref_ids = []
    ref_positions = []
    for pos in range(c_args.size()):
        if not arg_reference_ids[pos].IsNil():
            # Passed by reference: leave a placeholder to fill in below.
            ref_ids.append(ObjectID(arg_reference_ids[pos].Binary()))
            ref_positions.append(pos)
            by_reference_objects.push_back(c_args[pos])
            args.append(None)
            continue

        # Passed by value.
        payload = Buffer.make(c_args[pos].get().GetData())
        is_raw_buffer = False
        if c_args[pos].get().HasMetadata():
            is_raw_buffer = (
                Buffer.make(c_args[pos].get().GetMetadata()).to_pybytes()
                == RAW_BUFFER_METADATA)
        if is_raw_buffer:
            args.append(payload)
        else:
            args.append(pickle.loads(payload.to_pybytes()))

    pairs = RayObjectsToDataMetadataPairs(by_reference_objects)
    resolved = ray.worker.global_worker.deserialize_objects(pairs, ref_ids)
    for slot, value in enumerate(resolved):
        args[ref_positions[slot]] = value

    # An argument that deserialized to an error is re-raised here.
    for candidate in args:
        if isinstance(candidate, RayError):
            raise candidate

    return ray.signature.recover_args(args)
|
||||
|
||||
cdef _check_worker_state(worker, CTaskType task_type, JobID job_id):
    """Assert the worker is idle and record the job of the incoming task.

    For normal and actor-creation tasks `worker.current_job_id` must be nil
    and is set to `job_id`. For any other task type it must already equal
    `job_id`.
    """
    assert worker.current_task_id.is_nil()
    assert worker.task_context.task_index == 0
    assert worker.task_context.put_index == 1

    resets_job_id = (
        <int>task_type == <int>TASK_TYPE_NORMAL_TASK
        or <int>task_type == <int>TASK_TYPE_ACTOR_CREATION_TASK)

    if not resets_job_id:
        # If this worker is an actor, current_job_id wasn't reset.
        # Check that current task's driver ID equals the previous
        # one.
        assert worker.current_job_id == job_id
        return

    # If this worker is not an actor, check that `current_job_id`
    # was reset when the worker finished the previous task.
    assert worker.current_job_id.is_nil()
    # Set the driver ID of the current running task. This is
    # needed so that if the task throws an exception, we propagate
    # the error message to the correct driver.
    worker.current_job_id = job_id
|
||||
|
||||
|
||||
cdef _store_task_outputs(worker, return_ids, outputs):
    """Store each task output under its corresponding return ID.

    Args:
        worker: The worker whose `put_object` / `core_worker` are used.
        return_ids: Sequence of object IDs, one per return value.
        outputs: Indexable sequence of values, aligned with `return_ids`.

    Raises:
        Exception: If an output is an actor handle.
        RuntimeError: If an output is the `NoReturn` sentinel but no object
            exists yet for its return ID.
    """
    # Index (rather than zip) so that an `outputs` shorter than
    # `return_ids` fails loudly instead of leaving return IDs unset.
    for i in range(len(return_ids)):
        return_id, output = return_ids[i], outputs[i]
        if isinstance(output, ray.actor.ActorHandle):
            raise Exception("Returning an actor handle from a remote "
                            "function is not allowed.")
        if output is ray.experimental.no_return.NoReturn:
            # The sentinel is never stored; only check that an object
            # already exists for this return ID.
            if not worker.core_worker.object_exists(return_id):
                raise RuntimeError(
                    "Attempting to return 'ray.experimental.NoReturn' "
                    "from a remote function, but the corresponding "
                    "ObjectID does not exist in the local object store.")
        else:
            worker.put_object(return_id, output)
|
||||
|
||||
|
||||
# Execute a single task on this worker.
#
# Steps, in order:
#   1. Check the worker state via _check_worker_state and record the
#      current task ID on the worker's task context.
#   2. Rebuild the function descriptor; for an actor creation task, load
#      the actor class and create the (uninitialized) actor instance.
#   3. Pick the callable to run: the plain function for a normal task,
#      otherwise a wrapper that passes the actor as first argument.
#   4. Deserialize the arguments, run the callable and store its outputs.
#      On any Exception, store a RayTaskError under every return ID, push
#      the error to the driver and send an ErrorSignal.
#   5. Reset the per-task worker state, bump the per-function task counter
#      and exit the process once that counter equals
#      `execution_info.max_calls`.
#
# NOTE(review): the `returns` parameter is not used in this function.
cdef execute_task(
|
||||
CTaskType task_type,
|
||||
const CRayFunction &ray_function,
|
||||
const CJobID &c_job_id,
|
||||
const CActorID &c_actor_id,
|
||||
const unordered_map[c_string, double] &c_resources,
|
||||
const c_vector[shared_ptr[CRayObject]] &c_args,
|
||||
const c_vector[CObjectID] &c_arg_reference_ids,
|
||||
const c_vector[CObjectID] &c_return_ids,
|
||||
c_vector[shared_ptr[CRayObject]] *returns):
|
||||
|
||||
worker = ray.worker.global_worker
|
||||
|
||||
actor_id = ActorID(c_actor_id.Binary())
|
||||
job_id = JobID(c_job_id.Binary())
|
||||
task_id = worker.core_worker.get_current_task_id()
|
||||
|
||||
# Check that the worker is in the expected state to execute the task.
|
||||
_check_worker_state(worker, task_type, job_id)
|
||||
worker.task_context.current_task_id = task_id
|
||||
|
||||
# Automatically restrict the GPUs available to this task.
|
||||
ray.utils.set_cuda_visible_devices(ray.get_gpu_ids())
|
||||
|
||||
function_descriptor = FunctionDescriptor.from_bytes_list(
|
||||
ray_function.GetFunctionDescriptor())
|
||||
|
||||
# The actor is created with __new__ only; no __init__ call is made here.
if <int>task_type == <int>TASK_TYPE_ACTOR_CREATION_TASK:
|
||||
worker.actor_id = actor_id
|
||||
actor_class = worker.function_actor_manager.load_actor_class(
|
||||
job_id, function_descriptor)
|
||||
worker.actors[actor_id] = actor_class.__new__(actor_class)
|
||||
worker.actor_checkpoint_info[actor_id] = (
|
||||
ray.worker.ActorCheckpointInfo(
|
||||
num_tasks_since_last_checkpoint=0,
|
||||
last_checkpoint_timestamp=int(1000 * time.time()),
|
||||
checkpoint_ids=[]))
|
||||
|
||||
execution_info = worker.function_actor_manager.get_execution_info(
|
||||
job_id, function_descriptor)
|
||||
function_name = execution_info.function_name
|
||||
extra_data = {"name": function_name, "task_id": task_id.hex()}
|
||||
|
||||
if <int>task_type == <int>TASK_TYPE_NORMAL_TASK:
|
||||
title = "ray_worker:{}()".format(function_name)
|
||||
next_title = "ray_worker"
|
||||
function_executor = execution_info.function
|
||||
else:
|
||||
actor = worker.actors[actor_id]
|
||||
class_name = actor.__class__.__name__
|
||||
title = "ray_{}:{}()".format(class_name, function_name)
|
||||
next_title = "ray_{}".format(class_name)
|
||||
worker_name = "ray_{}_{}".format(class_name, os.getpid())
|
||||
# Apply the "memory" / "object_store_memory" resources, when present.
if c_resources.find(b"memory") != c_resources.end():
|
||||
worker.memory_monitor.set_heap_limit(
|
||||
worker_name,
|
||||
ray_constants.from_memory_units(
|
||||
dereference(c_resources.find(b"memory")).second))
|
||||
if c_resources.find(b"object_store_memory") != c_resources.end():
|
||||
worker._set_object_store_client_options(
|
||||
worker_name,
|
||||
int(ray_constants.from_memory_units(
|
||||
dereference(
|
||||
c_resources.find(b"object_store_memory")).second)))
|
||||
|
||||
# Wrapper that supplies the actor instance as the first argument.
def function_executor(*arguments, **kwarguments):
|
||||
return execution_info.function(actor, *arguments, **kwarguments)
|
||||
|
||||
return_ids = VectorToObjectIDs(c_return_ids)
|
||||
with profiling.profile("task", extra_data=extra_data):
|
||||
try:
|
||||
# task_exception is True only while the task function itself runs;
# it is passed to format_error_message in the handler below.
task_exception = False
|
||||
if not (<int>task_type == <int>TASK_TYPE_ACTOR_TASK
|
||||
and function_name == "__ray_terminate__"):
|
||||
worker.reraise_actor_init_error()
|
||||
worker.memory_monitor.raise_if_low_memory()
|
||||
|
||||
with profiling.profile("task:deserialize_arguments"):
|
||||
args, kwargs = deserialize_args(c_args, c_arg_reference_ids)
|
||||
|
||||
# Execute the task.
|
||||
with ray.worker._changeproctitle(title, next_title):
|
||||
with profiling.profile("task:execute"):
|
||||
task_exception = True
|
||||
outputs = function_executor(*args, **kwargs)
|
||||
task_exception = False
|
||||
# A single return value is wrapped so it can be indexed like a sequence.
if len(return_ids) == 1:
|
||||
outputs = (outputs,)
|
||||
|
||||
# Store the outputs in the object store.
|
||||
with profiling.profile("task:store_outputs"):
|
||||
_store_task_outputs(worker, return_ids, outputs)
|
||||
except Exception as error:
|
||||
if (<int>task_type == <int>TASK_TYPE_ACTOR_CREATION_TASK):
|
||||
worker.mark_actor_init_failed(error)
|
||||
|
||||
backtrace = ray.utils.format_error_message(
|
||||
traceback.format_exc(), task_exception=task_exception)
|
||||
if isinstance(error, RayTaskError):
|
||||
# Avoid recursive nesting of RayTaskError.
|
||||
failure_object = RayTaskError(function_name, backtrace,
|
||||
error.cause_cls)
|
||||
else:
|
||||
failure_object = RayTaskError(function_name, backtrace,
|
||||
error.__class__)
|
||||
# The same failure object is stored under every return ID.
_store_task_outputs(
|
||||
worker, return_ids, [failure_object] * len(return_ids))
|
||||
ray.utils.push_error_to_driver(
|
||||
worker,
|
||||
ray_constants.TASK_PUSH_ERROR,
|
||||
str(failure_object),
|
||||
job_id=worker.current_job_id)
|
||||
|
||||
# Send signal with the error.
|
||||
ray_signal.send(ray_signal.ErrorSignal(str(failure_object)))
|
||||
|
||||
# Reset the state fields so the next task can run.
|
||||
worker.task_context.current_task_id = TaskID.nil()
|
||||
worker.core_worker.set_current_task_id(TaskID.nil())
|
||||
worker.task_context.task_index = 0
|
||||
worker.task_context.put_index = 1
|
||||
|
||||
# Don't need to reset `current_job_id` if the worker is an
|
||||
# actor. Because the following tasks should all have the
|
||||
# same driver id.
|
||||
if <int>task_type == <int>TASK_TYPE_NORMAL_TASK:
|
||||
worker.current_job_id = JobID.nil()
|
||||
worker.core_worker.set_current_job_id(JobID.nil())
|
||||
|
||||
# Reset signal counters so that the next task can get
|
||||
# all past signals.
|
||||
ray_signal.reset()
|
||||
|
||||
# Reset the state of the worker for the next task to execute.
|
||||
# Increase the task execution counter.
|
||||
worker.function_actor_manager.increase_task_counter(
|
||||
job_id, function_descriptor)
|
||||
|
||||
# If we've reached the max number of executions for this worker, exit.
|
||||
reached_max_executions = (
|
||||
worker.function_actor_manager.get_task_counter(
|
||||
job_id, function_descriptor) == execution_info.max_calls)
|
||||
if reached_max_executions:
|
||||
worker.core_worker.disconnect()
|
||||
sys.exit(0)
|
||||
|
||||
# Callback handed to the C++ core worker (see the CCoreWorker construction
# in CoreWorker) to execute one task in Python.
#
# It is declared `nogil`, so it acquires the GIL before delegating to
# `execute_task`. Any Exception escaping `execute_task` is treated as an
# internal error: it is pushed to the driver as a "worker_crash" error and
# the process exits with status 1. Otherwise an OK status is returned.
cdef CRayStatus task_execution_handler(
        CTaskType task_type,
        const CRayFunction &ray_function,
        const CJobID &c_job_id,
        const CActorID &c_actor_id,
        const unordered_map[c_string, double] &c_resources,
        const c_vector[shared_ptr[CRayObject]] &c_args,
        const c_vector[CObjectID] &c_arg_reference_ids,
        const c_vector[CObjectID] &c_return_ids,
        c_vector[shared_ptr[CRayObject]] *returns) nogil:

    with gil:
        try:
            # The call to execute_task should never raise an exception. If it
            # does, that indicates that there was an unexpected internal error.
            execute_task(task_type, ray_function, c_job_id,
                         c_actor_id, c_resources, c_args,
                         c_arg_reference_ids, c_return_ids, returns)
        except Exception:
            # Note the trailing space after "was": adjacent string literals
            # are concatenated verbatim.
            traceback_str = traceback.format_exc() + (
                "An unexpected internal error occurred while the worker was "
                "executing a task.")
            ray.utils.push_error_to_driver(
                ray.worker.global_worker,
                "worker_crash",
                traceback_str,
                job_id=None)
            # TODO(rkn): Note that if the worker was in the middle of executing
            # a task, then any worker or driver that is blocking in a get call
            # and waiting for the output of that task will hang. We need to
            # address this.
            sys.exit(1)

    return CRayStatus.OK()
|
||||
|
||||
# Callback handed to the C++ core worker so it can poll for Python signals.
#
# Acquires the GIL and calls PyErr_CheckSignals(). A KeyboardInterrupt
# raised by that call is converted into an Interrupted status with an empty
# message; otherwise an OK status is returned.
cdef CRayStatus check_signals() nogil:
|
||||
with gil:
|
||||
try:
|
||||
PyErr_CheckSignals()
|
||||
except KeyboardInterrupt:
|
||||
return CRayStatus.Interrupted(b"")
|
||||
return CRayStatus.OK()
|
||||
|
||||
cdef class CoreWorker:
|
||||
cdef unique_ptr[CCoreWorker] core_worker
|
||||
@@ -419,12 +698,20 @@ cdef class CoreWorker:
|
||||
LANGUAGE_PYTHON, store_socket.encode("ascii"),
|
||||
raylet_socket.encode("ascii"), job_id.native(),
|
||||
gcs_options.native()[0], log_dir.encode("utf-8"),
|
||||
node_ip_address.encode("utf-8"), NULL, False))
|
||||
node_ip_address.encode("utf-8"), task_execution_handler,
|
||||
check_signals, False))
|
||||
|
||||
# Call Disconnect() on the underlying C++ core worker, releasing the GIL
# for the duration of the call.
def disconnect(self):
|
||||
with nogil:
|
||||
self.core_worker.get().Disconnect()
|
||||
|
||||
# Run the C++ core worker's task execution interface (Execution().Run())
# with the GIL released.
# NOTE(review): presumably blocks until the execution loop stops — confirm
# against the C++ CoreWorkerTaskExecutionInterface.
def run_task_loop(self):
|
||||
with nogil:
|
||||
self.core_worker.get().Execution().Run()
|
||||
|
||||
# Return the core worker's current task ID as a Python TaskID built from
# its binary representation.
def get_current_task_id(self):
|
||||
return TaskID(self.core_worker.get().GetCurrentTaskId().Binary())
|
||||
|
||||
def set_current_task_id(self, TaskID task_id):
|
||||
cdef:
|
||||
CTaskID c_task_id = task_id.native()
|
||||
@@ -432,15 +719,8 @@ cdef class CoreWorker:
|
||||
with nogil:
|
||||
self.core_worker.get().SetCurrentTaskId(c_task_id)
|
||||
|
||||
def set_actor_id(self, ActorID actor_id):
|
||||
cdef:
|
||||
CActorID c_actor_id = actor_id.native()
|
||||
|
||||
with nogil:
|
||||
self.core_worker.get().SetActorId(c_actor_id)
|
||||
|
||||
def get_current_task_id(self):
|
||||
return TaskID(self.core_worker.get().GetCurrentTaskId().Binary())
|
||||
def get_current_job_id(self):
|
||||
return JobID(self.core_worker.get().GetCurrentJobId().Binary())
|
||||
|
||||
def set_current_job_id(self, JobID job_id):
|
||||
cdef:
|
||||
@@ -449,7 +729,8 @@ cdef class CoreWorker:
|
||||
with nogil:
|
||||
self.core_worker.get().SetCurrentJobId(c_job_id)
|
||||
|
||||
def get_objects(self, object_ids, TaskID current_task_id):
|
||||
def get_objects(self, object_ids, TaskID current_task_id,
|
||||
int64_t timeout_ms=-1):
|
||||
cdef:
|
||||
c_vector[shared_ptr[CRayObject]] results
|
||||
CTaskID c_task_id = current_task_id.native()
|
||||
@@ -457,25 +738,9 @@ cdef class CoreWorker:
|
||||
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Objects().Get(
|
||||
c_object_ids, -1, &results))
|
||||
c_object_ids, timeout_ms, &results))
|
||||
|
||||
data_metadata_pairs = []
|
||||
for result in results:
|
||||
# core_worker will return a nullptr for objects that couldn't be
|
||||
# retrieved from the store or if an object was an exception.
|
||||
if not result.get():
|
||||
data_metadata_pairs.append((None, None))
|
||||
else:
|
||||
data = None
|
||||
metadata = None
|
||||
if result.get().HasData():
|
||||
data = Buffer.make(result.get().GetData())
|
||||
if result.get().HasMetadata():
|
||||
metadata = Buffer.make(
|
||||
result.get().GetMetadata()).to_pybytes()
|
||||
data_metadata_pairs.append((data, metadata))
|
||||
|
||||
return data_metadata_pairs
|
||||
return RayObjectsToDataMetadataPairs(results)
|
||||
|
||||
def object_exists(self, ObjectID object_id):
|
||||
cdef:
|
||||
@@ -570,7 +835,7 @@ cdef class CoreWorker:
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Objects().Seal(c_object_id))
|
||||
|
||||
def wait(self, object_ids, int num_returns, int64_t timeout_milliseconds,
|
||||
def wait(self, object_ids, int num_returns, int64_t timeout_ms,
|
||||
TaskID current_task_id):
|
||||
cdef:
|
||||
WaitResultPair result
|
||||
@@ -581,7 +846,7 @@ cdef class CoreWorker:
|
||||
wait_ids = ObjectIDsToVector(object_ids)
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Objects().Wait(
|
||||
wait_ids, num_returns, timeout_milliseconds, &results))
|
||||
wait_ids, num_returns, timeout_ms, &results))
|
||||
|
||||
assert len(results) == len(object_ids)
|
||||
|
||||
@@ -704,6 +969,28 @@ cdef class CoreWorker:
|
||||
|
||||
return VectorToObjectIDs(return_ids)
|
||||
|
||||
def resource_ids(self):
    """Return the core worker's resource IDs as a Python dict.

    Keys are the decoded resource names. Each value is a list of
    `(id, fraction)` tuples copied from the C++ resource mapping.
    """
    cdef:
        ResourceMappingType resource_mapping = (
            self.core_worker.get().GetResourceIDs())
        unordered_map[
            c_string, c_vector[pair[int64_t, double]]
        ].iterator it = resource_mapping.begin()
        c_vector[pair[int64_t, double]] c_slots

    result = {}
    while it != resource_mapping.end():
        name = decode(dereference(it).first)
        c_slots = dereference(it).second
        result[name] = [
            (c_slots[j].first, c_slots[j].second)
            for j in range(c_slots.size())
        ]
        postincrement(it)

    return result
|
||||
|
||||
def profile_event(self, event_type, dict extra_data):
|
||||
cdef:
|
||||
c_string c_event_type = event_type.encode("ascii")
|
||||
|
||||
@@ -199,8 +199,8 @@ class PlasmaEventHandler:
|
||||
del self._waiting_dict[fut.object_id]
|
||||
|
||||
def _complete_future(self, fut):
|
||||
obj = self._worker.retrieve_and_deserialize(
|
||||
[ray.ObjectID(fut.object_id.binary())], 0)[0]
|
||||
obj = self._worker.get_objects([ray.ObjectID(
|
||||
fut.object_id.binary())])[0]
|
||||
fut.set_result(obj)
|
||||
|
||||
def as_future(self, object_id, check_ready=True):
|
||||
|
||||
@@ -69,11 +69,10 @@ def send(signal):
|
||||
Args:
|
||||
signal: Signal to be sent.
|
||||
"""
|
||||
if hasattr(ray.worker.global_worker, "actor_creation_task_id"):
|
||||
source_key = ray.worker.global_worker.actor_id.hex()
|
||||
else:
|
||||
# No actors; this function must have been called from a task
|
||||
if ray.worker.global_worker.actor_id.is_nil():
|
||||
source_key = ray.worker.global_worker.current_task_id.hex()
|
||||
else:
|
||||
source_key = ray.worker.global_worker.actor_id.hex()
|
||||
|
||||
encoded_signal = ray.utils.binary_to_hex(cloudpickle.dumps(signal))
|
||||
ray.worker.global_worker.redis_client.execute_command(
|
||||
|
||||
@@ -763,7 +763,7 @@ class FunctionActorManager(object):
|
||||
worker's internal state to record the executed method.
|
||||
"""
|
||||
|
||||
def actor_method_executor(dummy_return_id, actor, *args, **kwargs):
|
||||
def actor_method_executor(actor, *args, **kwargs):
|
||||
# Update the actor's task counter to reflect the task we're about
|
||||
# to execute.
|
||||
self._worker.actor_task_counter += 1
|
||||
|
||||
@@ -47,31 +47,34 @@ cdef extern from "ray/common/status.h" namespace "ray" nogil:
|
||||
CRayStatus OK()
|
||||
|
||||
@staticmethod
|
||||
CRayStatus OutOfMemory()
|
||||
CRayStatus OutOfMemory(const c_string &msg)
|
||||
|
||||
@staticmethod
|
||||
CRayStatus KeyError()
|
||||
CRayStatus KeyError(const c_string &msg)
|
||||
|
||||
@staticmethod
|
||||
CRayStatus Invalid()
|
||||
CRayStatus Invalid(const c_string &msg)
|
||||
|
||||
@staticmethod
|
||||
CRayStatus IOError()
|
||||
CRayStatus IOError(const c_string &msg)
|
||||
|
||||
@staticmethod
|
||||
CRayStatus TypeError()
|
||||
CRayStatus TypeError(const c_string &msg)
|
||||
|
||||
@staticmethod
|
||||
CRayStatus UnknownError()
|
||||
CRayStatus UnknownError(const c_string &msg)
|
||||
|
||||
@staticmethod
|
||||
CRayStatus NotImplemented()
|
||||
CRayStatus NotImplemented(const c_string &msg)
|
||||
|
||||
@staticmethod
|
||||
CRayStatus RedisError()
|
||||
CRayStatus ObjectStoreFull(const c_string &msg)
|
||||
|
||||
@staticmethod
|
||||
CRayStatus ObjectStoreFull()
|
||||
CRayStatus RedisError(const c_string &msg)
|
||||
|
||||
@staticmethod
|
||||
CRayStatus Interrupted(const c_string &msg)
|
||||
|
||||
c_bool ok()
|
||||
c_bool IsOutOfMemory()
|
||||
@@ -81,8 +84,9 @@ cdef extern from "ray/common/status.h" namespace "ray" nogil:
|
||||
c_bool IsTypeError()
|
||||
c_bool IsUnknownError()
|
||||
c_bool IsNotImplemented()
|
||||
c_bool IsRedisError()
|
||||
c_bool IsObjectStoreFull()
|
||||
c_bool IsRedisError()
|
||||
c_bool IsInterrupted()
|
||||
|
||||
c_string ToString()
|
||||
c_string CodeAsString()
|
||||
@@ -92,6 +96,7 @@ cdef extern from "ray/common/status.h" namespace "ray" nogil:
|
||||
# We can later add more of the common status factory methods as needed
|
||||
cdef CRayStatus RayStatus_OK "Status::OK"()
|
||||
cdef CRayStatus RayStatus_Invalid "Status::Invalid"()
|
||||
cdef CRayStatus RayStatus_NotImplemented "Status::NotImplemented"()
|
||||
|
||||
|
||||
cdef extern from "ray/common/status.h" namespace "ray::StatusCode" nogil:
|
||||
@@ -117,6 +122,8 @@ cdef extern from "ray/protobuf/common.pb.h" nogil:
|
||||
pass
|
||||
cdef cppclass CWorkerType "ray::WorkerType":
|
||||
pass
|
||||
cdef cppclass CTaskType "ray::TaskType":
|
||||
pass
|
||||
|
||||
|
||||
# This is a workaround for C++ enum class since Cython has no corresponding
|
||||
@@ -130,6 +137,11 @@ cdef extern from "ray/protobuf/common.pb.h" nogil:
|
||||
cdef CWorkerType WORKER_TYPE_WORKER "ray::WorkerType::WORKER"
|
||||
cdef CWorkerType WORKER_TYPE_DRIVER "ray::WorkerType::DRIVER"
|
||||
|
||||
cdef extern from "ray/protobuf/common.pb.h" nogil:
|
||||
cdef CTaskType TASK_TYPE_NORMAL_TASK "ray::TaskType::NORMAL_TASK"
|
||||
cdef CTaskType TASK_TYPE_ACTOR_CREATION_TASK "ray::TaskType::ACTOR_CREATION_TASK" # noqa: E501
|
||||
cdef CTaskType TASK_TYPE_ACTOR_TASK "ray::TaskType::ACTOR_TASK"
|
||||
|
||||
|
||||
cdef extern from "ray/common/task/scheduling_resources.h" nogil:
|
||||
cdef cppclass ResourceSet "ray::ResourceSet":
|
||||
|
||||
@@ -1,7 +1,13 @@
|
||||
# cython: profile = False
|
||||
# distutils: language = c++
|
||||
# cython: embedsignature = True
|
||||
|
||||
from libc.stdint cimport int64_t
|
||||
from libcpp cimport bool as c_bool
|
||||
from libcpp.memory cimport shared_ptr, unique_ptr
|
||||
from libcpp.string cimport string as c_string
|
||||
from libcpp.unordered_map cimport unordered_map
|
||||
from libcpp.utility cimport pair
|
||||
from libcpp.vector cimport vector as c_vector
|
||||
|
||||
from ray.includes.unique_ids cimport (
|
||||
@@ -18,12 +24,30 @@ from ray.includes.common cimport (
|
||||
CRayStatus,
|
||||
CTaskArg,
|
||||
CTaskOptions,
|
||||
CTaskType,
|
||||
CWorkerType,
|
||||
CLanguage,
|
||||
CGcsClientOptions,
|
||||
)
|
||||
from ray.includes.task cimport CTaskSpec
|
||||
from ray.includes.libraylet cimport CRayletClient
|
||||
|
||||
ctypedef unordered_map[c_string, c_vector[pair[int64_t, double]]] \
|
||||
ResourceMappingType
|
||||
|
||||
cdef extern from "ray/core_worker/task_execution.h" namespace "ray" nogil:
|
||||
cdef cppclass CTaskExecutionInterface "CoreWorkerTaskExecutionInterface":
|
||||
void Run()
|
||||
void Stop()
|
||||
|
||||
cdef extern from "ray/core_worker/profiling.h" nogil:
|
||||
cdef cppclass CProfiler "ray::worker::Profiler":
|
||||
void Start()
|
||||
|
||||
cdef cppclass CProfileEvent "ray::worker::ProfileEvent":
|
||||
CProfileEvent(const shared_ptr[CProfiler] profiler,
|
||||
const c_string &event_type)
|
||||
void SetExtraData(const c_string &extra_data)
|
||||
|
||||
cdef extern from "ray/core_worker/profiling.h" nogil:
|
||||
cdef cppclass CProfileEvent "ray::worker::ProfileEvent":
|
||||
@@ -54,12 +78,23 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
const c_string &raylet_socket, const CJobID &job_id,
|
||||
const CGcsClientOptions &gcs_options,
|
||||
const c_string &log_dir, const c_string &node_ip_address,
|
||||
void* execution_callback,
|
||||
CRayStatus (
|
||||
CTaskType task_type,
|
||||
const CRayFunction &ray_function,
|
||||
const CJobID &job_id,
|
||||
const CActorID &actor_id,
|
||||
const unordered_map[c_string, double] &resources,
|
||||
const c_vector[shared_ptr[CRayObject]] &args,
|
||||
const c_vector[CObjectID] &arg_reference_ids,
|
||||
const c_vector[CObjectID] &return_ids,
|
||||
c_vector[shared_ptr[CRayObject]] *returns) nogil,
|
||||
CRayStatus() nogil,
|
||||
c_bool use_memory_store_)
|
||||
void Disconnect()
|
||||
CWorkerType &GetWorkerType()
|
||||
CLanguage &GetLanguage()
|
||||
CObjectInterface &Objects()
|
||||
CTaskExecutionInterface &Execution()
|
||||
|
||||
CRayStatus SubmitTask(
|
||||
const CRayFunction &function, const c_vector[CTaskArg] &args,
|
||||
@@ -72,7 +107,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
const c_vector[CTaskArg] &args, const CTaskOptions &options,
|
||||
c_vector[CObjectID] *return_ids)
|
||||
|
||||
# CTaskExecutionInterface &Execution()
|
||||
unique_ptr[CProfileEvent] CreateProfileEvent(
|
||||
const c_string &event_type)
|
||||
|
||||
@@ -81,12 +115,13 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
CRayletClient &GetRayletClient()
|
||||
# TODO(edoakes): remove these once the Python core worker uses the task
|
||||
# interfaces
|
||||
CJobID GetCurrentJobId()
|
||||
void SetCurrentJobId(const CJobID &job_id)
|
||||
CTaskID GetCurrentTaskId()
|
||||
void SetCurrentTaskId(const CTaskID &task_id)
|
||||
void SetActorId(const CActorID &actor_id)
|
||||
const CActorID &GetActorId()
|
||||
CTaskID GetCallerId()
|
||||
const ResourceMappingType &GetResourceIDs() const
|
||||
CActorID DeserializeAndRegisterActorHandle(const c_string &bytes)
|
||||
CRayStatus SerializeActorHandle(const CActorID &actor_id, c_string
|
||||
*bytes)
|
||||
|
||||
@@ -3,7 +3,6 @@ from libcpp cimport bool as c_bool
|
||||
from libcpp.memory cimport unique_ptr
|
||||
from libcpp.string cimport string as c_string
|
||||
from libcpp.utility cimport pair
|
||||
from libcpp.unordered_map cimport unordered_map
|
||||
from libcpp.vector cimport vector as c_vector
|
||||
|
||||
from ray.includes.common cimport (
|
||||
@@ -38,8 +37,6 @@ cdef extern from "ray/protobuf/gcs.pb.h" nogil:
|
||||
GCSProfileTableData()
|
||||
|
||||
|
||||
ctypedef unordered_map[c_string, c_vector[pair[int64_t, double]]] \
|
||||
ResourceMappingType
|
||||
ctypedef pair[c_vector[CObjectID], c_vector[CObjectID]] WaitResultPair
|
||||
|
||||
|
||||
@@ -78,4 +75,3 @@ cdef extern from "ray/raylet/raylet_client.h" nogil:
|
||||
CWorkerID GetWorkerID() const
|
||||
CJobID GetJobID() const
|
||||
c_bool IsWorker() const
|
||||
const ResourceMappingType &GetResourceIDs() const
|
||||
|
||||
@@ -14,12 +14,6 @@ cdef class TaskSpec:
|
||||
cdef:
|
||||
unique_ptr[CTaskSpec] task_spec
|
||||
|
||||
@staticmethod
|
||||
cdef make(unique_ptr[CTaskSpec]& task_spec):
|
||||
cdef TaskSpec self = TaskSpec.__new__(TaskSpec)
|
||||
self.task_spec.reset(task_spec.release())
|
||||
return self
|
||||
|
||||
@staticmethod
|
||||
def from_string(const c_string& task_spec_str):
|
||||
"""Convert a string to a Ray task specification Python object.
|
||||
@@ -82,23 +76,23 @@ cdef class TaskSpec:
|
||||
def arguments(self):
|
||||
"""Return the arguments for the task."""
|
||||
cdef:
|
||||
CTaskSpec*task_spec = self.task_spec.get()
|
||||
int64_t num_args = task_spec.NumArgs()
|
||||
int32_t lang = <int32_t>task_spec.GetLanguage()
|
||||
int64_t num_args = self.task_spec.get().NumArgs()
|
||||
int32_t lang = <int32_t>self.task_spec.get().GetLanguage()
|
||||
int count
|
||||
arg_list = []
|
||||
|
||||
if lang == <int32_t>LANGUAGE_PYTHON:
|
||||
for i in range(num_args):
|
||||
count = task_spec.ArgIdCount(i)
|
||||
count = self.task_spec.get().ArgIdCount(i)
|
||||
if count > 0:
|
||||
assert count == 1
|
||||
arg_list.append(
|
||||
ObjectID(task_spec.ArgId(i, 0).Binary()))
|
||||
ObjectID(self.task_spec.get().ArgId(i, 0).Binary()))
|
||||
else:
|
||||
data = task_spec.ArgData(i)[:task_spec.ArgDataSize(i)]
|
||||
metadata = task_spec.ArgMetadata(i)[
|
||||
:task_spec.ArgMetadataSize(i)]
|
||||
data = self.task_spec.get().ArgData(i)[
|
||||
:self.task_spec.get().ArgDataSize(i)]
|
||||
metadata = self.task_spec.get().ArgMetadata(i)[
|
||||
:self.task_spec.get().ArgMetadataSize(i)]
|
||||
if metadata == RAW_BUFFER_METADATA:
|
||||
obj = data
|
||||
else:
|
||||
@@ -111,10 +105,10 @@ cdef class TaskSpec:
|
||||
|
||||
def returns(self):
|
||||
"""Return the object IDs for the return values of the task."""
|
||||
cdef CTaskSpec *task_spec = self.task_spec.get()
|
||||
return_id_list = []
|
||||
for i in range(task_spec.NumReturns()):
|
||||
return_id_list.append(ObjectID(task_spec.ReturnId(i).Binary()))
|
||||
for i in range(self.task_spec.get().NumReturns()):
|
||||
return_id_list.append(
|
||||
ObjectID(self.task_spec.get().ReturnId(i).Binary()))
|
||||
return return_id_list
|
||||
|
||||
def required_resources(self):
|
||||
|
||||
+5
-1
@@ -505,6 +505,10 @@ class GlobalState(object):
|
||||
node_ip_address = profile_table_message.node_ip_address
|
||||
|
||||
for profile_event_message in profile_table_message.profile_events:
|
||||
try:
|
||||
extra_data = json.loads(profile_event_message.extra_data)
|
||||
except ValueError:
|
||||
extra_data = {}
|
||||
profile_event = {
|
||||
"event_type": profile_event_message.event_type,
|
||||
"component_id": component_id,
|
||||
@@ -512,7 +516,7 @@ class GlobalState(object):
|
||||
"component_type": component_type,
|
||||
"start_time": profile_event_message.start_time,
|
||||
"end_time": profile_event_message.end_time,
|
||||
"extra_data": json.loads(profile_event_message.extra_data),
|
||||
"extra_data": extra_data
|
||||
}
|
||||
|
||||
profile_events.append(profile_event)
|
||||
|
||||
@@ -106,7 +106,7 @@ class Cluster(object):
|
||||
|
||||
return node
|
||||
|
||||
def remove_node(self, node, allow_graceful=False):
|
||||
def remove_node(self, node, allow_graceful=True):
|
||||
"""Kills all processes associated with worker node.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -47,4 +47,3 @@ def test_raylet_gdb(ray_gdb_start):
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
assert pgrep_command.communicate()[0]
|
||||
subprocess.call(["pkill", "-f", "gdb.*{}".format(process_name)])
|
||||
|
||||
@@ -292,7 +292,7 @@ def test_incorrect_method_calls(ray_start_regular):
|
||||
def test_worker_raising_exception(ray_start_regular):
|
||||
@ray.remote
|
||||
def f():
|
||||
ray.worker.global_worker._get_next_task_from_raylet = None
|
||||
ray.worker.global_worker.function_actor_manager = None
|
||||
|
||||
# Running this task should cause the worker to raise an exception after
|
||||
# the task has successfully completed.
|
||||
@@ -618,12 +618,17 @@ def test_warning_for_too_many_nested_tasks(shutdown_only):
|
||||
time.sleep(1000)
|
||||
return 1
|
||||
|
||||
@ray.remote
|
||||
def h():
|
||||
time.sleep(1)
|
||||
ray.get(f.remote())
|
||||
|
||||
@ray.remote
|
||||
def g():
|
||||
# Sleep so that the f tasks all get submitted to the scheduler after
|
||||
# the g tasks.
|
||||
time.sleep(1)
|
||||
ray.get(f.remote())
|
||||
ray.get(h.remote())
|
||||
|
||||
[g.remote() for _ in range(num_cpus * 4)]
|
||||
wait_for_errors(ray_constants.WORKER_POOL_LARGE_ERROR, 1)
|
||||
@@ -705,8 +710,6 @@ def test_warning_for_dead_node(ray_start_cluster_2_nodes):
|
||||
|
||||
|
||||
def test_raylet_crash_when_get(ray_start_regular):
|
||||
nonexistent_id = ray.ObjectID.from_random()
|
||||
|
||||
def sleep_to_kill_raylet():
|
||||
# Don't kill raylet before default workers get connected.
|
||||
time.sleep(2)
|
||||
@@ -715,14 +718,14 @@ def test_raylet_crash_when_get(ray_start_regular):
|
||||
thread = threading.Thread(target=sleep_to_kill_raylet)
|
||||
thread.start()
|
||||
with pytest.raises(ray.exceptions.UnreconstructableError):
|
||||
ray.get(nonexistent_id)
|
||||
ray.get(ray.ObjectID.from_random())
|
||||
thread.join()
|
||||
|
||||
|
||||
def test_connect_with_disconnected_node(shutdown_only):
|
||||
config = json.dumps({
|
||||
"num_heartbeats_timeout": 50,
|
||||
"heartbeat_timeout_milliseconds": 10,
|
||||
"raylet_heartbeat_timeout_milliseconds": 10,
|
||||
})
|
||||
cluster = Cluster()
|
||||
cluster.add_node(num_cpus=0, _internal_config=config)
|
||||
|
||||
@@ -52,7 +52,7 @@ def test_internal_config(ray_start_cluster_head):
|
||||
worker = cluster.add_node()
|
||||
cluster.wait_for_nodes()
|
||||
|
||||
cluster.remove_node(worker)
|
||||
cluster.remove_node(worker, allow_graceful=False)
|
||||
time.sleep(1)
|
||||
assert ray.cluster_resources()["CPU"] == 2
|
||||
|
||||
|
||||
@@ -1,86 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pytest
|
||||
|
||||
import ray
|
||||
import ray.exceptions
|
||||
import ray.experimental.no_return
|
||||
import ray.worker
|
||||
|
||||
|
||||
def test_set_single_output(ray_start_regular):
    """A task stores its single return value itself and returns NoReturn."""

    @ray.remote
    def f():
        worker = ray.worker.global_worker
        first_return_id = worker._current_task.returns()[0]
        # Write the value directly under the task's own return object ID.
        worker.put_object(first_return_id, 123)
        return ray.experimental.no_return.NoReturn

    fetched = ray.get(f.remote())
    assert fetched == 123
|
||||
|
||||
|
||||
def test_set_multiple_outputs(ray_start_regular):
    """Each of three outputs is either stored manually or returned."""

    @ray.remote(num_return_vals=3)
    def f(set_out0, set_out1, set_out2):
        worker = ray.worker.global_worker
        return_ids = worker._current_task.returns()
        flags = (set_out0, set_out1, set_out2)
        # Outputs that are not stored manually are returned as plain False.
        results = [False] * len(flags)
        for index, stored in enumerate(flags):
            if not stored:
                continue
            worker.put_object(return_ids[index], True)
            results[index] = ray.experimental.no_return.NoReturn
        return tuple(results)

    choices = (True, False)
    # Same visiting order as three nested loops over [True, False].
    combos = [(a, b, c) for a in choices for b in choices for c in choices]
    for combo in combos:
        object_ids = f.remote(*combo)
        assert ray.get(object_ids) == list(combo)
|
||||
|
||||
|
||||
def test_set_actor_method(ray_start_regular):
    """An actor method stores its own return value and returns NoReturn."""

    @ray.remote
    class Actor(object):
        def __init__(self):
            pass

        def ping(self):
            worker = ray.worker.global_worker
            first_return_id = worker._current_task.returns()[0]
            # Write the value directly under the method's return object ID.
            worker.put_object(first_return_id, 123)
            return ray.experimental.no_return.NoReturn

    handle = Actor.remote()
    fetched = ray.get(handle.ping.remote())
    assert fetched == 123
|
||||
|
||||
|
||||
def test_exception(ray_start_regular):
    """A value stored before a task fails stays readable.

    The task stores its first return value by hand and then raises. The
    test checks that the first object still resolves to the stored value
    and that fetching the second object raises RayTaskError.
    """

    @ray.remote(num_return_vals=2)
    def f():
        return_object_ids = ray.worker.global_worker._current_task.returns()
        # The first return value is successfully stored in the object store.
        ray.worker.global_worker.put_object(return_object_ids[0], 123)
        # The exception is stored at the second return object ID. Nothing
        # after the raise can run, so there is no return statement.
        raise Exception("Error")

    object_id, exception_id = f.remote()

    assert ray.get(object_id) == 123
    with pytest.raises(ray.exceptions.RayTaskError):
        ray.get(exception_id)
|
||||
|
||||
|
||||
def test_no_set_and_no_return(ray_start_regular):
    """Returning NoReturn without storing the value fails the task."""

    @ray.remote
    def f():
        return ray.experimental.no_return.NoReturn

    expected_fragment = "Attempting to return 'ray.experimental.NoReturn'"
    pending_id = f.remote()
    with pytest.raises(ray.exceptions.RayTaskError) as excinfo:
        ray.get(pending_id)
    assert expected_fragment in str(excinfo.value)
|
||||
+30
-287
@@ -26,7 +26,6 @@ import random
|
||||
import pyarrow
|
||||
import pyarrow.plasma as plasma
|
||||
import ray.cloudpickle as pickle
|
||||
import ray.experimental.signal as ray_signal
|
||||
import ray.experimental.no_return
|
||||
import ray.gcs_utils
|
||||
import ray.memory_monitor as memory_monitor
|
||||
@@ -41,7 +40,6 @@ import ray.state
|
||||
|
||||
from ray import (
|
||||
ActorID,
|
||||
WorkerID,
|
||||
JobID,
|
||||
ObjectID,
|
||||
TaskID,
|
||||
@@ -60,10 +58,7 @@ from ray.exceptions import (
|
||||
UnreconstructableError,
|
||||
RAY_EXCEPTION_TYPES,
|
||||
)
|
||||
from ray.function_manager import (
|
||||
FunctionActorManager,
|
||||
FunctionDescriptor,
|
||||
)
|
||||
from ray.function_manager import FunctionActorManager
|
||||
from ray.utils import (
|
||||
_random_string,
|
||||
check_oversized_pickle,
|
||||
@@ -156,7 +151,6 @@ class Worker(object):
|
||||
# Index of the current session. This number will
|
||||
# increment every time when `ray.shutdown` is called.
|
||||
self._session_index = 0
|
||||
self._current_task = None
|
||||
# Functions to run to process the values returned by ray.get. Each
|
||||
# postprocessor must take two arguments ("object_ids", and "values").
|
||||
self._post_get_hooks = []
|
||||
@@ -473,9 +467,10 @@ class Worker(object):
|
||||
logger.warning(warning_message)
|
||||
self.store_and_register(object_id, value)
|
||||
|
||||
def retrieve_and_deserialize(self, object_ids, error_timeout=10):
|
||||
data_metadata_pairs = self.core_worker.get_objects(
|
||||
object_ids, self.current_task_id)
|
||||
def deserialize_objects(self,
|
||||
data_metadata_pairs,
|
||||
object_ids,
|
||||
error_timeout=10):
|
||||
assert len(data_metadata_pairs) == len(object_ids)
|
||||
|
||||
start_time = time.time()
|
||||
@@ -571,9 +566,9 @@ class Worker(object):
|
||||
if self.mode == LOCAL_MODE:
|
||||
return self.local_mode_manager.get_objects(object_ids)
|
||||
|
||||
results = self.retrieve_and_deserialize(object_ids)
|
||||
assert len(results) == len(object_ids)
|
||||
return results
|
||||
data_metadata_pairs = self.core_worker.get_objects(
|
||||
object_ids, self.current_task_id)
|
||||
return self.deserialize_objects(data_metadata_pairs, object_ids)
|
||||
|
||||
def run_function_on_all_workers(self, function,
|
||||
run_on_other_drivers=False):
|
||||
@@ -679,149 +674,6 @@ class Worker(object):
|
||||
|
||||
return ray.signature.recover_args(arguments)
|
||||
|
||||
def _store_outputs_in_object_store(self, object_ids, outputs):
    """Store the outputs of a remote function in the local object store.

    Each output is written under the object ID at the same position. An
    output that is `ray.experimental.no_return.NoReturn` is not written;
    instead the corresponding object must already exist in the store.

    Note:
        The arguments object_ids and outputs should have the same length.
        Extra entries in `outputs` are ignored.

    Args:
        object_ids (List[ObjectID]): The object IDs that were assigned to
            the outputs of the remote function call.
        outputs (Tuple): The value returned by the remote function. If the
            remote function was supposed to only return one value, then its
            output was wrapped in a tuple with one element prior to being
            passed into this function.

    Raises:
        Exception: If an output is an actor handle.
        RuntimeError: If an output is NoReturn but the object ID does not
            exist in the local object store.
        IndexError: If `outputs` has fewer entries than `object_ids`.
    """
    for i, object_id in enumerate(object_ids):
        # Index into `outputs` rather than zipping, so that a too-short
        # `outputs` raises instead of silently leaving objects unwritten.
        output = outputs[i]
        if isinstance(output, ray.actor.ActorHandle):
            raise Exception("Returning an actor handle from a remote "
                            "function is not allowed.")
        if output is ray.experimental.no_return.NoReturn:
            if not self.core_worker.object_exists(object_id):
                raise RuntimeError(
                    "Attempting to return 'ray.experimental.NoReturn' "
                    "from a remote function, but the corresponding "
                    "ObjectID does not exist in the local object store.")
        else:
            self.put_object(object_id, output)
|
||||
|
||||
def _process_task(self, task, function_execution_info):
    """Execute a task assigned to this worker.

    This method deserializes a task from the scheduler, and attempts to
    execute the task. If the task succeeds, the outputs are stored in the
    local object store. If the task throws an exception, RayTaskError
    objects are stored in the object store to represent the failed task
    (these will be retrieved by calls to get or by subsequent tasks that
    use the outputs of this task).

    The work happens in three stages (fetch arguments, execute, store
    outputs). A failure in any stage is passed to
    `_handle_process_task_failure`; after a failure in the first two
    stages the method returns without running the later ones.

    Args:
        task: The task to execute.
        function_execution_info: Object whose `function` attribute is the
            callable to run and whose `function_name` attribute is its
            name.
    """
    # The previous task must have left the per-task state reset.
    assert self.current_task_id.is_nil()
    assert self.task_context.task_index == 0
    assert self.task_context.put_index == 1
    if not task.is_actor_task():
        # If this worker is not an actor, check that `current_job_id`
        # was reset when the worker finished the previous task.
        assert self.current_job_id.is_nil()
        # Set the driver ID of the current running task. This is
        # needed so that if the task throws an exception, we propagate
        # the error message to the correct driver.
        self.current_job_id = task.job_id()
        self.core_worker.set_current_job_id(task.job_id())
    else:
        # If this worker is an actor, current_job_id wasn't reset.
        # Check that current task's driver ID equals the previous one.
        assert self.current_job_id == task.job_id()

    # Record the task ID both on this worker and on the core worker.
    self.task_context.current_task_id = task.task_id()
    self.core_worker.set_current_task_id(task.task_id())

    function_descriptor = FunctionDescriptor.from_bytes_list(
        task.function_descriptor_list())
    serialized_args = task.arguments()
    return_object_ids = task.returns()
    if task.is_actor_task() or task.is_actor_creation_task():
        # For actor tasks the last return ID is removed from the list and
        # handed to the executor as a separate first argument.
        dummy_return_id = return_object_ids.pop()
    function_executor = function_execution_info.function
    function_name = function_execution_info.function_name

    # Get task arguments from the object store.
    try:
        if function_name != "__ray_terminate__":
            self.reraise_actor_init_error()
        self.memory_monitor.raise_if_low_memory()
        with profiling.profile("task:deserialize_arguments"):
            function_args, function_kwargs = (
                self._get_arguments_for_execution(function_name,
                                                  serialized_args))
    except Exception as e:
        self._handle_process_task_failure(
            function_descriptor, return_object_ids, e,
            ray.utils.format_error_message(traceback.format_exc()))
        return

    # Execute the task.
    try:
        # Cleared again in the `finally` clause below.
        self._current_task = task
        with profiling.profile("task:execute"):
            if task.is_normal_task():
                outputs = function_executor(*function_args,
                                            **function_kwargs)
            else:
                # Actor instances are looked up by actor ID for actor
                # tasks and by actor creation ID for creation tasks.
                if task.is_actor_task():
                    key = task.actor_id()
                else:
                    key = task.actor_creation_id()
                worker_name = "ray_{}_{}".format(
                    self.actors[key].__class__.__name__, os.getpid())
                # Apply the memory limits named in the task's required
                # resources, if any, before running the method.
                if "memory" in task.required_resources():
                    self.memory_monitor.set_heap_limit(
                        worker_name,
                        ray_constants.from_memory_units(
                            task.required_resources()["memory"]))
                if "object_store_memory" in task.required_resources():
                    self._set_object_store_client_options(
                        worker_name,
                        int(
                            ray_constants.from_memory_units(
                                task.required_resources()[
                                    "object_store_memory"])))
                outputs = function_executor(
                    dummy_return_id, self.actors[key], *function_args,
                    **function_kwargs)
    except Exception as e:
        # Determine whether the exception occurred during a task, not an
        # actor method.
        task_exception = not task.is_actor_task()
        traceback_str = ray.utils.format_error_message(
            traceback.format_exc(), task_exception=task_exception)
        self._handle_process_task_failure(
            function_descriptor, return_object_ids, e, traceback_str)
        return
    finally:
        self._current_task = None

    # Store the outputs in the local object store.
    try:
        with profiling.profile("task:store_outputs"):
            # For actor tasks the trailing dummy return ID was already
            # popped above, so the length counts only the remaining return
            # IDs. A single return value is wrapped in a 1-tuple so it can
            # be indexed like the multi-value case.
            num_returns = len(return_object_ids)
            if num_returns == 1:
                outputs = (outputs, )
            self._store_outputs_in_object_store(return_object_ids, outputs)
    except Exception as e:
        self._handle_process_task_failure(
            function_descriptor, return_object_ids, e,
            ray.utils.format_error_message(traceback.format_exc()))
|
||||
|
||||
def _set_object_store_client_options(self, name, object_store_memory):
|
||||
try:
|
||||
logger.debug("Setting plasma memory limit to {} for {}".format(
|
||||
@@ -838,133 +690,15 @@ class Worker(object):
|
||||
"object store memory status is:\n\n{}".format(
|
||||
object_store_memory, name, e))
|
||||
|
||||
def _handle_process_task_failure(self, function_descriptor,
                                 return_object_ids, error, backtrace):
    """Record a task failure under every return object ID and report it.

    Args:
        function_descriptor: Descriptor whose `function_name` names the
            function that failed.
        return_object_ids: Object IDs that receive the failure object.
        error: The exception that was raised.
        backtrace: Formatted traceback text for the failure.
    """
    name = function_descriptor.function_name
    # Reuse the original cause class so that a RayTaskError is not nested
    # inside another RayTaskError.
    cause_cls = (error.cause_cls
                 if isinstance(error, RayTaskError) else error.__class__)
    failure = RayTaskError(name, backtrace, cause_cls)

    # Every return object ID gets the same failure object.
    self._store_outputs_in_object_store(
        return_object_ids, [failure] * len(return_object_ids))

    # Log the error message.
    ray.utils.push_error_to_driver(
        self,
        ray_constants.TASK_PUSH_ERROR,
        str(failure),
        job_id=self.current_job_id)

    # Mark the actor init as failed.
    is_actor = not self.actor_id.is_nil()
    if is_actor and name == "__init__":
        self.mark_actor_init_failed(error)

    # Send signal with the error.
    error_signal = ray_signal.ErrorSignal(str(failure))
    ray_signal.send(error_signal)
|
||||
|
||||
def _wait_for_and_process_task(self, task):
    """Wait for a task to be ready and process the task.

    For an actor creation task, the actor instance and its checkpoint
    bookkeeping are created first. The task is then run through
    `_process_task` under a profiling span and a temporary process title,
    the per-task state is reset, and the function's execution counter is
    increased.

    If the counter then equals `execution_info.max_calls`, the core worker
    is disconnected and the process exits via `sys.exit(0)`.

    Args:
        task: The task to execute.
    """
    function_descriptor = FunctionDescriptor.from_bytes_list(
        task.function_descriptor_list())
    job_id = task.job_id()

    # TODO(rkn): It would be preferable for actor creation tasks to share
    # more of the code path with regular task execution.
    if task.is_actor_creation_task():
        # TODO: Remove Worker.actor_id and just use CoreWorker.GetActorId.
        self.actor_id = task.actor_creation_id()
        self.core_worker.set_actor_id(task.actor_creation_id())
        self.actor_creation_task_id = task.task_id()
        actor_class = self.function_actor_manager.load_actor_class(
            job_id, function_descriptor)
        # Allocate the instance without running __init__ here.
        self.actors[self.actor_id] = actor_class.__new__(actor_class)
        self.actor_checkpoint_info[self.actor_id] = ActorCheckpointInfo(
            num_tasks_since_last_checkpoint=0,
            last_checkpoint_timestamp=int(1000 * time.time()),
            checkpoint_ids=[],
        )

    execution_info = self.function_actor_manager.get_execution_info(
        job_id, function_descriptor)

    # Execute the task.
    function_name = execution_info.function_name
    extra_data = {"name": function_name, "task_id": task.task_id().hex()}

    # Actor tasks and actor creation tasks are titled after the actor's
    # class; every other task uses the generic worker title.
    if task.is_actor_task():
        actor_key = task.actor_id()
    elif task.is_actor_creation_task():
        actor_key = task.actor_creation_id()
    else:
        actor_key = None
    if actor_key is None:
        next_title = "ray_worker"
    else:
        next_title = "ray_{}".format(
            self.actors[actor_key].__class__.__name__)
    title = "{}:{}()".format(next_title, function_name)

    with profiling.profile("task", extra_data=extra_data):
        with _changeproctitle(title, next_title):
            self._process_task(task, execution_info)
        # Reset the state fields so the next task can run.
        self.task_context.current_task_id = TaskID.nil()
        self.core_worker.set_current_task_id(TaskID.nil())
        self.task_context.task_index = 0
        self.task_context.put_index = 1
        if self.actor_id.is_nil():
            # Don't need to reset `current_job_id` if the worker is an
            # actor. Because the following tasks should all have the
            # same driver id.
            # Use a nil JobID (not a WorkerID) so the Python-side value has
            # the same type as the one given to the core worker.
            self.current_job_id = JobID.nil()
            self.core_worker.set_current_job_id(JobID.nil())
        # Reset signal counters so that the next task can get
        # all past signals.
        ray_signal.reset()

    # Increase the task execution counter.
    self.function_actor_manager.increase_task_counter(
        job_id, function_descriptor)

    reached_max_executions = (self.function_actor_manager.get_task_counter(
        job_id, function_descriptor) == execution_info.max_calls)
    if reached_max_executions:
        self.core_worker.disconnect()
        sys.exit(0)
|
||||
|
||||
def _get_next_task_from_raylet(self):
    """Fetch the next task to run from the raylet.

    The call to the raylet client is recorded under the "worker_idle"
    profiling span.

    Returns:
        A task from the raylet.
    """
    with profiling.profile("worker_idle"):
        next_task = self.raylet_client.get_task()

    # Automatically restrict the GPUs available to this task.
    assigned_gpu_ids = ray.get_gpu_ids()
    ray.utils.set_cuda_visible_devices(assigned_gpu_ids)

    return next_task
|
||||
|
||||
def main_loop(self):
|
||||
"""The main loop a worker runs to receive and execute tasks."""
|
||||
|
||||
def exit(signum, frame):
|
||||
shutdown()
|
||||
sys.exit(0)
|
||||
def sigterm_handler(signum, frame):
|
||||
shutdown(True)
|
||||
sys.exit(1)
|
||||
|
||||
signal.signal(signal.SIGTERM, exit)
|
||||
|
||||
while True:
|
||||
task = self._get_next_task_from_raylet()
|
||||
self._wait_for_and_process_task(task)
|
||||
signal.signal(signal.SIGTERM, sigterm_handler)
|
||||
self.core_worker.run_task_loop()
|
||||
|
||||
|
||||
def get_gpu_ids():
|
||||
@@ -982,7 +716,7 @@ def get_gpu_ids():
|
||||
raise Exception("ray.get_gpu_ids() currently does not work in LOCAL "
|
||||
"MODE.")
|
||||
|
||||
all_resource_ids = global_worker.raylet_client.resource_ids()
|
||||
all_resource_ids = global_worker.core_worker.resource_ids()
|
||||
assigned_ids = [
|
||||
resource_id for resource_id, _ in all_resource_ids.get("GPU", [])
|
||||
]
|
||||
@@ -1010,7 +744,7 @@ def get_resource_ids():
|
||||
"ray.get_resource_ids() currently does not work in LOCAL "
|
||||
"MODE.")
|
||||
|
||||
return global_worker.raylet_client.resource_ids()
|
||||
return global_worker.core_worker.resource_ids()
|
||||
|
||||
|
||||
def get_webui_url():
|
||||
@@ -1437,7 +1171,7 @@ def shutdown(exiting_interpreter=False):
|
||||
# to make sure that log messages finish printing.
|
||||
time.sleep(0.5)
|
||||
|
||||
disconnect()
|
||||
disconnect(exiting_interpreter)
|
||||
|
||||
# Disconnect global state from GCS.
|
||||
ray.state.state.disconnect()
|
||||
@@ -1456,6 +1190,13 @@ def shutdown(exiting_interpreter=False):
|
||||
|
||||
atexit.register(shutdown, True)
|
||||
|
||||
|
||||
def sigterm_handler(signum, frame):
    """Exit the process, using the SIGTERM number as the exit status.

    Both handler arguments are ignored.
    """
    exit_status = signal.SIGTERM
    sys.exit(exit_status)
|
||||
|
||||
|
||||
signal.signal(signal.SIGTERM, sigterm_handler)
|
||||
|
||||
# Define a custom excepthook so that if the driver exits with an exception, we
|
||||
# can push that exception to Redis.
|
||||
normal_excepthook = sys.excepthook
|
||||
@@ -1900,7 +1641,7 @@ def connect(node,
|
||||
worker.cached_functions_to_run = None
|
||||
|
||||
|
||||
def disconnect():
|
||||
def disconnect(exiting_interpreter=False):
|
||||
"""Disconnect this worker from the raylet and object store."""
|
||||
# Reset the list of cached remote functions and actors so that if more
|
||||
# remote functions or actors are defined and then connect is called again,
|
||||
@@ -1928,10 +1669,12 @@ def disconnect():
|
||||
worker.function_actor_manager.reset_cache()
|
||||
worker.serialization_context_map.clear()
|
||||
|
||||
if hasattr(worker, "raylet_client"):
|
||||
del worker.raylet_client
|
||||
if hasattr(worker, "core_worker"):
|
||||
del worker.core_worker
|
||||
if not exiting_interpreter:
|
||||
if hasattr(worker, "raylet_client"):
|
||||
del worker.raylet_client
|
||||
|
||||
if hasattr(worker, "core_worker"):
|
||||
del worker.core_worker
|
||||
|
||||
|
||||
@contextmanager
|
||||
|
||||
@@ -3,7 +3,6 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import traceback
|
||||
|
||||
import ray
|
||||
import ray.actor
|
||||
@@ -86,30 +85,5 @@ if __name__ == "__main__":
|
||||
node = ray.node.Node(
|
||||
ray_params, head=False, shutdown_at_exit=False, connect_only=True)
|
||||
ray.worker._global_node = node
|
||||
|
||||
ray.worker.connect(node, mode=ray.WORKER_MODE)
|
||||
|
||||
error_explanation = """
|
||||
This error is unexpected and should not have happened. Somehow a worker
|
||||
crashed in an unanticipated way causing the main_loop to throw an exception,
|
||||
which is being caught in "python/ray/workers/default_worker.py".
|
||||
"""
|
||||
|
||||
try:
|
||||
# This call to main_loop should never return if things are working.
|
||||
# Most exceptions that are thrown (e.g., inside the execution of a
|
||||
# task) should be caught and handled inside of the call to
|
||||
# main_loop. If an exception is thrown here, then that means that
|
||||
# there is some error that we didn't anticipate.
|
||||
ray.worker.global_worker.main_loop()
|
||||
except Exception:
|
||||
traceback_str = traceback.format_exc() + error_explanation
|
||||
ray.utils.push_error_to_driver(
|
||||
ray.worker.global_worker,
|
||||
"worker_crash",
|
||||
traceback_str,
|
||||
job_id=None)
|
||||
# TODO(rkn): Note that if the worker was in the middle of executing
|
||||
# a task, then any worker or driver that is blocking in a get call
|
||||
# and waiting for the output of that task will hang. We need to
|
||||
# address this.
|
||||
ray.worker.global_worker.main_loop()
|
||||
|
||||
Reference in New Issue
Block a user