mirror of
https://github.com/wassname/ray.git
synced 2026-07-05 14:44:48 +08:00
Support multiple core workers in one process (#7623)
This commit is contained in:
@@ -17,7 +17,7 @@ from ray.includes.common cimport (
|
||||
CBuffer,
|
||||
CRayObject
|
||||
)
|
||||
from ray.includes.libcoreworker cimport CCoreWorker
|
||||
from ray.includes.libcoreworker cimport CFiberEvent
|
||||
from ray.includes.unique_ids cimport (
|
||||
CObjectID,
|
||||
CActorID
|
||||
@@ -72,7 +72,7 @@ cdef class ActorID(BaseID):
|
||||
|
||||
cdef class CoreWorker:
|
||||
cdef:
|
||||
unique_ptr[CCoreWorker] core_worker
|
||||
c_bool is_driver
|
||||
object async_thread
|
||||
object async_event_loop
|
||||
object plasma_event_handler
|
||||
@@ -85,6 +85,7 @@ cdef class CoreWorker:
|
||||
cdef store_task_outputs(
|
||||
self, worker, outputs, const c_vector[CObjectID] return_ids,
|
||||
c_vector[shared_ptr[CRayObject]] *returns)
|
||||
cdef yield_current_fiber(self, CFiberEvent &fiber_event)
|
||||
|
||||
cdef class FunctionDescriptor:
|
||||
cdef:
|
||||
|
||||
+117
-69
@@ -69,7 +69,8 @@ from ray.includes.unique_ids cimport (
|
||||
)
|
||||
from ray.includes.libcoreworker cimport (
|
||||
CActorCreationOptions,
|
||||
CCoreWorker,
|
||||
CCoreWorkerOptions,
|
||||
CCoreWorkerProcess,
|
||||
CTaskOptions,
|
||||
ResourceMappingType,
|
||||
CFiberEvent,
|
||||
@@ -312,7 +313,7 @@ cdef execute_task(
|
||||
dict execution_infos = manager.execution_infos
|
||||
CoreWorker core_worker = worker.core_worker
|
||||
JobID job_id = core_worker.get_current_job_id()
|
||||
CTaskID task_id = core_worker.core_worker.get().GetCurrentTaskId()
|
||||
TaskID task_id = core_worker.get_current_task_id()
|
||||
CFiberEvent task_done_event
|
||||
|
||||
# Automatically restrict the GPUs available to this task.
|
||||
@@ -339,7 +340,7 @@ cdef execute_task(
|
||||
|
||||
function_name = execution_info.function_name
|
||||
extra_data = (b'{"name": ' + function_name.encode("ascii") +
|
||||
b' "task_id": ' + task_id.Hex() + b'}')
|
||||
b' "task_id": ' + task_id.hex().encode("ascii") + b'}')
|
||||
|
||||
if <int>task_type == <int>TASK_TYPE_NORMAL_TASK:
|
||||
title = "ray::{}()".format(function_name)
|
||||
@@ -396,9 +397,7 @@ cdef execute_task(
|
||||
monitor_state.unregister_coroutine(coroutine)
|
||||
|
||||
future.add_done_callback(callback)
|
||||
with nogil:
|
||||
(core_worker.core_worker.get()
|
||||
.YieldCurrentFiber(task_done_event))
|
||||
core_worker.yield_current_fiber(task_done_event)
|
||||
|
||||
return future.result()
|
||||
|
||||
@@ -499,8 +498,7 @@ cdef CRayStatus task_execution_handler(
|
||||
const c_vector[shared_ptr[CRayObject]] &c_args,
|
||||
const c_vector[CObjectID] &c_arg_reference_ids,
|
||||
const c_vector[CObjectID] &c_return_ids,
|
||||
c_vector[shared_ptr[CRayObject]] *returns,
|
||||
const CWorkerID &c_worker_id) nogil:
|
||||
c_vector[shared_ptr[CRayObject]] *returns) nogil:
|
||||
|
||||
with gil:
|
||||
try:
|
||||
@@ -645,43 +643,76 @@ cdef class CoreWorker:
|
||||
|
||||
def __cinit__(self, is_driver, store_socket, raylet_socket,
|
||||
JobID job_id, GcsClientOptions gcs_options, log_dir,
|
||||
node_ip_address, node_manager_port, local_mode):
|
||||
use_driver = is_driver or local_mode
|
||||
self.core_worker.reset(new CCoreWorker(
|
||||
WORKER_TYPE_DRIVER if use_driver else WORKER_TYPE_WORKER,
|
||||
LANGUAGE_PYTHON, store_socket.encode("ascii"),
|
||||
raylet_socket.encode("ascii"), job_id.native(),
|
||||
gcs_options.native()[0], log_dir.encode("utf-8"),
|
||||
node_ip_address.encode("utf-8"), node_manager_port,
|
||||
task_execution_handler, check_signals, gc_collect,
|
||||
get_py_stack, True, local_mode))
|
||||
node_ip_address, node_manager_port, local_mode,
|
||||
driver_name, stdout_file, stderr_file):
|
||||
self.is_driver = is_driver
|
||||
self.is_local_mode = local_mode
|
||||
|
||||
cdef CCoreWorkerOptions options = CCoreWorkerOptions()
|
||||
options.worker_type = (
|
||||
WORKER_TYPE_DRIVER if is_driver else WORKER_TYPE_WORKER)
|
||||
options.language = LANGUAGE_PYTHON
|
||||
options.store_socket = store_socket.encode("ascii")
|
||||
options.raylet_socket = raylet_socket.encode("ascii")
|
||||
options.job_id = job_id.native()
|
||||
options.gcs_options = gcs_options.native()[0]
|
||||
options.log_dir = log_dir.encode("utf-8")
|
||||
options.install_failure_signal_handler = True
|
||||
options.node_ip_address = node_ip_address.encode("utf-8")
|
||||
options.node_manager_port = node_manager_port
|
||||
options.driver_name = driver_name
|
||||
options.stdout_file = stdout_file
|
||||
options.stderr_file = stderr_file
|
||||
options.task_execution_callback = task_execution_handler
|
||||
options.check_signals = check_signals
|
||||
options.gc_collect = gc_collect
|
||||
options.get_lang_stack = get_py_stack
|
||||
options.ref_counting_enabled = True
|
||||
options.is_local_mode = local_mode
|
||||
options.num_workers = 1
|
||||
|
||||
CCoreWorkerProcess.Initialize(options)
|
||||
|
||||
def __dealloc__(self):
|
||||
with nogil:
|
||||
# If it's a worker, the core worker process should have been
|
||||
# shutdown. So we can't call
|
||||
# `CCoreWorkerProcess.GetCoreWorker().GetWorkerType()` here.
|
||||
# Instead, we use the cached `is_driver` flag to test if it's a
|
||||
# driver.
|
||||
if self.is_driver:
|
||||
CCoreWorkerProcess.Shutdown()
|
||||
|
||||
def run_task_loop(self):
|
||||
with nogil:
|
||||
self.core_worker.get().StartExecutingTasks()
|
||||
CCoreWorkerProcess.RunTaskExecutionLoop()
|
||||
|
||||
def get_current_task_id(self):
|
||||
return TaskID(self.core_worker.get().GetCurrentTaskId().Binary())
|
||||
return TaskID(
|
||||
CCoreWorkerProcess.GetCoreWorker().GetCurrentTaskId().Binary())
|
||||
|
||||
def get_current_job_id(self):
|
||||
return JobID(self.core_worker.get().GetCurrentJobId().Binary())
|
||||
return JobID(
|
||||
CCoreWorkerProcess.GetCoreWorker().GetCurrentJobId().Binary())
|
||||
|
||||
def get_actor_id(self):
|
||||
return ActorID(self.core_worker.get().GetActorId().Binary())
|
||||
return ActorID(
|
||||
CCoreWorkerProcess.GetCoreWorker().GetActorId().Binary())
|
||||
|
||||
def set_webui_display(self, key, message):
|
||||
self.core_worker.get().SetWebuiDisplay(key, message)
|
||||
CCoreWorkerProcess.GetCoreWorker().SetWebuiDisplay(key, message)
|
||||
|
||||
def set_actor_title(self, title):
|
||||
self.core_worker.get().SetActorTitle(title)
|
||||
CCoreWorkerProcess.GetCoreWorker().SetActorTitle(title)
|
||||
|
||||
def set_plasma_added_callback(self, plasma_event_handler):
|
||||
self.plasma_event_handler = plasma_event_handler
|
||||
self.core_worker.get().SetPlasmaAddedCallback(async_plasma_callback)
|
||||
CCoreWorkerProcess.GetCoreWorker().SetPlasmaAddedCallback(
|
||||
async_plasma_callback)
|
||||
|
||||
def subscribe_to_plasma_object(self, ObjectID object_id):
|
||||
self.core_worker.get().SubscribeToPlasmaAdd(object_id.native())
|
||||
CCoreWorkerProcess.GetCoreWorker().SubscribeToPlasmaAdd(
|
||||
object_id.native())
|
||||
|
||||
def get_plasma_event_handler(self):
|
||||
return self.plasma_event_handler
|
||||
@@ -694,7 +725,7 @@ cdef class CoreWorker:
|
||||
c_vector[CObjectID] c_object_ids = ObjectIDsToVector(object_ids)
|
||||
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Get(
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().Get(
|
||||
c_object_ids, timeout_ms, &results))
|
||||
|
||||
return RayObjectsToDataMetadataPairs(results)
|
||||
@@ -705,7 +736,7 @@ cdef class CoreWorker:
|
||||
CObjectID c_object_id = object_id.native()
|
||||
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Contains(
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().Contains(
|
||||
c_object_id, &has_object))
|
||||
|
||||
return has_object
|
||||
@@ -716,13 +747,13 @@ cdef class CoreWorker:
|
||||
CObjectID *c_object_id, shared_ptr[CBuffer] *data):
|
||||
if object_id is None:
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Create(
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().Create(
|
||||
metadata, data_size, contained_ids,
|
||||
c_object_id, data))
|
||||
else:
|
||||
c_object_id[0] = object_id.native()
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Create(
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().Create(
|
||||
metadata, data_size,
|
||||
c_object_id[0], data))
|
||||
|
||||
@@ -752,7 +783,7 @@ cdef class CoreWorker:
|
||||
write_serialized_object(serialized_object, data)
|
||||
if self.is_local_mode:
|
||||
c_object_id_vector.push_back(c_object_id)
|
||||
check_status(self.core_worker.get().Put(
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().Put(
|
||||
CRayObject(data, metadata, c_object_id_vector),
|
||||
c_object_id_vector, c_object_id))
|
||||
else:
|
||||
@@ -760,7 +791,7 @@ cdef class CoreWorker:
|
||||
# Using custom object IDs is not supported because we can't
|
||||
# track their lifecycle, so we don't pin the object in this
|
||||
# case.
|
||||
check_status(self.core_worker.get().Seal(
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().Seal(
|
||||
c_object_id,
|
||||
pin_object and object_id is None))
|
||||
|
||||
@@ -775,7 +806,7 @@ cdef class CoreWorker:
|
||||
|
||||
wait_ids = ObjectIDsToVector(object_ids)
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Wait(
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().Wait(
|
||||
wait_ids, num_returns, timeout_ms, &results))
|
||||
|
||||
assert len(results) == len(object_ids)
|
||||
@@ -795,19 +826,19 @@ cdef class CoreWorker:
|
||||
c_vector[CObjectID] free_ids = ObjectIDsToVector(object_ids)
|
||||
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().Delete(
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().Delete(
|
||||
free_ids, local_only, delete_creating_tasks))
|
||||
|
||||
def global_gc(self):
|
||||
with nogil:
|
||||
self.core_worker.get().TriggerGlobalGC()
|
||||
CCoreWorkerProcess.GetCoreWorker().TriggerGlobalGC()
|
||||
|
||||
def set_object_store_client_options(self, client_name,
|
||||
int64_t limit_bytes):
|
||||
try:
|
||||
logger.debug("Setting plasma memory limit to {} for {}".format(
|
||||
limit_bytes, client_name))
|
||||
check_status(self.core_worker.get().SetClientOptions(
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().SetClientOptions(
|
||||
client_name.encode("ascii"), limit_bytes))
|
||||
except RayError as e:
|
||||
self.dump_object_store_memory_usage()
|
||||
@@ -820,7 +851,7 @@ cdef class CoreWorker:
|
||||
limit_bytes, client_name, e))
|
||||
|
||||
def dump_object_store_memory_usage(self):
|
||||
message = self.core_worker.get().MemoryUsageString()
|
||||
message = CCoreWorkerProcess.GetCoreWorker().MemoryUsageString()
|
||||
logger.warning("Local object store memory usage:\n{}\n".format(
|
||||
message.decode("utf-8")))
|
||||
|
||||
@@ -847,7 +878,7 @@ cdef class CoreWorker:
|
||||
prepare_args(self, args, &args_vector)
|
||||
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().SubmitTask(
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().SubmitTask(
|
||||
ray_function, args_vector, task_options, &return_ids,
|
||||
max_retries))
|
||||
|
||||
@@ -880,7 +911,7 @@ cdef class CoreWorker:
|
||||
prepare_args(self, args, &args_vector)
|
||||
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().CreateActor(
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().CreateActor(
|
||||
ray_function, args_vector,
|
||||
CActorCreationOptions(
|
||||
max_reconstructions, max_concurrency,
|
||||
@@ -916,10 +947,11 @@ cdef class CoreWorker:
|
||||
prepare_args(self, args, &args_vector)
|
||||
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().SubmitActorTask(
|
||||
c_actor_id,
|
||||
ray_function,
|
||||
args_vector, task_options, &return_ids))
|
||||
check_status(
|
||||
CCoreWorkerProcess.GetCoreWorker().SubmitActorTask(
|
||||
c_actor_id,
|
||||
ray_function,
|
||||
args_vector, task_options, &return_ids))
|
||||
|
||||
return VectorToObjectIDs(return_ids)
|
||||
|
||||
@@ -928,13 +960,13 @@ cdef class CoreWorker:
|
||||
CActorID c_actor_id = actor_id.native()
|
||||
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().KillActor(
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().KillActor(
|
||||
c_actor_id, True, no_reconstruction))
|
||||
|
||||
def resource_ids(self):
|
||||
cdef:
|
||||
ResourceMappingType resource_mapping = (
|
||||
self.core_worker.get().GetResourceIDs())
|
||||
CCoreWorkerProcess.GetCoreWorker().GetResourceIDs())
|
||||
unordered_map[
|
||||
c_string, c_vector[pair[int64_t, double]]
|
||||
].iterator iterator = resource_mapping.begin()
|
||||
@@ -955,13 +987,14 @@ cdef class CoreWorker:
|
||||
|
||||
def profile_event(self, c_string event_type, object extra_data=None):
|
||||
return ProfileEvent.make(
|
||||
self.core_worker.get().CreateProfileEvent(event_type),
|
||||
CCoreWorkerProcess.GetCoreWorker().CreateProfileEvent(event_type),
|
||||
extra_data)
|
||||
|
||||
def remove_actor_handle_reference(self, ActorID actor_id):
|
||||
cdef:
|
||||
CActorID c_actor_id = actor_id.native()
|
||||
self.core_worker.get().RemoveActorHandleReference(c_actor_id)
|
||||
CCoreWorkerProcess.GetCoreWorker().RemoveActorHandleReference(
|
||||
c_actor_id)
|
||||
|
||||
def deserialize_and_register_actor_handle(self, const c_string &bytes,
|
||||
ObjectID
|
||||
@@ -974,9 +1007,10 @@ cdef class CoreWorker:
|
||||
worker = ray.worker.global_worker
|
||||
worker.check_connected()
|
||||
manager = worker.function_actor_manager
|
||||
c_actor_id = self.core_worker.get().DeserializeAndRegisterActorHandle(
|
||||
bytes, c_outer_object_id)
|
||||
check_status(self.core_worker.get().GetActorHandle(
|
||||
c_actor_id = (CCoreWorkerProcess.GetCoreWorker()
|
||||
.DeserializeAndRegisterActorHandle(
|
||||
bytes, c_outer_object_id))
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().GetActorHandle(
|
||||
c_actor_id, &c_actor_handle))
|
||||
actor_id = ActorID(c_actor_id.Binary())
|
||||
job_id = JobID(c_actor_handle.CreationJobID().Binary())
|
||||
@@ -1017,24 +1051,26 @@ cdef class CoreWorker:
|
||||
cdef:
|
||||
c_string output
|
||||
CObjectID c_actor_handle_id
|
||||
check_status(self.core_worker.get().SerializeActorHandle(
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().SerializeActorHandle(
|
||||
actor_id.native(), &output, &c_actor_handle_id))
|
||||
return output, ObjectID(c_actor_handle_id.Binary())
|
||||
|
||||
def add_object_id_reference(self, ObjectID object_id):
|
||||
# Note: faster to not release GIL for short-running op.
|
||||
self.core_worker.get().AddLocalReference(object_id.native())
|
||||
CCoreWorkerProcess.GetCoreWorker().AddLocalReference(
|
||||
object_id.native())
|
||||
|
||||
def remove_object_id_reference(self, ObjectID object_id):
|
||||
# Note: faster to not release GIL for short-running op.
|
||||
self.core_worker.get().RemoveLocalReference(object_id.native())
|
||||
CCoreWorkerProcess.GetCoreWorker().RemoveLocalReference(
|
||||
object_id.native())
|
||||
|
||||
def serialize_and_promote_object_id(self, ObjectID object_id):
|
||||
cdef:
|
||||
CObjectID c_object_id = object_id.native()
|
||||
CTaskID c_owner_id = CTaskID.Nil()
|
||||
CAddress c_owner_address = CAddress()
|
||||
self.core_worker.get().PromoteToPlasmaAndGetOwnershipInfo(
|
||||
CCoreWorkerProcess.GetCoreWorker().PromoteToPlasmaAndGetOwnershipInfo(
|
||||
c_object_id, &c_owner_id, &c_owner_address)
|
||||
return (object_id,
|
||||
TaskID(c_owner_id.Binary()),
|
||||
@@ -1053,11 +1089,12 @@ cdef class CoreWorker:
|
||||
CAddress c_owner_address = CAddress()
|
||||
|
||||
c_owner_address.ParseFromString(serialized_owner_address)
|
||||
self.core_worker.get().RegisterOwnershipInfoAndResolveFuture(
|
||||
(CCoreWorkerProcess.GetCoreWorker()
|
||||
.RegisterOwnershipInfoAndResolveFuture(
|
||||
c_object_id,
|
||||
c_outer_object_id,
|
||||
c_owner_id,
|
||||
c_owner_address)
|
||||
c_owner_address))
|
||||
|
||||
cdef store_task_outputs(
|
||||
self, worker, outputs, const c_vector[CObjectID] return_ids,
|
||||
@@ -1088,8 +1125,10 @@ cdef class CoreWorker:
|
||||
ObjectIDsToVector(serialized_object.contained_object_ids))
|
||||
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().AllocateReturnObjects(
|
||||
return_ids, data_sizes, metadatas, contained_ids, returns))
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker()
|
||||
.AllocateReturnObjects(
|
||||
return_ids, data_sizes, metadatas, contained_ids,
|
||||
returns))
|
||||
|
||||
for i, serialized_object in enumerate(serialized_objects):
|
||||
# A nullptr is returned if the object already exists.
|
||||
@@ -1099,7 +1138,7 @@ cdef class CoreWorker:
|
||||
if self.is_local_mode:
|
||||
return_ids_vector.push_back(return_ids[i])
|
||||
check_status(
|
||||
self.core_worker.get().Put(
|
||||
CCoreWorkerProcess.GetCoreWorker().Put(
|
||||
CRayObject(returns[0][i].get().GetData(),
|
||||
returns[0][i].get().GetMetadata(),
|
||||
return_ids_vector),
|
||||
@@ -1138,7 +1177,7 @@ cdef class CoreWorker:
|
||||
future = asyncio.run_coroutine_threadsafe(coroutine, loop)
|
||||
future.add_done_callback(lambda _: event.Notify())
|
||||
with nogil:
|
||||
(self.core_worker.get()
|
||||
(CCoreWorkerProcess.GetCoreWorker()
|
||||
.YieldCurrentFiber(event))
|
||||
return future.result()
|
||||
|
||||
@@ -1149,14 +1188,20 @@ cdef class CoreWorker:
|
||||
self.async_thread.join()
|
||||
|
||||
def current_actor_is_asyncio(self):
|
||||
return self.core_worker.get().GetWorkerContext().CurrentActorIsAsync()
|
||||
return (CCoreWorkerProcess.GetCoreWorker().GetWorkerContext()
|
||||
.CurrentActorIsAsync())
|
||||
|
||||
cdef yield_current_fiber(self, CFiberEvent &fiber_event):
|
||||
with nogil:
|
||||
CCoreWorkerProcess.GetCoreWorker().YieldCurrentFiber(fiber_event)
|
||||
|
||||
def get_all_reference_counts(self):
|
||||
cdef:
|
||||
unordered_map[CObjectID, pair[size_t, size_t]] c_ref_counts
|
||||
unordered_map[CObjectID, pair[size_t, size_t]].iterator it
|
||||
|
||||
c_ref_counts = self.core_worker.get().GetAllReferenceCounts()
|
||||
c_ref_counts = (
|
||||
CCoreWorkerProcess.GetCoreWorker().GetAllReferenceCounts())
|
||||
it = c_ref_counts.begin()
|
||||
|
||||
ref_counts = {}
|
||||
@@ -1170,7 +1215,7 @@ cdef class CoreWorker:
|
||||
return ref_counts
|
||||
|
||||
def in_memory_store_get_async(self, ObjectID object_id, future):
|
||||
self.core_worker.get().GetAsync(
|
||||
CCoreWorkerProcess.GetCoreWorker().GetAsync(
|
||||
object_id.native(),
|
||||
async_set_result_callback,
|
||||
async_retry_with_plasma_callback,
|
||||
@@ -1178,7 +1223,7 @@ cdef class CoreWorker:
|
||||
|
||||
def push_error(self, JobID job_id, error_type, error_message,
|
||||
double timestamp):
|
||||
check_status(self.core_worker.get().PushError(
|
||||
check_status(CCoreWorkerProcess.GetCoreWorker().PushError(
|
||||
job_id.native(), error_type.encode("ascii"),
|
||||
error_message.encode("ascii"), timestamp))
|
||||
|
||||
@@ -1190,18 +1235,21 @@ cdef class CoreWorker:
|
||||
# PrepareActorCheckpoint will wait for raylet's reply, release
|
||||
# the GIL so other Python threads can run.
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().PrepareActorCheckpoint(
|
||||
c_actor_id, &checkpoint_id))
|
||||
check_status(
|
||||
CCoreWorkerProcess.GetCoreWorker()
|
||||
.PrepareActorCheckpoint(c_actor_id, &checkpoint_id))
|
||||
return ActorCheckpointID(checkpoint_id.Binary())
|
||||
|
||||
def notify_actor_resumed_from_checkpoint(self, ActorID actor_id,
|
||||
ActorCheckpointID checkpoint_id):
|
||||
check_status(self.core_worker.get().NotifyActorResumedFromCheckpoint(
|
||||
actor_id.native(), checkpoint_id.native()))
|
||||
check_status(
|
||||
CCoreWorkerProcess.GetCoreWorker()
|
||||
.NotifyActorResumedFromCheckpoint(
|
||||
actor_id.native(), checkpoint_id.native()))
|
||||
|
||||
def set_resource(self, basestring resource_name,
|
||||
double capacity, ClientID client_id):
|
||||
self.core_worker.get().SetResource(
|
||||
CCoreWorkerProcess.GetCoreWorker().SetResource(
|
||||
resource_name.encode("ascii"), capacity,
|
||||
CClientID.FromBinary(client_id.binary()))
|
||||
|
||||
|
||||
@@ -17,7 +17,6 @@ from ray.includes.unique_ids cimport (
|
||||
CJobID,
|
||||
CTaskID,
|
||||
CObjectID,
|
||||
CWorkerID,
|
||||
)
|
||||
from ray.includes.common cimport (
|
||||
CAddress,
|
||||
@@ -80,31 +79,9 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
c_string ExtensionData() const
|
||||
|
||||
cdef cppclass CCoreWorker "ray::CoreWorker":
|
||||
CCoreWorker(const CWorkerType worker_type, const CLanguage language,
|
||||
const c_string &store_socket,
|
||||
const c_string &raylet_socket, const CJobID &job_id,
|
||||
const CGcsClientOptions &gcs_options,
|
||||
const c_string &log_dir, const c_string &node_ip_address,
|
||||
int node_manager_port,
|
||||
CRayStatus (
|
||||
CTaskType task_type,
|
||||
const CRayFunction &ray_function,
|
||||
const unordered_map[c_string, double] &resources,
|
||||
const c_vector[shared_ptr[CRayObject]] &args,
|
||||
const c_vector[CObjectID] &arg_reference_ids,
|
||||
const c_vector[CObjectID] &return_ids,
|
||||
c_vector[shared_ptr[CRayObject]] *returns,
|
||||
const CWorkerID &worker_id) nogil,
|
||||
CRayStatus() nogil,
|
||||
void() nogil,
|
||||
void(c_string *stack_out) nogil,
|
||||
c_bool ref_counting_enabled,
|
||||
c_bool local_worker)
|
||||
CWorkerType &GetWorkerType()
|
||||
CLanguage &GetLanguage()
|
||||
|
||||
void StartExecutingTasks()
|
||||
|
||||
CRayStatus SubmitTask(
|
||||
const CRayFunction &function, const c_vector[CTaskArg] &args,
|
||||
const CTaskOptions &options, c_vector[CObjectID] *return_ids,
|
||||
@@ -206,3 +183,46 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
void SetPlasmaAddedCallback(plasma_callback_function callback)
|
||||
|
||||
void SubscribeToPlasmaAdd(const CObjectID &object_id)
|
||||
|
||||
cdef cppclass CCoreWorkerOptions "ray::CoreWorkerOptions":
|
||||
CWorkerType worker_type
|
||||
CLanguage language
|
||||
c_string store_socket
|
||||
c_string raylet_socket
|
||||
CJobID job_id
|
||||
CGcsClientOptions gcs_options
|
||||
c_string log_dir
|
||||
c_bool install_failure_signal_handler
|
||||
c_string node_ip_address
|
||||
int node_manager_port
|
||||
c_string driver_name
|
||||
c_string stdout_file
|
||||
c_string stderr_file
|
||||
(CRayStatus(
|
||||
CTaskType task_type,
|
||||
const CRayFunction &ray_function,
|
||||
const unordered_map[c_string, double] &resources,
|
||||
const c_vector[shared_ptr[CRayObject]] &args,
|
||||
const c_vector[CObjectID] &arg_reference_ids,
|
||||
const c_vector[CObjectID] &return_ids,
|
||||
c_vector[shared_ptr[CRayObject]] *returns) nogil
|
||||
) task_execution_callback
|
||||
(CRayStatus() nogil) check_signals
|
||||
(void() nogil) gc_collect
|
||||
(void(c_string *stack_out) nogil) get_lang_stack
|
||||
c_bool ref_counting_enabled
|
||||
c_bool is_local_mode
|
||||
int num_workers
|
||||
CCoreWorkerOptions()
|
||||
|
||||
cdef cppclass CCoreWorkerProcess "ray::CoreWorkerProcess":
|
||||
@staticmethod
|
||||
void Initialize(const CCoreWorkerOptions &options)
|
||||
# Only call this in CoreWorker.__cinit__,
|
||||
# use CoreWorker.core_worker to access C++ CoreWorker.
|
||||
@staticmethod
|
||||
CCoreWorker &GetCoreWorker()
|
||||
@staticmethod
|
||||
void Shutdown()
|
||||
@staticmethod
|
||||
void RunTaskExecutionLoop()
|
||||
|
||||
+20
-25
@@ -1173,27 +1173,14 @@ def connect(node,
|
||||
ray.state.state._initialize_global_state(
|
||||
node.redis_address, redis_password=node.redis_password)
|
||||
|
||||
# Register the worker with Redis.
|
||||
driver_name = ""
|
||||
log_stdout_file_name = ""
|
||||
log_stderr_file_name = ""
|
||||
if mode == SCRIPT_MODE:
|
||||
# The concept of a driver is the same as the concept of a "job".
|
||||
# Register the driver/job with Redis here.
|
||||
import __main__ as main
|
||||
driver_info = {
|
||||
"node_ip_address": node.node_ip_address,
|
||||
"driver_id": worker.worker_id,
|
||||
"start_time": time.time(),
|
||||
"plasma_store_socket": node.plasma_store_socket_name,
|
||||
"raylet_socket": node.raylet_socket_name,
|
||||
"name": (main.__file__
|
||||
if hasattr(main, "__file__") else "INTERACTIVE MODE")
|
||||
}
|
||||
worker.redis_client.hmset(b"Drivers:" + worker.worker_id, driver_info)
|
||||
driver_name = (main.__file__
|
||||
if hasattr(main, "__file__") else "INTERACTIVE MODE")
|
||||
elif mode == WORKER_MODE:
|
||||
# Register the worker with Redis.
|
||||
worker_dict = {
|
||||
"node_ip_address": node.node_ip_address,
|
||||
"plasma_store_socket": node.plasma_store_socket_name,
|
||||
}
|
||||
# Check the RedirectOutput key in Redis and based on its value redirect
|
||||
# worker output and error to their own files.
|
||||
# This key is set in services.py when Redis is started.
|
||||
@@ -1224,14 +1211,12 @@ def connect(node,
|
||||
print("Ray worker pid: {}".format(os.getpid()), file=sys.stderr)
|
||||
sys.stdout.flush()
|
||||
sys.stderr.flush()
|
||||
|
||||
worker_dict["stdout_file"] = os.path.abspath(
|
||||
log_stdout_file_name = os.path.abspath(
|
||||
(log_stdout_file
|
||||
if log_stdout_file is not None else sys.stdout).name)
|
||||
worker_dict["stderr_file"] = os.path.abspath(
|
||||
log_stderr_file_name = os.path.abspath(
|
||||
(log_stderr_file
|
||||
if log_stderr_file is not None else sys.stderr).name)
|
||||
worker.redis_client.hmset(b"Workers:" + worker.worker_id, worker_dict)
|
||||
elif not LOCAL_MODE:
|
||||
raise ValueError(
|
||||
"Invalid worker mode. Expected DRIVER, WORKER or LOCAL.")
|
||||
@@ -1242,9 +1227,19 @@ def connect(node,
|
||||
node.redis_password,
|
||||
)
|
||||
worker.core_worker = ray._raylet.CoreWorker(
|
||||
(mode == SCRIPT_MODE), node.plasma_store_socket_name,
|
||||
node.raylet_socket_name, job_id, gcs_options, node.get_logs_dir_path(),
|
||||
node.node_ip_address, node.node_manager_port, mode == LOCAL_MODE)
|
||||
(mode == SCRIPT_MODE or mode == LOCAL_MODE),
|
||||
node.plasma_store_socket_name,
|
||||
node.raylet_socket_name,
|
||||
job_id,
|
||||
gcs_options,
|
||||
node.get_logs_dir_path(),
|
||||
node.node_ip_address,
|
||||
node.node_manager_port,
|
||||
(mode == LOCAL_MODE),
|
||||
driver_name,
|
||||
log_stdout_file_name,
|
||||
log_stderr_file_name,
|
||||
)
|
||||
|
||||
if driver_object_store_memory is not None:
|
||||
worker.core_worker.set_object_store_client_options(
|
||||
|
||||
Reference in New Issue
Block a user