mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 03:34:48 +08:00
Remove raylet client from Python worker (#6018)
This commit is contained in:
+29
-61
@@ -58,12 +58,6 @@ from ray.includes.common cimport (
|
||||
WORKER_TYPE_WORKER,
|
||||
WORKER_TYPE_DRIVER,
|
||||
)
|
||||
from ray.includes.libraylet cimport (
|
||||
CRayletClient,
|
||||
GCSProfileEvent,
|
||||
GCSProfileTableData,
|
||||
WaitResultPair,
|
||||
)
|
||||
from ray.includes.unique_ids cimport (
|
||||
CActorID,
|
||||
CActorCheckpointID,
|
||||
@@ -306,60 +300,6 @@ cdef void prepare_args(
|
||||
(<ObjectID>core_worker.put_serialized_object(
|
||||
serialized_arg)).native()))
|
||||
|
||||
|
||||
cdef class RayletClient:
|
||||
cdef CRayletClient* client
|
||||
|
||||
def __cinit__(self, CoreWorker core_worker):
|
||||
# The core worker and raylet client need to share an underlying
|
||||
# raylet client, so we take a reference to the core worker's client
|
||||
# here. The client is a raw pointer because it is only a temporary
|
||||
# workaround and will be removed once the core worker transition is
|
||||
# complete, so we don't want to change the unique_ptr in core worker
|
||||
# to a shared_ptr. This means the core worker *must* be
|
||||
# initialized before the raylet client.
|
||||
self.client = &core_worker.core_worker.get().GetRayletClient()
|
||||
|
||||
def fetch_or_reconstruct(self, object_ids,
|
||||
c_bool fetch_only,
|
||||
TaskID current_task_id=TaskID.nil()):
|
||||
cdef c_vector[CObjectID] fetch_ids = ObjectIDsToVector(object_ids)
|
||||
check_status(self.client.FetchOrReconstruct(
|
||||
fetch_ids, fetch_only, True, current_task_id.native()))
|
||||
|
||||
def push_error(self, JobID job_id, error_type, error_message,
|
||||
double timestamp):
|
||||
check_status(self.client.PushError(job_id.native(),
|
||||
error_type.encode("ascii"),
|
||||
error_message.encode("ascii"),
|
||||
timestamp))
|
||||
|
||||
def prepare_actor_checkpoint(self, ActorID actor_id):
|
||||
cdef:
|
||||
CActorCheckpointID checkpoint_id
|
||||
CActorID c_actor_id = actor_id.native()
|
||||
|
||||
# PrepareActorCheckpoint will wait for raylet's reply, release
|
||||
# the GIL so other Python threads can run.
|
||||
with nogil:
|
||||
check_status(self.client.PrepareActorCheckpoint(
|
||||
c_actor_id, checkpoint_id))
|
||||
return ActorCheckpointID(checkpoint_id.Binary())
|
||||
|
||||
def notify_actor_resumed_from_checkpoint(self, ActorID actor_id,
|
||||
ActorCheckpointID checkpoint_id):
|
||||
check_status(self.client.NotifyActorResumedFromCheckpoint(
|
||||
actor_id.native(), checkpoint_id.native()))
|
||||
|
||||
def set_resource(self, basestring resource_name,
|
||||
double capacity, ClientID client_id):
|
||||
self.client.SetResource(resource_name.encode("ascii"), capacity,
|
||||
CClientID.FromBinary(client_id.binary()))
|
||||
|
||||
@property
|
||||
def job_id(self):
|
||||
return JobID(self.client.GetJobID().Binary())
|
||||
|
||||
cdef deserialize_args(
|
||||
const c_vector[shared_ptr[CRayObject]] &c_args,
|
||||
const c_vector[CObjectID] &arg_reference_ids):
|
||||
@@ -770,7 +710,6 @@ cdef class CoreWorker:
|
||||
def wait(self, object_ids, int num_returns, int64_t timeout_ms,
|
||||
TaskID current_task_id):
|
||||
cdef:
|
||||
WaitResultPair result
|
||||
c_vector[CObjectID] wait_ids
|
||||
c_vector[c_bool] results
|
||||
CTaskID c_task_id = current_task_id.native()
|
||||
@@ -1099,6 +1038,35 @@ cdef class CoreWorker:
|
||||
async_retry_with_plasma_callback,
|
||||
<void*>future)
|
||||
|
||||
def push_error(self, JobID job_id, error_type, error_message,
|
||||
double timestamp):
|
||||
check_status(self.core_worker.get().PushError(
|
||||
job_id.native(), error_type.encode("ascii"),
|
||||
error_message.encode("ascii"), timestamp))
|
||||
|
||||
def prepare_actor_checkpoint(self, ActorID actor_id):
|
||||
cdef:
|
||||
CActorCheckpointID checkpoint_id
|
||||
CActorID c_actor_id = actor_id.native()
|
||||
|
||||
# PrepareActorCheckpoint will wait for raylet's reply, release
|
||||
# the GIL so other Python threads can run.
|
||||
with nogil:
|
||||
check_status(self.core_worker.get().PrepareActorCheckpoint(
|
||||
c_actor_id, &checkpoint_id))
|
||||
return ActorCheckpointID(checkpoint_id.Binary())
|
||||
|
||||
def notify_actor_resumed_from_checkpoint(self, ActorID actor_id,
|
||||
ActorCheckpointID checkpoint_id):
|
||||
check_status(self.core_worker.get().NotifyActorResumedFromCheckpoint(
|
||||
actor_id.native(), checkpoint_id.native()))
|
||||
|
||||
def set_resource(self, basestring resource_name,
|
||||
double capacity, ClientID client_id):
|
||||
self.core_worker.get().SetResource(
|
||||
resource_name.encode("ascii"), capacity,
|
||||
CClientID.FromBinary(client_id.binary()))
|
||||
|
||||
cdef void async_set_result_callback(shared_ptr[CRayObject] obj,
|
||||
CObjectID object_id,
|
||||
void *future) with gil:
|
||||
|
||||
@@ -31,5 +31,5 @@ def set_resource(resource_name, capacity, client_id=None):
|
||||
if (capacity < 0) or (capacity != int(capacity)):
|
||||
raise ValueError(
|
||||
"Capacity {} must be a non-negative integer.".format(capacity))
|
||||
return ray.worker.global_worker.raylet_client.set_resource(
|
||||
return ray.worker.global_worker.core_worker.set_resource(
|
||||
resource_name, capacity, client_id_obj)
|
||||
|
||||
@@ -817,8 +817,9 @@ class FunctionActorManager:
|
||||
if actor.should_checkpoint(checkpoint_context):
|
||||
try:
|
||||
now = int(1000 * time.time())
|
||||
checkpoint_id = (self._worker.raylet_client.
|
||||
prepare_actor_checkpoint(actor_id))
|
||||
checkpoint_id = (
|
||||
self._worker.core_worker.prepare_actor_checkpoint(actor_id)
|
||||
)
|
||||
checkpoint_info.checkpoint_ids.append(checkpoint_id)
|
||||
actor.save_checkpoint(actor_id, checkpoint_id)
|
||||
if (len(checkpoint_info.checkpoint_ids) >
|
||||
@@ -865,7 +866,7 @@ class FunctionActorManager:
|
||||
for checkpoint in checkpoints), msg
|
||||
# Notify raylet that this actor has been resumed from
|
||||
# a checkpoint.
|
||||
(self._worker.raylet_client.
|
||||
(self._worker.core_worker.
|
||||
notify_actor_resumed_from_checkpoint(
|
||||
actor_id, checkpoint_id))
|
||||
except Exception:
|
||||
|
||||
@@ -12,6 +12,8 @@ from libcpp.vector cimport vector as c_vector
|
||||
|
||||
from ray.includes.unique_ids cimport (
|
||||
CActorID,
|
||||
CActorCheckpointID,
|
||||
CClientID,
|
||||
CJobID,
|
||||
CTaskID,
|
||||
CObjectID,
|
||||
@@ -31,7 +33,6 @@ from ray.includes.common cimport (
|
||||
CGcsClientOptions,
|
||||
)
|
||||
from ray.includes.task cimport CTaskSpec
|
||||
from ray.includes.libraylet cimport CRayletClient
|
||||
|
||||
ctypedef unordered_map[c_string, c_vector[pair[int64_t, double]]] \
|
||||
ResourceMappingType
|
||||
@@ -107,9 +108,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
const c_vector[shared_ptr[CBuffer]] &metadatas,
|
||||
c_vector[shared_ptr[CRayObject]] *return_objects)
|
||||
|
||||
# TODO(edoakes): remove this once the raylet client is no longer used
|
||||
# directly.
|
||||
CRayletClient &GetRayletClient()
|
||||
CJobID GetCurrentJobId()
|
||||
CTaskID GetCurrentTaskId()
|
||||
const CActorID &GetActorId()
|
||||
@@ -159,3 +157,13 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
ray_callback_function successs_callback,
|
||||
ray_callback_function fallback_callback,
|
||||
void* python_future)
|
||||
|
||||
CRayStatus PushError(const CJobID &job_id, const c_string &type,
|
||||
const c_string &error_message, double timestamp)
|
||||
CRayStatus PrepareActorCheckpoint(const CActorID &actor_id,
|
||||
CActorCheckpointID *checkpoint_id)
|
||||
CRayStatus NotifyActorResumedFromCheckpoint(
|
||||
const CActorID &actor_id, const CActorCheckpointID &checkpoint_id)
|
||||
CRayStatus SetResource(const c_string &resource_name,
|
||||
const double capacity,
|
||||
const CClientID &client_Id)
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
from libc.stdint cimport int64_t
|
||||
from libcpp cimport bool as c_bool
|
||||
from libcpp.memory cimport unique_ptr
|
||||
from libcpp.string cimport string as c_string
|
||||
from libcpp.utility cimport pair
|
||||
from libcpp.vector cimport vector as c_vector
|
||||
|
||||
from ray.includes.common cimport (
|
||||
CLanguage,
|
||||
CRayStatus,
|
||||
)
|
||||
from ray.includes.unique_ids cimport (
|
||||
CActorCheckpointID,
|
||||
CActorID,
|
||||
CClientID,
|
||||
CJobID,
|
||||
CWorkerID,
|
||||
CObjectID,
|
||||
CTaskID,
|
||||
)
|
||||
from ray.includes.task cimport CTaskSpec
|
||||
|
||||
|
||||
cdef extern from "ray/protobuf/gcs.pb.h" nogil:
|
||||
cdef cppclass GCSProfileEvent "ProfileTableData::ProfileEvent":
|
||||
void set_event_type(const c_string &value)
|
||||
void set_start_time(double value)
|
||||
void set_end_time(double value)
|
||||
c_string set_extra_data(const c_string &value)
|
||||
GCSProfileEvent()
|
||||
|
||||
cdef cppclass GCSProfileTableData "ProfileTableData":
|
||||
void set_component_type(const c_string &value)
|
||||
void set_component_id(const c_string &value)
|
||||
void set_node_ip_address(const c_string &value)
|
||||
GCSProfileEvent *add_profile_events()
|
||||
GCSProfileTableData()
|
||||
|
||||
|
||||
ctypedef pair[c_vector[CObjectID], c_vector[CObjectID]] WaitResultPair
|
||||
|
||||
|
||||
cdef extern from "ray/raylet/raylet_client.h" nogil:
|
||||
cdef cppclass CRayletClient "ray::raylet::RayletClient":
|
||||
CRayletClient(const c_string &raylet_socket,
|
||||
const CWorkerID &worker_id,
|
||||
c_bool is_worker, const CJobID &job_id,
|
||||
const CLanguage &language)
|
||||
CRayStatus Disconnect()
|
||||
CRayStatus SubmitTask(const CTaskSpec &task_spec)
|
||||
CRayStatus FetchOrReconstruct(c_vector[CObjectID] &object_ids,
|
||||
c_bool fetch_only,
|
||||
c_bool is_direct_call_task,
|
||||
const CTaskID &current_task_id)
|
||||
CRayStatus NotifyUnblocked(const CTaskID &current_task_id)
|
||||
CRayStatus Wait(const c_vector[CObjectID] &object_ids,
|
||||
int num_returns, int64_t timeout_milliseconds,
|
||||
c_bool wait_local, c_bool is_direct_call_task,
|
||||
const CTaskID &current_task_id,
|
||||
WaitResultPair *result)
|
||||
CRayStatus PushError(const CJobID &job_id, const c_string &type,
|
||||
const c_string &error_message, double timestamp)
|
||||
CRayStatus PushProfileEvents(
|
||||
const GCSProfileTableData &profile_events)
|
||||
CRayStatus FreeObjects(const c_vector[CObjectID] &object_ids,
|
||||
c_bool local_only, c_bool delete_creating_tasks)
|
||||
CRayStatus PrepareActorCheckpoint(const CActorID &actor_id,
|
||||
CActorCheckpointID &checkpoint_id)
|
||||
CRayStatus NotifyActorResumedFromCheckpoint(
|
||||
const CActorID &actor_id, const CActorCheckpointID &checkpoint_id)
|
||||
CRayStatus SetResource(const c_string &resource_name,
|
||||
const double capacity,
|
||||
const CClientID &client_Id)
|
||||
CLanguage GetLanguage() const
|
||||
CWorkerID GetWorkerID() const
|
||||
CJobID GetJobID() const
|
||||
c_bool IsWorker() const
|
||||
@@ -332,8 +332,7 @@ def test_driver_put_errors(ray_start_object_store_memory):
|
||||
# were evicted and whose originating tasks are still running, this
|
||||
# for-loop should hang on its first iteration and push an error to the
|
||||
# driver.
|
||||
ray.worker.global_worker.raylet_client.fetch_or_reconstruct([args[0]],
|
||||
False)
|
||||
ray.wait([args[0]], timeout=30)
|
||||
|
||||
def error_check(errors):
|
||||
return len(errors) > 1
|
||||
|
||||
+1
-1
@@ -60,7 +60,7 @@ def push_error_to_driver(worker, error_type, message, job_id=None):
|
||||
if job_id is None:
|
||||
job_id = ray.JobID.nil()
|
||||
assert isinstance(job_id, ray.JobID)
|
||||
worker.raylet_client.push_error(job_id, error_type, message, time.time())
|
||||
worker.core_worker.push_error(job_id, error_type, message, time.time())
|
||||
|
||||
|
||||
def push_error_to_driver_through_redis(redis_client,
|
||||
|
||||
@@ -1252,7 +1252,6 @@ def connect(node,
|
||||
node.node_ip_address,
|
||||
node.node_manager_port,
|
||||
)
|
||||
worker.raylet_client = ray._raylet.RayletClient(worker.core_worker)
|
||||
|
||||
if driver_object_store_memory is not None:
|
||||
worker.core_worker.set_object_store_client_options(
|
||||
|
||||
Reference in New Issue
Block a user