mirror of
https://github.com/wassname/ray.git
synced 2026-07-01 22:23:13 +08:00
Implement actor checkpointing (#3839)
* Implement Actor checkpointing * docs * fix * fix * fix * move restore-from-checkpoint to HandleActorStateTransition * Revert "move restore-from-checkpoint to HandleActorStateTransition" This reverts commit 9aa4447c1e3e321f42a1d895d72f17098b72de12. * resubmit waiting tasks when actor frontier restored * add doc about num_actor_checkpoints_to_keep=1 * add num_actor_checkpoints_to_keep to Cython * add checkpoint_expired api * check if actor class is abstract * change checkpoint_ids to long string * implement java * Refactor to delay actor creation publish until checkpoint is resumed * debug, lint * Erase from checkpoints to restore if task fails * fix lint * update comments * avoid duplicated actor notification log * fix unintended change * add actor_id to checkpoint_expired * small java updates * make checkpoint info per actor * lint * Remove logging * Remove old actor checkpointing Python code, move new checkpointing code to FunctionActionManager * Replace old actor checkpointing tests * Fix test and lint * address comments * consolidate kill_actor * Remove __ray_checkpoint__ * fix non-ascii char * Loosen test checks * fix java * fix sphinx-build
This commit is contained in:
@@ -6,10 +6,19 @@ from libcpp.unordered_map cimport unordered_map
|
||||
from libcpp.vector cimport vector as c_vector
|
||||
|
||||
from ray.includes.unique_ids cimport (
|
||||
CUniqueID, TaskID as CTaskID, ObjectID as CObjectID,
|
||||
FunctionID as CFunctionID, ActorClassID as CActorClassID, ActorID as CActorID,
|
||||
ActorHandleID as CActorHandleID, WorkerID as CWorkerID,
|
||||
DriverID as CDriverID, ConfigID as CConfigID, ClientID as CClientID)
|
||||
ActorCheckpointID as CActorCheckpointID,
|
||||
ActorClassID as CActorClassID,
|
||||
ActorHandleID as CActorHandleID,
|
||||
ActorID as CActorID,
|
||||
CUniqueID,
|
||||
ClientID as CClientID,
|
||||
ConfigID as CConfigID,
|
||||
DriverID as CDriverID,
|
||||
FunctionID as CFunctionID,
|
||||
ObjectID as CObjectID,
|
||||
TaskID as CTaskID,
|
||||
WorkerID as CWorkerID,
|
||||
)
|
||||
|
||||
|
||||
cdef extern from "ray/status.h" namespace "ray" nogil:
|
||||
|
||||
@@ -8,9 +8,21 @@ from libcpp.vector cimport vector as c_vector
|
||||
|
||||
|
||||
from ray.includes.common cimport (
|
||||
CUniqueID, CTaskID, CObjectID, CFunctionID, CActorClassID, CActorID,
|
||||
CActorHandleID, CWorkerID, CDriverID, CConfigID, CClientID,
|
||||
CLanguage, CRayStatus)
|
||||
CActorCheckpointID,
|
||||
CActorClassID,
|
||||
CActorHandleID,
|
||||
CActorID,
|
||||
CClientID,
|
||||
CConfigID,
|
||||
CDriverID,
|
||||
CFunctionID,
|
||||
CLanguage,
|
||||
CObjectID,
|
||||
CRayStatus,
|
||||
CTaskID,
|
||||
CUniqueID,
|
||||
CWorkerID,
|
||||
)
|
||||
from ray.includes.task cimport CTaskSpecification
|
||||
|
||||
|
||||
@@ -57,6 +69,10 @@ cdef extern from "ray/raylet/raylet_client.h" nogil:
|
||||
CRayStatus PushProfileEvents(const GCSProfileTableDataT &profile_events)
|
||||
CRayStatus FreeObjects(const c_vector[CObjectID] &object_ids,
|
||||
c_bool local_only)
|
||||
CRayStatus PrepareActorCheckpoint(const CActorID &actor_id,
|
||||
CActorCheckpointID &checkpoint_id)
|
||||
CRayStatus NotifyActorResumedFromCheckpoint(
|
||||
const CActorID &actor_id, const CActorCheckpointID &checkpoint_id)
|
||||
CLanguage GetLanguage() const
|
||||
CClientID GetClientID() const
|
||||
CDriverID GetDriverID() const
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from libc.stdint cimport int64_t, uint64_t
|
||||
from libc.stdint cimport int64_t, uint64_t, uint32_t
|
||||
from libcpp.string cimport string as c_string
|
||||
from libcpp.unordered_map cimport unordered_map
|
||||
|
||||
@@ -80,4 +80,6 @@ cdef extern from "ray/ray_config.h" nogil:
|
||||
|
||||
int64_t max_task_lease_timeout_ms() const
|
||||
|
||||
uint32_t num_actor_checkpoints_to_keep() const
|
||||
|
||||
void initialize(const unordered_map[c_string, int] &config_map)
|
||||
|
||||
@@ -144,3 +144,7 @@ cdef class Config:
|
||||
# Python-visible accessor for the max_task_lease_timeout_ms setting: forwards
# to the method of the same name on RayConfig.instance() and returns its
# result (declared as int64_t in the "ray/ray_config.h" extern block).
@staticmethod
|
||||
def max_task_lease_timeout_ms():
|
||||
return RayConfig.instance().max_task_lease_timeout_ms()
|
||||
|
||||
# Python-visible accessor for the num_actor_checkpoints_to_keep setting:
# forwards to the method of the same name on RayConfig.instance() and returns
# its result (declared as uint32_t in the "ray/ray_config.h" extern block).
@staticmethod
|
||||
def num_actor_checkpoints_to_keep():
|
||||
return RayConfig.instance().num_actor_checkpoints_to_keep()
|
||||
|
||||
@@ -28,6 +28,7 @@ cdef extern from "ray/id.h" namespace "ray" nogil:
|
||||
ctypedef CUniqueID ActorID
|
||||
ctypedef CUniqueID ActorClassID
|
||||
ctypedef CUniqueID ActorHandleID
|
||||
ctypedef CUniqueID ActorCheckpointID
|
||||
ctypedef CUniqueID WorkerID
|
||||
ctypedef CUniqueID DriverID
|
||||
ctypedef CUniqueID ConfigID
|
||||
|
||||
@@ -7,9 +7,21 @@ See https://github.com/ray-project/ray/issues/3721.
|
||||
# WARNING: Any additional ID types defined in this file must be added to the
|
||||
# _ID_TYPES list at the bottom of this file.
|
||||
from ray.includes.common cimport (
|
||||
CUniqueID, CTaskID, CObjectID, CFunctionID, CActorClassID, CActorID,
|
||||
CActorHandleID, CWorkerID, CDriverID, CConfigID, CClientID,
|
||||
ComputePutId, ComputeTaskId)
|
||||
CActorCheckpointID,
|
||||
CActorClassID,
|
||||
CActorHandleID,
|
||||
CActorID,
|
||||
CClientID,
|
||||
CConfigID,
|
||||
CDriverID,
|
||||
CFunctionID,
|
||||
CObjectID,
|
||||
CTaskID,
|
||||
CUniqueID,
|
||||
CWorkerID,
|
||||
ComputePutId,
|
||||
ComputeTaskId,
|
||||
)
|
||||
|
||||
from ray.utils import decode
|
||||
|
||||
@@ -236,6 +248,29 @@ cdef class ActorHandleID(UniqueID):
|
||||
return "ActorHandleID(" + self.hex() + ")"
|
||||
|
||||
|
||||
cdef class ActorCheckpointID(UniqueID):
    """Python-level wrapper for the C++ ``ActorCheckpointID`` type.

    ``CActorCheckpointID`` is a ctypedef of ``CUniqueID``, so the wrapped
    value is stored in the inherited ``data`` field like the other ID
    classes in this file.
    """

    def __init__(self, id):
        """Build an ID from its binary representation.

        Args:
            id: Binary form of the ID. A falsy value (``None``, ``b""``)
                produces a default-constructed ``CUniqueID`` instead.
        """
        if not id:
            self.data = CUniqueID()
        else:
            # check_id is defined elsewhere in this file; presumably it
            # rejects malformed input before conversion -- TODO confirm.
            check_id(id)
            self.data = CUniqueID.from_binary(id)

    @staticmethod
    cdef from_native(const CActorCheckpointID& cpp_id):
        """Wrap an existing C++ checkpoint ID without going through __init__."""
        # __new__ must be given this class itself. Passing a sibling class
        # (previously ActorHandleID) creates an object of the wrong type,
        # which cannot be bound to a variable typed ``ActorCheckpointID``.
        cdef ActorCheckpointID self = ActorCheckpointID.__new__(
            ActorCheckpointID)
        self.data = cpp_id
        return self

    @staticmethod
    def nil():
        """Return the wrapper for ``CActorCheckpointID.nil()``."""
        return ActorCheckpointID.from_native(CActorCheckpointID.nil())

    def __repr__(self):
        return "ActorCheckpointID(" + self.hex() + ")"
|
||||
|
||||
|
||||
cdef class FunctionID(UniqueID):
|
||||
|
||||
def __init__(self, id):
|
||||
|
||||
Reference in New Issue
Block a user