Implement actor checkpointing (#3839)

* Implement Actor checkpointing

* docs

* fix

* fix

* fix

* move restore-from-checkpoint to HandleActorStateTransition

* Revert "move restore-from-checkpoint to HandleActorStateTransition"

This reverts commit 9aa4447c1e3e321f42a1d895d72f17098b72de12.

* resubmit waiting tasks when actor frontier restored

* add doc about num_actor_checkpoints_to_keep=1

* add num_actor_checkpoints_to_keep to Cython

* add checkpoint_expired api

* check if actor class is abstract

* change checkpoint_ids to long string

* implement java

* Refactor to delay actor creation publish until checkpoint is resumed

* debug, lint

* Erase from checkpoints to restore if task fails

* fix lint

* update comments

* avoid duplicated actor notification log

* fix unintended change

* add actor_id to checkpoint_expired

* small java updates

* make checkpoint info per actor

* lint

* Remove logging

* Remove old actor checkpointing Python code, move new checkpointing code to FunctionActorManager

* Replace old actor checkpointing tests

* Fix test and lint

* address comments

* consolidate kill_actor

* Remove __ray_checkpoint__

* fix non-ascii char

* Loosen test checks

* fix java

* fix sphinx-build
This commit is contained in:
Hao Chen
2019-02-13 19:39:02 +08:00
committed by GitHub
parent 57dcd3033e
commit f31a79f3f7
41 changed files with 1708 additions and 490 deletions
+13 -4
View File
@@ -6,10 +6,19 @@ from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector as c_vector
from ray.includes.unique_ids cimport (
CUniqueID, TaskID as CTaskID, ObjectID as CObjectID,
FunctionID as CFunctionID, ActorClassID as CActorClassID, ActorID as CActorID,
ActorHandleID as CActorHandleID, WorkerID as CWorkerID,
DriverID as CDriverID, ConfigID as CConfigID, ClientID as CClientID)
ActorCheckpointID as CActorCheckpointID,
ActorClassID as CActorClassID,
ActorHandleID as CActorHandleID,
ActorID as CActorID,
CUniqueID,
ClientID as CClientID,
ConfigID as CConfigID,
DriverID as CDriverID,
FunctionID as CFunctionID,
ObjectID as CObjectID,
TaskID as CTaskID,
WorkerID as CWorkerID,
)
cdef extern from "ray/status.h" namespace "ray" nogil:
+19 -3
View File
@@ -8,9 +8,21 @@ from libcpp.vector cimport vector as c_vector
from ray.includes.common cimport (
CUniqueID, CTaskID, CObjectID, CFunctionID, CActorClassID, CActorID,
CActorHandleID, CWorkerID, CDriverID, CConfigID, CClientID,
CLanguage, CRayStatus)
CActorCheckpointID,
CActorClassID,
CActorHandleID,
CActorID,
CClientID,
CConfigID,
CDriverID,
CFunctionID,
CLanguage,
CObjectID,
CRayStatus,
CTaskID,
CUniqueID,
CWorkerID,
)
from ray.includes.task cimport CTaskSpecification
@@ -57,6 +69,10 @@ cdef extern from "ray/raylet/raylet_client.h" nogil:
# Method declarations exposed to Cython from "ray/raylet/raylet_client.h".
# Every CRayStatus-returning method reports its outcome through that status
# object rather than through a Python exception at this layer.
CRayStatus PushProfileEvents(const GCSProfileTableDataT &profile_events)
CRayStatus FreeObjects(const c_vector[CObjectID] &object_ids,
c_bool local_only)
# Actor checkpointing: checkpoint_id is taken by non-const reference here,
# presumably as an output parameter filled in by the call -- TODO confirm
# against the C++ header.
CRayStatus PrepareActorCheckpoint(const CActorID &actor_id,
CActorCheckpointID &checkpoint_id)
# Both ids are passed by const reference, i.e. they are inputs only.
CRayStatus NotifyActorResumedFromCheckpoint(
const CActorID &actor_id, const CActorCheckpointID &checkpoint_id)
# Const accessors taking no arguments.
CLanguage GetLanguage() const
CClientID GetClientID() const
CDriverID GetDriverID() const
+3 -1
View File
@@ -1,4 +1,4 @@
from libc.stdint cimport int64_t, uint64_t
from libc.stdint cimport int64_t, uint64_t, uint32_t
from libcpp.string cimport string as c_string
from libcpp.unordered_map cimport unordered_map
@@ -80,4 +80,6 @@ cdef extern from "ray/ray_config.h" nogil:
# Const accessor returning the max_task_lease_timeout_ms setting as int64_t.
int64_t max_task_lease_timeout_ms() const
# Const accessor returning the num_actor_checkpoints_to_keep setting as
# uint32_t (hence the uint32_t cimport from libc.stdint).
uint32_t num_actor_checkpoints_to_keep() const
# Takes a string -> int map of configuration values by const reference.
void initialize(const unordered_map[c_string, int] &config_map)
+4
View File
@@ -144,3 +144,7 @@ cdef class Config:
# Python-visible accessor: forwards to RayConfig.instance() and returns the
# result of its max_task_lease_timeout_ms() call.
@staticmethod
def max_task_lease_timeout_ms():
return RayConfig.instance().max_task_lease_timeout_ms()
# Python-visible accessor: forwards to RayConfig.instance() and returns the
# result of its num_actor_checkpoints_to_keep() call.
@staticmethod
def num_actor_checkpoints_to_keep():
return RayConfig.instance().num_actor_checkpoints_to_keep()
+1
View File
@@ -28,6 +28,7 @@ cdef extern from "ray/id.h" namespace "ray" nogil:
# Each ID type below is declared as an alias of CUniqueID, so at the Cython
# level they are all the same underlying type and differ only by name.
ctypedef CUniqueID ActorID
ctypedef CUniqueID ActorClassID
ctypedef CUniqueID ActorHandleID
ctypedef CUniqueID ActorCheckpointID
ctypedef CUniqueID WorkerID
ctypedef CUniqueID DriverID
ctypedef CUniqueID ConfigID
+38 -3
View File
@@ -7,9 +7,21 @@ See https://github.com/ray-project/ray/issues/3721.
# WARNING: Any additional ID types defined in this file must be added to the
# _ID_TYPES list at the bottom of this file.
from ray.includes.common cimport (
CUniqueID, CTaskID, CObjectID, CFunctionID, CActorClassID, CActorID,
CActorHandleID, CWorkerID, CDriverID, CConfigID, CClientID,
ComputePutId, ComputeTaskId)
CActorCheckpointID,
CActorClassID,
CActorHandleID,
CActorID,
CClientID,
CConfigID,
CDriverID,
CFunctionID,
CObjectID,
CTaskID,
CUniqueID,
CWorkerID,
ComputePutId,
ComputeTaskId,
)
from ray.utils import decode
@@ -236,6 +248,29 @@ cdef class ActorHandleID(UniqueID):
return "ActorHandleID(" + self.hex() + ")"
cdef class ActorCheckpointID(UniqueID):
    """Python-level wrapper holding a native actor checkpoint ID in ``data``."""

    def __init__(self, id):
        """Build the ID from its binary representation.

        Args:
            id: Binary form of the ID. A falsy value (e.g. ``None`` or an
                empty bytes object) yields a default-constructed
                ``CUniqueID`` instead of going through ``check_id``.
        """
        if not id:
            self.data = CUniqueID()
        else:
            check_id(id)
            self.data = CUniqueID.from_binary(id)

    @staticmethod
    cdef from_native(const CActorCheckpointID& cpp_id):
        # Wrap an existing native ID without running __init__.
        # The class handed to __new__ must be ActorCheckpointID itself.
        # Passing a sibling class such as ActorHandleID would not produce an
        # ActorCheckpointID instance, so the typed assignment below (and
        # therefore nil()) could not work as intended.
        cdef ActorCheckpointID self = ActorCheckpointID.__new__(
            ActorCheckpointID)
        self.data = cpp_id
        return self

    @staticmethod
    def nil():
        """Return the wrapper for the native nil checkpoint ID."""
        return ActorCheckpointID.from_native(CActorCheckpointID.nil())

    def __repr__(self):
        """Return ``ActorCheckpointID(<hex>)`` using the ID's hex form."""
        return "ActorCheckpointID(" + self.hex() + ")"
cdef class FunctionID(UniqueID):
def __init__(self, id):