[Core]Remove checkpoint table (#12235)

* Delete an actor entry from node manager.

* Remove checkpoint table

* remote checkpoint interface

* remove checkpoint interface

* fix ExitActorTest

Co-authored-by: chaokunyang <shawn.ck.yang@gmail.com>
This commit is contained in:
SangBin Cho
2020-12-01 08:58:36 -08:00
committed by GitHub
parent 9021f15b2a
commit f6f3cc9af1
52 changed files with 6 additions and 2012 deletions
-2
View File
@@ -63,7 +63,6 @@ if os.path.exists(so_path):
import ray._raylet # noqa: E402
from ray._raylet import (
ActorCheckpointID,
ActorClassID,
ActorID,
NodeID,
@@ -149,7 +148,6 @@ __all__ = [
# ID types
__all__ += [
"ActorCheckpointID",
"ActorClassID",
"ActorID",
"NodeID",
-26
View File
@@ -73,7 +73,6 @@ from ray.includes.common cimport (
)
from ray.includes.unique_ids cimport (
CActorID,
CActorCheckpointID,
CObjectID,
CNodeID,
CPlacementGroupID,
@@ -357,11 +356,6 @@ cdef execute_task(
actor_class = manager.load_actor_class(job_id, function_descriptor)
actor_id = core_worker.get_actor_id()
worker.actors[actor_id] = actor_class.__new__(actor_class)
worker.actor_checkpoint_info[actor_id] = (
ray.worker.ActorCheckpointInfo(
num_tasks_since_last_checkpoint=0,
last_checkpoint_timestamp=int(1000 * time.time()),
checkpoint_ids=[]))
execution_info = execution_infos.get(function_descriptor)
if not execution_info:
@@ -1470,26 +1464,6 @@ cdef class CoreWorker:
job_id.native(), error_type.encode("ascii"),
error_message.encode("ascii"), timestamp))
def prepare_actor_checkpoint(self, ActorID actor_id):
    """Ask the core worker to prepare a checkpoint for ``actor_id``.

    Args:
        actor_id: ID of the actor to prepare a checkpoint for.

    Returns:
        An ActorCheckpointID built from the binary form of the checkpoint
        ID filled in by the core worker.
    """
    cdef CActorID native_actor_id = actor_id.native()
    cdef CActorCheckpointID native_checkpoint_id

    # The call blocks until the raylet replies, so drop the GIL for its
    # duration and let other Python threads make progress.
    with nogil:
        check_status(
            CCoreWorkerProcess.GetCoreWorker().PrepareActorCheckpoint(
                native_actor_id, &native_checkpoint_id))

    return ActorCheckpointID(native_checkpoint_id.Binary())
def notify_actor_resumed_from_checkpoint(self, ActorID actor_id,
                                         ActorCheckpointID checkpoint_id):
    """Tell the core worker that an actor resumed from a checkpoint.

    Args:
        actor_id: ID of the actor that resumed.
        checkpoint_id: ID of the checkpoint it resumed from.
    """
    cdef CActorID native_actor_id = actor_id.native()
    cdef CActorCheckpointID native_checkpoint_id = checkpoint_id.native()

    # check_status receives the returned status (presumably raising on
    # failure; its definition is not shown here).
    check_status(
        CCoreWorkerProcess.GetCoreWorker().NotifyActorResumedFromCheckpoint(
            native_actor_id, native_checkpoint_id))
def set_resource(self, basestring resource_name,
double capacity, NodeID client_id):
CCoreWorkerProcess.GetCoreWorker().SetResource(
-2
View File
@@ -1,6 +1,5 @@
from ray.core.generated.common_pb2 import ErrorType
from ray.core.generated.gcs_pb2 import (
ActorCheckpointIdData,
ActorTableData,
GcsNodeInfo,
AvailableResources,
@@ -26,7 +25,6 @@ from ray.core.generated.gcs_pb2 import (
)
__all__ = [
"ActorCheckpointIdData",
"ActorTableData",
"GcsNodeInfo",
"AvailableResources",
-5
View File
@@ -13,7 +13,6 @@ from libcpp.vector cimport vector as c_vector
from ray.includes.unique_ids cimport (
CActorID,
CActorCheckpointID,
CNodeID,
CJobID,
CTaskID,
@@ -194,10 +193,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
CRayStatus PushError(const CJobID &job_id, const c_string &type,
const c_string &error_message, double timestamp)
# Extern declaration: takes the actor's ID and a pointer through which the
# checkpoint ID is handed back to the caller; returns a CRayStatus.
CRayStatus PrepareActorCheckpoint(const CActorID &actor_id,
CActorCheckpointID *checkpoint_id)
# Extern declaration: takes the actor's ID and the ID of the checkpoint it
# resumed from (both by const reference); returns a CRayStatus.
CRayStatus NotifyActorResumedFromCheckpoint(
const CActorID &actor_id, const CActorCheckpointID &checkpoint_id)
CRayStatus SetResource(const c_string &resource_name,
const double capacity,
const CNodeID &client_Id)
-5
View File
@@ -40,11 +40,6 @@ cdef extern from "ray/common/id.h" namespace "ray" nogil:
@staticmethod
size_t Size()
# Cython binding for the C++ class ray::ActorCheckpointID, declared as a
# subclass of CUniqueID. Exposes a static FromBinary factory that takes a
# binary string.
cdef cppclass CActorCheckpointID "ray::ActorCheckpointID"(CUniqueID):
@staticmethod
CActorCheckpointID FromBinary(const c_string &binary)
cdef cppclass CActorClassID "ray::ActorClassID"(CUniqueID):
@staticmethod
-12
View File
@@ -9,7 +9,6 @@ See https://github.com/ray-project/ray/issues/3721.
import os
from ray.includes.unique_ids cimport (
CActorCheckpointID,
CActorClassID,
CActorID,
CNodeID,
@@ -303,16 +302,6 @@ cdef class ActorID(BaseID):
return self.data.Hash()
cdef class ActorCheckpointID(UniqueID):
    """Python-visible wrapper around a native actor checkpoint ID."""

    def __init__(self, id):
        """Build the ID from its binary representation.

        Args:
            id: Binary form of the ID; passed through check_id before it
                is converted.
        """
        check_id(id)
        cdef c_string binary_id = <c_string>id
        self.data = CActorCheckpointID.FromBinary(binary_id)

    cdef CActorCheckpointID native(self):
        """Return the stored data cast to CActorCheckpointID."""
        return <CActorCheckpointID>self.data
cdef class FunctionID(UniqueID):
def __init__(self, id):
@@ -373,7 +362,6 @@ cdef class PlacementGroupID(BaseID):
return self.data.Hash()
_ID_TYPES = [
ActorCheckpointID,
ActorClassID,
ActorID,
NodeID,
-31
View File
@@ -832,37 +832,6 @@ class GlobalState:
return dict(total_available_resources)
def actor_checkpoint_info(self, actor_id):
    """Look up the checkpoint records stored for one actor.

    Args:
        actor_id: Actor's ID.

    Returns:
        None when the table lookup yields no message. Otherwise a
        dictionary holding the hex-encoded actor ID ("ActorID"), the
        actor's checkpoint IDs ("CheckpointIds") and the matching
        timestamps ("Timestamps").
    """
    self._check_connected()

    reply = self._execute_command(
        actor_id,
        "RAY.TABLE_LOOKUP",
        gcs_utils.TablePrefix.Value("ACTOR_CHECKPOINT_ID"),
        "",
        actor_id.binary(),
    )
    if reply is None:
        return None

    # Only the first entry of the returned GCS entry is decoded.
    raw_entry = gcs_utils.GcsEntry.FromString(reply).entries[0]
    checkpoint_data = gcs_utils.ActorCheckpointIdData.FromString(raw_entry)

    # Wrap each raw checkpoint id in the Python-level ID type.
    wrapped_ids = []
    for raw_id in checkpoint_data.checkpoint_ids:
        wrapped_ids.append(ray.ActorCheckpointID(raw_id))

    info = {}
    info["ActorID"] = ray.utils.binary_to_hex(checkpoint_data.actor_id)
    info["CheckpointIds"] = wrapped_ids
    info["Timestamps"] = list(checkpoint_data.timestamps)
    return info
state = GlobalState()
"""A global object used to access the cluster's global state."""
-13
View File
@@ -23,19 +23,6 @@ def test_was_current_actor_reconstructed(shutdown_only):
def get_pid(self):
return os.getpid()
# The following methods implement the checkpointable interface.
def should_checkpoint(self, checkpoint_context):
    """Stub for the checkpointable interface; always returns False."""
    return False
def save_checkpoint(self, actor_id, checkpoint_id):
    """No-op stub for the checkpointable interface."""
def load_checkpoint(self, actor_id, available_checkpoints):
    """No-op stub for the checkpointable interface."""
def checkpoint_expired(self, actor_id, checkpoint_id):
    """No-op stub for the checkpointable interface."""
a = A.remote()
# `was_reconstructed` should be False when it's called in actor.
assert ray.get(a.get_was_reconstructed.remote()) is False
-21
View File
@@ -64,25 +64,6 @@ ERROR_KEY_PREFIX = b"Error:"
logger = logging.getLogger(__name__)
class ActorCheckpointInfo:
    """Bookkeeping record used to maintain an actor's checkpoints.

    Attributes:
        num_tasks_since_last_checkpoint: Number of tasks executed since
            the last checkpoint.
        last_checkpoint_timestamp: Timestamp of the last checkpoint, in
            milliseconds.
        checkpoint_ids: IDs of the previous checkpoints.
    """

    # Fixed attribute set; instances carry no per-instance __dict__.
    __slots__ = [
        "num_tasks_since_last_checkpoint",
        "last_checkpoint_timestamp",
        "checkpoint_ids",
    ]

    def __init__(self, num_tasks_since_last_checkpoint,
                 last_checkpoint_timestamp, checkpoint_ids):
        """Store the three bookkeeping values unchanged."""
        self.checkpoint_ids = checkpoint_ids
        self.last_checkpoint_timestamp = last_checkpoint_timestamp
        self.num_tasks_since_last_checkpoint = num_tasks_since_last_checkpoint
class Worker:
"""A class used to define the control flow of a worker process.
@@ -106,8 +87,6 @@ class Worker:
self.cached_functions_to_run = []
self.actor_init_error = None
self.actors = {}
# Information used to maintain actor checkpoints.
self.actor_checkpoint_info = {}
# When the worker is constructed. Record the original value of the
# CUDA_VISIBLE_DEVICES environment variable.
self.original_gpu_ids = ray.utils.get_cuda_visible_devices()