mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 06:33:06 +08:00
[Core]Remove checkpoint table (#12235)
* Delete an actor entry from node manager. * Remove checkpoint table * remote checkpoint interface * remove checkpoint interface * fix ExitActorTest Co-authored-by: chaokunyang <shawn.ck.yang@gmail.com>
This commit is contained in:
@@ -63,7 +63,6 @@ if os.path.exists(so_path):
|
||||
import ray._raylet # noqa: E402
|
||||
|
||||
from ray._raylet import (
|
||||
ActorCheckpointID,
|
||||
ActorClassID,
|
||||
ActorID,
|
||||
NodeID,
|
||||
@@ -149,7 +148,6 @@ __all__ = [
|
||||
|
||||
# ID types
|
||||
__all__ += [
|
||||
"ActorCheckpointID",
|
||||
"ActorClassID",
|
||||
"ActorID",
|
||||
"NodeID",
|
||||
|
||||
@@ -73,7 +73,6 @@ from ray.includes.common cimport (
|
||||
)
|
||||
from ray.includes.unique_ids cimport (
|
||||
CActorID,
|
||||
CActorCheckpointID,
|
||||
CObjectID,
|
||||
CNodeID,
|
||||
CPlacementGroupID,
|
||||
@@ -357,11 +356,6 @@ cdef execute_task(
|
||||
actor_class = manager.load_actor_class(job_id, function_descriptor)
|
||||
actor_id = core_worker.get_actor_id()
|
||||
worker.actors[actor_id] = actor_class.__new__(actor_class)
|
||||
worker.actor_checkpoint_info[actor_id] = (
|
||||
ray.worker.ActorCheckpointInfo(
|
||||
num_tasks_since_last_checkpoint=0,
|
||||
last_checkpoint_timestamp=int(1000 * time.time()),
|
||||
checkpoint_ids=[]))
|
||||
|
||||
execution_info = execution_infos.get(function_descriptor)
|
||||
if not execution_info:
|
||||
@@ -1470,26 +1464,6 @@ cdef class CoreWorker:
|
||||
job_id.native(), error_type.encode("ascii"),
|
||||
error_message.encode("ascii"), timestamp))
|
||||
|
||||
def prepare_actor_checkpoint(self, ActorID actor_id):
    """Ask the core worker to prepare a checkpoint for an actor.

    Args:
        actor_id: ID of the actor to prepare a checkpoint for.

    Returns:
        An ActorCheckpointID built from the binary form of the checkpoint
        ID filled in by the core worker.
    """
    cdef:
        CActorCheckpointID checkpoint_id
        # Convert to the native ID up front: the Python object cannot be
        # touched once the GIL has been released below.
        CActorID c_actor_id = actor_id.native()

    # PrepareActorCheckpoint will wait for raylet's reply, release
    # the GIL so other Python threads can run.
    with nogil:
        check_status(
            CCoreWorkerProcess.GetCoreWorker()
            .PrepareActorCheckpoint(c_actor_id, &checkpoint_id))
    return ActorCheckpointID(checkpoint_id.Binary())
|
||||
|
||||
def notify_actor_resumed_from_checkpoint(self, ActorID actor_id,
                                         ActorCheckpointID checkpoint_id):
    """Report to the core worker that an actor resumed from a checkpoint.

    Args:
        actor_id: ID of the actor that was resumed.
        checkpoint_id: ID of the checkpoint the actor was resumed from.
    """
    # check_status receives the status returned by the core worker call.
    check_status(
        CCoreWorkerProcess.GetCoreWorker()
        .NotifyActorResumedFromCheckpoint(
            actor_id.native(), checkpoint_id.native()))
|
||||
|
||||
def set_resource(self, basestring resource_name,
|
||||
double capacity, NodeID client_id):
|
||||
CCoreWorkerProcess.GetCoreWorker().SetResource(
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from ray.core.generated.common_pb2 import ErrorType
|
||||
from ray.core.generated.gcs_pb2 import (
|
||||
ActorCheckpointIdData,
|
||||
ActorTableData,
|
||||
GcsNodeInfo,
|
||||
AvailableResources,
|
||||
@@ -26,7 +25,6 @@ from ray.core.generated.gcs_pb2 import (
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"ActorCheckpointIdData",
|
||||
"ActorTableData",
|
||||
"GcsNodeInfo",
|
||||
"AvailableResources",
|
||||
|
||||
@@ -13,7 +13,6 @@ from libcpp.vector cimport vector as c_vector
|
||||
|
||||
from ray.includes.unique_ids cimport (
|
||||
CActorID,
|
||||
CActorCheckpointID,
|
||||
CNodeID,
|
||||
CJobID,
|
||||
CTaskID,
|
||||
@@ -194,10 +193,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
|
||||
CRayStatus PushError(const CJobID &job_id, const c_string &type,
|
||||
const c_string &error_message, double timestamp)
|
||||
CRayStatus PrepareActorCheckpoint(const CActorID &actor_id,
|
||||
CActorCheckpointID *checkpoint_id)
|
||||
CRayStatus NotifyActorResumedFromCheckpoint(
|
||||
const CActorID &actor_id, const CActorCheckpointID &checkpoint_id)
|
||||
CRayStatus SetResource(const c_string &resource_name,
|
||||
const double capacity,
|
||||
const CNodeID &client_Id)
|
||||
|
||||
@@ -40,11 +40,6 @@ cdef extern from "ray/common/id.h" namespace "ray" nogil:
|
||||
@staticmethod
|
||||
size_t Size()
|
||||
|
||||
cdef cppclass CActorCheckpointID "ray::ActorCheckpointID"(CUniqueID):
|
||||
|
||||
@staticmethod
|
||||
CActorCheckpointID FromBinary(const c_string &binary)
|
||||
|
||||
cdef cppclass CActorClassID "ray::ActorClassID"(CUniqueID):
|
||||
|
||||
@staticmethod
|
||||
|
||||
@@ -9,7 +9,6 @@ See https://github.com/ray-project/ray/issues/3721.
|
||||
import os
|
||||
|
||||
from ray.includes.unique_ids cimport (
|
||||
CActorCheckpointID,
|
||||
CActorClassID,
|
||||
CActorID,
|
||||
CNodeID,
|
||||
@@ -303,16 +302,6 @@ cdef class ActorID(BaseID):
|
||||
return self.data.Hash()
|
||||
|
||||
|
||||
cdef class ActorCheckpointID(UniqueID):
    """Python wrapper around the native CActorCheckpointID."""

    def __init__(self, id):
        # `id` is passed through check_id before being converted into
        # the native ID from its binary form.
        check_id(id)
        self.data = CActorCheckpointID.FromBinary(<c_string>id)

    cdef CActorCheckpointID native(self):
        # Return the wrapped data cast to the native checkpoint ID type.
        return <CActorCheckpointID>self.data
|
||||
|
||||
|
||||
cdef class FunctionID(UniqueID):
|
||||
|
||||
def __init__(self, id):
|
||||
@@ -373,7 +362,6 @@ cdef class PlacementGroupID(BaseID):
|
||||
return self.data.Hash()
|
||||
|
||||
_ID_TYPES = [
|
||||
ActorCheckpointID,
|
||||
ActorClassID,
|
||||
ActorID,
|
||||
NodeID,
|
||||
|
||||
@@ -832,37 +832,6 @@ class GlobalState:
|
||||
|
||||
return dict(total_available_resources)
|
||||
|
||||
def actor_checkpoint_info(self, actor_id):
    """Get checkpoint info for the given actor id.

    Args:
        actor_id: Actor's ID.

    Returns:
        None if the table lookup returns no message. Otherwise a
        dictionary with the keys "ActorID" (hex string), "CheckpointIds"
        (list of ray.ActorCheckpointID) and "Timestamps" (list), holding
        information about the actor's checkpoint IDs and their
        timestamps.
    """
    self._check_connected()
    message = self._execute_command(
        actor_id,
        "RAY.TABLE_LOOKUP",
        gcs_utils.TablePrefix.Value("ACTOR_CHECKPOINT_ID"),
        "",
        actor_id.binary(),
    )
    if message is None:
        return None
    gcs_entry = gcs_utils.GcsEntry.FromString(message)
    # Only the first entry of the GCS reply is decoded.
    entry = gcs_utils.ActorCheckpointIdData.FromString(
        gcs_entry.entries[0])
    checkpoint_ids = [
        ray.ActorCheckpointID(checkpoint_id)
        for checkpoint_id in entry.checkpoint_ids
    ]
    return {
        "ActorID": ray.utils.binary_to_hex(entry.actor_id),
        "CheckpointIds": checkpoint_ids,
        "Timestamps": list(entry.timestamps),
    }
|
||||
|
||||
|
||||
state = GlobalState()
|
||||
"""A global object used to access the cluster's global state."""
|
||||
|
||||
@@ -23,19 +23,6 @@ def test_was_current_actor_reconstructed(shutdown_only):
|
||||
def get_pid(self):
|
||||
return os.getpid()
|
||||
|
||||
# The following methods are here to apply the checkpointable interface.
|
||||
def should_checkpoint(self, checkpoint_context):
    """Checkpointable-interface stub: always returns False."""
    return False
|
||||
|
||||
def save_checkpoint(self, actor_id, checkpoint_id):
    """Checkpointable-interface stub: does nothing."""
    pass
|
||||
|
||||
def load_checkpoint(self, actor_id, available_checkpoints):
    """Checkpointable-interface stub: does nothing and returns None."""
    pass
|
||||
|
||||
def checkpoint_expired(self, actor_id, checkpoint_id):
    """Checkpointable-interface stub: does nothing."""
    pass
|
||||
|
||||
a = A.remote()
|
||||
# `was_reconstructed` should be False when it's called in actor.
|
||||
assert ray.get(a.get_was_reconstructed.remote()) is False
|
||||
|
||||
@@ -64,25 +64,6 @@ ERROR_KEY_PREFIX = b"Error:"
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ActorCheckpointInfo:
    """Information used to maintain actor checkpoints.

    Instances are restricted to the three attributes named in
    ``__slots__``; all of them are set by the constructor.
    """

    __slots__ = [
        # Number of tasks executed since last checkpoint.
        "num_tasks_since_last_checkpoint",
        # Timestamp of the last checkpoint, in milliseconds.
        "last_checkpoint_timestamp",
        # IDs of the previous checkpoints.
        "checkpoint_ids",
    ]

    def __init__(self, num_tasks_since_last_checkpoint,
                 last_checkpoint_timestamp, checkpoint_ids):
        """Store the given checkpoint bookkeeping values as-is.

        Args:
            num_tasks_since_last_checkpoint: Number of tasks executed
                since the last checkpoint.
            last_checkpoint_timestamp: Timestamp of the last checkpoint,
                in milliseconds.
            checkpoint_ids: IDs of the previous checkpoints.
        """
        self.num_tasks_since_last_checkpoint = num_tasks_since_last_checkpoint
        self.last_checkpoint_timestamp = last_checkpoint_timestamp
        self.checkpoint_ids = checkpoint_ids
|
||||
|
||||
|
||||
class Worker:
|
||||
"""A class used to define the control flow of a worker process.
|
||||
|
||||
@@ -106,8 +87,6 @@ class Worker:
|
||||
self.cached_functions_to_run = []
|
||||
self.actor_init_error = None
|
||||
self.actors = {}
|
||||
# Information used to maintain actor checkpoints.
|
||||
self.actor_checkpoint_info = {}
|
||||
# When the worker is constructed. Record the original value of the
|
||||
# CUDA_VISIBLE_DEVICES environment variable.
|
||||
self.original_gpu_ids = ray.utils.get_cuda_visible_devices()
|
||||
|
||||
Reference in New Issue
Block a user