mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 15:56:55 +08:00
f31a79f3f7
* Implement Actor checkpointing * docs * fix * fix * fix * move restore-from-checkpoint to HandleActorStateTransition * Revert "move restore-from-checkpoint to HandleActorStateTransition" This reverts commit 9aa4447c1e3e321f42a1d895d72f17098b72de12. * resubmit waiting tasks when actor frontier restored * add doc about num_actor_checkpoints_to_keep=1 * add num_actor_checkpoints_to_keep to Cython * add checkpoint_expired api * check if actor class is abstract * change checkpoint_ids to long string * implement java * Refactor to delay actor creation publish until checkpoint is resumed * debug, lint * Erase from checkpoints to restore if task fails * fix lint * update comments * avoid duplicated actor notification log * fix unintended change * add actor_id to checkpoint_expired * small java updates * make checkpoint info per actor * lint * Remove logging * Remove old actor checkpointing Python code, move new checkpointing code to FunctionActionManager * Replace old actor checkpointing tests * Fix test and lint * address comments * consolidate kill_actor * Remove __ray_checkpoint__ * fix non-ascii char * Loosen test checks * fix java * fix sphinx-build
91 lines
3.2 KiB
Python
91 lines
3.2 KiB
Python
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import flatbuffers
|
|
import ray.core.generated.ErrorTableData
|
|
|
|
from ray.core.generated.ActorCheckpointIdData import ActorCheckpointIdData
|
|
from ray.core.generated.ClientTableData import ClientTableData
|
|
from ray.core.generated.DriverTableData import DriverTableData
|
|
from ray.core.generated.ErrorTableData import ErrorTableData
|
|
from ray.core.generated.GcsTableEntry import GcsTableEntry
|
|
from ray.core.generated.HeartbeatBatchTableData import HeartbeatBatchTableData
|
|
from ray.core.generated.HeartbeatTableData import HeartbeatTableData
|
|
from ray.core.generated.Language import Language
|
|
from ray.core.generated.ObjectTableData import ObjectTableData
|
|
from ray.core.generated.ProfileTableData import ProfileTableData
|
|
from ray.core.generated.TablePrefix import TablePrefix
|
|
from ray.core.generated.TablePubsub import TablePubsub
|
|
|
|
from ray.core.generated.ray.protocol.Task import Task
|
|
|
|
# Names exported by a star-import of this module.
__all__ = [
    # Names re-exported from the ray.core.generated modules imported above.
    "ActorCheckpointIdData",
    "ClientTableData",
    "DriverTableData",
    "ErrorTableData",
    "GcsTableEntry",
    "HeartbeatBatchTableData",
    "HeartbeatTableData",
    "Language",
    "ObjectTableData",
    "ProfileTableData",
    "TablePrefix",
    "TablePubsub",
    "Task",
    # Helper defined in this module.
    "construct_error_message",
]
|
|
|
|
FUNCTION_PREFIX = "RemoteFunction:"
LOG_FILE_CHANNEL = "RAY_LOG_CHANNEL"

# Channel names below are the ASCII-encoded string form of the matching
# TablePubsub value.

# xray heartbeat channels (single and batched).
XRAY_HEARTBEAT_CHANNEL = str(TablePubsub.HEARTBEAT).encode("ascii")
XRAY_HEARTBEAT_BATCH_CHANNEL = str(TablePubsub.HEARTBEAT_BATCH).encode("ascii")

# xray driver update channel.
XRAY_DRIVER_CHANNEL = str(TablePubsub.DRIVER).encode("ascii")

# String forms of TablePrefix entries. They are maintained by hand and must
# stay in sync with the TablePrefix enum in gcs.fbs.
# TODO(rkn): Switch to scoped enums so that the flatbuffer-generated values
# can be accessed directly instead of being duplicated here.
TablePrefix_RAYLET_TASK_string = "RAYLET_TASK"
TablePrefix_OBJECT_string = "OBJECT"
TablePrefix_ERROR_INFO_string = "ERROR_INFO"
TablePrefix_PROFILE_string = "PROFILE"
|
|
|
|
|
|
def construct_error_message(driver_id, error_type, message, timestamp):
    """Build an ErrorTableData flatbuffer and return it serialized.

    Args:
        driver_id: ID of the driver the error is addressed to. When the ID
            is nil, the error is delivered to all drivers.
        error_type: The type of the error.
        message: The error message text.
        timestamp: The time at which the error occurred.

    Returns:
        The serialized ErrorTableData object, as bytes.
    """
    fbb = flatbuffers.Builder(0)

    # Write every string field first; only their offsets are stored in the
    # table that is started below.
    job_id_field = fbb.CreateString(driver_id.binary())
    type_field = fbb.CreateString(error_type)
    message_field = fbb.CreateString(message)

    # Use the fully qualified module path for the builder helpers: the bare
    # name ErrorTableData in this file is bound to the object imported *from*
    # that module, not to the module itself.
    error_table = ray.core.generated.ErrorTableData

    error_table.ErrorTableDataStart(fbb)
    # The driver ID is stored in the table's job ID field.
    error_table.ErrorTableDataAddJobId(fbb, job_id_field)
    error_table.ErrorTableDataAddType(fbb, type_field)
    error_table.ErrorTableDataAddErrorMessage(fbb, message_field)
    error_table.ErrorTableDataAddTimestamp(fbb, timestamp)
    root = error_table.ErrorTableDataEnd(fbb)

    fbb.Finish(root)
    return bytes(fbb.Output())
|