Files
ray/python/ray/gcs_utils.py
T
Hao Chen f31a79f3f7 Implement actor checkpointing (#3839)
* Implement Actor checkpointing

* docs

* fix

* fix

* fix

* move restore-from-checkpoint to HandleActorStateTransition

* Revert "move restore-from-checkpoint to HandleActorStateTransition"

This reverts commit 9aa4447c1e3e321f42a1d895d72f17098b72de12.

* resubmit waiting tasks when actor frontier restored

* add doc about num_actor_checkpoints_to_keep=1

* add num_actor_checkpoints_to_keep to Cython

* add checkpoint_expired api

* check if actor class is abstract

* change checkpoint_ids to long string

* implement java

* Refactor to delay actor creation publish until checkpoint is resumed

* debug, lint

* Erase from checkpoints to restore if task fails

* fix lint

* update comments

* avoid duplicated actor notification log

* fix unintended change

* add actor_id to checkpoint_expired

* small java updates

* make checkpoint info per actor

* lint

* Remove logging

* Remove old actor checkpointing Python code, move new checkpointing code to FunctionActionManager

* Replace old actor checkpointing tests

* Fix test and lint

* address comments

* consolidate kill_actor

* Remove __ray_checkpoint__

* fix non-ascii char

* Loosen test checks

* fix java

* fix sphinx-build
2019-02-13 19:39:02 +08:00

91 lines
3.2 KiB
Python

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import flatbuffers
import ray.core.generated.ErrorTableData
from ray.core.generated.ActorCheckpointIdData import ActorCheckpointIdData
from ray.core.generated.ClientTableData import ClientTableData
from ray.core.generated.DriverTableData import DriverTableData
from ray.core.generated.ErrorTableData import ErrorTableData
from ray.core.generated.GcsTableEntry import GcsTableEntry
from ray.core.generated.HeartbeatBatchTableData import HeartbeatBatchTableData
from ray.core.generated.HeartbeatTableData import HeartbeatTableData
from ray.core.generated.Language import Language
from ray.core.generated.ObjectTableData import ObjectTableData
from ray.core.generated.ProfileTableData import ProfileTableData
from ray.core.generated.TablePrefix import TablePrefix
from ray.core.generated.TablePubsub import TablePubsub
from ray.core.generated.ray.protocol.Task import Task
__all__ = [
"ActorCheckpointIdData",
"ClientTableData",
"DriverTableData",
"ErrorTableData",
"GcsTableEntry",
"HeartbeatBatchTableData",
"HeartbeatTableData",
"Language",
"ObjectTableData",
"ProfileTableData",
"TablePrefix",
"TablePubsub",
"Task",
"construct_error_message",
]
FUNCTION_PREFIX = "RemoteFunction:"
LOG_FILE_CHANNEL = "RAY_LOG_CHANNEL"
# xray heartbeats
XRAY_HEARTBEAT_CHANNEL = str(TablePubsub.HEARTBEAT).encode("ascii")
XRAY_HEARTBEAT_BATCH_CHANNEL = str(TablePubsub.HEARTBEAT_BATCH).encode("ascii")
# xray driver updates
XRAY_DRIVER_CHANNEL = str(TablePubsub.DRIVER).encode("ascii")
# These prefixes must be kept up-to-date with the TablePrefix enum in gcs.fbs.
# TODO(rkn): We should use scoped enums, in which case we should be able to
# just access the flatbuffer generated values.
TablePrefix_RAYLET_TASK_string = "RAYLET_TASK"
TablePrefix_OBJECT_string = "OBJECT"
TablePrefix_ERROR_INFO_string = "ERROR_INFO"
TablePrefix_PROFILE_string = "PROFILE"
def construct_error_message(driver_id, error_type, message, timestamp):
"""Construct a serialized ErrorTableData object.
Args:
driver_id: The ID of the driver that the error should go to. If this is
nil, then the error will go to all drivers.
error_type: The type of the error.
message: The error message.
timestamp: The time of the error.
Returns:
The serialized object.
"""
builder = flatbuffers.Builder(0)
driver_offset = builder.CreateString(driver_id.binary())
error_type_offset = builder.CreateString(error_type)
message_offset = builder.CreateString(message)
ray.core.generated.ErrorTableData.ErrorTableDataStart(builder)
ray.core.generated.ErrorTableData.ErrorTableDataAddJobId(
builder, driver_offset)
ray.core.generated.ErrorTableData.ErrorTableDataAddType(
builder, error_type_offset)
ray.core.generated.ErrorTableData.ErrorTableDataAddErrorMessage(
builder, message_offset)
ray.core.generated.ErrorTableData.ErrorTableDataAddTimestamp(
builder, timestamp)
error_data_offset = ray.core.generated.ErrorTableData.ErrorTableDataEnd(
builder)
builder.Finish(error_data_offset)
return bytes(builder.Output())