Actor checkpointing with object lineage reconstruction (#1004)

* Worker reports error in previous task, actor task counter is incremented after task is successful

* Refactor actor task execution

- Return new task counter in GetTaskRequest
- Update worker state for actor tasks inside of the actor method
  executor

* Manually invoked checkpoint method

* Scheduling for actor checkpoint methods

* Fix python bugs in checkpointing

* Return task success from worker to local scheduler instead of actor counter

* Kill local schedulers halfway through actor execution instead of waiting for all tasks to execute once

* Remove redundant actor tasks during dispatch, reconstruct missing dependencies for actor tasks

* Make executor for temporary actor methods

* doc

* Set default argument for whether the previous task was a success

* Refactor actor method call

* Simplify checkpoint task submission

* lint

* fix philipp's comments

* Add missing line

* Make actor reconstruction tests run faster

* Unimportant whitespace.

* Unimportant whitespace.

* Update checkpoint method signature

* Documentation and handle exceptions during checkpoint save/resume

* Rename get_task message field to actor_checkpoint_failed

* Fix bug.

* Remove debugging check, redirect test output
This commit is contained in:
Stephanie Wang
2017-10-12 09:53:32 -07:00
committed by Robert Nishihara
parent b585001881
commit 3764f2f2e1
14 changed files with 608 additions and 210 deletions
+304 -48
View File
@@ -7,8 +7,10 @@ import copy
import hashlib
import inspect
import json
import numpy as np
import traceback
import pyarrow.plasma as plasma
import ray.local_scheduler
import ray.signature as signature
import ray.worker
@@ -40,12 +42,31 @@ def get_actor_method_function_id(attr):
return ray.local_scheduler.ObjectID(function_id)
def get_actor_checkpoint(actor_id, worker):
def get_checkpoint_indices(worker, actor_id):
    """Return the indices of every checkpoint stored for an actor.

    Checkpoints are looked up as fields named ``checkpoint_<index>`` in the
    Redis hash stored under ``b"Actor:" + actor_id``.

    Args:
        worker: The worker whose Redis client is used for the lookup.
        actor_id: The actor ID of the actor whose checkpoints are listed.

    Returns:
        A list of integers, one per existing checkpoint, in the order in
        which Redis returned the corresponding hash fields.
    """
    prefix = b"checkpoint_"
    fields = worker.redis_client.hkeys(b"Actor:" + actor_id)
    # Only fields carrying the checkpoint prefix encode an index; everything
    # after the prefix is the decimal index itself.
    return [int(field[len(prefix):]) for field in fields
            if field.startswith(prefix)]
def get_actor_checkpoint(worker, actor_id):
"""Get the most recent checkpoint associated with a given actor ID.
Args:
actor_id: The actor ID of the actor to get the checkpoint for.
worker: The worker to use to get the checkpoint.
actor_id: The actor ID of the actor to get the checkpoint for.
Returns:
If a checkpoint exists, this returns a tuple of the checkpoint index
@@ -53,18 +74,103 @@ def get_actor_checkpoint(actor_id, worker):
index is the actor counter of the last task that was executed on
the actor before the checkpoint was made.
"""
# Get all of the keys associated with checkpoints for this actor.
actor_key = b"Actor:" + actor_id
checkpoint_indices = [int(key[len(b"checkpoint_"):])
for key in worker.redis_client.hkeys(actor_key)
if key.startswith(b"checkpoint_")]
checkpoint_indices = get_checkpoint_indices(worker, actor_id)
if len(checkpoint_indices) == 0:
return -1, None
most_recent_checkpoint_index = max(checkpoint_indices)
# Get the most recent checkpoint.
checkpoint = worker.redis_client.hget(
actor_key, "checkpoint_{}".format(most_recent_checkpoint_index))
return most_recent_checkpoint_index, checkpoint
else:
actor_key = b"Actor:" + actor_id
checkpoint_index = max(checkpoint_indices)
checkpoint = worker.redis_client.hget(
actor_key, "checkpoint_{}".format(checkpoint_index))
return checkpoint_index, checkpoint
def put_dummy_object(worker, dummy_object_id):
    """Store and pin a dummy actor object in the local object store.

    A one-element numpy array of zeros is put into the store under
    `dummy_object_id`, then fetched back and kept in
    `worker.actor_pinned_objects` so that a reference to it stays alive on
    the worker.

    For actors, dummy objects represent the stateful dependency between
    consecutive method calls. This should be called for every actor method
    execution that updates the actor's internal state.

    Args:
        worker: The worker to use to perform the put.
        dummy_object_id: The object ID of the dummy object.
    """
    # TODO(swang): A numpy array is used as a hack to pin the object in the
    # object store. Once the store supports object pinning, `None` may be
    # used instead.
    worker.put_object(dummy_object_id, np.zeros(1))
    # Hold on to the fetched value for the lifetime of the actor, to prevent
    # eviction from the object store.
    pinned_value = worker.get_object([dummy_object_id])[0]
    worker.actor_pinned_objects[dummy_object_id] = pinned_value
def is_checkpoint_task(task_counter, checkpoint_interval):
    """Return whether the task at `task_counter` is a checkpoint task.

    Args:
        task_counter: The index of the actor task being considered.
        checkpoint_interval: The number of tasks between checkpoints. A
            value that is zero or negative means no task is a checkpoint.

    Returns:
        True if `checkpoint_interval` is positive and evenly divides
        `task_counter`, otherwise False.
    """
    return (checkpoint_interval > 0 and
            task_counter % checkpoint_interval == 0)
def make_actor_method_executor(worker, method_name, method):
    """Build the executor that runs a user-defined actor method on a worker.

    The returned executor keeps the worker's bookkeeping in step with the
    method it runs. For an ordinary method, the dummy return object is put in
    the object store and the worker's task counter is advanced before the
    method is invoked. For the `__ray_checkpoint__` method, that bookkeeping
    only happens if the checkpoint task reports success, and the outcome is
    recorded in `worker.actor_checkpoint_failed`.

    Args:
        worker (Worker): The worker that is executing the actor.
        method_name (str): The name of the actor method.
        method (instancemethod): The actor method to wrap. It is called with
            the actor instance as its first argument.

    Returns:
        A function taking `(dummy_return_id, task_counter, actor, *args)`
        that executes `method` on `actor` and updates the worker's internal
        state to record the executed method.
    """
    def actor_method_executor(dummy_return_id, task_counter, actor,
                              *args):
        # The trailing argument is a dummy that only encodes the dependency
        # on the previous actor task; the method itself never sees it.
        method_args = args[:-1]

        if method_name != "__ray_checkpoint__":
            # Record the task as executed before running it, in case the
            # method throws an exception.
            put_dummy_object(worker, dummy_return_id)
            worker.actor_task_counter = task_counter + 1
            return method(actor, *method_args)

        # Checkpoint task: run it first, then decide how to update state.
        checkpoint_failed, checkpoint_error = method(actor, *method_args)
        if not checkpoint_failed:
            # The task following the checkpoint may run, so publish the
            # dummy object and advance the counter.
            put_dummy_object(worker, dummy_return_id)
            worker.actor_task_counter = task_counter + 1
        # This flag is what gets reported to the local scheduler.
        worker.actor_checkpoint_failed = checkpoint_failed
        # Surface an exception from the checkpoint method only after the
        # worker state has been updated.
        if checkpoint_error is not None:
            raise checkpoint_error
        return None

    return actor_method_executor
def fetch_and_register_actor(actor_class_key, worker):
@@ -100,8 +206,11 @@ def fetch_and_register_actor(actor_class_key, worker):
"cannot execute this method".format(actor_name))
for actor_method_name in actor_method_names:
function_id = get_actor_method_function_id(actor_method_name).id()
temporary_executor = make_actor_method_executor(worker,
actor_method_name,
temporary_actor_method)
worker.functions[driver_id][function_id] = (actor_method_name,
temporary_actor_method)
temporary_executor)
worker.function_properties[driver_id][function_id] = (
FunctionProperties(num_return_vals=2,
num_cpus=1,
@@ -112,6 +221,7 @@ def fetch_and_register_actor(actor_class_key, worker):
try:
unpickled_class = pickle.loads(pickled_class)
worker.actor_class = unpickled_class
except Exception:
# If an exception was thrown when the actor was imported, we record the
# traceback and notify the scheduler of the failure.
@@ -126,11 +236,15 @@ def fetch_and_register_actor(actor_class_key, worker):
# TODO(pcm): Why is the below line necessary?
unpickled_class.__module__ = module
worker.actors[actor_id_str] = unpickled_class.__new__(unpickled_class)
for (k, v) in inspect.getmembers(
actor_methods = inspect.getmembers(
unpickled_class, predicate=(lambda x: (inspect.isfunction(x) or
inspect.ismethod(x)))):
function_id = get_actor_method_function_id(k).id()
worker.functions[driver_id][function_id] = (k, v)
inspect.ismethod(x))))
for actor_method_name, actor_method in actor_methods:
function_id = get_actor_method_function_id(actor_method_name).id()
executor = make_actor_method_executor(worker, actor_method_name,
actor_method)
worker.functions[driver_id][function_id] = (actor_method_name,
executor)
# We do not set worker.function_properties[driver_id][function_id]
# because we currently do not need the actor worker to submit new tasks
# for the actor.
@@ -214,6 +328,10 @@ def export_actor(actor_id, class_id, actor_method_names, num_cpus, num_gpus,
def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
# Add one to the checkpoint interval since we will insert a mock task for
# every checkpoint.
checkpoint_interval += 1
# Modify the class to have an additional method that will be used for
# terminating the worker.
class Class(cls):
@@ -254,9 +372,101 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
# TODO(rkn): It's possible that this will cause problems. When
# you unpickle the same object twice, the two objects will not
# have the same class.
actor_object = pickle.loads(checkpoint)
actor_object = checkpoint
return actor_object
def __ray_checkpoint__(self, task_counter, previous_object_id):
    """Save a checkpoint of the actor, or resume the actor from one.

    If the dummy object of the preceding task is pinned on this worker,
    the preceding task ran on this actor instance: the actor's state is
    saved to Redis under index `task_counter` and checkpoints with a
    smaller index are deleted. Otherwise the most recent stored checkpoint
    is fetched and, if its index equals `task_counter`, the actor instance
    is replaced by the one restored from it. If no matching checkpoint
    could be restored, reconstruction of the preceding dummy object is
    requested from the local scheduler.

    Args:
        self: An instance of the actor class.
        task_counter: The index assigned to this checkpoint method.
        previous_object_id: A list whose first element is the dummy object
            returned by the task that immediately precedes this
            checkpoint.

    Returns:
        A tuple `(actor_checkpoint_failed, error)`. The first element is
        True if a checkpoint had to be loaded but could not be, and False
        otherwise (including when saving raised). The second element is
        the Exception raised while saving or restoring, or None.
    """
    worker = ray.worker.global_worker
    preceding_id = previous_object_id[0]
    plasma_id = plasma.ObjectID(preceding_id.id())

    if preceding_id in worker.actor_pinned_objects:
        # The preceding task executed on this actor instance, so this is a
        # save.
        print("Saving actor checkpoint. actor_counter = {}."
              .format(task_counter))
        actor_key = b"Actor:" + worker.actor_id
        save_error = None
        try:
            instance = worker.actors[worker.actor_id]
            checkpoint = instance.__ray_save_checkpoint__()
            # Save the checkpoint in Redis. TODO(rkn): Checkpoints should
            # not be stored in Redis. Fix this.
            worker.redis_client.hset(
                actor_key,
                "checkpoint_{}".format(task_counter),
                checkpoint)
            # Drop every checkpoint older than the one just written.
            stale_indices = [
                index for index
                in get_checkpoint_indices(worker, worker.actor_id)
                if index < task_counter]
            for index in stale_indices:
                worker.redis_client.hdel(
                    actor_key, "checkpoint_{}".format(index))
        except Exception as error:
            # A failed save should not block execution on the actor, so
            # the task still counts as successful; only remember the error.
            save_error = error
        return False, save_error

    # The preceding task has not executed on this actor instance. Try to
    # resume from the most recent checkpoint.
    checkpoint_index, checkpoint = get_actor_checkpoint(
        worker, worker.actor_id)
    restore_error = None
    if checkpoint_index == task_counter:
        # The stored checkpoint is the one for this task.
        try:
            worker.actors[worker.actor_id] = (
                worker.actor_class.
                __ray_restore_from_checkpoint__(checkpoint))
        except Exception as error:
            # The checkpoint could not be resumed; the task has failed.
            restore_error = error
        else:
            return False, None

    # Either the stored checkpoint does not match this task or restoring
    # it raised. Fall back to lineage reconstruction.
    worker.local_scheduler_client.reconstruct_object(plasma_id.binary())
    worker.local_scheduler_client.notify_unblocked()
    return True, restore_error
Class.__module__ = cls.__module__
Class.__name__ = cls.__name__
@@ -270,10 +480,9 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
# Create objects to wrap method invocations. This is done so that we can
# invoke methods with actor.method.remote() instead of actor.method().
class ActorMethod(object):
def __init__(self, actor, method_name, method_signature):
def __init__(self, actor, method_name):
self.actor = actor
self.method_name = method_name
self.method_signature = method_signature
def __call__(self, *args, **kwargs):
raise Exception("Actor methods cannot be called directly. Instead "
@@ -282,9 +491,20 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
.format(self.method_name, self.method_name))
def remote(self, *args, **kwargs):
return self.actor._actor_method_call(self.method_name,
self.method_signature, *args,
**kwargs)
return self.actor._actor_method_call(
self.method_name, args=args, kwargs=kwargs,
dependency=self.actor._ray_actor_cursor)
# Checkpoint methods do not take in the state of the previous actor method
# as an explicit data dependency.
class CheckpointMethod(ActorMethod):
    """Actor method wrapper used to submit the checkpoint task."""

    def remote(self):
        """Submit the checkpoint task for the wrapped actor handle.

        Returns:
            Whatever the handle's `_actor_method_call` returns for the
            submitted task.
        """
        handle = self.actor
        # The checkpoint receives the current task counter and the object ID
        # of the preceding task. The object ID is wrapped in a list, making
        # it an implicit data dependency, since the checkpoint method can
        # run at any time.
        checkpoint_args = [handle._ray_actor_counter,
                           [handle._ray_actor_cursor]]
        return handle._actor_method_call(self.method_name,
                                         args=checkpoint_args)
class ActorHandle(object):
def __init__(self, *args, **kwargs):
@@ -307,10 +527,12 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
# the current cursor should be added as a dependency, and then
# updated to reflect the new invocation.
self._ray_actor_cursor = None
self._ray_actor_methods = {
k: v for (k, v) in inspect.getmembers(
Class, predicate=(lambda x: (inspect.isfunction(x) or
inspect.ismethod(x))))}
ray_actor_methods = inspect.getmembers(
Class, predicate=(lambda x: (inspect.isfunction(x) or
inspect.ismethod(x))))
self._ray_actor_methods = {}
for actor_method_name, actor_method in ray_actor_methods:
self._ray_actor_methods[actor_method_name] = actor_method
# Extract the signatures of each of the methods. This will be used
# to catch some errors if the methods are called with inappropriate
# arguments.
@@ -346,18 +568,41 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
# Call __init__ as a remote function.
if "__init__" in self._ray_actor_methods.keys():
self._actor_method_call(
"__init__", self._ray_method_signatures["__init__"], *args,
**kwargs)
self._actor_method_call("__init__", args=args, kwargs=kwargs)
else:
print("WARNING: this object has no __init__ method.")
# The function actor_method_call gets called if somebody tries to call
# a method on their local actor stub object.
def _actor_method_call(self, attr, function_signature, *args,
**kwargs):
def _actor_method_call(self, method_name, args=None, kwargs=None,
dependency=None):
"""Method execution stub for an actor handle.
This is the function that executes when
`actor.method_name.remote(*args, **kwargs)` is called. Instead of
executing locally, the method is packaged as a task and scheduled
to the remote actor instance.
Args:
self: The local actor handle.
method_name: The name of the actor method to execute.
args: A list of arguments for the actor method.
kwargs: A dictionary of keyword arguments for the actor method.
dependency: The object ID that this method is dependent on.
Defaults to None, for no dependencies. Most tasks should
pass in the dummy object returned by the preceding task.
Some tasks, such as checkpoint and terminate methods, have
no dependencies.
Returns:
object_ids: A list of object IDs returned by the remote actor
method.
"""
ray.worker.check_connected()
ray.worker.check_main_thread()
function_signature = self._ray_method_signatures[method_name]
if args is None:
args = []
if kwargs is None:
kwargs = {}
args = signature.extend_args(function_signature, args, kwargs)
# Execute functions locally if Ray is run in PYTHON_MODE
@@ -365,23 +610,33 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
if ray.worker.global_worker.mode == ray.PYTHON_MODE:
return getattr(
ray.worker.global_worker.actors[self._ray_actor_id],
attr)(*copy.deepcopy(args))
method_name)(*copy.deepcopy(args))
# Add the current actor cursor, a dummy object returned by the most
# recent method invocation, as a dependency for the next method
# invocation.
if self._ray_actor_cursor is not None:
args.append(self._ray_actor_cursor)
# Add the dummy argument that represents dependency on a preceding
# task.
args.append(dependency)
function_id = get_actor_method_function_id(attr)
actor_counter = self._ray_actor_counter
# Mark checkpoint methods with a negative task counter.
if is_checkpoint_task(actor_counter, checkpoint_interval):
actor_counter = self._ray_actor_counter * -1
function_id = get_actor_method_function_id(method_name)
object_ids = ray.worker.global_worker.submit_task(
function_id, args, actor_id=self._ray_actor_id,
actor_counter=self._ray_actor_counter)
actor_counter=actor_counter)
# Update the actor counter and cursor to reflect the most recent
# invocation.
self._ray_actor_counter += 1
self._ray_actor_cursor = object_ids.pop()
# Submit a checkpoint task if necessary.
if is_checkpoint_task(self._ray_actor_counter,
checkpoint_interval):
self.__ray_checkpoint__.remote()
# The last object returned is the dummy object that should be
# passed in to the next actor method. Do not return it to the user.
if len(object_ids) == 1:
return object_ids[0]
elif len(object_ids) > 1:
@@ -405,8 +660,11 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
# ActorMethod has a reference to the ActorHandle and this was
# causing cyclic references which were preventing object
# deallocation from behaving in a predictable manner.
return ActorMethod(self, attr,
self._ray_method_signatures[attr])
if attr == "__ray_checkpoint__":
actor_method_cls = CheckpointMethod
else:
actor_method_cls = ActorMethod
return actor_method_cls(self, attr)
else:
# There is no method with this name, so raise an exception.
raise AttributeError("'{}' Actor object has no attribute '{}'"
@@ -421,10 +679,8 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
def __del__(self):
"""Kill the worker that is running this actor."""
if ray.worker.global_worker.connected:
self._actor_method_call(
"__ray_terminate__",
self._ray_method_signatures["__ray_terminate__"],
self._ray_actor_id.id())
self._actor_method_call("__ray_terminate__",
args=[self._ray_actor_id.id()])
return ActorHandle
+16 -59
View File
@@ -226,6 +226,11 @@ class Worker(object):
self.fetch_and_register_actor = None
self.make_actor = None
self.actors = {}
self.actor_task_counter = 0
# This field is used to report actor checkpoint failure for the last
# task assigned. Workers are not assigned a task on startup, so we
# initialize to False.
self.actor_checkpoint_failed = False
# TODO(swang): This is a hack to prevent the object store from evicting
# dummy objects. Once we allow object pinning in the store, we may
# remove this variable.
@@ -691,7 +696,7 @@ class Worker(object):
args = task.arguments()
return_object_ids = task.returns()
if task.actor_id().id() != NIL_ACTOR_ID:
return_object_ids.pop()
dummy_return_id = return_object_ids.pop()
function_name, function_executor = (self.functions
[self.task_driver_id.id()]
[function_id.id()])
@@ -717,14 +722,10 @@ class Worker(object):
if task.actor_id().id() == NIL_ACTOR_ID:
outputs = function_executor.executor(arguments)
else:
# If this is any actor task other than the first, which has
# no dependencies, the last argument is a dummy argument
# that represents the dependency on the previous actor
# task. Remove this argument for invocation.
if task.actor_counter() > 0:
arguments = arguments[:-1]
outputs = function_executor(
self.actors[task.actor_id().id()], *arguments)
dummy_return_id, task.actor_counter(),
self.actors[task.actor_id().id()],
*arguments)
except Exception as e:
# Determine whether the exception occurred during a task, not an
# actor method.
@@ -764,35 +765,6 @@ class Worker(object):
data={"function_id": function_id.id(),
"function_name": function_name})
def _checkpoint_actor_state(self, actor_counter):
"""Checkpoint the actor state.
This currently saves the checkpoint to Redis, but the checkpoint really
needs to go somewhere else.
Args:
actor_counter: The index of the most recent task that ran on this
actor.
"""
print("Saving actor checkpoint. actor_counter = {}."
.format(actor_counter))
actor_key = b"Actor:" + self.actor_id
checkpoint = self.actors[self.actor_id].__ray_save_checkpoint__()
# Save the checkpoint in Redis. TODO(rkn): Checkpoints should not
# be stored in Redis. Fix this.
self.redis_client.hset(
actor_key,
"checkpoint_{}".format(actor_counter),
checkpoint)
# Remove the previous checkpoints if there is one.
checkpoint_indices = [int(key[len(b"checkpoint_"):])
for key in self.redis_client.hkeys(actor_key)
if key.startswith(b"checkpoint_")]
for index in checkpoint_indices:
if index < actor_counter:
self.redis_client.hdel(actor_key,
"checkpoint_{}".format(index))
def _wait_for_and_process_task(self, task):
"""Wait for a task to be ready and process the task.
@@ -824,19 +796,6 @@ class Worker(object):
with log_span("ray:task", contents=contents, worker=self):
self._process_task(task)
# Add the dummy output for actor tasks. TODO(swang): We use a
# numpy array as a hack to pin the object in the object store.
# Once we allow object pinning in the store, we may use `None`.
if task.actor_id().id() != NIL_ACTOR_ID:
dummy_object_id = task.returns().pop()
dummy_object = np.zeros(1)
self.put_object(dummy_object_id, dummy_object)
# Keep the dummy output in scope for the lifetime of the actor,
# to prevent eviction from the object store.
dummy_object = self.get_object([dummy_object_id])
self.actor_pinned_objects.append(dummy_object[0])
# Push all of the log events to the global state store.
flush_log()
@@ -853,13 +812,6 @@ class Worker(object):
ray.worker.global_worker.local_scheduler_client.disconnect()
os._exit(0)
# Checkpoint the actor state if it is the right time to do so.
actor_counter = task.actor_counter()
if (self.actor_id != NIL_ACTOR_ID and
self.actor_checkpoint_interval != -1 and
actor_counter % self.actor_checkpoint_interval == 0):
self._checkpoint_actor_state(actor_counter)
def _get_next_task_from_local_scheduler(self):
"""Get the next task from the local scheduler.
@@ -867,7 +819,12 @@ class Worker(object):
A task from the local scheduler.
"""
with log_span("ray:get_task", worker=self):
task = self.local_scheduler_client.get_task()
task = self.local_scheduler_client.get_task(
self.actor_checkpoint_failed)
# We assume that the task is not a checkpoint, or that if it is,
# that the task will succeed. The checkpoint task executor is
# responsible for reporting task failure to the local scheduler.
self.actor_checkpoint_failed = False
# Automatically restrict the GPUs available to this task.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
@@ -1892,7 +1849,7 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker,
worker.class_id = class_id
# Store the dummy outputs produced by actor tasks, to pin the
# dummy outputs in the object store.
worker.actor_pinned_objects = []
worker.actor_pinned_objects = {}
# Initialize the serialization library. This registers some classes, and so
# it must be run before we export all of the cached remote functions.