mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 17:34:51 +08:00
Actor checkpointing with object lineage reconstruction (#1004)
* Worker reports error in previous task, actor task counter is incremented after task is successful * Refactor actor task execution - Return new task counter in GetTaskRequest - Update worker state for actor tasks inside of the actor method executor * Manually invoked checkpoint method * Scheduling for actor checkpoint methods * Fix python bugs in checkpointing * Return task success from worker to local scheduler instead of actor counter * Kill local schedulers halfway through actor execution instead of waiting for all tasks to execute once * Remove redundant actor tasks during dispatch, reconstruct missing dependencies for actor tasks * Make executor for temporary actor methods * doc * Set default argument for whether the previous task was a success * Refactor actor method call * Simplify checkpoint task submission * lint * fix philipp's comments * Add missing line * Make actor reconstruction tests run faster * Unimportant whitespace. * Unimportant whitespace. * Update checkpoint method signature * Documentation and handle exceptions during checkpoint save/resume * Rename get_task message field to actor_checkpoint_failed * Fix bug. * Remove debugging check, redirect test output
This commit is contained in:
committed by
Robert Nishihara
parent
b585001881
commit
3764f2f2e1
+304
-48
@@ -7,8 +7,10 @@ import copy
|
||||
import hashlib
|
||||
import inspect
|
||||
import json
|
||||
import numpy as np
|
||||
import traceback
|
||||
|
||||
import pyarrow.plasma as plasma
|
||||
import ray.local_scheduler
|
||||
import ray.signature as signature
|
||||
import ray.worker
|
||||
@@ -40,12 +42,31 @@ def get_actor_method_function_id(attr):
|
||||
return ray.local_scheduler.ObjectID(function_id)
|
||||
|
||||
|
||||
def get_actor_checkpoint(actor_id, worker):
|
||||
def get_checkpoint_indices(worker, actor_id):
|
||||
"""Get the checkpoint indices associated with a given actor ID.
|
||||
|
||||
Args:
|
||||
worker: The worker to use to get the checkpoint indices.
|
||||
actor_id: The actor ID of the actor to get the checkpoint indices for.
|
||||
|
||||
Returns:
|
||||
The indices of existing checkpoints as a list of integers.
|
||||
"""
|
||||
actor_key = b"Actor:" + actor_id
|
||||
checkpoint_indices = []
|
||||
for key in worker.redis_client.hkeys(actor_key):
|
||||
if key.startswith(b"checkpoint_"):
|
||||
index = int(key[len(b"checkpoint_"):])
|
||||
checkpoint_indices.append(index)
|
||||
return checkpoint_indices
|
||||
|
||||
|
||||
def get_actor_checkpoint(worker, actor_id):
|
||||
"""Get the most recent checkpoint associated with a given actor ID.
|
||||
|
||||
Args:
|
||||
actor_id: The actor ID of the actor to get the checkpoint for.
|
||||
worker: The worker to use to get the checkpoint.
|
||||
actor_id: The actor ID of the actor to get the checkpoint for.
|
||||
|
||||
Returns:
|
||||
If a checkpoint exists, this returns a tuple of the checkpoint index
|
||||
@@ -53,18 +74,103 @@ def get_actor_checkpoint(actor_id, worker):
|
||||
index is the actor counter of the last task that was executed on
|
||||
the actor before the checkpoint was made.
|
||||
"""
|
||||
# Get all of the keys associated with checkpoints for this actor.
|
||||
actor_key = b"Actor:" + actor_id
|
||||
checkpoint_indices = [int(key[len(b"checkpoint_"):])
|
||||
for key in worker.redis_client.hkeys(actor_key)
|
||||
if key.startswith(b"checkpoint_")]
|
||||
checkpoint_indices = get_checkpoint_indices(worker, actor_id)
|
||||
if len(checkpoint_indices) == 0:
|
||||
return -1, None
|
||||
most_recent_checkpoint_index = max(checkpoint_indices)
|
||||
# Get the most recent checkpoint.
|
||||
checkpoint = worker.redis_client.hget(
|
||||
actor_key, "checkpoint_{}".format(most_recent_checkpoint_index))
|
||||
return most_recent_checkpoint_index, checkpoint
|
||||
else:
|
||||
actor_key = b"Actor:" + actor_id
|
||||
checkpoint_index = max(checkpoint_indices)
|
||||
checkpoint = worker.redis_client.hget(
|
||||
actor_key, "checkpoint_{}".format(checkpoint_index))
|
||||
return checkpoint_index, checkpoint
|
||||
|
||||
|
||||
def put_dummy_object(worker, dummy_object_id):
|
||||
"""Put a dummy actor object into the local object store.
|
||||
|
||||
This registers a dummy object ID in the local store with an empty numpy
|
||||
array as the value. The resulting object is pinned to the store by storing
|
||||
it to the worker's state.
|
||||
|
||||
For actors, dummy objects are used to store the stateful dependencies
|
||||
between consecutive method calls. This function should be called for every
|
||||
actor method execution that updates the actor's internal state.
|
||||
|
||||
Args:
|
||||
worker: The worker to use to perform the put.
|
||||
dummy_object_id: The object ID of the dummy object.
|
||||
"""
|
||||
# Add the dummy output for actor tasks. TODO(swang): We use
|
||||
# a numpy array as a hack to pin the object in the object
|
||||
# store. Once we allow object pinning in the store, we may
|
||||
# use `None`.
|
||||
dummy_object = np.zeros(1)
|
||||
worker.put_object(dummy_object_id, dummy_object)
|
||||
# Keep the dummy output in scope for the lifetime of the
|
||||
# actor, to prevent eviction from the object store.
|
||||
dummy_object = worker.get_object([dummy_object_id])
|
||||
dummy_object = dummy_object[0]
|
||||
worker.actor_pinned_objects[dummy_object_id] = dummy_object
|
||||
|
||||
|
||||
def is_checkpoint_task(task_counter, checkpoint_interval):
|
||||
if checkpoint_interval <= 0:
|
||||
return False
|
||||
return (task_counter % checkpoint_interval == 0)
|
||||
|
||||
|
||||
def make_actor_method_executor(worker, method_name, method):
|
||||
"""Make an executor that wraps a user-defined actor method.
|
||||
|
||||
The executor wraps the method to update the worker's internal state. If the
|
||||
task is a success, the dummy object returned is added to the object store,
|
||||
to signal that the following task can run, and the worker's task counter is
|
||||
updated to match the executed task. Else, the executor reports failure to
|
||||
the local scheduler so that the task counter does not get updated.
|
||||
|
||||
Args:
|
||||
worker (Worker): The worker that is executing the actor.
|
||||
method_name (str): The name of the actor method.
|
||||
method (instancemethod): The actor method to wrap. This should be a
|
||||
method defined on the actor class and should therefore take an
|
||||
instance of the actor as the first argument.
|
||||
|
||||
Returns:
|
||||
A function that executes the given actor method on the worker's stored
|
||||
instance of the actor. The function also updates the worker's
|
||||
internal state to record the executed method.
|
||||
"""
|
||||
|
||||
def actor_method_executor(dummy_return_id, task_counter, actor,
|
||||
*args):
|
||||
# An actor task's dependency on the previous task is represented by
|
||||
# a dummy argument. Remove this argument before invocation.
|
||||
args = args[:-1]
|
||||
if method_name == "__ray_checkpoint__":
|
||||
# Execute the checkpoint task.
|
||||
actor_checkpoint_failed, error = method(actor, *args)
|
||||
# If the checkpoint was successfully loaded, put the dummy object
|
||||
# and update the actor's task counter, so that the task following
|
||||
# the checkpoint can run.
|
||||
if not actor_checkpoint_failed:
|
||||
put_dummy_object(worker, dummy_return_id)
|
||||
worker.actor_task_counter = task_counter + 1
|
||||
# Report to the local scheduler whether this task succeeded in
|
||||
# loading the checkpoint.
|
||||
worker.actor_checkpoint_failed = actor_checkpoint_failed
|
||||
# If there was an exception during the checkpoint method, re-raise
|
||||
# it after updating the actor's internal state.
|
||||
if error is not None:
|
||||
raise error
|
||||
return None
|
||||
else:
|
||||
# Update the worker's internal state before executing the method in
|
||||
# case the method throws an exception.
|
||||
put_dummy_object(worker, dummy_return_id)
|
||||
worker.actor_task_counter = task_counter + 1
|
||||
# Execute the actor method.
|
||||
return method(actor, *args)
|
||||
return actor_method_executor
|
||||
|
||||
|
||||
def fetch_and_register_actor(actor_class_key, worker):
|
||||
@@ -100,8 +206,11 @@ def fetch_and_register_actor(actor_class_key, worker):
|
||||
"cannot execute this method".format(actor_name))
|
||||
for actor_method_name in actor_method_names:
|
||||
function_id = get_actor_method_function_id(actor_method_name).id()
|
||||
temporary_executor = make_actor_method_executor(worker,
|
||||
actor_method_name,
|
||||
temporary_actor_method)
|
||||
worker.functions[driver_id][function_id] = (actor_method_name,
|
||||
temporary_actor_method)
|
||||
temporary_executor)
|
||||
worker.function_properties[driver_id][function_id] = (
|
||||
FunctionProperties(num_return_vals=2,
|
||||
num_cpus=1,
|
||||
@@ -112,6 +221,7 @@ def fetch_and_register_actor(actor_class_key, worker):
|
||||
|
||||
try:
|
||||
unpickled_class = pickle.loads(pickled_class)
|
||||
worker.actor_class = unpickled_class
|
||||
except Exception:
|
||||
# If an exception was thrown when the actor was imported, we record the
|
||||
# traceback and notify the scheduler of the failure.
|
||||
@@ -126,11 +236,15 @@ def fetch_and_register_actor(actor_class_key, worker):
|
||||
# TODO(pcm): Why is the below line necessary?
|
||||
unpickled_class.__module__ = module
|
||||
worker.actors[actor_id_str] = unpickled_class.__new__(unpickled_class)
|
||||
for (k, v) in inspect.getmembers(
|
||||
actor_methods = inspect.getmembers(
|
||||
unpickled_class, predicate=(lambda x: (inspect.isfunction(x) or
|
||||
inspect.ismethod(x)))):
|
||||
function_id = get_actor_method_function_id(k).id()
|
||||
worker.functions[driver_id][function_id] = (k, v)
|
||||
inspect.ismethod(x))))
|
||||
for actor_method_name, actor_method in actor_methods:
|
||||
function_id = get_actor_method_function_id(actor_method_name).id()
|
||||
executor = make_actor_method_executor(worker, actor_method_name,
|
||||
actor_method)
|
||||
worker.functions[driver_id][function_id] = (actor_method_name,
|
||||
executor)
|
||||
# We do not set worker.function_properties[driver_id][function_id]
|
||||
# because we currently do need the actor worker to submit new tasks
|
||||
# for the actor.
|
||||
@@ -214,6 +328,10 @@ def export_actor(actor_id, class_id, actor_method_names, num_cpus, num_gpus,
|
||||
|
||||
|
||||
def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
# Add one to the checkpoint interval since we will insert a mock task for
|
||||
# every checkpoint.
|
||||
checkpoint_interval += 1
|
||||
|
||||
# Modify the class to have an additional method that will be used for
|
||||
# terminating the worker.
|
||||
class Class(cls):
|
||||
@@ -254,9 +372,101 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
# TODO(rkn): It's possible that this will cause problems. When
|
||||
# you unpickle the same object twice, the two objects will not
|
||||
# have the same class.
|
||||
actor_object = pickle.loads(checkpoint)
|
||||
actor_object = checkpoint
|
||||
return actor_object
|
||||
|
||||
def __ray_checkpoint__(self, task_counter, previous_object_id):
|
||||
"""Save or resume a stored checkpoint.
|
||||
|
||||
This task checkpoints the current state of the actor. If the actor
|
||||
has not yet executed to `task_counter`, then the task instead
|
||||
attempts to resume from a saved checkpoint that matches
|
||||
`task_counter`. If the most recently saved checkpoint is earlier
|
||||
than `task_counter`, the task requests reconstruction of the tasks
|
||||
that executed since the previous checkpoint and before
|
||||
`task_counter`.
|
||||
|
||||
Args:
|
||||
self: An instance of the actor class.
|
||||
task_counter: The index assigned to this checkpoint method.
|
||||
previous_object_id: The dummy object returned by the task that
|
||||
immediately precedes this checkpoint.
|
||||
|
||||
Returns:
|
||||
A bool representing whether the checkpoint was successfully
|
||||
loaded (whether the actor can safely execute the next task)
|
||||
and an Exception instance, if one was thrown.
|
||||
"""
|
||||
worker = ray.worker.global_worker
|
||||
previous_object_id = previous_object_id[0]
|
||||
plasma_id = plasma.ObjectID(previous_object_id.id())
|
||||
|
||||
# Initialize the return values. `actor_checkpoint_failed` will be
|
||||
# set to True if we fail to load the checkpoint. `error` will be
|
||||
# set to the Exception, if one is thrown.
|
||||
actor_checkpoint_failed = False
|
||||
error_to_return = None
|
||||
|
||||
# Save or resume the checkpoint.
|
||||
if previous_object_id in worker.actor_pinned_objects:
|
||||
# The preceding task executed on this actor instance. Save the
|
||||
# checkpoint.
|
||||
print("Saving actor checkpoint. actor_counter = {}."
|
||||
.format(task_counter))
|
||||
actor_key = b"Actor:" + worker.actor_id
|
||||
|
||||
try:
|
||||
checkpoint = worker.actors[
|
||||
worker.actor_id].__ray_save_checkpoint__()
|
||||
# Save the checkpoint in Redis. TODO(rkn): Checkpoints
|
||||
# should not be stored in Redis. Fix this.
|
||||
worker.redis_client.hset(
|
||||
actor_key,
|
||||
"checkpoint_{}".format(task_counter),
|
||||
checkpoint)
|
||||
# Remove the previous checkpoints if there is one.
|
||||
checkpoint_indices = get_checkpoint_indices(
|
||||
worker, worker.actor_id)
|
||||
for index in checkpoint_indices:
|
||||
if index < task_counter:
|
||||
worker.redis_client.hdel(
|
||||
actor_key, "checkpoint_{}".format(index))
|
||||
# An exception was thrown. Save the error.
|
||||
except Exception as error:
|
||||
# Checkpoint saves should not block execution on the actor,
|
||||
# so we still consider the task successful.
|
||||
error_to_return = error
|
||||
else:
|
||||
# The preceding task has not yet executed on this actor
|
||||
# instance. Try to resume from the most recent checkpoint.
|
||||
checkpoint_index, checkpoint = get_actor_checkpoint(
|
||||
worker, worker.actor_id)
|
||||
if checkpoint_index == task_counter:
|
||||
# The checkpoint matches ours. Resume the actor instance.
|
||||
try:
|
||||
actor = (worker.actor_class.
|
||||
__ray_restore_from_checkpoint__(checkpoint))
|
||||
worker.actors[worker.actor_id] = actor
|
||||
# An exception was thrown. Save the error.
|
||||
except Exception as error:
|
||||
# We could not resume the checkpoint, so count the task
|
||||
# as failed.
|
||||
actor_checkpoint_failed = True
|
||||
error_to_return = error
|
||||
else:
|
||||
# We cannot resume a mismatching checkpoint, so count the
|
||||
# task as failed.
|
||||
actor_checkpoint_failed = True
|
||||
|
||||
# Fall back to lineage reconstruction if we were unable to load the
|
||||
# checkpoint.
|
||||
if actor_checkpoint_failed:
|
||||
worker.local_scheduler_client.reconstruct_object(
|
||||
plasma_id.binary())
|
||||
worker.local_scheduler_client.notify_unblocked()
|
||||
|
||||
return actor_checkpoint_failed, error_to_return
|
||||
|
||||
Class.__module__ = cls.__module__
|
||||
Class.__name__ = cls.__name__
|
||||
|
||||
@@ -270,10 +480,9 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
# Create objects to wrap method invocations. This is done so that we can
|
||||
# invoke methods with actor.method.remote() instead of actor.method().
|
||||
class ActorMethod(object):
|
||||
def __init__(self, actor, method_name, method_signature):
|
||||
def __init__(self, actor, method_name):
|
||||
self.actor = actor
|
||||
self.method_name = method_name
|
||||
self.method_signature = method_signature
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
raise Exception("Actor methods cannot be called directly. Instead "
|
||||
@@ -282,9 +491,20 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
.format(self.method_name, self.method_name))
|
||||
|
||||
def remote(self, *args, **kwargs):
|
||||
return self.actor._actor_method_call(self.method_name,
|
||||
self.method_signature, *args,
|
||||
**kwargs)
|
||||
return self.actor._actor_method_call(
|
||||
self.method_name, args=args, kwargs=kwargs,
|
||||
dependency=self.actor._ray_actor_cursor)
|
||||
|
||||
# Checkpoint methods do not take in the state of the previous actor method
|
||||
# as an explicit data dependency.
|
||||
class CheckpointMethod(ActorMethod):
|
||||
def remote(self):
|
||||
# A checkpoint's arguments are the current task counter and the
|
||||
# object ID of the preceding task. The latter is an implicit data
|
||||
# dependency, since the checkpoint method can run at any time.
|
||||
args = [self.actor._ray_actor_counter,
|
||||
[self.actor._ray_actor_cursor]]
|
||||
return self.actor._actor_method_call(self.method_name, args=args)
|
||||
|
||||
class ActorHandle(object):
|
||||
def __init__(self, *args, **kwargs):
|
||||
@@ -307,10 +527,12 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
# the current cursor should be added as a dependency, and then
|
||||
# updated to reflect the new invocation.
|
||||
self._ray_actor_cursor = None
|
||||
self._ray_actor_methods = {
|
||||
k: v for (k, v) in inspect.getmembers(
|
||||
Class, predicate=(lambda x: (inspect.isfunction(x) or
|
||||
inspect.ismethod(x))))}
|
||||
ray_actor_methods = inspect.getmembers(
|
||||
Class, predicate=(lambda x: (inspect.isfunction(x) or
|
||||
inspect.ismethod(x))))
|
||||
self._ray_actor_methods = {}
|
||||
for actor_method_name, actor_method in ray_actor_methods:
|
||||
self._ray_actor_methods[actor_method_name] = actor_method
|
||||
# Extract the signatures of each of the methods. This will be used
|
||||
# to catch some errors if the methods are called with inappropriate
|
||||
# arguments.
|
||||
@@ -346,18 +568,41 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
|
||||
# Call __init__ as a remote function.
|
||||
if "__init__" in self._ray_actor_methods.keys():
|
||||
self._actor_method_call(
|
||||
"__init__", self._ray_method_signatures["__init__"], *args,
|
||||
**kwargs)
|
||||
self._actor_method_call("__init__", args=args, kwargs=kwargs)
|
||||
else:
|
||||
print("WARNING: this object has no __init__ method.")
|
||||
|
||||
# The function actor_method_call gets called if somebody tries to call
|
||||
# a method on their local actor stub object.
|
||||
def _actor_method_call(self, attr, function_signature, *args,
|
||||
**kwargs):
|
||||
def _actor_method_call(self, method_name, args=None, kwargs=None,
|
||||
dependency=None):
|
||||
"""Method execution stub for an actor handle.
|
||||
|
||||
This is the function that executes when
|
||||
`actor.method_name.remote(*args, **kwargs)` is called. Instead of
|
||||
executing locally, the method is packaged as a task and scheduled
|
||||
to the remote actor instance.
|
||||
|
||||
Args:
|
||||
self: The local actor handle.
|
||||
method_name: The name of the actor method to execute.
|
||||
args: A list of arguments for the actor method.
|
||||
kwargs: A dictionary of keyword arguments for the actor method.
|
||||
dependency: The object ID that this method is dependent on.
|
||||
Defaults to None, for no dependencies. Most tasks should
|
||||
pass in the dummy object returned by the preceding task.
|
||||
Some tasks, such as checkpoint and terminate methods, have
|
||||
no dependencies.
|
||||
|
||||
Returns:
|
||||
object_ids: A list of object IDs returned by the remote actor
|
||||
method.
|
||||
"""
|
||||
ray.worker.check_connected()
|
||||
ray.worker.check_main_thread()
|
||||
function_signature = self._ray_method_signatures[method_name]
|
||||
if args is None:
|
||||
args = []
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
args = signature.extend_args(function_signature, args, kwargs)
|
||||
|
||||
# Execute functions locally if Ray is run in PYTHON_MODE
|
||||
@@ -365,23 +610,33 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
if ray.worker.global_worker.mode == ray.PYTHON_MODE:
|
||||
return getattr(
|
||||
ray.worker.global_worker.actors[self._ray_actor_id],
|
||||
attr)(*copy.deepcopy(args))
|
||||
method_name)(*copy.deepcopy(args))
|
||||
|
||||
# Add the current actor cursor, a dummy object returned by the most
|
||||
# recent method invocation, as a dependency for the next method
|
||||
# invocation.
|
||||
if self._ray_actor_cursor is not None:
|
||||
args.append(self._ray_actor_cursor)
|
||||
# Add the dummy argument that represents dependency on a preceding
|
||||
# task.
|
||||
args.append(dependency)
|
||||
|
||||
function_id = get_actor_method_function_id(attr)
|
||||
actor_counter = self._ray_actor_counter
|
||||
# Mark checkpoint methods with a negative task counter.
|
||||
if is_checkpoint_task(actor_counter, checkpoint_interval):
|
||||
actor_counter = self._ray_actor_counter * -1
|
||||
|
||||
function_id = get_actor_method_function_id(method_name)
|
||||
object_ids = ray.worker.global_worker.submit_task(
|
||||
function_id, args, actor_id=self._ray_actor_id,
|
||||
actor_counter=self._ray_actor_counter)
|
||||
actor_counter=actor_counter)
|
||||
# Update the actor counter and cursor to reflect the most recent
|
||||
# invocation.
|
||||
self._ray_actor_counter += 1
|
||||
self._ray_actor_cursor = object_ids.pop()
|
||||
|
||||
# Submit a checkpoint task if necessary.
|
||||
if is_checkpoint_task(self._ray_actor_counter,
|
||||
checkpoint_interval):
|
||||
self.__ray_checkpoint__.remote()
|
||||
|
||||
# The last object returned is the dummy object that should be
|
||||
# passed in to the next actor method. Do not return it to the user.
|
||||
if len(object_ids) == 1:
|
||||
return object_ids[0]
|
||||
elif len(object_ids) > 1:
|
||||
@@ -405,8 +660,11 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
# ActorMethod has a reference to the ActorHandle and this was
|
||||
# causing cyclic references which were prevent object
|
||||
# deallocation from behaving in a predictable manner.
|
||||
return ActorMethod(self, attr,
|
||||
self._ray_method_signatures[attr])
|
||||
if attr == "__ray_checkpoint__":
|
||||
actor_method_cls = CheckpointMethod
|
||||
else:
|
||||
actor_method_cls = ActorMethod
|
||||
return actor_method_cls(self, attr)
|
||||
else:
|
||||
# There is no method with this name, so raise an exception.
|
||||
raise AttributeError("'{}' Actor object has no attribute '{}'"
|
||||
@@ -421,10 +679,8 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
def __del__(self):
|
||||
"""Kill the worker that is running this actor."""
|
||||
if ray.worker.global_worker.connected:
|
||||
self._actor_method_call(
|
||||
"__ray_terminate__",
|
||||
self._ray_method_signatures["__ray_terminate__"],
|
||||
self._ray_actor_id.id())
|
||||
self._actor_method_call("__ray_terminate__",
|
||||
args=[self._ray_actor_id.id()])
|
||||
|
||||
return ActorHandle
|
||||
|
||||
|
||||
+16
-59
@@ -226,6 +226,11 @@ class Worker(object):
|
||||
self.fetch_and_register_actor = None
|
||||
self.make_actor = None
|
||||
self.actors = {}
|
||||
self.actor_task_counter = 0
|
||||
# This field is used to report actor checkpoint failure for the last
|
||||
# task assigned. Workers are not assigned a task on startup, so we
|
||||
# initialize to False.
|
||||
self.actor_checkpoint_failed = False
|
||||
# TODO(swang): This is a hack to prevent the object store from evicting
|
||||
# dummy objects. Once we allow object pinning in the store, we may
|
||||
# remove this variable.
|
||||
@@ -691,7 +696,7 @@ class Worker(object):
|
||||
args = task.arguments()
|
||||
return_object_ids = task.returns()
|
||||
if task.actor_id().id() != NIL_ACTOR_ID:
|
||||
return_object_ids.pop()
|
||||
dummy_return_id = return_object_ids.pop()
|
||||
function_name, function_executor = (self.functions
|
||||
[self.task_driver_id.id()]
|
||||
[function_id.id()])
|
||||
@@ -717,14 +722,10 @@ class Worker(object):
|
||||
if task.actor_id().id() == NIL_ACTOR_ID:
|
||||
outputs = function_executor.executor(arguments)
|
||||
else:
|
||||
# If this is any actor task other than the first, which has
|
||||
# no dependencies, the last argument is a dummy argument
|
||||
# that represents the dependency on the previous actor
|
||||
# task. Remove this argument for invocation.
|
||||
if task.actor_counter() > 0:
|
||||
arguments = arguments[:-1]
|
||||
outputs = function_executor(
|
||||
self.actors[task.actor_id().id()], *arguments)
|
||||
dummy_return_id, task.actor_counter(),
|
||||
self.actors[task.actor_id().id()],
|
||||
*arguments)
|
||||
except Exception as e:
|
||||
# Determine whether the exception occured during a task, not an
|
||||
# actor method.
|
||||
@@ -764,35 +765,6 @@ class Worker(object):
|
||||
data={"function_id": function_id.id(),
|
||||
"function_name": function_name})
|
||||
|
||||
def _checkpoint_actor_state(self, actor_counter):
|
||||
"""Checkpoint the actor state.
|
||||
|
||||
This currently saves the checkpoint to Redis, but the checkpoint really
|
||||
needs to go somewhere else.
|
||||
|
||||
Args:
|
||||
actor_counter: The index of the most recent task that ran on this
|
||||
actor.
|
||||
"""
|
||||
print("Saving actor checkpoint. actor_counter = {}."
|
||||
.format(actor_counter))
|
||||
actor_key = b"Actor:" + self.actor_id
|
||||
checkpoint = self.actors[self.actor_id].__ray_save_checkpoint__()
|
||||
# Save the checkpoint in Redis. TODO(rkn): Checkpoints should not
|
||||
# be stored in Redis. Fix this.
|
||||
self.redis_client.hset(
|
||||
actor_key,
|
||||
"checkpoint_{}".format(actor_counter),
|
||||
checkpoint)
|
||||
# Remove the previous checkpoints if there is one.
|
||||
checkpoint_indices = [int(key[len(b"checkpoint_"):])
|
||||
for key in self.redis_client.hkeys(actor_key)
|
||||
if key.startswith(b"checkpoint_")]
|
||||
for index in checkpoint_indices:
|
||||
if index < actor_counter:
|
||||
self.redis_client.hdel(actor_key,
|
||||
"checkpoint_{}".format(index))
|
||||
|
||||
def _wait_for_and_process_task(self, task):
|
||||
"""Wait for a task to be ready and process the task.
|
||||
|
||||
@@ -824,19 +796,6 @@ class Worker(object):
|
||||
with log_span("ray:task", contents=contents, worker=self):
|
||||
self._process_task(task)
|
||||
|
||||
# Add the dummy output for actor tasks. TODO(swang): We use a
|
||||
# numpy array as a hack to pin the object in the object store.
|
||||
# Once we allow object pinning in the store, we may use `None`.
|
||||
if task.actor_id().id() != NIL_ACTOR_ID:
|
||||
dummy_object_id = task.returns().pop()
|
||||
dummy_object = np.zeros(1)
|
||||
self.put_object(dummy_object_id, dummy_object)
|
||||
|
||||
# Keep the dummy output in scope for the lifetime of the actor,
|
||||
# to prevent eviction from the object store.
|
||||
dummy_object = self.get_object([dummy_object_id])
|
||||
self.actor_pinned_objects.append(dummy_object[0])
|
||||
|
||||
# Push all of the log events to the global state store.
|
||||
flush_log()
|
||||
|
||||
@@ -853,13 +812,6 @@ class Worker(object):
|
||||
ray.worker.global_worker.local_scheduler_client.disconnect()
|
||||
os._exit(0)
|
||||
|
||||
# Checkpoint the actor state if it is the right time to do so.
|
||||
actor_counter = task.actor_counter()
|
||||
if (self.actor_id != NIL_ACTOR_ID and
|
||||
self.actor_checkpoint_interval != -1 and
|
||||
actor_counter % self.actor_checkpoint_interval == 0):
|
||||
self._checkpoint_actor_state(actor_counter)
|
||||
|
||||
def _get_next_task_from_local_scheduler(self):
|
||||
"""Get the next task from the local scheduler.
|
||||
|
||||
@@ -867,7 +819,12 @@ class Worker(object):
|
||||
A task from the local scheduler.
|
||||
"""
|
||||
with log_span("ray:get_task", worker=self):
|
||||
task = self.local_scheduler_client.get_task()
|
||||
task = self.local_scheduler_client.get_task(
|
||||
self.actor_checkpoint_failed)
|
||||
# We assume that the task is not a checkpoint, or that if it is,
|
||||
# that the task will succeed. The checkpoint task executor is
|
||||
# responsible for reporting task failure to the local scheduler.
|
||||
self.actor_checkpoint_failed = False
|
||||
|
||||
# Automatically restrict the GPUs available to this task.
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
|
||||
@@ -1892,7 +1849,7 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker,
|
||||
worker.class_id = class_id
|
||||
# Store a list of the dummy outputs produced by actor tasks, to pin the
|
||||
# dummy outputs in the object store.
|
||||
worker.actor_pinned_objects = []
|
||||
worker.actor_pinned_objects = {}
|
||||
|
||||
# Initialize the serialization library. This registers some classes, and so
|
||||
# it must be run before we export all of the cached remote functions.
|
||||
|
||||
+8
-1
@@ -217,7 +217,14 @@ ActorID TaskSpec_actor_id(TaskSpec *spec) {
|
||||
int64_t TaskSpec_actor_counter(TaskSpec *spec) {
|
||||
CHECK(spec);
|
||||
auto message = flatbuffers::GetRoot<TaskInfo>(spec);
|
||||
return message->actor_counter();
|
||||
return std::abs(message->actor_counter());
|
||||
}
|
||||
|
||||
bool TaskSpec_actor_is_checkpoint_method(TaskSpec *spec) {
|
||||
CHECK(spec);
|
||||
auto message = flatbuffers::GetRoot<TaskInfo>(spec);
|
||||
int64_t actor_counter = message->actor_counter();
|
||||
return actor_counter < 0;
|
||||
}
|
||||
|
||||
UniqueID TaskSpec_driver_id(TaskSpec *spec) {
|
||||
|
||||
@@ -135,6 +135,8 @@ UniqueID TaskSpec_actor_id(TaskSpec *spec);
|
||||
*/
|
||||
int64_t TaskSpec_actor_counter(TaskSpec *spec);
|
||||
|
||||
bool TaskSpec_actor_is_checkpoint_method(TaskSpec *spec);
|
||||
|
||||
/**
|
||||
* Return the driver ID of the task.
|
||||
*
|
||||
|
||||
@@ -36,6 +36,14 @@ enum MessageType:int {
|
||||
PutObject
|
||||
}
|
||||
|
||||
// This message is sent from a worker to a local scheduler.
|
||||
table GetTaskRequest {
|
||||
// Whether the previously assigned task was a checkpoint task that failed.
|
||||
// If true, then the local scheduler will not update the actor's task
|
||||
// counter to match the assigned checkpoint index.
|
||||
actor_checkpoint_failed: bool;
|
||||
}
|
||||
|
||||
// This message is sent from the local scheduler to a worker.
|
||||
table GetTaskReply {
|
||||
// A string of bytes representing the task specification.
|
||||
|
||||
@@ -568,7 +568,9 @@ void assign_task_to_worker(LocalSchedulerState *state,
|
||||
}
|
||||
}
|
||||
|
||||
void finish_task(LocalSchedulerState *state, LocalSchedulerClient *worker) {
|
||||
void finish_task(LocalSchedulerState *state,
|
||||
LocalSchedulerClient *worker,
|
||||
bool actor_checkpoint_failed) {
|
||||
if (worker->task_in_progress != NULL) {
|
||||
TaskSpec *spec = Task_task_spec(worker->task_in_progress);
|
||||
/* Return dynamic resources back for the task in progress. */
|
||||
@@ -589,8 +591,11 @@ void finish_task(LocalSchedulerState *state, LocalSchedulerClient *worker) {
|
||||
}
|
||||
/* If we're connected to Redis, update tables. */
|
||||
if (state->db != NULL) {
|
||||
/* Update control state tables. */
|
||||
Task_set_state(worker->task_in_progress, TASK_STATUS_DONE);
|
||||
/* Update control state tables. If there was an error while executing a *
|
||||
* checkpoint task, report the task as lost. Else, the task succeeded. */
|
||||
int task_state =
|
||||
actor_checkpoint_failed ? TASK_STATUS_LOST : TASK_STATUS_DONE;
|
||||
Task_set_state(worker->task_in_progress, task_state);
|
||||
task_table_update(state->db, worker->task_in_progress, NULL, NULL, NULL);
|
||||
/* The call to task_table_update takes ownership of the
|
||||
* task_in_progress, so we set the pointer to NULL so it is not used. */
|
||||
@@ -734,18 +739,18 @@ void reconstruct_object_lookup_callback(
|
||||
* object table entry is up-to-date. */
|
||||
LocalSchedulerState *state = (LocalSchedulerState *) user_context;
|
||||
/* Look up the task that created the object in the result table. */
|
||||
if (!never_created && manager_vector.size() == 0) {
|
||||
/* If the object was created and later evicted, we reconstruct the object
|
||||
* if and only if there are no other instances of the task running. */
|
||||
result_table_lookup(state->db, reconstruct_object_id, NULL,
|
||||
reconstruct_evicted_result_lookup_callback,
|
||||
(void *) state);
|
||||
} else if (never_created) {
|
||||
if (never_created) {
|
||||
/* If the object has not been created yet, we reconstruct the object if and
|
||||
* only if the task that created the object failed to complete. */
|
||||
result_table_lookup(state->db, reconstruct_object_id, NULL,
|
||||
reconstruct_failed_result_lookup_callback,
|
||||
(void *) state);
|
||||
} else if (manager_vector.size() == 0) {
|
||||
/* If the object was created and later evicted, we reconstruct the object
|
||||
* if and only if there are no other instances of the task running. */
|
||||
result_table_lookup(state->db, reconstruct_object_id, NULL,
|
||||
reconstruct_evicted_result_lookup_callback,
|
||||
(void *) state);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -951,7 +956,7 @@ void process_message(event_loop *loop,
|
||||
case MessageType_TaskDone: {
|
||||
} break;
|
||||
case MessageType_DisconnectClient: {
|
||||
finish_task(state, worker);
|
||||
finish_task(state, worker, false);
|
||||
CHECK(!worker->disconnected);
|
||||
worker->disconnected = true;
|
||||
/* If the disconnected worker was not an actor, start a new worker to make
|
||||
@@ -977,13 +982,16 @@ void process_message(event_loop *loop,
|
||||
} break;
|
||||
case MessageType_GetTask: {
|
||||
/* If this worker reports a completed task, account for resources. */
|
||||
finish_task(state, worker);
|
||||
auto message = flatbuffers::GetRoot<GetTaskRequest>(input);
|
||||
bool actor_checkpoint_failed = message->actor_checkpoint_failed();
|
||||
finish_task(state, worker, actor_checkpoint_failed);
|
||||
/* Let the scheduling algorithm process the fact that there is an available
|
||||
* worker. */
|
||||
if (ActorID_equal(worker->actor_id, NIL_ACTOR_ID)) {
|
||||
handle_worker_available(state, state->algorithm_state, worker);
|
||||
} else {
|
||||
handle_actor_worker_available(state, state->algorithm_state, worker);
|
||||
handle_actor_worker_available(state, state->algorithm_state, worker,
|
||||
actor_checkpoint_failed);
|
||||
}
|
||||
} break;
|
||||
case MessageType_ReconstructObject: {
|
||||
|
||||
@@ -56,9 +56,13 @@ void assign_task_to_worker(LocalSchedulerState *state,
|
||||
*
|
||||
* @param state The local scheduler state.
|
||||
* @param worker The worker that finished the task.
|
||||
* @param actor_checkpoint_failed If the last task assigned was a checkpoint
|
||||
* task that failed.
|
||||
* @return Void.
|
||||
*/
|
||||
void finish_task(LocalSchedulerState *state, LocalSchedulerClient *worker);
|
||||
void finish_task(LocalSchedulerState *state,
|
||||
LocalSchedulerClient *worker,
|
||||
bool actor_checkpoint_failed);
|
||||
|
||||
/**
|
||||
* This is the callback that is used to process a notification from the Plasma
|
||||
|
||||
@@ -53,6 +53,10 @@ typedef struct {
|
||||
* restrict the submission of tasks on actors to the process that created the
|
||||
* actor. */
|
||||
int64_t task_counter;
|
||||
/** The index of the task assigned to this actor. Set to -1 if no task is
|
||||
* currently assigned. If the actor process reports back success for the
|
||||
* assigned task execution, task_counter should be set to this value. */
|
||||
int64_t assigned_task_counter;
|
||||
/** A queue of tasks to be executed on this actor. The tasks will be sorted by
|
||||
* the order of their actor counters. */
|
||||
std::list<TaskQueueEntry> *task_queue;
|
||||
@@ -234,6 +238,7 @@ void create_actor(SchedulingAlgorithmState *algorithm_state,
|
||||
LocalSchedulerClient *worker) {
|
||||
LocalActorInfo entry;
|
||||
entry.task_counter = 0;
|
||||
entry.assigned_task_counter = -1;
|
||||
entry.task_queue = new std::list<TaskQueueEntry>();
|
||||
entry.worker = worker;
|
||||
entry.worker_available = false;
|
||||
@@ -309,39 +314,56 @@ bool dispatch_actor_task(LocalSchedulerState *state,
|
||||
|
||||
/* There should be some queued tasks for this actor. */
|
||||
CHECK(!entry.task_queue->empty());
|
||||
|
||||
TaskQueueEntry first_task = entry.task_queue->front();
|
||||
int64_t next_task_counter = TaskSpec_actor_counter(first_task.spec);
|
||||
if (next_task_counter != entry.task_counter) {
|
||||
/* We cannot execute the next task on this actor without violating the
|
||||
* in-order execution guarantee for actor tasks. */
|
||||
CHECK(next_task_counter > entry.task_counter);
|
||||
return false;
|
||||
}
|
||||
/* If the worker is not available, we cannot assign a task to it. */
|
||||
if (!entry.worker_available) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Find the first task that either matches the task counter or that is a
|
||||
* checkpoint method. Remove any tasks that we have already executed past
|
||||
* (e.g., by executing a more recent checkpoint method). */
|
||||
auto task = entry.task_queue->begin();
|
||||
int64_t next_task_counter = TaskSpec_actor_counter(task->spec);
|
||||
while (next_task_counter != entry.task_counter) {
|
||||
if (next_task_counter < entry.task_counter) {
|
||||
/* A task that we have already executed past. Remove it. */
|
||||
task = entry.task_queue->erase(task);
|
||||
/* If there are no more tasks in the queue, wait. */
|
||||
if (task == entry.task_queue->end()) {
|
||||
algorithm_state->actors_with_pending_tasks.erase(actor_id);
|
||||
return false;
|
||||
}
|
||||
/* Move on to the next task. */
|
||||
next_task_counter = TaskSpec_actor_counter(task->spec);
|
||||
} else if (TaskSpec_actor_is_checkpoint_method(task->spec)) {
|
||||
/* A later task that is a checkpoint method. Checkpoint methods can
|
||||
* always be executed. */
|
||||
break;
|
||||
} else {
|
||||
/* A later task that is not a checkpoint. Wait for the preceding tasks to
|
||||
* execute. */
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/* If there are not enough resources available, we cannot assign the task. */
|
||||
CHECK(0 ==
|
||||
TaskSpec_get_required_resource(first_task.spec, ResourceIndex_GPU));
|
||||
CHECK(0 == TaskSpec_get_required_resource(task->spec, ResourceIndex_GPU));
|
||||
if (!check_dynamic_resources(
|
||||
state,
|
||||
TaskSpec_get_required_resource(first_task.spec, ResourceIndex_CPU), 0,
|
||||
TaskSpec_get_required_resource(first_task.spec,
|
||||
ResourceIndex_CustomResource))) {
|
||||
state, TaskSpec_get_required_resource(task->spec, ResourceIndex_CPU),
|
||||
0, TaskSpec_get_required_resource(task->spec,
|
||||
ResourceIndex_CustomResource))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Assign the first task in the task queue to the worker and mark the worker
|
||||
* as unavailable. */
|
||||
entry.task_counter += 1;
|
||||
assign_task_to_worker(state, first_task.spec, first_task.task_spec_size,
|
||||
entry.worker);
|
||||
assign_task_to_worker(state, task->spec, task->task_spec_size, entry.worker);
|
||||
entry.assigned_task_counter = next_task_counter;
|
||||
entry.worker_available = false;
|
||||
/* Free the task queue entry. */
|
||||
TaskQueueEntry_free(&first_task);
|
||||
TaskQueueEntry_free(&(*task));
|
||||
/* Remove the task from the actor's task queue. */
|
||||
entry.task_queue->pop_front();
|
||||
entry.task_queue->erase(task);
|
||||
|
||||
/* If there are no more tasks in the queue, then indicate that the actor has
|
||||
* no tasks. */
|
||||
@@ -414,12 +436,11 @@ void add_task_to_actor_queue(LocalSchedulerState *state,
|
||||
* guaranteeing in-order execution of the tasks on the actor). TODO(rkn): This
|
||||
* check will fail if the fault-tolerance mechanism resubmits a task on an
|
||||
* actor. */
|
||||
bool task_is_redundant = false;
|
||||
if (task_counter < entry.task_counter) {
|
||||
LOG_INFO(
|
||||
"A task that has already been executed has been resubmitted, so we "
|
||||
"are ignoring it. This should only happen during reconstruction.");
|
||||
task_is_redundant = true;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Create a new task queue entry. */
|
||||
@@ -437,32 +458,51 @@ void add_task_to_actor_queue(LocalSchedulerState *state,
|
||||
if (it != entry.task_queue->end() &&
|
||||
task_counter == TaskSpec_actor_counter(it->spec)) {
|
||||
LOG_INFO(
|
||||
"A task that has already been executed has been resubmitted, so we "
|
||||
"are ignoring it. This should only happen during reconstruction.");
|
||||
task_is_redundant = true;
|
||||
"A task was resubmitted, so we are ignoring it. This should only "
|
||||
"happen during reconstruction.");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!task_is_redundant) {
|
||||
entry.task_queue->insert(it, elt);
|
||||
/* The task has a counter that has not been executed or submitted before. Add
|
||||
* it to the actor queue. */
|
||||
entry.task_queue->insert(it, elt);
|
||||
|
||||
/* Update the task table. */
|
||||
if (state->db != NULL) {
|
||||
Task *task = Task_alloc(spec, task_spec_size, TASK_STATUS_QUEUED,
|
||||
get_db_client_id(state->db));
|
||||
if (from_global_scheduler) {
|
||||
/* If the task is from the global scheduler, it's already been added to
|
||||
* the task table, so just update the entry. */
|
||||
task_table_update(state->db, task, NULL, NULL, NULL);
|
||||
} else {
|
||||
/* Otherwise, this is the first time the task has been seen in the
|
||||
* system (unless it's a resubmission of a previous task), so add the
|
||||
* entry. */
|
||||
task_table_add_task(state->db, task, NULL, NULL, NULL);
|
||||
/* Update the task table. */
|
||||
if (state->db != NULL) {
|
||||
Task *task = Task_alloc(spec, task_spec_size, TASK_STATUS_QUEUED,
|
||||
get_db_client_id(state->db));
|
||||
if (from_global_scheduler) {
|
||||
/* If the task is from the global scheduler, it's already been added to
|
||||
* the task table, so just update the entry. */
|
||||
task_table_update(state->db, task, NULL, NULL, NULL);
|
||||
} else {
|
||||
/* Otherwise, this is the first time the task has been seen in the
|
||||
* system (unless it's a resubmission of a previous task), so add the
|
||||
* entry. */
|
||||
task_table_add_task(state->db, task, NULL, NULL, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
/* Record the fact that this actor has a task waiting to execute. */
|
||||
algorithm_state->actors_with_pending_tasks.insert(actor_id);
|
||||
|
||||
/* Register a missing dependency on the preceding task. TODO(swang): Unify
|
||||
* with `fetch_missing_dependencies` for non-actor tasks. */
|
||||
if (entry.task_counter != task_counter) {
|
||||
int64_t num_args = TaskSpec_num_args(spec);
|
||||
/* The last argument represents dependency on a preceding task. If it is by
|
||||
* reference, then it is an explicit dependency. */
|
||||
if (TaskSpec_arg_by_ref(spec, num_args - 1)) {
|
||||
ObjectID dummy_object_id = TaskSpec_arg_id(spec, num_args - 1);
|
||||
if (algorithm_state->local_objects.count(dummy_object_id) == 0) {
|
||||
ObjectEntry entry;
|
||||
/* TODO(swang): Objects in `remote_objects` will get fetched from
|
||||
* remote plasma managers. Do not fetch actor dummy objects. Otherwise,
|
||||
* if the plasma manager associated with the dead local scheduler is
|
||||
* still alive, reconstruction will never complete. */
|
||||
state->algorithm_state->remote_objects[dummy_object_id] = entry;
|
||||
}
|
||||
}
|
||||
|
||||
/* Record the fact that this actor has a task waiting to execute. */
|
||||
algorithm_state->actors_with_pending_tasks.insert(actor_id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1202,7 +1242,8 @@ void handle_actor_worker_disconnect(LocalSchedulerState *state,
|
||||
|
||||
void handle_actor_worker_available(LocalSchedulerState *state,
|
||||
SchedulingAlgorithmState *algorithm_state,
|
||||
LocalSchedulerClient *worker) {
|
||||
LocalSchedulerClient *worker,
|
||||
bool actor_checkpoint_failed) {
|
||||
ActorID actor_id = worker->actor_id;
|
||||
CHECK(!ActorID_equal(actor_id, NIL_ACTOR_ID));
|
||||
/* Get the actor info for this worker. */
|
||||
@@ -1212,6 +1253,13 @@ void handle_actor_worker_available(LocalSchedulerState *state,
|
||||
|
||||
CHECK(worker == entry.worker);
|
||||
CHECK(!entry.worker_available);
|
||||
/* If the assigned task was not a checkpoint task, or if it was but it
|
||||
* loaded the checkpoint successfully, then we update the actor's counter
|
||||
* to the assigned counter. */
|
||||
if (!actor_checkpoint_failed) {
|
||||
entry.task_counter = entry.assigned_task_counter + 1;
|
||||
}
|
||||
entry.assigned_task_counter = -1;
|
||||
entry.worker_available = true;
|
||||
/* Assign new tasks if possible. */
|
||||
dispatch_all_tasks(state, algorithm_state);
|
||||
|
||||
@@ -178,12 +178,15 @@ void handle_worker_removed(LocalSchedulerState *state,
|
||||
*
|
||||
* @param state The state of the local scheduler.
|
||||
* @param algorithm_state State maintained by the scheduling algorithm.
|
||||
* @param wi Information about the worker that is available.
|
||||
* @param worker The worker that is available.
|
||||
* @param actor_checkpoint_failed If the last task assigned was a checkpoint
|
||||
* task that failed.
|
||||
* @return Void.
|
||||
*/
|
||||
void handle_actor_worker_available(LocalSchedulerState *state,
|
||||
SchedulingAlgorithmState *algorithm_state,
|
||||
LocalSchedulerClient *worker);
|
||||
LocalSchedulerClient *worker,
|
||||
bool actor_checkpoint_failed);
|
||||
|
||||
/**
|
||||
* Handle the fact that a new worker is available for running an actor.
|
||||
|
||||
@@ -95,14 +95,19 @@ void local_scheduler_submit(LocalSchedulerConnection *conn,
|
||||
}
|
||||
|
||||
TaskSpec *local_scheduler_get_task(LocalSchedulerConnection *conn,
|
||||
int64_t *task_size) {
|
||||
write_message(conn->conn, MessageType_GetTask, 0, NULL);
|
||||
int64_t *task_size,
|
||||
bool actor_checkpoint_failed) {
|
||||
flatbuffers::FlatBufferBuilder fbb;
|
||||
auto message = CreateGetTaskRequest(fbb, actor_checkpoint_failed);
|
||||
fbb.Finish(message);
|
||||
write_message(conn->conn, MessageType_GetTask, fbb.GetSize(),
|
||||
fbb.GetBufferPointer());
|
||||
int64_t type;
|
||||
int64_t message_size;
|
||||
uint8_t *message;
|
||||
int64_t reply_size;
|
||||
uint8_t *reply;
|
||||
/* Receive a task from the local scheduler. This will block until the local
|
||||
* scheduler gives this client a task. */
|
||||
read_message(conn->conn, &type, &message_size, &message);
|
||||
read_message(conn->conn, &type, &reply_size, &reply);
|
||||
if (type == DISCONNECT_CLIENT) {
|
||||
LOG_WARN("Exiting because local scheduler closed connection.");
|
||||
exit(1);
|
||||
@@ -110,7 +115,7 @@ TaskSpec *local_scheduler_get_task(LocalSchedulerConnection *conn,
|
||||
CHECK(type == MessageType_ExecuteTask);
|
||||
|
||||
/* Parse the flatbuffer object. */
|
||||
auto reply_message = flatbuffers::GetRoot<GetTaskReply>(message);
|
||||
auto reply_message = flatbuffers::GetRoot<GetTaskReply>(reply);
|
||||
|
||||
/* Set the GPU IDs for this task. We only do this for non-actor tasks because
|
||||
* for actors the GPUs are associated with the actor itself and not with the
|
||||
@@ -127,7 +132,7 @@ TaskSpec *local_scheduler_get_task(LocalSchedulerConnection *conn,
|
||||
TaskSpec *data = (TaskSpec *) reply_message->task_spec()->data();
|
||||
TaskSpec *spec = TaskSpec_copy(data, *task_size);
|
||||
/* Free the original message from the local scheduler. */
|
||||
free(message);
|
||||
free(reply);
|
||||
/* Return the copy of the task spec and pass ownership to the caller. */
|
||||
return spec;
|
||||
}
|
||||
|
||||
@@ -94,10 +94,14 @@ void local_scheduler_log_event(LocalSchedulerConnection *conn,
|
||||
* @todo When does this actually get freed?
|
||||
*
|
||||
* @param conn The connection information.
|
||||
* @param task_size A pointer to fill out with the task size.
|
||||
* @param actor_checkpoint_failed If the last task assigned was a checkpoint
|
||||
* task that failed.
|
||||
* @return The address of the assigned task.
|
||||
*/
|
||||
TaskSpec *local_scheduler_get_task(LocalSchedulerConnection *conn,
|
||||
int64_t *task_size);
|
||||
int64_t *task_size,
|
||||
bool actor_checkpoint_failed);
|
||||
|
||||
/**
|
||||
* Tell the local scheduler that the client has finished executing a task.
|
||||
|
||||
@@ -59,14 +59,25 @@ static PyObject *PyLocalSchedulerClient_submit(PyObject *self, PyObject *args) {
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
static PyObject *PyLocalSchedulerClient_get_task(PyObject *self) {
|
||||
static PyObject *PyLocalSchedulerClient_get_task(PyObject *self, PyObject *args) {
|
||||
PyObject *py_actor_checkpoint_failed = NULL;
|
||||
if (!PyArg_ParseTuple(args, "|O", &py_actor_checkpoint_failed)) {
|
||||
return NULL;
|
||||
}
|
||||
TaskSpec *task_spec;
|
||||
int64_t task_size;
|
||||
/* If no argument for actor_checkpoint_failed was provided, default to false,
|
||||
* since we assume that there was no previous task. */
|
||||
bool actor_checkpoint_failed = false;
|
||||
if (py_actor_checkpoint_failed != NULL) {
|
||||
actor_checkpoint_failed = (bool) PyObject_IsTrue(py_actor_checkpoint_failed);
|
||||
}
|
||||
/* Drop the global interpreter lock while we get a task because
|
||||
* local_scheduler_get_task may block for a long time. */
|
||||
int64_t task_size;
|
||||
Py_BEGIN_ALLOW_THREADS
|
||||
task_spec = local_scheduler_get_task(
|
||||
((PyLocalSchedulerClient *) self)->local_scheduler_connection, &task_size);
|
||||
((PyLocalSchedulerClient *) self)->local_scheduler_connection,
|
||||
&task_size, actor_checkpoint_failed);
|
||||
Py_END_ALLOW_THREADS
|
||||
return PyTask_make(task_spec, task_size);
|
||||
}
|
||||
@@ -138,7 +149,7 @@ static PyMethodDef PyLocalSchedulerClient_methods[] = {
|
||||
"Notify the local scheduler that this client is exiting gracefully."},
|
||||
{"submit", (PyCFunction) PyLocalSchedulerClient_submit, METH_VARARGS,
|
||||
"Submit a task to the local scheduler."},
|
||||
{"get_task", (PyCFunction) PyLocalSchedulerClient_get_task, METH_NOARGS,
|
||||
{"get_task", (PyCFunction) PyLocalSchedulerClient_get_task, METH_VARARGS,
|
||||
"Get a task from the local scheduler."},
|
||||
{"reconstruct_object",
|
||||
(PyCFunction) PyLocalSchedulerClient_reconstruct_object, METH_VARARGS,
|
||||
|
||||
@@ -210,12 +210,12 @@ TEST object_reconstruction_test(void) {
|
||||
int64_t task_assigned_size;
|
||||
local_scheduler_submit(worker, spec, task_size);
|
||||
TaskSpec *task_assigned =
|
||||
local_scheduler_get_task(worker, &task_assigned_size);
|
||||
local_scheduler_get_task(worker, &task_assigned_size, true);
|
||||
ASSERT_EQ(memcmp(task_assigned, spec, task_size), 0);
|
||||
ASSERT_EQ(task_assigned_size, task_size);
|
||||
int64_t reconstruct_task_size;
|
||||
TaskSpec *reconstruct_task =
|
||||
local_scheduler_get_task(worker, &reconstruct_task_size);
|
||||
local_scheduler_get_task(worker, &reconstruct_task_size, true);
|
||||
ASSERT_EQ(memcmp(reconstruct_task, spec, task_size), 0);
|
||||
ASSERT_EQ(reconstruct_task_size, task_size);
|
||||
/* Clean up. */
|
||||
@@ -315,7 +315,8 @@ TEST object_reconstruction_recursive_test(void) {
|
||||
/* Make sure we receive each task from the initial submission. */
|
||||
for (int i = 0; i < NUM_TASKS; ++i) {
|
||||
int64_t task_size;
|
||||
TaskSpec *task_assigned = local_scheduler_get_task(worker, &task_size);
|
||||
TaskSpec *task_assigned =
|
||||
local_scheduler_get_task(worker, &task_size, true);
|
||||
ASSERT_EQ(memcmp(task_assigned, specs[i], task_sizes[i]), 0);
|
||||
ASSERT_EQ(task_size, task_sizes[i]);
|
||||
free(task_assigned);
|
||||
@@ -325,7 +326,7 @@ TEST object_reconstruction_recursive_test(void) {
|
||||
for (int i = 0; i < NUM_TASKS; ++i) {
|
||||
int64_t task_assigned_size;
|
||||
TaskSpec *task_assigned =
|
||||
local_scheduler_get_task(worker, &task_assigned_size);
|
||||
local_scheduler_get_task(worker, &task_assigned_size, true);
|
||||
bool found = false;
|
||||
for (int j = 0; j < NUM_TASKS; ++j) {
|
||||
if (specs[j] == NULL) {
|
||||
@@ -410,7 +411,7 @@ TEST object_reconstruction_suppression_test(void) {
|
||||
* object_table_add callback completes. */
|
||||
int64_t task_assigned_size;
|
||||
TaskSpec *task_assigned =
|
||||
local_scheduler_get_task(worker, &task_assigned_size);
|
||||
local_scheduler_get_task(worker, &task_assigned_size, true);
|
||||
ASSERT_EQ(memcmp(task_assigned, object_reconstruction_suppression_spec,
|
||||
object_reconstruction_suppression_size),
|
||||
0);
|
||||
|
||||
+109
-25
@@ -1191,12 +1191,8 @@ class ActorReconstruction(unittest.TestCase):
|
||||
# Wait for the last task to finish running.
|
||||
ray.get(ids[-1])
|
||||
|
||||
# Kill the second local scheduler.
|
||||
process = ray.services.all_processes[
|
||||
ray.services.PROCESS_TYPE_LOCAL_SCHEDULER][1]
|
||||
process.kill()
|
||||
process.wait()
|
||||
# Kill the corresponding plasma store to get rid of the cached objects.
|
||||
# Kill the second plasma store to get rid of the cached objects and
|
||||
# trigger the corresponding local scheduler to exit.
|
||||
process = ray.services.all_processes[
|
||||
ray.services.PROCESS_TYPE_PLASMA_STORE][1]
|
||||
process.kill()
|
||||
@@ -1253,14 +1249,10 @@ class ActorReconstruction(unittest.TestCase):
|
||||
for _ in range(num_function_calls_at_a_time):
|
||||
result_ids[actor].append(
|
||||
actor.inc.remote(j ** 2 * 0.000001))
|
||||
# Kill a local scheduler. Don't kill the first local scheduler
|
||||
# since that is the one that the driver is connected to.
|
||||
process = ray.services.all_processes[
|
||||
ray.services.PROCESS_TYPE_LOCAL_SCHEDULER][i + 1]
|
||||
process.kill()
|
||||
process.wait()
|
||||
# Kill the corresponding plasma store to get rid of the cached
|
||||
# objects.
|
||||
# Kill a plasma store to get rid of the cached objects and trigger
|
||||
# exit of the corresponding local scheduler. Don't kill the first
|
||||
# local scheduler since that is the one that the driver is
|
||||
# connected to.
|
||||
process = ray.services.all_processes[
|
||||
ray.services.PROCESS_TYPE_PLASMA_STORE][i + 1]
|
||||
process.kill()
|
||||
@@ -1280,19 +1272,21 @@ class ActorReconstruction(unittest.TestCase):
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
@unittest.skip("Skipping until checkpointing is integrated with object "
|
||||
"lineage.")
|
||||
def testCheckpointing(self):
|
||||
def setup_test_checkpointing(self, save_exception=False,
|
||||
resume_exception=False):
|
||||
ray.worker._init(start_ray_local=True, num_local_schedulers=2,
|
||||
num_workers=0, redirect_output=True)
|
||||
|
||||
@ray.remote(checkpoint_interval=5)
|
||||
class Counter(object):
|
||||
def __init__(self):
|
||||
_resume_exception = resume_exception
|
||||
|
||||
def __init__(self, save_exception):
|
||||
self.x = 0
|
||||
# The number of times that inc has been called. We won't bother
|
||||
# restoring this in the checkpoint
|
||||
self.num_inc_calls = 0
|
||||
self.save_exception = save_exception
|
||||
|
||||
def local_plasma(self):
|
||||
return ray.worker.global_worker.plasma_client.store_socket_name
|
||||
@@ -1310,9 +1304,13 @@ class ActorReconstruction(unittest.TestCase):
|
||||
return self.y
|
||||
|
||||
def __ray_save__(self):
|
||||
if self.save_exception:
|
||||
raise Exception("Exception raised in checkpoint save")
|
||||
return self.x, -1
|
||||
|
||||
def __ray_restore__(self, checkpoint):
|
||||
if self._resume_exception:
|
||||
raise Exception("Exception raised in checkpoint resume")
|
||||
self.x, val = checkpoint
|
||||
self.num_inc_calls = 0
|
||||
# Test that __ray_save__ has been run.
|
||||
@@ -1322,21 +1320,20 @@ class ActorReconstruction(unittest.TestCase):
|
||||
local_plasma = ray.worker.global_worker.plasma_client.store_socket_name
|
||||
|
||||
# Create an actor that is not on the local scheduler.
|
||||
actor = Counter.remote()
|
||||
actor = Counter.remote(save_exception)
|
||||
while ray.get(actor.local_plasma.remote()) == local_plasma:
|
||||
actor = Counter.remote()
|
||||
actor = Counter.remote(save_exception)
|
||||
|
||||
args = [ray.put(0) for _ in range(100)]
|
||||
ids = [actor.inc.remote(*args[i:]) for i in range(100)]
|
||||
|
||||
return actor, ids
|
||||
|
||||
def testCheckpointing(self):
|
||||
actor, ids = self.setup_test_checkpointing()
|
||||
# Wait for the last task to finish running.
|
||||
ray.get(ids[-1])
|
||||
|
||||
# Kill the second local scheduler.
|
||||
process = ray.services.all_processes[
|
||||
ray.services.PROCESS_TYPE_LOCAL_SCHEDULER][1]
|
||||
process.kill()
|
||||
process.wait()
|
||||
# Kill the corresponding plasma store to get rid of the cached objects.
|
||||
process = ray.services.all_processes[
|
||||
ray.services.PROCESS_TYPE_PLASMA_STORE][1]
|
||||
@@ -1355,6 +1352,93 @@ class ActorReconstruction(unittest.TestCase):
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
def testLostCheckpoint(self):
|
||||
actor, ids = self.setup_test_checkpointing()
|
||||
# Wait for the first fraction of tasks to finish running.
|
||||
ray.get(ids[len(ids) // 10])
|
||||
|
||||
actor_key = b"Actor:" + actor._ray_actor_id.id()
|
||||
for index in ray.actor.get_checkpoint_indices(
|
||||
ray.worker.global_worker, actor._ray_actor_id.id()):
|
||||
ray.worker.global_worker.redis_client.hdel(
|
||||
actor_key, "checkpoint_{}".format(index))
|
||||
|
||||
# Kill the corresponding plasma store to get rid of the cached objects.
|
||||
process = ray.services.all_processes[
|
||||
ray.services.PROCESS_TYPE_PLASMA_STORE][1]
|
||||
process.kill()
|
||||
process.wait()
|
||||
|
||||
self.assertEqual(ray.get(actor.inc.remote()), 101)
|
||||
|
||||
# Each inc method has been reexecuted once on the new actor.
|
||||
self.assertEqual(ray.get(actor.get_num_inc_calls.remote()), 101)
|
||||
# Get all of the results that were previously lost. Because the
|
||||
# checkpoints were lost, all methods should be reconstructed.
|
||||
results = ray.get(ids)
|
||||
self.assertEqual(results, list(range(1, 1 + len(results))))
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
def testCheckpointException(self):
|
||||
actor, ids = self.setup_test_checkpointing(save_exception=True)
|
||||
# Wait for the last task to finish running.
|
||||
ray.get(ids[-1])
|
||||
|
||||
# Kill the corresponding plasma store to get rid of the cached objects.
|
||||
process = ray.services.all_processes[
|
||||
ray.services.PROCESS_TYPE_PLASMA_STORE][1]
|
||||
process.kill()
|
||||
process.wait()
|
||||
|
||||
self.assertEqual(ray.get(actor.inc.remote()), 101)
|
||||
# Each inc method has been reexecuted once on the new actor, since all
|
||||
# checkpoint saves failed.
|
||||
self.assertEqual(ray.get(actor.get_num_inc_calls.remote()), 101)
|
||||
# Get all of the results that were previously lost. Because the
|
||||
# checkpoints were lost, all methods should be reconstructed.
|
||||
results = ray.get(ids)
|
||||
self.assertEqual(results, list(range(1, 1 + len(results))))
|
||||
|
||||
errors = ray.error_info()
|
||||
# We submitted 101 tasks with a checkpoint interval of 5.
|
||||
num_checkpoints = 101 // 5
|
||||
# Each checkpoint task throws an exception when saving during initial
|
||||
# execution, and then again during re-execution.
|
||||
self.assertEqual(len([error for error in errors if error[b"type"] ==
|
||||
b"task"]), num_checkpoints * 2)
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
def testCheckpointResumeException(self):
|
||||
actor, ids = self.setup_test_checkpointing(resume_exception=True)
|
||||
# Wait for the last task to finish running.
|
||||
ray.get(ids[-1])
|
||||
|
||||
# Kill the corresponding plasma store to get rid of the cached objects.
|
||||
process = ray.services.all_processes[
|
||||
ray.services.PROCESS_TYPE_PLASMA_STORE][1]
|
||||
process.kill()
|
||||
process.wait()
|
||||
|
||||
self.assertEqual(ray.get(actor.inc.remote()), 101)
|
||||
# Each inc method has been reexecuted once on the new actor, since all
|
||||
# checkpoint resumes failed.
|
||||
self.assertEqual(ray.get(actor.get_num_inc_calls.remote()), 101)
|
||||
# Get all of the results that were previously lost. Because the
|
||||
# checkpoints were lost, all methods should be reconstructed.
|
||||
results = ray.get(ids)
|
||||
self.assertEqual(results, list(range(1, 1 + len(results))))
|
||||
|
||||
errors = ray.error_info()
|
||||
# The most recently executed checkpoint task should throw an exception
|
||||
# when trying to resume. All other checkpoint tasks should reconstruct
|
||||
# the previous task but throw no errors.
|
||||
self.assertEqual(len([error for error in errors if error[b"type"] ==
|
||||
b"task"]), 1)
|
||||
|
||||
ray.worker.cleanup()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
|
||||
Reference in New Issue
Block a user