mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 03:18:59 +08:00
Actor checkpointing with object lineage reconstruction (#1004)
* Worker reports error in previous task, actor task counter is incremented after task is successful * Refactor actor task execution - Return new task counter in GetTaskRequest - Update worker state for actor tasks inside of the actor method executor * Manually invoked checkpoint method * Scheduling for actor checkpoint methods * Fix python bugs in checkpointing * Return task success from worker to local scheduler instead of actor counter * Kill local schedulers halfway through actor execution instead of waiting for all tasks to execute once * Remove redundant actor tasks during dispatch, reconstruct missing dependencies for actor tasks * Make executor for temporary actor methods * doc * Set default argument for whether the previous task was a success * Refactor actor method call * Simplify checkpoint task submission * lint * fix philipp's comments * Add missing line * Make actor reconstruction tests run faster * Unimportant whitespace. * Unimportant whitespace. * Update checkpoint method signature * Documentation and handle exceptions during checkpoint save/resume * Rename get_task message field to actor_checkpoint_failed * Fix bug. * Remove debugging check, redirect test output
This commit is contained in:
committed by
Robert Nishihara
parent
b585001881
commit
3764f2f2e1
+304
-48
@@ -7,8 +7,10 @@ import copy
|
||||
import hashlib
|
||||
import inspect
|
||||
import json
|
||||
import numpy as np
|
||||
import traceback
|
||||
|
||||
import pyarrow.plasma as plasma
|
||||
import ray.local_scheduler
|
||||
import ray.signature as signature
|
||||
import ray.worker
|
||||
@@ -40,12 +42,31 @@ def get_actor_method_function_id(attr):
|
||||
return ray.local_scheduler.ObjectID(function_id)
|
||||
|
||||
|
||||
def get_actor_checkpoint(actor_id, worker):
|
||||
def get_checkpoint_indices(worker, actor_id):
    """List the indices of all checkpoints stored for an actor.

    The fields of the Redis hash ``b"Actor:" + actor_id`` are scanned, and
    the integer suffix of every field named ``checkpoint_<index>`` is
    collected.

    Args:
        worker: The worker whose Redis client is used for the lookup.
        actor_id: The actor ID of the actor whose checkpoints are listed.

    Returns:
        The indices of the existing checkpoints as a list of integers, in
        the order in which Redis reports the hash fields.
    """
    prefix = b"checkpoint_"
    hash_fields = worker.redis_client.hkeys(b"Actor:" + actor_id)
    # Only fields carrying the checkpoint prefix hold checkpoints; the rest
    # of the field name is the checkpoint index.
    return [int(field[len(prefix):])
            for field in hash_fields
            if field.startswith(prefix)]
|
||||
|
||||
|
||||
def get_actor_checkpoint(worker, actor_id):
|
||||
"""Get the most recent checkpoint associated with a given actor ID.
|
||||
|
||||
Args:
|
||||
actor_id: The actor ID of the actor to get the checkpoint for.
|
||||
worker: The worker to use to get the checkpoint.
|
||||
actor_id: The actor ID of the actor to get the checkpoint for.
|
||||
|
||||
Returns:
|
||||
If a checkpoint exists, this returns a tuple of the checkpoint index
|
||||
@@ -53,18 +74,103 @@ def get_actor_checkpoint(actor_id, worker):
|
||||
index is the actor counter of the last task that was executed on
|
||||
the actor before the checkpoint was made.
|
||||
"""
|
||||
# Get all of the keys associated with checkpoints for this actor.
|
||||
actor_key = b"Actor:" + actor_id
|
||||
checkpoint_indices = [int(key[len(b"checkpoint_"):])
|
||||
for key in worker.redis_client.hkeys(actor_key)
|
||||
if key.startswith(b"checkpoint_")]
|
||||
checkpoint_indices = get_checkpoint_indices(worker, actor_id)
|
||||
if len(checkpoint_indices) == 0:
|
||||
return -1, None
|
||||
most_recent_checkpoint_index = max(checkpoint_indices)
|
||||
# Get the most recent checkpoint.
|
||||
checkpoint = worker.redis_client.hget(
|
||||
actor_key, "checkpoint_{}".format(most_recent_checkpoint_index))
|
||||
return most_recent_checkpoint_index, checkpoint
|
||||
else:
|
||||
actor_key = b"Actor:" + actor_id
|
||||
checkpoint_index = max(checkpoint_indices)
|
||||
checkpoint = worker.redis_client.hget(
|
||||
actor_key, "checkpoint_{}".format(checkpoint_index))
|
||||
return checkpoint_index, checkpoint
|
||||
|
||||
|
||||
def put_dummy_object(worker, dummy_object_id):
    """Store a dummy actor object locally and pin it on the worker.

    A one-element numpy array of zeros is put into the object store under
    `dummy_object_id`. The object is then read back, and the fetched value
    is kept in `worker.actor_pinned_objects` so that it stays referenced.

    Dummy objects encode the stateful dependency between consecutive actor
    method calls, so this function should be called for every actor method
    execution that updates the actor's internal state.

    Args:
        worker: The worker to use to perform the put.
        dummy_object_id: The object ID of the dummy object.
    """
    # TODO(swang): A numpy array is used as a hack to pin the object in the
    # object store. Once the store supports object pinning, `None` may be
    # stored instead.
    worker.put_object(dummy_object_id, np.zeros(1))
    # Hold on to the fetched object for the lifetime of the actor, to
    # prevent eviction from the object store.
    fetched_objects = worker.get_object([dummy_object_id])
    worker.actor_pinned_objects[dummy_object_id] = fetched_objects[0]
|
||||
|
||||
|
||||
def is_checkpoint_task(task_counter, checkpoint_interval):
    """Decide whether a task counter falls on a checkpoint slot.

    Args:
        task_counter: The counter value to test.
        checkpoint_interval: The spacing between checkpoint slots. A value
            that is zero or negative means there are no checkpoint slots.

    Returns:
        True if `checkpoint_interval` is positive and `task_counter` is a
        multiple of it, False otherwise.
    """
    if checkpoint_interval > 0:
        return task_counter % checkpoint_interval == 0
    return False
|
||||
|
||||
|
||||
def make_actor_method_executor(worker, method_name, method):
    """Build the executor that runs a user-defined actor method.

    The returned executor invokes `method` and keeps the worker's
    bookkeeping in step with it. For an ordinary method, the dummy object
    is put into the object store and the worker's task counter is advanced
    before the method runs. For `__ray_checkpoint__`, this bookkeeping only
    happens if the checkpoint task reports success, and the failure flag is
    recorded on the worker so that it can be reported to the local
    scheduler.

    Args:
        worker (Worker): The worker that is executing the actor.
        method_name (str): The name of the actor method.
        method (instancemethod): The actor method to wrap. This should be a
            method defined on the actor class and should therefore take an
            instance of the actor as the first argument.

    Returns:
        A function that executes the given actor method on the actor
        instance passed to it and updates the worker's internal state to
        record the executed method.
    """

    def actor_method_executor(dummy_return_id, task_counter, actor,
                              *args):
        # The trailing argument is a dummy that only represents the
        # dependency on the previous actor task. Strip it before the call.
        call_args = args[:-1]

        if method_name != "__ray_checkpoint__":
            # Record the task on the worker first, so that the state is
            # updated even if the method throws an exception.
            put_dummy_object(worker, dummy_return_id)
            worker.actor_task_counter = task_counter + 1
            return method(actor, *call_args)

        # Checkpoint task: it returns a failure flag and an optional error.
        checkpoint_failed, checkpoint_error = method(actor, *call_args)
        if not checkpoint_failed:
            # The task following the checkpoint may only run once the dummy
            # object exists and the task counter has been advanced.
            put_dummy_object(worker, dummy_return_id)
            worker.actor_task_counter = task_counter + 1
        # Remember the outcome so that it is reported to the local
        # scheduler.
        worker.actor_checkpoint_failed = checkpoint_failed
        # Surface an exception from the checkpoint method only after the
        # worker state has been updated.
        if checkpoint_error is not None:
            raise checkpoint_error
        return None

    return actor_method_executor
|
||||
|
||||
|
||||
def fetch_and_register_actor(actor_class_key, worker):
|
||||
@@ -100,8 +206,11 @@ def fetch_and_register_actor(actor_class_key, worker):
|
||||
"cannot execute this method".format(actor_name))
|
||||
for actor_method_name in actor_method_names:
|
||||
function_id = get_actor_method_function_id(actor_method_name).id()
|
||||
temporary_executor = make_actor_method_executor(worker,
|
||||
actor_method_name,
|
||||
temporary_actor_method)
|
||||
worker.functions[driver_id][function_id] = (actor_method_name,
|
||||
temporary_actor_method)
|
||||
temporary_executor)
|
||||
worker.function_properties[driver_id][function_id] = (
|
||||
FunctionProperties(num_return_vals=2,
|
||||
num_cpus=1,
|
||||
@@ -112,6 +221,7 @@ def fetch_and_register_actor(actor_class_key, worker):
|
||||
|
||||
try:
|
||||
unpickled_class = pickle.loads(pickled_class)
|
||||
worker.actor_class = unpickled_class
|
||||
except Exception:
|
||||
# If an exception was thrown when the actor was imported, we record the
|
||||
# traceback and notify the scheduler of the failure.
|
||||
@@ -126,11 +236,15 @@ def fetch_and_register_actor(actor_class_key, worker):
|
||||
# TODO(pcm): Why is the below line necessary?
|
||||
unpickled_class.__module__ = module
|
||||
worker.actors[actor_id_str] = unpickled_class.__new__(unpickled_class)
|
||||
for (k, v) in inspect.getmembers(
|
||||
actor_methods = inspect.getmembers(
|
||||
unpickled_class, predicate=(lambda x: (inspect.isfunction(x) or
|
||||
inspect.ismethod(x)))):
|
||||
function_id = get_actor_method_function_id(k).id()
|
||||
worker.functions[driver_id][function_id] = (k, v)
|
||||
inspect.ismethod(x))))
|
||||
for actor_method_name, actor_method in actor_methods:
|
||||
function_id = get_actor_method_function_id(actor_method_name).id()
|
||||
executor = make_actor_method_executor(worker, actor_method_name,
|
||||
actor_method)
|
||||
worker.functions[driver_id][function_id] = (actor_method_name,
|
||||
executor)
|
||||
# We do not set worker.function_properties[driver_id][function_id]
|
||||
# because we currently do not need the actor worker to submit new tasks
|
||||
# for the actor.
|
||||
@@ -214,6 +328,10 @@ def export_actor(actor_id, class_id, actor_method_names, num_cpus, num_gpus,
|
||||
|
||||
|
||||
def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
# Add one to the checkpoint interval since we will insert a mock task for
|
||||
# every checkpoint.
|
||||
checkpoint_interval += 1
|
||||
|
||||
# Modify the class to have an additional method that will be used for
|
||||
# terminating the worker.
|
||||
class Class(cls):
|
||||
@@ -254,9 +372,101 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
# TODO(rkn): It's possible that this will cause problems. When
|
||||
# you unpickle the same object twice, the two objects will not
|
||||
# have the same class.
|
||||
actor_object = pickle.loads(checkpoint)
|
||||
actor_object = checkpoint
|
||||
return actor_object
|
||||
|
||||
def __ray_checkpoint__(self, task_counter, previous_object_id):
    """Save the actor's state as a checkpoint, or resume from a saved one.

    If the task that immediately precedes this checkpoint already ran on
    this actor instance (its dummy object is pinned on the worker), the
    current actor state is saved under index `task_counter` and all
    checkpoints with a smaller index are deleted. Otherwise the task
    tries to resume the actor from the most recently saved checkpoint,
    which is only possible if that checkpoint's index equals
    `task_counter`. If the checkpoint cannot be loaded, reconstruction
    of the preceding task's dummy object is requested from the local
    scheduler, so that the tasks executed since the previous checkpoint
    are replayed.

    Args:
        self: An instance of the actor class. The actor state that is
            saved or replaced is the one stored in `worker.actors`.
        task_counter: The index assigned to this checkpoint method.
        previous_object_id: A one-element list holding the ID of the
            dummy object returned by the task that immediately
            precedes this checkpoint.

    Returns:
        A tuple `(actor_checkpoint_failed, error)`.
        `actor_checkpoint_failed` is True if a checkpoint had to be
        loaded and could not be (the actor cannot safely execute the
        next task yet) and False otherwise; note that a failed *save*
        still yields False. `error` is the Exception instance thrown
        while saving or restoring, or None if nothing was thrown.
    """
    worker = ray.worker.global_worker
    previous_object_id = previous_object_id[0]
    plasma_id = plasma.ObjectID(previous_object_id.id())

    # Initialize the return values. `actor_checkpoint_failed` will be
    # set to True if we fail to load the checkpoint. `error_to_return`
    # will be set to the Exception, if one is thrown.
    actor_checkpoint_failed = False
    error_to_return = None

    # Save or resume the checkpoint.
    if previous_object_id in worker.actor_pinned_objects:
        # The preceding task executed on this actor instance. Save the
        # checkpoint.
        print("Saving actor checkpoint. actor_counter = {}."
              .format(task_counter))
        actor_key = b"Actor:" + worker.actor_id

        try:
            checkpoint = worker.actors[
                worker.actor_id].__ray_save_checkpoint__()
            # Save the checkpoint in Redis. TODO(rkn): Checkpoints
            # should not be stored in Redis. Fix this.
            worker.redis_client.hset(
                actor_key,
                "checkpoint_{}".format(task_counter),
                checkpoint)
            # Remove the previous checkpoints if there are any. This
            # happens only after the new checkpoint has been written.
            checkpoint_indices = get_checkpoint_indices(
                worker, worker.actor_id)
            for index in checkpoint_indices:
                if index < task_counter:
                    worker.redis_client.hdel(
                        actor_key, "checkpoint_{}".format(index))
        # An exception was thrown. Save the error.
        except Exception as error:
            # Checkpoint saves should not block execution on the actor,
            # so we still consider the task successful.
            error_to_return = error
    else:
        # The preceding task has not yet executed on this actor
        # instance. Try to resume from the most recent checkpoint.
        checkpoint_index, checkpoint = get_actor_checkpoint(
            worker, worker.actor_id)
        if checkpoint_index == task_counter:
            # The checkpoint matches ours. Resume the actor instance.
            try:
                actor = (worker.actor_class.
                         __ray_restore_from_checkpoint__(checkpoint))
                worker.actors[worker.actor_id] = actor
            # An exception was thrown. Save the error.
            except Exception as error:
                # We could not resume the checkpoint, so count the task
                # as failed.
                actor_checkpoint_failed = True
                error_to_return = error
        else:
            # We cannot resume a mismatching checkpoint, so count the
            # task as failed.
            actor_checkpoint_failed = True

    # Fall back to lineage reconstruction if we were unable to load the
    # checkpoint.
    if actor_checkpoint_failed:
        worker.local_scheduler_client.reconstruct_object(
            plasma_id.binary())
        worker.local_scheduler_client.notify_unblocked()

    return actor_checkpoint_failed, error_to_return
|
||||
|
||||
Class.__module__ = cls.__module__
|
||||
Class.__name__ = cls.__name__
|
||||
|
||||
@@ -270,10 +480,9 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
# Create objects to wrap method invocations. This is done so that we can
|
||||
# invoke methods with actor.method.remote() instead of actor.method().
|
||||
class ActorMethod(object):
|
||||
def __init__(self, actor, method_name, method_signature):
|
||||
def __init__(self, actor, method_name):
|
||||
self.actor = actor
|
||||
self.method_name = method_name
|
||||
self.method_signature = method_signature
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
raise Exception("Actor methods cannot be called directly. Instead "
|
||||
@@ -282,9 +491,20 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
.format(self.method_name, self.method_name))
|
||||
|
||||
def remote(self, *args, **kwargs):
|
||||
return self.actor._actor_method_call(self.method_name,
|
||||
self.method_signature, *args,
|
||||
**kwargs)
|
||||
return self.actor._actor_method_call(
|
||||
self.method_name, args=args, kwargs=kwargs,
|
||||
dependency=self.actor._ray_actor_cursor)
|
||||
|
||||
# Checkpoint methods do not take in the state of the previous actor method
|
||||
# as an explicit data dependency.
|
||||
class CheckpointMethod(ActorMethod):
    """Actor method wrapper used for submitting checkpoint tasks."""

    def remote(self):
        """Submit the checkpoint task for the wrapped actor handle.

        The task receives the handle's current task counter and the object
        ID of the preceding task. The object ID is wrapped in a list; it is
        an implicit data dependency, since the checkpoint method can run at
        any time. No explicit dependency is passed to the call.

        Returns:
            Whatever the handle's `_actor_method_call` returns for the
            submitted task.
        """
        handle = self.actor
        checkpoint_args = [handle._ray_actor_counter,
                           [handle._ray_actor_cursor]]
        return handle._actor_method_call(self.method_name,
                                         args=checkpoint_args)
|
||||
|
||||
class ActorHandle(object):
|
||||
def __init__(self, *args, **kwargs):
|
||||
@@ -307,10 +527,12 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
# the current cursor should be added as a dependency, and then
|
||||
# updated to reflect the new invocation.
|
||||
self._ray_actor_cursor = None
|
||||
self._ray_actor_methods = {
|
||||
k: v for (k, v) in inspect.getmembers(
|
||||
Class, predicate=(lambda x: (inspect.isfunction(x) or
|
||||
inspect.ismethod(x))))}
|
||||
ray_actor_methods = inspect.getmembers(
|
||||
Class, predicate=(lambda x: (inspect.isfunction(x) or
|
||||
inspect.ismethod(x))))
|
||||
self._ray_actor_methods = {}
|
||||
for actor_method_name, actor_method in ray_actor_methods:
|
||||
self._ray_actor_methods[actor_method_name] = actor_method
|
||||
# Extract the signatures of each of the methods. This will be used
|
||||
# to catch some errors if the methods are called with inappropriate
|
||||
# arguments.
|
||||
@@ -346,18 +568,41 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
|
||||
# Call __init__ as a remote function.
|
||||
if "__init__" in self._ray_actor_methods.keys():
|
||||
self._actor_method_call(
|
||||
"__init__", self._ray_method_signatures["__init__"], *args,
|
||||
**kwargs)
|
||||
self._actor_method_call("__init__", args=args, kwargs=kwargs)
|
||||
else:
|
||||
print("WARNING: this object has no __init__ method.")
|
||||
|
||||
# The function actor_method_call gets called if somebody tries to call
|
||||
# a method on their local actor stub object.
|
||||
def _actor_method_call(self, attr, function_signature, *args,
|
||||
**kwargs):
|
||||
def _actor_method_call(self, method_name, args=None, kwargs=None,
|
||||
dependency=None):
|
||||
"""Method execution stub for an actor handle.
|
||||
|
||||
This is the function that executes when
|
||||
`actor.method_name.remote(*args, **kwargs)` is called. Instead of
|
||||
executing locally, the method is packaged as a task and scheduled
|
||||
to the remote actor instance.
|
||||
|
||||
Args:
|
||||
self: The local actor handle.
|
||||
method_name: The name of the actor method to execute.
|
||||
args: A list of arguments for the actor method.
|
||||
kwargs: A dictionary of keyword arguments for the actor method.
|
||||
dependency: The object ID that this method is dependent on.
|
||||
Defaults to None, for no dependencies. Most tasks should
|
||||
pass in the dummy object returned by the preceding task.
|
||||
Some tasks, such as checkpoint and terminate methods, have
|
||||
no dependencies.
|
||||
|
||||
Returns:
|
||||
object_ids: A list of object IDs returned by the remote actor
|
||||
method.
|
||||
"""
|
||||
ray.worker.check_connected()
|
||||
ray.worker.check_main_thread()
|
||||
function_signature = self._ray_method_signatures[method_name]
|
||||
if args is None:
|
||||
args = []
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
args = signature.extend_args(function_signature, args, kwargs)
|
||||
|
||||
# Execute functions locally if Ray is run in PYTHON_MODE
|
||||
@@ -365,23 +610,33 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
if ray.worker.global_worker.mode == ray.PYTHON_MODE:
|
||||
return getattr(
|
||||
ray.worker.global_worker.actors[self._ray_actor_id],
|
||||
attr)(*copy.deepcopy(args))
|
||||
method_name)(*copy.deepcopy(args))
|
||||
|
||||
# Add the current actor cursor, a dummy object returned by the most
|
||||
# recent method invocation, as a dependency for the next method
|
||||
# invocation.
|
||||
if self._ray_actor_cursor is not None:
|
||||
args.append(self._ray_actor_cursor)
|
||||
# Add the dummy argument that represents dependency on a preceding
|
||||
# task.
|
||||
args.append(dependency)
|
||||
|
||||
function_id = get_actor_method_function_id(attr)
|
||||
actor_counter = self._ray_actor_counter
|
||||
# Mark checkpoint methods with a negative task counter.
|
||||
if is_checkpoint_task(actor_counter, checkpoint_interval):
|
||||
actor_counter = self._ray_actor_counter * -1
|
||||
|
||||
function_id = get_actor_method_function_id(method_name)
|
||||
object_ids = ray.worker.global_worker.submit_task(
|
||||
function_id, args, actor_id=self._ray_actor_id,
|
||||
actor_counter=self._ray_actor_counter)
|
||||
actor_counter=actor_counter)
|
||||
# Update the actor counter and cursor to reflect the most recent
|
||||
# invocation.
|
||||
self._ray_actor_counter += 1
|
||||
self._ray_actor_cursor = object_ids.pop()
|
||||
|
||||
# Submit a checkpoint task if necessary.
|
||||
if is_checkpoint_task(self._ray_actor_counter,
|
||||
checkpoint_interval):
|
||||
self.__ray_checkpoint__.remote()
|
||||
|
||||
# The last object returned is the dummy object that should be
|
||||
# passed in to the next actor method. Do not return it to the user.
|
||||
if len(object_ids) == 1:
|
||||
return object_ids[0]
|
||||
elif len(object_ids) > 1:
|
||||
@@ -405,8 +660,11 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
# ActorMethod has a reference to the ActorHandle and this was
|
||||
# causing cyclic references which were preventing object
|
||||
# deallocation from behaving in a predictable manner.
|
||||
return ActorMethod(self, attr,
|
||||
self._ray_method_signatures[attr])
|
||||
if attr == "__ray_checkpoint__":
|
||||
actor_method_cls = CheckpointMethod
|
||||
else:
|
||||
actor_method_cls = ActorMethod
|
||||
return actor_method_cls(self, attr)
|
||||
else:
|
||||
# There is no method with this name, so raise an exception.
|
||||
raise AttributeError("'{}' Actor object has no attribute '{}'"
|
||||
@@ -421,10 +679,8 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
|
||||
def __del__(self):
|
||||
"""Kill the worker that is running this actor."""
|
||||
if ray.worker.global_worker.connected:
|
||||
self._actor_method_call(
|
||||
"__ray_terminate__",
|
||||
self._ray_method_signatures["__ray_terminate__"],
|
||||
self._ray_actor_id.id())
|
||||
self._actor_method_call("__ray_terminate__",
|
||||
args=[self._ray_actor_id.id()])
|
||||
|
||||
return ActorHandle
|
||||
|
||||
|
||||
+16
-59
@@ -226,6 +226,11 @@ class Worker(object):
|
||||
self.fetch_and_register_actor = None
|
||||
self.make_actor = None
|
||||
self.actors = {}
|
||||
self.actor_task_counter = 0
|
||||
# This field is used to report actor checkpoint failure for the last
|
||||
# task assigned. Workers are not assigned a task on startup, so we
|
||||
# initialize to False.
|
||||
self.actor_checkpoint_failed = False
|
||||
# TODO(swang): This is a hack to prevent the object store from evicting
|
||||
# dummy objects. Once we allow object pinning in the store, we may
|
||||
# remove this variable.
|
||||
@@ -691,7 +696,7 @@ class Worker(object):
|
||||
args = task.arguments()
|
||||
return_object_ids = task.returns()
|
||||
if task.actor_id().id() != NIL_ACTOR_ID:
|
||||
return_object_ids.pop()
|
||||
dummy_return_id = return_object_ids.pop()
|
||||
function_name, function_executor = (self.functions
|
||||
[self.task_driver_id.id()]
|
||||
[function_id.id()])
|
||||
@@ -717,14 +722,10 @@ class Worker(object):
|
||||
if task.actor_id().id() == NIL_ACTOR_ID:
|
||||
outputs = function_executor.executor(arguments)
|
||||
else:
|
||||
# If this is any actor task other than the first, which has
|
||||
# no dependencies, the last argument is a dummy argument
|
||||
# that represents the dependency on the previous actor
|
||||
# task. Remove this argument for invocation.
|
||||
if task.actor_counter() > 0:
|
||||
arguments = arguments[:-1]
|
||||
outputs = function_executor(
|
||||
self.actors[task.actor_id().id()], *arguments)
|
||||
dummy_return_id, task.actor_counter(),
|
||||
self.actors[task.actor_id().id()],
|
||||
*arguments)
|
||||
except Exception as e:
|
||||
# Determine whether the exception occurred during a task, not an
|
||||
# actor method.
|
||||
@@ -764,35 +765,6 @@ class Worker(object):
|
||||
data={"function_id": function_id.id(),
|
||||
"function_name": function_name})
|
||||
|
||||
def _checkpoint_actor_state(self, actor_counter):
|
||||
"""Checkpoint the actor state.
|
||||
|
||||
This currently saves the checkpoint to Redis, but the checkpoint really
|
||||
needs to go somewhere else.
|
||||
|
||||
Args:
|
||||
actor_counter: The index of the most recent task that ran on this
|
||||
actor.
|
||||
"""
|
||||
print("Saving actor checkpoint. actor_counter = {}."
|
||||
.format(actor_counter))
|
||||
actor_key = b"Actor:" + self.actor_id
|
||||
checkpoint = self.actors[self.actor_id].__ray_save_checkpoint__()
|
||||
# Save the checkpoint in Redis. TODO(rkn): Checkpoints should not
|
||||
# be stored in Redis. Fix this.
|
||||
self.redis_client.hset(
|
||||
actor_key,
|
||||
"checkpoint_{}".format(actor_counter),
|
||||
checkpoint)
|
||||
# Remove the previous checkpoints if there is one.
|
||||
checkpoint_indices = [int(key[len(b"checkpoint_"):])
|
||||
for key in self.redis_client.hkeys(actor_key)
|
||||
if key.startswith(b"checkpoint_")]
|
||||
for index in checkpoint_indices:
|
||||
if index < actor_counter:
|
||||
self.redis_client.hdel(actor_key,
|
||||
"checkpoint_{}".format(index))
|
||||
|
||||
def _wait_for_and_process_task(self, task):
|
||||
"""Wait for a task to be ready and process the task.
|
||||
|
||||
@@ -824,19 +796,6 @@ class Worker(object):
|
||||
with log_span("ray:task", contents=contents, worker=self):
|
||||
self._process_task(task)
|
||||
|
||||
# Add the dummy output for actor tasks. TODO(swang): We use a
|
||||
# numpy array as a hack to pin the object in the object store.
|
||||
# Once we allow object pinning in the store, we may use `None`.
|
||||
if task.actor_id().id() != NIL_ACTOR_ID:
|
||||
dummy_object_id = task.returns().pop()
|
||||
dummy_object = np.zeros(1)
|
||||
self.put_object(dummy_object_id, dummy_object)
|
||||
|
||||
# Keep the dummy output in scope for the lifetime of the actor,
|
||||
# to prevent eviction from the object store.
|
||||
dummy_object = self.get_object([dummy_object_id])
|
||||
self.actor_pinned_objects.append(dummy_object[0])
|
||||
|
||||
# Push all of the log events to the global state store.
|
||||
flush_log()
|
||||
|
||||
@@ -853,13 +812,6 @@ class Worker(object):
|
||||
ray.worker.global_worker.local_scheduler_client.disconnect()
|
||||
os._exit(0)
|
||||
|
||||
# Checkpoint the actor state if it is the right time to do so.
|
||||
actor_counter = task.actor_counter()
|
||||
if (self.actor_id != NIL_ACTOR_ID and
|
||||
self.actor_checkpoint_interval != -1 and
|
||||
actor_counter % self.actor_checkpoint_interval == 0):
|
||||
self._checkpoint_actor_state(actor_counter)
|
||||
|
||||
def _get_next_task_from_local_scheduler(self):
|
||||
"""Get the next task from the local scheduler.
|
||||
|
||||
@@ -867,7 +819,12 @@ class Worker(object):
|
||||
A task from the local scheduler.
|
||||
"""
|
||||
with log_span("ray:get_task", worker=self):
|
||||
task = self.local_scheduler_client.get_task()
|
||||
task = self.local_scheduler_client.get_task(
|
||||
self.actor_checkpoint_failed)
|
||||
# We assume that the task is not a checkpoint, or that if it is,
|
||||
# that the task will succeed. The checkpoint task executor is
|
||||
# responsible for reporting task failure to the local scheduler.
|
||||
self.actor_checkpoint_failed = False
|
||||
|
||||
# Automatically restrict the GPUs available to this task.
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
|
||||
@@ -1892,7 +1849,7 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker,
|
||||
worker.class_id = class_id
|
||||
# Store a list of the dummy outputs produced by actor tasks, to pin the
|
||||
# dummy outputs in the object store.
|
||||
worker.actor_pinned_objects = []
|
||||
worker.actor_pinned_objects = {}
|
||||
|
||||
# Initialize the serialization library. This registers some classes, and so
|
||||
# it must be run before we export all of the cached remote functions.
|
||||
|
||||
Reference in New Issue
Block a user