Actor checkpointing with object lineage reconstruction (#1004)

* Worker reports error in previous task, actor task counter is incremented after task is successful
* Refactor actor task execution
  - Return new task counter in GetTaskRequest
  - Update worker state for actor tasks inside of the actor method executor
* Manually invoked checkpoint method
* Scheduling for actor checkpoint methods
* Fix python bugs in checkpointing
* Return task success from worker to local scheduler instead of actor counter
* Kill local schedulers halfway through actor execution instead of waiting for all tasks to execute once
* Remove redundant actor tasks during dispatch, reconstruct missing dependencies for actor tasks
* Make executor for temporary actor methods
* doc
* Set default argument for whether the previous task was a success
* Refactor actor method call
* Simplify checkpoint task submission
* lint
* fix philipp's comments
* Add missing line
* Make actor reconstruction tests run faster
* Unimportant whitespace.
* Unimportant whitespace.
* Update checkpoint method signature
* Documentation and handle exceptions during checkpoint save/resume
* Rename get_task message field to actor_checkpoint_failed
* Fix bug.
* Remove debugging check, redirect test output
2026-06-28 03:18:59 +08:00 · 2017-10-12 09:53:32 -07:00
parent b585001881
commit 3764f2f2e1
14 changed files with 608 additions and 210 deletions
@@ -7,8 +7,10 @@ import copy
 import hashlib
 import inspect
 import json
+import numpy as np
 import traceback

+import pyarrow.plasma as plasma
 import ray.local_scheduler
 import ray.signature as signature
 import ray.worker
@@ -40,12 +42,31 @@ def get_actor_method_function_id(attr):
    return ray.local_scheduler.ObjectID(function_id)


-def get_actor_checkpoint(actor_id, worker):
+def get_checkpoint_indices(worker, actor_id):
+    """Get the checkpoint indices associated with a given actor ID.
+
+    Checkpoints are looked up as fields of the Redis hash stored under the
+    key b"Actor:" + actor_id. Every field whose name starts with
+    b"checkpoint_" contributes the integer that follows that prefix.
+
+    Args:
+        worker: The worker to use to get the checkpoint indices. Its
+            redis_client is queried with hkeys.
+        actor_id: The actor ID of the actor to get the checkpoint indices for.
+            It is concatenated with a bytes prefix, so it must be bytes.
+
+    Returns:
+        The indices of existing checkpoints as a list of integers, in the
+            order the fields are returned by Redis (not sorted). The list is
+            empty if the actor has no checkpoints.
+
+    Raises:
+        ValueError: If a field starts with b"checkpoint_" but the remainder
+            is not a valid integer.
+    """
+    actor_key = b"Actor:" + actor_id
+    checkpoint_indices = []
+    # NOTE(review): the bytes prefix check assumes the Redis client returns
+    # field names as bytes -- confirm against the client configuration.
+    for key in worker.redis_client.hkeys(actor_key):
+        if key.startswith(b"checkpoint_"):
+            # Strip the prefix and parse the rest as the checkpoint index.
+            index = int(key[len(b"checkpoint_"):])
+            checkpoint_indices.append(index)
+    return checkpoint_indices
+
+
+def get_actor_checkpoint(worker, actor_id):
    """Get the most recent checkpoint associated with a given actor ID.

    Args:
-        actor_id: The actor ID of the actor to get the checkpoint for.
        worker: The worker to use to get the checkpoint.
+        actor_id: The actor ID of the actor to get the checkpoint for.

    Returns:
        If a checkpoint exists, this returns a tuple of the checkpoint index
@@ -53,18 +74,103 @@ def get_actor_checkpoint(actor_id, worker):
            index is the actor counter of the last task that was executed on
            the actor before the checkpoint was made.
    """
-    # Get all of the keys associated with checkpoints for this actor.
-    actor_key = b"Actor:" + actor_id
-    checkpoint_indices = [int(key[len(b"checkpoint_"):])
-                          for key in worker.redis_client.hkeys(actor_key)
-                          if key.startswith(b"checkpoint_")]
+    checkpoint_indices = get_checkpoint_indices(worker, actor_id)
    if len(checkpoint_indices) == 0:
        return -1, None
-    most_recent_checkpoint_index = max(checkpoint_indices)
-    # Get the most recent checkpoint.
-    checkpoint = worker.redis_client.hget(
-        actor_key, "checkpoint_{}".format(most_recent_checkpoint_index))
-    return most_recent_checkpoint_index, checkpoint
+    else:
+        actor_key = b"Actor:" + actor_id
+        checkpoint_index = max(checkpoint_indices)
+        checkpoint = worker.redis_client.hget(
+            actor_key, "checkpoint_{}".format(checkpoint_index))
+        return checkpoint_index, checkpoint
+
+
+def put_dummy_object(worker, dummy_object_id):
+    """Put a dummy actor object into the local object store.
+
+    This registers a dummy object ID in the local store with a placeholder
+    numpy array (a single zero) as the value. The resulting object is pinned
+    to the store by storing the value fetched back from the store in
+    worker.actor_pinned_objects, keyed by the dummy object ID.
+
+    For actors, dummy objects are used to store the stateful dependencies
+    between consecutive method calls. This function should be called for every
+    actor method execution that updates the actor's internal state.
+
+    Args:
+        worker: The worker to use to perform the put.
+        dummy_object_id: The object ID of the dummy object.
+    """
+    # Add the dummy output for actor tasks. TODO(swang): We use
+    # a numpy array as a hack to pin the object in the object
+    # store. Once we allow object pinning in the store, we may
+    # use `None`.
+    dummy_object = np.zeros(1)
+    worker.put_object(dummy_object_id, dummy_object)
+    # Keep the dummy output in scope for the lifetime of the
+    # actor, to prevent eviction from the object store.
+    # The value kept is the one fetched back through the worker, not the
+    # local array created above.
+    dummy_object = worker.get_object([dummy_object_id])
+    dummy_object = dummy_object[0]
+    worker.actor_pinned_objects[dummy_object_id] = dummy_object
+
+
+def is_checkpoint_task(task_counter, checkpoint_interval):
+    """Check whether a task counter falls on a checkpoint.
+
+    Args:
+        task_counter: The counter of the task to check.
+        checkpoint_interval: The number of task counters between consecutive
+            checkpoints. A value less than or equal to zero means that no
+            task is ever a checkpoint task.
+
+    Returns:
+        True if checkpoint_interval is positive and task_counter is a
+            multiple of it (this includes a task_counter of 0), and False
+            otherwise.
+    """
+    # Guard against non-positive intervals, which would otherwise divide by
+    # zero or match on a negative modulus.
+    if checkpoint_interval <= 0:
+        return False
+    return (task_counter % checkpoint_interval == 0)
+
+
+def make_actor_method_executor(worker, method_name, method):
+    """Make an executor that wraps a user-defined actor method.
+
+    The executor wraps the method to update the worker's internal state.
+
+    For regular methods, the dummy object is added to the object store and
+    the worker's task counter is advanced before the method is invoked, so
+    the state is updated even if the method raises.
+
+    For the `__ray_checkpoint__` method, the dummy object is added and the
+    task counter is advanced only if the method reports that the checkpoint
+    did not fail. The failure flag is recorded on the worker as
+    `actor_checkpoint_failed` so that it can be reported to the local
+    scheduler.
+
+    Args:
+        worker (Worker): The worker that is executing the actor.
+        method_name (str): The name of the actor method.
+        method (instancemethod): The actor method to wrap. This should be a
+            method defined on the actor class and should therefore take an
+            instance of the actor as the first argument.
+
+    Returns:
+        A function that executes the given actor method on the worker's stored
+            instance of the actor. The function also updates the worker's
+            internal state to record the executed method.
+    """
+
+    def actor_method_executor(dummy_return_id, task_counter, actor,
+                              *args):
+        """Run the wrapped method and update the worker's actor state.
+
+        Args:
+            dummy_return_id: The object ID under which the dummy object is
+                put when the task counts as executed.
+            task_counter: The counter of the task being executed.
+            actor: The actor instance to invoke the method on.
+            *args: The task arguments. The last one is the dummy dependency
+                argument and is dropped before the method is called.
+
+        Returns:
+            None for the checkpoint method, otherwise the return value of the
+                wrapped method.
+
+        Raises:
+            Exception: For the checkpoint method, the error it returned (if
+                any) is re-raised after the worker state has been updated.
+        """
+        # An actor task's dependency on the previous task is represented by
+        # a dummy argument. Remove this argument before invocation.
+        args = args[:-1]
+        if method_name == "__ray_checkpoint__":
+            # Execute the checkpoint task.
+            actor_checkpoint_failed, error = method(actor, *args)
+            # If the checkpoint was successfully loaded, put the dummy object
+            # and update the actor's task counter, so that the task following
+            # the checkpoint can run.
+            if not actor_checkpoint_failed:
+                put_dummy_object(worker, dummy_return_id)
+                worker.actor_task_counter = task_counter + 1
+            # Report to the local scheduler whether this task succeeded in
+            # loading the checkpoint.
+            worker.actor_checkpoint_failed = actor_checkpoint_failed
+            # If there was an exception during the checkpoint method, re-raise
+            # it after updating the actor's internal state.
+            if error is not None:
+                raise error
+            return None
+        else:
+            # Update the worker's internal state before executing the method in
+            # case the method throws an exception.
+            put_dummy_object(worker, dummy_return_id)
+            worker.actor_task_counter = task_counter + 1
+            # Execute the actor method.
+            return method(actor, *args)
+    return actor_method_executor


 def fetch_and_register_actor(actor_class_key, worker):
@@ -100,8 +206,11 @@ def fetch_and_register_actor(actor_class_key, worker):
                        "cannot execute this method".format(actor_name))
    for actor_method_name in actor_method_names:
        function_id = get_actor_method_function_id(actor_method_name).id()
+        temporary_executor = make_actor_method_executor(worker,
+                                                        actor_method_name,
+                                                        temporary_actor_method)
        worker.functions[driver_id][function_id] = (actor_method_name,
-                                                    temporary_actor_method)
+                                                    temporary_executor)
        worker.function_properties[driver_id][function_id] = (
            FunctionProperties(num_return_vals=2,
                               num_cpus=1,
@@ -112,6 +221,7 @@ def fetch_and_register_actor(actor_class_key, worker):

    try:
        unpickled_class = pickle.loads(pickled_class)
+        worker.actor_class = unpickled_class
    except Exception:
        # If an exception was thrown when the actor was imported, we record the
        # traceback and notify the scheduler of the failure.
@@ -126,11 +236,15 @@ def fetch_and_register_actor(actor_class_key, worker):
        # TODO(pcm): Why is the below line necessary?
        unpickled_class.__module__ = module
        worker.actors[actor_id_str] = unpickled_class.__new__(unpickled_class)
-        for (k, v) in inspect.getmembers(
+        actor_methods = inspect.getmembers(
            unpickled_class, predicate=(lambda x: (inspect.isfunction(x) or
-                                                   inspect.ismethod(x)))):
-            function_id = get_actor_method_function_id(k).id()
-            worker.functions[driver_id][function_id] = (k, v)
+                                                   inspect.ismethod(x))))
+        for actor_method_name, actor_method in actor_methods:
+            function_id = get_actor_method_function_id(actor_method_name).id()
+            executor = make_actor_method_executor(worker, actor_method_name,
+                                                  actor_method)
+            worker.functions[driver_id][function_id] = (actor_method_name,
+                                                        executor)
            # We do not set worker.function_properties[driver_id][function_id]
            # because we currently do need the actor worker to submit new tasks
            # for the actor.
@@ -214,6 +328,10 @@ def export_actor(actor_id, class_id, actor_method_names, num_cpus, num_gpus,


 def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
+    # Add one to the checkpoint interval since we will insert a mock task for
+    # every checkpoint.
+    checkpoint_interval += 1
+
    # Modify the class to have an additional method that will be used for
    # terminating the worker.
    class Class(cls):
@@ -254,9 +372,101 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
                # TODO(rkn): It's possible that this will cause problems. When
                # you unpickle the same object twice, the two objects will not
                # have the same class.
-                actor_object = pickle.loads(checkpoint)
+                actor_object = checkpoint
            return actor_object

+        def __ray_checkpoint__(self, task_counter, previous_object_id):
+            """Save or resume a stored checkpoint.
+
+            This task checkpoints the current state of the actor. If the actor
+            has not yet executed to `task_counter`, then the task instead
+            attempts to resume from a saved checkpoint that matches
+            `task_counter`. If the most recently saved checkpoint does not
+            match `task_counter`, or restoring it raises, the task requests
+            reconstruction of the dummy object of the preceding task.
+
+            Args:
+                self: An instance of the actor class.
+                task_counter: The index assigned to this checkpoint method.
+                previous_object_id: A list whose first element is the dummy
+                    object ID returned by the task that immediately precedes
+                    this checkpoint.
+
+            Returns:
+                A tuple `(actor_checkpoint_failed, error)`.
+                    `actor_checkpoint_failed` is True if a checkpoint had to
+                    be loaded but could not be (in which case the actor
+                    cannot safely execute the next task). It stays False when
+                    a checkpoint was saved, even if saving raised. `error` is
+                    the Exception instance that was thrown, or None.
+            """
+            worker = ray.worker.global_worker
+            # The object ID arrives wrapped in a list (see
+            # CheckpointMethod.remote); unwrap it.
+            previous_object_id = previous_object_id[0]
+            # Plasma form of the ID, used below to request reconstruction if
+            # the checkpoint cannot be loaded.
+            plasma_id = plasma.ObjectID(previous_object_id.id())
+
+            # Initialize the return values. `actor_checkpoint_failed` will be
+            # set to True if we fail to load the checkpoint. `error` will be
+            # set to the Exception, if one is thrown.
+            actor_checkpoint_failed = False
+            error_to_return = None
+
+            # Save or resume the checkpoint.
+            if previous_object_id in worker.actor_pinned_objects:
+                # The preceding task executed on this actor instance. Save the
+                # checkpoint.
+                print("Saving actor checkpoint. actor_counter = {}."
+                      .format(task_counter))
+                actor_key = b"Actor:" + worker.actor_id
+
+                try:
+                    checkpoint = worker.actors[
+                        worker.actor_id].__ray_save_checkpoint__()
+                    # Save the checkpoint in Redis. TODO(rkn): Checkpoints
+                    # should not be stored in Redis. Fix this.
+                    worker.redis_client.hset(
+                        actor_key,
+                        "checkpoint_{}".format(task_counter),
+                        checkpoint)
+                    # Remove any older checkpoints, i.e. those with an index
+                    # smaller than this one.
+                    checkpoint_indices = get_checkpoint_indices(
+                        worker, worker.actor_id)
+                    for index in checkpoint_indices:
+                        if index < task_counter:
+                            worker.redis_client.hdel(
+                                actor_key, "checkpoint_{}".format(index))
+                # An exception was thrown. Save the error.
+                except Exception as error:
+                    # Checkpoint saves should not block execution on the actor,
+                    # so we still consider the task successful.
+                    error_to_return = error
+            else:
+                # The preceding task has not yet executed on this actor
+                # instance. Try to resume from the most recent checkpoint.
+                checkpoint_index, checkpoint = get_actor_checkpoint(
+                    worker, worker.actor_id)
+                if checkpoint_index == task_counter:
+                    # The checkpoint matches ours. Resume the actor instance.
+                    try:
+                        actor = (worker.actor_class.
+                                 __ray_restore_from_checkpoint__(checkpoint))
+                        worker.actors[worker.actor_id] = actor
+                    # An exception was thrown. Save the error.
+                    except Exception as error:
+                        # We could not resume the checkpoint, so count the task
+                        # as failed.
+                        actor_checkpoint_failed = True
+                        error_to_return = error
+                else:
+                    # We cannot resume a mismatching checkpoint, so count the
+                    # task as failed.
+                    actor_checkpoint_failed = True
+
+            # Fall back to lineage reconstruction if we were unable to load the
+            # checkpoint.
+            if actor_checkpoint_failed:
+                worker.local_scheduler_client.reconstruct_object(
+                    plasma_id.binary())
+                worker.local_scheduler_client.notify_unblocked()
+
+            return actor_checkpoint_failed, error_to_return
+
    Class.__module__ = cls.__module__
    Class.__name__ = cls.__name__

@@ -270,10 +480,9 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
    # Create objects to wrap method invocations. This is done so that we can
    # invoke methods with actor.method.remote() instead of actor.method().
    class ActorMethod(object):
-        def __init__(self, actor, method_name, method_signature):
+        def __init__(self, actor, method_name):
            self.actor = actor
            self.method_name = method_name
-            self.method_signature = method_signature

        def __call__(self, *args, **kwargs):
            raise Exception("Actor methods cannot be called directly. Instead "
@@ -282,9 +491,20 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
                            .format(self.method_name, self.method_name))

        def remote(self, *args, **kwargs):
-            return self.actor._actor_method_call(self.method_name,
-                                                 self.method_signature, *args,
-                                                 **kwargs)
+            return self.actor._actor_method_call(
+                self.method_name, args=args, kwargs=kwargs,
+                dependency=self.actor._ray_actor_cursor)
+
+    # Checkpoint methods do not take in the state of the previous actor method
+    # as an explicit data dependency.
+    class CheckpointMethod(ActorMethod):
+        """Actor-handle stub used for the `__ray_checkpoint__` method.
+
+        Unlike `ActorMethod.remote`, which passes the actor cursor as the
+        `dependency` of the call, this stub submits the call without a
+        `dependency` argument, so `_actor_method_call` uses its default of
+        None.
+        """
+        def remote(self):
+            """Submit a checkpoint task for the actor.
+
+            Returns:
+                Whatever `_actor_method_call` returns for the submitted task.
+            """
+            # A checkpoint's arguments are the current task counter and the
+            # object ID of the preceding task. The latter is an implicit data
+            # dependency, since the checkpoint method can run at any time.
+            # The cursor is wrapped in a list rather than passed directly.
+            args = [self.actor._ray_actor_counter,
+                    [self.actor._ray_actor_cursor]]
+            return self.actor._actor_method_call(self.method_name, args=args)

    class ActorHandle(object):
        def __init__(self, *args, **kwargs):
@@ -307,10 +527,12 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
            # the current cursor should be added as a dependency, and then
            # updated to reflect the new invocation.
            self._ray_actor_cursor = None
-            self._ray_actor_methods = {
-                k: v for (k, v) in inspect.getmembers(
-                    Class, predicate=(lambda x: (inspect.isfunction(x) or
-                                                 inspect.ismethod(x))))}
+            ray_actor_methods = inspect.getmembers(
+                Class, predicate=(lambda x: (inspect.isfunction(x) or
+                                             inspect.ismethod(x))))
+            self._ray_actor_methods = {}
+            for actor_method_name, actor_method in ray_actor_methods:
+                self._ray_actor_methods[actor_method_name] = actor_method
            # Extract the signatures of each of the methods. This will be used
            # to catch some errors if the methods are called with inappropriate
            # arguments.
@@ -346,18 +568,41 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):

            # Call __init__ as a remote function.
            if "__init__" in self._ray_actor_methods.keys():
-                self._actor_method_call(
-                    "__init__", self._ray_method_signatures["__init__"], *args,
-                    **kwargs)
+                self._actor_method_call("__init__", args=args, kwargs=kwargs)
            else:
                print("WARNING: this object has no __init__ method.")

-        # The function actor_method_call gets called if somebody tries to call
-        # a method on their local actor stub object.
-        def _actor_method_call(self, attr, function_signature, *args,
-                               **kwargs):
+        def _actor_method_call(self, method_name, args=None, kwargs=None,
+                               dependency=None):
+            """Method execution stub for an actor handle.
+
+            This is the function that executes when
+            `actor.method_name.remote(*args, **kwargs)` is called. Instead of
+            executing locally, the method is packaged as a task and scheduled
+            to the remote actor instance.
+
+            Args:
+                self: The local actor handle.
+                method_name: The name of the actor method to execute.
+                args: A list of arguments for the actor method.
+                kwargs: A dictionary of keyword arguments for the actor method.
+                dependency: The object ID that this method is dependent on.
+                    Defaults to None, for no dependencies. Most tasks should
+                    pass in the dummy object returned by the preceding task.
+                    Some tasks, such as checkpoint and terminate methods, have
+                    no dependencies.
+
+            Returns:
+                object_ids: A list of object IDs returned by the remote actor
+                    method.
+            """
            ray.worker.check_connected()
            ray.worker.check_main_thread()
+            function_signature = self._ray_method_signatures[method_name]
+            if args is None:
+                args = []
+            if kwargs is None:
+                kwargs = {}
            args = signature.extend_args(function_signature, args, kwargs)

            # Execute functions locally if Ray is run in PYTHON_MODE
@@ -365,23 +610,33 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
            if ray.worker.global_worker.mode == ray.PYTHON_MODE:
                return getattr(
                    ray.worker.global_worker.actors[self._ray_actor_id],
-                    attr)(*copy.deepcopy(args))
+                    method_name)(*copy.deepcopy(args))

-            # Add the current actor cursor, a dummy object returned by the most
-            # recent method invocation, as a dependency for the next method
-            # invocation.
-            if self._ray_actor_cursor is not None:
-                args.append(self._ray_actor_cursor)
+            # Add the dummy argument that represents dependency on a preceding
+            # task.
+            args.append(dependency)

-            function_id = get_actor_method_function_id(attr)
+            actor_counter = self._ray_actor_counter
+            # Mark checkpoint methods with a negative task counter.
+            if is_checkpoint_task(actor_counter, checkpoint_interval):
+                actor_counter = self._ray_actor_counter * -1
+
+            function_id = get_actor_method_function_id(method_name)
            object_ids = ray.worker.global_worker.submit_task(
                function_id, args, actor_id=self._ray_actor_id,
-                actor_counter=self._ray_actor_counter)
+                actor_counter=actor_counter)
            # Update the actor counter and cursor to reflect the most recent
            # invocation.
            self._ray_actor_counter += 1
            self._ray_actor_cursor = object_ids.pop()

+            # Submit a checkpoint task if necessary.
+            if is_checkpoint_task(self._ray_actor_counter,
+                                  checkpoint_interval):
+                self.__ray_checkpoint__.remote()
+
+            # The last object returned is the dummy object that should be
+            # passed in to the next actor method. Do not return it to the user.
            if len(object_ids) == 1:
                return object_ids[0]
            elif len(object_ids) > 1:
@@ -405,8 +660,11 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
                # ActorMethod has a reference to the ActorHandle and this was
                # causing cyclic references which were prevent object
                # deallocation from behaving in a predictable manner.
-                return ActorMethod(self, attr,
-                                   self._ray_method_signatures[attr])
+                if attr == "__ray_checkpoint__":
+                    actor_method_cls = CheckpointMethod
+                else:
+                    actor_method_cls = ActorMethod
+                return actor_method_cls(self, attr)
            else:
                # There is no method with this name, so raise an exception.
                raise AttributeError("'{}' Actor object has no attribute '{}'"
@@ -421,10 +679,8 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
        def __del__(self):
            """Kill the worker that is running this actor."""
            if ray.worker.global_worker.connected:
-                self._actor_method_call(
-                    "__ray_terminate__",
-                    self._ray_method_signatures["__ray_terminate__"],
-                    self._ray_actor_id.id())
+                self._actor_method_call("__ray_terminate__",
+                                        args=[self._ray_actor_id.id()])

    return ActorHandle

@@ -226,6 +226,11 @@ class Worker(object):
        self.fetch_and_register_actor = None
        self.make_actor = None
        self.actors = {}
+        self.actor_task_counter = 0
+        # This field is used to report actor checkpoint failure for the last
+        # task assigned. Workers are not assigned a task on startup, so we
+        # initialize to False.
+        self.actor_checkpoint_failed = False
        # TODO(swang): This is a hack to prevent the object store from evicting
        # dummy objects. Once we allow object pinning in the store, we may
        # remove this variable.
@@ -691,7 +696,7 @@ class Worker(object):
        args = task.arguments()
        return_object_ids = task.returns()
        if task.actor_id().id() != NIL_ACTOR_ID:
-            return_object_ids.pop()
+            dummy_return_id = return_object_ids.pop()
        function_name, function_executor = (self.functions
                                            [self.task_driver_id.id()]
                                            [function_id.id()])
@@ -717,14 +722,10 @@ class Worker(object):
                if task.actor_id().id() == NIL_ACTOR_ID:
                    outputs = function_executor.executor(arguments)
                else:
-                    # If this is any actor task other than the first, which has
-                    # no dependencies, the last argument is a dummy argument
-                    # that represents the dependency on the previous actor
-                    # task. Remove this argument for invocation.
-                    if task.actor_counter() > 0:
-                        arguments = arguments[:-1]
                    outputs = function_executor(
-                        self.actors[task.actor_id().id()], *arguments)
+                        dummy_return_id, task.actor_counter(),
+                        self.actors[task.actor_id().id()],
+                        *arguments)
        except Exception as e:
            # Determine whether the exception occured during a task, not an
            # actor method.
@@ -764,35 +765,6 @@ class Worker(object):
                                  data={"function_id": function_id.id(),
                                        "function_name": function_name})

-    def _checkpoint_actor_state(self, actor_counter):
-        """Checkpoint the actor state.
-
-        This currently saves the checkpoint to Redis, but the checkpoint really
-        needs to go somewhere else.
-
-        Args:
-            actor_counter: The index of the most recent task that ran on this
-                actor.
-        """
-        print("Saving actor checkpoint. actor_counter = {}."
-              .format(actor_counter))
-        actor_key = b"Actor:" + self.actor_id
-        checkpoint = self.actors[self.actor_id].__ray_save_checkpoint__()
-        # Save the checkpoint in Redis. TODO(rkn): Checkpoints should not
-        # be stored in Redis. Fix this.
-        self.redis_client.hset(
-            actor_key,
-            "checkpoint_{}".format(actor_counter),
-            checkpoint)
-        # Remove the previous checkpoints if there is one.
-        checkpoint_indices = [int(key[len(b"checkpoint_"):])
-                              for key in self.redis_client.hkeys(actor_key)
-                              if key.startswith(b"checkpoint_")]
-        for index in checkpoint_indices:
-            if index < actor_counter:
-                self.redis_client.hdel(actor_key,
-                                       "checkpoint_{}".format(index))
-
    def _wait_for_and_process_task(self, task):
        """Wait for a task to be ready and process the task.

@@ -824,19 +796,6 @@ class Worker(object):
            with log_span("ray:task", contents=contents, worker=self):
                self._process_task(task)

-            # Add the dummy output for actor tasks. TODO(swang): We use a
-            # numpy array as a hack to pin the object in the object store.
-            # Once we allow object pinning in the store, we may use `None`.
-            if task.actor_id().id() != NIL_ACTOR_ID:
-                dummy_object_id = task.returns().pop()
-                dummy_object = np.zeros(1)
-                self.put_object(dummy_object_id, dummy_object)
-
-                # Keep the dummy output in scope for the lifetime of the actor,
-                # to prevent eviction from the object store.
-                dummy_object = self.get_object([dummy_object_id])
-                self.actor_pinned_objects.append(dummy_object[0])
-
        # Push all of the log events to the global state store.
        flush_log()

@@ -853,13 +812,6 @@ class Worker(object):
            ray.worker.global_worker.local_scheduler_client.disconnect()
            os._exit(0)

-        # Checkpoint the actor state if it is the right time to do so.
-        actor_counter = task.actor_counter()
-        if (self.actor_id != NIL_ACTOR_ID and
-                self.actor_checkpoint_interval != -1 and
-                actor_counter % self.actor_checkpoint_interval == 0):
-            self._checkpoint_actor_state(actor_counter)
-
    def _get_next_task_from_local_scheduler(self):
        """Get the next task from the local scheduler.

@@ -867,7 +819,12 @@ class Worker(object):
            A task from the local scheduler.
        """
        with log_span("ray:get_task", worker=self):
-            task = self.local_scheduler_client.get_task()
+            task = self.local_scheduler_client.get_task(
+                self.actor_checkpoint_failed)
+            # We assume that the task is not a checkpoint, or that if it is,
+            # that the task will succeed. The checkpoint task executor is
+            # responsible for reporting task failure to the local scheduler.
+            self.actor_checkpoint_failed = False

        # Automatically restrict the GPUs available to this task.
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
@@ -1892,7 +1849,7 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker,
        worker.class_id = class_id
        # Store a list of the dummy outputs produced by actor tasks, to pin the
        # dummy outputs in the object store.
-        worker.actor_pinned_objects = []
+        worker.actor_pinned_objects = {}

    # Initialize the serialization library. This registers some classes, and so
    # it must be run before we export all of the cached remote functions.