Actor fault tolerance using object lineage reconstruction (#902)

* Revert Python actor reconstruction * Actor reconstruction using object lineage * Add dummy arguments and return values for actor tasks * Pin dummy outputs for actor tasks * Skip checkpointing test for now * TODOs * minor edits * Generate dummy object dependencies in Python, not C * Fix linting. * Move actor counter and dummy objects inside of the actor handle * Refactor Worker._process_task, suppress exception propagation for sequential actor tasks
2026-07-04 16:14:43 +08:00 · 2017-09-10 19:29:28 -07:00
parent d8aa826e63
commit 99c8b1f38c
7 changed files with 207 additions and 258 deletions
@@ -6,13 +6,12 @@ import cloudpickle as pickle
 import hashlib
 import inspect
 import json
-import time
 import traceback

 import ray.local_scheduler
 import ray.signature as signature
 import ray.worker
-from ray.utils import (FunctionProperties, hex_to_binary, random_string,
+from ray.utils import (FunctionProperties, random_string,
                       select_local_scheduler)


@@ -103,7 +102,7 @@ def fetch_and_register_actor(actor_class_key, worker):
        worker.functions[driver_id][function_id] = (actor_method_name,
                                                    temporary_actor_method)
        worker.function_properties[driver_id][function_id] = (
-            FunctionProperties(num_return_vals=1,
+            FunctionProperties(num_return_vals=2,
                               num_cpus=1,
                               num_gpus=0,
                               num_custom_resource=0,
@@ -174,7 +173,7 @@ def export_actor(actor_id, class_id, actor_method_names, num_cpus, num_gpus,
        # problem.
        function_id = get_actor_method_function_id(actor_method_name).id()
        worker.function_properties[driver_id][function_id] = (
-            FunctionProperties(num_return_vals=1,
+            FunctionProperties(num_return_vals=2,
                               num_cpus=1,
                               num_gpus=0,
                               num_custom_resource=0,
@@ -204,127 +203,6 @@ def export_actor(actor_id, class_id, actor_method_names, num_cpus, num_gpus,
                                     worker.redis_client)


-def reconstruct_actor_state(actor_id, worker):
-    """Reconstruct the state of an actor that is being reconstructed.
-
-    Args:
-        actor_id: The ID of the actor being reconstructed.
-        worker: The worker object that is running the actor.
-    """
-    # Get the most recent actor checkpoint.
-    checkpoint_index, checkpoint = get_actor_checkpoint(actor_id, worker)
-    if checkpoint is not None:
-        print("Loading actor state from checkpoint {}"
-              .format(checkpoint_index))
-        # Wait for the actor to have been defined.
-        while not hasattr(worker, "actor_class"):
-            time.sleep(0.001)
-        # TODO(rkn): Restoring from the checkpoint may fail, so this should be
-        # in a try-except block and we should give a good error message.
-        worker.actors[actor_id] = (
-            worker.actor_class.__ray_restore_from_checkpoint__(checkpoint))
-
-    # TODO(rkn): This call is expensive. It'd be nice to find a way to get only
-    # the tasks that are relevant to this actor.
-    tasks = ray.global_state.task_table()
-
-    def hex_to_object_id(hex_id):
-        return ray.local_scheduler.ObjectID(hex_to_binary(hex_id))
-
-    relevant_tasks = []
-
-    # Loop over the task table and keep the tasks that are relevant to this
-    # actor.
-    for _, task_info in tasks.items():
-        task_spec_info = task_info["TaskSpec"]
-        if hex_to_binary(task_spec_info["ActorID"]) == actor_id:
-            relevant_tasks.append(task_spec_info)
-
-    # Sort the tasks by actor ID.
-    relevant_tasks.sort(key=lambda task: task["ActorCounter"])
-    for i in range(len(relevant_tasks)):
-        assert relevant_tasks[i]["ActorCounter"] == i
-
-    # This is a mini replica of the worker's main_loop. This will loop over all
-    # of the tasks that this actor is supposed to rerun. For each task, the
-    # worker will submit the task to the local scheduler, retrieve the task
-    # from the local scheduler, and execute the task.
-    for task_spec_info in relevant_tasks:
-        # Create a task spec out of the dictionary of info. This isn't
-        # necessary. It is strictly for the purposes of checking that the task
-        # we get back from the local scheduler is identical to the one we
-        # submit.
-        task_spec = ray.local_scheduler.Task(
-            hex_to_object_id(task_spec_info["DriverID"]),
-            hex_to_object_id(task_spec_info["FunctionID"]),
-            task_spec_info["Args"],
-            len(task_spec_info["ReturnObjectIDs"]),
-            hex_to_object_id(task_spec_info["ParentTaskID"]),
-            task_spec_info["ParentCounter"],
-            hex_to_object_id(task_spec_info["ActorID"]),
-            task_spec_info["ActorCounter"],
-            [task_spec_info["RequiredResources"]["CPUs"],
-             task_spec_info["RequiredResources"]["GPUs"],
-             task_spec_info["RequiredResources"]["CustomResource"]])
-
-        # Verify that the return object IDs are the same as they were the
-        # first time.
-        assert task_spec_info["ReturnObjectIDs"] == task_spec.returns()
-
-        # We need to wait for the actor to be imported and for the functions to
-        # be defined before we can submit the task.
-        worker._wait_for_function(hex_to_binary(task_spec_info["FunctionID"]),
-                                  hex_to_binary(task_spec_info["DriverID"]))
-
-        # Set some additional state. During normal operation
-        # (non-reconstruction) this state would already be set because tasks
-        # are only submitted from drivers or from workers that are in the
-        # middle of executing other tasks.
-        worker.task_driver_id = ray.local_scheduler.ObjectID(
-            hex_to_binary(task_spec_info["DriverID"]))
-        worker.current_task_id = ray.local_scheduler.ObjectID(
-            hex_to_binary(task_spec_info["ParentTaskID"]))
-        worker.task_index = task_spec_info["ParentCounter"]
-
-        # Submit the task to the local scheduler. This is important so that the
-        # local scheduler does bookkeeping about this actor's resource
-        # utilization and things like that. It's also important for updating
-        # some state on the worker.
-        if task_spec_info["ActorCounter"] > checkpoint_index:
-            worker.submit_task(
-                hex_to_object_id(task_spec_info["FunctionID"]),
-                task_spec_info["Args"],
-                actor_id=hex_to_object_id(task_spec_info["ActorID"]))
-        else:
-            # Pass in a dummy task with no arguments to avoid having to
-            # unnecessarily reconstruct past arguments.
-            worker.submit_task(
-                hex_to_object_id(task_spec_info["FunctionID"]),
-                [],
-                actor_id=hex_to_object_id(task_spec_info["ActorID"]))
-
-        # Clear the extra state that we set.
-        del worker.task_driver_id
-        del worker.current_task_id
-        del worker.task_index
-
-        # Get the task from the local scheduler.
-        retrieved_task = worker._get_next_task_from_local_scheduler()
-
-        # If the task happened before the most recent checkpoint, ignore it.
-        # Otherwise, execute it.
-        if retrieved_task.actor_counter() > checkpoint_index:
-            # Assert that the retrieved task is the same as the constructed
-            # task.
-            assert (ray.local_scheduler.task_to_string(task_spec) ==
-                    ray.local_scheduler.task_to_string(retrieved_task))
-            # Wait for the task to be ready and then execute it.
-            worker._wait_for_and_process_task(retrieved_task)
-
-    # Enter the main loop to receive and process tasks.
-    worker.main_loop()
-
-
 def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
    # Modify the class to have an additional method that will be used for
    # terminating the worker.
@@ -368,29 +246,14 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
    class_id = random_actor_class_id()
    # The list exported will have length 0 if the class has not been exported
    # yet, and length one if it has. This is just implementing a bool, but we
-    # don't use a bool because we need to modify it inside of the NewClass
+    # don't use a bool because we need to modify it inside of the ActorHandle
    # constructor.
    exported = []

-    # The function actor_method_call gets called if somebody tries to call a
-    # method on their local actor stub object.
-    def actor_method_call(actor_id, attr, function_signature, *args, **kwargs):
-        ray.worker.check_connected()
-        ray.worker.check_main_thread()
-        args = signature.extend_args(function_signature, args, kwargs)
-
-        function_id = get_actor_method_function_id(attr)
-        object_ids = ray.worker.global_worker.submit_task(function_id, args,
-                                                          actor_id=actor_id)
-        if len(object_ids) == 1:
-            return object_ids[0]
-        elif len(object_ids) > 1:
-            return object_ids
-
    class ActorMethod(object):
-        def __init__(self, method_name, actor_id, method_signature):
+        def __init__(self, actor, method_name, method_signature):
+            self.actor = actor
            self.method_name = method_name
-            self.actor_id = actor_id
            self.method_signature = method_signature

        def __call__(self, *args, **kwargs):
@@ -400,10 +263,11 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
                            .format(self.method_name, self.method_name))

        def remote(self, *args, **kwargs):
-            return actor_method_call(self.actor_id, self.method_name,
-                                     self.method_signature, *args, **kwargs)
+            return self.actor._actor_method_call(self.method_name,
+                                                 self.method_signature, *args,
+                                                 **kwargs)

-    class NewClass(object):
+    class ActorHandle(object):
        def __init__(self, *args, **kwargs):
            raise Exception("Actor classes cannot be instantiated directly. "
                            "Instead of running '{}()', try '{}.remote()'."
@@ -417,6 +281,13 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):

        def _manual_init(self, *args, **kwargs):
            self._ray_actor_id = random_actor_id()
+            # The number of actor method invocations that we've called so far.
+            self._ray_actor_counter = 0
+            # The actor cursor is a dummy object representing the most recent
+            # actor method invocation. For each subsequent method invocation,
+            # the current cursor should be added as a dependency, and then
+            # updated to reflect the new invocation.
+            self._ray_actor_cursor = None
            self._ray_actor_methods = {
                k: v for (k, v) in inspect.getmembers(
                    Class, predicate=(lambda x: (inspect.isfunction(x) or
@@ -441,7 +312,7 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
            self._actor_method_invokers = dict()
            for k, v in self._ray_actor_methods.items():
                self._actor_method_invokers[k] = ActorMethod(
-                    k, self._ray_actor_id, self._ray_method_signatures[k])
+                    self, k, self._ray_method_signatures[k])

            # Export the actor class if it has not been exported yet.
            if len(exported) == 0:
@@ -456,12 +327,39 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
                         ray.worker.global_worker)
            # Call __init__ as a remote function.
            if "__init__" in self._ray_actor_methods.keys():
-                actor_method_call(self._ray_actor_id, "__init__",
-                                  self._ray_method_signatures["__init__"],
-                                  *args, **kwargs)
+                self._actor_method_call(
+                    "__init__", self._ray_method_signatures["__init__"], *args,
+                    **kwargs)
            else:
                print("WARNING: this object has no __init__ method.")

+        # The function actor_method_call gets called if somebody tries to call
+        # a method on their local actor stub object.
+        def _actor_method_call(self, attr, function_signature, *args,
+                               **kwargs):
+            ray.worker.check_connected()
+            ray.worker.check_main_thread()
+            args = signature.extend_args(function_signature, args, kwargs)
+            # Add the current actor cursor, a dummy object returned by the most
+            # recent method invocation, as a dependency for the next method
+            # invocation.
+            if self._ray_actor_cursor is not None:
+                args.append(self._ray_actor_cursor)
+
+            function_id = get_actor_method_function_id(attr)
+            object_ids = ray.worker.global_worker.submit_task(
+                function_id, args, actor_id=self._ray_actor_id,
+                actor_counter=self._ray_actor_counter)
+            # Update the actor counter and cursor to reflect the most recent
+            # invocation.
+            self._ray_actor_counter += 1
+            self._ray_actor_cursor = object_ids.pop()
+
+            if len(object_ids) == 1:
+                return object_ids[0]
+            elif len(object_ids) > 1:
+                return object_ids
+
        # Make tab completion work.
        def __dir__(self):
            return self._ray_actor_methods
@@ -469,8 +367,10 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
        def __getattribute__(self, attr):
            # The following is needed so we can still access
            # self.actor_methods.
-            if attr in ["_manual_init", "_ray_actor_id", "_ray_actor_methods",
-                        "_actor_method_invokers", "_ray_method_signatures"]:
+            if attr in ["_manual_init", "_ray_actor_id", "_ray_actor_counter",
+                        "_ray_actor_cursor", "_ray_actor_methods",
+                        "_actor_method_invokers", "_ray_method_signatures",
+                        "_actor_method_call"]:
                return object.__getattribute__(self, attr)
            if attr in self._ray_actor_methods.keys():
                return self._actor_method_invokers[attr]
@@ -487,12 +387,12 @@ def make_actor(cls, num_cpus, num_gpus, checkpoint_interval):
        def __del__(self):
            """Kill the worker that is running this actor."""
            if ray.worker.global_worker.connected:
-                actor_method_call(
-                    self._ray_actor_id, "__ray_terminate__",
+                self._actor_method_call(
+                    "__ray_terminate__",
                    self._ray_method_signatures["__ray_terminate__"],
                    self._ray_actor_id.id())

-    return NewClass
+    return ActorHandle


 ray.worker.global_worker.fetch_and_register_actor = fetch_and_register_actor