Recreate actors when local schedulers die. (#804)

* Reconstruct actor state when local schedulers fail.
* Simplify construction of arguments to pass into default_worker.py from local scheduler.
* Remove deprecated ray.actor.
* Simplify actor reconstruction method.
* Fix linting.
* Small fixes.
2026-06-28 17:50:55 +08:00 · 2017-08-02 18:02:52 -07:00
parent 37282330c0
commit cb84972f6b
13 changed files with 441 additions and 79 deletions
@@ -5,9 +5,11 @@ from __future__ import print_function
 from ray.worker import (register_class, error_info, init, connect, disconnect,
                        get, put, wait, remote, log_event, log_span,
                        flush_log, get_gpu_ids)
-from ray.actor import actor
 from ray.worker import SCRIPT_MODE, WORKER_MODE, PYTHON_MODE, SILENT_MODE
 from ray.worker import global_state
+# We import ray.actor because some code is run in actor.py which initializes
+# some functions in the worker.
+import ray.actor  # noqa: F401

 # Ray version string. TODO(rkn): This is also defined separately in setup.py.
 # Fix this.
@@ -11,7 +11,7 @@ import traceback
 import ray.local_scheduler
 import ray.signature as signature
 import ray.worker
-from ray.utils import (FunctionProperties, random_string,
+from ray.utils import (FunctionProperties, hex_to_binary, random_string,
                       select_local_scheduler)


@@ -152,26 +152,128 @@ def export_actor(actor_id, class_id, actor_method_names, num_cpus, num_gpus,
    # notification so that when the newly created actor attempts to fetch the
    # information from Redis, it is already there.
    worker.redis_client.hmset(key, {"class_id": class_id,
-                                    "num_gpus": num_gpus})
+                                    "driver_id": driver_id,
+                                    "local_scheduler_id": local_scheduler_id,
+                                    "num_gpus": num_gpus,
+                                    "removed": False})

    # TODO(rkn): There is actually no guarantee that the local scheduler that
    # we are publishing to has already subscribed to the actor_notifications
    # channel. Therefore, this message may be missed and the workload will
    # hang. This is a bug.
    ray.utils.publish_actor_creation(actor_id.id(), driver_id,
-                                     local_scheduler_id, worker.redis_client)
+                                     local_scheduler_id, False,
+                                     worker.redis_client)


-def actor(*args, **kwargs):
-    raise Exception("The @ray.actor decorator is deprecated. Instead, please "
-                    "use @ray.remote.")
+def reconstruct_actor_state(actor_id, worker):
+    """Reconstruct the state of an actor that is being reconstructed.
+
+    Args:
+        actor_id: The ID of the actor being reconstructed.
+        worker: The worker object that is running the actor.
+    """
+    # TODO(rkn): This call is expensive. It'd be nice to find a way to get only
+    # the tasks that are relevant to this actor.
+    tasks = ray.global_state.task_table()
+
+    def hex_to_object_id(hex_id):
+        return ray.local_scheduler.ObjectID(hex_to_binary(hex_id))
+
+    relevant_tasks = []
+
+    # Loop over the task table and keep the tasks that are relevant to this
+    # actor.
+    for _, task_info in tasks.items():
+        task_spec_info = task_info["TaskSpec"]
+        if hex_to_binary(task_spec_info["ActorID"]) == actor_id:
+            relevant_tasks.append(task_spec_info)
+
+    # Sort the tasks by actor counter and check the counters are contiguous.
+    relevant_tasks.sort(key=lambda task: task["ActorCounter"])
+    for i in range(len(relevant_tasks)):
+        assert relevant_tasks[i]["ActorCounter"] == i
+
+    # This is a mini replica of the worker's main_loop. This will loop over all
+    # of the tasks that this actor is supposed to rerun. For each task, the
+    # worker will submit the task to the local scheduler, retrieve the task
+    # from the local scheduler, and execute the task.
+    for task_spec_info in relevant_tasks:
+        # Create a task spec out of the dictionary of info. This isn't
+        # necessary. It is strictly for the purposes of checking that the task
+        # we get back from the local scheduler is identical to the one we
+        # submit.
+        task_spec = ray.local_scheduler.Task(
+            hex_to_object_id(task_spec_info["DriverID"]),
+            hex_to_object_id(task_spec_info["FunctionID"]),
+            task_spec_info["Args"],
+            len(task_spec_info["ReturnObjectIDs"]),
+            hex_to_object_id(task_spec_info["ParentTaskID"]),
+            task_spec_info["ParentCounter"],
+            hex_to_object_id(task_spec_info["ActorID"]),
+            task_spec_info["ActorCounter"],
+            [task_spec_info["RequiredResources"]["CPUs"],
+             task_spec_info["RequiredResources"]["GPUs"]])
+
+        # Verify that the return object IDs are the same as they were the
+        # first time.
+        assert task_spec_info["ReturnObjectIDs"] == task_spec.returns()
+
+        # We need to wait for the actor to be imported and for the functions to
+        # be defined before we can submit the task.
+        worker._wait_for_function(hex_to_binary(task_spec_info["FunctionID"]),
+                                  hex_to_binary(task_spec_info["DriverID"]))
+
+        # Set some additional state. During normal operation
+        # (non-reconstruction) this state would already be set because tasks
+        # are only submitted from drivers or from workers that are in the
+        # middle of executing other tasks.
+        worker.task_driver_id = ray.local_scheduler.ObjectID(
+            hex_to_binary(task_spec_info["DriverID"]))
+        worker.current_task_id = ray.local_scheduler.ObjectID(
+            hex_to_binary(task_spec_info["ParentTaskID"]))
+        worker.task_index = task_spec_info["ParentCounter"]
+
+        # Submit the task to the local scheduler. This is important so that the
+        # local scheduler does bookkeeping about this actor's resource
+        # utilization and things like that. It's also important for updating
+        # some state on the worker.
+        worker.submit_task(
+            hex_to_object_id(task_spec_info["FunctionID"]),
+            task_spec_info["Args"],
+            actor_id=hex_to_object_id(task_spec_info["ActorID"]))
+
+        # Clear the extra state that we set.
+        del worker.task_driver_id
+        del worker.current_task_id
+        del worker.task_index
+
+        # Get the task from the local scheduler.
+        retrieved_task = worker._get_next_task_from_local_scheduler()
+        # Assert that the retrieved task is the same as the constructed task.
+        assert (ray.local_scheduler.task_to_string(task_spec) ==
+                ray.local_scheduler.task_to_string(retrieved_task))
+
+        # Wait for the task to be ready and execute the task.
+        worker._wait_for_and_process_task(retrieved_task)
+
+    # Enter the main loop to receive and process tasks.
+    worker.main_loop()


 def make_actor(cls, num_cpus, num_gpus):
    # Modify the class to have an additional method that will be used for
    # terminating the worker.
    class Class(cls):
-        def __ray_terminate__(self):
+        def __ray_terminate__(self, actor_id):
+            # Record that this actor has been removed so that if this node
+            # dies later, the actor won't be recreated. Alternatively, we could
+            # remove the actor key from Redis here.
+            ray.worker.global_worker.redis_client.hset(b"Actor:" + actor_id,
+                                                       "removed", True)
+            # Disconnect the worker from the local scheduler. The point of this
+            # is so that when the worker kills itself below, the local
+            # scheduler won't push an error message to the driver.
            ray.worker.global_worker.local_scheduler_client.disconnect()
            import os
            os._exit(0)
@@ -302,7 +404,8 @@ def make_actor(cls, num_cpus, num_gpus):
            if ray.worker.global_worker.connected:
                actor_method_call(
                    self._ray_actor_id, "__ray_terminate__",
-                    self._ray_method_signatures["__ray_terminate__"])
+                    self._ray_method_signatures["__ray_terminate__"],
+                    self._ray_actor_id.id())

    return NewClass

@@ -10,11 +10,9 @@ import redis
 import time

 import ray
-from ray.services import get_ip_address
-from ray.services import get_port
-from ray.utils import binary_to_object_id
-from ray.utils import binary_to_hex
-from ray.utils import hex_to_binary
+from ray.services import get_ip_address, get_port
+import ray.utils
+from ray.utils import binary_to_object_id, binary_to_hex, hex_to_binary

 # Import flatbuffer bindings.
 from ray.core.generated.SubscribeToDBClientTableReply \
@@ -98,6 +96,41 @@ class Monitor(object):
        self.subscribe_client.subscribe(channel)
        self.subscribed[channel] = False

+    def cleanup_actors(self):
+        """Recreate any live actors whose corresponding local scheduler died.
+
+        For any live actor whose local scheduler just died, we choose a new
+        local scheduler and broadcast a notification to create that actor.
+        """
+        actor_info = self.state.actors()
+        for actor_id, info in actor_info.items():
+            if (not info["removed"] and
+                    info["local_scheduler_id"] in self.dead_local_schedulers):
+                # Choose a new local scheduler to run the actor.
+                local_scheduler_id = ray.utils.select_local_scheduler(
+                    info["driver_id"], self.state.local_schedulers(),
+                    info["num_gpus"], self.redis)
+                import sys
+                sys.stdout.flush()
+                # The new local scheduler should not be the same as the old
+                # local scheduler. TODO(rkn): This should not be an assert, it
+                # should be something more benign.
+                assert (binary_to_hex(local_scheduler_id) !=
+                        info["local_scheduler_id"])
+                # Announce to all of the local schedulers that the actor should
+                # be recreated on this new local scheduler.
+                ray.utils.publish_actor_creation(
+                    hex_to_binary(actor_id), hex_to_binary(info["driver_id"]),
+                    local_scheduler_id, True, self.redis)
+                log.info("Actor {} for driver {} was on dead local scheduler "
+                         "{}. It is being recreated on local scheduler {}"
+                         .format(actor_id, info["driver_id"],
+                                 info["local_scheduler_id"],
+                                 binary_to_hex(local_scheduler_id)))
+                # Update the actor info in Redis.
+                self.redis.hset(b"Actor:" + hex_to_binary(actor_id),
+                                "local_scheduler_id", local_scheduler_id)
+
    def cleanup_task_table(self):
        """Clean up global state for failed local schedulers.

@@ -348,6 +381,7 @@ class Monitor(object):
        # state in the state tables.
        if len(self.dead_local_schedulers) > 0:
            self.cleanup_task_table()
+            self.cleanup_actors()
        if len(self.dead_plasma_managers) > 0:
            self.cleanup_object_table()
        log.debug("{} dead local schedulers, {} plasma managers total, {} "
@@ -369,6 +403,7 @@ class Monitor(object):
            # dead in this round, clean up the associated state.
            if len(self.dead_local_schedulers) > num_dead_local_schedulers:
                self.cleanup_task_table()
+                self.cleanup_actors()
            if len(self.dead_plasma_managers) > num_dead_plasma_managers:
                self.cleanup_object_table()

@@ -183,7 +183,7 @@ def select_local_scheduler(driver_id, local_schedulers, num_gpus,


 def publish_actor_creation(actor_id, driver_id, local_scheduler_id,
-                           redis_client):
+                           reconstruct, redis_client):
    """Publish a notification that an actor should be created.

    This broadcast will be received by all of the local schedulers. The local
@@ -197,11 +197,14 @@ def publish_actor_creation(actor_id, driver_id, local_scheduler_id,
        driver_id: The ID of the driver responsible for the actor.
        local_scheduler_id: The ID of the local scheduler that is supposed to
            create the actor.
+        reconstruct: True if the actor should be created in "reconstruct" mode.
        redis_client: The client used to interact with Redis.
    """
+    reconstruct_bit = b"1" if reconstruct else b"0"
    # Really we should encode this message as a flatbuffer object. However,
    # we're having trouble getting that to work. It almost works, but in Python
    # 2.7, builder.CreateString fails on byte strings that contain characters
    # outside range(128).
    redis_client.publish("actor_notifications",
-                         actor_id + driver_id + local_scheduler_id)
+                         actor_id + driver_id + local_scheduler_id +
+                         reconstruct_bit)
@@ -9,6 +9,7 @@ import redis
 import traceback

 import ray
+import ray.actor

 parser = argparse.ArgumentParser(description=("Parse addresses for the worker "
                                              "to connect to."))
@@ -24,6 +25,9 @@ parser.add_argument("--local-scheduler-name", required=True, type=str,
                    help="the local scheduler's name")
 parser.add_argument("--actor-id", required=False, type=str,
                    help="the actor ID of this worker")
+parser.add_argument("--reconstruct", action="store_true",
+                    help=("true if the actor should be started in reconstruct "
+                          "mode"))


 def random_string():
@@ -57,6 +61,11 @@ def push_error_to_all_drivers(redis_client, message):

 if __name__ == "__main__":
    args = parser.parse_args()
+
+    # If this worker is not an actor, it cannot be started in reconstruct mode.
+    if args.actor_id is None:
+        assert not args.reconstruct
+
    info = {"node_ip_address": args.node_ip_address,
            "redis_address": args.redis_address,
            "store_socket_name": args.object_store_name,
@@ -70,6 +79,17 @@ if __name__ == "__main__":

    ray.worker.connect(info, mode=ray.WORKER_MODE, actor_id=actor_id)

+    # If this is an actor started in reconstruct mode, rerun tasks to
+    # reconstruct its state.
+    if args.reconstruct:
+        try:
+            ray.actor.reconstruct_actor_state(actor_id,
+                                              ray.worker.global_worker)
+        except Exception as e:
+            redis_client = create_redis_client(args.redis_address)
+            push_error_to_all_drivers(redis_client, traceback.format_exc())
+            raise e
+
    error_explanation = """
  This error is unexpected and should not have happened. Somehow a worker
  crashed in an unanticipated way causing the main_loop to throw an exception,