Recreate actors when local schedulers die. (#804)

* Reconstruct actor state when local schedulers fail. * Simplify construction of arguments to pass into default_worker.py from local scheduler. * Remove deprecated ray.actor. * Simplify actor reconstruction method. * Fix linting. * Small fixes.
2026-07-01 18:04:09 +08:00 · 2017-08-02 18:02:52 -07:00
parent 37282330c0
commit cb84972f6b
13 changed files with 441 additions and 79 deletions
@@ -10,11 +10,9 @@ import redis
 import time

 import ray
-from ray.services import get_ip_address
-from ray.services import get_port
-from ray.utils import binary_to_object_id
-from ray.utils import binary_to_hex
-from ray.utils import hex_to_binary
+from ray.services import get_ip_address, get_port
+import ray.utils
+from ray.utils import binary_to_object_id, binary_to_hex, hex_to_binary

 # Import flatbuffer bindings.
 from ray.core.generated.SubscribeToDBClientTableReply \
@@ -98,6 +96,41 @@ class Monitor(object):
        self.subscribe_client.subscribe(channel)
        self.subscribed[channel] = False

+    def cleanup_actors(self):
+        """Recreate any live actors whose corresponding local scheduler died.
+
+        For any live actor whose local scheduler just died, we choose a new
+        local scheduler and broadcast a notification to create that actor.
+        """
+        actor_info = self.state.actors()
+        for actor_id, info in actor_info.items():
+            if (not info["removed"] and
+                    info["local_scheduler_id"] in self.dead_local_schedulers):
+                # Choose a new local scheduler to run the actor.
+                local_scheduler_id = ray.utils.select_local_scheduler(
+                    info["driver_id"], self.state.local_schedulers(),
+                    info["num_gpus"], self.redis)
+                import sys
+                sys.stdout.flush()
+                # The new local scheduler should not be the same as the old
+                # local scheduler. TODO(rkn): This should not be an assert, it
+                # should be something more benign.
+                assert (binary_to_hex(local_scheduler_id) !=
+                        info["local_scheduler_id"])
+                # Announce to all of the local schedulers that the actor should
+                # be recreated on this new local scheduler.
+                ray.utils.publish_actor_creation(
+                    hex_to_binary(actor_id), hex_to_binary(info["driver_id"]),
+                    local_scheduler_id, True, self.redis)
+                log.info("Actor {} for driver {} was on dead local scheduler "
+                         "{}. It is being recreated on local scheduler {}"
+                         .format(actor_id, info["driver_id"],
+                                 info["local_scheduler_id"],
+                                 binary_to_hex(local_scheduler_id)))
+                # Update the actor info in Redis.
+                self.redis.hset(b"Actor:" + hex_to_binary(actor_id),
+                                "local_scheduler_id", local_scheduler_id)
+
    def cleanup_task_table(self):
        """Clean up global state for failed local schedulers.

@@ -348,6 +381,7 @@ class Monitor(object):
        # state in the state tables.
        if len(self.dead_local_schedulers) > 0:
            self.cleanup_task_table()
+            self.cleanup_actors()
        if len(self.dead_plasma_managers) > 0:
            self.cleanup_object_table()
        log.debug("{} dead local schedulers, {} plasma managers total, {} "
@@ -369,6 +403,7 @@ class Monitor(object):
            # dead in this round, clean up the associated state.
            if len(self.dead_local_schedulers) > num_dead_local_schedulers:
                self.cleanup_task_table()
+                self.cleanup_actors()
            if len(self.dead_plasma_managers) > num_dead_plasma_managers:
                self.cleanup_object_table()