mirror of
https://github.com/wassname/ray.git
synced 2026-07-01 18:04:09 +08:00
Recreate actors when local schedulers die. (#804)
* Reconstruct actor state when local schedulers fail. * Simplify construction of arguments to pass into default_worker.py from local scheduler. * Remove deprecated ray.actor. * Simplify actor reconstruction method. * Fix linting. * Small fixes.
This commit is contained in:
committed by
Philipp Moritz
parent
37282330c0
commit
cb84972f6b
+40
-5
@@ -10,11 +10,9 @@ import redis
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.services import get_ip_address
|
||||
from ray.services import get_port
|
||||
from ray.utils import binary_to_object_id
|
||||
from ray.utils import binary_to_hex
|
||||
from ray.utils import hex_to_binary
|
||||
from ray.services import get_ip_address, get_port
|
||||
import ray.utils
|
||||
from ray.utils import binary_to_object_id, binary_to_hex, hex_to_binary
|
||||
|
||||
# Import flatbuffer bindings.
|
||||
from ray.core.generated.SubscribeToDBClientTableReply \
|
||||
@@ -98,6 +96,41 @@ class Monitor(object):
|
||||
self.subscribe_client.subscribe(channel)
|
||||
self.subscribed[channel] = False
|
||||
|
||||
def cleanup_actors(self):
|
||||
"""Recreate any live actors whose corresponding local scheduler died.
|
||||
|
||||
For any live actor whose local scheduler just died, we choose a new
|
||||
local scheduler and broadcast a notification to create that actor.
|
||||
"""
|
||||
actor_info = self.state.actors()
|
||||
for actor_id, info in actor_info.items():
|
||||
if (not info["removed"] and
|
||||
info["local_scheduler_id"] in self.dead_local_schedulers):
|
||||
# Choose a new local scheduler to run the actor.
|
||||
local_scheduler_id = ray.utils.select_local_scheduler(
|
||||
info["driver_id"], self.state.local_schedulers(),
|
||||
info["num_gpus"], self.redis)
|
||||
import sys
|
||||
sys.stdout.flush()
|
||||
# The new local scheduler should not be the same as the old
|
||||
# local scheduler. TODO(rkn): This should not be an assert, it
|
||||
# should be something more benign.
|
||||
assert (binary_to_hex(local_scheduler_id) !=
|
||||
info["local_scheduler_id"])
|
||||
# Announce to all of the local schedulers that the actor should
|
||||
# be recreated on this new local scheduler.
|
||||
ray.utils.publish_actor_creation(
|
||||
hex_to_binary(actor_id), hex_to_binary(info["driver_id"]),
|
||||
local_scheduler_id, True, self.redis)
|
||||
log.info("Actor {} for driver {} was on dead local scheduler "
|
||||
"{}. It is being recreated on local scheduler {}"
|
||||
.format(actor_id, info["driver_id"],
|
||||
info["local_scheduler_id"],
|
||||
binary_to_hex(local_scheduler_id)))
|
||||
# Update the actor info in Redis.
|
||||
self.redis.hset(b"Actor:" + hex_to_binary(actor_id),
|
||||
"local_scheduler_id", local_scheduler_id)
|
||||
|
||||
def cleanup_task_table(self):
|
||||
"""Clean up global state for failed local schedulers.
|
||||
|
||||
@@ -348,6 +381,7 @@ class Monitor(object):
|
||||
# state in the state tables.
|
||||
if len(self.dead_local_schedulers) > 0:
|
||||
self.cleanup_task_table()
|
||||
self.cleanup_actors()
|
||||
if len(self.dead_plasma_managers) > 0:
|
||||
self.cleanup_object_table()
|
||||
log.debug("{} dead local schedulers, {} plasma managers total, {} "
|
||||
@@ -369,6 +403,7 @@ class Monitor(object):
|
||||
# dead in this round, clean up the associated state.
|
||||
if len(self.dead_local_schedulers) > num_dead_local_schedulers:
|
||||
self.cleanup_task_table()
|
||||
self.cleanup_actors()
|
||||
if len(self.dead_plasma_managers) > num_dead_plasma_managers:
|
||||
self.cleanup_object_table()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user