mirror of
https://github.com/wassname/ray.git
synced 2026-07-02 04:25:18 +08:00
Recreate actors when local schedulers die. (#804)
* Reconstruct actor state when local schedulers fail.
* Simplify construction of arguments to pass into default_worker.py from local scheduler.
* Remove deprecated ray.actor.
* Simplify actor reconstruction method.
* Fix linting.
* Small fixes.
This commit is contained in:
committed by
Philipp Moritz
parent
37282330c0
commit
cb84972f6b
@@ -9,6 +9,7 @@ import redis
|
||||
import traceback
|
||||
|
||||
import ray
|
||||
import ray.actor
|
||||
|
||||
parser = argparse.ArgumentParser(description=("Parse addresses for the worker "
|
||||
"to connect to."))
|
||||
@@ -24,6 +25,9 @@ parser.add_argument("--local-scheduler-name", required=True, type=str,
|
||||
help="the local scheduler's name")
|
||||
parser.add_argument("--actor-id", required=False, type=str,
|
||||
help="the actor ID of this worker")
|
||||
parser.add_argument("--reconstruct", action="store_true",
|
||||
help=("true if the actor should be started in reconstruct "
|
||||
"mode"))
|
||||
|
||||
|
||||
def random_string():
|
||||
@@ -57,6 +61,11 @@ def push_error_to_all_drivers(redis_client, message):
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
# If this worker is not an actor, it cannot be started in reconstruct mode.
|
||||
if args.actor_id is None:
|
||||
assert not args.reconstruct
|
||||
|
||||
info = {"node_ip_address": args.node_ip_address,
|
||||
"redis_address": args.redis_address,
|
||||
"store_socket_name": args.object_store_name,
|
||||
@@ -70,6 +79,17 @@ if __name__ == "__main__":
|
||||
|
||||
ray.worker.connect(info, mode=ray.WORKER_MODE, actor_id=actor_id)
|
||||
|
||||
# If this is an actor started in reconstruct mode, rerun tasks to
|
||||
# reconstruct its state.
|
||||
if args.reconstruct:
|
||||
try:
|
||||
ray.actor.reconstruct_actor_state(actor_id,
|
||||
ray.worker.global_worker)
|
||||
except Exception as e:
|
||||
redis_client = create_redis_client(args.redis_address)
|
||||
push_error_to_all_drivers(redis_client, traceback.format_exc())
|
||||
raise e
|
||||
|
||||
error_explanation = """
|
||||
This error is unexpected and should not have happened. Somehow a worker
|
||||
crashed in an unanticipated way causing the main_loop to throw an exception,
|
||||
|
||||
Reference in New Issue
Block a user