mirror of
https://github.com/wassname/ray.git
synced 2026-06-30 06:14:00 +08:00
Availability after local scheduler failure (#329)
* Clean up plasma subscribers on EPIPE First pass at a monitoring script - monitor can detect local scheduler death Clean up task table upon local scheduler death in monitoring script Don't schedule to dead local schedulers in global scheduler Have global scheduler update the db clients table, monitor script cleans up state Documentation Monitor script should scan tables before beginning to read from subscription channel Fix for python3 Redirect monitor output to redis logs, fix hanging in multinode tests * Publish auxiliary addresses as part of db_client deletion notifications * Fix test case? * Small changes. * Use SCAN instead of KEYS * Address comments * Address more comments * Free redis module strings
This commit is contained in:
committed by
Robert Nishihara
parent
4f9e74469e
commit
41b8675d04
@@ -6,6 +6,8 @@ def get_local_schedulers(worker):
|
||||
local_schedulers = []
|
||||
for client in worker.redis_client.keys("CL:*"):
|
||||
client_info = worker.redis_client.hgetall(client)
|
||||
if b"client_type" not in client_info:
|
||||
continue
|
||||
if client_info[b"client_type"] == b"local_scheduler":
|
||||
local_schedulers.append(client_info)
|
||||
return local_schedulers
|
||||
|
||||
@@ -0,0 +1,187 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import binascii
|
||||
from collections import Counter
|
||||
import logging
|
||||
import redis
|
||||
import time
|
||||
|
||||
from ray.services import get_ip_address
|
||||
from ray.services import get_port
|
||||
|
||||
# The values below mirror definitions in the C codebase; update them whenever
# the corresponding C definitions change.

# Mirrors common/common.h.
DB_CLIENT_ID_SIZE = 20
NIL_ID = DB_CLIENT_ID_SIZE * b"\xff"

# Mirrors common/task.h.
TASK_STATUS_LOST = 32

# Mirrors common/redis_module/ray_redis_module.c.
TASK_PREFIX = "TT:"
DB_CLIENT_PREFIX = "CL:"
DB_CLIENT_TABLE_NAME = b"db_clients"

# Mirrors local_scheduler/local_scheduler.h.
LOCAL_SCHEDULER_HEARTBEAT_TIMEOUT_MILLISECONDS = 100
LOCAL_SCHEDULER_CLIENT_TYPE = b"local_scheduler"

# Configure default logging and expose the root logger to the rest of the
# module.
logging.basicConfig()
log = logging.getLogger()
|
||||
|
||||
class Monitor(object):
    """A monitor for Ray processes.

    The monitor is in charge of cleaning up the tables in the global state
    after processes have died. The monitor is currently not responsible for
    detecting component failures.

    Attributes:
        redis: A connection to the Redis server.
        subscribe_client: A pubsub client for the Redis server. This is used
            to receive notifications about failed components.
        local_schedulers: A set of the local scheduler IDs of all of the
            currently live local schedulers in the cluster. In addition, this
            also includes NIL_ID.
    """

    def __init__(self, redis_address, redis_port):
        """Connect to Redis and initialize the set of live local schedulers.

        Args:
            redis_address: The host of the Redis server.
            redis_port: The port of the Redis server.
        """
        self.redis = redis.StrictRedis(host=redis_address, port=redis_port,
                                       db=0)
        self.subscribe_client = self.redis.pubsub()

        # Initialize data structures to keep track of the active database
        # clients.
        self.local_schedulers = set()
        # Add the NIL_ID so that we don't accidentally mark tasks that aren't
        # associated with a node as LOST during cleanup.
        self.local_schedulers.add(NIL_ID)

    def subscribe(self):
        """Subscribe to the db_clients channel.

        Blocks, polling once per heartbeat interval, until the first message
        arrives on the subscription connection.

        Raises:
            Exception: An exception is raised if the subscription fails.
        """
        self.subscribe_client.subscribe(DB_CLIENT_TABLE_NAME)
        poll_interval = LOCAL_SCHEDULER_HEARTBEAT_TIMEOUT_MILLISECONDS / 1000.0

        # Wait for the first message to signal that the subscription was
        # successful.
        message = self.subscribe_client.get_message()
        while message is None:
            time.sleep(poll_interval)
            message = self.subscribe_client.get_message()

        # The first message's payload should be the index of our
        # subscription. The exception must actually be raised; merely
        # constructing it would let a failed subscription pass silently.
        if "data" not in message:
            raise Exception("Unable to subscribe to local scheduler table.")

    def read_message(self):
        """Read a message from the db_clients channel.

        Returns:
            None if there was no message to read. Otherwise, a tuple of
            (db_client_id, client_type, auxiliary_address, is_insertion) is
            returned. The value is_insertion is a bool that is true if the
            update to the db_clients table was an insertion and false if it
            was a deletion.

        Raises:
            Exception: If the insertion field of the message is neither 0
                nor 1.
        """
        message = self.subscribe_client.get_message()
        if message is None:
            return None
        # Only published messages carry a parseable payload. Subscription
        # confirmations (e.g. after a reconnect) have an integer payload that
        # cannot be sliced, so treat them as "nothing to read".
        if message["type"] != "message":
            return None

        # Parse the message. The payload is the fixed-size client ID, one
        # separator byte, then three space-separated fields.
        data = message["data"]
        db_client_id = data[:DB_CLIENT_ID_SIZE]
        fields = data[DB_CLIENT_ID_SIZE + 1:].split(b" ")
        client_type, auxiliary_address, is_insertion = fields
        is_insertion = int(is_insertion)
        if is_insertion not in (0, 1):
            raise Exception("Expected 0 or 1 for insertion field, got {} "
                            "instead".format(is_insertion))

        return (db_client_id, client_type, auxiliary_address,
                bool(is_insertion))

    def cleanup_task_table(self):
        """Clean up global state for failed local schedulers.

        This marks any tasks that were scheduled on dead local schedulers as
        TASK_STATUS_LOST. A local scheduler is deemed dead if it is not in
        self.local_schedulers.
        """
        pattern = "{prefix}*".format(prefix=TASK_PREFIX)
        for task_key in self.redis.scan_iter(match=pattern):
            task_id = task_key[len(TASK_PREFIX):]
            response = self.redis.execute_command("RAY.TASK_TABLE_GET",
                                                  task_id)
            # Defensive: skip entries for which no task data comes back, e.g.
            # if the key disappeared after it was scanned.
            if response is None:
                continue
            # response[1] is compared against the live scheduler IDs; it is
            # taken to be the ID of the local scheduler the task was assigned
            # to.
            if response[1] in self.local_schedulers:
                continue
            # Reassigning the task to NIL_ID keeps it from being marked LOST
            # again on later passes, since NIL_ID is always in the live set.
            ok = self.redis.execute_command("RAY.TASK_TABLE_UPDATE",
                                            task_id,
                                            TASK_STATUS_LOST,
                                            NIL_ID)
            if ok != b"OK":
                log.warning("Failed to update lost task for dead scheduler.")

    def scan_db_client_table(self):
        """Scan the database client table for the current clients.

        After subscribing to the client table, it's necessary to call this
        before reading any messages from the subscription channel.
        """
        # Use SCAN rather than KEYS so that Redis is not blocked while the
        # whole keyspace is walked. SCAN may yield a key more than once, which
        # is harmless because the IDs are collected in a set.
        pattern = "{prefix}*".format(prefix=DB_CLIENT_PREFIX)
        for db_client_key in self.redis.scan_iter(match=pattern):
            db_client_id = db_client_key[len(DB_CLIENT_PREFIX):]
            client_type = self.redis.hget(db_client_key, "client_type")
            if client_type == LOCAL_SCHEDULER_CLIENT_TYPE:
                self.local_schedulers.add(db_client_id)

    def run(self):
        """Run the monitor.

        This function loops forever, checking for messages about dead
        database clients and cleaning up state accordingly.
        """
        # Initialize the subscription channel.
        self.subscribe()

        # Scan the database table and clean up any state associated with
        # clients not in the database table. NOTE: This must be called before
        # reading any messages from the subscription channel. This ensures
        # that we start in a consistent state, since we may have missed
        # notifications that were sent before we connected to the
        # subscription channel.
        self.scan_db_client_table()
        self.cleanup_task_table()
        log.debug("Scanned schedulers: {}".format(self.local_schedulers))

        poll_interval = LOCAL_SCHEDULER_HEARTBEAT_TIMEOUT_MILLISECONDS / 1000.0

        # Read messages from the subscription channel.
        while True:
            time.sleep(poll_interval)
            client = self.read_message()
            # There was no message to be read.
            if client is None:
                continue

            db_client_id, client_type, auxiliary_address, is_insertion = client

            # Only local schedulers are tracked; updates about other kinds of
            # database clients do not affect the task table cleanup.
            if client_type != LOCAL_SCHEDULER_CLIENT_TYPE:
                continue

            # If the update was an insertion, record the client ID.
            if is_insertion:
                self.local_schedulers.add(db_client_id)
                log.debug("Added scheduler: {}".format(db_client_id))
                continue

            # If the update was a deletion, clean up global state.
            if db_client_id in self.local_schedulers:
                log.warning("Removed scheduler: {}".format(db_client_id))
                self.local_schedulers.remove(db_client_id)
                self.cleanup_task_table()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: take the Redis address from the command line and run
    # a monitor against that server.
    arg_parser = argparse.ArgumentParser(
        description="Parse Redis server for the monitor to connect to.")
    arg_parser.add_argument("--redis-address",
                            required=True,
                            type=str,
                            help="the address to use for Redis")
    parsed_args = arg_parser.parse_args()

    # Split the address into the pieces expected by the Monitor constructor.
    address = parsed_args.redis_address
    ip_address = get_ip_address(address)
    port = get_port(address)

    Monitor(ip_address, port).run()
|
||||
+32
-3
@@ -21,6 +21,7 @@ import ray.local_scheduler as local_scheduler
|
||||
import ray.plasma as plasma
|
||||
import ray.global_scheduler as global_scheduler
|
||||
|
||||
PROCESS_TYPE_MONITOR = "monitor"
|
||||
PROCESS_TYPE_WORKER = "worker"
|
||||
PROCESS_TYPE_LOCAL_SCHEDULER = "local_scheduler"
|
||||
PROCESS_TYPE_PLASMA_MANAGER = "plasma_manager"
|
||||
@@ -34,13 +35,14 @@ PROCESS_TYPE_WEB_UI = "web_ui"
|
||||
# important because it determines the order in which these processes will be
|
||||
# terminated when Ray exits, and certain orders will cause errors to be logged
|
||||
# to the screen.
|
||||
all_processes = OrderedDict([(PROCESS_TYPE_WORKER, []),
|
||||
all_processes = OrderedDict([(PROCESS_TYPE_MONITOR, []),
|
||||
(PROCESS_TYPE_WORKER, []),
|
||||
(PROCESS_TYPE_LOCAL_SCHEDULER, []),
|
||||
(PROCESS_TYPE_PLASMA_MANAGER, []),
|
||||
(PROCESS_TYPE_PLASMA_STORE, []),
|
||||
(PROCESS_TYPE_GLOBAL_SCHEDULER, []),
|
||||
(PROCESS_TYPE_REDIS_SERVER, []),
|
||||
(PROCESS_TYPE_WEB_UI, [])])
|
||||
(PROCESS_TYPE_WEB_UI, [])],)
|
||||
|
||||
# True if processes are run in the valgrind profiler.
|
||||
RUN_LOCAL_SCHEDULER_PROFILER = False
|
||||
@@ -527,7 +529,7 @@ def start_worker(node_ip_address, object_store_name, object_store_manager_name,
|
||||
object_store_name (str): The name of the object store.
|
||||
object_store_manager_name (str): The name of the object store manager.
|
||||
local_scheduler_name (str): The name of the local scheduler.
|
||||
redis_address (int): The address that the Redis server is listening on.
|
||||
redis_address (str): The address that the Redis server is listening on.
|
||||
worker_path (str): The path of the source code which the worker process will
|
||||
run.
|
||||
stdout_file: A file handle opened for writing to redirect stdout to. If no
|
||||
@@ -549,6 +551,28 @@ def start_worker(node_ip_address, object_store_name, object_store_manager_name,
|
||||
if cleanup:
|
||||
all_processes[PROCESS_TYPE_WORKER].append(p)
|
||||
|
||||
def start_monitor(redis_address, stdout_file=None, stderr_file=None,
                  cleanup=True):
    """Run a process to monitor the other processes.

    The monitor is launched as a separate Python process running monitor.py
    from the same directory as this module.

    Args:
        redis_address (str): The address that the Redis server is listening
            on.
        stdout_file: A file handle opened for writing to redirect stdout to.
            If no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to.
            If no redirection should happen, then this should be None.
        cleanup (bool): True if using Ray in local mode. If cleanup is true,
            then this process will be killed by services.cleanup() when the
            Python process that imported services exits. This is True by
            default.
    """
    monitor_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "monitor.py")
    command = ["python",
               monitor_path,
               "--redis-address=" + str(redis_address)]
    p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
    if cleanup:
        # Register the process under the monitor's own type rather than the
        # worker type, so the bookkeeping in all_processes reflects what the
        # process actually is and it is terminated in the monitor's slot of
        # the shutdown order.
        all_processes[PROCESS_TYPE_MONITOR].append(p)
|
||||
|
||||
def start_ray_processes(address_info=None,
|
||||
node_ip_address="127.0.0.1",
|
||||
num_workers=0,
|
||||
@@ -641,6 +665,11 @@ def start_ray_processes(address_info=None,
|
||||
stderr_file=redis_stderr_file,
|
||||
cleanup=cleanup)
|
||||
assert redis_port == new_redis_port
|
||||
# Start monitoring the processes.
|
||||
monitor_stdout_file, monitor_stderr_file = new_log_files("monitor", redirect_output)
|
||||
start_monitor(redis_address,
|
||||
stdout_file=monitor_stdout_file,
|
||||
stderr_file=monitor_stderr_file)
|
||||
else:
|
||||
if redis_address is None:
|
||||
raise Exception("Redis address expected")
|
||||
|
||||
@@ -958,9 +958,13 @@ def cleanup(worker=global_worker):
|
||||
{"end_time": time.time()})
|
||||
services.cleanup()
|
||||
else:
|
||||
# If this is not a driver, make sure there are no orphan processes.
|
||||
# If this is not a driver, make sure there are no orphan processes, besides
|
||||
# possibly the worker itself.
|
||||
for process_type, processes in services.all_processes.items():
|
||||
assert(len(processes) == 0)
|
||||
if process_type == services.PROCESS_TYPE_WORKER:
|
||||
assert(len(processes)) <= 1
|
||||
else:
|
||||
assert(len(processes) == 0)
|
||||
|
||||
worker.set_mode(None)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user