Availability after local scheduler failure (#329)

* Clean up plasma subscribers on EPIPE

First pass at a monitoring script - monitor can detect local scheduler death

Clean up task table upon local scheduler death in monitoring script

Don't schedule to dead local schedulers in global scheduler

Have global scheduler update the db clients table, monitor script cleans up state

Documentation

Monitor script should scan tables before beginning to read from subscription channel

Fix for python3

Redirect monitor output to redis logs, fix hanging in multinode tests

* Publish auxiliary addresses as part of db_client deletion notifications

* Fix test case?

* Small changes.

* Use SCAN instead of KEYS

* Address comments

* Address more comments

* Free redis module strings
This commit is contained in:
Stephanie Wang
2017-03-02 19:51:20 -08:00
committed by Robert Nishihara
parent 4f9e74469e
commit 41b8675d04
19 changed files with 606 additions and 75 deletions
+2
View File
@@ -6,6 +6,8 @@ def get_local_schedulers(worker):
local_schedulers = []
for client in worker.redis_client.keys("CL:*"):
client_info = worker.redis_client.hgetall(client)
if b"client_type" not in client_info:
continue
if client_info[b"client_type"] == b"local_scheduler":
local_schedulers.append(client_info)
return local_schedulers
+187
View File
@@ -0,0 +1,187 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import binascii
from collections import Counter
import logging
import redis
import time
from ray.services import get_ip_address
from ray.services import get_port
# These variables must be kept in sync with the C codebase.
# common/common.h
# Number of leading bytes of a db_clients notification that hold the client ID.
DB_CLIENT_ID_SIZE = 20
# Sentinel ID consisting of DB_CLIENT_ID_SIZE bytes of 0xff.
NIL_ID = b"\xff" * DB_CLIENT_ID_SIZE
# common/task.h
# Status written to the task table for tasks whose local scheduler is gone.
TASK_STATUS_LOST = 32
# common/redis_module/ray_redis_module.c
# Redis key prefixes of task table entries and database client entries.
TASK_PREFIX = "TT:"
DB_CLIENT_PREFIX = "CL:"
# Name of the pubsub channel that carries database client updates.
DB_CLIENT_TABLE_NAME = b"db_clients"
# local_scheduler/local_scheduler.h
# Also used (divided by 1000) as the monitor's polling interval in seconds.
LOCAL_SCHEDULER_HEARTBEAT_TIMEOUT_MILLISECONDS = 100
# Value of the "client_type" field that identifies a local scheduler.
LOCAL_SCHEDULER_CLIENT_TYPE = b"local_scheduler"
# Set up logging.
logging.basicConfig()
log = logging.getLogger()
class Monitor(object):
    """A monitor for Ray processes.

    The monitor is in charge of cleaning up the tables in the global state
    after processes have died. The monitor is currently not responsible for
    detecting component failures.

    Attributes:
        redis: A connection to the Redis server.
        subscribe_client: A pubsub client for the Redis server. This is used
            to receive notifications about failed components.
        local_schedulers: A set of the local scheduler IDs of all of the
            currently live local schedulers in the cluster. In addition, this
            also includes NIL_ID.
    """

    def __init__(self, redis_address, redis_port):
        """Connect to Redis and initialize the set of live local schedulers.

        Args:
            redis_address: The host of the Redis server.
            redis_port: The port of the Redis server.
        """
        self.redis = redis.StrictRedis(host=redis_address, port=redis_port,
                                       db=0)
        self.subscribe_client = self.redis.pubsub()
        # Initialize data structures to keep track of the active database
        # clients.
        self.local_schedulers = set()
        # Add the NIL_ID so that we don't accidentally mark tasks that aren't
        # associated with a node as LOST during cleanup.
        self.local_schedulers.add(NIL_ID)

    def subscribe(self):
        """Subscribe to the db_clients channel.

        Blocks, polling once per heartbeat timeout, until the first message
        arrives on the subscription.

        Raises:
            Exception: An exception is raised if the subscription fails.
        """
        self.subscribe_client.subscribe(DB_CLIENT_TABLE_NAME)
        # Wait for the first message to signal that the subscription was
        # successful.
        while True:
            message = self.subscribe_client.get_message()
            if message is not None:
                break
            time.sleep(LOCAL_SCHEDULER_HEARTBEAT_TIMEOUT_MILLISECONDS / 1000)
        # The first message's payload should be the index of our subscription.
        if "data" not in message:
            # The exception must actually be raised; merely constructing it
            # would silently ignore a failed subscription.
            raise Exception("Unable to subscribe to local scheduler table.")

    def read_message(self):
        """Read a message from the db_clients channel.

        Returns:
            None if there was no message to read. Otherwise, a tuple of
            (db_client_id, client_type, auxiliary_address, is_insertion) is
            returned. The value is_insertion is a bool that is true if the
            update to the db_clients table was an insertion and false if it
            was a deletion.

        Raises:
            Exception: An exception is raised if the insertion field is
                neither 0 nor 1.
        """
        message = self.subscribe_client.get_message()
        if message is None:
            return None
        # Parse the message. The payload starts with the fixed-size client ID,
        # followed by one separator byte and three space-separated fields.
        data = message["data"]
        db_client_id = data[:DB_CLIENT_ID_SIZE]
        fields = data[DB_CLIENT_ID_SIZE + 1:].split(b" ")
        client_type, auxiliary_address, is_insertion = fields
        is_insertion = int(is_insertion)
        if is_insertion not in (0, 1):
            raise Exception("Expected 0 or 1 for insertion field, got {} "
                            "instead".format(is_insertion))
        return db_client_id, client_type, auxiliary_address, bool(is_insertion)

    def cleanup_task_table(self):
        """Clean up global state for failed local schedulers.

        This marks any tasks that were scheduled on dead local schedulers as
        TASK_STATUS_LOST. A local scheduler is deemed dead if it is not in
        self.local_schedulers.
        """
        task_keys = self.redis.scan_iter(
            match="{prefix}*".format(prefix=TASK_PREFIX))
        for task_key in task_keys:
            task_id = task_key[len(TASK_PREFIX):]
            response = self.redis.execute_command("RAY.TASK_TABLE_GET",
                                                  task_id)
            # The second field of the response is compared against the set of
            # live local scheduler IDs.
            if response[1] not in self.local_schedulers:
                ok = self.redis.execute_command("RAY.TASK_TABLE_UPDATE",
                                                task_id,
                                                TASK_STATUS_LOST,
                                                NIL_ID)
                if ok != b"OK":
                    log.warning(
                        "Failed to update lost task for dead scheduler.")

    def scan_db_client_table(self):
        """Scan the database client table for the current clients.

        After subscribing to the client table, it's necessary to call this
        before reading any messages from the subscription channel.
        """
        # Use an incremental SCAN rather than KEYS so that Redis is not
        # blocked while the whole keyspace is traversed.
        db_client_keys = self.redis.scan_iter(
            match="{prefix}*".format(prefix=DB_CLIENT_PREFIX))
        for db_client_key in db_client_keys:
            db_client_id = db_client_key[len(DB_CLIENT_PREFIX):]
            client_type = self.redis.hget(db_client_key, "client_type")
            if client_type == LOCAL_SCHEDULER_CLIENT_TYPE:
                self.local_schedulers.add(db_client_id)

    def run(self):
        """Run the monitor.

        This function loops forever, checking for messages about dead database
        clients and cleaning up state accordingly.
        """
        # Initialize the subscription channel.
        self.subscribe()
        # Scan the database table and clean up any state associated with
        # clients not in the database table. NOTE: This must be called before
        # reading any messages from the subscription channel. This ensures
        # that we start in a consistent state, since we may have missed
        # notifications that were sent before we connected to the subscription
        # channel.
        self.scan_db_client_table()
        self.cleanup_task_table()
        log.debug("Scanned schedulers: {}".format(self.local_schedulers))
        # Read messages from the subscription channel.
        while True:
            time.sleep(LOCAL_SCHEDULER_HEARTBEAT_TIMEOUT_MILLISECONDS / 1000)
            client = self.read_message()
            # There was no message to be read.
            if client is None:
                continue
            db_client_id, client_type, auxiliary_address, is_insertion = client
            # Only local schedulers are tracked, matching what
            # scan_db_client_table records; updates about other client types
            # are ignored.
            if client_type != LOCAL_SCHEDULER_CLIENT_TYPE:
                continue
            # If the update was an insertion, record the client ID.
            if is_insertion:
                self.local_schedulers.add(db_client_id)
                log.debug("Added scheduler: {}".format(db_client_id))
                continue
            # If the update was a deletion, clean up global state.
            if db_client_id in self.local_schedulers:
                log.warning("Removed scheduler: {}".format(db_client_id))
                self.local_schedulers.remove(db_client_id)
                self.cleanup_task_table()
if __name__ == "__main__":
    # Read the Redis address from the command line, split it into host and
    # port, and run the monitor against that server until the process exits.
    arg_parser = argparse.ArgumentParser(
        description="Parse Redis server for the monitor to connect to.")
    arg_parser.add_argument("--redis-address",
                            required=True,
                            type=str,
                            help="the address to use for Redis")
    cli_args = arg_parser.parse_args()
    host = get_ip_address(cli_args.redis_address)
    port = get_port(cli_args.redis_address)
    Monitor(host, port).run()
+32 -3
View File
@@ -21,6 +21,7 @@ import ray.local_scheduler as local_scheduler
import ray.plasma as plasma
import ray.global_scheduler as global_scheduler
PROCESS_TYPE_MONITOR = "monitor"
PROCESS_TYPE_WORKER = "worker"
PROCESS_TYPE_LOCAL_SCHEDULER = "local_scheduler"
PROCESS_TYPE_PLASMA_MANAGER = "plasma_manager"
@@ -34,13 +35,14 @@ PROCESS_TYPE_WEB_UI = "web_ui"
# important because it determines the order in which these processes will be
# terminated when Ray exits, and certain orders will cause errors to be logged
# to the screen.
all_processes = OrderedDict([(PROCESS_TYPE_WORKER, []),
all_processes = OrderedDict([(PROCESS_TYPE_MONITOR, []),
(PROCESS_TYPE_WORKER, []),
(PROCESS_TYPE_LOCAL_SCHEDULER, []),
(PROCESS_TYPE_PLASMA_MANAGER, []),
(PROCESS_TYPE_PLASMA_STORE, []),
(PROCESS_TYPE_GLOBAL_SCHEDULER, []),
(PROCESS_TYPE_REDIS_SERVER, []),
(PROCESS_TYPE_WEB_UI, [])])
(PROCESS_TYPE_WEB_UI, [])],)
# True if processes are run in the valgrind profiler.
RUN_LOCAL_SCHEDULER_PROFILER = False
@@ -527,7 +529,7 @@ def start_worker(node_ip_address, object_store_name, object_store_manager_name,
object_store_name (str): The name of the object store.
object_store_manager_name (str): The name of the object store manager.
local_scheduler_name (str): The name of the local scheduler.
redis_address (int): The address that the Redis server is listening on.
redis_address (str): The address that the Redis server is listening on.
worker_path (str): The path of the source code which the worker process will
run.
stdout_file: A file handle opened for writing to redirect stdout to. If no
@@ -549,6 +551,28 @@ def start_worker(node_ip_address, object_store_name, object_store_manager_name,
if cleanup:
all_processes[PROCESS_TYPE_WORKER].append(p)
def start_monitor(redis_address, stdout_file=None, stderr_file=None,
                  cleanup=True):
    """Run a process to monitor the other processes.

    The monitor is launched as a separate Python process that runs the
    monitor.py script located next to this file.

    Args:
        redis_address (str): The address that the Redis server is listening
            on.
        stdout_file: A file handle opened for writing to redirect stdout to.
            If no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to.
            If no redirection should happen, then this should be None.
        cleanup (bool): True if using Ray in local mode. If cleanup is true,
            then this process will be killed by services.cleanup() when the
            Python process that imported services exits. This is True by
            default.
    """
    monitor_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "monitor.py")
    command = ["python",
               monitor_path,
               "--redis-address=" + str(redis_address)]
    p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
    if cleanup:
        # Track the process under the monitor's own process type so that it
        # is not counted as a worker.
        all_processes[PROCESS_TYPE_MONITOR].append(p)
def start_ray_processes(address_info=None,
node_ip_address="127.0.0.1",
num_workers=0,
@@ -641,6 +665,11 @@ def start_ray_processes(address_info=None,
stderr_file=redis_stderr_file,
cleanup=cleanup)
assert redis_port == new_redis_port
# Start monitoring the processes.
monitor_stdout_file, monitor_stderr_file = new_log_files("monitor", redirect_output)
start_monitor(redis_address,
stdout_file=monitor_stdout_file,
stderr_file=monitor_stderr_file)
else:
if redis_address is None:
raise Exception("Redis address expected")
+6 -2
View File
@@ -958,9 +958,13 @@ def cleanup(worker=global_worker):
{"end_time": time.time()})
services.cleanup()
else:
# If this is not a driver, make sure there are no orphan processes.
# If this is not a driver, make sure there are no orphan processes, besides
# possibly the worker itself.
for process_type, processes in services.all_processes.items():
assert(len(processes) == 0)
if process_type == services.PROCESS_TYPE_WORKER:
assert(len(processes)) <= 1
else:
assert(len(processes) == 0)
worker.set_mode(None)