[xray] Monitor for Raylet processes (#1831)

* Add raylet monitor script to timeout Raylet heartbeats

* Unit test for removing a different client from the client table

* Set node manager heartbeat according to global config

* Doc and fixes

* Add regression test for client table disconnect, refactor client table

* Fix linting.
This commit is contained in:
Stephanie Wang
2018-04-05 20:45:38 -07:00
committed by Robert Nishihara
parent 0d9a7a3c19
commit cbf3181fd2
11 changed files with 275 additions and 30 deletions
+34 -2
View File
@@ -77,7 +77,10 @@ CREDIS_MEMBER_MODULE = os.path.join(
os.path.abspath(os.path.dirname(__file__)),
"core/src/credis/build/src/libmember.so")
# Location of the raylet executable.
# Location of the raylet executables.
RAYLET_MONITOR_EXECUTABLE = os.path.join(
os.path.abspath(os.path.dirname(__file__)),
"core/src/ray/raylet/raylet_monitor")
RAYLET_EXECUTABLE = os.path.join(
os.path.abspath(os.path.dirname(__file__)),
"core/src/ray/raylet/raylet")
@@ -1112,11 +1115,35 @@ def start_monitor(redis_address, node_ip_address, stdout_file=None,
command.append("--autoscaling-config=" + str(autoscaling_config))
p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
if cleanup:
all_processes[PROCESS_TYPE_WORKER].append(p)
all_processes[PROCESS_TYPE_MONITOR].append(p)
record_log_files_in_redis(redis_address, node_ip_address,
[stdout_file, stderr_file])
def start_raylet_monitor(redis_address, stdout_file=None,
stderr_file=None, cleanup=True):
"""Run a process to monitor the other processes.
Args:
redis_address (str): The address that the Redis server is listening on.
stdout_file: A file handle opened for writing to redirect stdout to. If
no redirection should happen, then this should be None.
stderr_file: A file handle opened for writing to redirect stderr to. If
no redirection should happen, then this should be None.
cleanup (bool): True if using Ray in local mode. If cleanup is true,
then this process will be killed by services.cleanup() when the
Python process that imported services exits. This is True by
default.
"""
gcs_ip_address, gcs_port = redis_address.split(":")
command = [RAYLET_MONITOR_EXECUTABLE,
gcs_ip_address,
gcs_port]
p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
if cleanup:
all_processes[PROCESS_TYPE_MONITOR].append(p)
def start_ray_processes(address_info=None,
node_ip_address="127.0.0.1",
redis_port=None,
@@ -1253,6 +1280,11 @@ def start_ray_processes(address_info=None,
stderr_file=monitor_stderr_file,
cleanup=cleanup,
autoscaling_config=autoscaling_config)
if use_raylet:
start_raylet_monitor(redis_address,
stdout_file=monitor_stdout_file,
stderr_file=monitor_stderr_file,
cleanup=cleanup)
if redis_shards == []:
# Get redis shards from primary redis instance.
+1
View File
@@ -23,6 +23,7 @@ ray_files = [
"ray/core/src/local_scheduler/local_scheduler",
"ray/core/src/local_scheduler/liblocal_scheduler_library.so",
"ray/core/src/global_scheduler/global_scheduler",
"ray/core/src/ray/raylet/raylet_monitor",
"ray/core/src/ray/raylet/raylet",
"ray/WebUI.ipynb"
]