mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 17:18:45 +08:00
[xray] Monitor for Raylet processes (#1831)
* Add raylet monitor script to time out Raylet heartbeats * Unit test for removing a different client from the client table * Set node manager heartbeat according to global config * Doc and fixes * Add regression test for client table disconnect, refactor client table * Fix linting.
This commit is contained in:
committed by
Robert Nishihara
parent
0d9a7a3c19
commit
cbf3181fd2
+34
-2
@@ -77,7 +77,10 @@ CREDIS_MEMBER_MODULE = os.path.join(
|
||||
os.path.abspath(os.path.dirname(__file__)),
|
||||
"core/src/credis/build/src/libmember.so")
|
||||
|
||||
# Location of the raylet executable.
|
||||
# Location of the raylet executables.
|
||||
RAYLET_MONITOR_EXECUTABLE = os.path.join(
|
||||
os.path.abspath(os.path.dirname(__file__)),
|
||||
"core/src/ray/raylet/raylet_monitor")
|
||||
RAYLET_EXECUTABLE = os.path.join(
|
||||
os.path.abspath(os.path.dirname(__file__)),
|
||||
"core/src/ray/raylet/raylet")
|
||||
@@ -1112,11 +1115,35 @@ def start_monitor(redis_address, node_ip_address, stdout_file=None,
|
||||
command.append("--autoscaling-config=" + str(autoscaling_config))
|
||||
p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
|
||||
if cleanup:
|
||||
all_processes[PROCESS_TYPE_WORKER].append(p)
|
||||
all_processes[PROCESS_TYPE_MONITOR].append(p)
|
||||
record_log_files_in_redis(redis_address, node_ip_address,
|
||||
[stdout_file, stderr_file])
|
||||
|
||||
|
||||
def start_raylet_monitor(redis_address, stdout_file=None,
                         stderr_file=None, cleanup=True):
    """Launch the raylet monitor executable as a child process.

    The Redis address is split on ":" into a host and a port, and both are
    handed to the executable at RAYLET_MONITOR_EXECUTABLE as positional
    command-line arguments.

    Args:
        redis_address (str): The address that the Redis server is listening
            on. It must contain exactly one ":" separating host and port,
            otherwise unpacking the split result raises ValueError.
        stdout_file: A file handle opened for writing to redirect stdout to.
            If no redirection should happen, then this should be None.
        stderr_file: A file handle opened for writing to redirect stderr to.
            If no redirection should happen, then this should be None.
        cleanup (bool): True if using Ray in local mode. When true, the
            spawned process is registered under PROCESS_TYPE_MONITOR in
            all_processes so that services.cleanup() will kill it when the
            Python process that imported services exits. Defaults to True.
    """
    host, port = redis_address.split(":")
    monitor_process = subprocess.Popen(
        [RAYLET_MONITOR_EXECUTABLE, host, port],
        stdout=stdout_file,
        stderr=stderr_file)
    if not cleanup:
        # Caller manages the process lifetime; nothing to register.
        return
    all_processes[PROCESS_TYPE_MONITOR].append(monitor_process)
|
||||
|
||||
|
||||
def start_ray_processes(address_info=None,
|
||||
node_ip_address="127.0.0.1",
|
||||
redis_port=None,
|
||||
@@ -1253,6 +1280,11 @@ def start_ray_processes(address_info=None,
|
||||
stderr_file=monitor_stderr_file,
|
||||
cleanup=cleanup,
|
||||
autoscaling_config=autoscaling_config)
|
||||
if use_raylet:
|
||||
start_raylet_monitor(redis_address,
|
||||
stdout_file=monitor_stdout_file,
|
||||
stderr_file=monitor_stderr_file,
|
||||
cleanup=cleanup)
|
||||
|
||||
if redis_shards == []:
|
||||
# Get redis shards from primary redis instance.
|
||||
|
||||
@@ -23,6 +23,7 @@ ray_files = [
|
||||
"ray/core/src/local_scheduler/local_scheduler",
|
||||
"ray/core/src/local_scheduler/liblocal_scheduler_library.so",
|
||||
"ray/core/src/global_scheduler/global_scheduler",
|
||||
"ray/core/src/ray/raylet/raylet_monitor",
|
||||
"ray/core/src/ray/raylet/raylet",
|
||||
"ray/WebUI.ipynb"
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user