Mirror of https://github.com/wassname/ray.git (synced 2026-06-28 19:01:10 +08:00)
Use gcs server to replace raylet monitor when RAY_GCS_SERVICE_ENABLED=true (#7166)
This commit is contained in:
+3
-1
@@ -620,9 +620,11 @@ class Node:
|
||||
|
||||
if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None):
|
||||
self.start_gcs_server()
|
||||
else:
|
||||
self.start_raylet_monitor()
|
||||
|
||||
self.start_monitor()
|
||||
self.start_raylet_monitor()
|
||||
|
||||
if self._ray_params.include_webui:
|
||||
self.start_dashboard(require_webui=True)
|
||||
elif self._ray_params.include_webui is None:
|
||||
|
||||
@@ -82,10 +82,10 @@ def test_driver_lives_sequential(ray_start_regular):
|
||||
ray.worker._global_node.kill_plasma_store()
|
||||
ray.worker._global_node.kill_log_monitor()
|
||||
ray.worker._global_node.kill_monitor()
|
||||
ray.worker._global_node.kill_raylet_monitor()
|
||||
|
||||
if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None):
|
||||
ray.worker._global_node.kill_gcs_server()
|
||||
else:
|
||||
ray.worker._global_node.kill_raylet_monitor()
|
||||
|
||||
# If the driver can reach the tearDown method, then it is still alive.
|
||||
|
||||
@@ -97,14 +97,11 @@ def test_driver_lives_parallel(ray_start_regular):
|
||||
all_processes = ray.worker._global_node.all_processes
|
||||
|
||||
if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None):
|
||||
process_infos = (
|
||||
all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_RAYLET] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_MONITOR] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR])
|
||||
assert len(process_infos) == 6
|
||||
process_infos = (all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE]
|
||||
+ all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER]
|
||||
+ all_processes[ray_constants.PROCESS_TYPE_RAYLET] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR]
|
||||
+ all_processes[ray_constants.PROCESS_TYPE_MONITOR])
|
||||
else:
|
||||
process_infos = (
|
||||
all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] +
|
||||
@@ -112,7 +109,7 @@ def test_driver_lives_parallel(ray_start_regular):
|
||||
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_MONITOR] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR])
|
||||
assert len(process_infos) == 5
|
||||
assert len(process_infos) == 5
|
||||
|
||||
# Kill all the components in parallel.
|
||||
for process_info in process_infos:
|
||||
|
||||
@@ -132,7 +132,10 @@ def test_driver_lives_sequential(ray_start_regular):
|
||||
ray.worker._global_node.kill_plasma_store()
|
||||
ray.worker._global_node.kill_log_monitor()
|
||||
ray.worker._global_node.kill_monitor()
|
||||
ray.worker._global_node.kill_raylet_monitor()
|
||||
if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None):
|
||||
ray.worker._global_node.kill_gcs_server()
|
||||
else:
|
||||
ray.worker._global_node.kill_raylet_monitor()
|
||||
|
||||
# If the driver can reach the tearDown method, then it is still alive.
|
||||
|
||||
@@ -142,11 +145,19 @@ def test_driver_lives_sequential(ray_start_regular):
|
||||
reason="Hanging with new GCS API.")
|
||||
def test_driver_lives_parallel(ray_start_regular):
|
||||
all_processes = ray.worker._global_node.all_processes
|
||||
process_infos = (all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_RAYLET] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_MONITOR] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR])
|
||||
if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None):
|
||||
process_infos = (all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE]
|
||||
+ all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER]
|
||||
+ all_processes[ray_constants.PROCESS_TYPE_RAYLET] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR]
|
||||
+ all_processes[ray_constants.PROCESS_TYPE_MONITOR])
|
||||
else:
|
||||
process_infos = (
|
||||
all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_RAYLET] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_MONITOR] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR])
|
||||
assert len(process_infos) == 5
|
||||
|
||||
# Kill all the components in parallel.
|
||||
|
||||
@@ -101,13 +101,15 @@ def test_raylet_tempfiles(shutdown_only):
|
||||
log_files = set(os.listdir(node.get_logs_dir_path()))
|
||||
log_files_expected = {
|
||||
"log_monitor.out", "log_monitor.err", "plasma_store.out",
|
||||
"plasma_store.err", "monitor.out", "monitor.err", "raylet_monitor.out",
|
||||
"raylet_monitor.err", "redis-shard_0.out", "redis-shard_0.err",
|
||||
"redis.out", "redis.err", "raylet.out", "raylet.err"
|
||||
"plasma_store.err", "monitor.out", "monitor.err", "redis-shard_0.out",
|
||||
"redis-shard_0.err", "redis.out", "redis.err", "raylet.out",
|
||||
"raylet.err"
|
||||
}
|
||||
|
||||
if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None):
|
||||
log_files_expected.update({"gcs_server.out", "gcs_server.err"})
|
||||
else:
|
||||
log_files_expected.update({"raylet_monitor.out", "raylet_monitor.err"})
|
||||
|
||||
assert log_files.issuperset(log_files_expected)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user