Use GCS server to replace raylet monitor when RAY_GCS_SERVICE_ENABLED=true (#7166)

This commit is contained in:
ZhuSenlin
2020-03-12 22:13:56 +08:00
committed by GitHub
parent 428fb79b27
commit b663bc6d67
17 changed files with 269 additions and 142 deletions
+3 -1
View File
@@ -620,9 +620,11 @@ class Node:
if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None):
self.start_gcs_server()
else:
self.start_raylet_monitor()
self.start_monitor()
self.start_raylet_monitor()
if self._ray_params.include_webui:
self.start_dashboard(require_webui=True)
elif self._ray_params.include_webui is None:
+8 -11
View File
@@ -82,10 +82,10 @@ def test_driver_lives_sequential(ray_start_regular):
ray.worker._global_node.kill_plasma_store()
ray.worker._global_node.kill_log_monitor()
ray.worker._global_node.kill_monitor()
ray.worker._global_node.kill_raylet_monitor()
if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None):
ray.worker._global_node.kill_gcs_server()
else:
ray.worker._global_node.kill_raylet_monitor()
# If the driver can reach the tearDown method, then it is still alive.
@@ -97,14 +97,11 @@ def test_driver_lives_parallel(ray_start_regular):
all_processes = ray.worker._global_node.all_processes
if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None):
process_infos = (
all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] +
all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER] +
all_processes[ray_constants.PROCESS_TYPE_RAYLET] +
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] +
all_processes[ray_constants.PROCESS_TYPE_MONITOR] +
all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR])
assert len(process_infos) == 6
process_infos = (all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE]
+ all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER]
+ all_processes[ray_constants.PROCESS_TYPE_RAYLET] +
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR]
+ all_processes[ray_constants.PROCESS_TYPE_MONITOR])
else:
process_infos = (
all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] +
@@ -112,7 +109,7 @@ def test_driver_lives_parallel(ray_start_regular):
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] +
all_processes[ray_constants.PROCESS_TYPE_MONITOR] +
all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR])
assert len(process_infos) == 5
assert len(process_infos) == 5
# Kill all the components in parallel.
for process_info in process_infos:
+17 -6
View File
@@ -132,7 +132,10 @@ def test_driver_lives_sequential(ray_start_regular):
ray.worker._global_node.kill_plasma_store()
ray.worker._global_node.kill_log_monitor()
ray.worker._global_node.kill_monitor()
ray.worker._global_node.kill_raylet_monitor()
if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None):
ray.worker._global_node.kill_gcs_server()
else:
ray.worker._global_node.kill_raylet_monitor()
# If the driver can reach the tearDown method, then it is still alive.
@@ -142,11 +145,19 @@ def test_driver_lives_sequential(ray_start_regular):
reason="Hanging with new GCS API.")
def test_driver_lives_parallel(ray_start_regular):
all_processes = ray.worker._global_node.all_processes
process_infos = (all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] +
all_processes[ray_constants.PROCESS_TYPE_RAYLET] +
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] +
all_processes[ray_constants.PROCESS_TYPE_MONITOR] +
all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR])
if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None):
process_infos = (all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE]
+ all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER]
+ all_processes[ray_constants.PROCESS_TYPE_RAYLET] +
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR]
+ all_processes[ray_constants.PROCESS_TYPE_MONITOR])
else:
process_infos = (
all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] +
all_processes[ray_constants.PROCESS_TYPE_RAYLET] +
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] +
all_processes[ray_constants.PROCESS_TYPE_MONITOR] +
all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR])
assert len(process_infos) == 5
# Kill all the components in parallel.
+5 -3
View File
@@ -101,13 +101,15 @@ def test_raylet_tempfiles(shutdown_only):
log_files = set(os.listdir(node.get_logs_dir_path()))
log_files_expected = {
"log_monitor.out", "log_monitor.err", "plasma_store.out",
"plasma_store.err", "monitor.out", "monitor.err", "raylet_monitor.out",
"raylet_monitor.err", "redis-shard_0.out", "redis-shard_0.err",
"redis.out", "redis.err", "raylet.out", "raylet.err"
"plasma_store.err", "monitor.out", "monitor.err", "redis-shard_0.out",
"redis-shard_0.err", "redis.out", "redis.err", "raylet.out",
"raylet.err"
}
if os.environ.get(ray_constants.RAY_GCS_SERVICE_ENABLED, None):
log_files_expected.update({"gcs_server.out", "gcs_server.err"})
else:
log_files_expected.update({"raylet_monitor.out", "raylet_monitor.err"})
assert log_files.issuperset(log_files_expected)