Add a web dashboard for monitoring node resource usage (#4066)

This commit is contained in:
Daniel Edgecumbe
2019-02-21 08:10:04 +00:00
committed by Robert Nishihara
parent 3ac8fd7ee8
commit 2e30f7ba38
13 changed files with 1239 additions and 44 deletions
+57 -2
View File
@@ -10,6 +10,7 @@ import json
import os
import logging
import signal
import sys
import tempfile
import threading
import time
@@ -24,6 +25,8 @@ from ray.utils import try_to_create_directory
# using logging.basicConfig in its entry/init points.
logger = logging.getLogger(__name__)
# True when the running interpreter is Python 3 or newer. Within this file it
# gates whether the dashboard and reporter processes are started.
PY3 = sys.version_info.major >= 3
class Node(object):
"""An encapsulation of the Ray processes on a single node.
@@ -81,6 +84,7 @@ class Node(object):
self._plasma_store_socket_name = None
self._raylet_socket_name = None
self._webui_url = None
self._dashboard_url = None
else:
self._plasma_store_socket_name = (
ray_params.plasma_store_socket_name)
@@ -284,6 +288,35 @@ class Node(object):
process_info
]
def start_reporter(self):
    """Launch the reporter process and record it on this node.

    The process is started via ``ray.services.start_reporter`` using this
    node's Redis address and password, with the stdout/stderr files
    obtained from ``self.new_log_files``. When a process handle is
    returned, it is stored in ``self.all_processes`` under
    ``ray_constants.PROCESS_TYPE_REPORTER``; a ``None`` handle is ignored.
    """
    out_log, err_log = self.new_log_files("reporter", True)
    reporter_info = ray.services.start_reporter(
        self.redis_address,
        stdout_file=out_log,
        stderr_file=err_log,
        redis_password=self._ray_params.redis_password)

    # A reporter must not already be registered for this node.
    reporter_key = ray_constants.PROCESS_TYPE_REPORTER
    assert reporter_key not in self.all_processes

    if reporter_info is None:
        return
    self.all_processes[reporter_key] = [reporter_info]
def start_dashboard(self):
    """Launch the dashboard process and record it on this node.

    The process is started via ``ray.services.start_dashboard`` using this
    node's Redis address, temp directory and Redis password, with the
    stdout/stderr files obtained from ``self.new_log_files``. The URL
    returned by that call is stored in ``self._dashboard_url``. When a
    process handle is returned, it is stored in ``self.all_processes``
    under ``ray_constants.PROCESS_TYPE_DASHBOARD``; a ``None`` handle is
    ignored.
    """
    out_log, err_log = self.new_log_files("dashboard", True)
    dashboard_url, dashboard_info = ray.services.start_dashboard(
        self.redis_address,
        self._temp_dir,
        stdout_file=out_log,
        stderr_file=err_log,
        redis_password=self._ray_params.redis_password)
    self._dashboard_url = dashboard_url

    # A dashboard must not already be registered for this node.
    dashboard_key = ray_constants.PROCESS_TYPE_DASHBOARD
    assert dashboard_key not in self.all_processes

    if dashboard_info is None:
        return
    self.all_processes[dashboard_key] = [dashboard_info]
def start_ui(self):
"""Start the web UI."""
stdout_file, stderr_file = self.new_log_files("webui")
@@ -408,14 +441,16 @@ class Node(object):
self.start_redis()
self.start_monitor()
self.start_raylet_monitor()
if PY3 and self._ray_params.include_webui:
self.start_dashboard()
self.start_plasma_store()
self.start_raylet()
if PY3 and self._ray_params.include_webui:
self.start_reporter()
if self._ray_params.include_log_monitor:
self.start_log_monitor()
if self._ray_params.include_webui:
self.start_ui()
def _kill_process_type(self,
process_type,
@@ -545,6 +580,26 @@ class Node(object):
self._kill_process_type(
ray_constants.PROCESS_TYPE_LOG_MONITOR, check_alive=check_alive)
def kill_reporter(self, check_alive=True):
    """Terminate the reporter process.

    Delegates to ``self._kill_process_type`` for the reporter process type.

    Args:
        check_alive (bool): Raise an exception if the process was already
            dead.
    """
    reporter_type = ray_constants.PROCESS_TYPE_REPORTER
    self._kill_process_type(reporter_type, check_alive=check_alive)
def kill_dashboard(self, check_alive=True):
    """Terminate the dashboard process.

    Delegates to ``self._kill_process_type`` for the dashboard process
    type.

    Args:
        check_alive (bool): Raise an exception if the process was already
            dead.
    """
    dashboard_type = ray_constants.PROCESS_TYPE_DASHBOARD
    self._kill_process_type(dashboard_type, check_alive=check_alive)
def kill_monitor(self, check_alive=True):
"""Kill the monitor.