diff --git a/dashboard/agent.py b/dashboard/agent.py index 84404987a..d76c17f2d 100644 --- a/dashboard/agent.py +++ b/dashboard/agent.py @@ -12,6 +12,7 @@ import traceback import aiohttp import aiohttp.web import aiohttp_cors +import psutil from aiohttp import hdrs from grpc.experimental import aio as aiogrpc @@ -23,7 +24,7 @@ import ray._private.services import ray.utils from ray.core.generated import agent_manager_pb2 from ray.core.generated import agent_manager_pb2_grpc -import psutil +from ray.ray_logging import setup_component_logger try: create_task = asyncio.create_task @@ -249,17 +250,17 @@ if __name__ == "__main__": "--logging-rotate-bytes", required=False, type=int, - default=dashboard_consts.LOGGING_ROTATE_BYTES, + default=ray_constants.LOGGING_ROTATE_BYTES, help="Specify the max bytes for rotating " "log file, default is {} bytes.".format( - dashboard_consts.LOGGING_ROTATE_BYTES)) + ray_constants.LOGGING_ROTATE_BYTES)) parser.add_argument( "--logging-rotate-backup-count", required=False, type=int, - default=dashboard_consts.LOGGING_ROTATE_BACKUP_COUNT, + default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT, help="Specify the backup count of rotated log file, default is {}.". - format(dashboard_consts.LOGGING_ROTATE_BACKUP_COUNT)) + format(ray_constants.LOGGING_ROTATE_BACKUP_COUNT)) parser.add_argument( "--log-dir", required=True, @@ -275,19 +276,13 @@ if __name__ == "__main__": args = parser.parse_args() try: - if args.logging_filename: - logging_handlers = [ - logging.handlers.RotatingFileHandler( - os.path.join(args.log_dir, args.logging_filename), - maxBytes=args.logging_rotate_bytes, - backupCount=args.logging_rotate_backup_count) - ] - else: - logging_handlers = None - logging.basicConfig( - level=args.logging_level, - format=args.logging_format, - handlers=logging_handlers) + setup_component_logger( + logging_level=args.logging_level, + logging_format=args.logging_format, + log_dir=args.log_dir, + filename=args.logging_filename, + max_bytes=args.logging_rotate_bytes, + backup_count=args.logging_rotate_backup_count) agent = DashboardAgent( args.node_ip_address, diff --git a/dashboard/consts.py b/dashboard/consts.py index 66f738bbb..3139dcc14 100644 --- a/dashboard/consts.py +++ b/dashboard/consts.py @@ -22,9 +22,6 @@ AIOHTTP_CACHE_DISABLE_ENVIRONMENT_KEY = "RAY_DASHBOARD_NO_CACHE" SIGNAL_NODE_INFO_FETCHED = "node_info_fetched" SIGNAL_NODE_SUMMARY_FETCHED = "node_summary_fetched" SIGNAL_WORKER_INFO_FETCHED = "worker_info_fetched" -# Default param for RotatingFileHandler -LOGGING_ROTATE_BYTES = 100 * 1000 * 1000 # maxBytes -LOGGING_ROTATE_BACKUP_COUNT = 5 # backupCount # Default value for datacenter (the default value in protobuf) DEFAULT_LANGUAGE = "PYTHON" DEFAULT_JOB_ID = "ffff" diff --git a/dashboard/dashboard.py b/dashboard/dashboard.py index e5188fb97..5fecebdb5 100644 --- a/dashboard/dashboard.py +++ b/dashboard/dashboard.py @@ -20,7 +20,9 @@ import ray.new_dashboard.utils as dashboard_utils import ray.ray_constants as ray_constants import ray._private.services import ray.utils +from ray.ray_logging import setup_component_logger from ray.metrics_agent import PrometheusServiceDiscoveryWriter + # Logger for this module. It should be configured at the entry point # into the program using Ray. Ray provides a default configuration at # entry/init points. @@ -148,17 +150,17 @@ if __name__ == "__main__": "--logging-rotate-bytes", required=False, type=int, - default=dashboard_consts.LOGGING_ROTATE_BYTES, + default=ray_constants.LOGGING_ROTATE_BYTES, help="Specify the max bytes for rotating " "log file, default is {} bytes.".format( - dashboard_consts.LOGGING_ROTATE_BYTES)) + ray_constants.LOGGING_ROTATE_BYTES)) parser.add_argument( "--logging-rotate-backup-count", required=False, type=int, - default=dashboard_consts.LOGGING_ROTATE_BACKUP_COUNT, + default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT, help="Specify the backup count of rotated log file, default is {}.". - format(dashboard_consts.LOGGING_ROTATE_BACKUP_COUNT)) + format(ray_constants.LOGGING_ROTATE_BACKUP_COUNT)) parser.add_argument( "--log-dir", required=True, @@ -174,19 +176,13 @@ if __name__ == "__main__": args = parser.parse_args() try: - if args.logging_filename: - logging_handlers = [ - logging.handlers.RotatingFileHandler( - os.path.join(args.log_dir, args.logging_filename), - maxBytes=args.logging_rotate_bytes, - backupCount=args.logging_rotate_backup_count) - ] - else: - logging_handlers = None - logging.basicConfig( - level=args.logging_level, - format=args.logging_format, - handlers=logging_handlers) + setup_component_logger( + logging_level=args.logging_level, + logging_format=args.logging_format, + log_dir=args.log_dir, + filename=args.logging_filename, + max_bytes=args.logging_rotate_bytes, + backup_count=args.logging_rotate_backup_count) dashboard = Dashboard( args.host, diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index 8205534c1..9ddc8f3b7 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -1784,6 +1784,7 @@ def start_worker(node_ip_address, def start_monitor(redis_address, + logs_dir, stdout_file=None, stderr_file=None, autoscaling_config=None, @@ -1793,6 +1794,7 @@ def start_monitor(redis_address, Args: redis_address (str): The address that the Redis server is listening on. + logs_dir(str): The path to the log directory. stdout_file: A file handle opened for writing to redirect stdout to. If no redirection should happen, then this should be None. stderr_file: A file handle opened for writing to redirect stderr to. If @@ -1808,6 +1810,7 @@ def start_monitor(redis_address, sys.executable, "-u", monitor_path, + f"--logs-dir={logs_dir}", "--redis-address=" + str(redis_address), ] if autoscaling_config: diff --git a/python/ray/log_monitor.py b/python/ray/log_monitor.py index 0f5ecfeac..4932b7338 100644 --- a/python/ray/log_monitor.py +++ b/python/ray/log_monitor.py @@ -3,6 +3,7 @@ import errno import glob import json import logging +import logging.handlers import os import platform import re @@ -13,6 +14,7 @@ import traceback import ray.ray_constants as ray_constants import ray._private.services as services import ray.utils +from ray.ray_logging import setup_component_logger # Logger for this module. It should be configured at the entry point # into the program using Ray. Ray provides a default configuration at @@ -307,14 +309,43 @@ if __name__ == "__main__": type=str, default=ray_constants.LOGGER_FORMAT, help=ray_constants.LOGGER_FORMAT_HELP) + parser.add_argument( + "--logging-filename", + required=False, + type=str, + default=ray_constants.LOG_MONITOR_LOG_FILE_NAME, + help="Specify the name of log file, " + "log to stdout if set empty, default is " + f"\"{ray_constants.LOG_MONITOR_LOG_FILE_NAME}\"") parser.add_argument( "--logs-dir", required=True, type=str, help="Specify the path of the temporary directory used by Ray " "processes.") + parser.add_argument( + "--logging-rotate-bytes", + required=False, + type=int, + default=ray_constants.LOGGING_ROTATE_BYTES, + help="Specify the max bytes for rotating " + "log file, default is " + f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.") + parser.add_argument( + "--logging-rotate-backup-count", + required=False, + type=int, + default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT, + help="Specify the backup count of rotated log file, default is " + f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.") args = parser.parse_args() - ray.ray_logging.setup_logger(args.logging_level, args.logging_format) + setup_component_logger( + logging_level=args.logging_level, + logging_format=args.logging_format, + log_dir=args.logs_dir, + filename=args.logging_filename, + max_bytes=args.logging_rotate_bytes, + backup_count=args.logging_rotate_backup_count) log_monitor = LogMonitor( args.logs_dir, args.redis_address, redis_password=args.redis_password) diff --git a/python/ray/monitor.py b/python/ray/monitor.py index 7c0e0c06b..5f93e535a 100644 --- a/python/ray/monitor.py +++ b/python/ray/monitor.py @@ -2,6 +2,7 @@ import argparse import logging +import logging.handlers import os import time import traceback @@ -17,7 +18,7 @@ from ray.autoscaler._private.constants import \ import ray.gcs_utils import ray.utils import ray.ray_constants as ray_constants -from ray.ray_logging import setup_logger +from ray.ray_logging import setup_component_logger from ray._raylet import GlobalStateAccessor import redis @@ -340,8 +341,43 @@ if __name__ == "__main__": type=str, default=ray_constants.LOGGER_FORMAT, help=ray_constants.LOGGER_FORMAT_HELP) + parser.add_argument( + "--logging-filename", + required=False, + type=str, + default=ray_constants.MONITOR_LOG_FILE_NAME, + help="Specify the name of log file, " + "log to stdout if set empty, default is " + f"\"{ray_constants.MONITOR_LOG_FILE_NAME}\"") + parser.add_argument( + "--logs-dir", + required=True, + type=str, + help="Specify the path of the temporary directory used by Ray " + "processes.") + parser.add_argument( + "--logging-rotate-bytes", + required=False, + type=int, + default=ray_constants.LOGGING_ROTATE_BYTES, + help="Specify the max bytes for rotating " + "log file, default is " + f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.") + parser.add_argument( + "--logging-rotate-backup-count", + required=False, + type=int, + default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT, + help="Specify the backup count of rotated log file, default is " + f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.") args = parser.parse_args() - setup_logger(args.logging_level, args.logging_format) + setup_component_logger( + logging_level=args.logging_level, + logging_format=args.logging_format, + log_dir=args.logs_dir, + filename=args.logging_filename, + max_bytes=args.logging_rotate_bytes, + backup_count=args.logging_rotate_backup_count) if args.autoscaling_config: autoscaling_config = os.path.expanduser(args.autoscaling_config) diff --git a/python/ray/node.py b/python/ray/node.py index b2a71404d..13d73956c 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -602,13 +602,11 @@ class Node: def start_log_monitor(self): """Start the log monitor.""" - stdout_file, stderr_file = self.get_log_file_handles( - "log_monitor", unique=True) process_info = ray._private.services.start_log_monitor( self.redis_address, self._logs_dir, - stdout_file=stdout_file, - stderr_file=stderr_file, + stdout_file=subprocess.DEVNULL, + stderr_file=subprocess.DEVNULL, redis_password=self._ray_params.redis_password, fate_share=self.kernel_fate_share) assert ray_constants.PROCESS_TYPE_LOG_MONITOR not in self.all_processes @@ -742,12 +740,11 @@ class Node: def start_monitor(self): """Start the monitor.""" - stdout_file, stderr_file = self.get_log_file_handles( - "monitor", unique=True) process_info = ray._private.services.start_monitor( self._redis_address, - stdout_file=stdout_file, - stderr_file=stderr_file, + self._logs_dir, + stdout_file=subprocess.DEVNULL, + stderr_file=subprocess.DEVNULL, autoscaling_config=self._ray_params.autoscaling_config, redis_password=self._ray_params.redis_password, fate_share=self.kernel_fate_share) diff --git a/python/ray/ray_constants.py b/python/ray/ray_constants.py index f0c7fce69..a7e08112b 100644 --- a/python/ray/ray_constants.py +++ b/python/ray/ray_constants.py @@ -150,13 +150,21 @@ LOGGER_LEVEL = "info" LOGGER_LEVEL_CHOICES = ["debug", "info", "warning", "error", "critical"] LOGGER_LEVEL_HELP = ("The logging level threshold, choices=['debug', 'info'," " 'warning', 'error', 'critical'], default='info'") +# Default param for RotatingFileHandler +# maxBytes. 10G by default. We intentionally set the default value high +# so that users who won't care don't know about the existence of this. +LOGGING_ROTATE_BYTES = 10 * 1000 * 1000 * 1000 +# The default will grow logs up until 500GB without log loss. +LOGGING_ROTATE_BACKUP_COUNT = 50 # backupCount # Constants used to define the different process types. PROCESS_TYPE_REAPER = "reaper" PROCESS_TYPE_MONITOR = "monitor" PROCESS_TYPE_LOG_MONITOR = "log_monitor" +# TODO(sang): Delete it. PROCESS_TYPE_REPORTER = "reporter" PROCESS_TYPE_DASHBOARD = "dashboard" +PROCESS_TYPE_DASHBOARD_AGENT = "dashboard_agent" PROCESS_TYPE_WORKER = "worker" PROCESS_TYPE_RAYLET = "raylet" PROCESS_TYPE_PLASMA_STORE = "plasma_store" @@ -164,6 +172,10 @@ PROCESS_TYPE_REDIS_SERVER = "redis_server" PROCESS_TYPE_WEB_UI = "web_ui" PROCESS_TYPE_GCS_SERVER = "gcs_server" +# Log file names +MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_MONITOR}.log" +LOG_MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_LOG_MONITOR}.log" + WORKER_PROCESS_TYPE_IDLE_WORKER = "ray::IDLE" WORKER_PROCESS_TYPE_SPILL_WORKER_NAME = "SpillWorker" WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME = "RestoreWorker" diff --git a/python/ray/ray_logging.py b/python/ray/ray_logging.py index 52425e90f..e5f84eb2a 100644 --- a/python/ray/ray_logging.py +++ b/python/ray/ray_logging.py @@ -25,6 +25,36 @@ def setup_logger(logging_level, logging_format): logger.propagate = False +def setup_component_logger(*, logging_level, logging_format, log_dir, filename, + max_bytes, backup_count): + """Configure the root logger that is used for Ray's python components. + + For example, it should be used for monitor, dashboard, and log monitor. + The only exception is workers. They use the different logging config. + + Args: + logging_level(str | int): Logging level in string or logging enum. + logging_format(str): Logging format string. + log_dir(str): Log directory path. + filename(str): Name of the file to write logs. + max_bytes(int): Same argument as RotatingFileHandler's maxBytes. + backup_count(int): Same argument as RotatingFileHandler's backupCount. + """ + # Get the root logger. + logger = logging.getLogger("") + if type(logging_level) is str: + logging_level = logging.getLevelName(logging_level.upper()) + assert filename, "filename argument should not be None." + assert log_dir, "log_dir should not be None." + handler = logging.handlers.RotatingFileHandler( + os.path.join(log_dir, filename), + maxBytes=max_bytes, + backupCount=backup_count) + logger.setLevel(logging_level) + handler.setFormatter(logging.Formatter(logging_format)) + logger.addHandler(handler) + + """ All components underneath here is used specifically for the default_worker.py. """ diff --git a/python/ray/tests/test_tempfile.py b/python/ray/tests/test_tempfile.py index 2de26a739..616a45465 100644 --- a/python/ray/tests/test_tempfile.py +++ b/python/ray/tests/test_tempfile.py @@ -5,7 +5,7 @@ import time import pytest import ray -from ray.test_utils import check_call_ray +from ray.test_utils import check_call_ray, wait_for_condition def unix_socket_create_path(name): @@ -74,14 +74,23 @@ def test_raylet_tempfiles(shutdown_only): assert top_levels.issuperset({"sockets", "logs"}) log_files = set(os.listdir(node.get_logs_dir_path())) log_files_expected = { - "log_monitor.out", "log_monitor.err", "plasma_store.out", - "plasma_store.err", "monitor.out", "monitor.err", "redis-shard_0.out", - "redis-shard_0.err", "redis.out", "redis.err", "raylet.out", - "raylet.err", "gcs_server.out", "gcs_server.err" + "log_monitor.log", "plasma_store.out", "plasma_store.err", + "monitor.log", "redis-shard_0.out", "redis-shard_0.err", "redis.out", + "redis.err", "raylet.out", "raylet.err", "gcs_server.out", + "gcs_server.err", "dashboard.log", "dashboard_agent.log" } - for expected in log_files_expected: - assert expected in log_files + def check_all_log_file_exists(): + for expected in log_files_expected: + log_files = set(os.listdir(node.get_logs_dir_path())) + if expected not in log_files: + return False + return True + + wait_for_condition(check_all_log_file_exists) + # Get the list of log files again since the previous one + # might have the stale information. + log_files = set(os.listdir(node.get_logs_dir_path())) assert log_files_expected.issubset(log_files) assert log_files.issuperset(log_files_expected)