mirror of
https://github.com/wassname/ray.git
synced 2026-07-04 15:05:49 +08:00
[Logging] Log rotation on all components (#12101)
* In Progress. * Done. * Fix the issue. * Add wait for condition because logs are not written right away now. * debug string. * lint. * Fix flaky test. * Fix issues. * Fix test. * lint.
This commit is contained in:
@@ -1784,6 +1784,7 @@ def start_worker(node_ip_address,
|
||||
|
||||
|
||||
def start_monitor(redis_address,
|
||||
logs_dir,
|
||||
stdout_file=None,
|
||||
stderr_file=None,
|
||||
autoscaling_config=None,
|
||||
@@ -1793,6 +1794,7 @@ def start_monitor(redis_address,
|
||||
|
||||
Args:
|
||||
redis_address (str): The address that the Redis server is listening on.
|
||||
logs_dir(str): The path to the log directory.
|
||||
stdout_file: A file handle opened for writing to redirect stdout to. If
|
||||
no redirection should happen, then this should be None.
|
||||
stderr_file: A file handle opened for writing to redirect stderr to. If
|
||||
@@ -1808,6 +1810,7 @@ def start_monitor(redis_address,
|
||||
sys.executable,
|
||||
"-u",
|
||||
monitor_path,
|
||||
f"--logs-dir={logs_dir}",
|
||||
"--redis-address=" + str(redis_address),
|
||||
]
|
||||
if autoscaling_config:
|
||||
|
||||
@@ -3,6 +3,7 @@ import errno
|
||||
import glob
|
||||
import json
|
||||
import logging
|
||||
import logging.handlers
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
@@ -13,6 +14,7 @@ import traceback
|
||||
import ray.ray_constants as ray_constants
|
||||
import ray._private.services as services
|
||||
import ray.utils
|
||||
from ray.ray_logging import setup_component_logger
|
||||
|
||||
# Logger for this module. It should be configured at the entry point
|
||||
# into the program using Ray. Ray provides a default configuration at
|
||||
@@ -307,14 +309,43 @@ if __name__ == "__main__":
|
||||
type=str,
|
||||
default=ray_constants.LOGGER_FORMAT,
|
||||
help=ray_constants.LOGGER_FORMAT_HELP)
|
||||
parser.add_argument(
|
||||
"--logging-filename",
|
||||
required=False,
|
||||
type=str,
|
||||
default=ray_constants.LOG_MONITOR_LOG_FILE_NAME,
|
||||
help="Specify the name of log file, "
|
||||
"log to stdout if set empty, default is "
|
||||
f"\"{ray_constants.LOG_MONITOR_LOG_FILE_NAME}\"")
|
||||
parser.add_argument(
|
||||
"--logs-dir",
|
||||
required=True,
|
||||
type=str,
|
||||
help="Specify the path of the temporary directory used by Ray "
|
||||
"processes.")
|
||||
parser.add_argument(
|
||||
"--logging-rotate-bytes",
|
||||
required=False,
|
||||
type=int,
|
||||
default=ray_constants.LOGGING_ROTATE_BYTES,
|
||||
help="Specify the max bytes for rotating "
|
||||
"log file, default is "
|
||||
f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.")
|
||||
parser.add_argument(
|
||||
"--logging-rotate-backup-count",
|
||||
required=False,
|
||||
type=int,
|
||||
default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
|
||||
help="Specify the backup count of rotated log file, default is "
|
||||
f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.")
|
||||
args = parser.parse_args()
|
||||
ray.ray_logging.setup_logger(args.logging_level, args.logging_format)
|
||||
setup_component_logger(
|
||||
logging_level=args.logging_level,
|
||||
logging_format=args.logging_format,
|
||||
log_dir=args.logs_dir,
|
||||
filename=args.logging_filename,
|
||||
max_bytes=args.logging_rotate_bytes,
|
||||
backup_count=args.logging_rotate_backup_count)
|
||||
|
||||
log_monitor = LogMonitor(
|
||||
args.logs_dir, args.redis_address, redis_password=args.redis_password)
|
||||
|
||||
+38
-2
@@ -2,6 +2,7 @@
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import logging.handlers
|
||||
import os
|
||||
import time
|
||||
import traceback
|
||||
@@ -17,7 +18,7 @@ from ray.autoscaler._private.constants import \
|
||||
import ray.gcs_utils
|
||||
import ray.utils
|
||||
import ray.ray_constants as ray_constants
|
||||
from ray.ray_logging import setup_logger
|
||||
from ray.ray_logging import setup_component_logger
|
||||
from ray._raylet import GlobalStateAccessor
|
||||
|
||||
import redis
|
||||
@@ -340,8 +341,43 @@ if __name__ == "__main__":
|
||||
type=str,
|
||||
default=ray_constants.LOGGER_FORMAT,
|
||||
help=ray_constants.LOGGER_FORMAT_HELP)
|
||||
parser.add_argument(
|
||||
"--logging-filename",
|
||||
required=False,
|
||||
type=str,
|
||||
default=ray_constants.MONITOR_LOG_FILE_NAME,
|
||||
help="Specify the name of log file, "
|
||||
"log to stdout if set empty, default is "
|
||||
f"\"{ray_constants.MONITOR_LOG_FILE_NAME}\"")
|
||||
parser.add_argument(
|
||||
"--logs-dir",
|
||||
required=True,
|
||||
type=str,
|
||||
help="Specify the path of the temporary directory used by Ray "
|
||||
"processes.")
|
||||
parser.add_argument(
|
||||
"--logging-rotate-bytes",
|
||||
required=False,
|
||||
type=int,
|
||||
default=ray_constants.LOGGING_ROTATE_BYTES,
|
||||
help="Specify the max bytes for rotating "
|
||||
"log file, default is "
|
||||
f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.")
|
||||
parser.add_argument(
|
||||
"--logging-rotate-backup-count",
|
||||
required=False,
|
||||
type=int,
|
||||
default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
|
||||
help="Specify the backup count of rotated log file, default is "
|
||||
f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.")
|
||||
args = parser.parse_args()
|
||||
setup_logger(args.logging_level, args.logging_format)
|
||||
setup_component_logger(
|
||||
logging_level=args.logging_level,
|
||||
logging_format=args.logging_format,
|
||||
log_dir=args.logs_dir,
|
||||
filename=args.logging_filename,
|
||||
max_bytes=args.logging_rotate_bytes,
|
||||
backup_count=args.logging_rotate_backup_count)
|
||||
|
||||
if args.autoscaling_config:
|
||||
autoscaling_config = os.path.expanduser(args.autoscaling_config)
|
||||
|
||||
+5
-8
@@ -602,13 +602,11 @@ class Node:
|
||||
|
||||
def start_log_monitor(self):
|
||||
"""Start the log monitor."""
|
||||
stdout_file, stderr_file = self.get_log_file_handles(
|
||||
"log_monitor", unique=True)
|
||||
process_info = ray._private.services.start_log_monitor(
|
||||
self.redis_address,
|
||||
self._logs_dir,
|
||||
stdout_file=stdout_file,
|
||||
stderr_file=stderr_file,
|
||||
stdout_file=subprocess.DEVNULL,
|
||||
stderr_file=subprocess.DEVNULL,
|
||||
redis_password=self._ray_params.redis_password,
|
||||
fate_share=self.kernel_fate_share)
|
||||
assert ray_constants.PROCESS_TYPE_LOG_MONITOR not in self.all_processes
|
||||
@@ -742,12 +740,11 @@ class Node:
|
||||
|
||||
def start_monitor(self):
|
||||
"""Start the monitor."""
|
||||
stdout_file, stderr_file = self.get_log_file_handles(
|
||||
"monitor", unique=True)
|
||||
process_info = ray._private.services.start_monitor(
|
||||
self._redis_address,
|
||||
stdout_file=stdout_file,
|
||||
stderr_file=stderr_file,
|
||||
self._logs_dir,
|
||||
stdout_file=subprocess.DEVNULL,
|
||||
stderr_file=subprocess.DEVNULL,
|
||||
autoscaling_config=self._ray_params.autoscaling_config,
|
||||
redis_password=self._ray_params.redis_password,
|
||||
fate_share=self.kernel_fate_share)
|
||||
|
||||
@@ -150,13 +150,21 @@ LOGGER_LEVEL = "info"
|
||||
LOGGER_LEVEL_CHOICES = ["debug", "info", "warning", "error", "critical"]
|
||||
LOGGER_LEVEL_HELP = ("The logging level threshold, choices=['debug', 'info',"
|
||||
" 'warning', 'error', 'critical'], default='info'")
|
||||
# Default param for RotatingFileHandler
|
||||
# maxBytes. 10G by default. We intentionally set the default value high
|
||||
# so that users who won't care don't know about the existence of this.
|
||||
LOGGING_ROTATE_BYTES = 10 * 1000 * 1000 * 1000
|
||||
# The default will grow logs up until 500GB without log loss.
|
||||
LOGGING_ROTATE_BACKUP_COUNT = 50 # backupCount
|
||||
|
||||
# Constants used to define the different process types.
|
||||
PROCESS_TYPE_REAPER = "reaper"
|
||||
PROCESS_TYPE_MONITOR = "monitor"
|
||||
PROCESS_TYPE_LOG_MONITOR = "log_monitor"
|
||||
# TODO(sang): Delete it.
|
||||
PROCESS_TYPE_REPORTER = "reporter"
|
||||
PROCESS_TYPE_DASHBOARD = "dashboard"
|
||||
PROCESS_TYPE_DASHBOARD_AGENT = "dashboard_agent"
|
||||
PROCESS_TYPE_WORKER = "worker"
|
||||
PROCESS_TYPE_RAYLET = "raylet"
|
||||
PROCESS_TYPE_PLASMA_STORE = "plasma_store"
|
||||
@@ -164,6 +172,10 @@ PROCESS_TYPE_REDIS_SERVER = "redis_server"
|
||||
PROCESS_TYPE_WEB_UI = "web_ui"
|
||||
PROCESS_TYPE_GCS_SERVER = "gcs_server"
|
||||
|
||||
# Log file names
|
||||
MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_MONITOR}.log"
|
||||
LOG_MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_LOG_MONITOR}.log"
|
||||
|
||||
WORKER_PROCESS_TYPE_IDLE_WORKER = "ray::IDLE"
|
||||
WORKER_PROCESS_TYPE_SPILL_WORKER_NAME = "SpillWorker"
|
||||
WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME = "RestoreWorker"
|
||||
|
||||
@@ -25,6 +25,36 @@ def setup_logger(logging_level, logging_format):
|
||||
logger.propagate = False
|
||||
|
||||
|
||||
def setup_component_logger(*, logging_level, logging_format, log_dir, filename,
|
||||
max_bytes, backup_count):
|
||||
"""Configure the root logger that is used for Ray's python components.
|
||||
|
||||
For example, it should be used for monitor, dashboard, and log monitor.
|
||||
The only exception is workers. They use the different logging config.
|
||||
|
||||
Args:
|
||||
logging_level(str | int): Logging level in string or logging enum.
|
||||
logging_format(str): Logging format string.
|
||||
log_dir(str): Log directory path.
|
||||
filename(str): Name of the file to write logs.
|
||||
max_bytes(int): Same argument as RotatingFileHandler's maxBytes.
|
||||
backup_count(int): Same argument as RotatingFileHandler's backupCount.
|
||||
"""
|
||||
# Get the root logger.
|
||||
logger = logging.getLogger("")
|
||||
if type(logging_level) is str:
|
||||
logging_level = logging.getLevelName(logging_level.upper())
|
||||
assert filename, "filename argument should not be None."
|
||||
assert log_dir, "log_dir should not be None."
|
||||
handler = logging.handlers.RotatingFileHandler(
|
||||
os.path.join(log_dir, filename),
|
||||
maxBytes=max_bytes,
|
||||
backupCount=backup_count)
|
||||
logger.setLevel(logging_level)
|
||||
handler.setFormatter(logging.Formatter(logging_format))
|
||||
logger.addHandler(handler)
|
||||
|
||||
|
||||
"""
|
||||
All components underneath here is used specifically for the default_worker.py.
|
||||
"""
|
||||
|
||||
@@ -5,7 +5,7 @@ import time
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
from ray.test_utils import check_call_ray
|
||||
from ray.test_utils import check_call_ray, wait_for_condition
|
||||
|
||||
|
||||
def unix_socket_create_path(name):
|
||||
@@ -74,14 +74,23 @@ def test_raylet_tempfiles(shutdown_only):
|
||||
assert top_levels.issuperset({"sockets", "logs"})
|
||||
log_files = set(os.listdir(node.get_logs_dir_path()))
|
||||
log_files_expected = {
|
||||
"log_monitor.out", "log_monitor.err", "plasma_store.out",
|
||||
"plasma_store.err", "monitor.out", "monitor.err", "redis-shard_0.out",
|
||||
"redis-shard_0.err", "redis.out", "redis.err", "raylet.out",
|
||||
"raylet.err", "gcs_server.out", "gcs_server.err"
|
||||
"log_monitor.log", "plasma_store.out", "plasma_store.err",
|
||||
"monitor.log", "redis-shard_0.out", "redis-shard_0.err", "redis.out",
|
||||
"redis.err", "raylet.out", "raylet.err", "gcs_server.out",
|
||||
"gcs_server.err", "dashboard.log", "dashboard_agent.log"
|
||||
}
|
||||
|
||||
for expected in log_files_expected:
|
||||
assert expected in log_files
|
||||
def check_all_log_file_exists():
|
||||
for expected in log_files_expected:
|
||||
log_files = set(os.listdir(node.get_logs_dir_path()))
|
||||
if expected not in log_files:
|
||||
return False
|
||||
return True
|
||||
|
||||
wait_for_condition(check_all_log_file_exists)
|
||||
# Get the list of log files again since the previous one
|
||||
# might have the stale information.
|
||||
log_files = set(os.listdir(node.get_logs_dir_path()))
|
||||
assert log_files_expected.issubset(log_files)
|
||||
assert log_files.issuperset(log_files_expected)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user