[Logging] Log rotation on all components (#12101)

* In Progress.

* Done.

* Fix the issue.

* Add wait for condition because logs are not written right away now.

* debug string.

* lint.

* Fix flaky test.

* Fix issues.

* Fix test.

* lint.
This commit is contained in:
SangBin Cho
2020-11-30 19:03:55 -08:00
committed by GitHub
parent 2708b3abbc
commit 8223a33bff
10 changed files with 162 additions and 56 deletions
+3
View File
@@ -1784,6 +1784,7 @@ def start_worker(node_ip_address,
def start_monitor(redis_address,
logs_dir,
stdout_file=None,
stderr_file=None,
autoscaling_config=None,
@@ -1793,6 +1794,7 @@ def start_monitor(redis_address,
Args:
redis_address (str): The address that the Redis server is listening on.
logs_dir(str): The path to the log directory.
stdout_file: A file handle opened for writing to redirect stdout to. If
no redirection should happen, then this should be None.
stderr_file: A file handle opened for writing to redirect stderr to. If
@@ -1808,6 +1810,7 @@ def start_monitor(redis_address,
sys.executable,
"-u",
monitor_path,
f"--logs-dir={logs_dir}",
"--redis-address=" + str(redis_address),
]
if autoscaling_config:
+32 -1
View File
@@ -3,6 +3,7 @@ import errno
import glob
import json
import logging
import logging.handlers
import os
import platform
import re
@@ -13,6 +14,7 @@ import traceback
import ray.ray_constants as ray_constants
import ray._private.services as services
import ray.utils
from ray.ray_logging import setup_component_logger
# Logger for this module. It should be configured at the entry point
# into the program using Ray. Ray provides a default configuration at
@@ -307,14 +309,43 @@ if __name__ == "__main__":
type=str,
default=ray_constants.LOGGER_FORMAT,
help=ray_constants.LOGGER_FORMAT_HELP)
parser.add_argument(
"--logging-filename",
required=False,
type=str,
default=ray_constants.LOG_MONITOR_LOG_FILE_NAME,
help="Specify the name of log file, "
"log to stdout if set empty, default is "
f"\"{ray_constants.LOG_MONITOR_LOG_FILE_NAME}\"")
parser.add_argument(
"--logs-dir",
required=True,
type=str,
help="Specify the path of the temporary directory used by Ray "
"processes.")
parser.add_argument(
"--logging-rotate-bytes",
required=False,
type=int,
default=ray_constants.LOGGING_ROTATE_BYTES,
help="Specify the max bytes for rotating "
"log file, default is "
f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.")
parser.add_argument(
"--logging-rotate-backup-count",
required=False,
type=int,
default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
help="Specify the backup count of rotated log file, default is "
f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.")
args = parser.parse_args()
ray.ray_logging.setup_logger(args.logging_level, args.logging_format)
setup_component_logger(
logging_level=args.logging_level,
logging_format=args.logging_format,
log_dir=args.logs_dir,
filename=args.logging_filename,
max_bytes=args.logging_rotate_bytes,
backup_count=args.logging_rotate_backup_count)
log_monitor = LogMonitor(
args.logs_dir, args.redis_address, redis_password=args.redis_password)
+38 -2
View File
@@ -2,6 +2,7 @@
import argparse
import logging
import logging.handlers
import os
import time
import traceback
@@ -17,7 +18,7 @@ from ray.autoscaler._private.constants import \
import ray.gcs_utils
import ray.utils
import ray.ray_constants as ray_constants
from ray.ray_logging import setup_logger
from ray.ray_logging import setup_component_logger
from ray._raylet import GlobalStateAccessor
import redis
@@ -340,8 +341,43 @@ if __name__ == "__main__":
type=str,
default=ray_constants.LOGGER_FORMAT,
help=ray_constants.LOGGER_FORMAT_HELP)
parser.add_argument(
"--logging-filename",
required=False,
type=str,
default=ray_constants.MONITOR_LOG_FILE_NAME,
help="Specify the name of log file, "
"log to stdout if set empty, default is "
f"\"{ray_constants.MONITOR_LOG_FILE_NAME}\"")
parser.add_argument(
"--logs-dir",
required=True,
type=str,
help="Specify the path of the temporary directory used by Ray "
"processes.")
parser.add_argument(
"--logging-rotate-bytes",
required=False,
type=int,
default=ray_constants.LOGGING_ROTATE_BYTES,
help="Specify the max bytes for rotating "
"log file, default is "
f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.")
parser.add_argument(
"--logging-rotate-backup-count",
required=False,
type=int,
default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
help="Specify the backup count of rotated log file, default is "
f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.")
args = parser.parse_args()
setup_logger(args.logging_level, args.logging_format)
setup_component_logger(
logging_level=args.logging_level,
logging_format=args.logging_format,
log_dir=args.logs_dir,
filename=args.logging_filename,
max_bytes=args.logging_rotate_bytes,
backup_count=args.logging_rotate_backup_count)
if args.autoscaling_config:
autoscaling_config = os.path.expanduser(args.autoscaling_config)
+5 -8
View File
@@ -602,13 +602,11 @@ class Node:
def start_log_monitor(self):
"""Start the log monitor."""
stdout_file, stderr_file = self.get_log_file_handles(
"log_monitor", unique=True)
process_info = ray._private.services.start_log_monitor(
self.redis_address,
self._logs_dir,
stdout_file=stdout_file,
stderr_file=stderr_file,
stdout_file=subprocess.DEVNULL,
stderr_file=subprocess.DEVNULL,
redis_password=self._ray_params.redis_password,
fate_share=self.kernel_fate_share)
assert ray_constants.PROCESS_TYPE_LOG_MONITOR not in self.all_processes
@@ -742,12 +740,11 @@ class Node:
def start_monitor(self):
"""Start the monitor."""
stdout_file, stderr_file = self.get_log_file_handles(
"monitor", unique=True)
process_info = ray._private.services.start_monitor(
self._redis_address,
stdout_file=stdout_file,
stderr_file=stderr_file,
self._logs_dir,
stdout_file=subprocess.DEVNULL,
stderr_file=subprocess.DEVNULL,
autoscaling_config=self._ray_params.autoscaling_config,
redis_password=self._ray_params.redis_password,
fate_share=self.kernel_fate_share)
+12
View File
@@ -150,13 +150,21 @@ LOGGER_LEVEL = "info"
LOGGER_LEVEL_CHOICES = ["debug", "info", "warning", "error", "critical"]
LOGGER_LEVEL_HELP = ("The logging level threshold, choices=['debug', 'info',"
" 'warning', 'error', 'critical'], default='info'")
# Default param for RotatingFileHandler
# maxBytes. 10G by default. We intentionally set the default value high
# so that users who won't care don't know about the existence of this.
LOGGING_ROTATE_BYTES = 10 * 1000 * 1000 * 1000
# The default will grow logs up until 500GB without log loss.
LOGGING_ROTATE_BACKUP_COUNT = 50 # backupCount
# Constants used to define the different process types.
PROCESS_TYPE_REAPER = "reaper"
PROCESS_TYPE_MONITOR = "monitor"
PROCESS_TYPE_LOG_MONITOR = "log_monitor"
# TODO(sang): Delete it.
PROCESS_TYPE_REPORTER = "reporter"
PROCESS_TYPE_DASHBOARD = "dashboard"
PROCESS_TYPE_DASHBOARD_AGENT = "dashboard_agent"
PROCESS_TYPE_WORKER = "worker"
PROCESS_TYPE_RAYLET = "raylet"
PROCESS_TYPE_PLASMA_STORE = "plasma_store"
@@ -164,6 +172,10 @@ PROCESS_TYPE_REDIS_SERVER = "redis_server"
PROCESS_TYPE_WEB_UI = "web_ui"
PROCESS_TYPE_GCS_SERVER = "gcs_server"
# Log file names
MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_MONITOR}.log"
LOG_MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_LOG_MONITOR}.log"
WORKER_PROCESS_TYPE_IDLE_WORKER = "ray::IDLE"
WORKER_PROCESS_TYPE_SPILL_WORKER_NAME = "SpillWorker"
WORKER_PROCESS_TYPE_RESTORE_WORKER_NAME = "RestoreWorker"
+30
View File
@@ -25,6 +25,36 @@ def setup_logger(logging_level, logging_format):
logger.propagate = False
def setup_component_logger(*, logging_level, logging_format, log_dir, filename,
max_bytes, backup_count):
"""Configure the root logger that is used for Ray's python components.
For example, it should be used for monitor, dashboard, and log monitor.
The only exception is workers. They use the different logging config.
Args:
logging_level(str | int): Logging level in string or logging enum.
logging_format(str): Logging format string.
log_dir(str): Log directory path.
filename(str): Name of the file to write logs.
max_bytes(int): Same argument as RotatingFileHandler's maxBytes.
backup_count(int): Same argument as RotatingFileHandler's backupCount.
"""
# Get the root logger.
logger = logging.getLogger("")
if type(logging_level) is str:
logging_level = logging.getLevelName(logging_level.upper())
assert filename, "filename argument should not be None."
assert log_dir, "log_dir should not be None."
handler = logging.handlers.RotatingFileHandler(
os.path.join(log_dir, filename),
maxBytes=max_bytes,
backupCount=backup_count)
logger.setLevel(logging_level)
handler.setFormatter(logging.Formatter(logging_format))
logger.addHandler(handler)
"""
All components underneath here is used specifically for the default_worker.py.
"""
+16 -7
View File
@@ -5,7 +5,7 @@ import time
import pytest
import ray
from ray.test_utils import check_call_ray
from ray.test_utils import check_call_ray, wait_for_condition
def unix_socket_create_path(name):
@@ -74,14 +74,23 @@ def test_raylet_tempfiles(shutdown_only):
assert top_levels.issuperset({"sockets", "logs"})
log_files = set(os.listdir(node.get_logs_dir_path()))
log_files_expected = {
"log_monitor.out", "log_monitor.err", "plasma_store.out",
"plasma_store.err", "monitor.out", "monitor.err", "redis-shard_0.out",
"redis-shard_0.err", "redis.out", "redis.err", "raylet.out",
"raylet.err", "gcs_server.out", "gcs_server.err"
"log_monitor.log", "plasma_store.out", "plasma_store.err",
"monitor.log", "redis-shard_0.out", "redis-shard_0.err", "redis.out",
"redis.err", "raylet.out", "raylet.err", "gcs_server.out",
"gcs_server.err", "dashboard.log", "dashboard_agent.log"
}
for expected in log_files_expected:
assert expected in log_files
def check_all_log_file_exists():
for expected in log_files_expected:
log_files = set(os.listdir(node.get_logs_dir_path()))
if expected not in log_files:
return False
return True
wait_for_condition(check_all_log_file_exists)
# Get the list of log files again since the previous one
# might have the stale information.
log_files = set(os.listdir(node.get_logs_dir_path()))
assert log_files_expected.issubset(log_files)
assert log_files.issuperset(log_files_expected)