[Logging] Remove per worker job log file / support worker log rotation (#11927)

* In progress.

* MVP done.

* In Progress.

* Remove unnecessary code.

* Fix some issues.

* Fix test failures.

* Addressed code review + fix object spilling test failure.
This commit is contained in:
SangBin Cho
2020-11-16 11:29:43 -08:00
committed by GitHub
parent b6b54f1c81
commit f56d7c1a76
11 changed files with 215 additions and 254 deletions
+140 -67
View File
@@ -1,9 +1,12 @@
import argparse
import base64
import json
import logging
import time
import sys
import os
from contextlib import redirect_stdout, redirect_stderr
from logging.handlers import RotatingFileHandler
import ray
import ray.actor
@@ -11,6 +14,142 @@ import ray.node
import ray.ray_constants as ray_constants
import ray.utils
from ray.parameter import RayParams
from ray.ray_logging import StandardStreamInterceptor
def setup_and_get_worker_interceptor_logger(is_for_stdout: bool = True,
                                            *,
                                            max_bytes: int = 0,
                                            backup_count: int = 0):
    """Set up and return the logger that receives intercepted worker output.

    Ray worker logs are treated specially because stdout and stderr need
    to be intercepted to support various Ray features (for example, a
    prefix that decides whether a message is streamed to drivers; see the
    TODO below, this is not implemented here yet).

    The returned logger writes raw messages (no added newline, no
    propagation to parent loggers) through a RotatingFileHandler to a
    per-process file in the session's ``logs`` directory.

    Calling this function again for the same stream returns the logger
    that was already configured instead of attaching a second handler, so
    ``max_bytes`` / ``backup_count`` only take effect on the first call.

    NOTE: This function reads the module-level ``args`` object
    (``args.worker_type``) and requires ``ray.worker._global_node`` and
    ``ray.worker.global_worker`` to be set, so it must be called after the
    command line has been parsed and the worker has connected.

    Args:
        is_for_stdout(bool): True if the logger will be used to intercept
            stdout. False if it will be used to intercept stderr.
        max_bytes(int): Forwarded to RotatingFileHandler as ``maxBytes``.
            The default of 0 means the file is never rolled over.
        backup_count(int): Forwarded to RotatingFileHandler as
            ``backupCount``.

    Returns:
        The configured ``logging.Logger``.

    Raises:
        AssertionError: If RAY_JOB_ID is not set for a regular worker, or
            if the global node / global worker are not initialized yet.
    """
    file_extension = "out" if is_for_stdout else "err"
    logger = logging.getLogger(f"ray_default_worker_{file_extension}")
    if logger.handlers:
        # Already configured by an earlier call. Adding another file
        # handler would write every message twice.
        return logger
    logger.setLevel(logging.INFO)

    # TODO(sang): This is how the job id is propagated to workers now.
    # But eventually, it will be clearer to just pass the job id.
    job_id = os.environ.get("RAY_JOB_ID")
    if args.worker_type == "WORKER":
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped when Python runs with -O.
        if job_id is None:
            raise AssertionError(
                "RAY_JOB_ID should be set as an env "
                "variable within default_worker.py. If you see this error, "
                "please report it to Ray's Github issue.")
        worker_name = "worker"
    else:
        job_id = ray.JobID.nil()
        worker_name = "io_worker"

    # Make sure these values are set already.
    if ray.worker._global_node is None:
        raise AssertionError("ray.worker._global_node is not set.")
    if ray.worker.global_worker is None:
        raise AssertionError("ray.worker.global_worker is not set.")

    log_file_name = (
        f"{worker_name}-"
        f"{ray.utils.binary_to_hex(ray.worker.global_worker.worker_id)}-"
        f"{job_id}-{os.getpid()}.{file_extension}")
    log_file_path = os.path.join(
        ray.worker._global_node.get_session_dir_path(), "logs",
        log_file_name)

    handler = RotatingFileHandler(
        log_file_path, maxBytes=max_bytes, backupCount=backup_count)
    # TODO(sang): Add 0 or 1 to decide whether
    # or not logs are streamed to drivers.
    handler.setFormatter(logging.Formatter("%(message)s"))
    # Remove the terminator. It is important because we don't want this
    # logger to add a newline at the end of each intercepted string.
    handler.terminator = ""
    logger.addHandler(handler)
    # Keep messages from propagating to parent loggers.
    logger.propagate = False
    return logger
def _resolve_worker_mode(worker_type):
    """Map the ``--worker-type`` command line value to a Ray worker mode.

    Raises:
        ValueError: If ``worker_type`` is not a known worker type.
    """
    if worker_type == "WORKER":
        return ray.WORKER_MODE
    if worker_type == "SPILL_WORKER":
        return ray.SPILL_WORKER_MODE
    if worker_type == "RESTORE_WORKER":
        return ray.RESTORE_WORKER_MODE
    raise ValueError("Unknown worker type: " + worker_type)


def _setup_external_storage(encoded_config):
    """Set up external storage from a base64-encoded JSON config.

    An empty or missing config results in an empty configuration dict.
    """
    from ray import external_storage
    if encoded_config:
        object_spilling_config = json.loads(base64.b64decode(encoded_config))
    else:
        object_spilling_config = {}
    external_storage.setup_external_storage(object_spilling_config)


def _extend_sys_path(code_search_path):
    """Append every entry of a ':'-separated search path to ``sys.path``.

    Entries that point at a file are replaced by their parent directory.
    """
    if code_search_path is None:
        return
    for entry in code_search_path.split(":"):
        if os.path.isfile(entry):
            entry = os.path.dirname(entry)
        sys.path.append(entry)


def main(args):
    """Connect this process to Ray as a worker and run until it is stopped.

    Args:
        args: The parsed command line arguments of this script.

    Raises:
        ValueError: If ``args.worker_type`` is not a known worker type.
    """
    ray.ray_logging.setup_logger(args.logging_level, args.logging_format)

    mode = _resolve_worker_mode(args.worker_type)
    is_io_worker = (mode == ray.RESTORE_WORKER_MODE
                    or mode == ray.SPILL_WORKER_MODE)

    # NOTE(suquark): We must initialize the external storage before we
    # connect to raylet. Otherwise we may receive requests before the
    # external storage is initialized.
    if is_io_worker:
        _setup_external_storage(args.object_spilling_config)

    # Fall back to the node address when no raylet address was given.
    raylet_ip_address = args.raylet_ip_address
    if raylet_ip_address is None:
        raylet_ip_address = args.node_ip_address

    _extend_sys_path(args.code_search_path)

    ray_params = RayParams(
        node_ip_address=args.node_ip_address,
        raylet_ip_address=raylet_ip_address,
        node_manager_port=args.node_manager_port,
        redis_address=args.redis_address,
        redis_password=args.redis_password,
        plasma_store_socket_name=args.object_store_name,
        raylet_socket_name=args.raylet_name,
        temp_dir=args.temp_dir,
        load_code_from_local=args.load_code_from_local,
        metrics_agent_port=args.metrics_agent_port,
    )

    node = ray.node.Node(
        ray_params,
        head=False,
        shutdown_at_exit=False,
        spawn_reaper=False,
        connect_only=True)
    ray.worker._global_node = node
    ray.worker.connect(node, mode=mode)

    # Redirect stdout and stderr to the default worker interceptor logger.
    # NOTE: We deprecated redirect_worker_output arg,
    # so we don't need to handle here.
    stdout_interceptor = StandardStreamInterceptor(
        setup_and_get_worker_interceptor_logger(is_for_stdout=True),
        intercept_stdout=True)
    stderr_interceptor = StandardStreamInterceptor(
        setup_and_get_worker_interceptor_logger(is_for_stdout=False),
        intercept_stdout=False)

    with redirect_stdout(stdout_interceptor), \
            redirect_stderr(stderr_interceptor):
        if mode == ray.WORKER_MODE:
            ray.worker.global_worker.main_loop()
        elif is_io_worker:
            # It is handled by another thread in the C++ core worker.
            # We just need to keep the worker alive.
            while True:
                time.sleep(100000)
        else:
            raise ValueError(f"Unexpected worker mode: {mode}")
parser = argparse.ArgumentParser(
description=("Parse addresses for the worker "
@@ -110,70 +249,4 @@ parser.add_argument(
"Java and `PYTHONPATH` in Python.")
if __name__ == "__main__":
    # Script entry point: parse the command line and hand off to main().
    # `args` is intentionally bound at module scope because
    # setup_and_get_worker_interceptor_logger() reads `args.worker_type`.
    args = parser.parse_args()
    main(args)