From ee1d4e5ea22314874012e413da37770131c00f75 Mon Sep 17 00:00:00 2001 From: alanamarzoev Date: Thu, 8 Jun 2017 18:30:48 -0700 Subject: [PATCH] Redirect worker stdout/stderr to log files. (#646) * local scheduler * redirect output files to be associated with workers rather than the local scheduler * fixed formatting * fixes * Moved output redirection logic to worker.py. * Changed write mode. * Fixed formatting. * Added comment. * Reuse log file creation in services.py. * Fix linting. * Fix problem in which multiple processes attempt to create /tmp/raylogs at the same time. --- python/ray/services.py | 8 +++++++- python/ray/worker.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/python/ray/services.py b/python/ray/services.py index 5fe278f71..6df268ec6 100644 --- a/python/ray/services.py +++ b/python/ray/services.py @@ -1145,7 +1145,13 @@ def new_log_files(name, redirect_output): logs_dir = "/tmp/raylogs" if not os.path.exists(logs_dir): - os.makedirs(logs_dir) + try: + os.makedirs(logs_dir) + except OSError as e: + if e.errno != os.errno.EEXIST: + raise e + print("Attempted to create '/tmp/raylogs', but the directory already " + "exists.") # Change the log directory permissions so others can use it. This is # important when multiple people are using the same machine. os.chmod(logs_dir, 0o0777) diff --git a/python/ray/worker.py b/python/ray/worker.py index bd353db30..03c57fac9 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -1233,6 +1233,14 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker, worker.actor_id = actor_id worker.connected = True worker.set_mode(mode) + # Redirect worker output and error to their own files. + if mode == WORKER_MODE: + log_stdout_file, log_stderr_file = services.new_log_files("worker", True) + sys.stdout = log_stdout_file + sys.stderr = log_stderr_file + services.record_log_files_in_redis(info["redis_address"], + info["node_ip_address"], + [log_stdout_file, log_stderr_file]) # The worker.events field is used to aggregate logging information and # display it in the web UI. Note that Python lists protected by the GIL, # which is important because we will append to this field from multiple @@ -1275,6 +1283,8 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker, worker.redis_client.hmset( b"Workers:" + worker.worker_id, {"node_ip_address": worker.node_ip_address, + "stdout_file": os.path.abspath(log_stdout_file.name), + "stderr_file": os.path.abspath(log_stderr_file.name), "plasma_store_socket": info["store_socket_name"], "plasma_manager_socket": info["manager_socket_name"], "local_scheduler_socket": info["local_scheduler_socket_name"]})