[Core] Log output from different jobs to different drivers. (#8885)

* .

* .

* Correct now

* No interactivity errors

* format

* Filtering

* lint

* .

* No more filtering

* Removed interactivity

* .

* .

* .

* .

* .

* .

* Redirection works

* formatting

* something broken?

* .

* Works

* formatting

* redirect output

* formatting

* formatting

* Fix file descriptor leakage

* format

* .

* .

* .

* .

* .

* Refactor

* .

* Only run on job switch

* .

* cleanup

* .

* ...

* Review

* .

* .

* .

* .

* whoops

* .

* Should fix bug

* .

* .

* addressed comments

* formatting

* formatting

* Fix typo

* .

* .

* .

* .

Co-authored-by: Ubuntu <ubuntu@ip-172-31-14-33.us-west-2.compute.internal>
This commit is contained in:
Alex Wu
2020-06-23 18:45:32 -07:00
committed by GitHub
parent acb3280235
commit c152730e4a
5 changed files with 213 additions and 67 deletions
+82 -24
View File
@@ -17,7 +17,7 @@ import ray.ray_constants as ray_constants
import ray.services
import ray.utils
from ray.resource_spec import ResourceSpec
from ray.utils import try_to_create_directory, try_to_symlink
from ray.utils import try_to_create_directory, try_to_symlink, open_log
# Logger for this module. It should be configured at the entry point
# into the program using Ray. Ray configures it by default automatically
@@ -383,14 +383,16 @@ class Node:
raise FileExistsError(errno.EEXIST,
"No usable temporary filename found")
def new_log_files(self, name):
def get_log_file_names(self, name, unique=False):
"""Generate partially randomized filenames for log files.
Args:
name (str): descriptive string for this log file.
unique (bool): if true, a counter will be attached to `name` to
ensure the returned filename is not already used.
Returns:
A tuple of two file handles for redirecting (stdout, stderr).
A tuple of two file names for redirecting (stdout, stderr).
"""
redirect_output = self._ray_params.redirect_output
@@ -401,14 +403,15 @@ class Node:
if not redirect_output:
return None, None
log_stdout = self._make_inc_temp(
suffix=".out", prefix=name, directory_name=self._logs_dir)
log_stderr = self._make_inc_temp(
suffix=".err", prefix=name, directory_name=self._logs_dir)
# Line-buffer the output (mode 1).
log_stdout_file = open(log_stdout, "a", buffering=1)
log_stderr_file = open(log_stderr, "a", buffering=1)
return log_stdout_file, log_stderr_file
if unique:
log_stdout = self._make_inc_temp(
suffix=".out", prefix=name, directory_name=self._logs_dir)
log_stderr = self._make_inc_temp(
suffix=".err", prefix=name, directory_name=self._logs_dir)
else:
log_stdout = os.path.join(self._logs_dir, "{}.out".format(name))
log_stderr = os.path.join(self._logs_dir, "{}.err".format(name))
return log_stdout, log_stderr
def _get_unused_port(self, close_on_exit=True):
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
@@ -487,9 +490,15 @@ class Node:
def start_redis(self):
"""Start the Redis servers."""
assert self._redis_address is None
redis_log_files = [self.new_log_files("redis")]
redis_out_name, redis_err_name = self.get_log_file_names(
"redis", unique=True)
redis_log_files = [(open_log(redis_out_name),
open_log(redis_err_name))]
for i in range(self._ray_params.num_redis_shards):
redis_log_files.append(self.new_log_files("redis-shard_" + str(i)))
shard_out_name, shard_err_name = self.get_log_file_names(
"redis-shard_{}".format(i), unique=True)
redis_log_files.append((open_log(shard_out_name),
open_log(shard_err_name)))
(self._redis_address, redis_shards,
process_infos) = ray.services.start_redis(
@@ -511,7 +520,10 @@ class Node:
def start_log_monitor(self):
"""Start the log monitor."""
stdout_file, stderr_file = self.new_log_files("log_monitor")
log_out_name, log_err_name = self.get_log_file_names(
"log_monitor", unique=True)
stdout_file, stderr_file = open_log(log_out_name), open_log(
log_err_name)
process_info = ray.services.start_log_monitor(
self.redis_address,
self._logs_dir,
@@ -526,7 +538,10 @@ class Node:
def start_reporter(self):
"""Start the reporter."""
stdout_file, stderr_file = self.new_log_files("reporter")
reporter_out_name, reporter_err_name = self.get_log_file_names(
"reporter", unique=True)
stdout_file, stderr_file = (open_log(reporter_out_name),
open_log(reporter_err_name))
process_info = ray.services.start_reporter(
self.redis_address,
stdout_file=stdout_file,
@@ -547,7 +562,10 @@ class Node:
if we fail to start the dashboard. Otherwise it will print
a warning if we fail to start the dashboard.
"""
stdout_file, stderr_file = self.new_log_files("dashboard")
dashboard_out_name, dashboard_err_name = self.get_log_file_names(
"dashboard", unique=True)
stdout_file, stderr_file = (open_log(dashboard_out_name),
open_log(dashboard_err_name))
self._webui_url, process_info = ray.services.start_dashboard(
require_dashboard,
self._ray_params.dashboard_host,
@@ -568,7 +586,10 @@ class Node:
def start_plasma_store(self):
"""Start the plasma store."""
stdout_file, stderr_file = self.new_log_files("plasma_store")
plasma_out_name, plasma_err_name = self.get_log_file_names(
"plasma_store", unique=True)
stdout_file, stderr_file = (open_log(plasma_out_name),
open_log(plasma_err_name))
process_info = ray.services.start_plasma_store(
self.get_resource_spec(),
self._plasma_store_socket_name,
@@ -587,7 +608,10 @@ class Node:
def start_gcs_server(self):
"""Start the gcs server.
"""
stdout_file, stderr_file = self.new_log_files("gcs_server")
gcs_out_name, gcs_err_name = self.get_log_file_names(
"gcs_server", unique=True)
stdout_file, stderr_file = (open_log(gcs_out_name),
open_log(gcs_err_name))
process_info = ray.services.start_gcs_server(
self._redis_address,
stdout_file=stdout_file,
@@ -610,7 +634,9 @@ class Node:
use_profiler (bool): True if we should start the process in the
valgrind profiler.
"""
stdout_file, stderr_file = self.new_log_files("raylet")
raylet_out_name, raylet_err_name = self.get_log_file_names("raylet")
stdout_file, stderr_file = (open_log(raylet_out_name),
open_log(raylet_err_name))
process_info = ray.services.start_raylet(
self._redis_address,
self._node_ip_address,
@@ -640,10 +666,39 @@ class Node:
assert ray_constants.PROCESS_TYPE_RAYLET not in self.all_processes
self.all_processes[ray_constants.PROCESS_TYPE_RAYLET] = [process_info]
def new_worker_redirected_log_file(self, worker_id):
"""Create new logging files for workers to redirect its output."""
worker_stdout_file, worker_stderr_file = (
self.new_log_files("worker-" + ray.utils.binary_to_hex(worker_id)))
def get_job_redirected_log_file(self,
worker_id: bytes,
job_id: bytes = None):
"""Determines (but does not create) logging files for workers to
redirect its output.
Args:
worker_id (bytes): A byte representation of the worker id.
job_id (bytes): A byte representation of the job id. If None,
provides a generic log file for the worker.
Returns:
(tuple) The stdout and stderr file names that the job should be
redirected to.
"""
redirect_output = self._ray_params.redirect_output
if redirect_output is None:
# Make the default behavior match that of glog.
redirect_output = os.getenv("GLOG_logtostderr") != "1"
if not redirect_output:
return None, None
if job_id is not None:
name = "worker-{}-{}".format(
ray.utils.binary_to_hex(worker_id),
ray.utils.binary_to_hex(job_id))
else:
name = "worker-{}".format(ray.utils.binary_to_hex(worker_id))
worker_stdout_file, worker_stderr_file = self.get_log_file_names(
name, unique=False)
return worker_stdout_file, worker_stderr_file
def start_worker(self):
@@ -652,7 +707,10 @@ class Node:
def start_monitor(self):
"""Start the monitor."""
stdout_file, stderr_file = self.new_log_files("monitor")
monitor_out_name, monitor_err_name = self.get_log_file_names(
"monitor", unique=True)
stdout_file, stderr_file = (open_log(monitor_out_name),
open_log(monitor_err_name))
process_info = ray.services.start_monitor(
self._redis_address,
stdout_file=stdout_file,