mirror of
https://github.com/wassname/ray.git
synced 2026-06-30 16:49:48 +08:00
[Core] Log output from different jobs to different drivers. (#8885)
* . * . * Correct now * No interactivity errors * format * Filtering * lint * . * No more filtering * Removed interactivity * . * . * . * . * . * . * Redirection works * formatting * something broken? * . * Works * formatting * redirect output * formatting * formatting * Fix file descriptor leakage * format * . * . * . * . * . * Refactor * . * Only run on job switch * . * cleanup * . * ... * Review * . * . * . * . * whoops * . * Should fix bug * . * . * addressed comments * formatting * formatting * Fix typo * . * . * . * . Co-authored-by: Ubuntu <ubuntu@ip-172-31-14-33.us-west-2.compute.internal>
This commit is contained in:
+82
-24
@@ -17,7 +17,7 @@ import ray.ray_constants as ray_constants
|
||||
import ray.services
|
||||
import ray.utils
|
||||
from ray.resource_spec import ResourceSpec
|
||||
from ray.utils import try_to_create_directory, try_to_symlink
|
||||
from ray.utils import try_to_create_directory, try_to_symlink, open_log
|
||||
|
||||
# Logger for this module. It should be configured at the entry point
|
||||
# into the program using Ray. Ray configures it by default automatically
|
||||
@@ -383,14 +383,16 @@ class Node:
|
||||
raise FileExistsError(errno.EEXIST,
|
||||
"No usable temporary filename found")
|
||||
|
||||
def new_log_files(self, name):
|
||||
def get_log_file_names(self, name, unique=False):
|
||||
"""Generate partially randomized filenames for log files.
|
||||
|
||||
Args:
|
||||
name (str): descriptive string for this log file.
|
||||
unique (bool): if true, a counter will be attached to `name` to
|
||||
ensure the returned filename is not already used.
|
||||
|
||||
Returns:
|
||||
A tuple of two file handles for redirecting (stdout, stderr).
|
||||
A tuple of two file names for redirecting (stdout, stderr).
|
||||
"""
|
||||
redirect_output = self._ray_params.redirect_output
|
||||
|
||||
@@ -401,14 +403,15 @@ class Node:
|
||||
if not redirect_output:
|
||||
return None, None
|
||||
|
||||
log_stdout = self._make_inc_temp(
|
||||
suffix=".out", prefix=name, directory_name=self._logs_dir)
|
||||
log_stderr = self._make_inc_temp(
|
||||
suffix=".err", prefix=name, directory_name=self._logs_dir)
|
||||
# Line-buffer the output (mode 1).
|
||||
log_stdout_file = open(log_stdout, "a", buffering=1)
|
||||
log_stderr_file = open(log_stderr, "a", buffering=1)
|
||||
return log_stdout_file, log_stderr_file
|
||||
if unique:
|
||||
log_stdout = self._make_inc_temp(
|
||||
suffix=".out", prefix=name, directory_name=self._logs_dir)
|
||||
log_stderr = self._make_inc_temp(
|
||||
suffix=".err", prefix=name, directory_name=self._logs_dir)
|
||||
else:
|
||||
log_stdout = os.path.join(self._logs_dir, "{}.out".format(name))
|
||||
log_stderr = os.path.join(self._logs_dir, "{}.err".format(name))
|
||||
return log_stdout, log_stderr
|
||||
|
||||
def _get_unused_port(self, close_on_exit=True):
|
||||
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
@@ -487,9 +490,15 @@ class Node:
|
||||
def start_redis(self):
|
||||
"""Start the Redis servers."""
|
||||
assert self._redis_address is None
|
||||
redis_log_files = [self.new_log_files("redis")]
|
||||
redis_out_name, redis_err_name = self.get_log_file_names(
|
||||
"redis", unique=True)
|
||||
redis_log_files = [(open_log(redis_out_name),
|
||||
open_log(redis_err_name))]
|
||||
for i in range(self._ray_params.num_redis_shards):
|
||||
redis_log_files.append(self.new_log_files("redis-shard_" + str(i)))
|
||||
shard_out_name, shard_err_name = self.get_log_file_names(
|
||||
"redis-shard_{}".format(i), unique=True)
|
||||
redis_log_files.append((open_log(shard_out_name),
|
||||
open_log(shard_err_name)))
|
||||
|
||||
(self._redis_address, redis_shards,
|
||||
process_infos) = ray.services.start_redis(
|
||||
@@ -511,7 +520,10 @@ class Node:
|
||||
|
||||
def start_log_monitor(self):
|
||||
"""Start the log monitor."""
|
||||
stdout_file, stderr_file = self.new_log_files("log_monitor")
|
||||
log_out_name, log_err_name = self.get_log_file_names(
|
||||
"log_monitor", unique=True)
|
||||
stdout_file, stderr_file = open_log(log_out_name), open_log(
|
||||
log_err_name)
|
||||
process_info = ray.services.start_log_monitor(
|
||||
self.redis_address,
|
||||
self._logs_dir,
|
||||
@@ -526,7 +538,10 @@ class Node:
|
||||
|
||||
def start_reporter(self):
|
||||
"""Start the reporter."""
|
||||
stdout_file, stderr_file = self.new_log_files("reporter")
|
||||
reporter_out_name, reporter_err_name = self.get_log_file_names(
|
||||
"reporter", unique=True)
|
||||
stdout_file, stderr_file = (open_log(reporter_out_name),
|
||||
open_log(reporter_err_name))
|
||||
process_info = ray.services.start_reporter(
|
||||
self.redis_address,
|
||||
stdout_file=stdout_file,
|
||||
@@ -547,7 +562,10 @@ class Node:
|
||||
if we fail to start the dashboard. Otherwise it will print
|
||||
a warning if we fail to start the dashboard.
|
||||
"""
|
||||
stdout_file, stderr_file = self.new_log_files("dashboard")
|
||||
dashboard_out_name, dashboard_err_name = self.get_log_file_names(
|
||||
"dashboard", unique=True)
|
||||
stdout_file, stderr_file = (open_log(dashboard_out_name),
|
||||
open_log(dashboard_err_name))
|
||||
self._webui_url, process_info = ray.services.start_dashboard(
|
||||
require_dashboard,
|
||||
self._ray_params.dashboard_host,
|
||||
@@ -568,7 +586,10 @@ class Node:
|
||||
|
||||
def start_plasma_store(self):
|
||||
"""Start the plasma store."""
|
||||
stdout_file, stderr_file = self.new_log_files("plasma_store")
|
||||
plasma_out_name, plasma_err_name = self.get_log_file_names(
|
||||
"plasma_store", unique=True)
|
||||
stdout_file, stderr_file = (open_log(plasma_out_name),
|
||||
open_log(plasma_err_name))
|
||||
process_info = ray.services.start_plasma_store(
|
||||
self.get_resource_spec(),
|
||||
self._plasma_store_socket_name,
|
||||
@@ -587,7 +608,10 @@ class Node:
|
||||
def start_gcs_server(self):
|
||||
"""Start the gcs server.
|
||||
"""
|
||||
stdout_file, stderr_file = self.new_log_files("gcs_server")
|
||||
gcs_out_name, gcs_err_name = self.get_log_file_names(
|
||||
"gcs_server", unique=True)
|
||||
stdout_file, stderr_file = (open_log(gcs_out_name),
|
||||
open_log(gcs_err_name))
|
||||
process_info = ray.services.start_gcs_server(
|
||||
self._redis_address,
|
||||
stdout_file=stdout_file,
|
||||
@@ -610,7 +634,9 @@ class Node:
|
||||
use_profiler (bool): True if we should start the process in the
|
||||
valgrind profiler.
|
||||
"""
|
||||
stdout_file, stderr_file = self.new_log_files("raylet")
|
||||
raylet_out_name, raylet_err_name = self.get_log_file_names("raylet")
|
||||
stdout_file, stderr_file = (open_log(raylet_out_name),
|
||||
open_log(raylet_err_name))
|
||||
process_info = ray.services.start_raylet(
|
||||
self._redis_address,
|
||||
self._node_ip_address,
|
||||
@@ -640,10 +666,39 @@ class Node:
|
||||
assert ray_constants.PROCESS_TYPE_RAYLET not in self.all_processes
|
||||
self.all_processes[ray_constants.PROCESS_TYPE_RAYLET] = [process_info]
|
||||
|
||||
def new_worker_redirected_log_file(self, worker_id):
|
||||
"""Create new logging files for workers to redirect its output."""
|
||||
worker_stdout_file, worker_stderr_file = (
|
||||
self.new_log_files("worker-" + ray.utils.binary_to_hex(worker_id)))
|
||||
def get_job_redirected_log_file(self,
|
||||
worker_id: bytes,
|
||||
job_id: bytes = None):
|
||||
"""Determines (but does not create) logging files for workers to
|
||||
redirect their output.
|
||||
|
||||
Args:
|
||||
worker_id (bytes): A byte representation of the worker id.
|
||||
job_id (bytes): A byte representation of the job id. If None,
|
||||
provides a generic log file for the worker.
|
||||
|
||||
Returns:
|
||||
(tuple) The stdout and stderr file names that the job should be
|
||||
redirected to.
|
||||
"""
|
||||
redirect_output = self._ray_params.redirect_output
|
||||
|
||||
if redirect_output is None:
|
||||
# Make the default behavior match that of glog.
|
||||
redirect_output = os.getenv("GLOG_logtostderr") != "1"
|
||||
|
||||
if not redirect_output:
|
||||
return None, None
|
||||
|
||||
if job_id is not None:
|
||||
name = "worker-{}-{}".format(
|
||||
ray.utils.binary_to_hex(worker_id),
|
||||
ray.utils.binary_to_hex(job_id))
|
||||
else:
|
||||
name = "worker-{}".format(ray.utils.binary_to_hex(worker_id))
|
||||
|
||||
worker_stdout_file, worker_stderr_file = self.get_log_file_names(
|
||||
name, unique=False)
|
||||
return worker_stdout_file, worker_stderr_file
|
||||
|
||||
def start_worker(self):
|
||||
@@ -652,7 +707,10 @@ class Node:
|
||||
|
||||
def start_monitor(self):
|
||||
"""Start the monitor."""
|
||||
stdout_file, stderr_file = self.new_log_files("monitor")
|
||||
monitor_out_name, monitor_err_name = self.get_log_file_names(
|
||||
"monitor", unique=True)
|
||||
stdout_file, stderr_file = (open_log(monitor_out_name),
|
||||
open_log(monitor_err_name))
|
||||
process_info = ray.services.start_monitor(
|
||||
self._redis_address,
|
||||
stdout_file=stdout_file,
|
||||
|
||||
Reference in New Issue
Block a user