mirror of
https://github.com/wassname/ray.git
synced 2026-07-02 14:32:01 +08:00
Change logfile names and also allow plasma store socket to be passed in. (#2862)
This commit is contained in:
committed by
Robert Nishihara
parent
9c606ea06c
commit
cc7e2ecdd5
+93
-111
@@ -2,14 +2,12 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import binascii
|
||||
import json
|
||||
import logging
|
||||
import multiprocessing
|
||||
import os
|
||||
import random
|
||||
import resource
|
||||
import shutil
|
||||
import signal
|
||||
import socket
|
||||
import subprocess
|
||||
@@ -17,8 +15,6 @@ import sys
|
||||
import threading
|
||||
import time
|
||||
from collections import OrderedDict, namedtuple
|
||||
from datetime import datetime
|
||||
|
||||
import redis
|
||||
|
||||
import pyarrow
|
||||
@@ -28,6 +24,14 @@ import ray.global_scheduler as global_scheduler
|
||||
import ray.local_scheduler
|
||||
import ray.plasma
|
||||
|
||||
from ray.tempfile_services import (
|
||||
get_ipython_notebook_path, get_logs_dir_path, get_raylet_socket_name,
|
||||
get_temp_redis_config_path, get_temp_root, new_global_scheduler_log_file,
|
||||
new_local_scheduler_log_file, new_log_monitor_log_file,
|
||||
new_monitor_log_file, new_plasma_manager_log_file,
|
||||
new_plasma_store_log_file, new_raylet_log_file, new_redis_log_file,
|
||||
new_webui_log_file, new_worker_log_file, set_temp_root)
|
||||
|
||||
PROCESS_TYPE_MONITOR = "monitor"
|
||||
PROCESS_TYPE_LOG_MONITOR = "log_monitor"
|
||||
PROCESS_TYPE_WORKER = "worker"
|
||||
@@ -120,10 +124,6 @@ def new_port():
|
||||
return random.randint(10000, 65535)
|
||||
|
||||
|
||||
def random_name():
|
||||
return str(random.randint(0, 99999999))
|
||||
|
||||
|
||||
def kill_process(p):
|
||||
"""Kill a process.
|
||||
|
||||
@@ -456,8 +456,7 @@ def start_redis(node_ip_address,
|
||||
A tuple of the address for the primary Redis shard and a list of
|
||||
addresses for the remaining shards.
|
||||
"""
|
||||
redis_stdout_file, redis_stderr_file = new_log_files(
|
||||
"redis", redirect_output)
|
||||
redis_stdout_file, redis_stderr_file = new_redis_log_file(redirect_output)
|
||||
|
||||
if redis_shard_ports is None:
|
||||
redis_shard_ports = num_redis_shards * [None]
|
||||
@@ -517,8 +516,8 @@ def start_redis(node_ip_address,
|
||||
# prefixed by "redis-<shard number>".
|
||||
redis_shards = []
|
||||
for i in range(num_redis_shards):
|
||||
redis_stdout_file, redis_stderr_file = new_log_files(
|
||||
"redis-{}".format(i), redirect_output)
|
||||
redis_stdout_file, redis_stderr_file = new_redis_log_file(
|
||||
redirect_output, shard_number=i)
|
||||
if not use_credis:
|
||||
redis_shard_port, _ = _start_redis_instance(
|
||||
node_ip_address=node_ip_address,
|
||||
@@ -572,7 +571,7 @@ def _make_temp_redis_config(node_ip_address):
|
||||
node_ip_address: The IP address of this node. This should not be
|
||||
127.0.0.1.
|
||||
"""
|
||||
redis_config_name = "/tmp/redis_conf{}".format(random_name())
|
||||
redis_config_name = get_temp_redis_config_path()
|
||||
with open(redis_config_name, 'w') as f:
|
||||
# This allows redis clients on the same machine to connect using the
|
||||
# node's IP address as opposed to just 127.0.0.1. This is only relevant
|
||||
@@ -799,15 +798,7 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
|
||||
then this process will be killed by services.cleanup() when the
|
||||
Python process that imported services exits.
|
||||
"""
|
||||
new_env = os.environ.copy()
|
||||
notebook_filepath = os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)), "WebUI.ipynb")
|
||||
# We copy the notebook file so that the original doesn't get modified by
|
||||
# the user.
|
||||
random_ui_id = random.randint(0, 100000)
|
||||
new_notebook_filepath = "/tmp/raylogs/ray_ui{}.ipynb".format(random_ui_id)
|
||||
new_notebook_directory = os.path.dirname(new_notebook_filepath)
|
||||
shutil.copy(notebook_filepath, new_notebook_filepath)
|
||||
|
||||
port = 8888
|
||||
while True:
|
||||
try:
|
||||
@@ -821,7 +812,8 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
|
||||
new_env["REDIS_ADDRESS"] = redis_address
|
||||
# We generate the token used for authentication ourselves to avoid
|
||||
# querying the jupyter server.
|
||||
token = ray.utils.decode(binascii.hexlify(os.urandom(24)))
|
||||
new_notebook_directory, webui_url, token = (
|
||||
get_ipython_notebook_path(port))
|
||||
# The --ip=0.0.0.0 flag is intended to enable connecting to a notebook
|
||||
# running within a docker container (from the outside).
|
||||
command = [
|
||||
@@ -847,8 +839,6 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
|
||||
else:
|
||||
if cleanup:
|
||||
all_processes[PROCESS_TYPE_WEB_UI].append(ui_process)
|
||||
webui_url = ("http://localhost:{}/notebooks/ray_ui{}.ipynb?token={}"
|
||||
.format(port, random_ui_id, token))
|
||||
logger.info("\n" + "=" * 70)
|
||||
logger.info("View the web UI at {}".format(webui_url))
|
||||
logger.info("=" * 70 + "\n")
|
||||
@@ -971,6 +961,7 @@ def start_local_scheduler(redis_address,
|
||||
|
||||
def start_raylet(redis_address,
|
||||
node_ip_address,
|
||||
raylet_name,
|
||||
plasma_store_name,
|
||||
worker_path,
|
||||
resources=None,
|
||||
@@ -988,6 +979,7 @@ def start_raylet(redis_address,
|
||||
scheduler is running on.
|
||||
plasma_store_name (str): The name of the plasma store socket to connect
|
||||
to.
|
||||
raylet_name (str): The name of the raylet socket to create.
|
||||
worker_path (str): The path of the script to use when the local
|
||||
scheduler starts up new workers.
|
||||
use_valgrind (bool): True if the raylet should be started inside
|
||||
@@ -1023,16 +1015,17 @@ def start_raylet(redis_address,
|
||||
])
|
||||
|
||||
gcs_ip_address, gcs_port = redis_address.split(":")
|
||||
raylet_name = "/tmp/raylet{}".format(random_name())
|
||||
|
||||
# Create the command that the Raylet will use to start workers.
|
||||
start_worker_command = ("{} {} "
|
||||
"--node-ip-address={} "
|
||||
"--object-store-name={} "
|
||||
"--raylet-name={} "
|
||||
"--redis-address={}".format(
|
||||
"--redis-address={} "
|
||||
"--temp-dir={}".format(
|
||||
sys.executable, worker_path, node_ip_address,
|
||||
plasma_store_name, raylet_name, redis_address))
|
||||
plasma_store_name, raylet_name, redis_address,
|
||||
get_temp_root()))
|
||||
|
||||
command = [
|
||||
RAYLET_EXECUTABLE,
|
||||
@@ -1084,7 +1077,8 @@ def start_plasma_store(node_ip_address,
|
||||
cleanup=True,
|
||||
plasma_directory=None,
|
||||
huge_pages=False,
|
||||
use_raylet=False):
|
||||
use_raylet=False,
|
||||
plasma_store_socket_name=None):
|
||||
"""This method starts an object store process.
|
||||
|
||||
Args:
|
||||
@@ -1158,7 +1152,8 @@ def start_plasma_store(node_ip_address,
|
||||
stdout_file=store_stdout_file,
|
||||
stderr_file=store_stderr_file,
|
||||
plasma_directory=plasma_directory,
|
||||
huge_pages=huge_pages)
|
||||
huge_pages=huge_pages,
|
||||
socket_name=plasma_store_socket_name)
|
||||
# Start the plasma manager.
|
||||
if not use_raylet:
|
||||
if object_manager_port is not None:
|
||||
@@ -1235,7 +1230,8 @@ def start_worker(node_ip_address,
|
||||
"--object-store-name=" + object_store_name,
|
||||
"--object-store-manager-name=" + object_store_manager_name,
|
||||
"--local-scheduler-name=" + local_scheduler_name,
|
||||
"--redis-address=" + str(redis_address)
|
||||
"--redis-address=" + str(redis_address),
|
||||
"--temp-dir=" + get_temp_root()
|
||||
]
|
||||
p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
|
||||
if cleanup:
|
||||
@@ -1327,7 +1323,10 @@ def start_ray_processes(address_info=None,
|
||||
plasma_directory=None,
|
||||
huge_pages=False,
|
||||
autoscaling_config=None,
|
||||
use_raylet=False):
|
||||
use_raylet=False,
|
||||
plasma_store_socket_name=None,
|
||||
raylet_socket_name=None,
|
||||
temp_dir=None):
|
||||
"""Helper method to start Ray processes.
|
||||
|
||||
Args:
|
||||
@@ -1385,13 +1384,22 @@ def start_ray_processes(address_info=None,
|
||||
autoscaling_config: path to autoscaling config file.
|
||||
use_raylet: True if the new raylet code path should be used. This is
|
||||
not supported yet.
|
||||
plasma_store_socket_name (str): If provided, it will specify the socket
|
||||
name used by the plasma store.
|
||||
raylet_socket_name (str): If provided, it will specify the socket path
|
||||
used by the raylet process.
|
||||
temp_dir (str): If provided, it will specify the root temporary
|
||||
directory for the Ray process.
|
||||
|
||||
Returns:
|
||||
A dictionary of the address information for the processes that were
|
||||
started.
|
||||
"""
|
||||
logger.info(
|
||||
"Process STDOUT and STDERR is being redirected to /tmp/raylogs/.")
|
||||
|
||||
set_temp_root(temp_dir)
|
||||
|
||||
logger.info("Process STDOUT and STDERR is being redirected to {}.".format(
|
||||
get_logs_dir_path()))
|
||||
|
||||
if resources is None:
|
||||
resources = {}
|
||||
@@ -1438,8 +1446,8 @@ def start_ray_processes(address_info=None,
|
||||
time.sleep(0.1)
|
||||
|
||||
# Start monitoring the processes.
|
||||
monitor_stdout_file, monitor_stderr_file = new_log_files(
|
||||
"monitor", redirect_output)
|
||||
monitor_stdout_file, monitor_stderr_file = new_monitor_log_file(
|
||||
redirect_output)
|
||||
start_monitor(
|
||||
redis_address,
|
||||
node_ip_address,
|
||||
@@ -1464,8 +1472,8 @@ def start_ray_processes(address_info=None,
|
||||
|
||||
# Start the log monitor, if necessary.
|
||||
if include_log_monitor:
|
||||
log_monitor_stdout_file, log_monitor_stderr_file = new_log_files(
|
||||
"log_monitor", redirect_output=True)
|
||||
log_monitor_stdout_file, log_monitor_stderr_file = (
|
||||
new_log_monitor_log_file())
|
||||
start_log_monitor(
|
||||
redis_address,
|
||||
node_ip_address,
|
||||
@@ -1476,7 +1484,7 @@ def start_ray_processes(address_info=None,
|
||||
# Start the global scheduler, if necessary.
|
||||
if include_global_scheduler and not use_raylet:
|
||||
global_scheduler_stdout_file, global_scheduler_stderr_file = (
|
||||
new_log_files("global_scheduler", redirect_output))
|
||||
new_global_scheduler_log_file(redirect_output))
|
||||
start_global_scheduler(
|
||||
redis_address,
|
||||
node_ip_address,
|
||||
@@ -1505,10 +1513,14 @@ def start_ray_processes(address_info=None,
|
||||
# Start any object stores that do not yet exist.
|
||||
for i in range(num_local_schedulers - len(object_store_addresses)):
|
||||
# Start Plasma.
|
||||
plasma_store_stdout_file, plasma_store_stderr_file = new_log_files(
|
||||
"plasma_store_{}".format(i), redirect_output)
|
||||
plasma_manager_stdout_file, plasma_manager_stderr_file = new_log_files(
|
||||
"plasma_manager_{}".format(i), redirect_output)
|
||||
plasma_store_stdout_file, plasma_store_stderr_file = (
|
||||
new_plasma_store_log_file(i, redirect_output))
|
||||
|
||||
# If we use raylet, plasma manager won't be started and we don't need
|
||||
# to create temp files for them.
|
||||
plasma_manager_stdout_file, plasma_manager_stderr_file = (
|
||||
new_plasma_manager_log_file(i, redirect_output and not use_raylet))
|
||||
|
||||
object_store_address = start_plasma_store(
|
||||
node_ip_address,
|
||||
redis_address,
|
||||
@@ -1521,7 +1533,8 @@ def start_ray_processes(address_info=None,
|
||||
cleanup=cleanup,
|
||||
plasma_directory=plasma_directory,
|
||||
huge_pages=huge_pages,
|
||||
use_raylet=use_raylet)
|
||||
use_raylet=use_raylet,
|
||||
plasma_store_socket_name=plasma_store_socket_name)
|
||||
object_store_addresses.append(object_store_address)
|
||||
time.sleep(0.1)
|
||||
|
||||
@@ -1546,9 +1559,8 @@ def start_ray_processes(address_info=None,
|
||||
# redirect the worker output, then we cannot redirect the local
|
||||
# scheduler output.
|
||||
local_scheduler_stdout_file, local_scheduler_stderr_file = (
|
||||
new_log_files(
|
||||
"local_scheduler_{}".format(i),
|
||||
redirect_output=redirect_worker_output))
|
||||
new_local_scheduler_log_file(
|
||||
i, redirect_output=redirect_worker_output))
|
||||
local_scheduler_name = start_local_scheduler(
|
||||
redis_address,
|
||||
node_ip_address,
|
||||
@@ -1571,12 +1583,13 @@ def start_ray_processes(address_info=None,
|
||||
else:
|
||||
# Start any raylets that do not exist yet.
|
||||
for i in range(len(raylet_socket_names), num_local_schedulers):
|
||||
raylet_stdout_file, raylet_stderr_file = new_log_files(
|
||||
"raylet_{}".format(i), redirect_output=redirect_worker_output)
|
||||
raylet_stdout_file, raylet_stderr_file = new_raylet_log_file(
|
||||
i, redirect_output=redirect_worker_output)
|
||||
address_info["raylet_socket_names"].append(
|
||||
start_raylet(
|
||||
redis_address,
|
||||
node_ip_address,
|
||||
raylet_socket_name or get_raylet_socket_name(),
|
||||
object_store_addresses[i].name,
|
||||
worker_path,
|
||||
resources=resources[i],
|
||||
@@ -1592,8 +1605,8 @@ def start_ray_processes(address_info=None,
|
||||
object_store_address = object_store_addresses[i]
|
||||
local_scheduler_name = local_scheduler_socket_names[i]
|
||||
for j in range(num_local_scheduler_workers):
|
||||
worker_stdout_file, worker_stderr_file = new_log_files(
|
||||
"worker_{}_{}".format(i, j), redirect_output)
|
||||
worker_stdout_file, worker_stderr_file = new_worker_log_file(
|
||||
i, j, redirect_output)
|
||||
start_worker(
|
||||
node_ip_address,
|
||||
object_store_address.name,
|
||||
@@ -1611,8 +1624,7 @@ def start_ray_processes(address_info=None,
|
||||
|
||||
# Try to start the web UI.
|
||||
if include_webui:
|
||||
ui_stdout_file, ui_stderr_file = new_log_files(
|
||||
"webui", redirect_output=True)
|
||||
ui_stdout_file, ui_stderr_file = new_webui_log_file()
|
||||
address_info["webui_url"] = start_ui(
|
||||
redis_address,
|
||||
stdout_file=ui_stdout_file,
|
||||
@@ -1637,7 +1649,10 @@ def start_ray_node(node_ip_address,
|
||||
resources=None,
|
||||
plasma_directory=None,
|
||||
huge_pages=False,
|
||||
use_raylet=False):
|
||||
use_raylet=False,
|
||||
plasma_store_socket_name=None,
|
||||
raylet_socket_name=None,
|
||||
temp_dir=None):
|
||||
"""Start the Ray processes for a single node.
|
||||
|
||||
This assumes that the Ray processes on some master node have already been
|
||||
@@ -1672,6 +1687,12 @@ def start_ray_node(node_ip_address,
|
||||
Store with hugetlbfs support. Requires plasma_directory.
|
||||
use_raylet: True if the new raylet code path should be used. This is
|
||||
not supported yet.
|
||||
plasma_store_socket_name (str): If provided, it will specify the socket
|
||||
name used by the plasma store.
|
||||
raylet_socket_name (str): If provided, it will specify the socket path
|
||||
used by the raylet process.
|
||||
temp_dir (str): If provided, it will specify the root temporary
|
||||
directory for the Ray process.
|
||||
|
||||
Returns:
|
||||
A dictionary of the address information for the processes that were
|
||||
@@ -1695,7 +1716,10 @@ def start_ray_node(node_ip_address,
|
||||
resources=resources,
|
||||
plasma_directory=plasma_directory,
|
||||
huge_pages=huge_pages,
|
||||
use_raylet=use_raylet)
|
||||
use_raylet=use_raylet,
|
||||
plasma_store_socket_name=plasma_store_socket_name,
|
||||
raylet_socket_name=raylet_socket_name,
|
||||
temp_dir=temp_dir)
|
||||
|
||||
|
||||
def start_ray_head(address_info=None,
|
||||
@@ -1718,7 +1742,10 @@ def start_ray_head(address_info=None,
|
||||
plasma_directory=None,
|
||||
huge_pages=False,
|
||||
autoscaling_config=None,
|
||||
use_raylet=False):
|
||||
use_raylet=False,
|
||||
plasma_store_socket_name=None,
|
||||
raylet_socket_name=None,
|
||||
temp_dir=None):
|
||||
"""Start Ray in local mode.
|
||||
|
||||
Args:
|
||||
@@ -1770,6 +1797,12 @@ def start_ray_head(address_info=None,
|
||||
autoscaling_config: path to autoscaling config file.
|
||||
use_raylet: True if the new raylet code path should be used. This is
|
||||
not supported yet.
|
||||
plasma_store_socket_name (str): If provided, it will specify the socket
|
||||
name used by the plasma store.
|
||||
raylet_socket_name (str): If provided, it will specify the socket path
|
||||
used by the raylet process.
|
||||
temp_dir (str): If provided, it will specify the root temporary
|
||||
directory for the Ray process.
|
||||
|
||||
Returns:
|
||||
A dictionary of the address information for the processes that were
|
||||
@@ -1799,58 +1832,7 @@ def start_ray_head(address_info=None,
|
||||
plasma_directory=plasma_directory,
|
||||
huge_pages=huge_pages,
|
||||
autoscaling_config=autoscaling_config,
|
||||
use_raylet=use_raylet)
|
||||
|
||||
|
||||
def try_to_create_directory(directory_path):
|
||||
"""Attempt to create a directory that is globally readable/writable.
|
||||
|
||||
Args:
|
||||
directory_path: The path of the directory to create.
|
||||
"""
|
||||
if not os.path.exists(directory_path):
|
||||
try:
|
||||
os.makedirs(directory_path)
|
||||
except OSError as e:
|
||||
if e.errno != os.errno.EEXIST:
|
||||
raise e
|
||||
logger.warning(
|
||||
"Attempted to create '{}', but the directory already "
|
||||
"exists.".format(directory_path))
|
||||
# Change the log directory permissions so others can use it. This is
|
||||
# important when multiple people are using the same machine.
|
||||
os.chmod(directory_path, 0o0777)
|
||||
|
||||
|
||||
def new_log_files(name, redirect_output):
|
||||
"""Generate partially randomized filenames for log files.
|
||||
|
||||
Args:
|
||||
name (str): descriptive string for this log file.
|
||||
redirect_output (bool): True if files should be generated for logging
|
||||
stdout and stderr and false if stdout and stderr should not be
|
||||
redirected.
|
||||
|
||||
Returns:
|
||||
If redirect_output is true, this will return a tuple of two
|
||||
filehandles. The first is for redirecting stdout and the second is
|
||||
for redirecting stderr. If redirect_output is false, this will
|
||||
return a tuple of two None objects.
|
||||
"""
|
||||
if not redirect_output:
|
||||
return None, None
|
||||
|
||||
# Create a directory to be used for process log files.
|
||||
logs_dir = "/tmp/raylogs"
|
||||
try_to_create_directory(logs_dir)
|
||||
# Create another directory that will be used by some of the RL algorithms.
|
||||
try_to_create_directory("/tmp/ray")
|
||||
|
||||
log_id = random.randint(0, 10000)
|
||||
date_str = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
|
||||
log_stdout = "{}/{}-{}-{:05d}.out".format(logs_dir, name, date_str, log_id)
|
||||
log_stderr = "{}/{}-{}-{:05d}.err".format(logs_dir, name, date_str, log_id)
|
||||
# Line-buffer the output (mode 1)
|
||||
log_stdout_file = open(log_stdout, "a", buffering=1)
|
||||
log_stderr_file = open(log_stderr, "a", buffering=1)
|
||||
return log_stdout_file, log_stderr_file
|
||||
use_raylet=use_raylet,
|
||||
plasma_store_socket_name=plasma_store_socket_name,
|
||||
raylet_socket_name=raylet_socket_name,
|
||||
temp_dir=temp_dir)
|
||||
|
||||
Reference in New Issue
Block a user