Change logfile names and also allow plasma store socket to be passed in. (#2862)

This commit is contained in:
Si-Yuan
2018-10-03 10:03:53 -07:00
committed by Robert Nishihara
parent 9c606ea06c
commit cc7e2ecdd5
13 changed files with 696 additions and 140 deletions
+93 -111
View File
@@ -2,14 +2,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import binascii
import json
import logging
import multiprocessing
import os
import random
import resource
import shutil
import signal
import socket
import subprocess
@@ -17,8 +15,6 @@ import sys
import threading
import time
from collections import OrderedDict, namedtuple
from datetime import datetime
import redis
import pyarrow
@@ -28,6 +24,14 @@ import ray.global_scheduler as global_scheduler
import ray.local_scheduler
import ray.plasma
from ray.tempfile_services import (
get_ipython_notebook_path, get_logs_dir_path, get_raylet_socket_name,
get_temp_redis_config_path, get_temp_root, new_global_scheduler_log_file,
new_local_scheduler_log_file, new_log_monitor_log_file,
new_monitor_log_file, new_plasma_manager_log_file,
new_plasma_store_log_file, new_raylet_log_file, new_redis_log_file,
new_webui_log_file, new_worker_log_file, set_temp_root)
PROCESS_TYPE_MONITOR = "monitor"
PROCESS_TYPE_LOG_MONITOR = "log_monitor"
PROCESS_TYPE_WORKER = "worker"
@@ -120,10 +124,6 @@ def new_port():
return random.randint(10000, 65535)
def random_name():
    """Return a random decimal string usable as a name suffix.

    Returns:
        The string form of an integer drawn uniformly from the inclusive
        range [0, 99999999].
    """
    suffix = random.randint(0, 99999999)
    return "{}".format(suffix)
def kill_process(p):
"""Kill a process.
@@ -456,8 +456,7 @@ def start_redis(node_ip_address,
A tuple of the address for the primary Redis shard and a list of
addresses for the remaining shards.
"""
redis_stdout_file, redis_stderr_file = new_log_files(
"redis", redirect_output)
redis_stdout_file, redis_stderr_file = new_redis_log_file(redirect_output)
if redis_shard_ports is None:
redis_shard_ports = num_redis_shards * [None]
@@ -517,8 +516,8 @@ def start_redis(node_ip_address,
# prefixed by "redis-<shard number>".
redis_shards = []
for i in range(num_redis_shards):
redis_stdout_file, redis_stderr_file = new_log_files(
"redis-{}".format(i), redirect_output)
redis_stdout_file, redis_stderr_file = new_redis_log_file(
redirect_output, shard_number=i)
if not use_credis:
redis_shard_port, _ = _start_redis_instance(
node_ip_address=node_ip_address,
@@ -572,7 +571,7 @@ def _make_temp_redis_config(node_ip_address):
node_ip_address: The IP address of this node. This should not be
127.0.0.1.
"""
redis_config_name = "/tmp/redis_conf{}".format(random_name())
redis_config_name = get_temp_redis_config_path()
with open(redis_config_name, 'w') as f:
# This allows redis clients on the same machine to connect using the
# node's IP address as opposed to just 127.0.0.1. This is only relevant
@@ -799,15 +798,7 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
then this process will be killed by services.cleanup() when the
Python process that imported services exits.
"""
new_env = os.environ.copy()
notebook_filepath = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "WebUI.ipynb")
# We copy the notebook file so that the original doesn't get modified by
# the user.
random_ui_id = random.randint(0, 100000)
new_notebook_filepath = "/tmp/raylogs/ray_ui{}.ipynb".format(random_ui_id)
new_notebook_directory = os.path.dirname(new_notebook_filepath)
shutil.copy(notebook_filepath, new_notebook_filepath)
port = 8888
while True:
try:
@@ -821,7 +812,8 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
new_env["REDIS_ADDRESS"] = redis_address
# We generate the token used for authentication ourselves to avoid
# querying the jupyter server.
token = ray.utils.decode(binascii.hexlify(os.urandom(24)))
new_notebook_directory, webui_url, token = (
get_ipython_notebook_path(port))
# The --ip=0.0.0.0 flag is intended to enable connecting to a notebook
# running within a docker container (from the outside).
command = [
@@ -847,8 +839,6 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
else:
if cleanup:
all_processes[PROCESS_TYPE_WEB_UI].append(ui_process)
webui_url = ("http://localhost:{}/notebooks/ray_ui{}.ipynb?token={}"
.format(port, random_ui_id, token))
logger.info("\n" + "=" * 70)
logger.info("View the web UI at {}".format(webui_url))
logger.info("=" * 70 + "\n")
@@ -971,6 +961,7 @@ def start_local_scheduler(redis_address,
def start_raylet(redis_address,
node_ip_address,
raylet_name,
plasma_store_name,
worker_path,
resources=None,
@@ -988,6 +979,7 @@ def start_raylet(redis_address,
scheduler is running on.
plasma_store_name (str): The name of the plasma store socket to connect
to.
raylet_name (str): The name of the raylet socket to create.
worker_path (str): The path of the script to use when the local
scheduler starts up new workers.
use_valgrind (bool): True if the raylet should be started inside
@@ -1023,16 +1015,17 @@ def start_raylet(redis_address,
])
gcs_ip_address, gcs_port = redis_address.split(":")
raylet_name = "/tmp/raylet{}".format(random_name())
# Create the command that the Raylet will use to start workers.
start_worker_command = ("{} {} "
"--node-ip-address={} "
"--object-store-name={} "
"--raylet-name={} "
"--redis-address={}".format(
"--redis-address={} "
"--temp-dir={}".format(
sys.executable, worker_path, node_ip_address,
plasma_store_name, raylet_name, redis_address))
plasma_store_name, raylet_name, redis_address,
get_temp_root()))
command = [
RAYLET_EXECUTABLE,
@@ -1084,7 +1077,8 @@ def start_plasma_store(node_ip_address,
cleanup=True,
plasma_directory=None,
huge_pages=False,
use_raylet=False):
use_raylet=False,
plasma_store_socket_name=None):
"""This method starts an object store process.
Args:
@@ -1158,7 +1152,8 @@ def start_plasma_store(node_ip_address,
stdout_file=store_stdout_file,
stderr_file=store_stderr_file,
plasma_directory=plasma_directory,
huge_pages=huge_pages)
huge_pages=huge_pages,
socket_name=plasma_store_socket_name)
# Start the plasma manager.
if not use_raylet:
if object_manager_port is not None:
@@ -1235,7 +1230,8 @@ def start_worker(node_ip_address,
"--object-store-name=" + object_store_name,
"--object-store-manager-name=" + object_store_manager_name,
"--local-scheduler-name=" + local_scheduler_name,
"--redis-address=" + str(redis_address)
"--redis-address=" + str(redis_address),
"--temp-dir=" + get_temp_root()
]
p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
if cleanup:
@@ -1327,7 +1323,10 @@ def start_ray_processes(address_info=None,
plasma_directory=None,
huge_pages=False,
autoscaling_config=None,
use_raylet=False):
use_raylet=False,
plasma_store_socket_name=None,
raylet_socket_name=None,
temp_dir=None):
"""Helper method to start Ray processes.
Args:
@@ -1385,13 +1384,22 @@ def start_ray_processes(address_info=None,
autoscaling_config: path to autoscaling config file.
use_raylet: True if the new raylet code path should be used. This is
not supported yet.
plasma_store_socket_name (str): If provided, it will specify the socket
name used by the plasma store.
raylet_socket_name (str): If provided, it will specify the socket path
used by the raylet process.
temp_dir (str): If provided, it will specify the root temporary
directory for the Ray process.
Returns:
A dictionary of the address information for the processes that were
started.
"""
logger.info(
"Process STDOUT and STDERR is being redirected to /tmp/raylogs/.")
set_temp_root(temp_dir)
logger.info("Process STDOUT and STDERR is being redirected to {}.".format(
get_logs_dir_path()))
if resources is None:
resources = {}
@@ -1438,8 +1446,8 @@ def start_ray_processes(address_info=None,
time.sleep(0.1)
# Start monitoring the processes.
monitor_stdout_file, monitor_stderr_file = new_log_files(
"monitor", redirect_output)
monitor_stdout_file, monitor_stderr_file = new_monitor_log_file(
redirect_output)
start_monitor(
redis_address,
node_ip_address,
@@ -1464,8 +1472,8 @@ def start_ray_processes(address_info=None,
# Start the log monitor, if necessary.
if include_log_monitor:
log_monitor_stdout_file, log_monitor_stderr_file = new_log_files(
"log_monitor", redirect_output=True)
log_monitor_stdout_file, log_monitor_stderr_file = (
new_log_monitor_log_file())
start_log_monitor(
redis_address,
node_ip_address,
@@ -1476,7 +1484,7 @@ def start_ray_processes(address_info=None,
# Start the global scheduler, if necessary.
if include_global_scheduler and not use_raylet:
global_scheduler_stdout_file, global_scheduler_stderr_file = (
new_log_files("global_scheduler", redirect_output))
new_global_scheduler_log_file(redirect_output))
start_global_scheduler(
redis_address,
node_ip_address,
@@ -1505,10 +1513,14 @@ def start_ray_processes(address_info=None,
# Start any object stores that do not yet exist.
for i in range(num_local_schedulers - len(object_store_addresses)):
# Start Plasma.
plasma_store_stdout_file, plasma_store_stderr_file = new_log_files(
"plasma_store_{}".format(i), redirect_output)
plasma_manager_stdout_file, plasma_manager_stderr_file = new_log_files(
"plasma_manager_{}".format(i), redirect_output)
plasma_store_stdout_file, plasma_store_stderr_file = (
new_plasma_store_log_file(i, redirect_output))
# If we use raylet, the plasma manager won't be started, so we don't need
# to create temp files for it.
plasma_manager_stdout_file, plasma_manager_stderr_file = (
new_plasma_manager_log_file(i, redirect_output and not use_raylet))
object_store_address = start_plasma_store(
node_ip_address,
redis_address,
@@ -1521,7 +1533,8 @@ def start_ray_processes(address_info=None,
cleanup=cleanup,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
use_raylet=use_raylet)
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name)
object_store_addresses.append(object_store_address)
time.sleep(0.1)
@@ -1546,9 +1559,8 @@ def start_ray_processes(address_info=None,
# redirect the worker output, then we cannot redirect the local
# scheduler output.
local_scheduler_stdout_file, local_scheduler_stderr_file = (
new_log_files(
"local_scheduler_{}".format(i),
redirect_output=redirect_worker_output))
new_local_scheduler_log_file(
i, redirect_output=redirect_worker_output))
local_scheduler_name = start_local_scheduler(
redis_address,
node_ip_address,
@@ -1571,12 +1583,13 @@ def start_ray_processes(address_info=None,
else:
# Start any raylets that do not exist yet.
for i in range(len(raylet_socket_names), num_local_schedulers):
raylet_stdout_file, raylet_stderr_file = new_log_files(
"raylet_{}".format(i), redirect_output=redirect_worker_output)
raylet_stdout_file, raylet_stderr_file = new_raylet_log_file(
i, redirect_output=redirect_worker_output)
address_info["raylet_socket_names"].append(
start_raylet(
redis_address,
node_ip_address,
raylet_socket_name or get_raylet_socket_name(),
object_store_addresses[i].name,
worker_path,
resources=resources[i],
@@ -1592,8 +1605,8 @@ def start_ray_processes(address_info=None,
object_store_address = object_store_addresses[i]
local_scheduler_name = local_scheduler_socket_names[i]
for j in range(num_local_scheduler_workers):
worker_stdout_file, worker_stderr_file = new_log_files(
"worker_{}_{}".format(i, j), redirect_output)
worker_stdout_file, worker_stderr_file = new_worker_log_file(
i, j, redirect_output)
start_worker(
node_ip_address,
object_store_address.name,
@@ -1611,8 +1624,7 @@ def start_ray_processes(address_info=None,
# Try to start the web UI.
if include_webui:
ui_stdout_file, ui_stderr_file = new_log_files(
"webui", redirect_output=True)
ui_stdout_file, ui_stderr_file = new_webui_log_file()
address_info["webui_url"] = start_ui(
redis_address,
stdout_file=ui_stdout_file,
@@ -1637,7 +1649,10 @@ def start_ray_node(node_ip_address,
resources=None,
plasma_directory=None,
huge_pages=False,
use_raylet=False):
use_raylet=False,
plasma_store_socket_name=None,
raylet_socket_name=None,
temp_dir=None):
"""Start the Ray processes for a single node.
This assumes that the Ray processes on some master node have already been
@@ -1672,6 +1687,12 @@ def start_ray_node(node_ip_address,
Store with hugetlbfs support. Requires plasma_directory.
use_raylet: True if the new raylet code path should be used. This is
not supported yet.
plasma_store_socket_name (str): If provided, it will specify the socket
name used by the plasma store.
raylet_socket_name (str): If provided, it will specify the socket path
used by the raylet process.
temp_dir (str): If provided, it will specify the root temporary
directory for the Ray process.
Returns:
A dictionary of the address information for the processes that were
@@ -1695,7 +1716,10 @@ def start_ray_node(node_ip_address,
resources=resources,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
use_raylet=use_raylet)
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir)
def start_ray_head(address_info=None,
@@ -1718,7 +1742,10 @@ def start_ray_head(address_info=None,
plasma_directory=None,
huge_pages=False,
autoscaling_config=None,
use_raylet=False):
use_raylet=False,
plasma_store_socket_name=None,
raylet_socket_name=None,
temp_dir=None):
"""Start Ray in local mode.
Args:
@@ -1770,6 +1797,12 @@ def start_ray_head(address_info=None,
autoscaling_config: path to autoscaling config file.
use_raylet: True if the new raylet code path should be used. This is
not supported yet.
plasma_store_socket_name (str): If provided, it will specify the socket
name used by the plasma store.
raylet_socket_name (str): If provided, it will specify the socket path
used by the raylet process.
temp_dir (str): If provided, it will specify the root temporary
directory for the Ray process.
Returns:
A dictionary of the address information for the processes that were
@@ -1799,58 +1832,7 @@ def start_ray_head(address_info=None,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
autoscaling_config=autoscaling_config,
use_raylet=use_raylet)
def try_to_create_directory(directory_path):
    """Attempt to create a directory that is globally readable/writable.

    If the directory does not exist yet, it is created (including any missing
    parent directories) and its mode is set to 0o777.

    Args:
        directory_path: The path of the directory to create.

    Raises:
        OSError: If the directory could not be created for any reason other
            than it already existing, or if changing its permissions fails.
    """
    # Function-scope import so this block does not rely on a file-level
    # import. `os.errno` is an undocumented alias that no longer exists in
    # Python 3.7+, so the error code must come from the `errno` module.
    import errno

    if not os.path.exists(directory_path):
        try:
            os.makedirs(directory_path)
        except OSError as e:
            # Another process may have created the directory between the
            # existence check and makedirs; only that case is tolerated.
            if e.errno != errno.EEXIST:
                raise
            logger.warning(
                "Attempted to create '{}', but the directory already "
                "exists.".format(directory_path))
        # Change the log directory permissions so others can use it. This is
        # important when multiple people are using the same machine.
        # NOTE(review): the chmod is applied on the creation path only; the
        # nesting is not visible in the flattened source -- confirm.
        os.chmod(directory_path, 0o0777)
def new_log_files(name, redirect_output):
    """Open a pair of uniquely named stdout/stderr log files for a process.

    The file names combine the given name, the current timestamp and a random
    id, and the files live in the "/tmp/raylogs" directory, which is created
    on demand.

    Args:
        name (str): descriptive string used as the prefix of the file names.
        redirect_output (bool): True if log files should be opened for stdout
            and stderr, false if stdout and stderr should not be redirected.

    Returns:
        A tuple (stdout_file, stderr_file). Both entries are file handles
        opened in append mode when redirect_output is true, and both are None
        otherwise.
    """
    if redirect_output:
        # Directory that holds the process log files.
        log_root = "/tmp/raylogs"
        try_to_create_directory(log_root)
        # A second directory that is used by some of the RL algorithms.
        try_to_create_directory("/tmp/ray")

        suffix = random.randint(0, 10000)
        timestamp = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
        out_path = "{}/{}-{}-{:05d}.out".format(log_root, name, timestamp,
                                                suffix)
        err_path = "{}/{}-{}-{:05d}.err".format(log_root, name, timestamp,
                                                suffix)

        # buffering=1 selects line buffering for both files.
        stdout_handle = open(out_path, "a", buffering=1)
        stderr_handle = open(err_path, "a", buffering=1)
        return stdout_handle, stderr_handle

    return None, None
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir)