Change logfile names and also allow plasma store socket to be passed in. (#2862)

This commit is contained in:
Si-Yuan
2018-10-03 10:03:53 -07:00
committed by Robert Nishihara
parent 9c606ea06c
commit cc7e2ecdd5
13 changed files with 696 additions and 140 deletions
+6
View File
@@ -165,6 +165,9 @@ matrix:
- python -m pytest -v python/ray/rllib/test/test_optimizers.py
- python -m pytest -v python/ray/rllib/test/test_evaluators.py
# ray temp file tests
- python -m pytest -v test/tempfile_test.py
install:
- ./.travis/install-dependencies.sh
@@ -237,6 +240,9 @@ script:
- python -m pytest -v python/ray/rllib/test/test_optimizers.py
- python -m pytest -v python/ray/rllib/test/test_evaluators.py
# ray temp file tests
- python -m pytest -v test/tempfile_test.py
deploy:
- provider: s3
access_key_id: AKIAJ2L7XDUSZVTXI5QA
+1
View File
@@ -118,6 +118,7 @@ Ray comes with libraries that accelerate deep learning and reinforcement learnin
plasma-object-store.rst
resources.rst
redis-memory-management.rst
tempfile.rst
.. toctree::
:maxdepth: 1
+87
View File
@@ -0,0 +1,87 @@
Temporary Files
===============
Ray will produce some temporary files during running.
They are useful for logging, debugging & sharing object store with other programs.
Location of Temporary Files
---------------------------
First we introduce the concept of a session of Ray.
A session contains a set of processes. A session is created by executing
``ray start`` command or call ``ray.init()`` in a Python script and ended by
executing ``ray stop`` or call ``ray.shutdown()``.
For each session, Ray will create a *root temporary directory* to place all its
temporary files. The path is ``/tmp/ray/session_{datetime}_{pid}`` by default.
The pid belongs to the startup process (the process calling ``ray.init()`` or
the Ray process executed by a shell in ``ray start``).
You can sort by their names to find the latest session.
You are allowed to change the *root temporary directory* in one of these ways:
* Pass ``--temp-dir={your temp path}`` to ``ray start``
* Specify ``temp_dir`` when call ``ray.init()``
You can also use ``default_worker.py --temp-dir={your temp path}`` to
start a new worker with given *root temporary directory*.
The *root temporary directory* you specified will be given as it is,
without pids or datetime attached.
Layout of Temporary Files
-------------------------
A typical layout of temporary files could look like this:
.. code-block:: text
/tmp
└── ray
└── session_{datetime}_{pid}
├── logs
│   ├── log_monitor.err
│   ├── log_monitor.out
│   ├── monitor.err
│   ├── monitor.out
│   ├── plasma_manager_0.err # array of plasma managers' outputs
│   ├── plasma_manager_0.out
│   ├── plasma_store_0.err # array of plasma stores' outputs
│   ├── plasma_store_0.out
│   ├── raylet_0.err # array of raylets' outputs. Control it with `--no-redirect-worker-output` (in Ray's command line) or `redirect_worker_output` (in ray.init())
│   ├── raylet_0.out
│   ├── redis-shard_0.err # array of redis shards' outputs
│   ├── redis-shard_0.out
│   ├── redis.err # redis
│   ├── redis.out
│   ├── webui.err # ipython notebook web ui
│   ├── webui.out
│   ├── worker-{worker_id}.err # redirected output of workers
│   ├── worker-{worker_id}.out
│   └── {other workers}
├── ray_ui.ipynb # ipython notebook file
└── sockets # for logging
├── plasma_store
└── raylet # this could be deleted by Ray's shutdown cleanup.
Plasma Object Store Socket
--------------------------
Plasma object store sockets can be used to share objects with other programs using Apache Arrow.
You are allowed to specify the plasma object store socket in one of these ways:
* Pass ``--plasma-store-socket-name={your socket path}`` to ``ray start``
* Specify ``plasma_store_socket_name`` when call ``ray.init()``
The path you specified will be given as it is without being affected any other paths.
Notes
-----
Temporary file policies are defined in ``python/ray/tempfile_services.py``.
Currently, we keep ``/tmp/ray`` as the default directory for temporary data files of RLlib as before.
It is not very reasonable and could be changed later.
@@ -4,14 +4,12 @@ from __future__ import print_function
import multiprocessing
import os
import random
import subprocess
import sys
import time
def random_name():
return str(random.randint(0, 99999999))
from ray.tempfile_services import (get_local_scheduler_socket_name,
get_temp_root)
def start_local_scheduler(plasma_store_name,
@@ -71,7 +69,7 @@ def start_local_scheduler(plasma_store_name,
local_scheduler_executable = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"../core/src/local_scheduler/local_scheduler")
local_scheduler_name = "/tmp/scheduler{}".format(random_name())
local_scheduler_name = get_local_scheduler_socket_name()
command = [
local_scheduler_executable, "-s", local_scheduler_name, "-p",
plasma_store_name, "-h", node_ip_address, "-n",
@@ -88,11 +86,12 @@ def start_local_scheduler(plasma_store_name,
"--object-store-name={} "
"--object-store-manager-name={} "
"--local-scheduler-name={} "
"--redis-address={}".format(
"--redis-address={} "
"--temp-dir={}".format(
sys.executable, worker_path,
node_ip_address, plasma_store_name,
plasma_manager_name, local_scheduler_name,
redis_address))
redis_address, get_temp_root()))
command += ["-w", start_worker_command]
if redis_address is not None:
command += ["-r", redis_address]
+9 -7
View File
@@ -8,6 +8,9 @@ import subprocess
import sys
import time
from ray.tempfile_services import (get_object_store_socket_name,
get_plasma_manager_socket_name)
__all__ = [
"start_plasma_store", "start_plasma_manager", "DEFAULT_PLASMA_STORE_MEMORY"
]
@@ -17,17 +20,14 @@ PLASMA_WAIT_TIMEOUT = 2**30
DEFAULT_PLASMA_STORE_MEMORY = 10**9
def random_name():
return str(random.randint(0, 99999999))
def start_plasma_store(plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY,
use_valgrind=False,
use_profiler=False,
stdout_file=None,
stderr_file=None,
plasma_directory=None,
huge_pages=False):
huge_pages=False,
socket_name=None):
"""Start a plasma store process.
Args:
@@ -43,6 +43,8 @@ def start_plasma_store(plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY,
be created.
huge_pages: a boolean flag indicating whether to start the
Object Store with hugetlbfs support. Requires plasma_directory.
socket_name (str): If provided, it will specify the socket
name used by the plasma store.
Return:
A tuple of the name of the plasma store socket and the process ID of
@@ -66,7 +68,7 @@ def start_plasma_store(plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY,
plasma_store_executable = os.path.join(
os.path.abspath(os.path.dirname(__file__)),
"../core/src/plasma/plasma_store_server")
plasma_store_name = "/tmp/plasma_store{}".format(random_name())
plasma_store_name = socket_name or get_object_store_socket_name()
command = [
plasma_store_executable, "-s", plasma_store_name, "-m",
str(plasma_store_memory)
@@ -136,7 +138,7 @@ def start_plasma_manager(store_name,
plasma_manager_executable = os.path.join(
os.path.abspath(os.path.dirname(__file__)),
"../core/src/plasma/plasma_manager")
plasma_manager_name = "/tmp/plasma_manager{}".format(random_name())
plasma_manager_name = get_plasma_manager_socket_name()
if plasma_manager_port is not None:
if num_retries != 1:
raise Exception("num_retries must be 1 if port is specified.")
+2 -2
View File
@@ -5,7 +5,7 @@ from __future__ import print_function
import os
import ray
from ray.local_scheduler import ObjectID
def env_integer(key, default):
@@ -15,7 +15,7 @@ def env_integer(key, default):
ID_SIZE = 20
NIL_JOB_ID = ray.ObjectID(ID_SIZE * b"\xff")
NIL_JOB_ID = ObjectID(ID_SIZE * b"\xff")
# If a remote function or actor (or some other export) has serialized size
# greater than this quantity, print an warning.
+22 -3
View File
@@ -177,11 +177,24 @@ def cli(logging_level, logging_format):
is_flag=True,
default=False,
help="do not redirect non-worker stdout and stderr to files")
@click.option(
"--plasma-store-socket-name",
default=None,
help="manually specify the socket name of the plasma store")
@click.option(
"--raylet-socket-name",
default=None,
help="manually specify the socket path of the raylet process")
@click.option(
"--temp-dir",
default=None,
help="manually specify the root temporary dir of the Ray process")
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
redis_max_clients, redis_shard_ports, object_manager_port,
object_store_memory, num_workers, num_cpus, num_gpus, resources,
head, no_ui, block, plasma_directory, huge_pages, autoscaling_config,
use_raylet, no_redirect_worker_output, no_redirect_output):
use_raylet, no_redirect_worker_output, no_redirect_output,
plasma_store_socket_name, raylet_socket_name, temp_dir):
# Convert hostnames to numerical IP address.
if node_ip_address is not None:
node_ip_address = services.address_to_ip(node_ip_address)
@@ -260,7 +273,10 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
autoscaling_config=autoscaling_config,
use_raylet=use_raylet)
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir)
logger.info(address_info)
logger.info(
"\nStarted Ray on this node. You can add additional nodes to "
@@ -329,7 +345,10 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
resources=resources,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
use_raylet=use_raylet)
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir)
logger.info(address_info)
logger.info("\nStarted Ray on this node. If you wish to terminate the "
"processes that have been started, run\n\n"
+93 -111
View File
@@ -2,14 +2,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import binascii
import json
import logging
import multiprocessing
import os
import random
import resource
import shutil
import signal
import socket
import subprocess
@@ -17,8 +15,6 @@ import sys
import threading
import time
from collections import OrderedDict, namedtuple
from datetime import datetime
import redis
import pyarrow
@@ -28,6 +24,14 @@ import ray.global_scheduler as global_scheduler
import ray.local_scheduler
import ray.plasma
from ray.tempfile_services import (
get_ipython_notebook_path, get_logs_dir_path, get_raylet_socket_name,
get_temp_redis_config_path, get_temp_root, new_global_scheduler_log_file,
new_local_scheduler_log_file, new_log_monitor_log_file,
new_monitor_log_file, new_plasma_manager_log_file,
new_plasma_store_log_file, new_raylet_log_file, new_redis_log_file,
new_webui_log_file, new_worker_log_file, set_temp_root)
PROCESS_TYPE_MONITOR = "monitor"
PROCESS_TYPE_LOG_MONITOR = "log_monitor"
PROCESS_TYPE_WORKER = "worker"
@@ -120,10 +124,6 @@ def new_port():
return random.randint(10000, 65535)
def random_name():
return str(random.randint(0, 99999999))
def kill_process(p):
"""Kill a process.
@@ -456,8 +456,7 @@ def start_redis(node_ip_address,
A tuple of the address for the primary Redis shard and a list of
addresses for the remaining shards.
"""
redis_stdout_file, redis_stderr_file = new_log_files(
"redis", redirect_output)
redis_stdout_file, redis_stderr_file = new_redis_log_file(redirect_output)
if redis_shard_ports is None:
redis_shard_ports = num_redis_shards * [None]
@@ -517,8 +516,8 @@ def start_redis(node_ip_address,
# prefixed by "redis-<shard number>".
redis_shards = []
for i in range(num_redis_shards):
redis_stdout_file, redis_stderr_file = new_log_files(
"redis-{}".format(i), redirect_output)
redis_stdout_file, redis_stderr_file = new_redis_log_file(
redirect_output, shard_number=i)
if not use_credis:
redis_shard_port, _ = _start_redis_instance(
node_ip_address=node_ip_address,
@@ -572,7 +571,7 @@ def _make_temp_redis_config(node_ip_address):
node_ip_address: The IP address of this node. This should not be
127.0.0.1.
"""
redis_config_name = "/tmp/redis_conf{}".format(random_name())
redis_config_name = get_temp_redis_config_path()
with open(redis_config_name, 'w') as f:
# This allows redis clients on the same machine to connect using the
# node's IP address as opposed to just 127.0.0.1. This is only relevant
@@ -799,15 +798,7 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
then this process will be killed by services.cleanup() when the
Python process that imported services exits.
"""
new_env = os.environ.copy()
notebook_filepath = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "WebUI.ipynb")
# We copy the notebook file so that the original doesn't get modified by
# the user.
random_ui_id = random.randint(0, 100000)
new_notebook_filepath = "/tmp/raylogs/ray_ui{}.ipynb".format(random_ui_id)
new_notebook_directory = os.path.dirname(new_notebook_filepath)
shutil.copy(notebook_filepath, new_notebook_filepath)
port = 8888
while True:
try:
@@ -821,7 +812,8 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
new_env["REDIS_ADDRESS"] = redis_address
# We generate the token used for authentication ourselves to avoid
# querying the jupyter server.
token = ray.utils.decode(binascii.hexlify(os.urandom(24)))
new_notebook_directory, webui_url, token = (
get_ipython_notebook_path(port))
# The --ip=0.0.0.0 flag is intended to enable connecting to a notebook
# running within a docker container (from the outside).
command = [
@@ -847,8 +839,6 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
else:
if cleanup:
all_processes[PROCESS_TYPE_WEB_UI].append(ui_process)
webui_url = ("http://localhost:{}/notebooks/ray_ui{}.ipynb?token={}"
.format(port, random_ui_id, token))
logger.info("\n" + "=" * 70)
logger.info("View the web UI at {}".format(webui_url))
logger.info("=" * 70 + "\n")
@@ -971,6 +961,7 @@ def start_local_scheduler(redis_address,
def start_raylet(redis_address,
node_ip_address,
raylet_name,
plasma_store_name,
worker_path,
resources=None,
@@ -988,6 +979,7 @@ def start_raylet(redis_address,
scheduler is running on.
plasma_store_name (str): The name of the plasma store socket to connect
to.
raylet_name (str): The name of the raylet socket to create.
worker_path (str): The path of the script to use when the local
scheduler starts up new workers.
use_valgrind (bool): True if the raylet should be started inside
@@ -1023,16 +1015,17 @@ def start_raylet(redis_address,
])
gcs_ip_address, gcs_port = redis_address.split(":")
raylet_name = "/tmp/raylet{}".format(random_name())
# Create the command that the Raylet will use to start workers.
start_worker_command = ("{} {} "
"--node-ip-address={} "
"--object-store-name={} "
"--raylet-name={} "
"--redis-address={}".format(
"--redis-address={} "
"--temp-dir={}".format(
sys.executable, worker_path, node_ip_address,
plasma_store_name, raylet_name, redis_address))
plasma_store_name, raylet_name, redis_address,
get_temp_root()))
command = [
RAYLET_EXECUTABLE,
@@ -1084,7 +1077,8 @@ def start_plasma_store(node_ip_address,
cleanup=True,
plasma_directory=None,
huge_pages=False,
use_raylet=False):
use_raylet=False,
plasma_store_socket_name=None):
"""This method starts an object store process.
Args:
@@ -1158,7 +1152,8 @@ def start_plasma_store(node_ip_address,
stdout_file=store_stdout_file,
stderr_file=store_stderr_file,
plasma_directory=plasma_directory,
huge_pages=huge_pages)
huge_pages=huge_pages,
socket_name=plasma_store_socket_name)
# Start the plasma manager.
if not use_raylet:
if object_manager_port is not None:
@@ -1235,7 +1230,8 @@ def start_worker(node_ip_address,
"--object-store-name=" + object_store_name,
"--object-store-manager-name=" + object_store_manager_name,
"--local-scheduler-name=" + local_scheduler_name,
"--redis-address=" + str(redis_address)
"--redis-address=" + str(redis_address),
"--temp-dir=" + get_temp_root()
]
p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
if cleanup:
@@ -1327,7 +1323,10 @@ def start_ray_processes(address_info=None,
plasma_directory=None,
huge_pages=False,
autoscaling_config=None,
use_raylet=False):
use_raylet=False,
plasma_store_socket_name=None,
raylet_socket_name=None,
temp_dir=None):
"""Helper method to start Ray processes.
Args:
@@ -1385,13 +1384,22 @@ def start_ray_processes(address_info=None,
autoscaling_config: path to autoscaling config file.
use_raylet: True if the new raylet code path should be used. This is
not supported yet.
plasma_store_socket_name (str): If provided, it will specify the socket
name used by the plasma store.
raylet_socket_name (str): If provided, it will specify the socket path
used by the raylet process.
temp_dir (str): If provided, it will specify the root temporary
directory for the Ray process.
Returns:
A dictionary of the address information for the processes that were
started.
"""
logger.info(
"Process STDOUT and STDERR is being redirected to /tmp/raylogs/.")
set_temp_root(temp_dir)
logger.info("Process STDOUT and STDERR is being redirected to {}.".format(
get_logs_dir_path()))
if resources is None:
resources = {}
@@ -1438,8 +1446,8 @@ def start_ray_processes(address_info=None,
time.sleep(0.1)
# Start monitoring the processes.
monitor_stdout_file, monitor_stderr_file = new_log_files(
"monitor", redirect_output)
monitor_stdout_file, monitor_stderr_file = new_monitor_log_file(
redirect_output)
start_monitor(
redis_address,
node_ip_address,
@@ -1464,8 +1472,8 @@ def start_ray_processes(address_info=None,
# Start the log monitor, if necessary.
if include_log_monitor:
log_monitor_stdout_file, log_monitor_stderr_file = new_log_files(
"log_monitor", redirect_output=True)
log_monitor_stdout_file, log_monitor_stderr_file = (
new_log_monitor_log_file())
start_log_monitor(
redis_address,
node_ip_address,
@@ -1476,7 +1484,7 @@ def start_ray_processes(address_info=None,
# Start the global scheduler, if necessary.
if include_global_scheduler and not use_raylet:
global_scheduler_stdout_file, global_scheduler_stderr_file = (
new_log_files("global_scheduler", redirect_output))
new_global_scheduler_log_file(redirect_output))
start_global_scheduler(
redis_address,
node_ip_address,
@@ -1505,10 +1513,14 @@ def start_ray_processes(address_info=None,
# Start any object stores that do not yet exist.
for i in range(num_local_schedulers - len(object_store_addresses)):
# Start Plasma.
plasma_store_stdout_file, plasma_store_stderr_file = new_log_files(
"plasma_store_{}".format(i), redirect_output)
plasma_manager_stdout_file, plasma_manager_stderr_file = new_log_files(
"plasma_manager_{}".format(i), redirect_output)
plasma_store_stdout_file, plasma_store_stderr_file = (
new_plasma_store_log_file(i, redirect_output))
# If we use raylet, plasma manager won't be started and we don't need
# to create temp files for them.
plasma_manager_stdout_file, plasma_manager_stderr_file = (
new_plasma_manager_log_file(i, redirect_output and not use_raylet))
object_store_address = start_plasma_store(
node_ip_address,
redis_address,
@@ -1521,7 +1533,8 @@ def start_ray_processes(address_info=None,
cleanup=cleanup,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
use_raylet=use_raylet)
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name)
object_store_addresses.append(object_store_address)
time.sleep(0.1)
@@ -1546,9 +1559,8 @@ def start_ray_processes(address_info=None,
# redirect the worker output, then we cannot redirect the local
# scheduler output.
local_scheduler_stdout_file, local_scheduler_stderr_file = (
new_log_files(
"local_scheduler_{}".format(i),
redirect_output=redirect_worker_output))
new_local_scheduler_log_file(
i, redirect_output=redirect_worker_output))
local_scheduler_name = start_local_scheduler(
redis_address,
node_ip_address,
@@ -1571,12 +1583,13 @@ def start_ray_processes(address_info=None,
else:
# Start any raylets that do not exist yet.
for i in range(len(raylet_socket_names), num_local_schedulers):
raylet_stdout_file, raylet_stderr_file = new_log_files(
"raylet_{}".format(i), redirect_output=redirect_worker_output)
raylet_stdout_file, raylet_stderr_file = new_raylet_log_file(
i, redirect_output=redirect_worker_output)
address_info["raylet_socket_names"].append(
start_raylet(
redis_address,
node_ip_address,
raylet_socket_name or get_raylet_socket_name(),
object_store_addresses[i].name,
worker_path,
resources=resources[i],
@@ -1592,8 +1605,8 @@ def start_ray_processes(address_info=None,
object_store_address = object_store_addresses[i]
local_scheduler_name = local_scheduler_socket_names[i]
for j in range(num_local_scheduler_workers):
worker_stdout_file, worker_stderr_file = new_log_files(
"worker_{}_{}".format(i, j), redirect_output)
worker_stdout_file, worker_stderr_file = new_worker_log_file(
i, j, redirect_output)
start_worker(
node_ip_address,
object_store_address.name,
@@ -1611,8 +1624,7 @@ def start_ray_processes(address_info=None,
# Try to start the web UI.
if include_webui:
ui_stdout_file, ui_stderr_file = new_log_files(
"webui", redirect_output=True)
ui_stdout_file, ui_stderr_file = new_webui_log_file()
address_info["webui_url"] = start_ui(
redis_address,
stdout_file=ui_stdout_file,
@@ -1637,7 +1649,10 @@ def start_ray_node(node_ip_address,
resources=None,
plasma_directory=None,
huge_pages=False,
use_raylet=False):
use_raylet=False,
plasma_store_socket_name=None,
raylet_socket_name=None,
temp_dir=None):
"""Start the Ray processes for a single node.
This assumes that the Ray processes on some master node have already been
@@ -1672,6 +1687,12 @@ def start_ray_node(node_ip_address,
Store with hugetlbfs support. Requires plasma_directory.
use_raylet: True if the new raylet code path should be used. This is
not supported yet.
plasma_store_socket_name (str): If provided, it will specify the socket
name used by the plasma store.
raylet_socket_name (str): If provided, it will specify the socket path
used by the raylet process.
temp_dir (str): If provided, it will specify the root temporary
directory for the Ray process.
Returns:
A dictionary of the address information for the processes that were
@@ -1695,7 +1716,10 @@ def start_ray_node(node_ip_address,
resources=resources,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
use_raylet=use_raylet)
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir)
def start_ray_head(address_info=None,
@@ -1718,7 +1742,10 @@ def start_ray_head(address_info=None,
plasma_directory=None,
huge_pages=False,
autoscaling_config=None,
use_raylet=False):
use_raylet=False,
plasma_store_socket_name=None,
raylet_socket_name=None,
temp_dir=None):
"""Start Ray in local mode.
Args:
@@ -1770,6 +1797,12 @@ def start_ray_head(address_info=None,
autoscaling_config: path to autoscaling config file.
use_raylet: True if the new raylet code path should be used. This is
not supported yet.
plasma_store_socket_name (str): If provided, it will specify the socket
name used by the plasma store.
raylet_socket_name (str): If provided, it will specify the socket path
used by the raylet process.
temp_dir (str): If provided, it will specify the root temporary
directory for the Ray process.
Returns:
A dictionary of the address information for the processes that were
@@ -1799,58 +1832,7 @@ def start_ray_head(address_info=None,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
autoscaling_config=autoscaling_config,
use_raylet=use_raylet)
def try_to_create_directory(directory_path):
"""Attempt to create a directory that is globally readable/writable.
Args:
directory_path: The path of the directory to create.
"""
if not os.path.exists(directory_path):
try:
os.makedirs(directory_path)
except OSError as e:
if e.errno != os.errno.EEXIST:
raise e
logger.warning(
"Attempted to create '{}', but the directory already "
"exists.".format(directory_path))
# Change the log directory permissions so others can use it. This is
# important when multiple people are using the same machine.
os.chmod(directory_path, 0o0777)
def new_log_files(name, redirect_output):
"""Generate partially randomized filenames for log files.
Args:
name (str): descriptive string for this log file.
redirect_output (bool): True if files should be generated for logging
stdout and stderr and false if stdout and stderr should not be
redirected.
Returns:
If redirect_output is true, this will return a tuple of two
filehandles. The first is for redirecting stdout and the second is
for redirecting stderr. If redirect_output is false, this will
return a tuple of two None objects.
"""
if not redirect_output:
return None, None
# Create a directory to be used for process log files.
logs_dir = "/tmp/raylogs"
try_to_create_directory(logs_dir)
# Create another directory that will be used by some of the RL algorithms.
try_to_create_directory("/tmp/ray")
log_id = random.randint(0, 10000)
date_str = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
log_stdout = "{}/{}-{}-{:05d}.out".format(logs_dir, name, date_str, log_id)
log_stderr = "{}/{}-{}-{:05d}.err".format(logs_dir, name, date_str, log_id)
# Line-buffer the output (mode 1)
log_stdout_file = open(log_stdout, "a", buffering=1)
log_stderr_file = open(log_stderr, "a", buffering=1)
return log_stdout_file, log_stderr_file
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir)
+292
View File
@@ -0,0 +1,292 @@
import binascii
import collections
import datetime
import errno
import logging
import os
import shutil
import tempfile
import ray.utils
logger = logging.getLogger(__name__)
_incremental_dict = collections.defaultdict(lambda: 0)
_temp_root = None
def make_inc_temp(suffix="", prefix="", directory_name="/tmp/ray"):
"""Return a incremental temporary file name. The file is not created.
Args:
suffix (str): The suffix of the temp file.
prefix (str): The prefix of the temp file.
directory_name (str) : The base directory of the temp file.
Returns:
A string of file name. If there existing a file having the same name,
the returned name will look like
"{directory_name}/{prefix}.{unique_index}{suffix}"
"""
index = _incremental_dict[suffix, prefix, directory_name]
# `tempfile.TMP_MAX` could be extremely large,
# so using `range` in Python2.x should be avoided.
while index < tempfile.TMP_MAX:
if index == 0:
filename = os.path.join(directory_name, prefix + suffix)
else:
filename = os.path.join(directory_name,
prefix + "." + str(index) + suffix)
index += 1
if not os.path.exists(filename):
_incremental_dict[suffix, prefix,
directory_name] = index # Save the index.
return filename
raise FileExistsError(errno.EEXIST, "No usable temporary filename found")
def try_to_create_directory(directory_path):
"""Attempt to create a directory that is globally readable/writable.
Args:
directory_path: The path of the directory to create.
"""
if not os.path.exists(directory_path):
try:
os.makedirs(directory_path)
except OSError as e:
if e.errno != os.errno.EEXIST:
raise e
logger.warning(
"Attempted to create '{}', but the directory already "
"exists.".format(directory_path))
# Change the log directory permissions so others can use it. This is
# important when multiple people are using the same machine.
os.chmod(directory_path, 0o0777)
def get_temp_root():
"""Get the path of the temporary root. If not existing, it will be created.
"""
global _temp_root
date_str = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
# Lazy creation. Avoid creating directories never used.
if _temp_root is None:
_temp_root = make_inc_temp(
prefix="session_{date_str}_{pid}".format(
pid=os.getpid(), date_str=date_str),
directory_name="/tmp/ray")
try_to_create_directory(_temp_root)
return _temp_root
def set_temp_root(path):
"""Set the path of the temporary root. It will be created lazily."""
global _temp_root
_temp_root = path
def get_logs_dir_path():
"""Get a temp dir for logging."""
logs_dir = os.path.join(get_temp_root(), "logs")
try_to_create_directory(logs_dir)
return logs_dir
def get_sockets_dir_path():
"""Get a temp dir for sockets."""
sockets_dir = os.path.join(get_temp_root(), "sockets")
try_to_create_directory(sockets_dir)
return sockets_dir
def get_raylet_socket_name(suffix=""):
"""Get a socket name for raylet."""
sockets_dir = get_sockets_dir_path()
raylet_socket_name = make_inc_temp(
prefix="raylet", directory_name=sockets_dir, suffix=suffix)
return raylet_socket_name
def get_object_store_socket_name():
"""Get a socket name for plasma object store."""
sockets_dir = get_sockets_dir_path()
return make_inc_temp(prefix="plasma_store", directory_name=sockets_dir)
def get_plasma_manager_socket_name():
"""Get a socket name for plasma manager."""
sockets_dir = get_sockets_dir_path()
return make_inc_temp(prefix="plasma_manager", directory_name=sockets_dir)
def get_local_scheduler_socket_name(suffix=""):
"""Get a socket name for local scheduler.
This function could be unsafe. The socket name may
refer to a file that did not exist at some point, but by the time
you get around to creating it, someone else may have beaten you to
the punch.
"""
sockets_dir = get_sockets_dir_path()
raylet_socket_name = make_inc_temp(
prefix="scheduler", directory_name=sockets_dir, suffix=suffix)
return raylet_socket_name
def get_ipython_notebook_path(port):
"""Get a new ipython notebook path"""
notebook_filepath = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "WebUI.ipynb")
# We copy the notebook file so that the original doesn't get modified by
# the user.
notebook_name = make_inc_temp(
suffix=".ipynb", prefix="ray_ui", directory_name=get_temp_root())
new_notebook_filepath = os.path.join(get_logs_dir_path(), notebook_name)
shutil.copy(notebook_filepath, new_notebook_filepath)
new_notebook_directory = os.path.dirname(new_notebook_filepath)
token = ray.utils.decode(binascii.hexlify(os.urandom(24)))
webui_url = ("http://localhost:{}/notebooks/{}?token={}".format(
port, os.path.basename(notebook_name), token))
return new_notebook_directory, webui_url, token
def get_temp_redis_config_path():
"""Get a temp name of the redis config file."""
redis_config_name = make_inc_temp(
prefix="redis_conf", directory_name=get_temp_root())
return redis_config_name
def new_log_files(name, redirect_output):
"""Generate partially randomized filenames for log files.
Args:
name (str): descriptive string for this log file.
redirect_output (bool): True if files should be generated for logging
stdout and stderr and false if stdout and stderr should not be
redirected.
Returns:
If redirect_output is true, this will return a tuple of two
filehandles. The first is for redirecting stdout and the second is
for redirecting stderr. If redirect_output is false, this will
return a tuple of two None objects.
"""
if not redirect_output:
return None, None
# Create a directory to be used for process log files.
logs_dir = get_logs_dir_path()
# Create another directory that will be used by some of the RL algorithms.
# TODO(suquark): This is done by the old code.
# We should be able to control its path later.
try_to_create_directory("/tmp/ray")
log_stdout = make_inc_temp(
suffix=".out", prefix=name, directory_name=logs_dir)
log_stderr = make_inc_temp(
suffix=".err", prefix=name, directory_name=logs_dir)
# Line-buffer the output (mode 1)
log_stdout_file = open(log_stdout, "a", buffering=1)
log_stderr_file = open(log_stderr, "a", buffering=1)
return log_stdout_file, log_stderr_file
def new_redis_log_file(redirect_output, shard_number=None):
"""Create new logging files for redis"""
if shard_number is None:
redis_stdout_file, redis_stderr_file = new_log_files(
"redis", redirect_output)
else:
redis_stdout_file, redis_stderr_file = new_log_files(
"redis-shard_{}".format(shard_number), redirect_output)
return redis_stdout_file, redis_stderr_file
def new_raylet_log_file(local_scheduler_index, redirect_output):
"""Create new logging files for raylet."""
raylet_stdout_file, raylet_stderr_file = new_log_files(
"raylet_{}".format(local_scheduler_index),
redirect_output=redirect_output)
return raylet_stdout_file, raylet_stderr_file
def new_local_scheduler_log_file(local_scheduler_index, redirect_output):
"""Create new logging files for local scheduler.
It is only used in non-raylet versions.
"""
local_scheduler_stdout_file, local_scheduler_stderr_file = (new_log_files(
"local_scheduler_{}".format(local_scheduler_index),
redirect_output=redirect_output))
return local_scheduler_stdout_file, local_scheduler_stderr_file
def new_webui_log_file():
"""Create new logging files for web ui."""
ui_stdout_file, ui_stderr_file = new_log_files(
"webui", redirect_output=True)
return ui_stdout_file, ui_stderr_file
def new_worker_log_file(local_scheduler_index, worker_index, redirect_output):
"""Create new logging files for workers with local scheduler index.
It is only used in non-raylet versions.
"""
worker_stdout_file, worker_stderr_file = new_log_files(
"worker_{}_{}".format(local_scheduler_index, worker_index),
redirect_output)
return worker_stdout_file, worker_stderr_file
def new_worker_redirected_log_file(worker_id):
"""Create new logging files for workers to redirect its output."""
worker_stdout_file, worker_stderr_file = (new_log_files(
"worker-" + ray.utils.binary_to_hex(worker_id), True))
return worker_stdout_file, worker_stderr_file
def new_log_monitor_log_file():
"""Create new logging files for the log monitor."""
log_monitor_stdout_file, log_monitor_stderr_file = new_log_files(
"log_monitor", redirect_output=True)
return log_monitor_stdout_file, log_monitor_stderr_file
def new_global_scheduler_log_file(redirect_output):
"""Create new logging files for the new global scheduler.
It is only used in non-raylet versions.
"""
global_scheduler_stdout_file, global_scheduler_stderr_file = (
new_log_files("global_scheduler", redirect_output))
return global_scheduler_stdout_file, global_scheduler_stderr_file
def new_plasma_store_log_file(local_scheduler_index, redirect_output):
"""Create new logging files for the plasma store."""
plasma_store_stdout_file, plasma_store_stderr_file = new_log_files(
"plasma_store_{}".format(local_scheduler_index), redirect_output)
return plasma_store_stdout_file, plasma_store_stderr_file
def new_plasma_manager_log_file(local_scheduler_index, redirect_output):
"""Create new logging files for the plasma manager."""
plasma_manager_stdout_file, plasma_manager_stderr_file = new_log_files(
"plasma_manager_{}".format(local_scheduler_index), redirect_output)
return plasma_manager_stdout_file, plasma_manager_stderr_file
def new_monitor_log_file(redirect_output):
"""Create new logging files for the monitor."""
monitor_stdout_file, monitor_stderr_file = new_log_files(
"monitor", redirect_output)
return monitor_stdout_file, monitor_stderr_file
+44 -6
View File
@@ -27,6 +27,7 @@ import ray.remote_function
import ray.serialization as serialization
import ray.services as services
import ray.signature
import ray.tempfile_services as tempfile_services
import ray.local_scheduler
import ray.plasma
import ray.ray_constants as ray_constants
@@ -1528,7 +1529,10 @@ def _init(address_info=None,
plasma_directory=None,
huge_pages=False,
include_webui=True,
use_raylet=None):
use_raylet=None,
plasma_store_socket_name=None,
raylet_socket_name=None,
temp_dir=None):
"""Helper method to connect to an existing Ray cluster or start a new one.
This method handles two cases. Either a Ray cluster already exists and we
@@ -1584,6 +1588,12 @@ def _init(address_info=None,
include_webui: Boolean flag indicating whether to start the web
UI, which is a Jupyter notebook.
use_raylet: True if the new raylet code path should be used.
plasma_store_socket_name (str): If provided, it will specify the socket
name used by the plasma store.
raylet_socket_name (str): If provided, it will specify the socket path
used by the raylet process.
temp_dir (str): If provided, it will specify the root temporary
directory for the Ray process.
Returns:
Address information about the started processes.
@@ -1658,7 +1668,10 @@ def _init(address_info=None,
plasma_directory=plasma_directory,
huge_pages=huge_pages,
include_webui=include_webui,
use_raylet=use_raylet)
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir)
else:
if redis_address is None:
raise Exception("When connecting to an existing cluster, "
@@ -1690,6 +1703,15 @@ def _init(address_info=None,
if huge_pages:
raise Exception("When connecting to an existing cluster, "
"huge_pages must not be provided.")
if temp_dir is not None:
raise Exception("When connecting to an existing cluster, "
"temp_dir must not be provided.")
if plasma_store_socket_name is not None:
raise Exception("When connecting to an existing cluster, "
"plasma_store_socket_name must not be provided.")
if raylet_socket_name is not None:
raise Exception("When connecting to an existing cluster, "
"raylet_socket_name must not be provided.")
# Get the node IP address if one is not provided.
if node_ip_address is None:
node_ip_address = services.get_node_ip_address(redis_address)
@@ -1719,6 +1741,9 @@ def _init(address_info=None,
else:
driver_address_info["raylet_socket_name"] = (
address_info["raylet_socket_names"][0])
# We only pass `temp_dir` to a worker (WORKER_MODE).
# It can't be a worker here.
connect(
driver_address_info,
object_id_seed=object_id_seed,
@@ -1750,7 +1775,10 @@ def init(redis_address=None,
use_raylet=None,
configure_logging=True,
logging_level=logging.INFO,
logging_format=ray_constants.LOGGER_FORMAT):
logging_format=ray_constants.LOGGER_FORMAT,
plasma_store_socket_name=None,
raylet_socket_name=None,
temp_dir=None):
"""Connect to an existing Ray cluster or start one and connect to it.
This method handles two cases. Either a Ray cluster already exists and we
@@ -1815,6 +1843,12 @@ def init(redis_address=None,
logging_level: Logging level, default will be loging.INFO.
logging_format: Logging format, default will be "%(message)s"
which means only contains the message.
plasma_store_socket_name (str): If provided, it will specify the socket
name used by the plasma store.
raylet_socket_name (str): If provided, it will specify the socket path
used by the raylet process.
temp_dir (str): If provided, it will specify the root temporary
directory for the Ray process.
Returns:
Address information about the started processes.
@@ -1863,7 +1897,10 @@ def init(redis_address=None,
huge_pages=huge_pages,
include_webui=include_webui,
object_store_memory=object_store_memory,
use_raylet=use_raylet)
use_raylet=use_raylet,
plasma_store_socket_name=plasma_store_socket_name,
raylet_socket_name=raylet_socket_name,
temp_dir=temp_dir)
for hook in _post_init_hooks:
hook()
return ret
@@ -2135,8 +2172,9 @@ def connect(info,
else:
redirect_worker_output = 0
if redirect_worker_output:
log_stdout_file, log_stderr_file = services.new_log_files(
"worker", True)
log_stdout_file, log_stderr_file = (
tempfile_services.new_worker_redirected_log_file(
worker.worker_id))
sys.stdout = log_stdout_file
sys.stderr = log_stderr_file
services.record_log_files_in_redis(
+10
View File
@@ -9,6 +9,7 @@ import traceback
import ray
import ray.actor
import ray.ray_constants as ray_constants
import ray.tempfile_services as tempfile_services
parser = argparse.ArgumentParser(
description=("Parse addresses for the worker "
@@ -53,6 +54,12 @@ parser.add_argument(
type=str,
default=ray_constants.LOGGER_FORMAT,
help=ray_constants.LOGGER_FORMAT_HELP)
parser.add_argument(
"--temp-dir",
required=False,
type=str,
default=None,
help="Specify the path of the temporary directory use by Ray process.")
if __name__ == "__main__":
args = parser.parse_args()
@@ -70,6 +77,9 @@ if __name__ == "__main__":
level=logging.getLevelName(args.logging_level.upper()),
format=args.logging_format)
# Override the temporary directory.
tempfile_services.set_temp_root(args.temp_dir)
ray.worker.connect(
info, mode=ray.WORKER_MODE, use_raylet=(args.raylet_name is not None))
+5 -4
View File
@@ -8,6 +8,7 @@ import pytest
import time
import ray
import ray.tempfile_services
import ray.ray_constants as ray_constants
@@ -173,10 +174,10 @@ def ray_start_reconstruction(request):
plasma_addresses = []
objstore_memory = plasma_store_memory // num_local_schedulers
for i in range(num_local_schedulers):
store_stdout_file, store_stderr_file = ray.services.new_log_files(
"plasma_store_{}".format(i), True)
manager_stdout_file, manager_stderr_file = (ray.services.new_log_files(
"plasma_manager_{}".format(i), True))
store_stdout_file, store_stderr_file = (
ray.tempfile_services.new_plasma_store_log_file(i, True))
manager_stdout_file, manager_stderr_file = (
ray.tempfile_services.new_plasma_manager_log_file(i, True))
plasma_addresses.append(
ray.services.start_plasma_store(
node_ip_address,
+119
View File
@@ -0,0 +1,119 @@
import os
import shutil
import time
import pytest
import ray
import ray.tempfile_services as tempfile_services
def test_conn_cluster():
# plasma_store_socket_name
with pytest.raises(Exception) as exc_info:
ray.init(
use_raylet=True,
redis_address="127.0.0.1:6379",
plasma_store_socket_name="/tmp/this_should_fail")
assert exc_info.value.args[0] == (
"When connecting to an existing cluster, "
"plasma_store_socket_name must not be provided.")
# raylet_socket_name
with pytest.raises(Exception) as exc_info:
ray.init(
use_raylet=True,
redis_address="127.0.0.1:6379",
raylet_socket_name="/tmp/this_should_fail")
assert exc_info.value.args[0] == (
"When connecting to an existing cluster, "
"raylet_socket_name must not be provided.")
# temp_dir
with pytest.raises(Exception) as exc_info:
ray.init(
use_raylet=True,
redis_address="127.0.0.1:6379",
temp_dir="/tmp/this_should_fail")
assert exc_info.value.args[0] == (
"When connecting to an existing cluster, "
"temp_dir must not be provided.")
def test_tempdir():
ray.init(use_raylet=True, temp_dir="/tmp/i_am_a_temp_dir")
assert os.path.exists(
"/tmp/i_am_a_temp_dir"), "Specified temp dir not found."
ray.shutdown()
shutil.rmtree("/tmp/i_am_a_temp_dir", ignore_errors=True)
def test_raylet_socket_name():
ray.init(use_raylet=True, raylet_socket_name="/tmp/i_am_a_temp_socket")
assert os.path.exists(
"/tmp/i_am_a_temp_socket"), "Specified socket path not found."
ray.shutdown()
try:
os.remove("/tmp/i_am_a_temp_socket")
except Exception:
pass
def test_temp_plasma_store_socket():
ray.init(
use_raylet=True, plasma_store_socket_name="/tmp/i_am_a_temp_socket")
assert os.path.exists(
"/tmp/i_am_a_temp_socket"), "Specified socket path not found."
ray.shutdown()
try:
os.remove("/tmp/i_am_a_temp_socket")
except Exception:
pass
def test_raylet_tempfiles():
ray.init(use_raylet=True, redirect_worker_output=False)
top_levels = set(os.listdir(tempfile_services.get_temp_root()))
assert top_levels == {"ray_ui.ipynb", "sockets", "logs"}
log_files = set(os.listdir(tempfile_services.get_logs_dir_path()))
assert log_files == {
"log_monitor.out", "log_monitor.err", "plasma_store_0.out",
"plasma_store_0.err", "webui.out", "webui.err", "monitor.out",
"monitor.err", "redis-shard_0.out", "redis-shard_0.err", "redis.out",
"redis.err"
} # without raylet logs
socket_files = set(os.listdir(tempfile_services.get_sockets_dir_path()))
assert socket_files == {"plasma_store", "raylet"}
ray.shutdown()
ray.init(use_raylet=True, redirect_worker_output=True, num_workers=0)
top_levels = set(os.listdir(tempfile_services.get_temp_root()))
assert top_levels == {"ray_ui.ipynb", "sockets", "logs"}
log_files = set(os.listdir(tempfile_services.get_logs_dir_path()))
assert log_files == {
"log_monitor.out", "log_monitor.err", "plasma_store_0.out",
"plasma_store_0.err", "webui.out", "webui.err", "monitor.out",
"monitor.err", "redis-shard_0.out", "redis-shard_0.err", "redis.out",
"redis.err", "raylet_0.out", "raylet_0.err"
} # with raylet logs
socket_files = set(os.listdir(tempfile_services.get_sockets_dir_path()))
assert socket_files == {"plasma_store", "raylet"}
ray.shutdown()
ray.init(use_raylet=True, redirect_worker_output=True, num_workers=2)
top_levels = set(os.listdir(tempfile_services.get_temp_root()))
assert top_levels == {"ray_ui.ipynb", "sockets", "logs"}
time.sleep(3) # wait workers to start
log_files = set(os.listdir(tempfile_services.get_logs_dir_path()))
assert log_files.issuperset({
"log_monitor.out", "log_monitor.err", "plasma_store_0.out",
"plasma_store_0.err", "webui.out", "webui.err", "monitor.out",
"monitor.err", "redis-shard_0.out", "redis-shard_0.err", "redis.out",
"redis.err", "raylet_0.out", "raylet_0.err"
}) # with raylet logs
# Check numbers of worker log file.
assert sum(
1 for filename in log_files if filename.startswith("worker")) == 4
socket_files = set(os.listdir(tempfile_services.get_sockets_dir_path()))
assert socket_files == {"plasma_store", "raylet"}
ray.shutdown()