Change logfile names and also allow plasma store socket to be passed in. (#2862)

2026-07-02 14:32:01 +08:00 · 2018-10-03 10:03:53 -07:00
parent 9c606ea06c
commit cc7e2ecdd5
13 changed files with 696 additions and 140 deletions
@@ -2,14 +2,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import binascii
 import json
 import logging
 import multiprocessing
 import os
 import random
 import resource
-import shutil
 import signal
 import socket
 import subprocess
@@ -17,8 +15,6 @@ import sys
 import threading
 import time
 from collections import OrderedDict, namedtuple
-from datetime import datetime
-
 import redis

 import pyarrow
@@ -28,6 +24,14 @@ import ray.global_scheduler as global_scheduler
 import ray.local_scheduler
 import ray.plasma

+from ray.tempfile_services import (
+    get_ipython_notebook_path, get_logs_dir_path, get_raylet_socket_name,
+    get_temp_redis_config_path, get_temp_root, new_global_scheduler_log_file,
+    new_local_scheduler_log_file, new_log_monitor_log_file,
+    new_monitor_log_file, new_plasma_manager_log_file,
+    new_plasma_store_log_file, new_raylet_log_file, new_redis_log_file,
+    new_webui_log_file, new_worker_log_file, set_temp_root)
+
 PROCESS_TYPE_MONITOR = "monitor"
 PROCESS_TYPE_LOG_MONITOR = "log_monitor"
 PROCESS_TYPE_WORKER = "worker"
@@ -120,10 +124,6 @@ def new_port():
    return random.randint(10000, 65535)


-def random_name():
-    return str(random.randint(0, 99999999))
-
-
 def kill_process(p):
    """Kill a process.

@@ -456,8 +456,7 @@ def start_redis(node_ip_address,
        A tuple of the address for the primary Redis shard and a list of
            addresses for the remaining shards.
    """
-    redis_stdout_file, redis_stderr_file = new_log_files(
-        "redis", redirect_output)
+    redis_stdout_file, redis_stderr_file = new_redis_log_file(redirect_output)

    if redis_shard_ports is None:
        redis_shard_ports = num_redis_shards * [None]
@@ -517,8 +516,8 @@ def start_redis(node_ip_address,
    # prefixed by "redis-<shard number>".
    redis_shards = []
    for i in range(num_redis_shards):
-        redis_stdout_file, redis_stderr_file = new_log_files(
-            "redis-{}".format(i), redirect_output)
+        redis_stdout_file, redis_stderr_file = new_redis_log_file(
+            redirect_output, shard_number=i)
        if not use_credis:
            redis_shard_port, _ = _start_redis_instance(
                node_ip_address=node_ip_address,
@@ -572,7 +571,7 @@ def _make_temp_redis_config(node_ip_address):
        node_ip_address: The IP address of this node. This should not be
            127.0.0.1.
    """
-    redis_config_name = "/tmp/redis_conf{}".format(random_name())
+    redis_config_name = get_temp_redis_config_path()
    with open(redis_config_name, 'w') as f:
        # This allows redis clients on the same machine to connect using the
        # node's IP address as opposed to just 127.0.0.1. This is only relevant
@@ -799,15 +798,7 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
            then this process will be killed by services.cleanup() when the
            Python process that imported services exits.
    """
-    new_env = os.environ.copy()
-    notebook_filepath = os.path.join(
-        os.path.dirname(os.path.abspath(__file__)), "WebUI.ipynb")
-    # We copy the notebook file so that the original doesn't get modified by
-    # the user.
-    random_ui_id = random.randint(0, 100000)
-    new_notebook_filepath = "/tmp/raylogs/ray_ui{}.ipynb".format(random_ui_id)
-    new_notebook_directory = os.path.dirname(new_notebook_filepath)
-    shutil.copy(notebook_filepath, new_notebook_filepath)
+
    port = 8888
    while True:
        try:
@@ -821,7 +812,8 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
    new_env["REDIS_ADDRESS"] = redis_address
    # We generate the token used for authentication ourselves to avoid
    # querying the jupyter server.
-    token = ray.utils.decode(binascii.hexlify(os.urandom(24)))
+    new_notebook_directory, webui_url, token = (
+        get_ipython_notebook_path(port))
    # The --ip=0.0.0.0 flag is intended to enable connecting to a notebook
    # running within a docker container (from the outside).
    command = [
@@ -847,8 +839,6 @@ def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
    else:
        if cleanup:
            all_processes[PROCESS_TYPE_WEB_UI].append(ui_process)
-        webui_url = ("http://localhost:{}/notebooks/ray_ui{}.ipynb?token={}"
-                     .format(port, random_ui_id, token))
        logger.info("\n" + "=" * 70)
        logger.info("View the web UI at {}".format(webui_url))
        logger.info("=" * 70 + "\n")
@@ -971,6 +961,7 @@ def start_local_scheduler(redis_address,

 def start_raylet(redis_address,
                 node_ip_address,
+                 raylet_name,
                 plasma_store_name,
                 worker_path,
                 resources=None,
@@ -988,6 +979,7 @@ def start_raylet(redis_address,
            scheduler is running on.
        plasma_store_name (str): The name of the plasma store socket to connect
            to.
+        raylet_name (str): The name of the raylet socket to create.
        worker_path (str): The path of the script to use when the local
            scheduler starts up new workers.
        use_valgrind (bool): True if the raylet should be started inside
@@ -1023,16 +1015,17 @@ def start_raylet(redis_address,
    ])

    gcs_ip_address, gcs_port = redis_address.split(":")
-    raylet_name = "/tmp/raylet{}".format(random_name())

    # Create the command that the Raylet will use to start workers.
    start_worker_command = ("{} {} "
                            "--node-ip-address={} "
                            "--object-store-name={} "
                            "--raylet-name={} "
-                            "--redis-address={}".format(
+                            "--redis-address={} "
+                            "--temp-dir={}".format(
                                sys.executable, worker_path, node_ip_address,
-                                plasma_store_name, raylet_name, redis_address))
+                                plasma_store_name, raylet_name, redis_address,
+                                get_temp_root()))

    command = [
        RAYLET_EXECUTABLE,
@@ -1084,7 +1077,8 @@ def start_plasma_store(node_ip_address,
                       cleanup=True,
                       plasma_directory=None,
                       huge_pages=False,
-                       use_raylet=False):
+                       use_raylet=False,
+                       plasma_store_socket_name=None):
    """This method starts an object store process.

    Args:
@@ -1158,7 +1152,8 @@ def start_plasma_store(node_ip_address,
        stdout_file=store_stdout_file,
        stderr_file=store_stderr_file,
        plasma_directory=plasma_directory,
-        huge_pages=huge_pages)
+        huge_pages=huge_pages,
+        socket_name=plasma_store_socket_name)
    # Start the plasma manager.
    if not use_raylet:
        if object_manager_port is not None:
@@ -1235,7 +1230,8 @@ def start_worker(node_ip_address,
        "--object-store-name=" + object_store_name,
        "--object-store-manager-name=" + object_store_manager_name,
        "--local-scheduler-name=" + local_scheduler_name,
-        "--redis-address=" + str(redis_address)
+        "--redis-address=" + str(redis_address),
+        "--temp-dir=" + get_temp_root()
    ]
    p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
    if cleanup:
@@ -1327,7 +1323,10 @@ def start_ray_processes(address_info=None,
                        plasma_directory=None,
                        huge_pages=False,
                        autoscaling_config=None,
-                        use_raylet=False):
+                        use_raylet=False,
+                        plasma_store_socket_name=None,
+                        raylet_socket_name=None,
+                        temp_dir=None):
    """Helper method to start Ray processes.

    Args:
@@ -1385,13 +1384,22 @@ def start_ray_processes(address_info=None,
        autoscaling_config: path to autoscaling config file.
        use_raylet: True if the new raylet code path should be used. This is
            not supported yet.
+        plasma_store_socket_name (str): If provided, it will specify the socket
+            name used by the plasma store.
+        raylet_socket_name (str): If provided, it will specify the socket path
+            used by the raylet process.
+        temp_dir (str): If provided, it will specify the root temporary
+            directory for the Ray process.

    Returns:
        A dictionary of the address information for the processes that were
            started.
    """
-    logger.info(
-        "Process STDOUT and STDERR is being redirected to /tmp/raylogs/.")
+
+    set_temp_root(temp_dir)
+
+    logger.info("Process STDOUT and STDERR is being redirected to {}.".format(
+        get_logs_dir_path()))

    if resources is None:
        resources = {}
@@ -1438,8 +1446,8 @@ def start_ray_processes(address_info=None,
        time.sleep(0.1)

        # Start monitoring the processes.
-        monitor_stdout_file, monitor_stderr_file = new_log_files(
-            "monitor", redirect_output)
+        monitor_stdout_file, monitor_stderr_file = new_monitor_log_file(
+            redirect_output)
        start_monitor(
            redis_address,
            node_ip_address,
@@ -1464,8 +1472,8 @@ def start_ray_processes(address_info=None,

    # Start the log monitor, if necessary.
    if include_log_monitor:
-        log_monitor_stdout_file, log_monitor_stderr_file = new_log_files(
-            "log_monitor", redirect_output=True)
+        log_monitor_stdout_file, log_monitor_stderr_file = (
+            new_log_monitor_log_file())
        start_log_monitor(
            redis_address,
            node_ip_address,
@@ -1476,7 +1484,7 @@ def start_ray_processes(address_info=None,
    # Start the global scheduler, if necessary.
    if include_global_scheduler and not use_raylet:
        global_scheduler_stdout_file, global_scheduler_stderr_file = (
-            new_log_files("global_scheduler", redirect_output))
+            new_global_scheduler_log_file(redirect_output))
        start_global_scheduler(
            redis_address,
            node_ip_address,
@@ -1505,10 +1513,14 @@ def start_ray_processes(address_info=None,
    # Start any object stores that do not yet exist.
    for i in range(num_local_schedulers - len(object_store_addresses)):
        # Start Plasma.
-        plasma_store_stdout_file, plasma_store_stderr_file = new_log_files(
-            "plasma_store_{}".format(i), redirect_output)
-        plasma_manager_stdout_file, plasma_manager_stderr_file = new_log_files(
-            "plasma_manager_{}".format(i), redirect_output)
+        plasma_store_stdout_file, plasma_store_stderr_file = (
+            new_plasma_store_log_file(i, redirect_output))
+
+        # If we use raylet, plasma manager won't be started and we don't need
+        # to create temp files for them.
+        plasma_manager_stdout_file, plasma_manager_stderr_file = (
+            new_plasma_manager_log_file(i, redirect_output and not use_raylet))
+
        object_store_address = start_plasma_store(
            node_ip_address,
            redis_address,
@@ -1521,7 +1533,8 @@ def start_ray_processes(address_info=None,
            cleanup=cleanup,
            plasma_directory=plasma_directory,
            huge_pages=huge_pages,
-            use_raylet=use_raylet)
+            use_raylet=use_raylet,
+            plasma_store_socket_name=plasma_store_socket_name)
        object_store_addresses.append(object_store_address)
        time.sleep(0.1)

@@ -1546,9 +1559,8 @@ def start_ray_processes(address_info=None,
            # redirect the worker output, then we cannot redirect the local
            # scheduler output.
            local_scheduler_stdout_file, local_scheduler_stderr_file = (
-                new_log_files(
-                    "local_scheduler_{}".format(i),
-                    redirect_output=redirect_worker_output))
+                new_local_scheduler_log_file(
+                    i, redirect_output=redirect_worker_output))
            local_scheduler_name = start_local_scheduler(
                redis_address,
                node_ip_address,
@@ -1571,12 +1583,13 @@ def start_ray_processes(address_info=None,
    else:
        # Start any raylets that do not exist yet.
        for i in range(len(raylet_socket_names), num_local_schedulers):
-            raylet_stdout_file, raylet_stderr_file = new_log_files(
-                "raylet_{}".format(i), redirect_output=redirect_worker_output)
+            raylet_stdout_file, raylet_stderr_file = new_raylet_log_file(
+                i, redirect_output=redirect_worker_output)
            address_info["raylet_socket_names"].append(
                start_raylet(
                    redis_address,
                    node_ip_address,
+                    raylet_socket_name or get_raylet_socket_name(),
                    object_store_addresses[i].name,
                    worker_path,
                    resources=resources[i],
@@ -1592,8 +1605,8 @@ def start_ray_processes(address_info=None,
            object_store_address = object_store_addresses[i]
            local_scheduler_name = local_scheduler_socket_names[i]
            for j in range(num_local_scheduler_workers):
-                worker_stdout_file, worker_stderr_file = new_log_files(
-                    "worker_{}_{}".format(i, j), redirect_output)
+                worker_stdout_file, worker_stderr_file = new_worker_log_file(
+                    i, j, redirect_output)
                start_worker(
                    node_ip_address,
                    object_store_address.name,
@@ -1611,8 +1624,7 @@ def start_ray_processes(address_info=None,

    # Try to start the web UI.
    if include_webui:
-        ui_stdout_file, ui_stderr_file = new_log_files(
-            "webui", redirect_output=True)
+        ui_stdout_file, ui_stderr_file = new_webui_log_file()
        address_info["webui_url"] = start_ui(
            redis_address,
            stdout_file=ui_stdout_file,
@@ -1637,7 +1649,10 @@ def start_ray_node(node_ip_address,
                   resources=None,
                   plasma_directory=None,
                   huge_pages=False,
-                   use_raylet=False):
+                   use_raylet=False,
+                   plasma_store_socket_name=None,
+                   raylet_socket_name=None,
+                   temp_dir=None):
    """Start the Ray processes for a single node.

    This assumes that the Ray processes on some master node have already been
@@ -1672,6 +1687,12 @@ def start_ray_node(node_ip_address,
            Store with hugetlbfs support. Requires plasma_directory.
        use_raylet: True if the new raylet code path should be used. This is
            not supported yet.
+        plasma_store_socket_name (str): If provided, it will specify the socket
+            name used by the plasma store.
+        raylet_socket_name (str): If provided, it will specify the socket path
+            used by the raylet process.
+        temp_dir (str): If provided, it will specify the root temporary
+            directory for the Ray process.

    Returns:
        A dictionary of the address information for the processes that were
@@ -1695,7 +1716,10 @@ def start_ray_node(node_ip_address,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=huge_pages,
-        use_raylet=use_raylet)
+        use_raylet=use_raylet,
+        plasma_store_socket_name=plasma_store_socket_name,
+        raylet_socket_name=raylet_socket_name,
+        temp_dir=temp_dir)


 def start_ray_head(address_info=None,
@@ -1718,7 +1742,10 @@ def start_ray_head(address_info=None,
                   plasma_directory=None,
                   huge_pages=False,
                   autoscaling_config=None,
-                   use_raylet=False):
+                   use_raylet=False,
+                   plasma_store_socket_name=None,
+                   raylet_socket_name=None,
+                   temp_dir=None):
    """Start Ray in local mode.

    Args:
@@ -1770,6 +1797,12 @@ def start_ray_head(address_info=None,
        autoscaling_config: path to autoscaling config file.
        use_raylet: True if the new raylet code path should be used. This is
            not supported yet.
+        plasma_store_socket_name (str): If provided, it will specify the socket
+            name used by the plasma store.
+        raylet_socket_name (str): If provided, it will specify the socket path
+            used by the raylet process.
+        temp_dir (str): If provided, it will specify the root temporary
+            directory for the Ray process.

    Returns:
        A dictionary of the address information for the processes that were
@@ -1799,58 +1832,7 @@ def start_ray_head(address_info=None,
        plasma_directory=plasma_directory,
        huge_pages=huge_pages,
        autoscaling_config=autoscaling_config,
-        use_raylet=use_raylet)
-
-
-def try_to_create_directory(directory_path):
-    """Attempt to create a directory that is globally readable/writable.
-
-    Args:
-        directory_path: The path of the directory to create.
-    """
-    if not os.path.exists(directory_path):
-        try:
-            os.makedirs(directory_path)
-        except OSError as e:
-            if e.errno != os.errno.EEXIST:
-                raise e
-            logger.warning(
-                "Attempted to create '{}', but the directory already "
-                "exists.".format(directory_path))
-        # Change the log directory permissions so others can use it. This is
-        # important when multiple people are using the same machine.
-        os.chmod(directory_path, 0o0777)
-
-
-def new_log_files(name, redirect_output):
-    """Generate partially randomized filenames for log files.
-
-    Args:
-        name (str): descriptive string for this log file.
-        redirect_output (bool): True if files should be generated for logging
-            stdout and stderr and false if stdout and stderr should not be
-            redirected.
-
-    Returns:
-        If redirect_output is true, this will return a tuple of two
-            filehandles. The first is for redirecting stdout and the second is
-            for redirecting stderr. If redirect_output is false, this will
-            return a tuple of two None objects.
-    """
-    if not redirect_output:
-        return None, None
-
-    # Create a directory to be used for process log files.
-    logs_dir = "/tmp/raylogs"
-    try_to_create_directory(logs_dir)
-    # Create another directory that will be used by some of the RL algorithms.
-    try_to_create_directory("/tmp/ray")
-
-    log_id = random.randint(0, 10000)
-    date_str = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
-    log_stdout = "{}/{}-{}-{:05d}.out".format(logs_dir, name, date_str, log_id)
-    log_stderr = "{}/{}-{}-{:05d}.err".format(logs_dir, name, date_str, log_id)
-    # Line-buffer the output (mode 1)
-    log_stdout_file = open(log_stdout, "a", buffering=1)
-    log_stderr_file = open(log_stderr, "a", buffering=1)
-    return log_stdout_file, log_stderr_file
+        use_raylet=use_raylet,
+        plasma_store_socket_name=plasma_store_socket_name,
+        raylet_socket_name=raylet_socket_name,
+        temp_dir=temp_dir)