mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 21:23:10 +08:00
a41bbc10ef
* Implement Redis authentication * Throw exception for legacy Ray * Add test * Formatting * Fix bugs in CLI * Fix bugs in Raylet * Move default password to constants.h * Use pytest.fixture * Fix bug * Authenticate using formatted strings * Add missing passwords * Add test * Improve authentication of async contexts * Disable Redis authentication for credis * Update test for credis * Fix rebase artifacts * Fix formatting * Add workaround for issue #3045 * Increase timeout for test * Improve C++ readability * Fixes for CLI * Add security docs * Address comments * Address comments * Adress comments * Use ray.get * Fix lint
1926 lines
81 KiB
Python
1926 lines
81 KiB
Python
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import json
|
|
import logging
|
|
import multiprocessing
|
|
import os
|
|
import random
|
|
import resource
|
|
import signal
|
|
import socket
|
|
import subprocess
|
|
import sys
|
|
import threading
|
|
import time
|
|
from collections import OrderedDict, namedtuple
|
|
import redis
|
|
|
|
import pyarrow
|
|
# Ray modules
|
|
import ray.ray_constants
|
|
import ray.global_scheduler as global_scheduler
|
|
import ray.local_scheduler
|
|
import ray.plasma
|
|
|
|
from ray.tempfile_services import (
|
|
get_ipython_notebook_path, get_logs_dir_path, get_raylet_socket_name,
|
|
get_temp_redis_config_path, get_temp_root, new_global_scheduler_log_file,
|
|
new_local_scheduler_log_file, new_log_monitor_log_file,
|
|
new_monitor_log_file, new_plasma_manager_log_file,
|
|
new_plasma_store_log_file, new_raylet_log_file, new_redis_log_file,
|
|
new_webui_log_file, new_worker_log_file, set_temp_root)
|
|
|
|
PROCESS_TYPE_MONITOR = "monitor"
|
|
PROCESS_TYPE_LOG_MONITOR = "log_monitor"
|
|
PROCESS_TYPE_WORKER = "worker"
|
|
PROCESS_TYPE_RAYLET = "raylet"
|
|
PROCESS_TYPE_LOCAL_SCHEDULER = "local_scheduler"
|
|
PROCESS_TYPE_PLASMA_MANAGER = "plasma_manager"
|
|
PROCESS_TYPE_PLASMA_STORE = "plasma_store"
|
|
PROCESS_TYPE_GLOBAL_SCHEDULER = "global_scheduler"
|
|
PROCESS_TYPE_REDIS_SERVER = "redis_server"
|
|
PROCESS_TYPE_WEB_UI = "web_ui"
|
|
|
|
# This is a dictionary tracking all of the processes of different types that
|
|
# have been started by this services module. Note that the order of the keys is
|
|
# important because it determines the order in which these processes will be
|
|
# terminated when Ray exits, and certain orders will cause errors to be logged
|
|
# to the screen.
|
|
all_processes = OrderedDict(
|
|
[(PROCESS_TYPE_MONITOR, []), (PROCESS_TYPE_LOG_MONITOR, []),
|
|
(PROCESS_TYPE_WORKER, []), (PROCESS_TYPE_RAYLET, []),
|
|
(PROCESS_TYPE_LOCAL_SCHEDULER, []), (PROCESS_TYPE_PLASMA_MANAGER, []),
|
|
(PROCESS_TYPE_PLASMA_STORE, []), (PROCESS_TYPE_GLOBAL_SCHEDULER, []),
|
|
(PROCESS_TYPE_REDIS_SERVER, []), (PROCESS_TYPE_WEB_UI, [])], )
|
|
|
|
# True if processes are run in the valgrind profiler.
|
|
RUN_RAYLET_PROFILER = False
|
|
RUN_LOCAL_SCHEDULER_PROFILER = False
|
|
RUN_PLASMA_MANAGER_PROFILER = False
|
|
RUN_PLASMA_STORE_PROFILER = False
|
|
|
|
# Location of the redis server and module.
|
|
REDIS_EXECUTABLE = os.path.join(
|
|
os.path.abspath(os.path.dirname(__file__)),
|
|
"core/src/common/thirdparty/redis/src/redis-server")
|
|
REDIS_MODULE = os.path.join(
|
|
os.path.abspath(os.path.dirname(__file__)),
|
|
"core/src/common/redis_module/libray_redis_module.so")
|
|
|
|
# Location of the credis server and modules.
|
|
# credis will be enabled if the environment variable RAY_USE_NEW_GCS is set.
|
|
CREDIS_EXECUTABLE = os.path.join(
|
|
os.path.abspath(os.path.dirname(__file__)),
|
|
"core/src/credis/redis/src/redis-server")
|
|
CREDIS_MASTER_MODULE = os.path.join(
|
|
os.path.abspath(os.path.dirname(__file__)),
|
|
"core/src/credis/build/src/libmaster.so")
|
|
CREDIS_MEMBER_MODULE = os.path.join(
|
|
os.path.abspath(os.path.dirname(__file__)),
|
|
"core/src/credis/build/src/libmember.so")
|
|
|
|
# Location of the raylet executables.
|
|
RAYLET_MONITOR_EXECUTABLE = os.path.join(
|
|
os.path.abspath(os.path.dirname(__file__)),
|
|
"core/src/ray/raylet/raylet_monitor")
|
|
RAYLET_EXECUTABLE = os.path.join(
|
|
os.path.abspath(os.path.dirname(__file__)), "core/src/ray/raylet/raylet")
|
|
|
|
# ObjectStoreAddress tuples contain all information necessary to connect to an
|
|
# object store. The fields are:
|
|
# - name: The socket name for the object store
|
|
# - manager_name: The socket name for the object store manager
|
|
# - manager_port: The Internet port that the object store manager listens on
|
|
ObjectStoreAddress = namedtuple("ObjectStoreAddress",
|
|
["name", "manager_name", "manager_port"])
|
|
|
|
# Logger for this module. It should be configured at the entry point
|
|
# into the program using Ray. Ray configures it by default automatically
|
|
# using logging.basicConfig in its entry/init points.
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def address(ip_address, port):
|
|
return ip_address + ":" + str(port)
|
|
|
|
|
|
def get_ip_address(address):
|
|
assert type(address) == str, "Address must be a string"
|
|
ip_address = address.split(":")[0]
|
|
return ip_address
|
|
|
|
|
|
def get_port(address):
|
|
try:
|
|
port = int(address.split(":")[1])
|
|
except Exception:
|
|
raise Exception("Unable to parse port from address {}".format(address))
|
|
return port
|
|
|
|
|
|
def new_port():
|
|
return random.randint(10000, 65535)
|
|
|
|
|
|
def kill_process(p):
|
|
"""Kill a process.
|
|
|
|
Args:
|
|
p: The process to kill.
|
|
|
|
Returns:
|
|
True if the process was killed successfully and false otherwise.
|
|
"""
|
|
if p.poll() is not None:
|
|
# The process has already terminated.
|
|
return True
|
|
if any([
|
|
RUN_RAYLET_PROFILER, RUN_LOCAL_SCHEDULER_PROFILER,
|
|
RUN_PLASMA_MANAGER_PROFILER, RUN_PLASMA_STORE_PROFILER
|
|
]):
|
|
# Give process signal to write profiler data.
|
|
os.kill(p.pid, signal.SIGINT)
|
|
# Wait for profiling data to be written.
|
|
time.sleep(0.1)
|
|
|
|
# Allow the process one second to exit gracefully.
|
|
p.terminate()
|
|
timer = threading.Timer(1, lambda p: p.kill(), [p])
|
|
try:
|
|
timer.start()
|
|
p.wait()
|
|
finally:
|
|
timer.cancel()
|
|
|
|
if p.poll() is not None:
|
|
return True
|
|
|
|
# If the process did not exit within one second, force kill it.
|
|
p.kill()
|
|
if p.poll() is not None:
|
|
return True
|
|
|
|
# The process was not killed for some reason.
|
|
return False
|
|
|
|
|
|
def cleanup():
|
|
"""When running in local mode, shutdown the Ray processes.
|
|
|
|
This method is used to shutdown processes that were started with
|
|
services.start_ray_head(). It kills all scheduler, object store, and worker
|
|
processes that were started by this services module. Driver processes are
|
|
started and disconnected by worker.py.
|
|
"""
|
|
successfully_shut_down = True
|
|
# Terminate the processes in reverse order.
|
|
for process_type in all_processes.keys():
|
|
# Kill all of the processes of a certain type.
|
|
for p in all_processes[process_type]:
|
|
success = kill_process(p)
|
|
successfully_shut_down = successfully_shut_down and success
|
|
# Reset the list of processes of this type.
|
|
all_processes[process_type] = []
|
|
if not successfully_shut_down:
|
|
logger.warning("Ray did not shut down properly.")
|
|
|
|
|
|
def all_processes_alive(exclude=None):
|
|
"""Check if all of the processes are still alive.
|
|
|
|
Args:
|
|
exclude: Don't check the processes whose types are in this list.
|
|
"""
|
|
|
|
if exclude is None:
|
|
exclude = []
|
|
for process_type, processes in all_processes.items():
|
|
# Note that p.poll() returns the exit code that the process exited
|
|
# with, so an exit code of None indicates that the process is still
|
|
# alive.
|
|
processes_alive = [p.poll() is None for p in processes]
|
|
if not all(processes_alive) and process_type not in exclude:
|
|
logger.warning(
|
|
"A process of type {} has died.".format(process_type))
|
|
return False
|
|
return True
|
|
|
|
|
|
def address_to_ip(address):
|
|
"""Convert a hostname to a numerical IP addresses in an address.
|
|
|
|
This should be a no-op if address already contains an actual numerical IP
|
|
address.
|
|
|
|
Args:
|
|
address: This can be either a string containing a hostname (or an IP
|
|
address) and a port or it can be just an IP address.
|
|
|
|
Returns:
|
|
The same address but with the hostname replaced by a numerical IP
|
|
address.
|
|
"""
|
|
address_parts = address.split(":")
|
|
ip_address = socket.gethostbyname(address_parts[0])
|
|
# Make sure localhost isn't resolved to the loopback ip
|
|
if ip_address == "127.0.0.1":
|
|
ip_address = get_node_ip_address()
|
|
return ":".join([ip_address] + address_parts[1:])
|
|
|
|
|
|
def get_node_ip_address(address="8.8.8.8:53"):
|
|
"""Determine the IP address of the local node.
|
|
|
|
Args:
|
|
address (str): The IP address and port of any known live service on the
|
|
network you care about.
|
|
|
|
Returns:
|
|
The IP address of the current node.
|
|
"""
|
|
ip_address, port = address.split(":")
|
|
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
|
try:
|
|
# This command will raise an exception if there is no internet
|
|
# connection.
|
|
s.connect((ip_address, int(port)))
|
|
node_ip_address = s.getsockname()[0]
|
|
except Exception as e:
|
|
node_ip_address = "127.0.0.1"
|
|
# [Errno 101] Network is unreachable
|
|
if e.errno == 101:
|
|
try:
|
|
# try get node ip address from host name
|
|
host_name = socket.getfqdn(socket.gethostname())
|
|
node_ip_address = socket.gethostbyname(host_name)
|
|
except Exception:
|
|
pass
|
|
|
|
return node_ip_address
|
|
|
|
|
|
def record_log_files_in_redis(redis_address,
|
|
node_ip_address,
|
|
log_files,
|
|
password=None):
|
|
"""Record in Redis that a new log file has been created.
|
|
|
|
This is used so that each log monitor can check Redis and figure out which
|
|
log files it is reponsible for monitoring.
|
|
|
|
Args:
|
|
redis_address: The address of the redis server.
|
|
node_ip_address: The IP address of the node that the log file exists
|
|
on.
|
|
log_files: A list of file handles for the log files. If one of the file
|
|
handles is None, we ignore it.
|
|
password (str): The password of the redis server.
|
|
"""
|
|
for log_file in log_files:
|
|
if log_file is not None:
|
|
redis_ip_address, redis_port = redis_address.split(":")
|
|
redis_client = redis.StrictRedis(
|
|
host=redis_ip_address, port=redis_port, password=password)
|
|
# The name of the key storing the list of log filenames for this IP
|
|
# address.
|
|
log_file_list_key = "LOG_FILENAMES:{}".format(node_ip_address)
|
|
redis_client.rpush(log_file_list_key, log_file.name)
|
|
|
|
|
|
def create_redis_client(redis_address, password=None):
|
|
"""Create a Redis client.
|
|
|
|
Args:
|
|
The IP address, port, and password of the Redis server.
|
|
|
|
Returns:
|
|
A Redis client.
|
|
"""
|
|
redis_ip_address, redis_port = redis_address.split(":")
|
|
# For this command to work, some other client (on the same machine
|
|
# as Redis) must have run "CONFIG SET protected-mode no".
|
|
return redis.StrictRedis(
|
|
host=redis_ip_address, port=int(redis_port), password=password)
|
|
|
|
|
|
def wait_for_redis_to_start(redis_ip_address,
|
|
redis_port,
|
|
password=None,
|
|
num_retries=5):
|
|
"""Wait for a Redis server to be available.
|
|
|
|
This is accomplished by creating a Redis client and sending a random
|
|
command to the server until the command gets through.
|
|
|
|
Args:
|
|
redis_ip_address (str): The IP address of the redis server.
|
|
redis_port (int): The port of the redis server.
|
|
password (str): The password of the redis server.
|
|
num_retries (int): The number of times to try connecting with redis.
|
|
The client will sleep for one second between attempts.
|
|
|
|
Raises:
|
|
Exception: An exception is raised if we could not connect with Redis.
|
|
"""
|
|
redis_client = redis.StrictRedis(
|
|
host=redis_ip_address, port=redis_port, password=password)
|
|
# Wait for the Redis server to start.
|
|
counter = 0
|
|
while counter < num_retries:
|
|
try:
|
|
# Run some random command and see if it worked.
|
|
logger.info(
|
|
"Waiting for redis server at {}:{} to respond...".format(
|
|
redis_ip_address, redis_port))
|
|
redis_client.client_list()
|
|
except redis.ConnectionError as e:
|
|
# Wait a little bit.
|
|
time.sleep(1)
|
|
logger.info("Failed to connect to the redis server, retrying.")
|
|
counter += 1
|
|
else:
|
|
break
|
|
if counter == num_retries:
|
|
raise Exception("Unable to connect to Redis. If the Redis instance is "
|
|
"on a different machine, check that your firewall is "
|
|
"configured properly.")
|
|
|
|
|
|
def _autodetect_num_gpus():
|
|
"""Attempt to detect the number of GPUs on this machine.
|
|
|
|
TODO(rkn): This currently assumes Nvidia GPUs and Linux.
|
|
|
|
Returns:
|
|
The number of GPUs if any were detected, otherwise 0.
|
|
"""
|
|
proc_gpus_path = "/proc/driver/nvidia/gpus"
|
|
if os.path.isdir(proc_gpus_path):
|
|
return len(os.listdir(proc_gpus_path))
|
|
return 0
|
|
|
|
|
|
def _compute_version_info():
|
|
"""Compute the versions of Python, pyarrow, and Ray.
|
|
|
|
Returns:
|
|
A tuple containing the version information.
|
|
"""
|
|
ray_version = ray.__version__
|
|
python_version = ".".join(map(str, sys.version_info[:3]))
|
|
pyarrow_version = pyarrow.__version__
|
|
return ray_version, python_version, pyarrow_version
|
|
|
|
|
|
def _put_version_info_in_redis(redis_client):
|
|
"""Store version information in Redis.
|
|
|
|
This will be used to detect if workers or drivers are started using
|
|
different versions of Python, pyarrow, or Ray.
|
|
|
|
Args:
|
|
redis_client: A client for the primary Redis shard.
|
|
"""
|
|
redis_client.set("VERSION_INFO", json.dumps(_compute_version_info()))
|
|
|
|
|
|
def check_version_info(redis_client):
|
|
"""Check if various version info of this process is correct.
|
|
|
|
This will be used to detect if workers or drivers are started using
|
|
different versions of Python, pyarrow, or Ray. If the version
|
|
information is not present in Redis, then no check is done.
|
|
|
|
Args:
|
|
redis_client: A client for the primary Redis shard.
|
|
|
|
Raises:
|
|
Exception: An exception is raised if there is a version mismatch.
|
|
"""
|
|
redis_reply = redis_client.get("VERSION_INFO")
|
|
|
|
# Don't do the check if there is no version information in Redis. This
|
|
# is to make it easier to do things like start the processes by hand.
|
|
if redis_reply is None:
|
|
return
|
|
|
|
true_version_info = tuple(json.loads(ray.utils.decode(redis_reply)))
|
|
version_info = _compute_version_info()
|
|
if version_info != true_version_info:
|
|
node_ip_address = ray.services.get_node_ip_address()
|
|
error_message = ("Version mismatch: The cluster was started with:\n"
|
|
" Ray: " + true_version_info[0] + "\n"
|
|
" Python: " + true_version_info[1] + "\n"
|
|
" Pyarrow: " + str(true_version_info[2]) + "\n"
|
|
"This process on node " + node_ip_address +
|
|
" was started with:" + "\n"
|
|
" Ray: " + version_info[0] + "\n"
|
|
" Python: " + version_info[1] + "\n"
|
|
" Pyarrow: " + str(version_info[2]))
|
|
if version_info[:2] != true_version_info[:2]:
|
|
raise Exception(error_message)
|
|
else:
|
|
logger.warning(error_message)
|
|
|
|
|
|
def start_redis(node_ip_address,
|
|
port=None,
|
|
redis_shard_ports=None,
|
|
num_redis_shards=1,
|
|
redis_max_clients=None,
|
|
use_raylet=False,
|
|
redirect_output=False,
|
|
redirect_worker_output=False,
|
|
cleanup=True,
|
|
protected_mode=False,
|
|
password=None,
|
|
use_credis=None):
|
|
"""Start the Redis global state store.
|
|
|
|
Args:
|
|
node_ip_address: The IP address of the current node. This is only used
|
|
for recording the log filenames in Redis.
|
|
port (int): If provided, the primary Redis shard will be started on
|
|
this port.
|
|
redis_shard_ports: A list of the ports to use for the non-primary Redis
|
|
shards.
|
|
num_redis_shards (int): If provided, the number of Redis shards to
|
|
start, in addition to the primary one. The default value is one
|
|
shard.
|
|
redis_max_clients: If this is provided, Ray will attempt to configure
|
|
Redis with this maxclients number.
|
|
use_raylet: True if the new raylet code path should be used. This is
|
|
not supported yet.
|
|
redirect_output (bool): True if output should be redirected to a file
|
|
and false otherwise.
|
|
redirect_worker_output (bool): True if worker output should be
|
|
redirected to a file and false otherwise. Workers will have access
|
|
to this value when they start up.
|
|
cleanup (bool): True if using Ray in local mode. If cleanup is true,
|
|
then all Redis processes started by this method will be killed by
|
|
services.cleanup() when the Python process that imported services
|
|
exits.
|
|
password (str): Prevents external clients without the password
|
|
from connecting to Redis if provided.
|
|
use_credis: If True, additionally load the chain-replicated libraries
|
|
into the redis servers. Defaults to None, which means its value is
|
|
set by the presence of "RAY_USE_NEW_GCS" in os.environ.
|
|
|
|
Returns:
|
|
A tuple of the address for the primary Redis shard and a list of
|
|
addresses for the remaining shards.
|
|
"""
|
|
redis_stdout_file, redis_stderr_file = new_redis_log_file(redirect_output)
|
|
|
|
if redis_shard_ports is None:
|
|
redis_shard_ports = num_redis_shards * [None]
|
|
elif len(redis_shard_ports) != num_redis_shards:
|
|
raise Exception("The number of Redis shard ports does not match the "
|
|
"number of Redis shards.")
|
|
|
|
if use_credis is None:
|
|
use_credis = ("RAY_USE_NEW_GCS" in os.environ)
|
|
if use_credis and password is not None:
|
|
# TODO(pschafhalter) remove this once credis supports
|
|
# authenticating Redis ports
|
|
raise Exception("Setting the `redis_password` argument is not "
|
|
"supported in credis. To run Ray with "
|
|
"password-protected Redis ports, ensure that "
|
|
"the environment variable `RAY_USE_NEW_GCS=off`.")
|
|
if not use_credis:
|
|
assigned_port, _ = _start_redis_instance(
|
|
node_ip_address=node_ip_address,
|
|
port=port,
|
|
redis_max_clients=redis_max_clients,
|
|
stdout_file=redis_stdout_file,
|
|
stderr_file=redis_stderr_file,
|
|
cleanup=cleanup,
|
|
protected_mode=protected_mode,
|
|
password=password)
|
|
else:
|
|
assigned_port, _ = _start_redis_instance(
|
|
node_ip_address=node_ip_address,
|
|
port=port,
|
|
redis_max_clients=redis_max_clients,
|
|
stdout_file=redis_stdout_file,
|
|
stderr_file=redis_stderr_file,
|
|
cleanup=cleanup,
|
|
protected_mode=protected_mode,
|
|
executable=CREDIS_EXECUTABLE,
|
|
# It is important to load the credis module BEFORE the ray module,
|
|
# as the latter contains an extern declaration that the former
|
|
# supplies.
|
|
modules=[CREDIS_MASTER_MODULE, REDIS_MODULE],
|
|
password=password)
|
|
if port is not None:
|
|
assert assigned_port == port
|
|
port = assigned_port
|
|
redis_address = address(node_ip_address, port)
|
|
|
|
redis_client = redis.StrictRedis(
|
|
host=node_ip_address, port=port, password=password)
|
|
|
|
# Store whether we're using the raylet code path or not.
|
|
redis_client.set("UseRaylet", 1 if use_raylet else 0)
|
|
|
|
# Register the number of Redis shards in the primary shard, so that clients
|
|
# know how many redis shards to expect under RedisShards.
|
|
primary_redis_client = redis.StrictRedis(
|
|
host=node_ip_address, port=port, password=password)
|
|
primary_redis_client.set("NumRedisShards", str(num_redis_shards))
|
|
|
|
# Put the redirect_worker_output bool in the Redis shard so that workers
|
|
# can access it and know whether or not to redirect their output.
|
|
primary_redis_client.set("RedirectOutput", 1
|
|
if redirect_worker_output else 0)
|
|
|
|
# Store version information in the primary Redis shard.
|
|
_put_version_info_in_redis(primary_redis_client)
|
|
|
|
# Start other Redis shards. Each Redis shard logs to a separate file,
|
|
# prefixed by "redis-<shard number>".
|
|
redis_shards = []
|
|
for i in range(num_redis_shards):
|
|
redis_stdout_file, redis_stderr_file = new_redis_log_file(
|
|
redirect_output, shard_number=i)
|
|
if not use_credis:
|
|
redis_shard_port, _ = _start_redis_instance(
|
|
node_ip_address=node_ip_address,
|
|
port=redis_shard_ports[i],
|
|
redis_max_clients=redis_max_clients,
|
|
stdout_file=redis_stdout_file,
|
|
stderr_file=redis_stderr_file,
|
|
cleanup=cleanup,
|
|
protected_mode=protected_mode,
|
|
password=password)
|
|
else:
|
|
assert num_redis_shards == 1, \
|
|
"For now, RAY_USE_NEW_GCS supports 1 shard, and credis "\
|
|
"supports 1-node chain for that shard only."
|
|
redis_shard_port, _ = _start_redis_instance(
|
|
node_ip_address=node_ip_address,
|
|
port=redis_shard_ports[i],
|
|
redis_max_clients=redis_max_clients,
|
|
stdout_file=redis_stdout_file,
|
|
stderr_file=redis_stderr_file,
|
|
cleanup=cleanup,
|
|
protected_mode=protected_mode,
|
|
password=password,
|
|
executable=CREDIS_EXECUTABLE,
|
|
# It is important to load the credis module BEFORE the ray
|
|
# module, as the latter contains an extern declaration that the
|
|
# former supplies.
|
|
modules=[CREDIS_MEMBER_MODULE, REDIS_MODULE])
|
|
|
|
if redis_shard_ports[i] is not None:
|
|
assert redis_shard_port == redis_shard_ports[i]
|
|
shard_address = address(node_ip_address, redis_shard_port)
|
|
redis_shards.append(shard_address)
|
|
# Store redis shard information in the primary redis shard.
|
|
primary_redis_client.rpush("RedisShards", shard_address)
|
|
|
|
if use_credis:
|
|
shard_client = redis.StrictRedis(
|
|
host=node_ip_address, port=redis_shard_port, password=password)
|
|
# Configure the chain state.
|
|
primary_redis_client.execute_command("MASTER.ADD", node_ip_address,
|
|
redis_shard_port)
|
|
shard_client.execute_command("MEMBER.CONNECT_TO_MASTER",
|
|
node_ip_address, port)
|
|
|
|
return redis_address, redis_shards
|
|
|
|
|
|
def _make_temp_redis_config(node_ip_address):
|
|
"""Create a configuration file for Redis.
|
|
|
|
Args:
|
|
node_ip_address: The IP address of this node. This should not be
|
|
127.0.0.1.
|
|
"""
|
|
redis_config_name = get_temp_redis_config_path()
|
|
with open(redis_config_name, 'w') as f:
|
|
# This allows redis clients on the same machine to connect using the
|
|
# node's IP address as opposed to just 127.0.0.1. This is only relevant
|
|
# when the server is in protected mode.
|
|
f.write("bind 127.0.0.1 {}".format(node_ip_address))
|
|
return redis_config_name
|
|
|
|
|
|
def _start_redis_instance(node_ip_address="127.0.0.1",
|
|
port=None,
|
|
redis_max_clients=None,
|
|
num_retries=20,
|
|
stdout_file=None,
|
|
stderr_file=None,
|
|
cleanup=True,
|
|
protected_mode=False,
|
|
password=None,
|
|
executable=REDIS_EXECUTABLE,
|
|
modules=None):
|
|
"""Start a single Redis server.
|
|
|
|
Args:
|
|
node_ip_address (str): The IP address of the current node. This is only
|
|
used for recording the log filenames in Redis.
|
|
port (int): If provided, start a Redis server with this port.
|
|
redis_max_clients: If this is provided, Ray will attempt to configure
|
|
Redis with this maxclients number.
|
|
num_retries (int): The number of times to attempt to start Redis. If a
|
|
port is provided, this defaults to 1.
|
|
stdout_file: A file handle opened for writing to redirect stdout to. If
|
|
no redirection should happen, then this should be None.
|
|
stderr_file: A file handle opened for writing to redirect stderr to. If
|
|
no redirection should happen, then this should be None.
|
|
cleanup (bool): True if using Ray in local mode. If cleanup is true,
|
|
then this process will be killed by serices.cleanup() when the
|
|
Python process that imported services exits.
|
|
protected_mode: True if we should start the Redis server in protected
|
|
mode. This will prevent clients on other machines from connecting
|
|
and is only used when the Redis servers are started via ray.init()
|
|
as opposed to ray start.
|
|
password (str): Prevents external clients without the password
|
|
from connecting to Redis if provided.
|
|
executable (str): Full path tho the redis-server executable.
|
|
modules (list of str): A list of pathnames, pointing to the redis
|
|
module(s) that will be loaded in this redis server. If None, load
|
|
the default Ray redis module.
|
|
|
|
Returns:
|
|
A tuple of the port used by Redis and a handle to the process that was
|
|
started. If a port is passed in, then the returned port value is
|
|
the same.
|
|
|
|
Raises:
|
|
Exception: An exception is raised if Redis could not be started.
|
|
"""
|
|
assert os.path.isfile(executable)
|
|
if modules is None:
|
|
modules = [REDIS_MODULE]
|
|
for module in modules:
|
|
assert os.path.isfile(module)
|
|
counter = 0
|
|
if port is not None:
|
|
# If a port is specified, then try only once to connect.
|
|
num_retries = 1
|
|
else:
|
|
port = new_port()
|
|
|
|
if protected_mode:
|
|
redis_config_filename = _make_temp_redis_config(node_ip_address)
|
|
|
|
load_module_args = []
|
|
for module in modules:
|
|
load_module_args += ["--loadmodule", module]
|
|
|
|
while counter < num_retries:
|
|
if counter > 0:
|
|
logger.warning("Redis failed to start, retrying now.")
|
|
|
|
# Construct the command to start the Redis server.
|
|
command = [executable]
|
|
if protected_mode:
|
|
command += [redis_config_filename]
|
|
if password:
|
|
command += ["--requirepass", password]
|
|
command += (
|
|
["--port", str(port), "--loglevel", "warning"] + load_module_args)
|
|
|
|
p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
|
|
time.sleep(0.1)
|
|
# Check if Redis successfully started (or at least if it the executable
|
|
# did not exit within 0.1 seconds).
|
|
if p.poll() is None:
|
|
if cleanup:
|
|
all_processes[PROCESS_TYPE_REDIS_SERVER].append(p)
|
|
break
|
|
port = new_port()
|
|
counter += 1
|
|
if counter == num_retries:
|
|
raise Exception("Couldn't start Redis. Check log files: {} {}".format(
|
|
stdout_file.name, stderr_file.name))
|
|
|
|
# Create a Redis client just for configuring Redis.
|
|
redis_client = redis.StrictRedis(
|
|
host="127.0.0.1", port=port, password=password)
|
|
# Wait for the Redis server to start.
|
|
wait_for_redis_to_start("127.0.0.1", port, password=password)
|
|
# Configure Redis to generate keyspace notifications. TODO(rkn): Change
|
|
# this to only generate notifications for the export keys.
|
|
redis_client.config_set("notify-keyspace-events", "Kl")
|
|
|
|
# Configure Redis to not run in protected mode so that processes on other
|
|
# hosts can connect to it. TODO(rkn): Do this in a more secure way.
|
|
if not protected_mode:
|
|
redis_client.config_set("protected-mode", "no")
|
|
|
|
# If redis_max_clients is provided, attempt to raise the number of maximum
|
|
# number of Redis clients.
|
|
if redis_max_clients is not None:
|
|
redis_client.config_set("maxclients", str(redis_max_clients))
|
|
else:
|
|
# If redis_max_clients is not provided, determine the current ulimit.
|
|
# We will use this to attempt to raise the maximum number of Redis
|
|
# clients.
|
|
current_max_clients = int(
|
|
redis_client.config_get("maxclients")["maxclients"])
|
|
# The below command should be the same as doing ulimit -n.
|
|
ulimit_n = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
|
|
# The quantity redis_client_buffer appears to be the required buffer
|
|
# between the maximum number of redis clients and ulimit -n. That is,
|
|
# if ulimit -n returns 10000, then we can set maxclients to
|
|
# 10000 - redis_client_buffer.
|
|
redis_client_buffer = 32
|
|
if current_max_clients < ulimit_n - redis_client_buffer:
|
|
redis_client.config_set("maxclients",
|
|
ulimit_n - redis_client_buffer)
|
|
|
|
# Increase the hard and soft limits for the redis client pubsub buffer to
|
|
# 128MB. This is a hack to make it less likely for pubsub messages to be
|
|
# dropped and for pubsub connections to therefore be killed.
|
|
cur_config = (redis_client.config_get("client-output-buffer-limit")[
|
|
"client-output-buffer-limit"])
|
|
cur_config_list = cur_config.split()
|
|
assert len(cur_config_list) == 12
|
|
cur_config_list[8:] = ["pubsub", "134217728", "134217728", "60"]
|
|
redis_client.config_set("client-output-buffer-limit",
|
|
" ".join(cur_config_list))
|
|
# Put a time stamp in Redis to indicate when it was started.
|
|
redis_client.set("redis_start_time", time.time())
|
|
# Record the log files in Redis.
|
|
record_log_files_in_redis(
|
|
address(node_ip_address, port),
|
|
node_ip_address, [stdout_file, stderr_file],
|
|
password=password)
|
|
return port, p
|
|
|
|
|
|
def start_log_monitor(redis_address,
|
|
node_ip_address,
|
|
stdout_file=None,
|
|
stderr_file=None,
|
|
cleanup=cleanup,
|
|
redis_password=None):
|
|
"""Start a log monitor process.
|
|
|
|
Args:
|
|
redis_address (str): The address of the Redis instance.
|
|
node_ip_address (str): The IP address of the node that this log monitor
|
|
is running on.
|
|
stdout_file: A file handle opened for writing to redirect stdout to. If
|
|
no redirection should happen, then this should be None.
|
|
stderr_file: A file handle opened for writing to redirect stderr to. If
|
|
no redirection should happen, then this should be None.
|
|
cleanup (bool): True if using Ray in local mode. If cleanup is true,
|
|
then this process will be killed by services.cleanup() when the
|
|
Python process that imported services exits.
|
|
redis_password (str): The password of the redis server.
|
|
"""
|
|
log_monitor_filepath = os.path.join(
|
|
os.path.dirname(os.path.abspath(__file__)), "log_monitor.py")
|
|
command = [
|
|
sys.executable, "-u", log_monitor_filepath, "--redis-address",
|
|
redis_address, "--node-ip-address", node_ip_address
|
|
]
|
|
if redis_password:
|
|
command += ["--redis-password", redis_password]
|
|
p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
|
|
if cleanup:
|
|
all_processes[PROCESS_TYPE_LOG_MONITOR].append(p)
|
|
record_log_files_in_redis(
|
|
redis_address,
|
|
node_ip_address, [stdout_file, stderr_file],
|
|
password=redis_password)
|
|
|
|
|
|
def start_global_scheduler(redis_address,
|
|
node_ip_address,
|
|
stdout_file=None,
|
|
stderr_file=None,
|
|
cleanup=True,
|
|
redis_password=None):
|
|
"""Start a global scheduler process.
|
|
|
|
Args:
|
|
redis_address (str): The address of the Redis instance.
|
|
node_ip_address: The IP address of the node that this scheduler will
|
|
run on.
|
|
stdout_file: A file handle opened for writing to redirect stdout to. If
|
|
no redirection should happen, then this should be None.
|
|
stderr_file: A file handle opened for writing to redirect stderr to. If
|
|
no redirection should happen, then this should be None.
|
|
cleanup (bool): True if using Ray in local mode. If cleanup is true,
|
|
then this process will be killed by services.cleanup() when the
|
|
Python process that imported services exits.
|
|
redis_password (str): The password of the redis server.
|
|
"""
|
|
p = global_scheduler.start_global_scheduler(
|
|
redis_address,
|
|
node_ip_address,
|
|
stdout_file=stdout_file,
|
|
stderr_file=stderr_file)
|
|
if cleanup:
|
|
all_processes[PROCESS_TYPE_GLOBAL_SCHEDULER].append(p)
|
|
record_log_files_in_redis(
|
|
redis_address,
|
|
node_ip_address, [stdout_file, stderr_file],
|
|
password=redis_password)
|
|
|
|
|
|
def start_ui(redis_address, stdout_file=None, stderr_file=None, cleanup=True):
|
|
"""Start a UI process.
|
|
|
|
Args:
|
|
redis_address: The address of the primary Redis shard.
|
|
stdout_file: A file handle opened for writing to redirect stdout to. If
|
|
no redirection should happen, then this should be None.
|
|
stderr_file: A file handle opened for writing to redirect stderr to. If
|
|
no redirection should happen, then this should be None.
|
|
cleanup (bool): True if using Ray in local mode. If cleanup is true,
|
|
then this process will be killed by services.cleanup() when the
|
|
Python process that imported services exits.
|
|
"""
|
|
|
|
port = 8888
|
|
while True:
|
|
try:
|
|
port_test_socket = socket.socket()
|
|
port_test_socket.bind(("127.0.0.1", port))
|
|
port_test_socket.close()
|
|
break
|
|
except socket.error:
|
|
port += 1
|
|
new_env = os.environ.copy()
|
|
new_env["REDIS_ADDRESS"] = redis_address
|
|
# We generate the token used for authentication ourselves to avoid
|
|
# querying the jupyter server.
|
|
new_notebook_directory, webui_url, token = (
|
|
get_ipython_notebook_path(port))
|
|
# The --ip=0.0.0.0 flag is intended to enable connecting to a notebook
|
|
# running within a docker container (from the outside).
|
|
command = [
|
|
"jupyter", "notebook", "--no-browser", "--port={}".format(port),
|
|
"--ip=0.0.0.0", "--NotebookApp.iopub_data_rate_limit=10000000000",
|
|
"--NotebookApp.open_browser=False",
|
|
"--NotebookApp.token={}".format(token)
|
|
]
|
|
# If the user is root, add the --allow-root flag.
|
|
if os.geteuid() == 0:
|
|
command.append("--allow-root")
|
|
|
|
try:
|
|
ui_process = subprocess.Popen(
|
|
command,
|
|
env=new_env,
|
|
cwd=new_notebook_directory,
|
|
stdout=stdout_file,
|
|
stderr=stderr_file)
|
|
except Exception:
|
|
logger.warning("Failed to start the UI, you may need to run "
|
|
"'pip install jupyter'.")
|
|
else:
|
|
if cleanup:
|
|
all_processes[PROCESS_TYPE_WEB_UI].append(ui_process)
|
|
logger.info("\n" + "=" * 70)
|
|
logger.info("View the web UI at {}".format(webui_url))
|
|
logger.info("=" * 70 + "\n")
|
|
return webui_url
|
|
|
|
|
|
def check_and_update_resources(resources, use_raylet):
|
|
"""Sanity check a resource dictionary and add sensible defaults.
|
|
|
|
Args:
|
|
resources: A dictionary mapping resource names to resource quantities.
|
|
use_raylet: True if we are using the raylet code path and false
|
|
otherwise.
|
|
|
|
Returns:
|
|
A new resource dictionary.
|
|
"""
|
|
if resources is None:
|
|
resources = {}
|
|
resources = resources.copy()
|
|
if "CPU" not in resources:
|
|
# By default, use the number of hardware execution threads for the
|
|
# number of cores.
|
|
resources["CPU"] = multiprocessing.cpu_count()
|
|
|
|
# See if CUDA_VISIBLE_DEVICES has already been set.
|
|
gpu_ids = ray.utils.get_cuda_visible_devices()
|
|
|
|
# Check that the number of GPUs that the local scheduler wants doesn't
|
|
# excede the amount allowed by CUDA_VISIBLE_DEVICES.
|
|
if ("GPU" in resources and gpu_ids is not None
|
|
and resources["GPU"] > len(gpu_ids)):
|
|
raise Exception("Attempting to start local scheduler with {} GPUs, "
|
|
"but CUDA_VISIBLE_DEVICES contains {}.".format(
|
|
resources["GPU"], gpu_ids))
|
|
|
|
if "GPU" not in resources:
|
|
# Try to automatically detect the number of GPUs.
|
|
resources["GPU"] = _autodetect_num_gpus()
|
|
# Don't use more GPUs than allowed by CUDA_VISIBLE_DEVICES.
|
|
if gpu_ids is not None:
|
|
resources["GPU"] = min(resources["GPU"], len(gpu_ids))
|
|
|
|
# Check types.
|
|
for _, resource_quantity in resources.items():
|
|
assert (isinstance(resource_quantity, int)
|
|
or isinstance(resource_quantity, float))
|
|
if (isinstance(resource_quantity, float)
|
|
and not resource_quantity.is_integer()):
|
|
raise ValueError("Resource quantities must all be whole numbers.")
|
|
|
|
if (use_raylet and
|
|
resource_quantity > ray.ray_constants.MAX_RESOURCE_QUANTITY):
|
|
raise ValueError("Resource quantities must be at most {}.".format(
|
|
ray.ray_constants.MAX_RESOURCE_QUANTITY))
|
|
|
|
return resources
|
|
|
|
|
|
def start_local_scheduler(redis_address,
|
|
node_ip_address,
|
|
plasma_store_name,
|
|
plasma_manager_name,
|
|
worker_path,
|
|
plasma_address=None,
|
|
stdout_file=None,
|
|
stderr_file=None,
|
|
cleanup=True,
|
|
resources=None,
|
|
num_workers=0,
|
|
redis_password=None):
|
|
"""Start a local scheduler process.
|
|
|
|
Args:
|
|
redis_address (str): The address of the Redis instance.
|
|
node_ip_address (str): The IP address of the node that this local
|
|
scheduler is running on.
|
|
plasma_store_name (str): The name of the plasma store socket to connect
|
|
to.
|
|
plasma_manager_name (str): The name of the plasma manager socket to
|
|
connect to.
|
|
worker_path (str): The path of the script to use when the local
|
|
scheduler starts up new workers.
|
|
stdout_file: A file handle opened for writing to redirect stdout to. If
|
|
no redirection should happen, then this should be None.
|
|
stderr_file: A file handle opened for writing to redirect stderr to. If
|
|
no redirection should happen, then this should be None.
|
|
cleanup (bool): True if using Ray in local mode. If cleanup is true,
|
|
then this process will be killed by serices.cleanup() when the
|
|
Python process that imported services exits.
|
|
resources: A dictionary mapping the name of a resource to the available
|
|
quantity of that resource.
|
|
num_workers (int): The number of workers that the local scheduler
|
|
should start.
|
|
redis_password (str): The password of the redis server.
|
|
|
|
Return:
|
|
The name of the local scheduler socket.
|
|
"""
|
|
resources = check_and_update_resources(resources, False)
|
|
|
|
logger.info("Starting local scheduler with the following resources: {}."
|
|
.format(resources))
|
|
local_scheduler_name, p = ray.local_scheduler.start_local_scheduler(
|
|
plasma_store_name,
|
|
plasma_manager_name,
|
|
worker_path=worker_path,
|
|
node_ip_address=node_ip_address,
|
|
redis_address=redis_address,
|
|
plasma_address=plasma_address,
|
|
use_profiler=RUN_LOCAL_SCHEDULER_PROFILER,
|
|
stdout_file=stdout_file,
|
|
stderr_file=stderr_file,
|
|
static_resources=resources,
|
|
num_workers=num_workers)
|
|
if cleanup:
|
|
all_processes[PROCESS_TYPE_LOCAL_SCHEDULER].append(p)
|
|
record_log_files_in_redis(
|
|
redis_address,
|
|
node_ip_address, [stdout_file, stderr_file],
|
|
password=redis_password)
|
|
return local_scheduler_name
|
|
|
|
|
|
def start_raylet(redis_address,
|
|
node_ip_address,
|
|
raylet_name,
|
|
plasma_store_name,
|
|
worker_path,
|
|
resources=None,
|
|
num_workers=0,
|
|
use_valgrind=False,
|
|
use_profiler=False,
|
|
stdout_file=None,
|
|
stderr_file=None,
|
|
cleanup=True,
|
|
redis_password=None):
|
|
"""Start a raylet, which is a combined local scheduler and object manager.
|
|
|
|
Args:
|
|
redis_address (str): The address of the Redis instance.
|
|
node_ip_address (str): The IP address of the node that this local
|
|
scheduler is running on.
|
|
plasma_store_name (str): The name of the plasma store socket to connect
|
|
to.
|
|
raylet_name (str): The name of the raylet socket to create.
|
|
worker_path (str): The path of the script to use when the local
|
|
scheduler starts up new workers.
|
|
use_valgrind (bool): True if the raylet should be started inside
|
|
of valgrind. If this is True, use_profiler must be False.
|
|
use_profiler (bool): True if the raylet should be started inside
|
|
a profiler. If this is True, use_valgrind must be False.
|
|
stdout_file: A file handle opened for writing to redirect stdout to. If
|
|
no redirection should happen, then this should be None.
|
|
stderr_file: A file handle opened for writing to redirect stderr to. If
|
|
no redirection should happen, then this should be None.
|
|
cleanup (bool): True if using Ray in local mode. If cleanup is true,
|
|
then this process will be killed by serices.cleanup() when the
|
|
Python process that imported services exits.
|
|
redis_password (str): The password of the redis server.
|
|
|
|
Returns:
|
|
The raylet socket name.
|
|
"""
|
|
if use_valgrind and use_profiler:
|
|
raise Exception("Cannot use valgrind and profiler at the same time.")
|
|
|
|
static_resources = check_and_update_resources(resources, True)
|
|
|
|
# Limit the number of workers that can be started in parallel by the
|
|
# raylet. However, make sure it is at least 1.
|
|
maximum_startup_concurrency = max(
|
|
1, min(multiprocessing.cpu_count(), static_resources["CPU"]))
|
|
|
|
# Format the resource argument in a form like 'CPU,1.0,GPU,0,Custom,3'.
|
|
resource_argument = ",".join([
|
|
"{},{}".format(resource_name, resource_value)
|
|
for resource_name, resource_value in zip(static_resources.keys(),
|
|
static_resources.values())
|
|
])
|
|
|
|
gcs_ip_address, gcs_port = redis_address.split(":")
|
|
|
|
# Create the command that the Raylet will use to start workers.
|
|
start_worker_command = ("{} {} "
|
|
"--node-ip-address={} "
|
|
"--object-store-name={} "
|
|
"--raylet-name={} "
|
|
"--redis-address={} "
|
|
"--temp-dir={}".format(
|
|
sys.executable, worker_path, node_ip_address,
|
|
plasma_store_name, raylet_name, redis_address,
|
|
get_temp_root()))
|
|
if redis_password:
|
|
start_worker_command += " --redis-password {}".format(redis_password)
|
|
|
|
command = [
|
|
RAYLET_EXECUTABLE,
|
|
raylet_name,
|
|
plasma_store_name,
|
|
node_ip_address,
|
|
gcs_ip_address,
|
|
gcs_port,
|
|
str(num_workers),
|
|
str(maximum_startup_concurrency),
|
|
resource_argument,
|
|
start_worker_command,
|
|
"", # Worker command for Java, not needed for Python.
|
|
redis_password or "",
|
|
]
|
|
|
|
if use_valgrind:
|
|
pid = subprocess.Popen(
|
|
[
|
|
"valgrind", "--track-origins=yes", "--leak-check=full",
|
|
"--show-leak-kinds=all", "--leak-check-heuristics=stdstring",
|
|
"--error-exitcode=1"
|
|
] + command,
|
|
stdout=stdout_file,
|
|
stderr=stderr_file)
|
|
elif use_profiler:
|
|
pid = subprocess.Popen(
|
|
["valgrind", "--tool=callgrind"] + command,
|
|
stdout=stdout_file,
|
|
stderr=stderr_file)
|
|
else:
|
|
pid = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
|
|
|
|
if cleanup:
|
|
all_processes[PROCESS_TYPE_RAYLET].append(pid)
|
|
record_log_files_in_redis(
|
|
redis_address,
|
|
node_ip_address, [stdout_file, stderr_file],
|
|
password=redis_password)
|
|
|
|
return raylet_name
|
|
|
|
|
|
def start_plasma_store(node_ip_address,
|
|
redis_address,
|
|
object_manager_port=None,
|
|
store_stdout_file=None,
|
|
store_stderr_file=None,
|
|
manager_stdout_file=None,
|
|
manager_stderr_file=None,
|
|
objstore_memory=None,
|
|
cleanup=True,
|
|
plasma_directory=None,
|
|
huge_pages=False,
|
|
use_raylet=False,
|
|
plasma_store_socket_name=None,
|
|
redis_password=None):
|
|
"""This method starts an object store process.
|
|
|
|
Args:
|
|
node_ip_address (str): The IP address of the node running the object
|
|
store.
|
|
redis_address (str): The address of the Redis instance to connect to.
|
|
object_manager_port (int): The port to use for the object manager. If
|
|
this is not provided, one will be generated randomly.
|
|
store_stdout_file: A file handle opened for writing to redirect stdout
|
|
to. If no redirection should happen, then this should be None.
|
|
store_stderr_file: A file handle opened for writing to redirect stderr
|
|
to. If no redirection should happen, then this should be None.
|
|
manager_stdout_file: A file handle opened for writing to redirect
|
|
stdout to. If no redirection should happen, then this should be
|
|
None.
|
|
manager_stderr_file: A file handle opened for writing to redirect
|
|
stderr to. If no redirection should happen, then this should be
|
|
None.
|
|
objstore_memory: The amount of memory (in bytes) to start the object
|
|
store with.
|
|
cleanup (bool): True if using Ray in local mode. If cleanup is true,
|
|
then this process will be killed by serices.cleanup() when the
|
|
Python process that imported services exits.
|
|
plasma_directory: A directory where the Plasma memory mapped files will
|
|
be created.
|
|
huge_pages: Boolean flag indicating whether to start the Object
|
|
Store with hugetlbfs support. Requires plasma_directory.
|
|
use_raylet: True if the new raylet code path should be used. This is
|
|
not supported yet.
|
|
redis_password (str): The password of the redis server.
|
|
|
|
Return:
|
|
A tuple of the Plasma store socket name, the Plasma manager socket
|
|
name, and the plasma manager port.
|
|
"""
|
|
if objstore_memory is None:
|
|
# Compute a fraction of the system memory for the Plasma store to use.
|
|
system_memory = ray.utils.get_system_memory()
|
|
if sys.platform == "linux" or sys.platform == "linux2":
|
|
# On linux we use /dev/shm, its size is half the size of the
|
|
# physical memory. To not overflow it, we set the plasma memory
|
|
# limit to 0.4 times the size of the physical memory.
|
|
objstore_memory = int(system_memory * 0.4)
|
|
# Compare the requested memory size to the memory available in
|
|
# /dev/shm.
|
|
shm_fd = os.open("/dev/shm", os.O_RDONLY)
|
|
try:
|
|
shm_fs_stats = os.fstatvfs(shm_fd)
|
|
# The value shm_fs_stats.f_bsize is the block size and the
|
|
# value shm_fs_stats.f_bavail is the number of available
|
|
# blocks.
|
|
shm_avail = shm_fs_stats.f_bsize * shm_fs_stats.f_bavail
|
|
if objstore_memory > shm_avail:
|
|
logger.warning(
|
|
"Warning: Reducing object store memory because "
|
|
"/dev/shm has only {} bytes available. You may be "
|
|
"able to free up space by deleting files in "
|
|
"/dev/shm. If you are inside a Docker container, "
|
|
"you may need to pass an argument with the flag "
|
|
"'--shm-size' to 'docker run'.".format(shm_avail))
|
|
objstore_memory = int(shm_avail * 0.8)
|
|
finally:
|
|
os.close(shm_fd)
|
|
else:
|
|
objstore_memory = int(system_memory * 0.8)
|
|
# Start the Plasma store.
|
|
logger.info("Starting the Plasma object store with {0:.2f} GB memory."
|
|
.format(objstore_memory / 10**9))
|
|
plasma_store_name, p1 = ray.plasma.start_plasma_store(
|
|
plasma_store_memory=objstore_memory,
|
|
use_profiler=RUN_PLASMA_STORE_PROFILER,
|
|
stdout_file=store_stdout_file,
|
|
stderr_file=store_stderr_file,
|
|
plasma_directory=plasma_directory,
|
|
huge_pages=huge_pages,
|
|
socket_name=plasma_store_socket_name)
|
|
# Start the plasma manager.
|
|
if not use_raylet:
|
|
if object_manager_port is not None:
|
|
(plasma_manager_name, p2,
|
|
plasma_manager_port) = ray.plasma.start_plasma_manager(
|
|
plasma_store_name,
|
|
redis_address,
|
|
plasma_manager_port=object_manager_port,
|
|
node_ip_address=node_ip_address,
|
|
num_retries=1,
|
|
run_profiler=RUN_PLASMA_MANAGER_PROFILER,
|
|
stdout_file=manager_stdout_file,
|
|
stderr_file=manager_stderr_file)
|
|
assert plasma_manager_port == object_manager_port
|
|
else:
|
|
(plasma_manager_name, p2,
|
|
plasma_manager_port) = ray.plasma.start_plasma_manager(
|
|
plasma_store_name,
|
|
redis_address,
|
|
node_ip_address=node_ip_address,
|
|
run_profiler=RUN_PLASMA_MANAGER_PROFILER,
|
|
stdout_file=manager_stdout_file,
|
|
stderr_file=manager_stderr_file)
|
|
else:
|
|
plasma_manager_port = None
|
|
plasma_manager_name = None
|
|
|
|
if cleanup:
|
|
all_processes[PROCESS_TYPE_PLASMA_STORE].append(p1)
|
|
record_log_files_in_redis(
|
|
redis_address,
|
|
node_ip_address, [store_stdout_file, store_stderr_file],
|
|
password=redis_password)
|
|
if not use_raylet:
|
|
if cleanup:
|
|
all_processes[PROCESS_TYPE_PLASMA_MANAGER].append(p2)
|
|
record_log_files_in_redis(redis_address, node_ip_address,
|
|
[manager_stdout_file, manager_stderr_file])
|
|
|
|
return ObjectStoreAddress(plasma_store_name, plasma_manager_name,
|
|
plasma_manager_port)
|
|
|
|
|
|
def start_worker(node_ip_address,
|
|
object_store_name,
|
|
object_store_manager_name,
|
|
local_scheduler_name,
|
|
redis_address,
|
|
worker_path,
|
|
stdout_file=None,
|
|
stderr_file=None,
|
|
cleanup=True):
|
|
"""This method starts a worker process.
|
|
|
|
Args:
|
|
node_ip_address (str): The IP address of the node that this worker is
|
|
running on.
|
|
object_store_name (str): The name of the object store.
|
|
object_store_manager_name (str): The name of the object store manager.
|
|
local_scheduler_name (str): The name of the local scheduler.
|
|
redis_address (str): The address that the Redis server is listening on.
|
|
worker_path (str): The path of the source code which the worker process
|
|
will run.
|
|
stdout_file: A file handle opened for writing to redirect stdout to. If
|
|
no redirection should happen, then this should be None.
|
|
stderr_file: A file handle opened for writing to redirect stderr to. If
|
|
no redirection should happen, then this should be None.
|
|
cleanup (bool): True if using Ray in local mode. If cleanup is true,
|
|
then this process will be killed by services.cleanup() when the
|
|
Python process that imported services exits. This is True by
|
|
default.
|
|
"""
|
|
command = [
|
|
sys.executable, "-u", worker_path,
|
|
"--node-ip-address=" + node_ip_address,
|
|
"--object-store-name=" + object_store_name,
|
|
"--object-store-manager-name=" + object_store_manager_name,
|
|
"--local-scheduler-name=" + local_scheduler_name,
|
|
"--redis-address=" + str(redis_address),
|
|
"--temp-dir=" + get_temp_root()
|
|
]
|
|
p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
|
|
if cleanup:
|
|
all_processes[PROCESS_TYPE_WORKER].append(p)
|
|
record_log_files_in_redis(redis_address, node_ip_address,
|
|
[stdout_file, stderr_file])
|
|
|
|
|
|
def start_monitor(redis_address,
|
|
node_ip_address,
|
|
stdout_file=None,
|
|
stderr_file=None,
|
|
cleanup=True,
|
|
autoscaling_config=None,
|
|
redis_password=None):
|
|
"""Run a process to monitor the other processes.
|
|
|
|
Args:
|
|
redis_address (str): The address that the Redis server is listening on.
|
|
node_ip_address: The IP address of the node that this process will run
|
|
on.
|
|
stdout_file: A file handle opened for writing to redirect stdout to. If
|
|
no redirection should happen, then this should be None.
|
|
stderr_file: A file handle opened for writing to redirect stderr to. If
|
|
no redirection should happen, then this should be None.
|
|
cleanup (bool): True if using Ray in local mode. If cleanup is true,
|
|
then this process will be killed by services.cleanup() when the
|
|
Python process that imported services exits. This is True by
|
|
default.
|
|
autoscaling_config: path to autoscaling config file.
|
|
redis_password (str): The password of the redis server.
|
|
"""
|
|
monitor_path = os.path.join(
|
|
os.path.dirname(os.path.abspath(__file__)), "monitor.py")
|
|
command = [
|
|
sys.executable, "-u", monitor_path,
|
|
"--redis-address=" + str(redis_address)
|
|
]
|
|
if autoscaling_config:
|
|
command.append("--autoscaling-config=" + str(autoscaling_config))
|
|
if redis_password:
|
|
command.append("--redis-password=" + redis_password)
|
|
p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
|
|
if cleanup:
|
|
all_processes[PROCESS_TYPE_MONITOR].append(p)
|
|
record_log_files_in_redis(
|
|
redis_address,
|
|
node_ip_address, [stdout_file, stderr_file],
|
|
password=redis_password)
|
|
|
|
|
|
def start_raylet_monitor(redis_address,
|
|
stdout_file=None,
|
|
stderr_file=None,
|
|
cleanup=True,
|
|
redis_password=None):
|
|
"""Run a process to monitor the other processes.
|
|
|
|
Args:
|
|
redis_address (str): The address that the Redis server is listening on.
|
|
stdout_file: A file handle opened for writing to redirect stdout to. If
|
|
no redirection should happen, then this should be None.
|
|
stderr_file: A file handle opened for writing to redirect stderr to. If
|
|
no redirection should happen, then this should be None.
|
|
cleanup (bool): True if using Ray in local mode. If cleanup is true,
|
|
then this process will be killed by services.cleanup() when the
|
|
Python process that imported services exits. This is True by
|
|
default.
|
|
redis_password (str): The password of the redis server.
|
|
"""
|
|
gcs_ip_address, gcs_port = redis_address.split(":")
|
|
redis_password = redis_password or ""
|
|
command = [RAYLET_MONITOR_EXECUTABLE, gcs_ip_address, gcs_port]
|
|
p = subprocess.Popen(command, stdout=stdout_file, stderr=stderr_file)
|
|
if cleanup:
|
|
all_processes[PROCESS_TYPE_MONITOR].append(p)
|
|
|
|
|
|
def start_ray_processes(address_info=None,
|
|
node_ip_address="127.0.0.1",
|
|
redis_port=None,
|
|
redis_shard_ports=None,
|
|
num_workers=None,
|
|
num_local_schedulers=1,
|
|
object_store_memory=None,
|
|
num_redis_shards=1,
|
|
redis_max_clients=None,
|
|
redis_protected_mode=False,
|
|
redis_password=None,
|
|
worker_path=None,
|
|
cleanup=True,
|
|
redirect_worker_output=False,
|
|
redirect_output=False,
|
|
include_global_scheduler=False,
|
|
include_log_monitor=False,
|
|
include_webui=False,
|
|
start_workers_from_local_scheduler=True,
|
|
resources=None,
|
|
plasma_directory=None,
|
|
huge_pages=False,
|
|
autoscaling_config=None,
|
|
use_raylet=False,
|
|
plasma_store_socket_name=None,
|
|
raylet_socket_name=None,
|
|
temp_dir=None):
|
|
"""Helper method to start Ray processes.
|
|
|
|
Args:
|
|
address_info (dict): A dictionary with address information for
|
|
processes that have already been started. If provided, address_info
|
|
will be modified to include processes that are newly started.
|
|
node_ip_address (str): The IP address of this node.
|
|
redis_port (int): The port that the primary Redis shard should listen
|
|
to. If None, then a random port will be chosen. If the key
|
|
"redis_address" is in address_info, then this argument will be
|
|
ignored.
|
|
redis_shard_ports: A list of the ports to use for the non-primary Redis
|
|
shards.
|
|
num_workers (int): The number of workers to start.
|
|
num_local_schedulers (int): The total number of local schedulers
|
|
required. This is also the total number of object stores required.
|
|
This method will start new instances of local schedulers and object
|
|
stores until there are num_local_schedulers existing instances of
|
|
each, including ones already registered with the given
|
|
address_info.
|
|
object_store_memory: The amount of memory (in bytes) to start the
|
|
object store with.
|
|
num_redis_shards: The number of Redis shards to start in addition to
|
|
the primary Redis shard.
|
|
redis_max_clients: If provided, attempt to configure Redis with this
|
|
maxclients number.
|
|
redis_protected_mode: True if we should start Redis in protected mode.
|
|
This will prevent clients from other machines from connecting and
|
|
is only done when Redis is started via ray.init().
|
|
redis_password (str): Prevents external clients without the password
|
|
from connecting to Redis if provided.
|
|
worker_path (str): The path of the source code that will be run by the
|
|
worker.
|
|
cleanup (bool): If cleanup is true, then the processes started here
|
|
will be killed by services.cleanup() when the Python process that
|
|
called this method exits.
|
|
redirect_worker_output: True if the stdout and stderr of worker
|
|
processes should be redirected to files.
|
|
redirect_output (bool): True if stdout and stderr for non-worker
|
|
processes should be redirected to files and false otherwise.
|
|
include_global_scheduler (bool): If include_global_scheduler is True,
|
|
then start a global scheduler process.
|
|
include_log_monitor (bool): If True, then start a log monitor to
|
|
monitor the log files for all processes on this node and push their
|
|
contents to Redis.
|
|
include_webui (bool): If True, then attempt to start the web UI. Note
|
|
that this is only possible with Python 3.
|
|
start_workers_from_local_scheduler (bool): If this flag is True, then
|
|
start the initial workers from the local scheduler. Else, start
|
|
them from Python.
|
|
resources: A dictionary mapping resource name to the quantity of that
|
|
resource.
|
|
plasma_directory: A directory where the Plasma memory mapped files will
|
|
be created.
|
|
huge_pages: Boolean flag indicating whether to start the Object
|
|
Store with hugetlbfs support. Requires plasma_directory.
|
|
autoscaling_config: path to autoscaling config file.
|
|
use_raylet: True if the new raylet code path should be used. This is
|
|
not supported yet.
|
|
plasma_store_socket_name (str): If provided, it will specify the socket
|
|
name used by the plasma store.
|
|
raylet_socket_name (str): If provided, it will specify the socket path
|
|
used by the raylet process.
|
|
temp_dir (str): If provided, it will specify the root temporary
|
|
directory for the Ray process.
|
|
|
|
Returns:
|
|
A dictionary of the address information for the processes that were
|
|
started.
|
|
"""
|
|
|
|
set_temp_root(temp_dir)
|
|
|
|
logger.info("Process STDOUT and STDERR is being redirected to {}.".format(
|
|
get_logs_dir_path()))
|
|
|
|
if resources is None:
|
|
resources = {}
|
|
if not isinstance(resources, list):
|
|
resources = num_local_schedulers * [resources]
|
|
|
|
if num_workers is not None:
|
|
workers_per_local_scheduler = num_local_schedulers * [num_workers]
|
|
else:
|
|
workers_per_local_scheduler = []
|
|
for resource_dict in resources:
|
|
cpus = resource_dict.get("CPU")
|
|
workers_per_local_scheduler.append(cpus if cpus is not None else
|
|
multiprocessing.cpu_count())
|
|
|
|
if address_info is None:
|
|
address_info = {}
|
|
address_info["node_ip_address"] = node_ip_address
|
|
|
|
if worker_path is None:
|
|
worker_path = os.path.join(
|
|
os.path.dirname(os.path.abspath(__file__)),
|
|
"workers/default_worker.py")
|
|
|
|
# Start Redis if there isn't already an instance running. TODO(rkn): We are
|
|
# suppressing the output of Redis because on Linux it prints a bunch of
|
|
# warning messages when it starts up. Instead of suppressing the output, we
|
|
# should address the warnings.
|
|
redis_address = address_info.get("redis_address")
|
|
redis_shards = address_info.get("redis_shards", [])
|
|
if redis_address is None:
|
|
redis_address, redis_shards = start_redis(
|
|
node_ip_address,
|
|
port=redis_port,
|
|
redis_shard_ports=redis_shard_ports,
|
|
num_redis_shards=num_redis_shards,
|
|
redis_max_clients=redis_max_clients,
|
|
use_raylet=use_raylet,
|
|
redirect_output=True,
|
|
redirect_worker_output=redirect_worker_output,
|
|
cleanup=cleanup,
|
|
protected_mode=redis_protected_mode,
|
|
password=redis_password)
|
|
address_info["redis_address"] = redis_address
|
|
time.sleep(0.1)
|
|
|
|
# Start monitoring the processes.
|
|
monitor_stdout_file, monitor_stderr_file = new_monitor_log_file(
|
|
redirect_output)
|
|
start_monitor(
|
|
redis_address,
|
|
node_ip_address,
|
|
stdout_file=monitor_stdout_file,
|
|
stderr_file=monitor_stderr_file,
|
|
cleanup=cleanup,
|
|
autoscaling_config=autoscaling_config,
|
|
redis_password=redis_password)
|
|
if use_raylet:
|
|
start_raylet_monitor(
|
|
redis_address,
|
|
stdout_file=monitor_stdout_file,
|
|
stderr_file=monitor_stderr_file,
|
|
cleanup=cleanup,
|
|
redis_password=redis_password)
|
|
if redis_shards == []:
|
|
# Get redis shards from primary redis instance.
|
|
redis_ip_address, redis_port = redis_address.split(":")
|
|
redis_client = redis.StrictRedis(
|
|
host=redis_ip_address, port=redis_port, password=redis_password)
|
|
redis_shards = redis_client.lrange("RedisShards", start=0, end=-1)
|
|
redis_shards = [ray.utils.decode(shard) for shard in redis_shards]
|
|
address_info["redis_shards"] = redis_shards
|
|
|
|
# Start the log monitor, if necessary.
|
|
if include_log_monitor:
|
|
log_monitor_stdout_file, log_monitor_stderr_file = (
|
|
new_log_monitor_log_file())
|
|
start_log_monitor(
|
|
redis_address,
|
|
node_ip_address,
|
|
stdout_file=log_monitor_stdout_file,
|
|
stderr_file=log_monitor_stderr_file,
|
|
cleanup=cleanup,
|
|
redis_password=redis_password)
|
|
|
|
# Start the global scheduler, if necessary.
|
|
if include_global_scheduler and not use_raylet:
|
|
global_scheduler_stdout_file, global_scheduler_stderr_file = (
|
|
new_global_scheduler_log_file(redirect_output))
|
|
start_global_scheduler(
|
|
redis_address,
|
|
node_ip_address,
|
|
stdout_file=global_scheduler_stdout_file,
|
|
stderr_file=global_scheduler_stderr_file,
|
|
cleanup=cleanup,
|
|
redis_password=redis_password)
|
|
|
|
# Initialize with existing services.
|
|
if "object_store_addresses" not in address_info:
|
|
address_info["object_store_addresses"] = []
|
|
object_store_addresses = address_info["object_store_addresses"]
|
|
if "local_scheduler_socket_names" not in address_info:
|
|
address_info["local_scheduler_socket_names"] = []
|
|
local_scheduler_socket_names = address_info["local_scheduler_socket_names"]
|
|
if "raylet_socket_names" not in address_info:
|
|
address_info["raylet_socket_names"] = []
|
|
raylet_socket_names = address_info["raylet_socket_names"]
|
|
|
|
# Get the ports to use for the object managers if any are provided.
|
|
object_manager_ports = (address_info["object_manager_ports"] if
|
|
"object_manager_ports" in address_info else None)
|
|
if not isinstance(object_manager_ports, list):
|
|
object_manager_ports = num_local_schedulers * [object_manager_ports]
|
|
assert len(object_manager_ports) == num_local_schedulers
|
|
|
|
# Start any object stores that do not yet exist.
|
|
for i in range(num_local_schedulers - len(object_store_addresses)):
|
|
# Start Plasma.
|
|
plasma_store_stdout_file, plasma_store_stderr_file = (
|
|
new_plasma_store_log_file(i, redirect_output))
|
|
|
|
# If we use raylet, plasma manager won't be started and we don't need
|
|
# to create temp files for them.
|
|
plasma_manager_stdout_file, plasma_manager_stderr_file = (
|
|
new_plasma_manager_log_file(i, redirect_output and not use_raylet))
|
|
|
|
object_store_address = start_plasma_store(
|
|
node_ip_address,
|
|
redis_address,
|
|
object_manager_port=object_manager_ports[i],
|
|
store_stdout_file=plasma_store_stdout_file,
|
|
store_stderr_file=plasma_store_stderr_file,
|
|
manager_stdout_file=plasma_manager_stdout_file,
|
|
manager_stderr_file=plasma_manager_stderr_file,
|
|
objstore_memory=object_store_memory,
|
|
cleanup=cleanup,
|
|
plasma_directory=plasma_directory,
|
|
huge_pages=huge_pages,
|
|
use_raylet=use_raylet,
|
|
plasma_store_socket_name=plasma_store_socket_name,
|
|
redis_password=redis_password)
|
|
object_store_addresses.append(object_store_address)
|
|
time.sleep(0.1)
|
|
|
|
if not use_raylet:
|
|
# Start any local schedulers that do not yet exist.
|
|
for i in range(
|
|
len(local_scheduler_socket_names), num_local_schedulers):
|
|
# Connect the local scheduler to the object store at the same
|
|
# index.
|
|
object_store_address = object_store_addresses[i]
|
|
plasma_address = "{}:{}".format(node_ip_address,
|
|
object_store_address.manager_port)
|
|
# Determine how many workers this local scheduler should start.
|
|
if start_workers_from_local_scheduler:
|
|
num_local_scheduler_workers = workers_per_local_scheduler[i]
|
|
workers_per_local_scheduler[i] = 0
|
|
else:
|
|
# If we're starting the workers from Python, the local
|
|
# scheduler should not start any workers.
|
|
num_local_scheduler_workers = 0
|
|
# Start the local scheduler. Note that if we do not wish to
|
|
# redirect the worker output, then we cannot redirect the local
|
|
# scheduler output.
|
|
local_scheduler_stdout_file, local_scheduler_stderr_file = (
|
|
new_local_scheduler_log_file(
|
|
i, redirect_output=redirect_worker_output))
|
|
local_scheduler_name = start_local_scheduler(
|
|
redis_address,
|
|
node_ip_address,
|
|
object_store_address.name,
|
|
object_store_address.manager_name,
|
|
worker_path,
|
|
plasma_address=plasma_address,
|
|
stdout_file=local_scheduler_stdout_file,
|
|
stderr_file=local_scheduler_stderr_file,
|
|
cleanup=cleanup,
|
|
resources=resources[i],
|
|
num_workers=num_local_scheduler_workers,
|
|
redis_password=redis_password)
|
|
local_scheduler_socket_names.append(local_scheduler_name)
|
|
|
|
# Make sure that we have exactly num_local_schedulers instances of
|
|
# object stores and local schedulers.
|
|
assert len(object_store_addresses) == num_local_schedulers
|
|
assert len(local_scheduler_socket_names) == num_local_schedulers
|
|
|
|
else:
|
|
# Start any raylets that do not exist yet.
|
|
for i in range(len(raylet_socket_names), num_local_schedulers):
|
|
raylet_stdout_file, raylet_stderr_file = new_raylet_log_file(
|
|
i, redirect_output=redirect_worker_output)
|
|
address_info["raylet_socket_names"].append(
|
|
start_raylet(
|
|
redis_address,
|
|
node_ip_address,
|
|
raylet_socket_name or get_raylet_socket_name(),
|
|
object_store_addresses[i].name,
|
|
worker_path,
|
|
resources=resources[i],
|
|
num_workers=workers_per_local_scheduler[i],
|
|
stdout_file=raylet_stdout_file,
|
|
stderr_file=raylet_stderr_file,
|
|
cleanup=cleanup,
|
|
redis_password=redis_password))
|
|
|
|
if not use_raylet:
|
|
# Start any workers that the local scheduler has not already started.
|
|
for i, num_local_scheduler_workers in enumerate(
|
|
workers_per_local_scheduler):
|
|
object_store_address = object_store_addresses[i]
|
|
local_scheduler_name = local_scheduler_socket_names[i]
|
|
for j in range(num_local_scheduler_workers):
|
|
worker_stdout_file, worker_stderr_file = new_worker_log_file(
|
|
i, j, redirect_output)
|
|
start_worker(
|
|
node_ip_address,
|
|
object_store_address.name,
|
|
object_store_address.manager_name,
|
|
local_scheduler_name,
|
|
redis_address,
|
|
worker_path,
|
|
stdout_file=worker_stdout_file,
|
|
stderr_file=worker_stderr_file,
|
|
cleanup=cleanup)
|
|
workers_per_local_scheduler[i] -= 1
|
|
|
|
# Make sure that we've started all the workers.
|
|
assert (sum(workers_per_local_scheduler) == 0)
|
|
|
|
# Try to start the web UI.
|
|
if include_webui:
|
|
ui_stdout_file, ui_stderr_file = new_webui_log_file()
|
|
address_info["webui_url"] = start_ui(
|
|
redis_address,
|
|
stdout_file=ui_stdout_file,
|
|
stderr_file=ui_stderr_file,
|
|
cleanup=cleanup)
|
|
else:
|
|
address_info["webui_url"] = ""
|
|
# Return the addresses of the relevant processes.
|
|
return address_info
|
|
|
|
|
|
def start_ray_node(node_ip_address,
|
|
redis_address,
|
|
object_manager_ports=None,
|
|
num_workers=0,
|
|
num_local_schedulers=1,
|
|
object_store_memory=None,
|
|
redis_password=None,
|
|
worker_path=None,
|
|
cleanup=True,
|
|
redirect_worker_output=False,
|
|
redirect_output=False,
|
|
resources=None,
|
|
plasma_directory=None,
|
|
huge_pages=False,
|
|
use_raylet=False,
|
|
plasma_store_socket_name=None,
|
|
raylet_socket_name=None,
|
|
temp_dir=None):
|
|
"""Start the Ray processes for a single node.
|
|
|
|
This assumes that the Ray processes on some master node have already been
|
|
started.
|
|
|
|
Args:
|
|
node_ip_address (str): The IP address of this node.
|
|
redis_address (str): The address of the Redis server.
|
|
object_manager_ports (list): A list of the ports to use for the object
|
|
managers. There should be one per object manager being started on
|
|
this node (typically just one).
|
|
num_workers (int): The number of workers to start.
|
|
num_local_schedulers (int): The number of local schedulers to start.
|
|
This is also the number of plasma stores and plasma managers to
|
|
start.
|
|
object_store_memory (int): The maximum amount of memory (in bytes) to
|
|
let the plasma store use.
|
|
redis_password (str): Prevents external clients without the password
|
|
from connecting to Redis if provided.
|
|
worker_path (str): The path of the source code that will be run by the
|
|
worker.
|
|
cleanup (bool): If cleanup is true, then the processes started here
|
|
will be killed by services.cleanup() when the Python process that
|
|
called this method exits.
|
|
redirect_worker_output: True if the stdout and stderr of worker
|
|
processes should be redirected to files.
|
|
redirect_output (bool): True if stdout and stderr for non-worker
|
|
processes should be redirected to files and false otherwise.
|
|
resources: A dictionary mapping resource name to the available quantity
|
|
of that resource.
|
|
plasma_directory: A directory where the Plasma memory mapped files will
|
|
be created.
|
|
huge_pages: Boolean flag indicating whether to start the Object
|
|
Store with hugetlbfs support. Requires plasma_directory.
|
|
use_raylet: True if the new raylet code path should be used. This is
|
|
not supported yet.
|
|
plasma_store_socket_name (str): If provided, it will specify the socket
|
|
name used by the plasma store.
|
|
raylet_socket_name (str): If provided, it will specify the socket path
|
|
used by the raylet process.
|
|
temp_dir (str): If provided, it will specify the root temporary
|
|
directory for the Ray process.
|
|
|
|
Returns:
|
|
A dictionary of the address information for the processes that were
|
|
started.
|
|
"""
|
|
address_info = {
|
|
"redis_address": redis_address,
|
|
"object_manager_ports": object_manager_ports
|
|
}
|
|
return start_ray_processes(
|
|
address_info=address_info,
|
|
node_ip_address=node_ip_address,
|
|
num_workers=num_workers,
|
|
num_local_schedulers=num_local_schedulers,
|
|
object_store_memory=object_store_memory,
|
|
redis_password=redis_password,
|
|
worker_path=worker_path,
|
|
include_log_monitor=True,
|
|
cleanup=cleanup,
|
|
redirect_worker_output=redirect_worker_output,
|
|
redirect_output=redirect_output,
|
|
resources=resources,
|
|
plasma_directory=plasma_directory,
|
|
huge_pages=huge_pages,
|
|
use_raylet=use_raylet,
|
|
plasma_store_socket_name=plasma_store_socket_name,
|
|
raylet_socket_name=raylet_socket_name,
|
|
temp_dir=temp_dir)
|
|
|
|
|
|
def start_ray_head(address_info=None,
|
|
node_ip_address="127.0.0.1",
|
|
redis_port=None,
|
|
redis_shard_ports=None,
|
|
num_workers=0,
|
|
num_local_schedulers=1,
|
|
object_store_memory=None,
|
|
worker_path=None,
|
|
cleanup=True,
|
|
redirect_worker_output=False,
|
|
redirect_output=False,
|
|
start_workers_from_local_scheduler=True,
|
|
resources=None,
|
|
num_redis_shards=None,
|
|
redis_max_clients=None,
|
|
redis_protected_mode=False,
|
|
redis_password=None,
|
|
include_webui=True,
|
|
plasma_directory=None,
|
|
huge_pages=False,
|
|
autoscaling_config=None,
|
|
use_raylet=False,
|
|
plasma_store_socket_name=None,
|
|
raylet_socket_name=None,
|
|
temp_dir=None):
|
|
"""Start Ray in local mode.
|
|
|
|
Args:
|
|
address_info (dict): A dictionary with address information for
|
|
processes that have already been started. If provided, address_info
|
|
will be modified to include processes that are newly started.
|
|
node_ip_address (str): The IP address of this node.
|
|
redis_port (int): The port that the primary Redis shard should listen
|
|
to. If None, then a random port will be chosen. If the key
|
|
"redis_address" is in address_info, then this argument will be
|
|
ignored.
|
|
redis_shard_ports: A list of the ports to use for the non-primary Redis
|
|
shards.
|
|
num_workers (int): The number of workers to start.
|
|
num_local_schedulers (int): The total number of local schedulers
|
|
required. This is also the total number of object stores required.
|
|
This method will start new instances of local schedulers and object
|
|
stores until there are at least num_local_schedulers existing
|
|
instances of each, including ones already registered with the given
|
|
address_info.
|
|
object_store_memory: The amount of memory (in bytes) to start the
|
|
object store with.
|
|
worker_path (str): The path of the source code that will be run by the
|
|
worker.
|
|
cleanup (bool): If cleanup is true, then the processes started here
|
|
will be killed by services.cleanup() when the Python process that
|
|
called this method exits.
|
|
redirect_worker_output: True if the stdout and stderr of worker
|
|
processes should be redirected to files.
|
|
redirect_output (bool): True if stdout and stderr for non-worker
|
|
processes should be redirected to files and false otherwise.
|
|
start_workers_from_local_scheduler (bool): If this flag is True, then
|
|
start the initial workers from the local scheduler. Else, start
|
|
them from Python.
|
|
resources: A dictionary mapping resource name to the available quantity
|
|
of that resource.
|
|
num_redis_shards: The number of Redis shards to start in addition to
|
|
the primary Redis shard.
|
|
redis_max_clients: If provided, attempt to configure Redis with this
|
|
maxclients number.
|
|
redis_protected_mode: True if we should start Redis in protected mode.
|
|
This will prevent clients from other machines from connecting and
|
|
is only done when Redis is started via ray.init().
|
|
redis_password (str): Prevents external clients without the password
|
|
from connecting to Redis if provided.
|
|
include_webui: True if the UI should be started and false otherwise.
|
|
plasma_directory: A directory where the Plasma memory mapped files will
|
|
be created.
|
|
huge_pages: Boolean flag indicating whether to start the Object
|
|
Store with hugetlbfs support. Requires plasma_directory.
|
|
autoscaling_config: path to autoscaling config file.
|
|
use_raylet: True if the new raylet code path should be used. This is
|
|
not supported yet.
|
|
plasma_store_socket_name (str): If provided, it will specify the socket
|
|
name used by the plasma store.
|
|
raylet_socket_name (str): If provided, it will specify the socket path
|
|
used by the raylet process.
|
|
temp_dir (str): If provided, it will specify the root temporary
|
|
directory for the Ray process.
|
|
|
|
Returns:
|
|
A dictionary of the address information for the processes that were
|
|
started.
|
|
"""
|
|
num_redis_shards = 1 if num_redis_shards is None else num_redis_shards
|
|
return start_ray_processes(
|
|
address_info=address_info,
|
|
node_ip_address=node_ip_address,
|
|
redis_port=redis_port,
|
|
redis_shard_ports=redis_shard_ports,
|
|
num_workers=num_workers,
|
|
num_local_schedulers=num_local_schedulers,
|
|
object_store_memory=object_store_memory,
|
|
worker_path=worker_path,
|
|
cleanup=cleanup,
|
|
redirect_worker_output=redirect_worker_output,
|
|
redirect_output=redirect_output,
|
|
include_global_scheduler=True,
|
|
include_log_monitor=True,
|
|
include_webui=include_webui,
|
|
start_workers_from_local_scheduler=start_workers_from_local_scheduler,
|
|
resources=resources,
|
|
num_redis_shards=num_redis_shards,
|
|
redis_max_clients=redis_max_clients,
|
|
redis_protected_mode=redis_protected_mode,
|
|
redis_password=redis_password,
|
|
plasma_directory=plasma_directory,
|
|
huge_pages=huge_pages,
|
|
autoscaling_config=autoscaling_config,
|
|
use_raylet=use_raylet,
|
|
plasma_store_socket_name=plasma_store_socket_name,
|
|
raylet_socket_name=raylet_socket_name,
|
|
temp_dir=temp_dir)
|