mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 18:44:07 +08:00
Fixes and cleanups for the multinode setting. (#143)
* Add function for driver to get address info from Redis. * Use Redis address instead of Redis port. * Configure Redis to run in unprotected mode. * Add method for starting Ray processes on non-head node. * Pass in correct node ip address to start_plasma_manager. * Script for starting Ray processes. * Handle the case where an object already exists in the store. Maybe this should also compare the object hashes. * Have driver get info from Redis when start_ray_local=False. * Fix. * Script for killing ray processes. * Catch some errors when the main_loop in a worker throws an exception. * Allow redirecting stdout and stderr to /dev/null. * Wrap start_ray.py in a shell script. * More helpful error messages. * Fixes. * Wait for redis server to start up before configuring it. * Allow seeding of deterministic object ID generation. * Small change.
This commit is contained in:
committed by
Philipp Moritz
parent
c9c1b3e6af
commit
6cd02d71f8
+148
-37
@@ -5,6 +5,7 @@ from __future__ import print_function
|
||||
import psutil
|
||||
import os
|
||||
import random
|
||||
import redis
|
||||
import signal
|
||||
import string
|
||||
import subprocess
|
||||
@@ -62,7 +63,8 @@ def cleanup():
|
||||
continue
|
||||
successfully_shut_down = False
|
||||
if successfully_shut_down:
|
||||
print("Successfully shut down Ray.")
|
||||
if len(all_processes) > 0:
|
||||
print("Successfully shut down Ray.")
|
||||
else:
|
||||
print("Ray did not shut down properly.")
|
||||
all_processes = []
|
||||
@@ -70,7 +72,7 @@ def cleanup():
|
||||
def all_processes_alive():
|
||||
return all([p.poll() is None for p in all_processes])
|
||||
|
||||
def start_redis(num_retries=20, cleanup=True):
|
||||
def start_redis(num_retries=20, cleanup=True, redirect_output=False):
|
||||
"""Start a Redis server.
|
||||
|
||||
Args:
|
||||
@@ -78,6 +80,8 @@ def start_redis(num_retries=20, cleanup=True):
|
||||
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
|
||||
this process will be killed by serices.cleanup() when the Python process
|
||||
that imported services exits.
|
||||
redirect_output (bool): True if stdout and stderr should be redirected to
|
||||
/dev/null.
|
||||
|
||||
Returns:
|
||||
The port used by Redis.
|
||||
@@ -94,18 +98,48 @@ def start_redis(num_retries=20, cleanup=True):
|
||||
if counter > 0:
|
||||
print("Redis failed to start, retrying now.")
|
||||
port = new_port()
|
||||
p = subprocess.Popen([redis_filepath, "--port", str(port), "--loglevel", "warning", "--loadmodule", redis_module])
|
||||
with open(os.devnull, "w") as FNULL:
|
||||
stdout = FNULL if redirect_output else None
|
||||
stderr = FNULL if redirect_output else None
|
||||
p = subprocess.Popen([redis_filepath, "--port", str(port), "--loglevel", "warning", "--loadmodule", redis_module], stdout=stdout, stderr=stderr)
|
||||
time.sleep(0.1)
|
||||
# Check if Redis successfully started (or at least if it the executable did
|
||||
# not exit within 0.1 seconds).
|
||||
if p.poll() is None:
|
||||
if cleanup:
|
||||
all_processes.append(p)
|
||||
return port
|
||||
break
|
||||
counter += 1
|
||||
raise Exception("Couldn't start Redis.")
|
||||
if counter == num_retries:
|
||||
raise Exception("Couldn't start Redis.")
|
||||
|
||||
def start_global_scheduler(redis_address, cleanup=True):
|
||||
# Create a Redis client just for configuring Redis.
|
||||
redis_client = redis.StrictRedis(host="127.0.0.1", port=port)
|
||||
|
||||
# Wait for the Redis server to start.
|
||||
counter = 0
|
||||
while counter < num_retries:
|
||||
try:
|
||||
# Run some random command and see if it worked.
|
||||
redis_client.client_list()
|
||||
except redis.ConnectionError as e:
|
||||
# Wait a little bit.
|
||||
time.sleep(1)
|
||||
counter += 1
|
||||
else:
|
||||
break
|
||||
if counter == num_retries:
|
||||
raise Exception("The Redis server did not start properly.")
|
||||
|
||||
# Configure Redis to generate keyspace notifications. TODO(rkn): Change this
|
||||
# to only generate notifications for the export keys.
|
||||
redis_client.config_set("notify-keyspace-events", "Kl")
|
||||
# Configure Redis to not run in protected mode so that processes on other
|
||||
# hosts can connect to it. TODO(rkn): Do this in a more secure way.
|
||||
redis_client.config_set("protected-mode", "no")
|
||||
return port
|
||||
|
||||
def start_global_scheduler(redis_address, cleanup=True, redirect_output=False):
|
||||
"""Start a global scheduler process.
|
||||
|
||||
Args:
|
||||
@@ -113,12 +147,14 @@ def start_global_scheduler(redis_address, cleanup=True):
|
||||
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
|
||||
this process will be killed by serices.cleanup() when the Python process
|
||||
that imported services exits.
|
||||
redirect_output (bool): True if stdout and stderr should be redirected to
|
||||
/dev/null.
|
||||
"""
|
||||
p = global_scheduler.start_global_scheduler(redis_address)
|
||||
p = global_scheduler.start_global_scheduler(redis_address, redirect_output=redirect_output)
|
||||
if cleanup:
|
||||
all_processes.append(p)
|
||||
|
||||
def start_local_scheduler(redis_address, node_ip_address, plasma_store_name, plasma_manager_name, plasma_address=None, cleanup=True):
|
||||
def start_local_scheduler(redis_address, node_ip_address, plasma_store_name, plasma_manager_name, plasma_address=None, cleanup=True, redirect_output=False):
|
||||
"""Start a local scheduler process.
|
||||
|
||||
Args:
|
||||
@@ -131,24 +167,28 @@ def start_local_scheduler(redis_address, node_ip_address, plasma_store_name, pla
|
||||
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
|
||||
this process will be killed by serices.cleanup() when the Python process
|
||||
that imported services exits.
|
||||
redirect_output (bool): True if stdout and stderr should be redirected to
|
||||
/dev/null.
|
||||
|
||||
Return:
|
||||
The name of the local scheduler socket.
|
||||
"""
|
||||
local_scheduler_name, p = photon.start_local_scheduler(plasma_store_name, plasma_manager_name, node_ip_address=node_ip_address, redis_address=redis_address, plasma_address=plasma_address, use_profiler=RUN_PHOTON_PROFILER)
|
||||
local_scheduler_name, p = photon.start_local_scheduler(plasma_store_name, plasma_manager_name, node_ip_address=node_ip_address, redis_address=redis_address, plasma_address=plasma_address, use_profiler=RUN_PHOTON_PROFILER, redirect_output=redirect_output)
|
||||
if cleanup:
|
||||
all_processes.append(p)
|
||||
return local_scheduler_name
|
||||
|
||||
def start_objstore(node_ip_address, redis_address, cleanup=True):
|
||||
def start_objstore(node_ip_address, redis_address, cleanup=True, redirect_output=False):
|
||||
"""This method starts an object store process.
|
||||
|
||||
Args:
|
||||
node_ip_address (str): The ip address of the node running the object store.
|
||||
node_ip_address (str): The IP address of the node running the object store.
|
||||
redis_address (str): The address of the Redis instance to connect to.
|
||||
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
|
||||
this process will be killed by serices.cleanup() when the Python process
|
||||
that imported services exits.
|
||||
redirect_output (bool): True if stdout and stderr should be redirected to
|
||||
/dev/null.
|
||||
|
||||
Return:
|
||||
A tuple of the Plasma store socket name, the Plasma manager socket name, and
|
||||
@@ -164,16 +204,16 @@ def start_objstore(node_ip_address, redis_address, cleanup=True):
|
||||
else:
|
||||
plasma_store_memory = int(system_memory * 0.75)
|
||||
# Start the Plasma store.
|
||||
plasma_store_name, p1 = plasma.start_plasma_store(plasma_store_memory=plasma_store_memory, use_profiler=RUN_PLASMA_STORE_PROFILER)
|
||||
plasma_store_name, p1 = plasma.start_plasma_store(plasma_store_memory=plasma_store_memory, use_profiler=RUN_PLASMA_STORE_PROFILER, redirect_output=redirect_output)
|
||||
# Start the plasma manager.
|
||||
plasma_manager_name, p2, plasma_manager_port = plasma.start_plasma_manager(plasma_store_name, redis_address, run_profiler=RUN_PLASMA_MANAGER_PROFILER)
|
||||
plasma_manager_name, p2, plasma_manager_port = plasma.start_plasma_manager(plasma_store_name, redis_address, node_ip_address=node_ip_address, run_profiler=RUN_PLASMA_MANAGER_PROFILER, redirect_output=redirect_output)
|
||||
if cleanup:
|
||||
all_processes.append(p1)
|
||||
all_processes.append(p2)
|
||||
|
||||
return plasma_store_name, plasma_manager_name, plasma_manager_port
|
||||
|
||||
def start_worker(node_ip_address, object_store_name, object_store_manager_name, local_scheduler_name, redis_port, worker_path, cleanup=True):
|
||||
def start_worker(node_ip_address, object_store_name, object_store_manager_name, local_scheduler_name, redis_address, worker_path, cleanup=True, redirect_output=False):
|
||||
"""This method starts a worker process.
|
||||
|
||||
Args:
|
||||
@@ -182,12 +222,14 @@ def start_worker(node_ip_address, object_store_name, object_store_manager_name,
|
||||
object_store_name (str): The name of the object store.
|
||||
object_store_manager_name (str): The name of the object store manager.
|
||||
local_scheduler_name (str): The name of the local scheduler.
|
||||
redis_port (int): The port that the Redis server is listening on.
|
||||
redis_address (int): The address that the Redis server is listening on.
|
||||
worker_path (str): The path of the source code which the worker process will
|
||||
run.
|
||||
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
|
||||
this process will be killed by services.cleanup() when the Python process
|
||||
that imported services exits. This is True by default.
|
||||
redirect_output (bool): True if stdout and stderr should be redirected to
|
||||
/dev/null.
|
||||
"""
|
||||
command = ["python",
|
||||
worker_path,
|
||||
@@ -195,12 +237,15 @@ def start_worker(node_ip_address, object_store_name, object_store_manager_name,
|
||||
"--object-store-name=" + object_store_name,
|
||||
"--object-store-manager-name=" + object_store_manager_name,
|
||||
"--local-scheduler-name=" + local_scheduler_name,
|
||||
"--redis-port=" + str(redis_port)]
|
||||
p = subprocess.Popen(command)
|
||||
"--redis-address=" + str(redis_address)]
|
||||
with open(os.devnull, "w") as FNULL:
|
||||
stdout = FNULL if redirect_output else None
|
||||
stderr = FNULL if redirect_output else None
|
||||
p = subprocess.Popen(command, stdout=stdout, stderr=stderr)
|
||||
if cleanup:
|
||||
all_processes.append(p)
|
||||
|
||||
def start_webui(redis_port, cleanup=True):
|
||||
def start_webui(redis_port, cleanup=True, redirect_output=False):
|
||||
"""This method starts the web interface.
|
||||
|
||||
Args:
|
||||
@@ -208,53 +253,57 @@ def start_webui(redis_port, cleanup=True):
|
||||
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
|
||||
this process will be killed by services.cleanup() when the Python process
|
||||
that imported services exits. This is True by default.
|
||||
redirect_output (bool): True if stdout and stderr should be redirected to
|
||||
/dev/null.
|
||||
"""
|
||||
executable = "nodejs" if sys.platform == "linux" or sys.platform == "linux2" else "node"
|
||||
command = [executable, os.path.join(os.path.abspath(os.path.dirname(__file__)), "../webui/index.js"), str(redis_port)]
|
||||
with open("/tmp/webui_out.txt", "wb") as out:
|
||||
p = subprocess.Popen(command, stdout=out)
|
||||
with open(os.devnull, "w") as FNULL:
|
||||
stdout = FNULL if redirect_output else out
|
||||
stderr = FNULL if redirect_output else None
|
||||
p = subprocess.Popen(command, stdout=stdout, stderr=stderr)
|
||||
if cleanup:
|
||||
all_processes.append(p)
|
||||
|
||||
def start_ray_local(node_ip_address="127.0.0.1", num_workers=0, num_local_schedulers=1, worker_path=None):
|
||||
"""Start Ray in local mode.
|
||||
def start_ray_node(node_ip_address, redis_address, num_workers=0, num_local_schedulers=1, worker_path=None, cleanup=True, redirect_output=False):
|
||||
"""Start the Ray processes for a single node.
|
||||
|
||||
This assumes that the Ray processes on some master node have already been
|
||||
started.
|
||||
|
||||
Args:
|
||||
node_ip_address (str): The IP address of this node.
|
||||
redis_address (str): The address of the Redis server.
|
||||
num_workers (int): The number of workers to start.
|
||||
num_local_schedulers (int): The number of local schedulers to start. This is
|
||||
also the number of plasma stores and plasma managers to start.
|
||||
worker_path (str): The path of the source code that will be run by the
|
||||
worker.
|
||||
|
||||
Returns:
|
||||
This returns a dictionary of the address information for the processes that
|
||||
were started.
|
||||
cleanup (bool): If cleanup is true, then the processes started here will be
|
||||
killed by services.cleanup() when the Python process that called this
|
||||
method exits.
|
||||
redirect_output (bool): True if stdout and stderr should be redirected to
|
||||
/dev/null.
|
||||
"""
|
||||
if worker_path is None:
|
||||
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "workers/default_worker.py")
|
||||
# Start Redis.
|
||||
redis_port = start_redis(cleanup=True)
|
||||
redis_address = address(node_ip_address, redis_port)
|
||||
time.sleep(0.1)
|
||||
# Start the global scheduler.
|
||||
start_global_scheduler(redis_address, cleanup=True)
|
||||
object_store_names = []
|
||||
object_store_manager_names = []
|
||||
local_scheduler_names = []
|
||||
for _ in range(num_local_schedulers):
|
||||
# Start Plasma.
|
||||
object_store_name, object_store_manager_name, object_store_manager_port = start_objstore(node_ip_address, redis_address, cleanup=True)
|
||||
object_store_name, object_store_manager_name, object_store_manager_port = start_objstore(node_ip_address, redis_address, cleanup=cleanup, redirect_output=redirect_output)
|
||||
object_store_names.append(object_store_name)
|
||||
object_store_manager_names.append(object_store_manager_name)
|
||||
time.sleep(0.1)
|
||||
# Start the local scheduler.
|
||||
plasma_address = "{}:{}".format(node_ip_address, object_store_manager_port)
|
||||
local_scheduler_name = start_local_scheduler(redis_address, node_ip_address, object_store_name, object_store_manager_name, plasma_address=plasma_address, cleanup=True)
|
||||
local_scheduler_name = start_local_scheduler(redis_address, node_ip_address, object_store_name, object_store_manager_name, plasma_address=plasma_address, cleanup=cleanup, redirect_output=redirect_output)
|
||||
local_scheduler_names.append(local_scheduler_name)
|
||||
time.sleep(0.1)
|
||||
# Aggregate the address information together.
|
||||
address_info = {"node_ip_address": node_ip_address,
|
||||
"redis_port": redis_port,
|
||||
"object_store_names": object_store_names,
|
||||
"object_store_manager_names": object_store_manager_names,
|
||||
"local_scheduler_names": local_scheduler_names}
|
||||
@@ -264,9 +313,71 @@ def start_ray_local(node_ip_address="127.0.0.1", num_workers=0, num_local_schedu
|
||||
address_info["object_store_names"][i % num_local_schedulers],
|
||||
address_info["object_store_manager_names"][i % num_local_schedulers],
|
||||
address_info["local_scheduler_names"][i % num_local_schedulers],
|
||||
redis_port,
|
||||
redis_address,
|
||||
worker_path,
|
||||
cleanup=True)
|
||||
cleanup=cleanup,
|
||||
redirect_output=redirect_output)
|
||||
# Return the addresses of the relevant processes.
|
||||
start_webui(redis_port)
|
||||
return address_info
|
||||
|
||||
def start_ray_local(node_ip_address="127.0.0.1", num_workers=0, num_local_schedulers=1, worker_path=None, cleanup=True, redirect_output=False):
|
||||
"""Start Ray in local mode.
|
||||
|
||||
Args:
|
||||
node_ip_address (str): The IP address of this node.
|
||||
num_workers (int): The number of workers to start.
|
||||
num_local_schedulers (int): The number of local schedulers to start. This is
|
||||
also the number of plasma stores and plasma managers to start.
|
||||
worker_path (str): The path of the source code that will be run by the
|
||||
worker.
|
||||
cleanup (bool): If cleanup is true, then the processes started here will be
|
||||
killed by services.cleanup() when the Python process that called this
|
||||
method exits.
|
||||
redirect_output (bool): True if stdout and stderr should be redirected to
|
||||
/dev/null.
|
||||
|
||||
Returns:
|
||||
This returns a dictionary of the address information for the processes that
|
||||
were started.
|
||||
"""
|
||||
if worker_path is None:
|
||||
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "workers/default_worker.py")
|
||||
# Start Redis.
|
||||
redis_port = start_redis(cleanup=cleanup, redirect_output=redirect_output)
|
||||
redis_address = address(node_ip_address, redis_port)
|
||||
time.sleep(0.1)
|
||||
# Start the global scheduler.
|
||||
start_global_scheduler(redis_address, cleanup=cleanup, redirect_output=redirect_output)
|
||||
object_store_names = []
|
||||
object_store_manager_names = []
|
||||
local_scheduler_names = []
|
||||
for _ in range(num_local_schedulers):
|
||||
# Start Plasma.
|
||||
object_store_name, object_store_manager_name, object_store_manager_port = start_objstore(node_ip_address, redis_address, cleanup=cleanup, redirect_output=redirect_output)
|
||||
object_store_names.append(object_store_name)
|
||||
object_store_manager_names.append(object_store_manager_name)
|
||||
time.sleep(0.1)
|
||||
# Start the local scheduler.
|
||||
plasma_address = "{}:{}".format(node_ip_address, object_store_manager_port)
|
||||
local_scheduler_name = start_local_scheduler(redis_address, node_ip_address, object_store_name, object_store_manager_name, plasma_address=plasma_address, cleanup=cleanup, redirect_output=redirect_output)
|
||||
local_scheduler_names.append(local_scheduler_name)
|
||||
time.sleep(0.1)
|
||||
# Aggregate the address information together.
|
||||
address_info = {"node_ip_address": node_ip_address,
|
||||
"redis_address": redis_address,
|
||||
"object_store_names": object_store_names,
|
||||
"object_store_manager_names": object_store_manager_names,
|
||||
"local_scheduler_names": local_scheduler_names}
|
||||
# Start the workers.
|
||||
for i in range(num_workers):
|
||||
start_worker(address_info["node_ip_address"],
|
||||
address_info["object_store_names"][i % num_local_schedulers],
|
||||
address_info["object_store_manager_names"][i % num_local_schedulers],
|
||||
address_info["local_scheduler_names"][i % num_local_schedulers],
|
||||
redis_address,
|
||||
worker_path,
|
||||
cleanup=cleanup,
|
||||
redirect_output=redirect_output)
|
||||
# Return the addresses of the relevant processes.
|
||||
start_webui(redis_port, cleanup=cleanup, redirect_output=redirect_output)
|
||||
return address_info
|
||||
|
||||
+130
-30
@@ -419,7 +419,16 @@ class Worker(object):
|
||||
# Serialize and put the object in the object store.
|
||||
schema, size, serialized = numbuf_serialize(value)
|
||||
size = size + 4096 * 4 + 8 # The last 8 bytes are for the metadata offset. This is temporary.
|
||||
buff = self.plasma_client.create(objectid.id(), size, bytearray(schema))
|
||||
try:
|
||||
buff = self.plasma_client.create(objectid.id(), size, bytearray(schema))
|
||||
except RuntimeError as e:
|
||||
if e.args != ("an object with this ID could not be created",):
|
||||
raise
|
||||
# The object already exists in the object store, so there is no need to
|
||||
# add it again. TODO(rkn): We need to compare the hashes and make sure
|
||||
# that the objects are in fact the same.
|
||||
print("This object already exists in the object store.")
|
||||
return
|
||||
data = np.frombuffer(buff.buffer, dtype="byte")[8:]
|
||||
metadata_offset = numbuf.write_to_buffer(serialized, memoryview(data))
|
||||
np.frombuffer(buff.buffer, dtype="int64", count=1)[0] = metadata_offset
|
||||
@@ -617,7 +626,52 @@ def initialize_numbuf(worker=global_worker):
|
||||
register_class(RayGetError)
|
||||
register_class(RayGetArgumentError)
|
||||
|
||||
def init(start_ray_local=False, num_workers=None, num_local_schedulers=1, driver_mode=SCRIPT_MODE):
|
||||
def get_address_info_from_redis_helper(redis_address, node_ip_address):
|
||||
redis_host, redis_port = redis_address.split(":")
|
||||
# For this command to work, some other client (on the same machine as Redis)
|
||||
# must have run "CONFIG SET protected-mode no".
|
||||
redis_client = redis.StrictRedis(host=redis_host, port=int(redis_port))
|
||||
# The client table prefix must be kept in sync with the file
|
||||
# "src/common/redis_module/ray_redis_module.c" where it is defined.
|
||||
REDIS_CLIENT_TABLE_PREFIX = "CL:"
|
||||
client_keys = redis_client.keys("{}*".format(REDIS_CLIENT_TABLE_PREFIX))
|
||||
# Filter to clients on the same node and do some basic checking.
|
||||
plasma_managers = []
|
||||
local_schedulers = []
|
||||
for key in client_keys:
|
||||
info = redis_client.hgetall(key)
|
||||
assert b"ray_client_id" in info
|
||||
assert b"node_ip_address" in info
|
||||
assert b"client_type" in info
|
||||
if info[b"node_ip_address"].decode("ascii") == node_ip_address:
|
||||
if info[b"client_type"].decode("ascii") == "plasma_manager":
|
||||
plasma_managers.append(info)
|
||||
elif info[b"client_type"].decode("ascii") == "photon":
|
||||
local_schedulers.append(info)
|
||||
# Make sure that we got at one plasma manager and local scheduler.
|
||||
assert len(plasma_managers) == 1
|
||||
assert len(local_schedulers) == 1
|
||||
client_info = {"node_ip_address": node_ip_address,
|
||||
"redis_address": redis_address,
|
||||
"store_socket_name": plasma_managers[0][b"store_socket_name"].decode("ascii"),
|
||||
"manager_socket_name": plasma_managers[0][b"manager_socket_name"].decode("ascii"),
|
||||
"local_scheduler_socket_name": local_schedulers[0][b"local_scheduler_socket_name"].decode("ascii")}
|
||||
return client_info
|
||||
|
||||
def get_address_info_from_redis(redis_address, node_ip_address, num_retries=10):
|
||||
counter = 0
|
||||
while True:
|
||||
try:
|
||||
return get_address_info_from_redis_helper(redis_address, node_ip_address)
|
||||
except Exception as e:
|
||||
if counter == num_retries:
|
||||
raise
|
||||
# Some of the information may not be in Redis yet, so wait a little bit.
|
||||
print("Some processes that the driver needs to connect to have not registered with Redis, so retrying.")
|
||||
time.sleep(1)
|
||||
counter += 1
|
||||
|
||||
def init(node_ip_address="127.0.0.1", redis_address=None, start_ray_local=False, object_id_seed=None, num_workers=None, num_local_schedulers=None, driver_mode=SCRIPT_MODE):
|
||||
"""Either connect to an existing Ray cluster or start one and connect to it.
|
||||
|
||||
This method handles two cases. Either a Ray cluster already exists and we
|
||||
@@ -625,44 +679,72 @@ def init(start_ray_local=False, num_workers=None, num_local_schedulers=1, driver
|
||||
with a Ray cluster and attach to the newly started cluster.
|
||||
|
||||
Args:
|
||||
start_ray_local (Optional[bool]): If True then this will start a scheduler
|
||||
an object store, and some workers. If False, this will attach to an
|
||||
existing Ray cluster.
|
||||
num_workers (Optional[int]): The number of workers to start if
|
||||
node_ip_address (str): The IP address of the node that we are on.
|
||||
redis_address (str): The address of the Redis server to connect to. This
|
||||
should only be provided if start_ray_local is False.
|
||||
start_ray_local (bool): If True then this will start Redis, a global
|
||||
scheduler, a local scheduler, a plasma store, a plasma manager, and some
|
||||
workers. It will also kill these processes when Python exits. If False,
|
||||
this will attach to an existing Ray cluster.
|
||||
object_id_seed (int): Used to seed the deterministic generation of object
|
||||
IDs. The same value can be used across multiple runs of the same job in
|
||||
order to generate the object IDs in a consistent manner. However, the same
|
||||
ID should not be used for different jobs.
|
||||
num_workers (int): The number of workers to start. This is only provided if
|
||||
start_ray_local is True.
|
||||
num_local_schedulers (Optional[int]): The number of local schedulers to
|
||||
start if start_ray_local is True.
|
||||
driver_mode (Optional[bool]): The mode in which to start the driver. This
|
||||
should be one of SCRIPT_MODE, PYTHON_MODE, and SILENT_MODE.
|
||||
num_local_schedulers (int): The number of local schedulers to start. This is
|
||||
only provided if start_ray_local is True.
|
||||
driver_mode (bool): The mode in which to start the driver. This should be
|
||||
one of ray.SCRIPT_MODE, ray.PYTHON_MODE, and ray.SILENT_MODE.
|
||||
|
||||
Returns:
|
||||
The address of the Redis server.
|
||||
Address information about the started processes.
|
||||
|
||||
Raises:
|
||||
Exception: An exception is raised if an inappropriate combination of
|
||||
arguments is passed in.
|
||||
"""
|
||||
check_main_thread()
|
||||
if driver_mode not in [SCRIPT_MODE, PYTHON_MODE, SILENT_MODE]:
|
||||
raise Exception("Driver_mode must be in [ray.SCRIPT_MODE, ray.PYTHON_MODE, ray.SILENT_MODE].")
|
||||
if driver_mode == PYTHON_MODE:
|
||||
# If starting Ray in PYTHON_MODE, don't start any other processes.
|
||||
address_info = {}
|
||||
info = {}
|
||||
elif start_ray_local:
|
||||
# In this case, we launch a scheduler, a new object store, and some workers,
|
||||
# and we connect to them.
|
||||
if driver_mode not in [SCRIPT_MODE, PYTHON_MODE, SILENT_MODE]:
|
||||
raise Exception("If start_ray_local=True, then driver_mode must be in [ray.SCRIPT_MODE, ray.PYTHON_MODE, ray.SILENT_MODE].")
|
||||
if redis_address is not None:
|
||||
raise Exception("If start_ray_local=True, then redis_address cannot be provided because ray.init will start a new Redis server.")
|
||||
# Use the address 127.0.0.1 in local mode.
|
||||
node_ip_address = "127.0.0.1" if node_ip_address is None else node_ip_address
|
||||
# Use 1 worker if num_workers is not provided.
|
||||
num_workers = 1 if num_workers is None else num_workers
|
||||
# Use 1 local scheduler if num_local_schedulers is not provided.
|
||||
num_local_schedulers = 1 if num_local_schedulers is None else num_local_schedulers
|
||||
# Start the scheduler, object store, and some workers. These will be killed
|
||||
# by the call to cleanup(), which happens when the Python script exits.
|
||||
address_info = services.start_ray_local(num_workers=num_workers, num_local_schedulers=num_local_schedulers)
|
||||
address_info = services.start_ray_local(node_ip_address=node_ip_address, num_workers=num_workers, num_local_schedulers=num_local_schedulers)
|
||||
info = {"node_ip_address": node_ip_address,
|
||||
"redis_address": address_info["redis_address"],
|
||||
"store_socket_name": address_info["object_store_names"][0],
|
||||
"manager_socket_name": address_info["object_store_manager_names"][0],
|
||||
"local_scheduler_socket_name": address_info["local_scheduler_names"][0]}
|
||||
else:
|
||||
raise Exception("This mode is currently not enabled.")
|
||||
if redis_address is None:
|
||||
raise Exception("If start_ray_local=False, then redis_address must be provided.")
|
||||
if node_ip_address is None:
|
||||
raise Exception("If start_ray_local=False, then node_ip_address must be provided.")
|
||||
if num_workers is not None:
|
||||
raise Exception("If start_ray_local=False, then num_workers must not be provided.")
|
||||
if num_local_schedulers is not None:
|
||||
raise Exception("If start_ray_local=False, then num_local_schedulers must not be provided.")
|
||||
# Get the address info of the processes to connect to from Redis.
|
||||
info = get_address_info_from_redis(redis_address, node_ip_address)
|
||||
# Connect this driver to Redis, the object store, and the local scheduler. The
|
||||
# corresponing call to disconnect will happen in the call to cleanup() when
|
||||
# the Python script exits.
|
||||
connect(address_info, driver_mode, worker=global_worker)
|
||||
return address_info
|
||||
connect(info, object_id_seed=object_id_seed, mode=driver_mode, worker=global_worker)
|
||||
return info
|
||||
|
||||
def cleanup(worker=global_worker):
|
||||
"""Disconnect the driver, and terminate any processes started in init.
|
||||
@@ -828,17 +910,23 @@ def import_thread(worker):
|
||||
worker.redis_client.hincrby(worker_info_key, "export_counter", 1)
|
||||
worker.worker_import_counter += 1
|
||||
|
||||
def connect(address_info, mode=WORKER_MODE, worker=global_worker):
|
||||
"""Connect this worker to the scheduler and an object store.
|
||||
def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
|
||||
"""Connect this worker to the local scheduler, to Plasma, and to Redis.
|
||||
|
||||
Args:
|
||||
address_info (dict): This contains the entries node_ip_address,
|
||||
redis_address, object_store_name, object_store_manager_name, and
|
||||
local_scheduler_name.
|
||||
info (dict): A dictionary with address of the Redis server and the sockets
|
||||
of the plasma store, plasma manager, and local scheduler.
|
||||
mode: The mode of the worker. One of SCRIPT_MODE, WORKER_MODE, PYTHON_MODE,
|
||||
and SILENT_MODE.
|
||||
"""
|
||||
check_main_thread()
|
||||
# Do some basic checking to make sure we didn't call ray.init twice.
|
||||
error_message = "Perhaps you called ray.init twice by accident?"
|
||||
assert not worker.connected, error_message
|
||||
assert worker.cached_functions_to_run is not None, error_message
|
||||
assert worker.cached_remote_functions is not None, error_message
|
||||
assert reusables._cached_reusables is not None, error_message
|
||||
# Initialize some fields.
|
||||
worker.worker_id = random_string()
|
||||
worker.connected = True
|
||||
worker.set_mode(mode)
|
||||
@@ -847,13 +935,13 @@ def connect(address_info, mode=WORKER_MODE, worker=global_worker):
|
||||
if mode == PYTHON_MODE:
|
||||
return
|
||||
# Create a Redis client.
|
||||
worker.redis_client = redis.StrictRedis(host=address_info["node_ip_address"], port=address_info["redis_port"])
|
||||
worker.redis_client.config_set("notify-keyspace-events", "AKE")
|
||||
redis_host, redis_port = info["redis_address"].split(":")
|
||||
worker.redis_client = redis.StrictRedis(host=redis_host, port=int(redis_port))
|
||||
worker.lock = threading.Lock()
|
||||
# Create an object store client.
|
||||
worker.plasma_client = plasma.PlasmaClient(address_info["object_store_names"][0], address_info["object_store_manager_names"][0])
|
||||
worker.plasma_client = plasma.PlasmaClient(info["store_socket_name"], info["manager_socket_name"])
|
||||
# Create the local scheduler client.
|
||||
worker.photon_client = photon.PhotonClient(address_info["local_scheduler_names"][0])
|
||||
worker.photon_client = photon.PhotonClient(info["local_scheduler_socket_name"])
|
||||
# Register the worker with Redis.
|
||||
if mode in [SCRIPT_MODE, SILENT_MODE]:
|
||||
worker.redis_client.rpush("Drivers", worker.worker_id)
|
||||
@@ -861,10 +949,22 @@ def connect(address_info, mode=WORKER_MODE, worker=global_worker):
|
||||
worker.redis_client.rpush("Workers", worker.worker_id)
|
||||
else:
|
||||
raise Exception("This code should be unreachable.")
|
||||
# If this is a driver, set the current task ID to a specific fixed value and
|
||||
# set the task index to 0.
|
||||
# If this is a driver, set the current task ID and set the task index to 0.
|
||||
if mode in [SCRIPT_MODE, SILENT_MODE]:
|
||||
worker.current_task_id = photon.ObjectID("".join(chr(i) for i in range(20)))
|
||||
# If the user provided an object_id_seed, then set the current task ID
|
||||
# deterministically based on that seed (without altering the state of the
|
||||
# user's random number generator). Otherwise, set the current task ID
|
||||
# randomly to avoid object ID collisions.
|
||||
numpy_state = np.random.get_state()
|
||||
if object_id_seed is not None:
|
||||
np.random.seed(object_id_seed)
|
||||
else:
|
||||
# Try to use true randomness.
|
||||
np.random.seed(None)
|
||||
worker.current_task_id = photon.ObjectID(np.random.bytes(20))
|
||||
# Reset the state of the numpy random number generator.
|
||||
np.random.set_state(numpy_state)
|
||||
# Set other fields needed for computing task IDs.
|
||||
worker.task_index = 0
|
||||
worker.put_index = 0
|
||||
# If this is a worker, then start a thread to import exports from the driver.
|
||||
|
||||
@@ -2,26 +2,59 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
import numpy as np
|
||||
import redis
|
||||
import traceback
|
||||
|
||||
import ray
|
||||
|
||||
parser = argparse.ArgumentParser(description="Parse addresses for the worker to connect to.")
|
||||
parser.add_argument("--node-ip-address", required=True, type=str, help="the ip address of the worker's node")
|
||||
parser.add_argument("--redis-port", required=True, type=int, help="the port to use for Redis")
|
||||
parser.add_argument("--redis-address", required=True, type=str, help="the address to use for Redis")
|
||||
parser.add_argument("--object-store-name", required=True, type=str, help="the object store's name")
|
||||
parser.add_argument("--object-store-manager-name", required=True, type=str, help="the object store manager's name")
|
||||
parser.add_argument("--local-scheduler-name", required=True, type=str, help="the local scheduler's name")
|
||||
|
||||
def random_string():
|
||||
return np.random.bytes(20)
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
address_info = {"node_ip_address": args.node_ip_address,
|
||||
"redis_port": args.redis_port,
|
||||
"object_store_names": [args.object_store_name],
|
||||
"object_store_manager_names": [args.object_store_manager_name],
|
||||
"local_scheduler_names": [args.local_scheduler_name]}
|
||||
ray.worker.connect(address_info, ray.WORKER_MODE)
|
||||
info = {"redis_address": args.redis_address,
|
||||
"store_socket_name": args.object_store_name,
|
||||
"manager_socket_name": args.object_store_manager_name,
|
||||
"local_scheduler_socket_name": args.local_scheduler_name}
|
||||
ray.worker.connect(info, ray.WORKER_MODE)
|
||||
|
||||
ray.worker.main_loop()
|
||||
error_explanation = """
|
||||
This error is unexpected and should not have happened. Somehow a worker crashed
|
||||
in an unanticipated way causing the main_loop to throw an exception, which is
|
||||
being caught in "lib/python/ray/workers/default_worker.py".
|
||||
"""
|
||||
|
||||
while True:
|
||||
try:
|
||||
# This call to main_loop should never return if things are working. Most
|
||||
# exceptions that are thrown (e.g., inside the execution of a task) should
|
||||
# be caught and handled inside of the call to main_loop. If an exception
|
||||
# is thrown here, then that means that there is some error that we didn't
|
||||
# anticipate.
|
||||
ray.worker.main_loop()
|
||||
except Exception as e:
|
||||
traceback_str = traceback.format_exc() + error_explanation
|
||||
error_key = "WorkerError:{}".format(random_string())
|
||||
redis_host, redis_port = args.redis_address.split(":")
|
||||
# For this command to work, some other client (on the same machine as
|
||||
# Redis) must have run "CONFIG SET protected-mode no".
|
||||
redis_client = redis.StrictRedis(host=redis_host, port=int(redis_port))
|
||||
redis_client.hmset(error_key, {"message": traceback_str,
|
||||
"note": "This error is unexpected and should not have happened."})
|
||||
redis_client.rpush("ErrorKeys", error_key)
|
||||
# TODO(rkn): Note that if the worker was in the middle of executing a
|
||||
# task, the any worker or driver that is blocking in a get call and
|
||||
# waiting for the output of that task will hang. We need to address this.
|
||||
|
||||
# After putting the error message in Redis, this worker will attempt to
|
||||
# reenter the main loop. TODO(rkn): We should probably reset it's state and
|
||||
# call connect again.
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
|
||||
import ray.services as services
|
||||
|
||||
parser = argparse.ArgumentParser(description="Parse addresses for the worker to connect to.")
|
||||
parser.add_argument("--node-ip-address", required=True, type=str, help="the ip address of the worker's node")
|
||||
parser.add_argument("--redis-address", required=False, type=str, help="the address to use for Redis")
|
||||
parser.add_argument("--num-workers", default=10, required=False, type=int, help="the number of workers to start on this node")
|
||||
parser.add_argument("--head", action="store_true", help="provide this argument for the head node")
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
# Note that we redirect stdout and stderr to /dev/null because otherwise
|
||||
# attempts to print may cause exceptions if a process is started inside of an
|
||||
# SSH connection and the SSH connection dies. TODO(rkn): This is a temporary
|
||||
# fix. We should actually redirect stdout and stderr to Redis in some way.
|
||||
|
||||
if args.head:
|
||||
# Start Ray on the head node.
|
||||
if args.redis_address is not None:
|
||||
raise Exception("If --head is passed in, a Redis server will be started, so a Redis address should not be provided.")
|
||||
address_info = services.start_ray_local(node_ip_address=args.node_ip_address,
|
||||
num_workers=args.num_workers,
|
||||
cleanup=False,
|
||||
redirect_output=True)
|
||||
else:
|
||||
# Start Ray on a non-head node.
|
||||
if args.redis_address is None:
|
||||
raise Exception("If --head is not passed in, --redis-address must be provided.")
|
||||
address_info = services.start_ray_node(node_ip_address=args.node_ip_address,
|
||||
redis_address=args.redis_address,
|
||||
num_workers=args.num_workers,
|
||||
cleanup=False,
|
||||
redirect_output=True)
|
||||
print(address_info)
|
||||
Executable
+5
@@ -0,0 +1,5 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
|
||||
|
||||
python "$ROOT_DIR/start_ray.py" "$@"
|
||||
Executable
+3
@@ -0,0 +1,3 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
killall redis-server global_scheduler plasma_store plasma_manager photon_scheduler
|
||||
@@ -61,14 +61,25 @@ RedisModuleKey *OpenPrefixedKey(RedisModuleCtx *ctx,
|
||||
/**
|
||||
* Register a client with Redis. This is called from a client with the command:
|
||||
*
|
||||
* RAY.CONNECT <client type> <address> <ray client id> <aux address>
|
||||
* RAY.CONNECT <ray client id> <node ip address> <client type> <field 1>
|
||||
* <value 1> <field 2> <value 2> ...
|
||||
*
|
||||
* The command can take an arbitrary number of pairs of field names and keys,
|
||||
* and these will be stored in a hashmap associated with this client. Several
|
||||
* fields are singled out for special treatment:
|
||||
*
|
||||
* address: This is provided by plasma managers and it should be an address
|
||||
* like "127.0.0.1:1234". It is returned by RAY.GET_CLIENT_ADDRESS so
|
||||
* that other plasma managers know how to fetch objects.
|
||||
* aux_address: This is provided by local schedulers and should be the
|
||||
* address of the plasma manager that the local scheduler is connected
|
||||
* to. This is published to the "db_clients" channel by the RAY.CONNECT
|
||||
* command and is used by the global scheduler to determine which plasma
|
||||
* managers and local schedulers are connected.
|
||||
*
|
||||
* @param client_type The type of the client (e.g., plasma_manager).
|
||||
* @param address The address of the client.
|
||||
* @param ray_client_id The db client ID of the client.
|
||||
* @param aux_address An auxiliary address. This is currently just used by the
|
||||
* local scheduler to record the address of the plasma manager that it is
|
||||
* connected to.
|
||||
* @param node_ip_address The IP address of the node the client is on.
|
||||
* @param client_type The type of the client (e.g., plasma_manager).
|
||||
* @return OK if the operation was successful.
|
||||
*/
|
||||
int Connect_RedisCommand(RedisModuleCtx *ctx,
|
||||
@@ -95,7 +106,8 @@ int Connect_RedisCommand(RedisModuleCtx *ctx,
|
||||
RedisModule_CreateString(ctx, "aux_address", strlen("aux_address"));
|
||||
|
||||
RedisModule_HashSet(db_client_table_key, REDISMODULE_HASH_CFIELDS,
|
||||
"node_ip_address", node_ip_address, NULL);
|
||||
"ray_client_id", ray_client_id, "node_ip_address",
|
||||
node_ip_address, "client_type", client_type, NULL);
|
||||
|
||||
for (int i = 4; i < argc; i += 2) {
|
||||
RedisModuleString *key = argv[i];
|
||||
|
||||
@@ -160,11 +160,19 @@ db_handle *db_connect(const char *db_address,
|
||||
"could not establish synchronous connection to redis "
|
||||
"%s:%d",
|
||||
db_address, db_port);
|
||||
/* Enable keyspace events. */
|
||||
reply = redisCommand(context, "CONFIG SET notify-keyspace-events AKE");
|
||||
/* Configure Redis to generate keyspace notifications for list events. This
|
||||
* should only need to be done once (by whoever started Redis), but since
|
||||
* Redis may be started in multiple places (e.g., for testing or when starting
|
||||
* processes by hand), it is easier to do it multiple times. */
|
||||
reply = redisCommand(context, "CONFIG SET notify-keyspace-events Kl");
|
||||
CHECKM(reply != NULL, "db_connect failed on CONFIG SET");
|
||||
freeReplyObject(reply);
|
||||
/* Add new client using optimistic locking. */
|
||||
/* Also configure Redis to not run in protected mode, so clients on other
|
||||
* hosts can connect to it. */
|
||||
reply = redisCommand(context, "CONFIG SET protected-mode no");
|
||||
CHECKM(reply != NULL, "db_connect failed on CONFIG SET");
|
||||
freeReplyObject(reply);
|
||||
/* Create a client ID for this client. */
|
||||
db_client_id client = globally_unique_id();
|
||||
|
||||
/* Construct the argument arrays for RAY.CONNECT. */
|
||||
|
||||
@@ -6,7 +6,7 @@ import os
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
def start_global_scheduler(redis_address, use_valgrind=False, use_profiler=False):
|
||||
def start_global_scheduler(redis_address, use_valgrind=False, use_profiler=False, redirect_output=False):
|
||||
"""Start a global scheduler process.
|
||||
|
||||
Args:
|
||||
@@ -15,6 +15,8 @@ def start_global_scheduler(redis_address, use_valgrind=False, use_profiler=False
|
||||
of valgrind. If this is True, use_profiler must be False.
|
||||
use_profiler (bool): True if the global scheduler should be started inside a
|
||||
profiler. If this is True, use_valgrind must be False.
|
||||
redirect_output (bool): True if stdout and stderr should be redirected to
|
||||
/dev/null.
|
||||
|
||||
Return:
|
||||
The process ID of the global scheduler process.
|
||||
@@ -23,13 +25,16 @@ def start_global_scheduler(redis_address, use_valgrind=False, use_profiler=False
|
||||
raise Exception("Cannot use valgrind and profiler at the same time.")
|
||||
global_scheduler_executable = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../../build/global_scheduler")
|
||||
command = [global_scheduler_executable, "-r", redis_address]
|
||||
if use_valgrind:
|
||||
pid = subprocess.Popen(["valgrind", "--track-origins=yes", "--leak-check=full", "--show-leak-kinds=all", "--error-exitcode=1"] + command)
|
||||
time.sleep(1.0)
|
||||
elif use_profiler:
|
||||
pid = subprocess.Popen(["valgrind", "--tool=callgrind"] + command)
|
||||
time.sleep(1.0)
|
||||
else:
|
||||
pid = subprocess.Popen(command)
|
||||
time.sleep(0.1)
|
||||
with open(os.devnull, "w") as FNULL:
|
||||
stdout = FNULL if redirect_output else None
|
||||
stderr = FNULL if redirect_output else None
|
||||
if use_valgrind:
|
||||
pid = subprocess.Popen(["valgrind", "--track-origins=yes", "--leak-check=full", "--show-leak-kinds=all", "--error-exitcode=1"] + command, stdout=stdout, stderr=stderr)
|
||||
time.sleep(1.0)
|
||||
elif use_profiler:
|
||||
pid = subprocess.Popen(["valgrind", "--tool=callgrind"] + command, stdout=stdout, stderr=stderr)
|
||||
time.sleep(1.0)
|
||||
else:
|
||||
pid = subprocess.Popen(command, stdout=stdout, stderr=stderr)
|
||||
time.sleep(0.1)
|
||||
return pid
|
||||
|
||||
@@ -10,7 +10,7 @@ import time
|
||||
def random_name():
|
||||
return str(random.randint(0, 99999999))
|
||||
|
||||
def start_local_scheduler(plasma_store_name, plasma_manager_name=None, plasma_address=None, node_ip_address="127.0.0.1", redis_address=None, use_valgrind=False, use_profiler=False):
|
||||
def start_local_scheduler(plasma_store_name, plasma_manager_name=None, plasma_address=None, node_ip_address="127.0.0.1", redis_address=None, use_valgrind=False, use_profiler=False, redirect_output=False):
|
||||
"""Start a local scheduler process.
|
||||
|
||||
Args:
|
||||
@@ -29,6 +29,8 @@ def start_local_scheduler(plasma_store_name, plasma_manager_name=None, plasma_ad
|
||||
valgrind. If this is True, use_profiler must be False.
|
||||
use_profiler (bool): True if the local scheduler should be started inside a
|
||||
profiler. If this is True, use_valgrind must be False.
|
||||
redirect_output (bool): True if stdout and stderr should be redirected to
|
||||
/dev/null.
|
||||
|
||||
Return:
|
||||
A tuple of the name of the local scheduler socket and the process ID of the
|
||||
@@ -47,13 +49,16 @@ def start_local_scheduler(plasma_store_name, plasma_manager_name=None, plasma_ad
|
||||
command += ["-r", redis_address]
|
||||
if plasma_address is not None:
|
||||
command += ["-a", plasma_address]
|
||||
if use_valgrind:
|
||||
pid = subprocess.Popen(["valgrind", "--track-origins=yes", "--leak-check=full", "--show-leak-kinds=all", "--error-exitcode=1"] + command)
|
||||
time.sleep(1.0)
|
||||
elif use_profiler:
|
||||
pid = subprocess.Popen(["valgrind", "--tool=callgrind"] + command)
|
||||
time.sleep(1.0)
|
||||
else:
|
||||
pid = subprocess.Popen(command)
|
||||
time.sleep(0.1)
|
||||
with open(os.devnull, "w") as FNULL:
|
||||
stdout = FNULL if redirect_output else None
|
||||
stderr = FNULL if redirect_output else None
|
||||
if use_valgrind:
|
||||
pid = subprocess.Popen(["valgrind", "--track-origins=yes", "--leak-check=full", "--show-leak-kinds=all", "--error-exitcode=1"] + command, stdout=stdout, stderr=stderr)
|
||||
time.sleep(1.0)
|
||||
elif use_profiler:
|
||||
pid = subprocess.Popen(["valgrind", "--tool=callgrind"] + command, stdout=stdout, stderr=stderr)
|
||||
time.sleep(1.0)
|
||||
else:
|
||||
pid = subprocess.Popen(command, stdout=stdout, stderr=stderr)
|
||||
time.sleep(0.1)
|
||||
return local_scheduler_name, pid
|
||||
|
||||
@@ -29,6 +29,7 @@ local_scheduler_state *init_local_scheduler(
|
||||
event_loop *loop,
|
||||
const char *redis_addr,
|
||||
int redis_port,
|
||||
const char *local_scheduler_socket_name,
|
||||
const char *plasma_store_socket_name,
|
||||
const char *plasma_manager_socket_name,
|
||||
const char *plasma_manager_address,
|
||||
@@ -40,19 +41,24 @@ local_scheduler_state *init_local_scheduler(
|
||||
utarray_new(state->workers, &worker_icd);
|
||||
/* Connect to Redis if a Redis address is provided. */
|
||||
if (redis_addr != NULL) {
|
||||
int num_args = 0;
|
||||
int num_args;
|
||||
const char **db_connect_args = NULL;
|
||||
if (plasma_manager_address != NULL) {
|
||||
num_args = 4;
|
||||
db_connect_args = malloc(sizeof(char *) * num_args);
|
||||
db_connect_args[0] = "local_scheduler_socket_name";
|
||||
db_connect_args[1] = local_scheduler_socket_name;
|
||||
db_connect_args[2] = "aux_address";
|
||||
db_connect_args[3] = plasma_manager_address;
|
||||
} else {
|
||||
num_args = 2;
|
||||
db_connect_args = malloc(sizeof(char *) * num_args);
|
||||
db_connect_args[0] = "aux_address";
|
||||
db_connect_args[1] = plasma_manager_address;
|
||||
db_connect_args[0] = "local_scheduler_socket_name";
|
||||
db_connect_args[1] = local_scheduler_socket_name;
|
||||
}
|
||||
state->db = db_connect(redis_addr, redis_port, "photon", node_ip_address,
|
||||
num_args, db_connect_args);
|
||||
if (num_args != 0) {
|
||||
free(db_connect_args);
|
||||
};
|
||||
free(db_connect_args);
|
||||
db_attach(state->db, loop, false);
|
||||
} else {
|
||||
state->db = NULL;
|
||||
@@ -295,10 +301,10 @@ void start_server(const char *node_ip_address,
|
||||
bool global_scheduler_exists) {
|
||||
int fd = bind_ipc_sock(socket_name, true);
|
||||
event_loop *loop = event_loop_create();
|
||||
g_state =
|
||||
init_local_scheduler(node_ip_address, loop, redis_addr, redis_port,
|
||||
plasma_store_socket_name, plasma_manager_socket_name,
|
||||
plasma_manager_address, global_scheduler_exists);
|
||||
g_state = init_local_scheduler(
|
||||
node_ip_address, loop, redis_addr, redis_port, socket_name,
|
||||
plasma_store_socket_name, plasma_manager_socket_name,
|
||||
plasma_manager_address, global_scheduler_exists);
|
||||
|
||||
/* Register a callback for registering new clients. */
|
||||
event_loop_add_file(loop, fd, EVENT_LOOP_READ, new_client_connection,
|
||||
|
||||
@@ -69,6 +69,7 @@ local_scheduler_state *init_local_scheduler(
|
||||
event_loop *loop,
|
||||
const char *redis_addr,
|
||||
int redis_port,
|
||||
const char *local_scheduler_socket_name,
|
||||
const char *plasma_manager_socket_name,
|
||||
const char *plasma_store_socket_name,
|
||||
const char *plasma_manager_address,
|
||||
|
||||
@@ -59,6 +59,7 @@ photon_mock *init_photon_mock() {
|
||||
CHECK(mock->plasma_fd >= 0 && mock->photon_fd >= 0);
|
||||
mock->photon_state = init_local_scheduler(
|
||||
"127.0.0.1", mock->loop, redis_addr, redis_port,
|
||||
utstring_body(photon_socket_name),
|
||||
utstring_body(plasma_manager_socket_name),
|
||||
utstring_body(plasma_store_socket_name), NULL, false);
|
||||
/* Connect a Photon client. */
|
||||
|
||||
+29
-18
@@ -234,7 +234,7 @@ DEFAULT_PLASMA_STORE_MEMORY = 10 ** 9
|
||||
def random_name():
|
||||
return str(random.randint(0, 99999999))
|
||||
|
||||
def start_plasma_store(plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY, use_valgrind=False, use_profiler=False):
|
||||
def start_plasma_store(plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY, use_valgrind=False, use_profiler=False, redirect_output=False):
|
||||
"""Start a plasma store process.
|
||||
|
||||
Args:
|
||||
@@ -242,6 +242,8 @@ def start_plasma_store(plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY, use_valg
|
||||
valgrind. If this is True, use_profiler must be False.
|
||||
use_profiler (bool): True if the plasma store should be started inside a
|
||||
profiler. If this is True, use_valgrind must be False.
|
||||
redirect_output (bool): True if stdout and stderr should be redirected to
|
||||
/dev/null.
|
||||
|
||||
Return:
|
||||
A tuple of the name of the plasma store socket and the process ID of the
|
||||
@@ -252,25 +254,31 @@ def start_plasma_store(plasma_store_memory=DEFAULT_PLASMA_STORE_MEMORY, use_valg
|
||||
plasma_store_executable = os.path.join(os.path.abspath(os.path.dirname(__file__)), "plasma_store")
|
||||
plasma_store_name = "/tmp/plasma_store{}".format(random_name())
|
||||
command = [plasma_store_executable, "-s", plasma_store_name, "-m", str(plasma_store_memory)]
|
||||
if use_valgrind:
|
||||
pid = subprocess.Popen(["valgrind", "--track-origins=yes", "--leak-check=full", "--show-leak-kinds=all", "--error-exitcode=1"] + command)
|
||||
time.sleep(1.0)
|
||||
elif use_profiler:
|
||||
pid = subprocess.Popen(["valgrind", "--tool=callgrind"] + command)
|
||||
time.sleep(1.0)
|
||||
else:
|
||||
pid = subprocess.Popen(command)
|
||||
time.sleep(0.1)
|
||||
with open(os.devnull, "w") as FNULL:
|
||||
stdout = FNULL if redirect_output else None
|
||||
stderr = FNULL if redirect_output else None
|
||||
if use_valgrind:
|
||||
pid = subprocess.Popen(["valgrind", "--track-origins=yes", "--leak-check=full", "--show-leak-kinds=all", "--error-exitcode=1"] + command, stdout=stdout, stderr=stderr)
|
||||
time.sleep(1.0)
|
||||
elif use_profiler:
|
||||
pid = subprocess.Popen(["valgrind", "--tool=callgrind"] + command, stdout=stdout, stderr=stderr)
|
||||
time.sleep(1.0)
|
||||
else:
|
||||
pid = subprocess.Popen(command, stdout=stdout, stderr=stderr)
|
||||
time.sleep(0.1)
|
||||
return plasma_store_name, pid
|
||||
|
||||
def start_plasma_manager(store_name, redis_address, num_retries=20, use_valgrind=False, run_profiler=False):
|
||||
def start_plasma_manager(store_name, redis_address, node_ip_address="127.0.0.1", num_retries=20, use_valgrind=False, run_profiler=False, redirect_output=False):
|
||||
"""Start a plasma manager and return the ports it listens on.
|
||||
|
||||
Args:
|
||||
store_name (str): The name of the plasma store socket.
|
||||
redis_address (str): The address of the Redis server.
|
||||
node_ip_address (str): The IP address of the node.
|
||||
use_valgrind (bool): True if the Plasma manager should be started inside of
|
||||
valgrind and False otherwise.
|
||||
redirect_output (bool): True if stdout and stderr should be redirected to
|
||||
/dev/null.
|
||||
|
||||
Returns:
|
||||
A tuple of the Plasma manager socket name, the process ID of the Plasma
|
||||
@@ -291,15 +299,18 @@ def start_plasma_manager(store_name, redis_address, num_retries=20, use_valgrind
|
||||
command = [plasma_manager_executable,
|
||||
"-s", store_name,
|
||||
"-m", plasma_manager_name,
|
||||
"-h", "127.0.0.1",
|
||||
"-h", node_ip_address,
|
||||
"-p", str(port),
|
||||
"-r", redis_address]
|
||||
if use_valgrind:
|
||||
process = subprocess.Popen(["valgrind", "--track-origins=yes", "--leak-check=full", "--show-leak-kinds=all", "--error-exitcode=1"] + command)
|
||||
elif run_profiler:
|
||||
process = subprocess.Popen(["valgrind", "--tool=callgrind"] + command)
|
||||
else:
|
||||
process = subprocess.Popen(command)
|
||||
with open(os.devnull, "w") as FNULL:
|
||||
stdout = FNULL if redirect_output else None
|
||||
stderr = FNULL if redirect_output else None
|
||||
if use_valgrind:
|
||||
process = subprocess.Popen(["valgrind", "--track-origins=yes", "--leak-check=full", "--show-leak-kinds=all", "--error-exitcode=1"] + command, stdout=stdout, stderr=stderr)
|
||||
elif run_profiler:
|
||||
process = subprocess.Popen(["valgrind", "--tool=callgrind"] + command, stdout=stdout, stderr=stderr)
|
||||
else:
|
||||
process = subprocess.Popen(command, stdout=stdout, stderr=stderr)
|
||||
# This sleep is critical. If the plasma_manager fails to start because the
|
||||
# port is already in use, then we need it to fail within 0.1 seconds.
|
||||
time.sleep(0.1)
|
||||
|
||||
Reference in New Issue
Block a user