Fixes and cleanups for the multinode setting. (#143)

* Add function for driver to get address info from Redis.

* Use Redis address instead of Redis port.

* Configure Redis to run in unprotected mode.

* Add method for starting Ray processes on non-head node.

* Pass in correct node ip address to start_plasma_manager.

* Script for starting Ray processes.

* Handle the case where an object already exists in the store. Maybe this should also compare the object hashes.

* Have driver get info from Redis when start_ray_local=False.

* Fix.

* Script for killing ray processes.

* Catch some errors when the main_loop in a worker throws an exception.

* Allow redirecting stdout and stderr to /dev/null.

* Wrap start_ray.py in a shell script.

* More helpful error messages.

* Fixes.

* Wait for redis server to start up before configuring it.

* Allow seeding of deterministic object ID generation.

* Small change.
This commit is contained in:
Robert Nishihara
2016-12-21 18:53:12 -08:00
committed by Philipp Moritz
parent c9c1b3e6af
commit 6cd02d71f8
14 changed files with 475 additions and 134 deletions
+148 -37
View File
@@ -5,6 +5,7 @@ from __future__ import print_function
import psutil
import os
import random
import redis
import signal
import string
import subprocess
@@ -62,7 +63,8 @@ def cleanup():
continue
successfully_shut_down = False
if successfully_shut_down:
print("Successfully shut down Ray.")
if len(all_processes) > 0:
print("Successfully shut down Ray.")
else:
print("Ray did not shut down properly.")
all_processes = []
@@ -70,7 +72,7 @@ def cleanup():
def all_processes_alive():
return all([p.poll() is None for p in all_processes])
def start_redis(num_retries=20, cleanup=True):
def start_redis(num_retries=20, cleanup=True, redirect_output=False):
"""Start a Redis server.
Args:
@@ -78,6 +80,8 @@ def start_redis(num_retries=20, cleanup=True):
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
this process will be killed by serices.cleanup() when the Python process
that imported services exits.
redirect_output (bool): True if stdout and stderr should be redirected to
/dev/null.
Returns:
The port used by Redis.
@@ -94,18 +98,48 @@ def start_redis(num_retries=20, cleanup=True):
if counter > 0:
print("Redis failed to start, retrying now.")
port = new_port()
p = subprocess.Popen([redis_filepath, "--port", str(port), "--loglevel", "warning", "--loadmodule", redis_module])
with open(os.devnull, "w") as FNULL:
stdout = FNULL if redirect_output else None
stderr = FNULL if redirect_output else None
p = subprocess.Popen([redis_filepath, "--port", str(port), "--loglevel", "warning", "--loadmodule", redis_module], stdout=stdout, stderr=stderr)
time.sleep(0.1)
# Check if Redis successfully started (or at least if it the executable did
# not exit within 0.1 seconds).
if p.poll() is None:
if cleanup:
all_processes.append(p)
return port
break
counter += 1
raise Exception("Couldn't start Redis.")
if counter == num_retries:
raise Exception("Couldn't start Redis.")
def start_global_scheduler(redis_address, cleanup=True):
# Create a Redis client just for configuring Redis.
redis_client = redis.StrictRedis(host="127.0.0.1", port=port)
# Wait for the Redis server to start.
counter = 0
while counter < num_retries:
try:
# Run some random command and see if it worked.
redis_client.client_list()
except redis.ConnectionError as e:
# Wait a little bit.
time.sleep(1)
counter += 1
else:
break
if counter == num_retries:
raise Exception("The Redis server did not start properly.")
# Configure Redis to generate keyspace notifications. TODO(rkn): Change this
# to only generate notifications for the export keys.
redis_client.config_set("notify-keyspace-events", "Kl")
# Configure Redis to not run in protected mode so that processes on other
# hosts can connect to it. TODO(rkn): Do this in a more secure way.
redis_client.config_set("protected-mode", "no")
return port
def start_global_scheduler(redis_address, cleanup=True, redirect_output=False):
"""Start a global scheduler process.
Args:
@@ -113,12 +147,14 @@ def start_global_scheduler(redis_address, cleanup=True):
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
this process will be killed by serices.cleanup() when the Python process
that imported services exits.
redirect_output (bool): True if stdout and stderr should be redirected to
/dev/null.
"""
p = global_scheduler.start_global_scheduler(redis_address)
p = global_scheduler.start_global_scheduler(redis_address, redirect_output=redirect_output)
if cleanup:
all_processes.append(p)
def start_local_scheduler(redis_address, node_ip_address, plasma_store_name, plasma_manager_name, plasma_address=None, cleanup=True):
def start_local_scheduler(redis_address, node_ip_address, plasma_store_name, plasma_manager_name, plasma_address=None, cleanup=True, redirect_output=False):
"""Start a local scheduler process.
Args:
@@ -131,24 +167,28 @@ def start_local_scheduler(redis_address, node_ip_address, plasma_store_name, pla
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
this process will be killed by serices.cleanup() when the Python process
that imported services exits.
redirect_output (bool): True if stdout and stderr should be redirected to
/dev/null.
Return:
The name of the local scheduler socket.
"""
local_scheduler_name, p = photon.start_local_scheduler(plasma_store_name, plasma_manager_name, node_ip_address=node_ip_address, redis_address=redis_address, plasma_address=plasma_address, use_profiler=RUN_PHOTON_PROFILER)
local_scheduler_name, p = photon.start_local_scheduler(plasma_store_name, plasma_manager_name, node_ip_address=node_ip_address, redis_address=redis_address, plasma_address=plasma_address, use_profiler=RUN_PHOTON_PROFILER, redirect_output=redirect_output)
if cleanup:
all_processes.append(p)
return local_scheduler_name
def start_objstore(node_ip_address, redis_address, cleanup=True):
def start_objstore(node_ip_address, redis_address, cleanup=True, redirect_output=False):
"""This method starts an object store process.
Args:
node_ip_address (str): The ip address of the node running the object store.
node_ip_address (str): The IP address of the node running the object store.
redis_address (str): The address of the Redis instance to connect to.
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
this process will be killed by serices.cleanup() when the Python process
that imported services exits.
redirect_output (bool): True if stdout and stderr should be redirected to
/dev/null.
Return:
A tuple of the Plasma store socket name, the Plasma manager socket name, and
@@ -164,16 +204,16 @@ def start_objstore(node_ip_address, redis_address, cleanup=True):
else:
plasma_store_memory = int(system_memory * 0.75)
# Start the Plasma store.
plasma_store_name, p1 = plasma.start_plasma_store(plasma_store_memory=plasma_store_memory, use_profiler=RUN_PLASMA_STORE_PROFILER)
plasma_store_name, p1 = plasma.start_plasma_store(plasma_store_memory=plasma_store_memory, use_profiler=RUN_PLASMA_STORE_PROFILER, redirect_output=redirect_output)
# Start the plasma manager.
plasma_manager_name, p2, plasma_manager_port = plasma.start_plasma_manager(plasma_store_name, redis_address, run_profiler=RUN_PLASMA_MANAGER_PROFILER)
plasma_manager_name, p2, plasma_manager_port = plasma.start_plasma_manager(plasma_store_name, redis_address, node_ip_address=node_ip_address, run_profiler=RUN_PLASMA_MANAGER_PROFILER, redirect_output=redirect_output)
if cleanup:
all_processes.append(p1)
all_processes.append(p2)
return plasma_store_name, plasma_manager_name, plasma_manager_port
def start_worker(node_ip_address, object_store_name, object_store_manager_name, local_scheduler_name, redis_port, worker_path, cleanup=True):
def start_worker(node_ip_address, object_store_name, object_store_manager_name, local_scheduler_name, redis_address, worker_path, cleanup=True, redirect_output=False):
"""This method starts a worker process.
Args:
@@ -182,12 +222,14 @@ def start_worker(node_ip_address, object_store_name, object_store_manager_name,
object_store_name (str): The name of the object store.
object_store_manager_name (str): The name of the object store manager.
local_scheduler_name (str): The name of the local scheduler.
redis_port (int): The port that the Redis server is listening on.
redis_address (int): The address that the Redis server is listening on.
worker_path (str): The path of the source code which the worker process will
run.
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
this process will be killed by services.cleanup() when the Python process
that imported services exits. This is True by default.
redirect_output (bool): True if stdout and stderr should be redirected to
/dev/null.
"""
command = ["python",
worker_path,
@@ -195,12 +237,15 @@ def start_worker(node_ip_address, object_store_name, object_store_manager_name,
"--object-store-name=" + object_store_name,
"--object-store-manager-name=" + object_store_manager_name,
"--local-scheduler-name=" + local_scheduler_name,
"--redis-port=" + str(redis_port)]
p = subprocess.Popen(command)
"--redis-address=" + str(redis_address)]
with open(os.devnull, "w") as FNULL:
stdout = FNULL if redirect_output else None
stderr = FNULL if redirect_output else None
p = subprocess.Popen(command, stdout=stdout, stderr=stderr)
if cleanup:
all_processes.append(p)
def start_webui(redis_port, cleanup=True):
def start_webui(redis_port, cleanup=True, redirect_output=False):
"""This method starts the web interface.
Args:
@@ -208,53 +253,57 @@ def start_webui(redis_port, cleanup=True):
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
this process will be killed by services.cleanup() when the Python process
that imported services exits. This is True by default.
redirect_output (bool): True if stdout and stderr should be redirected to
/dev/null.
"""
executable = "nodejs" if sys.platform == "linux" or sys.platform == "linux2" else "node"
command = [executable, os.path.join(os.path.abspath(os.path.dirname(__file__)), "../webui/index.js"), str(redis_port)]
with open("/tmp/webui_out.txt", "wb") as out:
p = subprocess.Popen(command, stdout=out)
with open(os.devnull, "w") as FNULL:
stdout = FNULL if redirect_output else out
stderr = FNULL if redirect_output else None
p = subprocess.Popen(command, stdout=stdout, stderr=stderr)
if cleanup:
all_processes.append(p)
def start_ray_local(node_ip_address="127.0.0.1", num_workers=0, num_local_schedulers=1, worker_path=None):
"""Start Ray in local mode.
def start_ray_node(node_ip_address, redis_address, num_workers=0, num_local_schedulers=1, worker_path=None, cleanup=True, redirect_output=False):
"""Start the Ray processes for a single node.
This assumes that the Ray processes on some master node have already been
started.
Args:
node_ip_address (str): The IP address of this node.
redis_address (str): The address of the Redis server.
num_workers (int): The number of workers to start.
num_local_schedulers (int): The number of local schedulers to start. This is
also the number of plasma stores and plasma managers to start.
worker_path (str): The path of the source code that will be run by the
worker.
Returns:
This returns a dictionary of the address information for the processes that
were started.
cleanup (bool): If cleanup is true, then the processes started here will be
killed by services.cleanup() when the Python process that called this
method exits.
redirect_output (bool): True if stdout and stderr should be redirected to
/dev/null.
"""
if worker_path is None:
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "workers/default_worker.py")
# Start Redis.
redis_port = start_redis(cleanup=True)
redis_address = address(node_ip_address, redis_port)
time.sleep(0.1)
# Start the global scheduler.
start_global_scheduler(redis_address, cleanup=True)
object_store_names = []
object_store_manager_names = []
local_scheduler_names = []
for _ in range(num_local_schedulers):
# Start Plasma.
object_store_name, object_store_manager_name, object_store_manager_port = start_objstore(node_ip_address, redis_address, cleanup=True)
object_store_name, object_store_manager_name, object_store_manager_port = start_objstore(node_ip_address, redis_address, cleanup=cleanup, redirect_output=redirect_output)
object_store_names.append(object_store_name)
object_store_manager_names.append(object_store_manager_name)
time.sleep(0.1)
# Start the local scheduler.
plasma_address = "{}:{}".format(node_ip_address, object_store_manager_port)
local_scheduler_name = start_local_scheduler(redis_address, node_ip_address, object_store_name, object_store_manager_name, plasma_address=plasma_address, cleanup=True)
local_scheduler_name = start_local_scheduler(redis_address, node_ip_address, object_store_name, object_store_manager_name, plasma_address=plasma_address, cleanup=cleanup, redirect_output=redirect_output)
local_scheduler_names.append(local_scheduler_name)
time.sleep(0.1)
# Aggregate the address information together.
address_info = {"node_ip_address": node_ip_address,
"redis_port": redis_port,
"object_store_names": object_store_names,
"object_store_manager_names": object_store_manager_names,
"local_scheduler_names": local_scheduler_names}
@@ -264,9 +313,71 @@ def start_ray_local(node_ip_address="127.0.0.1", num_workers=0, num_local_schedu
address_info["object_store_names"][i % num_local_schedulers],
address_info["object_store_manager_names"][i % num_local_schedulers],
address_info["local_scheduler_names"][i % num_local_schedulers],
redis_port,
redis_address,
worker_path,
cleanup=True)
cleanup=cleanup,
redirect_output=redirect_output)
# Return the addresses of the relevant processes.
start_webui(redis_port)
return address_info
def start_ray_local(node_ip_address="127.0.0.1", num_workers=0, num_local_schedulers=1, worker_path=None, cleanup=True, redirect_output=False):
"""Start Ray in local mode.
Args:
node_ip_address (str): The IP address of this node.
num_workers (int): The number of workers to start.
num_local_schedulers (int): The number of local schedulers to start. This is
also the number of plasma stores and plasma managers to start.
worker_path (str): The path of the source code that will be run by the
worker.
cleanup (bool): If cleanup is true, then the processes started here will be
killed by services.cleanup() when the Python process that called this
method exits.
redirect_output (bool): True if stdout and stderr should be redirected to
/dev/null.
Returns:
This returns a dictionary of the address information for the processes that
were started.
"""
if worker_path is None:
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "workers/default_worker.py")
# Start Redis.
redis_port = start_redis(cleanup=cleanup, redirect_output=redirect_output)
redis_address = address(node_ip_address, redis_port)
time.sleep(0.1)
# Start the global scheduler.
start_global_scheduler(redis_address, cleanup=cleanup, redirect_output=redirect_output)
object_store_names = []
object_store_manager_names = []
local_scheduler_names = []
for _ in range(num_local_schedulers):
# Start Plasma.
object_store_name, object_store_manager_name, object_store_manager_port = start_objstore(node_ip_address, redis_address, cleanup=cleanup, redirect_output=redirect_output)
object_store_names.append(object_store_name)
object_store_manager_names.append(object_store_manager_name)
time.sleep(0.1)
# Start the local scheduler.
plasma_address = "{}:{}".format(node_ip_address, object_store_manager_port)
local_scheduler_name = start_local_scheduler(redis_address, node_ip_address, object_store_name, object_store_manager_name, plasma_address=plasma_address, cleanup=cleanup, redirect_output=redirect_output)
local_scheduler_names.append(local_scheduler_name)
time.sleep(0.1)
# Aggregate the address information together.
address_info = {"node_ip_address": node_ip_address,
"redis_address": redis_address,
"object_store_names": object_store_names,
"object_store_manager_names": object_store_manager_names,
"local_scheduler_names": local_scheduler_names}
# Start the workers.
for i in range(num_workers):
start_worker(address_info["node_ip_address"],
address_info["object_store_names"][i % num_local_schedulers],
address_info["object_store_manager_names"][i % num_local_schedulers],
address_info["local_scheduler_names"][i % num_local_schedulers],
redis_address,
worker_path,
cleanup=cleanup,
redirect_output=redirect_output)
# Return the addresses of the relevant processes.
start_webui(redis_port, cleanup=cleanup, redirect_output=redirect_output)
return address_info
+130 -30
View File
@@ -419,7 +419,16 @@ class Worker(object):
# Serialize and put the object in the object store.
schema, size, serialized = numbuf_serialize(value)
size = size + 4096 * 4 + 8 # The last 8 bytes are for the metadata offset. This is temporary.
buff = self.plasma_client.create(objectid.id(), size, bytearray(schema))
try:
buff = self.plasma_client.create(objectid.id(), size, bytearray(schema))
except RuntimeError as e:
if e.args != ("an object with this ID could not be created",):
raise
# The object already exists in the object store, so there is no need to
# add it again. TODO(rkn): We need to compare the hashes and make sure
# that the objects are in fact the same.
print("This object already exists in the object store.")
return
data = np.frombuffer(buff.buffer, dtype="byte")[8:]
metadata_offset = numbuf.write_to_buffer(serialized, memoryview(data))
np.frombuffer(buff.buffer, dtype="int64", count=1)[0] = metadata_offset
@@ -617,7 +626,52 @@ def initialize_numbuf(worker=global_worker):
register_class(RayGetError)
register_class(RayGetArgumentError)
def init(start_ray_local=False, num_workers=None, num_local_schedulers=1, driver_mode=SCRIPT_MODE):
def get_address_info_from_redis_helper(redis_address, node_ip_address):
redis_host, redis_port = redis_address.split(":")
# For this command to work, some other client (on the same machine as Redis)
# must have run "CONFIG SET protected-mode no".
redis_client = redis.StrictRedis(host=redis_host, port=int(redis_port))
# The client table prefix must be kept in sync with the file
# "src/common/redis_module/ray_redis_module.c" where it is defined.
REDIS_CLIENT_TABLE_PREFIX = "CL:"
client_keys = redis_client.keys("{}*".format(REDIS_CLIENT_TABLE_PREFIX))
# Filter to clients on the same node and do some basic checking.
plasma_managers = []
local_schedulers = []
for key in client_keys:
info = redis_client.hgetall(key)
assert b"ray_client_id" in info
assert b"node_ip_address" in info
assert b"client_type" in info
if info[b"node_ip_address"].decode("ascii") == node_ip_address:
if info[b"client_type"].decode("ascii") == "plasma_manager":
plasma_managers.append(info)
elif info[b"client_type"].decode("ascii") == "photon":
local_schedulers.append(info)
# Make sure that we got at one plasma manager and local scheduler.
assert len(plasma_managers) == 1
assert len(local_schedulers) == 1
client_info = {"node_ip_address": node_ip_address,
"redis_address": redis_address,
"store_socket_name": plasma_managers[0][b"store_socket_name"].decode("ascii"),
"manager_socket_name": plasma_managers[0][b"manager_socket_name"].decode("ascii"),
"local_scheduler_socket_name": local_schedulers[0][b"local_scheduler_socket_name"].decode("ascii")}
return client_info
def get_address_info_from_redis(redis_address, node_ip_address, num_retries=10):
counter = 0
while True:
try:
return get_address_info_from_redis_helper(redis_address, node_ip_address)
except Exception as e:
if counter == num_retries:
raise
# Some of the information may not be in Redis yet, so wait a little bit.
print("Some processes that the driver needs to connect to have not registered with Redis, so retrying.")
time.sleep(1)
counter += 1
def init(node_ip_address="127.0.0.1", redis_address=None, start_ray_local=False, object_id_seed=None, num_workers=None, num_local_schedulers=None, driver_mode=SCRIPT_MODE):
"""Either connect to an existing Ray cluster or start one and connect to it.
This method handles two cases. Either a Ray cluster already exists and we
@@ -625,44 +679,72 @@ def init(start_ray_local=False, num_workers=None, num_local_schedulers=1, driver
with a Ray cluster and attach to the newly started cluster.
Args:
start_ray_local (Optional[bool]): If True then this will start a scheduler
an object store, and some workers. If False, this will attach to an
existing Ray cluster.
num_workers (Optional[int]): The number of workers to start if
node_ip_address (str): The IP address of the node that we are on.
redis_address (str): The address of the Redis server to connect to. This
should only be provided if start_ray_local is False.
start_ray_local (bool): If True then this will start Redis, a global
scheduler, a local scheduler, a plasma store, a plasma manager, and some
workers. It will also kill these processes when Python exits. If False,
this will attach to an existing Ray cluster.
object_id_seed (int): Used to seed the deterministic generation of object
IDs. The same value can be used across multiple runs of the same job in
order to generate the object IDs in a consistent manner. However, the same
ID should not be used for different jobs.
num_workers (int): The number of workers to start. This is only provided if
start_ray_local is True.
num_local_schedulers (Optional[int]): The number of local schedulers to
start if start_ray_local is True.
driver_mode (Optional[bool]): The mode in which to start the driver. This
should be one of SCRIPT_MODE, PYTHON_MODE, and SILENT_MODE.
num_local_schedulers (int): The number of local schedulers to start. This is
only provided if start_ray_local is True.
driver_mode (bool): The mode in which to start the driver. This should be
one of ray.SCRIPT_MODE, ray.PYTHON_MODE, and ray.SILENT_MODE.
Returns:
The address of the Redis server.
Address information about the started processes.
Raises:
Exception: An exception is raised if an inappropriate combination of
arguments is passed in.
"""
check_main_thread()
if driver_mode not in [SCRIPT_MODE, PYTHON_MODE, SILENT_MODE]:
raise Exception("Driver_mode must be in [ray.SCRIPT_MODE, ray.PYTHON_MODE, ray.SILENT_MODE].")
if driver_mode == PYTHON_MODE:
# If starting Ray in PYTHON_MODE, don't start any other processes.
address_info = {}
info = {}
elif start_ray_local:
# In this case, we launch a scheduler, a new object store, and some workers,
# and we connect to them.
if driver_mode not in [SCRIPT_MODE, PYTHON_MODE, SILENT_MODE]:
raise Exception("If start_ray_local=True, then driver_mode must be in [ray.SCRIPT_MODE, ray.PYTHON_MODE, ray.SILENT_MODE].")
if redis_address is not None:
raise Exception("If start_ray_local=True, then redis_address cannot be provided because ray.init will start a new Redis server.")
# Use the address 127.0.0.1 in local mode.
node_ip_address = "127.0.0.1" if node_ip_address is None else node_ip_address
# Use 1 worker if num_workers is not provided.
num_workers = 1 if num_workers is None else num_workers
# Use 1 local scheduler if num_local_schedulers is not provided.
num_local_schedulers = 1 if num_local_schedulers is None else num_local_schedulers
# Start the scheduler, object store, and some workers. These will be killed
# by the call to cleanup(), which happens when the Python script exits.
address_info = services.start_ray_local(num_workers=num_workers, num_local_schedulers=num_local_schedulers)
address_info = services.start_ray_local(node_ip_address=node_ip_address, num_workers=num_workers, num_local_schedulers=num_local_schedulers)
info = {"node_ip_address": node_ip_address,
"redis_address": address_info["redis_address"],
"store_socket_name": address_info["object_store_names"][0],
"manager_socket_name": address_info["object_store_manager_names"][0],
"local_scheduler_socket_name": address_info["local_scheduler_names"][0]}
else:
raise Exception("This mode is currently not enabled.")
if redis_address is None:
raise Exception("If start_ray_local=False, then redis_address must be provided.")
if node_ip_address is None:
raise Exception("If start_ray_local=False, then node_ip_address must be provided.")
if num_workers is not None:
raise Exception("If start_ray_local=False, then num_workers must not be provided.")
if num_local_schedulers is not None:
raise Exception("If start_ray_local=False, then num_local_schedulers must not be provided.")
# Get the address info of the processes to connect to from Redis.
info = get_address_info_from_redis(redis_address, node_ip_address)
# Connect this driver to Redis, the object store, and the local scheduler. The
# corresponing call to disconnect will happen in the call to cleanup() when
# the Python script exits.
connect(address_info, driver_mode, worker=global_worker)
return address_info
connect(info, object_id_seed=object_id_seed, mode=driver_mode, worker=global_worker)
return info
def cleanup(worker=global_worker):
"""Disconnect the driver, and terminate any processes started in init.
@@ -828,17 +910,23 @@ def import_thread(worker):
worker.redis_client.hincrby(worker_info_key, "export_counter", 1)
worker.worker_import_counter += 1
def connect(address_info, mode=WORKER_MODE, worker=global_worker):
"""Connect this worker to the scheduler and an object store.
def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
"""Connect this worker to the local scheduler, to Plasma, and to Redis.
Args:
address_info (dict): This contains the entries node_ip_address,
redis_address, object_store_name, object_store_manager_name, and
local_scheduler_name.
info (dict): A dictionary with address of the Redis server and the sockets
of the plasma store, plasma manager, and local scheduler.
mode: The mode of the worker. One of SCRIPT_MODE, WORKER_MODE, PYTHON_MODE,
and SILENT_MODE.
"""
check_main_thread()
# Do some basic checking to make sure we didn't call ray.init twice.
error_message = "Perhaps you called ray.init twice by accident?"
assert not worker.connected, error_message
assert worker.cached_functions_to_run is not None, error_message
assert worker.cached_remote_functions is not None, error_message
assert reusables._cached_reusables is not None, error_message
# Initialize some fields.
worker.worker_id = random_string()
worker.connected = True
worker.set_mode(mode)
@@ -847,13 +935,13 @@ def connect(address_info, mode=WORKER_MODE, worker=global_worker):
if mode == PYTHON_MODE:
return
# Create a Redis client.
worker.redis_client = redis.StrictRedis(host=address_info["node_ip_address"], port=address_info["redis_port"])
worker.redis_client.config_set("notify-keyspace-events", "AKE")
redis_host, redis_port = info["redis_address"].split(":")
worker.redis_client = redis.StrictRedis(host=redis_host, port=int(redis_port))
worker.lock = threading.Lock()
# Create an object store client.
worker.plasma_client = plasma.PlasmaClient(address_info["object_store_names"][0], address_info["object_store_manager_names"][0])
worker.plasma_client = plasma.PlasmaClient(info["store_socket_name"], info["manager_socket_name"])
# Create the local scheduler client.
worker.photon_client = photon.PhotonClient(address_info["local_scheduler_names"][0])
worker.photon_client = photon.PhotonClient(info["local_scheduler_socket_name"])
# Register the worker with Redis.
if mode in [SCRIPT_MODE, SILENT_MODE]:
worker.redis_client.rpush("Drivers", worker.worker_id)
@@ -861,10 +949,22 @@ def connect(address_info, mode=WORKER_MODE, worker=global_worker):
worker.redis_client.rpush("Workers", worker.worker_id)
else:
raise Exception("This code should be unreachable.")
# If this is a driver, set the current task ID to a specific fixed value and
# set the task index to 0.
# If this is a driver, set the current task ID and set the task index to 0.
if mode in [SCRIPT_MODE, SILENT_MODE]:
worker.current_task_id = photon.ObjectID("".join(chr(i) for i in range(20)))
# If the user provided an object_id_seed, then set the current task ID
# deterministically based on that seed (without altering the state of the
# user's random number generator). Otherwise, set the current task ID
# randomly to avoid object ID collisions.
numpy_state = np.random.get_state()
if object_id_seed is not None:
np.random.seed(object_id_seed)
else:
# Try to use true randomness.
np.random.seed(None)
worker.current_task_id = photon.ObjectID(np.random.bytes(20))
# Reset the state of the numpy random number generator.
np.random.set_state(numpy_state)
# Set other fields needed for computing task IDs.
worker.task_index = 0
worker.put_index = 0
# If this is a worker, then start a thread to import exports from the driver.
+42 -9
View File
@@ -2,26 +2,59 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import argparse
import numpy as np
import redis
import traceback
import ray
parser = argparse.ArgumentParser(description="Parse addresses for the worker to connect to.")
parser.add_argument("--node-ip-address", required=True, type=str, help="the ip address of the worker's node")
parser.add_argument("--redis-port", required=True, type=int, help="the port to use for Redis")
parser.add_argument("--redis-address", required=True, type=str, help="the address to use for Redis")
parser.add_argument("--object-store-name", required=True, type=str, help="the object store's name")
parser.add_argument("--object-store-manager-name", required=True, type=str, help="the object store manager's name")
parser.add_argument("--local-scheduler-name", required=True, type=str, help="the local scheduler's name")
def random_string():
return np.random.bytes(20)
if __name__ == "__main__":
args = parser.parse_args()
address_info = {"node_ip_address": args.node_ip_address,
"redis_port": args.redis_port,
"object_store_names": [args.object_store_name],
"object_store_manager_names": [args.object_store_manager_name],
"local_scheduler_names": [args.local_scheduler_name]}
ray.worker.connect(address_info, ray.WORKER_MODE)
info = {"redis_address": args.redis_address,
"store_socket_name": args.object_store_name,
"manager_socket_name": args.object_store_manager_name,
"local_scheduler_socket_name": args.local_scheduler_name}
ray.worker.connect(info, ray.WORKER_MODE)
ray.worker.main_loop()
error_explanation = """
This error is unexpected and should not have happened. Somehow a worker crashed
in an unanticipated way causing the main_loop to throw an exception, which is
being caught in "lib/python/ray/workers/default_worker.py".
"""
while True:
try:
# This call to main_loop should never return if things are working. Most
# exceptions that are thrown (e.g., inside the execution of a task) should
# be caught and handled inside of the call to main_loop. If an exception
# is thrown here, then that means that there is some error that we didn't
# anticipate.
ray.worker.main_loop()
except Exception as e:
traceback_str = traceback.format_exc() + error_explanation
error_key = "WorkerError:{}".format(random_string())
redis_host, redis_port = args.redis_address.split(":")
# For this command to work, some other client (on the same machine as
# Redis) must have run "CONFIG SET protected-mode no".
redis_client = redis.StrictRedis(host=redis_host, port=int(redis_port))
redis_client.hmset(error_key, {"message": traceback_str,
"note": "This error is unexpected and should not have happened."})
redis_client.rpush("ErrorKeys", error_key)
# TODO(rkn): Note that if the worker was in the middle of executing a
# task, the any worker or driver that is blocking in a get call and
# waiting for the output of that task will hang. We need to address this.
# After putting the error message in Redis, this worker will attempt to
# reenter the main loop. TODO(rkn): We should probably reset it's state and
# call connect again.