mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 21:38:18 +08:00
Add Redis port option to startup script (#232)
* specify redis address when starting head
* cleanup
* update starting cluster documentation
* Whitespace.
* Address Philipp's comments.
* Change redis_host -> redis_ip_address.
This commit is contained in:
committed by
Robert Nishihara
parent
db7297865f
commit
6ad2b5d87a
@@ -493,7 +493,7 @@ class TestPlasmaManager(unittest.TestCase):
|
||||
store_name1, self.p2 = plasma.start_plasma_store(use_valgrind=USE_VALGRIND)
|
||||
store_name2, self.p3 = plasma.start_plasma_store(use_valgrind=USE_VALGRIND)
|
||||
# Start a Redis server.
|
||||
redis_address = services.start_redis("127.0.0.1")
|
||||
redis_address = services.address("127.0.0.1", services.start_redis())
|
||||
# Start two PlasmaManagers.
|
||||
manager_name1, self.p4, self.port1 = plasma.start_plasma_manager(store_name1, redis_address, use_valgrind=USE_VALGRIND)
|
||||
manager_name2, self.p5, self.port2 = plasma.start_plasma_manager(store_name2, redis_address, use_valgrind=USE_VALGRIND)
|
||||
@@ -789,7 +789,7 @@ class TestPlasmaManagerRecovery(unittest.TestCase):
|
||||
# Start a Plasma store.
|
||||
self.store_name, self.p2 = plasma.start_plasma_store(use_valgrind=USE_VALGRIND)
|
||||
# Start a Redis server.
|
||||
self.redis_address = services.start_redis("127.0.0.1")
|
||||
self.redis_address = services.address("127.0.0.1", services.start_redis())
|
||||
# Start a PlasmaManager.
|
||||
manager_name, self.p3, self.port1 = plasma.start_plasma_manager(
|
||||
self.store_name,
|
||||
|
||||
+53
-29
@@ -52,8 +52,8 @@ ObjectStoreAddress = namedtuple("ObjectStoreAddress", ["name",
|
||||
"manager_name",
|
||||
"manager_port"])
|
||||
|
||||
def address(host, port):
|
||||
return host + ":" + str(port)
|
||||
def address(ip_address, port):
|
||||
return ip_address + ":" + str(port)
|
||||
|
||||
def get_port(address):
|
||||
try:
|
||||
@@ -101,7 +101,7 @@ def cleanup():
|
||||
"""When running in local mode, shutdown the Ray processes.
|
||||
|
||||
This method is used to shutdown processes that were started with
|
||||
services.start_ray_local(). It kills all scheduler, object store, and worker
|
||||
services.start_ray_head(). It kills all scheduler, object store, and worker
|
||||
processes that were started by this services module. Driver processes are
|
||||
started and disconnected by worker.py.
|
||||
"""
|
||||
@@ -140,19 +140,19 @@ def get_node_ip_address(address="8.8.8.8:53"):
|
||||
Returns:
|
||||
The IP address of the current node.
|
||||
"""
|
||||
host, port = address.split(":")
|
||||
ip_address, port = address.split(":")
|
||||
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
||||
s.connect((host, int(port)))
|
||||
s.connect((ip_address, int(port)))
|
||||
return s.getsockname()[0]
|
||||
|
||||
def wait_for_redis_to_start(redis_host, redis_port, num_retries=5):
|
||||
def wait_for_redis_to_start(redis_ip_address, redis_port, num_retries=5):
|
||||
"""Wait for a Redis server to be available.
|
||||
|
||||
This is accomplished by creating a Redis client and sending a random command
|
||||
to the server until the command gets through.
|
||||
|
||||
Args:
|
||||
redis_host (str): The IP address of the redis server.
|
||||
redis_ip_address (str): The IP address of the redis server.
|
||||
redis_port (int): The port of the redis server.
|
||||
num_retries (int): The number of times to try connecting with redis. The
|
||||
client will sleep for one second between attempts.
|
||||
@@ -160,13 +160,13 @@ def wait_for_redis_to_start(redis_host, redis_port, num_retries=5):
|
||||
Raises:
|
||||
Exception: An exception is raised if we could not connect with Redis.
|
||||
"""
|
||||
redis_client = redis.StrictRedis(host=redis_host, port=redis_port)
|
||||
redis_client = redis.StrictRedis(host=redis_ip_address, port=redis_port)
|
||||
# Wait for the Redis server to start.
|
||||
counter = 0
|
||||
while counter < num_retries:
|
||||
try:
|
||||
# Run some random command and see if it worked.
|
||||
print("Waiting for redis server at {}:{} to respond...".format(redis_host, redis_port))
|
||||
print("Waiting for redis server at {}:{} to respond...".format(redis_ip_address, redis_port))
|
||||
redis_client.client_list()
|
||||
except redis.ConnectionError as e:
|
||||
# Wait a little bit.
|
||||
@@ -178,10 +178,11 @@ def wait_for_redis_to_start(redis_host, redis_port, num_retries=5):
|
||||
if counter == num_retries:
|
||||
raise Exception("Unable to connect to Redis. If the Redis instance is on a different machine, check that your firewall is configured properly.")
|
||||
|
||||
def start_redis(node_ip_address, num_retries=20, cleanup=True, redirect_output=False):
|
||||
def start_redis(port=None, num_retries=20, cleanup=True, redirect_output=False):
|
||||
"""Start a Redis server.
|
||||
|
||||
Args:
|
||||
port (int): If provided, start a Redis server with this port.
|
||||
num_retries (int): The number of times to attempt to start Redis.
|
||||
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
|
||||
this process will be killed by services.cleanup() when the Python process
|
||||
@@ -190,7 +191,8 @@ def start_redis(node_ip_address, num_retries=20, cleanup=True, redirect_output=F
|
||||
/dev/null.
|
||||
|
||||
Returns:
|
||||
The address used by Redis.
|
||||
The port used by Redis. If a port is passed in, then the same value is
|
||||
returned.
|
||||
|
||||
Raises:
|
||||
Exception: An exception is raised if Redis could not be started.
|
||||
@@ -200,10 +202,14 @@ def start_redis(node_ip_address, num_retries=20, cleanup=True, redirect_output=F
|
||||
assert os.path.isfile(redis_filepath)
|
||||
assert os.path.isfile(redis_module)
|
||||
counter = 0
|
||||
if port is not None:
|
||||
if num_retries != 1:
|
||||
raise Exception("Num retries must be 1 if port is specified")
|
||||
else:
|
||||
port = new_port()
|
||||
while counter < num_retries:
|
||||
if counter > 0:
|
||||
print("Redis failed to start, retrying now.")
|
||||
port = new_port()
|
||||
with open(os.devnull, "w") as FNULL:
|
||||
stdout = FNULL if redirect_output else None
|
||||
stderr = FNULL if redirect_output else None
|
||||
@@ -215,6 +221,7 @@ def start_redis(node_ip_address, num_retries=20, cleanup=True, redirect_output=F
|
||||
if cleanup:
|
||||
all_processes[PROCESS_TYPE_REDIS_SERVER].append(p)
|
||||
break
|
||||
port = new_port()
|
||||
counter += 1
|
||||
if counter == num_retries:
|
||||
raise Exception("Couldn't start Redis.")
|
||||
@@ -229,8 +236,7 @@ def start_redis(node_ip_address, num_retries=20, cleanup=True, redirect_output=F
|
||||
# Configure Redis to not run in protected mode so that processes on other
|
||||
# hosts can connect to it. TODO(rkn): Do this in a more secure way.
|
||||
redis_client.config_set("protected-mode", "no")
|
||||
redis_address = address(node_ip_address, port)
|
||||
return redis_address
|
||||
return port
|
||||
|
||||
def start_global_scheduler(redis_address, cleanup=True, redirect_output=False):
|
||||
"""Start a global scheduler process.
|
||||
@@ -370,7 +376,8 @@ def start_ray_processes(address_info=None,
|
||||
worker_path=None,
|
||||
cleanup=True,
|
||||
redirect_output=False,
|
||||
include_global_scheduler=False):
|
||||
include_global_scheduler=False,
|
||||
include_redis=False):
|
||||
"""Helper method to start Ray processes.
|
||||
|
||||
Args:
|
||||
@@ -393,6 +400,8 @@ def start_ray_processes(address_info=None,
|
||||
/dev/null.
|
||||
include_global_scheduler (bool): If include_global_scheduler is True, then
|
||||
start a global scheduler process.
|
||||
include_redis (bool): If include_redis is True, then start a Redis server
|
||||
process.
|
||||
|
||||
Returns:
|
||||
A dictionary of the address information for the processes that were
|
||||
@@ -410,12 +419,26 @@ def start_ray_processes(address_info=None,
|
||||
# warning messages when it starts up. Instead of suppressing the output, we
|
||||
# should address the warnings.
|
||||
redis_address = address_info.get("redis_address")
|
||||
if redis_address is None:
|
||||
redis_address = start_redis(node_ip_address, cleanup=cleanup,
|
||||
redirect_output=redirect_output)
|
||||
address_info["redis_address"] = redis_address
|
||||
time.sleep(0.1)
|
||||
redis_port = get_port(redis_address)
|
||||
if include_redis:
|
||||
if redis_address is None:
|
||||
# Start a Redis server. The start_redis method will choose a random port.
|
||||
redis_port = start_redis(cleanup=cleanup, redirect_output=redirect_output)
|
||||
redis_address = address(node_ip_address, redis_port)
|
||||
address_info["redis_address"] = redis_address
|
||||
time.sleep(0.1)
|
||||
else:
|
||||
# A Redis address was provided, so start a Redis server with the given
|
||||
# port. TODO(rkn): We should check that the IP address corresponds to the
|
||||
# machine that this method is running on.
|
||||
redis_ip_address, redis_port = redis_address.split(":")
|
||||
new_redis_port = start_redis(port=int(redis_port),
|
||||
num_retries=1,
|
||||
cleanup=cleanup,
|
||||
redirect_output=redirect_output)
|
||||
assert redis_port == new_redis_port
|
||||
else:
|
||||
if redis_address is None:
|
||||
raise Exception("Redis address expected")
|
||||
|
||||
# Start the global scheduler, if necessary.
|
||||
if include_global_scheduler:
|
||||
@@ -519,13 +542,13 @@ def start_ray_node(node_ip_address,
|
||||
cleanup=cleanup,
|
||||
redirect_output=redirect_output)
|
||||
|
||||
def start_ray_local(address_info=None,
|
||||
node_ip_address="127.0.0.1",
|
||||
num_workers=0,
|
||||
num_local_schedulers=1,
|
||||
worker_path=None,
|
||||
cleanup=True,
|
||||
redirect_output=False):
|
||||
def start_ray_head(address_info=None,
|
||||
node_ip_address="127.0.0.1",
|
||||
num_workers=0,
|
||||
num_local_schedulers=1,
|
||||
worker_path=None,
|
||||
cleanup=True,
|
||||
redirect_output=False):
|
||||
"""Start Ray in local mode.
|
||||
|
||||
Args:
|
||||
@@ -558,4 +581,5 @@ def start_ray_local(address_info=None,
|
||||
worker_path=worker_path,
|
||||
cleanup=cleanup,
|
||||
redirect_output=redirect_output,
|
||||
include_global_scheduler=True)
|
||||
include_global_scheduler=True,
|
||||
include_redis=True)
|
||||
|
||||
@@ -649,10 +649,10 @@ def initialize_numbuf(worker=global_worker):
|
||||
register_class(RayGetArgumentError)
|
||||
|
||||
def get_address_info_from_redis_helper(redis_address, node_ip_address):
|
||||
redis_host, redis_port = redis_address.split(":")
|
||||
redis_ip_address, redis_port = redis_address.split(":")
|
||||
# For this command to work, some other client (on the same machine as Redis)
|
||||
# must have run "CONFIG SET protected-mode no".
|
||||
redis_client = redis.StrictRedis(host=redis_host, port=int(redis_port))
|
||||
redis_client = redis.StrictRedis(host=redis_ip_address, port=int(redis_port))
|
||||
# The client table prefix must be kept in sync with the file
|
||||
# "src/common/redis_module/ray_redis_module.c" where it is defined.
|
||||
REDIS_CLIENT_TABLE_PREFIX = "CL:"
|
||||
@@ -781,10 +781,10 @@ def _init(address_info=None, start_ray_local=False, object_id_seed=None,
|
||||
num_local_schedulers = 1
|
||||
# Start the scheduler, object store, and some workers. These will be killed
|
||||
# by the call to cleanup(), which happens when the Python script exits.
|
||||
address_info = services.start_ray_local(address_info=address_info,
|
||||
node_ip_address=node_ip_address,
|
||||
num_workers=num_workers,
|
||||
num_local_schedulers=num_local_schedulers)
|
||||
address_info = services.start_ray_head(address_info=address_info,
|
||||
node_ip_address=node_ip_address,
|
||||
num_workers=num_workers,
|
||||
num_local_schedulers=num_local_schedulers)
|
||||
else:
|
||||
if redis_address is None:
|
||||
raise Exception("If start_ray_local=False, then redis_address must be provided.")
|
||||
@@ -1075,8 +1075,8 @@ def connect(info, object_id_seed=None, mode=WORKER_MODE, worker=global_worker):
|
||||
worker.node_ip_address = info["node_ip_address"]
|
||||
worker.redis_address = info["redis_address"]
|
||||
# Create a Redis client.
|
||||
redis_host, redis_port = info["redis_address"].split(":")
|
||||
worker.redis_client = redis.StrictRedis(host=redis_host, port=int(redis_port))
|
||||
redis_ip_address, redis_port = info["redis_address"].split(":")
|
||||
worker.redis_client = redis.StrictRedis(host=redis_ip_address, port=int(redis_port))
|
||||
worker.lock = threading.Lock()
|
||||
# Create an object store client.
|
||||
worker.plasma_client = plasma.PlasmaClient(info["store_socket_name"], info["manager_socket_name"])
|
||||
|
||||
@@ -48,10 +48,10 @@ being caught in "lib/python/ray/workers/default_worker.py".
|
||||
# We use a driver ID of all zeros to push an error message to all drivers.
|
||||
driver_id = DRIVER_ID_LENGTH * b"\x00"
|
||||
error_key = b"Error:" + driver_id + b":" + random_string()
|
||||
redis_host, redis_port = args.redis_address.split(":")
|
||||
redis_ip_address, redis_port = args.redis_address.split(":")
|
||||
# For this command to work, some other client (on the same machine as
|
||||
# Redis) must have run "CONFIG SET protected-mode no".
|
||||
redis_client = redis.StrictRedis(host=redis_host, port=int(redis_port))
|
||||
redis_client = redis.StrictRedis(host=redis_ip_address, port=int(redis_port))
|
||||
redis_client.hmset(error_key, {"type": "worker_crash",
|
||||
"message": traceback_str,
|
||||
"note": "This error is unexpected and should not have happened."})
|
||||
|
||||
Reference in New Issue
Block a user