mirror of
https://github.com/wassname/ray.git
synced 2026-06-30 16:49:48 +08:00
Let worker get worker address and object store address from scheduler (#350)
This commit is contained in:
committed by
Philipp Moritz
parent
b71f064f3e
commit
ac363bf451
+36
-49
@@ -14,7 +14,6 @@ _services_env["PATH"] = os.pathsep.join([os.path.dirname(os.path.abspath(__file_
|
||||
# mode.
|
||||
all_processes = []
|
||||
|
||||
IP_ADDRESS = "127.0.0.1"
|
||||
TIMEOUT_SECONDS = 5
|
||||
|
||||
def address(host, port):
|
||||
@@ -26,18 +25,6 @@ def new_scheduler_port():
|
||||
scheduler_port_counter += 1
|
||||
return 10000 + scheduler_port_counter
|
||||
|
||||
worker_port_counter = 0
|
||||
def new_worker_port():
|
||||
global worker_port_counter
|
||||
worker_port_counter += 1
|
||||
return 40000 + worker_port_counter
|
||||
|
||||
driver_port_counter = 0
|
||||
def new_driver_port():
|
||||
global driver_port_counter
|
||||
driver_port_counter += 1
|
||||
return 30000 + driver_port_counter
|
||||
|
||||
objstore_port_counter = 0
|
||||
def new_objstore_port():
|
||||
global objstore_port_counter
|
||||
@@ -53,23 +40,23 @@ def cleanup():
|
||||
started and disconnected by worker.py.
|
||||
"""
|
||||
global all_processes
|
||||
for p, address in all_processes:
|
||||
successfully_shut_down = True
|
||||
for p in all_processes:
|
||||
if p.poll() is not None: # process has already terminated
|
||||
print "Process at address " + address + " has already terminated."
|
||||
continue
|
||||
print "Attempting to kill process at address " + address + "."
|
||||
p.kill()
|
||||
time.sleep(0.05) # is this necessary?
|
||||
if p.poll() is not None:
|
||||
print "Successfully killed process at address " + address + "."
|
||||
continue
|
||||
print "Kill attempt failed, attempting to terminate process at address " + address + "."
|
||||
p.terminate()
|
||||
time.sleep(0.05) # is this necessary?
|
||||
if p.poll is not None:
|
||||
print "Successfully terminated process at address " + address + "."
|
||||
continue
|
||||
print "Termination attempt failed, giving up."
|
||||
successfully_shut_down = False
|
||||
if successfully_shut_down:
|
||||
print "Successfully shut down Ray."
|
||||
else:
|
||||
print "Ray did not shut down properly."
|
||||
all_processes = []
|
||||
|
||||
def start_scheduler(scheduler_address, local):
|
||||
@@ -83,7 +70,7 @@ def start_scheduler(scheduler_address, local):
|
||||
"""
|
||||
p = subprocess.Popen(["scheduler", scheduler_address, "--log-file-name", config.get_log_file_path("scheduler.log")], env=_services_env)
|
||||
if local:
|
||||
all_processes.append((p, scheduler_address))
|
||||
all_processes.append(p)
|
||||
|
||||
def start_objstore(scheduler_address, objstore_address, local):
|
||||
"""This method starts an object store process.
|
||||
@@ -98,38 +85,40 @@ def start_objstore(scheduler_address, objstore_address, local):
|
||||
"""
|
||||
p = subprocess.Popen(["objstore", scheduler_address, objstore_address, "--log-file-name", config.get_log_file_path("-".join(["objstore", objstore_address]) + ".log")], env=_services_env)
|
||||
if local:
|
||||
all_processes.append((p, objstore_address))
|
||||
all_processes.append(p)
|
||||
|
||||
def start_worker(worker_path, scheduler_address, objstore_address, worker_address, local, user_source_directory=None):
|
||||
def start_worker(node_ip_address, worker_path, scheduler_address, objstore_address=None, local=True, user_source_directory=None):
|
||||
"""This method starts a worker process.
|
||||
|
||||
Args:
|
||||
node_ip_address (str): The IP address of the node that the worker runs on.
|
||||
worker_path (str): The path of the source code which the worker process will
|
||||
run.
|
||||
scheduler_address (str): The ip address and port of the scheduler to connect
|
||||
to.
|
||||
objstore_address (str): The ip address and port of the object store to
|
||||
connect to.
|
||||
worker_address (str): The ip address and port to use for the worker.
|
||||
local (bool): True if using Ray in local mode. If local is true, then this
|
||||
process will be killed by services.cleanup() when the Python process that
|
||||
imported services exits.
|
||||
user_source_directory (str): The directory containing the application code.
|
||||
This directory will be added to the path of each worker. If not provided,
|
||||
the directory of the script currently being run is used.
|
||||
objstore_address (Optional[str]): The ip address and port of the object
|
||||
store to connect to.
|
||||
local (Optional[bool]): True if using Ray in local mode. If local is true,
|
||||
then this process will be killed by services.cleanup() when the Python
|
||||
process that imported services exits. This is True by default.
|
||||
user_source_directory (Optional[str]): The directory containing the
|
||||
application code. This directory will be added to the path of each worker.
|
||||
If not provided, the directory of the script currently being run is used.
|
||||
"""
|
||||
if user_source_directory is None:
|
||||
# This extracts the directory of the script that is currently being run.
|
||||
# This will allow users to import modules contained in this directory.
|
||||
user_source_directory = os.path.dirname(os.path.abspath(os.path.join(os.path.curdir, sys.argv[0])))
|
||||
p = subprocess.Popen(["python",
|
||||
worker_path,
|
||||
"--user-source-directory=" + user_source_directory,
|
||||
"--scheduler-address=" + scheduler_address,
|
||||
"--objstore-address=" + objstore_address,
|
||||
"--worker-address=" + worker_address])
|
||||
command = ["python",
|
||||
worker_path,
|
||||
"--node-ip-address=" + node_ip_address,
|
||||
"--user-source-directory=" + user_source_directory,
|
||||
"--scheduler-address=" + scheduler_address]
|
||||
if objstore_address is not None:
|
||||
command.append("--objstore-address=" + objstore_address)
|
||||
p = subprocess.Popen(command)
|
||||
if local:
|
||||
all_processes.append((p, worker_address))
|
||||
all_processes.append(p)
|
||||
|
||||
def start_node(scheduler_address, node_ip_address, num_workers, worker_path=None, user_source_directory=None):
|
||||
"""Start an object store and associated workers in the cluster setting.
|
||||
@@ -153,7 +142,7 @@ def start_node(scheduler_address, node_ip_address, num_workers, worker_path=None
|
||||
if worker_path is None:
|
||||
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../scripts/default_worker.py")
|
||||
for _ in range(num_workers):
|
||||
start_worker(worker_path, scheduler_address, objstore_address, address(node_ip_address, new_worker_port()), user_source_directory=user_source_directory, local=False)
|
||||
start_worker(node_ip_address, worker_path, scheduler_address, objstore_address=objstore_address, user_source_directory=user_source_directory, local=False)
|
||||
time.sleep(0.5)
|
||||
|
||||
def start_workers(scheduler_address, objstore_address, num_workers, worker_path):
|
||||
@@ -174,9 +163,9 @@ def start_workers(scheduler_address, objstore_address, num_workers, worker_path)
|
||||
"""
|
||||
node_ip_address = objstore_address.split(":")[0]
|
||||
for _ in range(num_workers):
|
||||
start_worker(worker_path, scheduler_address, objstore_address, address(node_ip_address, new_worker_port()), local=False)
|
||||
start_worker(node_ip_address, worker_path, scheduler_address, objstore_address=objstore_address, local=False)
|
||||
|
||||
def start_ray_local(num_objstores=1, num_workers=0, worker_path=None):
|
||||
def start_ray_local(node_ip_address="127.0.0.1", num_objstores=1, num_workers=0, worker_path=None):
|
||||
"""Start Ray in local mode.
|
||||
|
||||
This method starts Ray in local mode (as opposed to cluster mode, which is
|
||||
@@ -190,20 +179,19 @@ def start_ray_local(num_objstores=1, num_workers=0, worker_path=None):
|
||||
worker.
|
||||
|
||||
Returns:
|
||||
The address of the scheduler, the addresses of all of the object stores, and
|
||||
the one new driver address for each object store.
|
||||
The address of the scheduler and the addresses of all of the object stores.
|
||||
"""
|
||||
if worker_path is None:
|
||||
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../scripts/default_worker.py")
|
||||
if num_workers > 0 and num_objstores < 1:
|
||||
raise Exception("Attempting to start a cluster with {} workers per object store, but `num_objstores` is {}.".format(num_objstores))
|
||||
scheduler_address = address(IP_ADDRESS, new_scheduler_port())
|
||||
scheduler_address = address(node_ip_address, new_scheduler_port())
|
||||
start_scheduler(scheduler_address, local=True)
|
||||
time.sleep(0.1)
|
||||
objstore_addresses = []
|
||||
# create objstores
|
||||
for i in range(num_objstores):
|
||||
objstore_address = address(IP_ADDRESS, new_objstore_port())
|
||||
objstore_address = address(node_ip_address, new_objstore_port())
|
||||
objstore_addresses.append(objstore_address)
|
||||
start_objstore(scheduler_address, objstore_address, local=True)
|
||||
time.sleep(0.2)
|
||||
@@ -214,8 +202,7 @@ def start_ray_local(num_objstores=1, num_workers=0, worker_path=None):
|
||||
# remaining number of workers.
|
||||
num_workers_to_start = num_workers - (num_objstores - 1) * (num_workers / num_objstores)
|
||||
for _ in range(num_workers_to_start):
|
||||
start_worker(worker_path, scheduler_address, objstore_address, address(IP_ADDRESS, new_worker_port()), local=True)
|
||||
start_worker(node_ip_address, worker_path, scheduler_address, objstore_address=objstore_address, local=True)
|
||||
time.sleep(0.3)
|
||||
|
||||
driver_addresses = [address(IP_ADDRESS, new_driver_port()) for _ in range(num_objstores)]
|
||||
return scheduler_address, objstore_addresses, driver_addresses
|
||||
return scheduler_address, objstore_addresses
|
||||
|
||||
+20
-22
@@ -658,7 +658,7 @@ def register_module(module, worker=global_worker):
|
||||
_logger().info("registering {}.".format(val.func_name))
|
||||
worker.register_function(val)
|
||||
|
||||
def init(start_ray_local=False, num_workers=None, num_objstores=None, scheduler_address=None, objstore_address=None, driver_address=None, driver_mode=SCRIPT_MODE):
|
||||
def init(start_ray_local=False, num_workers=None, num_objstores=None, scheduler_address=None, node_ip_address=None, driver_mode=SCRIPT_MODE):
|
||||
"""Either connect to an existing Ray cluster or start one and connect to it.
|
||||
|
||||
This method handles two cases. Either a Ray cluster already exists and we
|
||||
@@ -675,10 +675,9 @@ def init(start_ray_local=False, num_workers=None, num_objstores=None, scheduler_
|
||||
start_ray_local is True.
|
||||
scheduler_address (Optional[str]): The address of the scheduler to connect
|
||||
to if start_ray_local is False.
|
||||
objstore_address (Optional[str]): The address of the object store to connect
|
||||
to if start_ray_local is False.
|
||||
driver_address (Optional[str]): The address of this driver if
|
||||
start_ray_local is False.
|
||||
node_ip_address (Optional[str]): The address of the node the worker is
|
||||
running on. It is required if start_ray_local is False and it cannot be
|
||||
provided otherwise.
|
||||
driver_mode (Optional[bool]): The mode in which to start the driver. This
|
||||
should be one of SCRIPT_MODE, PYTHON_MODE, and SILENT_MODE.
|
||||
|
||||
@@ -689,28 +688,28 @@ def init(start_ray_local=False, num_workers=None, num_objstores=None, scheduler_
|
||||
if start_ray_local:
|
||||
# In this case, we launch a scheduler, a new object store, and some workers,
|
||||
# and we connect to them.
|
||||
if (scheduler_address is not None) or (objstore_address is not None) or (driver_address is not None):
|
||||
raise Exception("If start_ray_local=True, then you cannot pass in a scheduler_address, objstore_address, or worker_address.")
|
||||
if (scheduler_address is not None) or (node_ip_address is not None):
|
||||
raise Exception("If start_ray_local=True, then you cannot pass in a scheduler_address or a node_ip_address.")
|
||||
if driver_mode not in [SCRIPT_MODE, PYTHON_MODE, SILENT_MODE]:
|
||||
raise Exception("If start_ray_local=True, then driver_mode must be in [SCRIPT_MODE, PYTHON_MODE, SILENT_MODE].")
|
||||
# Use the address 127.0.0.1 in local mode.
|
||||
node_ip_address = "127.0.0.1"
|
||||
num_workers = 1 if num_workers is None else num_workers
|
||||
num_objstores = 1 if num_objstores is None else num_objstores
|
||||
# Start the scheduler, object store, and some workers. These will be killed
|
||||
# by the call to cleanup(), which happens when the Python script exits.
|
||||
scheduler_address, objstore_addresses, driver_addresses = services.start_ray_local(num_objstores=num_objstores, num_workers=num_workers, worker_path=None)
|
||||
# It is possible for start_ray_local to return multiple object stores, but
|
||||
# we will only connect the driver to one of them.
|
||||
objstore_address = objstore_addresses[0]
|
||||
driver_address = driver_addresses[0]
|
||||
scheduler_address, _ = services.start_ray_local(num_objstores=num_objstores, num_workers=num_workers, worker_path=None)
|
||||
else:
|
||||
# In this case, there is an existing scheduler and object store, and we do
|
||||
# not need to start any processes.
|
||||
if (num_workers is not None) or (num_objstores is not None):
|
||||
raise Exception("The arguments num_workers and num_objstores must not be provided unless start_ray_local=True.")
|
||||
if node_ip_address is None:
|
||||
raise Exception("When start_ray_local=False, the node_ip_address of the current node must be provided.")
|
||||
# Connect this driver to the scheduler and object store. The corresponding call
|
||||
# to disconnect will happen in the call to cleanup() when the Python script
|
||||
# exits.
|
||||
connect(scheduler_address, objstore_address, driver_address, is_driver=True, worker=global_worker, mode=driver_mode)
|
||||
connect(node_ip_address, scheduler_address, is_driver=True, worker=global_worker, mode=driver_mode)
|
||||
|
||||
def cleanup(worker=global_worker):
|
||||
"""Disconnect the driver, and terminate any processes started in init.
|
||||
@@ -726,14 +725,15 @@ def cleanup(worker=global_worker):
|
||||
|
||||
atexit.register(cleanup)
|
||||
|
||||
def connect(scheduler_address, objstore_address, worker_address, is_driver=False, worker=global_worker, mode=WORKER_MODE):
|
||||
def connect(node_ip_address, scheduler_address, objstore_address=None, is_driver=False, worker=global_worker, mode=WORKER_MODE):
|
||||
"""Connect this worker to the scheduler and an object store.
|
||||
|
||||
Args:
|
||||
node_ip_address (str): The ip address of the node the worker runs on.
|
||||
scheduler_address (str): The ip address and port of the scheduler.
|
||||
objstore_address (str): The ip address and port of the local object store.
|
||||
worker_address (str): The ip address and port of this worker. The port can
|
||||
be chosen arbitrarily.
|
||||
objstore_address (Optional[str]): The ip address and port of the local
|
||||
object store. Normally, this argument should be omitted and the scheduler
|
||||
will tell the worker what object store to connect to.
|
||||
is_driver (bool): True if this worker is a driver and false otherwise.
|
||||
mode: The mode of the worker. One of SCRIPT_MODE, WORKER_MODE, PYTHON_MODE,
|
||||
and SILENT_MODE.
|
||||
@@ -741,22 +741,20 @@ def connect(scheduler_address, objstore_address, worker_address, is_driver=False
|
||||
if hasattr(worker, "handle"):
|
||||
del worker.handle
|
||||
worker.scheduler_address = scheduler_address
|
||||
worker.objstore_address = objstore_address
|
||||
worker.worker_address = worker_address
|
||||
worker.handle = raylib.create_worker(worker.scheduler_address, worker.objstore_address, worker.worker_address, is_driver)
|
||||
worker.handle, worker.worker_address = raylib.create_worker(node_ip_address, scheduler_address, objstore_address if objstore_address is not None else "", is_driver)
|
||||
worker.set_mode(mode)
|
||||
FORMAT = "%(asctime)-15s %(message)s"
|
||||
# Configure the Python logging module. Note that if we do not provide our own
|
||||
# logger, then our logging will interfere with other Python modules that also
|
||||
# use the logging module.
|
||||
log_handler = logging.FileHandler(config.get_log_file_path("-".join(["worker", worker_address]) + ".log"))
|
||||
log_handler = logging.FileHandler(config.get_log_file_path("-".join(["worker", worker.worker_address]) + ".log"))
|
||||
log_handler.setLevel(logging.DEBUG)
|
||||
log_handler.setFormatter(logging.Formatter(FORMAT))
|
||||
_logger().addHandler(log_handler)
|
||||
_logger().setLevel(logging.DEBUG)
|
||||
_logger().propagate = False
|
||||
# Configure the logging from the worker C++ code.
|
||||
raylib.set_log_config(config.get_log_file_path("-".join(["worker", worker_address, "c++"]) + ".log"))
|
||||
raylib.set_log_config(config.get_log_file_path("-".join(["worker", worker.worker_address, "c++"]) + ".log"))
|
||||
if mode in [SCRIPT_MODE, SILENT_MODE]:
|
||||
for function_to_export in worker.cached_remote_functions:
|
||||
raylib.export_function(worker.handle, function_to_export)
|
||||
|
||||
Reference in New Issue
Block a user