Let worker get worker address and object store address from scheduler (#350)

This commit is contained in:
Robert Nishihara
2016-08-04 17:47:08 -07:00
committed by Philipp Moritz
parent b71f064f3e
commit ac363bf451
13 changed files with 165 additions and 147 deletions
+36 -49
View File
@@ -14,7 +14,6 @@ _services_env["PATH"] = os.pathsep.join([os.path.dirname(os.path.abspath(__file_
# mode.
all_processes = []
IP_ADDRESS = "127.0.0.1"
TIMEOUT_SECONDS = 5
def address(host, port):
@@ -26,18 +25,6 @@ def new_scheduler_port():
scheduler_port_counter += 1
return 10000 + scheduler_port_counter
worker_port_counter = 0
def new_worker_port():
global worker_port_counter
worker_port_counter += 1
return 40000 + worker_port_counter
driver_port_counter = 0
def new_driver_port():
global driver_port_counter
driver_port_counter += 1
return 30000 + driver_port_counter
objstore_port_counter = 0
def new_objstore_port():
global objstore_port_counter
@@ -53,23 +40,23 @@ def cleanup():
started and disconnected by worker.py.
"""
global all_processes
for p, address in all_processes:
successfully_shut_down = True
for p in all_processes:
if p.poll() is not None: # process has already terminated
print "Process at address " + address + " has already terminated."
continue
print "Attempting to kill process at address " + address + "."
p.kill()
time.sleep(0.05) # is this necessary?
if p.poll() is not None:
print "Successfully killed process at address " + address + "."
continue
print "Kill attempt failed, attempting to terminate process at address " + address + "."
p.terminate()
time.sleep(0.05) # is this necessary?
if p.poll is not None:
print "Successfully terminated process at address " + address + "."
continue
print "Termination attempt failed, giving up."
successfully_shut_down = False
if successfully_shut_down:
print "Successfully shut down Ray."
else:
print "Ray did not shut down properly."
all_processes = []
def start_scheduler(scheduler_address, local):
@@ -83,7 +70,7 @@ def start_scheduler(scheduler_address, local):
"""
p = subprocess.Popen(["scheduler", scheduler_address, "--log-file-name", config.get_log_file_path("scheduler.log")], env=_services_env)
if local:
all_processes.append((p, scheduler_address))
all_processes.append(p)
def start_objstore(scheduler_address, objstore_address, local):
"""This method starts an object store process.
@@ -98,38 +85,40 @@ def start_objstore(scheduler_address, objstore_address, local):
"""
p = subprocess.Popen(["objstore", scheduler_address, objstore_address, "--log-file-name", config.get_log_file_path("-".join(["objstore", objstore_address]) + ".log")], env=_services_env)
if local:
all_processes.append((p, objstore_address))
all_processes.append(p)
def start_worker(worker_path, scheduler_address, objstore_address, worker_address, local, user_source_directory=None):
def start_worker(node_ip_address, worker_path, scheduler_address, objstore_address=None, local=True, user_source_directory=None):
"""This method starts a worker process.
Args:
node_ip_address (str): The IP address of the node that the worker runs on.
worker_path (str): The path of the source code which the worker process will
run.
scheduler_address (str): The ip address and port of the scheduler to connect
to.
objstore_address (str): The ip address and port of the object store to
connect to.
worker_address (str): The ip address and port to use for the worker.
local (bool): True if using Ray in local mode. If local is true, then this
process will be killed by services.cleanup() when the Python process that
imported services exits.
user_source_directory (str): The directory containing the application code.
This directory will be added to the path of each worker. If not provided,
the directory of the script currently being run is used.
objstore_address (Optional[str]): The ip address and port of the object
store to connect to.
local (Optional[bool]): True if using Ray in local mode. If local is true,
then this process will be killed by services.cleanup() when the Python
process that imported services exits. This is True by default.
user_source_directory (Optional[str]): The directory containing the
application code. This directory will be added to the path of each worker.
If not provided, the directory of the script currently being run is used.
"""
if user_source_directory is None:
# This extracts the directory of the script that is currently being run.
# This will allow users to import modules contained in this directory.
user_source_directory = os.path.dirname(os.path.abspath(os.path.join(os.path.curdir, sys.argv[0])))
p = subprocess.Popen(["python",
worker_path,
"--user-source-directory=" + user_source_directory,
"--scheduler-address=" + scheduler_address,
"--objstore-address=" + objstore_address,
"--worker-address=" + worker_address])
command = ["python",
worker_path,
"--node-ip-address=" + node_ip_address,
"--user-source-directory=" + user_source_directory,
"--scheduler-address=" + scheduler_address]
if objstore_address is not None:
command.append("--objstore-address=" + objstore_address)
p = subprocess.Popen(command)
if local:
all_processes.append((p, worker_address))
all_processes.append(p)
def start_node(scheduler_address, node_ip_address, num_workers, worker_path=None, user_source_directory=None):
"""Start an object store and associated workers in the cluster setting.
@@ -153,7 +142,7 @@ def start_node(scheduler_address, node_ip_address, num_workers, worker_path=None
if worker_path is None:
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../scripts/default_worker.py")
for _ in range(num_workers):
start_worker(worker_path, scheduler_address, objstore_address, address(node_ip_address, new_worker_port()), user_source_directory=user_source_directory, local=False)
start_worker(node_ip_address, worker_path, scheduler_address, objstore_address=objstore_address, user_source_directory=user_source_directory, local=False)
time.sleep(0.5)
def start_workers(scheduler_address, objstore_address, num_workers, worker_path):
@@ -174,9 +163,9 @@ def start_workers(scheduler_address, objstore_address, num_workers, worker_path)
"""
node_ip_address = objstore_address.split(":")[0]
for _ in range(num_workers):
start_worker(worker_path, scheduler_address, objstore_address, address(node_ip_address, new_worker_port()), local=False)
start_worker(node_ip_address, worker_path, scheduler_address, objstore_address=objstore_address, local=False)
def start_ray_local(num_objstores=1, num_workers=0, worker_path=None):
def start_ray_local(node_ip_address="127.0.0.1", num_objstores=1, num_workers=0, worker_path=None):
"""Start Ray in local mode.
This method starts Ray in local mode (as opposed to cluster mode, which is
@@ -190,20 +179,19 @@ def start_ray_local(num_objstores=1, num_workers=0, worker_path=None):
worker.
Returns:
The address of the scheduler, the addresses of all of the object stores, and
the one new driver address for each object store.
The address of the scheduler and the addresses of all of the object stores.
"""
if worker_path is None:
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../scripts/default_worker.py")
if num_workers > 0 and num_objstores < 1:
raise Exception("Attempting to start a cluster with {} workers per object store, but `num_objstores` is {}.".format(num_objstores))
scheduler_address = address(IP_ADDRESS, new_scheduler_port())
scheduler_address = address(node_ip_address, new_scheduler_port())
start_scheduler(scheduler_address, local=True)
time.sleep(0.1)
objstore_addresses = []
# create objstores
for i in range(num_objstores):
objstore_address = address(IP_ADDRESS, new_objstore_port())
objstore_address = address(node_ip_address, new_objstore_port())
objstore_addresses.append(objstore_address)
start_objstore(scheduler_address, objstore_address, local=True)
time.sleep(0.2)
@@ -214,8 +202,7 @@ def start_ray_local(num_objstores=1, num_workers=0, worker_path=None):
# remaining number of workers.
num_workers_to_start = num_workers - (num_objstores - 1) * (num_workers / num_objstores)
for _ in range(num_workers_to_start):
start_worker(worker_path, scheduler_address, objstore_address, address(IP_ADDRESS, new_worker_port()), local=True)
start_worker(node_ip_address, worker_path, scheduler_address, objstore_address=objstore_address, local=True)
time.sleep(0.3)
driver_addresses = [address(IP_ADDRESS, new_driver_port()) for _ in range(num_objstores)]
return scheduler_address, objstore_addresses, driver_addresses
return scheduler_address, objstore_addresses
+20 -22
View File
@@ -658,7 +658,7 @@ def register_module(module, worker=global_worker):
_logger().info("registering {}.".format(val.func_name))
worker.register_function(val)
def init(start_ray_local=False, num_workers=None, num_objstores=None, scheduler_address=None, objstore_address=None, driver_address=None, driver_mode=SCRIPT_MODE):
def init(start_ray_local=False, num_workers=None, num_objstores=None, scheduler_address=None, node_ip_address=None, driver_mode=SCRIPT_MODE):
"""Either connect to an existing Ray cluster or start one and connect to it.
This method handles two cases. Either a Ray cluster already exists and we
@@ -675,10 +675,9 @@ def init(start_ray_local=False, num_workers=None, num_objstores=None, scheduler_
start_ray_local is True.
scheduler_address (Optional[str]): The address of the scheduler to connect
to if start_ray_local is False.
objstore_address (Optional[str]): The address of the object store to connect
to if start_ray_local is False.
driver_address (Optional[str]): The address of this driver if
start_ray_local is False.
node_ip_address (Optional[str]): The address of the node the worker is
running on. It is required if start_ray_local is False and it cannot be
provided otherwise.
driver_mode (Optional[bool]): The mode in which to start the driver. This
should be one of SCRIPT_MODE, PYTHON_MODE, and SILENT_MODE.
@@ -689,28 +688,28 @@ def init(start_ray_local=False, num_workers=None, num_objstores=None, scheduler_
if start_ray_local:
# In this case, we launch a scheduler, a new object store, and some workers,
# and we connect to them.
if (scheduler_address is not None) or (objstore_address is not None) or (driver_address is not None):
raise Exception("If start_ray_local=True, then you cannot pass in a scheduler_address, objstore_address, or worker_address.")
if (scheduler_address is not None) or (node_ip_address is not None):
raise Exception("If start_ray_local=True, then you cannot pass in a scheduler_address or a node_ip_address.")
if driver_mode not in [SCRIPT_MODE, PYTHON_MODE, SILENT_MODE]:
raise Exception("If start_ray_local=True, then driver_mode must be in [SCRIPT_MODE, PYTHON_MODE, SILENT_MODE].")
# Use the address 127.0.0.1 in local mode.
node_ip_address = "127.0.0.1"
num_workers = 1 if num_workers is None else num_workers
num_objstores = 1 if num_objstores is None else num_objstores
# Start the scheduler, object store, and some workers. These will be killed
# by the call to cleanup(), which happens when the Python script exits.
scheduler_address, objstore_addresses, driver_addresses = services.start_ray_local(num_objstores=num_objstores, num_workers=num_workers, worker_path=None)
# It is possible for start_ray_local to return multiple object stores, but
# we will only connect the driver to one of them.
objstore_address = objstore_addresses[0]
driver_address = driver_addresses[0]
scheduler_address, _ = services.start_ray_local(num_objstores=num_objstores, num_workers=num_workers, worker_path=None)
else:
# In this case, there is an existing scheduler and object store, and we do
# not need to start any processes.
if (num_workers is not None) or (num_objstores is not None):
raise Exception("The arguments num_workers and num_objstores must not be provided unless start_ray_local=True.")
if node_ip_address is None:
raise Exception("When start_ray_local=False, the node_ip_address of the current node must be provided.")
# Connect this driver to the scheduler and object store. The corresponding call
# to disconnect will happen in the call to cleanup() when the Python script
# exits.
connect(scheduler_address, objstore_address, driver_address, is_driver=True, worker=global_worker, mode=driver_mode)
connect(node_ip_address, scheduler_address, is_driver=True, worker=global_worker, mode=driver_mode)
def cleanup(worker=global_worker):
"""Disconnect the driver, and terminate any processes started in init.
@@ -726,14 +725,15 @@ def cleanup(worker=global_worker):
atexit.register(cleanup)
def connect(scheduler_address, objstore_address, worker_address, is_driver=False, worker=global_worker, mode=WORKER_MODE):
def connect(node_ip_address, scheduler_address, objstore_address=None, is_driver=False, worker=global_worker, mode=WORKER_MODE):
"""Connect this worker to the scheduler and an object store.
Args:
node_ip_address (str): The ip address of the node the worker runs on.
scheduler_address (str): The ip address and port of the scheduler.
objstore_address (str): The ip address and port of the local object store.
worker_address (str): The ip address and port of this worker. The port can
be chosen arbitrarily.
objstore_address (Optional[str]): The ip address and port of the local
object store. Normally, this argument should be omitted and the scheduler
will tell the worker what object store to connect to.
is_driver (bool): True if this worker is a driver and false otherwise.
mode: The mode of the worker. One of SCRIPT_MODE, WORKER_MODE, PYTHON_MODE,
and SILENT_MODE.
@@ -741,22 +741,20 @@ def connect(scheduler_address, objstore_address, worker_address, is_driver=False
if hasattr(worker, "handle"):
del worker.handle
worker.scheduler_address = scheduler_address
worker.objstore_address = objstore_address
worker.worker_address = worker_address
worker.handle = raylib.create_worker(worker.scheduler_address, worker.objstore_address, worker.worker_address, is_driver)
worker.handle, worker.worker_address = raylib.create_worker(node_ip_address, scheduler_address, objstore_address if objstore_address is not None else "", is_driver)
worker.set_mode(mode)
FORMAT = "%(asctime)-15s %(message)s"
# Configure the Python logging module. Note that if we do not provide our own
# logger, then our logging will interfere with other Python modules that also
# use the logging module.
log_handler = logging.FileHandler(config.get_log_file_path("-".join(["worker", worker_address]) + ".log"))
log_handler = logging.FileHandler(config.get_log_file_path("-".join(["worker", worker.worker_address]) + ".log"))
log_handler.setLevel(logging.DEBUG)
log_handler.setFormatter(logging.Formatter(FORMAT))
_logger().addHandler(log_handler)
_logger().setLevel(logging.DEBUG)
_logger().propagate = False
# Configure the logging from the worker C++ code.
raylib.set_log_config(config.get_log_file_path("-".join(["worker", worker_address, "c++"]) + ".log"))
raylib.set_log_config(config.get_log_file_path("-".join(["worker", worker.worker_address, "c++"]) + ".log"))
if mode in [SCRIPT_MODE, SILENT_MODE]:
for function_to_export in worker.cached_remote_functions:
raylib.export_function(worker.handle, function_to_export)