Allow starting multiple local schedulers. (#86)

This commit is contained in:
Robert Nishihara
2016-12-04 17:08:16 -08:00
committed by Philipp Moritz
parent 35b9dedb48
commit b5ed2f063d
6 changed files with 114 additions and 60 deletions
+43 -23
View File
@@ -111,12 +111,14 @@ def start_global_scheduler(redis_address, cleanup=True):
if cleanup:
all_processes.append(p)
def start_local_scheduler(redis_address, plasma_store_name, cleanup=True):
def start_local_scheduler(redis_address, plasma_store_name, plasma_manager_name, cleanup=True):
"""Start a local scheduler process.
Args:
redis_address (str): The address of the Redis instance.
plasma_store_name (str): The name of the plasma store socket to connect to.
plasma_manager_name (str): The name of the plasma manager socket to connect
to.
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
this process will be killed by serices.cleanup() when the Python process
that imported services exits.
@@ -124,7 +126,7 @@ def start_local_scheduler(redis_address, plasma_store_name, cleanup=True):
Return:
The name of the local scheduler socket.
"""
local_scheduler_name, p = photon.start_local_scheduler(plasma_store_name, redis_address=redis_address, use_profiler=RUN_PHOTON_PROFILER)
local_scheduler_name, p = photon.start_local_scheduler(plasma_store_name, plasma_manager_name, redis_address=redis_address, use_profiler=RUN_PHOTON_PROFILER)
if cleanup:
all_processes.append(p)
return local_scheduler_name
@@ -156,13 +158,16 @@ def start_objstore(node_ip_address, redis_address, cleanup=True):
return plasma_store_name, plasma_manager_name, plasma_manager_port
def start_worker(address_info, worker_path, cleanup=True):
def start_worker(node_ip_address, object_store_name, object_store_manager_name, local_scheduler_name, redis_port, worker_path, cleanup=True):
"""This method starts a worker process.
Args:
address_info (dict): This dictionary contains the node_ip_address,
redis_port, object_store_name, object_store_manager_name, and
local_scheduler_name.
node_ip_address (str): The IP address of the node that this worker is
running on.
object_store_name (str): The name of the object store.
object_store_manager_name (str): The name of the object store manager.
local_scheduler_name (str): The name of the local scheduler.
redis_port (int): The port that the Redis server is listening on.
worker_path (str): The path of the source code which the worker process will
run.
cleanup (bool): True if using Ray in local mode. If cleanup is true, then
@@ -171,11 +176,11 @@ def start_worker(address_info, worker_path, cleanup=True):
"""
command = ["python",
worker_path,
"--node-ip-address=" + address_info["node_ip_address"],
"--object-store-name=" + address_info["object_store_name"],
"--object-store-manager-name=" + address_info["object_store_manager_name"],
"--local-scheduler-name=" + address_info["local_scheduler_name"],
"--redis-port=" + str(address_info["redis_port"])]
"--node-ip-address=" + node_ip_address,
"--object-store-name=" + object_store_name,
"--object-store-manager-name=" + object_store_manager_name,
"--local-scheduler-name=" + local_scheduler_name,
"--redis-port=" + str(redis_port)]
p = subprocess.Popen(command)
if cleanup:
all_processes.append(p)
@@ -196,11 +201,13 @@ def start_webui(redis_port, cleanup=True):
if cleanup:
all_processes.append(p)
def start_ray_local(node_ip_address="127.0.0.1", num_workers=0, worker_path=None):
def start_ray_local(node_ip_address="127.0.0.1", num_workers=0, num_local_schedulers=1, worker_path=None):
"""Start Ray in local mode.
Args:
num_workers (int): The number of workers to start.
num_local_schedulers (int): The number of local schedulers to start. This is
also the number of plasma stores and plasma managers to start.
worker_path (str): The path of the source code that will be run by the
worker.
@@ -216,21 +223,34 @@ def start_ray_local(node_ip_address="127.0.0.1", num_workers=0, worker_path=None
time.sleep(0.1)
# Start the global scheduler.
start_global_scheduler(redis_address, cleanup=True)
# Start Plasma.
object_store_name, object_store_manager_name, object_store_manager_port = start_objstore(node_ip_address, redis_address, cleanup=True)
time.sleep(0.1)
# Start the local scheduler.
local_scheduler_name = start_local_scheduler(redis_address, object_store_name, cleanup=True)
time.sleep(0.1)
object_store_names = []
object_store_manager_names = []
local_scheduler_names = []
for _ in range(num_local_schedulers):
# Start Plasma.
object_store_name, object_store_manager_name, object_store_manager_port = start_objstore(node_ip_address, redis_address, cleanup=True)
object_store_names.append(object_store_name)
object_store_manager_names.append(object_store_manager_name)
time.sleep(0.1)
# Start the local scheduler.
local_scheduler_name = start_local_scheduler(redis_address, object_store_name, object_store_manager_name, cleanup=True)
local_scheduler_names.append(local_scheduler_name)
time.sleep(0.1)
# Aggregate the address information together.
address_info = {"node_ip_address": node_ip_address,
"redis_port": redis_port,
"object_store_name": object_store_name,
"object_store_manager_name": object_store_manager_name,
"local_scheduler_name": local_scheduler_name}
"object_store_names": object_store_names,
"object_store_manager_names": object_store_manager_names,
"local_scheduler_names": local_scheduler_names}
# Start the workers.
for _ in range(num_workers):
start_worker(address_info, worker_path, cleanup=True)
for i in range(num_workers):
start_worker(address_info["node_ip_address"],
address_info["object_store_names"][i % num_local_schedulers],
address_info["object_store_manager_names"][i % num_local_schedulers],
address_info["local_scheduler_names"][i % num_local_schedulers],
redis_port,
worker_path,
cleanup=True)
# Return the addresses of the relevant processes.
start_webui(redis_port)
return address_info
+6 -4
View File
@@ -601,7 +601,7 @@ def initialize_numbuf(worker=global_worker):
register_class(RayGetError)
register_class(RayGetArgumentError)
def init(start_ray_local=False, num_workers=None, driver_mode=SCRIPT_MODE):
def init(start_ray_local=False, num_workers=None, num_local_schedulers=1, driver_mode=SCRIPT_MODE):
"""Either connect to an existing Ray cluster or start one and connect to it.
This method handles two cases. Either a Ray cluster already exists and we
@@ -614,6 +614,8 @@ def init(start_ray_local=False, num_workers=None, driver_mode=SCRIPT_MODE):
existing Ray cluster.
num_workers (Optional[int]): The number of workers to start if
start_ray_local is True.
num_local_schedulers (Optional[int]): The number of local schedulers to
start if start_ray_local is True.
driver_mode (Optional[bool]): The mode in which to start the driver. This
should be one of SCRIPT_MODE, PYTHON_MODE, and SILENT_MODE.
@@ -636,7 +638,7 @@ def init(start_ray_local=False, num_workers=None, driver_mode=SCRIPT_MODE):
num_workers = 1 if num_workers is None else num_workers
# Start the scheduler, object store, and some workers. These will be killed
# by the call to cleanup(), which happens when the Python script exits.
address_info = services.start_ray_local(num_workers=num_workers)
address_info = services.start_ray_local(num_workers=num_workers, num_local_schedulers=num_local_schedulers)
else:
raise Exception("This mode is currently not enabled.")
# Connect this driver to Redis, the object store, and the local scheduler. The
@@ -828,9 +830,9 @@ def connect(address_info, mode=WORKER_MODE, worker=global_worker):
worker.redis_client.config_set("notify-keyspace-events", "AKE")
worker.lock = threading.Lock()
# Create an object store client.
worker.plasma_client = plasma.PlasmaClient(address_info["object_store_name"], address_info["object_store_manager_name"])
worker.plasma_client = plasma.PlasmaClient(address_info["object_store_names"][0], address_info["object_store_manager_names"][0])
# Create the local scheduler client.
worker.photon_client = photon.PhotonClient(address_info["local_scheduler_name"])
worker.photon_client = photon.PhotonClient(address_info["local_scheduler_names"][0])
# Register the worker with Redis.
if mode in [SCRIPT_MODE, SILENT_MODE]:
worker.redis_client.rpush("Drivers", worker.worker_id)
+3 -3
View File
@@ -17,9 +17,9 @@ if __name__ == "__main__":
args = parser.parse_args()
address_info = {"node_ip_address": args.node_ip_address,
"redis_port": args.redis_port,
"object_store_name": args.object_store_name,
"object_store_manager_name": args.object_store_manager_name,
"local_scheduler_name": args.local_scheduler_name}
"object_store_names": [args.object_store_name],
"object_store_manager_names": [args.object_store_manager_name],
"local_scheduler_names": [args.local_scheduler_name]}
ray.worker.connect(address_info, ray.WORKER_MODE)
ray.worker.main_loop()