unify starting local cluster with attaching to existing cluster (#327)

This commit is contained in:
Robert Nishihara
2016-07-31 19:26:35 -07:00
committed by Philipp Moritz
parent 0e5b858324
commit 2040372084
17 changed files with 104 additions and 90 deletions
+1 -1
View File
@@ -21,7 +21,7 @@ if hasattr(ctypes, "windll"):
import config
import libraylib as lib
import serialization
from worker import scheduler_info, visualize_computation_graph, task_info, register_module, connect, disconnect, get, put, remote, kill_workers, restart_workers_local
from worker import scheduler_info, visualize_computation_graph, task_info, register_module, init, connect, disconnect, get, put, remote, kill_workers, restart_workers_local
from worker import Reusable, reusables
from libraylib import ObjRef
import internal
+7 -11
View File
@@ -15,7 +15,7 @@ _services_env["PATH"] = os.pathsep.join([os.path.dirname(os.path.abspath(__file_
# mode.
all_processes = []
# drivers is a list of the worker objects corresponding to drivers if
# start_services_local is run with return_drivers=True.
# start_ray_local is run with return_drivers=True.
drivers = []
IP_ADDRESS = "127.0.0.1"
@@ -189,14 +189,16 @@ def start_workers(scheduler_address, objstore_address, num_workers, worker_path)
for _ in range(num_workers):
start_worker(worker_path, scheduler_address, objstore_address, address(node_ip_address, new_worker_port()), local=False)
def start_ray_local(num_workers=0, worker_path=None, driver_mode=ray.SCRIPT_MODE):
def start_ray_local(num_objstores=1, num_workers_per_objstore=0, worker_path=None, driver_mode=ray.SCRIPT_MODE, return_drivers=False):
"""Start Ray in local mode.
This method starts Ray in local mode (as opposed to cluster mode, which is
handled by cluster.py).
Args:
num_workers (int): The number of workers to start.
num_objstores (int): The number of object stores to start.
num_workers_per_objstore (int): The number of workers to start per object
store.
worker_path (str): The path of the source code that will be run by the
worker
driver_mode: The mode for the driver, this only affects the printing of
@@ -205,17 +207,11 @@ def start_ray_local(num_workers=0, worker_path=None, driver_mode=ray.SCRIPT_MODE
in the shell. It should be ray.PYTHON_MODE to run things in a manner
equivalent to serial Python code. It should be ray.WORKER_MODE to suppress
the printing of error messages.
return_drivers (bool): This should only be True in special cases for tests.
"""
global drivers
if worker_path is None:
worker_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../scripts/default_worker.py")
start_services_local(num_objstores=1, num_workers_per_objstore=num_workers, worker_path=worker_path, driver_mode=driver_mode)
# This is a helper method which is only used in the tests and should not be
# called by users
def start_services_local(num_objstores=1, num_workers_per_objstore=0, worker_path=None, driver_mode=ray.SCRIPT_MODE, return_drivers=False):
global drivers
if num_workers_per_objstore > 0 and worker_path is None:
raise Exception("Attempting to start a cluster with {} workers per object store, but `worker_path` is None.".format(num_workers_per_objstore))
if num_workers_per_objstore > 0 and num_objstores < 1:
raise Exception("Attempting to start a cluster with {} workers per object store, but `num_objstores` is {}.".format(num_objstores))
scheduler_address = address(IP_ADDRESS, new_scheduler_port())
+45 -2
View File
@@ -447,7 +447,7 @@ def check_connected(worker=global_worker):
Exception: An exception is raised if the worker is not connected.
"""
if worker.handle is None:
raise Exception("This command cannot be called before a Ray cluster has been started. You can start one with 'ray.services.start_ray_local(num_workers=1)'.")
raise Exception("This command cannot be called before a Ray cluster has been started. You can start one with 'ray.init(start_ray_local=True, num_workers=1)'.")
def print_failed_task(task_status):
"""Print information about failed tasks.
@@ -505,8 +505,9 @@ def visualize_computation_graph(file_path=None, view=False, worker=global_worker
open the result in a viewer.
Examples:
In ray/scripts, call "python shell.py" and try the following code.
Try the following code.
>>> import ray.array.distributed as da
>>> x = da.zeros([20, 20])
>>> y = da.zeros([20, 20])
>>> z = da.dot(x, y)
@@ -552,6 +553,48 @@ def register_module(module, worker=global_worker):
_logger().info("registering {}.".format(val.func_name))
worker.register_function(val)
def init(start_ray_local=False, num_workers=None, scheduler_address=None, objstore_address=None, driver_address=None, driver_mode=ray.SCRIPT_MODE):
    """Either connect to an existing Ray cluster or start one and connect to it.

    This method handles two cases. Either a Ray cluster already exists and we
    just attach this driver to it, or we start all of the processes associated
    with a Ray cluster (delegated to ray.services.start_ray_local).

    Args:
        start_ray_local (Optional[bool]): If True then this will start a
            scheduler, an object store, and some workers. If False, this will
            attach to an existing Ray cluster.
        num_workers (Optional[int]): The number of workers to start if
            start_ray_local is True. Defaults to 1 in that case. It must be
            left as None when start_ray_local is False.
        scheduler_address (Optional[str]): The address of the scheduler to
            connect to if start_ray_local is False.
        objstore_address (Optional[str]): The address of the object store to
            connect to if start_ray_local is False.
        driver_address (Optional[str]): The address of this driver if
            start_ray_local is False.
        driver_mode: The mode in which to start the driver. When
            start_ray_local is True this must be one of ray.SCRIPT_MODE,
            ray.SHELL_MODE, ray.PYTHON_MODE, and ray.SILENT_MODE. When
            start_ray_local is False it is forwarded to connect() unchecked.

    Raises:
        Exception: An exception is raised if an inappropriate combination of
            arguments is passed in.
    """
    if start_ray_local:
        # In this case, we launch a scheduler, a new object store, and some
        # workers, and we connect to them. Addresses only make sense when
        # attaching to an existing cluster, so reject them here.
        if (scheduler_address is not None) or (objstore_address is not None) or (driver_address is not None):
            raise Exception("If start_ray_local=True, then you cannot pass in a scheduler_address, objstore_address, or driver_address.")
        if driver_mode not in [ray.SCRIPT_MODE, ray.SHELL_MODE, ray.PYTHON_MODE, ray.SILENT_MODE]:
            raise Exception("If start_ray_local=True, then driver_mode must be in [ray.SCRIPT_MODE, ray.SHELL_MODE, ray.PYTHON_MODE, ray.SILENT_MODE].")
        # None means "not specified"; a local cluster gets one worker by default.
        num_workers = 1 if num_workers is None else num_workers
        ray.services.start_ray_local(num_objstores=1, num_workers_per_objstore=num_workers, worker_path=None, driver_mode=driver_mode)
    else:
        # In this case, connect to an existing scheduler and object store.
        # No processes are started on this path, so a worker count is
        # meaningless and is rejected rather than silently ignored.
        if num_workers is not None:
            raise Exception("The argument num_workers must not be provided unless start_ray_local=True.")
        connect(scheduler_address, objstore_address, driver_address, is_driver=True, worker=global_worker, mode=driver_mode)
def connect(scheduler_address, objstore_address, worker_address, is_driver=False, worker=global_worker, mode=ray.WORKER_MODE):
"""Connect this worker to the scheduler and an object store.