enable restarting workers in singlenode case, plus cleanups to cluster.py (#190)

This commit is contained in:
Robert Nishihara
2016-07-01 14:10:51 -07:00
committed by Philipp Moritz
parent f5316d50fc
commit 0ffe657e27
6 changed files with 234 additions and 50 deletions
+1 -1
View File
@@ -8,6 +8,6 @@ PYTHON_MODE = 3
import libraylib as lib
import serialization
from worker import scheduler_info, visualize_computation_graph, task_info, register_module, connect, disconnect, get, put, remote, kill_workers
from worker import scheduler_info, visualize_computation_graph, task_info, register_module, connect, disconnect, get, put, remote, kill_workers, restart_workers_local
from libraylib import ObjRef
import internal
+20 -3
View File
@@ -5,7 +5,7 @@ import time
import datetime
import ray
import ray.worker as worker
import worker
from ray.config import LOG_DIRECTORY, LOG_TIMESTAMP
_services_env = os.environ.copy()
@@ -94,7 +94,7 @@ def start_node(scheduler_address, node_ip_address, num_workers, worker_path=None
:param scheduler_address: ip address and port of the scheduler (which may run on a different node)
:param node_ip_address: ip address (without port) of the node this function is run on
:param num_workers: the number of workers to be started on this node
:worker_path: path of the source code that will be run on the worker
:param worker_path: path of the source code that will be run on the worker
"""
objstore_address = address(node_ip_address, new_objstore_port())
start_objstore(scheduler_address, objstore_address)
@@ -102,9 +102,26 @@ def start_node(scheduler_address, node_ip_address, num_workers, worker_path=None
for _ in range(num_workers):
start_worker(worker_path, scheduler_address, objstore_address, address(node_ip_address, new_worker_port()))
time.sleep(0.3)
ray.connect(scheduler_address, objstore_address, address(node_ip_address, new_worker_port()))
ray.connect(scheduler_address, objstore_address, address(node_ip_address, new_worker_port()), is_driver=True)
time.sleep(0.5)
def start_workers(scheduler_address, objstore_address, num_workers, worker_path):
"""
Start a new set of workers on this node. This assumes that the scheduler is
already running and that the object store on this node is already running.
The intended use case is that a developer wants to update the code running
on the worker processes so first kills all of the workers and then runs this
method.
:param scheduler_address: ip address and port of the scheduler (which may run on a different node)
:param objstore_address: ip address and port of the object store (which runs on the same node)
:param num_workers: the number of workers to be started on this node
:param worker_path: path of the source code that will be run on the worker
"""
node_ip_address = objstore_address.split(":")[0]
for _ in range(num_workers):
start_worker(worker_path, scheduler_address, objstore_address, address(node_ip_address, new_worker_port()))
# driver_mode should equal ray.SCRIPT_MODE if this is being run in a script and
# ray.SHELL_MODE if it is being used interactively in a shell. It can also equal
# ray.PYTHON_MODE to run things in a manner equivalent to serial Python code.
+24 -4
View File
@@ -14,6 +14,7 @@ from ray.config import LOG_DIRECTORY, LOG_TIMESTAMP
import serialization
import ray.internal.graph_pb2
import ray.graph
import services
class RayFailedObject(object):
"""If a task throws an exception during execution, a RayFailedObject is stored in the object store for each of the tasks outputs."""
@@ -196,15 +197,18 @@ def register_module(module, recursive=False, worker=global_worker):
# elif recursive and isinstance(val, ModuleType):
# register_module(val, recursive, worker)
def connect(scheduler_addr, objstore_addr, worker_addr, is_driver=False, worker=global_worker, mode=ray.WORKER_MODE):
def connect(scheduler_address, objstore_address, worker_address, is_driver=False, worker=global_worker, mode=ray.WORKER_MODE):
if hasattr(worker, "handle"):
del worker.handle
worker.handle = ray.lib.create_worker(scheduler_addr, objstore_addr, worker_addr, is_driver)
worker.scheduler_address = scheduler_address
worker.objstore_address = objstore_address
worker.worker_address = worker_address
worker.handle = ray.lib.create_worker(worker.scheduler_address, worker.objstore_address, worker.worker_address, is_driver)
worker.set_mode(mode)
FORMAT = "%(asctime)-15s %(message)s"
log_basename = os.path.join(LOG_DIRECTORY, (LOG_TIMESTAMP + "-worker-{}").format(datetime.datetime.now(), worker_addr))
log_basename = os.path.join(LOG_DIRECTORY, (LOG_TIMESTAMP + "-worker-{}").format(datetime.datetime.now(), worker_address))
logging.basicConfig(level=logging.DEBUG, format=FORMAT, filename=log_basename + ".log")
ray.lib.set_log_config(log_basename + "-c++.log")
worker.set_mode(mode)
def disconnect(worker=global_worker):
ray.lib.disconnect(worker.handle)
@@ -230,11 +234,27 @@ def put(value, worker=global_worker):
return objref
def kill_workers(worker=global_worker):
"""
This method kills all of the workers in the cluster. It does not kill drivers.
"""
success = ray.lib.kill_workers(worker.handle)
if not success:
print "Could not kill all workers; check that there are no tasks currently running."
return success
def restart_workers_local(num_workers, worker_path, worker=global_worker):
"""
This method kills all of the workers and starts new workers locally on the
same node as the driver. This is intended for use in the case where Ray is
being used on a single node.
:param num_workers: the number of workers to be started
:param worker_path: path of the source code that will be run on the worker
"""
if not kill_workers(worker):
return False
services.start_workers(worker.scheduler_address, worker.objstore_address, num_workers, worker_path)
def main_loop(worker=global_worker):
if not ray.lib.connected(worker.handle):
raise Exception("Worker is attempting to enter main_loop but has not been connected yet.")