enable restarting workers in singlenode case, plus cleanups to cluster.py (#190)

2026-06-28 18:45:03 +08:00 · 2016-07-01 14:10:51 -07:00
parent f5316d50fc
commit 0ffe657e27
6 changed files with 234 additions and 50 deletions
@@ -8,6 +8,6 @@ PYTHON_MODE = 3

 import libraylib as lib
 import serialization
-from worker import scheduler_info, visualize_computation_graph, task_info, register_module, connect, disconnect, get, put, remote, kill_workers
+from worker import scheduler_info, visualize_computation_graph, task_info, register_module, connect, disconnect, get, put, remote, kill_workers, restart_workers_local
 from libraylib import ObjRef
 import internal
@@ -5,7 +5,7 @@ import time
 import datetime

 import ray
-import ray.worker as worker
+import worker
 from ray.config import LOG_DIRECTORY, LOG_TIMESTAMP

 _services_env = os.environ.copy()
@@ -94,7 +94,7 @@ def start_node(scheduler_address, node_ip_address, num_workers, worker_path=None
  :param scheduler_address: ip address and port of the scheduler (which may run on a different node)
  :param node_ip_address: ip address (without port) of the node this function is run on
  :param num_workers: the number of workers to be started on this node
-  :worker_path: path of the source code that will be run on the worker
+  :param worker_path: path of the source code that will be run on the worker
  """
  objstore_address = address(node_ip_address, new_objstore_port())
  start_objstore(scheduler_address, objstore_address)
@@ -102,9 +102,26 @@ def start_node(scheduler_address, node_ip_address, num_workers, worker_path=None
  for _ in range(num_workers):
    start_worker(worker_path, scheduler_address, objstore_address, address(node_ip_address, new_worker_port()))
  time.sleep(0.3)
-  ray.connect(scheduler_address, objstore_address, address(node_ip_address, new_worker_port()))
+  ray.connect(scheduler_address, objstore_address, address(node_ip_address, new_worker_port()), is_driver=True)
  time.sleep(0.5)

+def start_workers(scheduler_address, objstore_address, num_workers, worker_path):
+  """
+  Start a new set of workers on this node. This assumes that the scheduler is
+    already running and that the object store on this node is already running.
+    The intended use case is that a developer wants to update the code running
+    on the worker processes so first kills all of the workers and then runs this
+    method.
+
+    :param scheduler_address: ip address and port of the scheduler (which may run on a different node)
+    :param objstore_address: ip address and port of the object store (which runs on the same node)
+    :param num_workers: the number of workers to be started on this node
+    :param worker_path: path of the source code that will be run on the worker
+  """
+  node_ip_address = objstore_address.split(":")[0]
+  for _ in range(num_workers):
+    start_worker(worker_path, scheduler_address, objstore_address, address(node_ip_address, new_worker_port()))
+
 # driver_mode should equal ray.SCRIPT_MODE if this is being run in a script and
 # ray.SHELL_MODE if it is being used interactively in a shell. It can also equal
 # ray.PYTHON_MODE to run things in a manner equivalent to serial Python code.
@@ -14,6 +14,7 @@ from ray.config import LOG_DIRECTORY, LOG_TIMESTAMP
 import serialization
 import ray.internal.graph_pb2
 import ray.graph
+import services

 class RayFailedObject(object):
  """If a task throws an exception during execution, a RayFailedObject is stored in the object store for each of the tasks outputs."""
@@ -196,15 +197,18 @@ def register_module(module, recursive=False, worker=global_worker):
    # elif recursive and isinstance(val, ModuleType):
    #   register_module(val, recursive, worker)

-def connect(scheduler_addr, objstore_addr, worker_addr, is_driver=False, worker=global_worker, mode=ray.WORKER_MODE):
+def connect(scheduler_address, objstore_address, worker_address, is_driver=False, worker=global_worker, mode=ray.WORKER_MODE):
  if hasattr(worker, "handle"):
    del worker.handle
-  worker.handle = ray.lib.create_worker(scheduler_addr, objstore_addr, worker_addr, is_driver)
+  worker.scheduler_address = scheduler_address
+  worker.objstore_address = objstore_address
+  worker.worker_address = worker_address
+  worker.handle = ray.lib.create_worker(worker.scheduler_address, worker.objstore_address, worker.worker_address, is_driver)
+  worker.set_mode(mode)
  FORMAT = "%(asctime)-15s %(message)s"
-  log_basename = os.path.join(LOG_DIRECTORY, (LOG_TIMESTAMP + "-worker-{}").format(datetime.datetime.now(), worker_addr))
+  log_basename = os.path.join(LOG_DIRECTORY, (LOG_TIMESTAMP + "-worker-{}").format(datetime.datetime.now(), worker_address))
  logging.basicConfig(level=logging.DEBUG, format=FORMAT, filename=log_basename + ".log")
  ray.lib.set_log_config(log_basename + "-c++.log")
-  worker.set_mode(mode)

 def disconnect(worker=global_worker):
  ray.lib.disconnect(worker.handle)
@@ -230,11 +234,27 @@ def put(value, worker=global_worker):
  return objref

 def kill_workers(worker=global_worker):
+  """
+  This method kills all of the workers in the cluster. It does not kill drivers.
+  """
  success = ray.lib.kill_workers(worker.handle)
  if not success:
    print "Could not kill all workers; check that there are no tasks currently running."
  return success

+def restart_workers_local(num_workers, worker_path, worker=global_worker):
+  """
+  This method kills all of the workers and starts new workers locally on the
+    same node as the driver. This is intended for use in the case where Ray is
+    being used on a single node.
+
+  :param num_workers: the number of workers to be started
+  :param worker_path: path of the source code that will be run on the worker
+  """
+  if not kill_workers(worker):
+    return False
+  services.start_workers(worker.scheduler_address, worker.objstore_address, num_workers, worker_path)
+
 def main_loop(worker=global_worker):
  if not ray.lib.connected(worker.handle):
    raise Exception("Worker is attempting to enter main_loop but has not been connected yet.")