mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 18:45:03 +08:00
enable restarting workers in singlenode case, plus cleanups to cluster.py (#190)
This commit is contained in:
committed by
Philipp Moritz
parent
f5316d50fc
commit
0ffe657e27
@@ -8,6 +8,6 @@ PYTHON_MODE = 3
|
||||
|
||||
import libraylib as lib
|
||||
import serialization
|
||||
from worker import scheduler_info, visualize_computation_graph, task_info, register_module, connect, disconnect, get, put, remote, kill_workers
|
||||
from worker import scheduler_info, visualize_computation_graph, task_info, register_module, connect, disconnect, get, put, remote, kill_workers, restart_workers_local
|
||||
from libraylib import ObjRef
|
||||
import internal
|
||||
|
||||
@@ -5,7 +5,7 @@ import time
|
||||
import datetime
|
||||
|
||||
import ray
|
||||
import ray.worker as worker
|
||||
import worker
|
||||
from ray.config import LOG_DIRECTORY, LOG_TIMESTAMP
|
||||
|
||||
_services_env = os.environ.copy()
|
||||
@@ -94,7 +94,7 @@ def start_node(scheduler_address, node_ip_address, num_workers, worker_path=None
|
||||
:param scheduler_address: ip address and port of the scheduler (which may run on a different node)
|
||||
:param node_ip_address: ip address (without port) of the node this function is run on
|
||||
:param num_workers: the number of workers to be started on this node
|
||||
:worker_path: path of the source code that will be run on the worker
|
||||
:param worker_path: path of the source code that will be run on the worker
|
||||
"""
|
||||
objstore_address = address(node_ip_address, new_objstore_port())
|
||||
start_objstore(scheduler_address, objstore_address)
|
||||
@@ -102,9 +102,26 @@ def start_node(scheduler_address, node_ip_address, num_workers, worker_path=None
|
||||
for _ in range(num_workers):
|
||||
start_worker(worker_path, scheduler_address, objstore_address, address(node_ip_address, new_worker_port()))
|
||||
time.sleep(0.3)
|
||||
ray.connect(scheduler_address, objstore_address, address(node_ip_address, new_worker_port()))
|
||||
ray.connect(scheduler_address, objstore_address, address(node_ip_address, new_worker_port()), is_driver=True)
|
||||
time.sleep(0.5)
|
||||
|
||||
def start_workers(scheduler_address, objstore_address, num_workers, worker_path):
|
||||
"""
|
||||
Start a new set of workers on this node. This assumes that the scheduler is
|
||||
already running and that the object store on this node is already running.
|
||||
The intended use case is that a developer wants to update the code running
|
||||
on the worker processes so first kills all of the workers and then runs this
|
||||
method.
|
||||
|
||||
:param scheduler_address: ip address and port of the scheduler (which may run on a different node)
|
||||
:param objstore_address: ip address and port of the object store (which runs on the same node)
|
||||
:param num_workers: the number of workers to be started on this node
|
||||
:param worker_path: path of the source code that will be run on the worker
|
||||
"""
|
||||
node_ip_address = objstore_address.split(":")[0]
|
||||
for _ in range(num_workers):
|
||||
start_worker(worker_path, scheduler_address, objstore_address, address(node_ip_address, new_worker_port()))
|
||||
|
||||
# driver_mode should equal ray.SCRIPT_MODE if this is being run in a script and
|
||||
# ray.SHELL_MODE if it is being used interactively in a shell. It can also equal
|
||||
# ray.PYTHON_MODE to run things in a manner equivalent to serial Python code.
|
||||
|
||||
@@ -14,6 +14,7 @@ from ray.config import LOG_DIRECTORY, LOG_TIMESTAMP
|
||||
import serialization
|
||||
import ray.internal.graph_pb2
|
||||
import ray.graph
|
||||
import services
|
||||
|
||||
class RayFailedObject(object):
|
||||
"""If a task throws an exception during execution, a RayFailedObject is stored in the object store for each of the tasks outputs."""
|
||||
@@ -196,15 +197,18 @@ def register_module(module, recursive=False, worker=global_worker):
|
||||
# elif recursive and isinstance(val, ModuleType):
|
||||
# register_module(val, recursive, worker)
|
||||
|
||||
def connect(scheduler_addr, objstore_addr, worker_addr, is_driver=False, worker=global_worker, mode=ray.WORKER_MODE):
|
||||
def connect(scheduler_address, objstore_address, worker_address, is_driver=False, worker=global_worker, mode=ray.WORKER_MODE):
|
||||
if hasattr(worker, "handle"):
|
||||
del worker.handle
|
||||
worker.handle = ray.lib.create_worker(scheduler_addr, objstore_addr, worker_addr, is_driver)
|
||||
worker.scheduler_address = scheduler_address
|
||||
worker.objstore_address = objstore_address
|
||||
worker.worker_address = worker_address
|
||||
worker.handle = ray.lib.create_worker(worker.scheduler_address, worker.objstore_address, worker.worker_address, is_driver)
|
||||
worker.set_mode(mode)
|
||||
FORMAT = "%(asctime)-15s %(message)s"
|
||||
log_basename = os.path.join(LOG_DIRECTORY, (LOG_TIMESTAMP + "-worker-{}").format(datetime.datetime.now(), worker_addr))
|
||||
log_basename = os.path.join(LOG_DIRECTORY, (LOG_TIMESTAMP + "-worker-{}").format(datetime.datetime.now(), worker_address))
|
||||
logging.basicConfig(level=logging.DEBUG, format=FORMAT, filename=log_basename + ".log")
|
||||
ray.lib.set_log_config(log_basename + "-c++.log")
|
||||
worker.set_mode(mode)
|
||||
|
||||
def disconnect(worker=global_worker):
|
||||
ray.lib.disconnect(worker.handle)
|
||||
@@ -230,11 +234,27 @@ def put(value, worker=global_worker):
|
||||
return objref
|
||||
|
||||
def kill_workers(worker=global_worker):
|
||||
"""
|
||||
This method kills all of the workers in the cluster. It does not kill drivers.
|
||||
"""
|
||||
success = ray.lib.kill_workers(worker.handle)
|
||||
if not success:
|
||||
print "Could not kill all workers; check that there are no tasks currently running."
|
||||
return success
|
||||
|
||||
def restart_workers_local(num_workers, worker_path, worker=global_worker):
|
||||
"""
|
||||
This method kills all of the workers and starts new workers locally on the
|
||||
same node as the driver. This is intended for use in the case where Ray is
|
||||
being used on a single node.
|
||||
|
||||
:param num_workers: the number of workers to be started
|
||||
:param worker_path: path of the source code that will be run on the worker
|
||||
"""
|
||||
if not kill_workers(worker):
|
||||
return False
|
||||
services.start_workers(worker.scheduler_address, worker.objstore_address, num_workers, worker_path)
|
||||
|
||||
def main_loop(worker=global_worker):
|
||||
if not ray.lib.connected(worker.handle):
|
||||
raise Exception("Worker is attempting to enter main_loop but has not been connected yet.")
|
||||
|
||||
Reference in New Issue
Block a user