Start and clean up workers from the local scheduler. (#250)

* Start and clean up workers from the local scheduler

Add the ability to kill workers in the photon scheduler

Test for old method of starting workers

Common codepath for killing workers

Common codepath for killing workers

Photon test case for starting and killing workers

Fix build

Fix component failure test

Register a worker's pid as part of initial connection

Address comments and revert photon_connect

Set PATH during travis install

Fix

* Fix photon test case to accept clients on plasma manager fd
This commit is contained in:
Stephanie Wang
2017-02-10 12:46:23 -08:00
committed by Robert Nishihara
parent ec175b7dfb
commit 2b8e6485e3
12 changed files with 556 additions and 165 deletions
+19 -6
View File
@@ -11,11 +11,17 @@ import time
def random_name():
return str(random.randint(0, 99999999))
def start_local_scheduler(plasma_store_name, plasma_manager_name=None,
worker_path=None, plasma_address=None,
node_ip_address="127.0.0.1", redis_address=None,
use_valgrind=False, use_profiler=False,
redirect_output=False, static_resource_list=None):
def start_local_scheduler(plasma_store_name,
plasma_manager_name=None,
worker_path=None,
plasma_address=None,
node_ip_address="127.0.0.1",
redis_address=None,
use_valgrind=False,
use_profiler=False,
redirect_output=False,
static_resource_list=None,
num_workers=0):
"""Start a local scheduler process.
Args:
@@ -41,6 +47,8 @@ def start_local_scheduler(plasma_store_name, plasma_manager_name=None,
static_resource_list (list): A list of integers specifying the local
scheduler's resource capacities. The resources should appear in an order
matching the order defined in task.h.
num_workers (int): The number of workers that the local scheduler should
start.
Return:
A tuple of the name of the local scheduler socket and the process ID of the
@@ -52,7 +60,12 @@ def start_local_scheduler(plasma_store_name, plasma_manager_name=None,
raise Exception("Cannot use valgrind and profiler at the same time.")
local_scheduler_executable = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../core/src/photon/photon_scheduler")
local_scheduler_name = "/tmp/scheduler{}".format(random_name())
command = [local_scheduler_executable, "-s", local_scheduler_name, "-p", plasma_store_name, "-h", node_ip_address]
command = [local_scheduler_executable,
"-s", local_scheduler_name,
"-p", plasma_store_name,
"-h", node_ip_address,
"-n", str(num_workers),
]
if plasma_manager_name is not None:
command += ["-m", plasma_manager_name]
if worker_path is not None:
+71 -27
View File
@@ -13,6 +13,7 @@ import subprocess
import sys
import time
from collections import namedtuple, OrderedDict
import threading
# Ray modules
import photon
@@ -89,18 +90,24 @@ def kill_process(p):
if RUN_PHOTON_PROFILER or RUN_PLASMA_MANAGER_PROFILER or RUN_PLASMA_STORE_PROFILER:
os.kill(p.pid, signal.SIGINT) # Give process signal to write profiler data.
time.sleep(0.1) # Wait for profiling data to be written.
p.kill()
# Sleeping for 0 should yield the core and allow the killed process to process
# its pending signals.
time.sleep(0)
# Allow the process one second to exit gracefully.
p.terminate()
timer = threading.Timer(1, lambda p: p.kill(), [p])
try:
timer.start()
p.wait()
finally:
timer.cancel()
if p.poll() is not None:
return True
p.terminate()
# Sleeping for 0 should yield the core and allow the killed process to process
# its pending signals.
time.sleep(0)
if p.poll is not None:
# If the process did not exit within one second, force kill it.
p.kill()
if p.poll() is not None:
return True
# The process was not killed for some reason.
return False
@@ -262,10 +269,16 @@ def start_global_scheduler(redis_address, cleanup=True, redirect_output=False):
if cleanup:
all_processes[PROCESS_TYPE_GLOBAL_SCHEDULER].append(p)
def start_local_scheduler(redis_address, node_ip_address, plasma_store_name,
plasma_manager_name, worker_path, plasma_address=None,
cleanup=True, redirect_output=False,
static_resource_list=None):
def start_local_scheduler(redis_address,
node_ip_address,
plasma_store_name,
plasma_manager_name,
worker_path,
plasma_address=None,
cleanup=True,
redirect_output=False,
static_resource_list=None,
num_workers=0):
"""Start a local scheduler process.
Args:
@@ -284,6 +297,8 @@ def start_local_scheduler(redis_address, node_ip_address, plasma_store_name,
/dev/null.
static_resource_list (list): An ordered list of the configured resource
capacities for this local scheduler.
num_workers (int): The number of workers that the local scheduler should
start.
Return:
The name of the local scheduler socket.
@@ -296,7 +311,8 @@ def start_local_scheduler(redis_address, node_ip_address, plasma_store_name,
plasma_address=plasma_address,
use_profiler=RUN_PHOTON_PROFILER,
redirect_output=redirect_output,
static_resource_list=static_resource_list)
static_resource_list=static_resource_list,
num_workers=num_workers)
if cleanup:
all_processes[PROCESS_TYPE_LOCAL_SCHEDULER].append(p)
return local_scheduler_name
@@ -391,6 +407,7 @@ def start_ray_processes(address_info=None,
redirect_output=False,
include_global_scheduler=False,
include_redis=False,
start_workers_from_local_scheduler=True,
num_cpus=None,
num_gpus=None):
"""Helper method to start Ray processes.
@@ -417,6 +434,9 @@ def start_ray_processes(address_info=None,
start a global scheduler process.
include_redis (bool): If include_redis is True, then start a Redis server
process.
start_workers_from_local_scheduler (bool): If this flag is True, then start
the initial workers from the local scheduler. Else, start them from
Python.
num_cpus: A list of length num_local_schedulers containing the number of
CPUs each local scheduler should be configured with.
num_gpus: A list of length num_local_schedulers containing the number of
@@ -489,12 +509,25 @@ def start_ray_processes(address_info=None,
object_store_addresses.append(object_store_address)
time.sleep(0.1)
# Determine how many workers to start for each local scheduler.
num_workers_per_local_scheduler = [0] * num_local_schedulers
for i in range(num_workers):
num_workers_per_local_scheduler[i % num_local_schedulers] += 1
# Start any local schedulers that do not yet exist.
for i in range(len(local_scheduler_socket_names), num_local_schedulers):
# Connect the local scheduler to the object store at the same index.
object_store_address = object_store_addresses[i]
plasma_address = "{}:{}".format(node_ip_address,
object_store_address.manager_port)
# Determine how many workers this local scheduler should start.
if start_workers_from_local_scheduler:
num_local_scheduler_workers = num_workers_per_local_scheduler[i]
num_workers_per_local_scheduler[i] = 0
else:
# If we're starting the workers from Python, the local scheduler should
# not start any workers.
num_local_scheduler_workers = 0
# Start the local scheduler.
local_scheduler_name = start_local_scheduler(redis_address,
node_ip_address,
@@ -504,7 +537,8 @@ def start_ray_processes(address_info=None,
plasma_address=plasma_address,
cleanup=cleanup,
redirect_output=redirect_output,
static_resource_list=[num_cpus[i], num_gpus[i]])
static_resource_list=[num_cpus[i], num_gpus[i]],
num_workers=num_local_scheduler_workers)
local_scheduler_socket_names.append(local_scheduler_name)
time.sleep(0.1)
@@ -513,18 +547,23 @@ def start_ray_processes(address_info=None,
assert len(object_store_addresses) == num_local_schedulers
assert len(local_scheduler_socket_names) == num_local_schedulers
# Start the workers.
for i in range(num_workers):
object_store_address = object_store_addresses[i % num_local_schedulers]
local_scheduler_name = local_scheduler_socket_names[i % num_local_schedulers]
start_worker(node_ip_address,
object_store_address.name,
object_store_address.manager_name,
local_scheduler_name,
redis_address,
worker_path,
cleanup=cleanup,
redirect_output=redirect_output)
# Start any workers that the local scheduler has not already started.
for i, num_local_scheduler_workers in enumerate(num_workers_per_local_scheduler):
object_store_address = object_store_addresses[i]
local_scheduler_name = local_scheduler_socket_names[i]
for j in range(num_local_scheduler_workers):
start_worker(node_ip_address,
object_store_address.name,
object_store_address.manager_name,
local_scheduler_name,
redis_address,
worker_path,
cleanup=cleanup,
redirect_output=redirect_output)
num_workers_per_local_scheduler[i] -= 1
# Make sure that we've started all the workers.
assert(sum(num_workers_per_local_scheduler) == 0)
# Return the addresses of the relevant processes.
return address_info
@@ -581,6 +620,7 @@ def start_ray_head(address_info=None,
worker_path=None,
cleanup=True,
redirect_output=False,
start_workers_from_local_scheduler=True,
num_cpus=None,
num_gpus=None):
"""Start Ray in local mode.
@@ -603,6 +643,9 @@ def start_ray_head(address_info=None,
method exits.
redirect_output (bool): True if stdout and stderr should be redirected to
/dev/null.
start_workers_from_local_scheduler (bool): If this flag is True, then start
the initial workers from the local scheduler. Else, start them from
Python.
num_cpus (int): number of cpus to configure the local scheduler with.
num_gpus (int): number of gpus to configure the local scheduler with.
@@ -619,5 +662,6 @@ def start_ray_head(address_info=None,
redirect_output=redirect_output,
include_global_scheduler=True,
include_redis=True,
start_workers_from_local_scheduler=start_workers_from_local_scheduler,
num_cpus=num_cpus,
num_gpus=num_gpus)
+15 -4
View File
@@ -735,9 +735,15 @@ def get_address_info_from_redis(redis_address, node_ip_address, num_retries=5):
time.sleep(1)
counter += 1
def _init(address_info=None, start_ray_local=False, object_id_seed=None,
num_workers=None, num_local_schedulers=None,
driver_mode=SCRIPT_MODE, num_cpus=None, num_gpus=None):
def _init(address_info=None,
start_ray_local=False,
object_id_seed=None,
num_workers=None,
num_local_schedulers=None,
driver_mode=SCRIPT_MODE,
start_workers_from_local_scheduler=True,
num_cpus=None,
num_gpus=None):
"""Helper method to connect to an existing Ray cluster or start a new one.
This method handles two cases. Either a Ray cluster already exists and we
@@ -764,6 +770,9 @@ def _init(address_info=None, start_ray_local=False, object_id_seed=None,
only provided if start_ray_local is True.
driver_mode (bool): The mode in which to start the driver. This should be
one of ray.SCRIPT_MODE, ray.PYTHON_MODE, and ray.SILENT_MODE.
start_workers_from_local_scheduler (bool): If this flag is True, then start
the initial workers from the local scheduler. Else, start them from
Python. The latter case is for debugging purposes only.
num_cpus: A list containing the number of CPUs the local schedulers should
be configured with.
num_gpus: A list containing the number of GPUs the local schedulers should
@@ -815,7 +824,9 @@ def _init(address_info=None, start_ray_local=False, object_id_seed=None,
node_ip_address=node_ip_address,
num_workers=num_workers,
num_local_schedulers=num_local_schedulers,
num_cpus=num_cpus, num_gpus=num_gpus)
start_workers_from_local_scheduler=start_workers_from_local_scheduler,
num_cpus=num_cpus,
num_gpus=num_gpus)
else:
if redis_address is None:
raise Exception("If start_ray_local=False, then redis_address must be provided.")