Files
ray/test/jenkins_tests/multi_node_docker_test.py
T
Robert Nishihara 0ac125e9b2 Clean up when a driver disconnects. (#462)
* Clean up state when drivers exit.

* Remove unnecessary field in ActorMapEntry struct.

* Have monitor release GPU resources in Redis when driver exits.

* Enable multiple drivers in multi-node tests and test driver cleanup.

* Make redis GPU allocation a redis transaction and small cleanups.

* Fix multi-node test.

* Small cleanups.

* Make global scheduler take node_ip_address so it appears in the right place in the client table.

* Cleanups.

* Fix linting and cleanups in local scheduler.

* Fix removed_driver_test.

* Fix bug related to vector -> list.

* Fix linting.

* Cleanup.

* Fix multi node tests.

* Fix jenkins tests.

* Add another multi node test with many drivers.

* Fix linting.

* Make the actor creation notification a flatbuffer message.

* Revert "Make the actor creation notification a flatbuffer message."

This reverts commit af99099c8084dbf9177fb4e34c0c9b1a12c78f39.

* Add comment explaining flatbuffer problems.
2017-04-24 18:10:21 -07:00

301 lines
12 KiB
Python

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import numpy as np
import os
import re
import subprocess
import sys
def wait_for_output(proc):
    """Wait for a process to finish and return its decoded stdout and stderr.

    The output is decoded as UTF-8 and undecodable bytes are replaced rather
    than raising, so that a stray non-ASCII byte printed by a test script
    cannot crash the test harness itself. Pure ASCII output decodes exactly
    as before.

    Args:
        proc: A process started by subprocess.Popen.

    Returns:
        A tuple of the stdout and stderr of the process as strings. An entry
            is None if the corresponding stream was not captured.
    """
    stdout_data, stderr_data = proc.communicate()
    # communicate() returns None for any stream that was not opened with
    # subprocess.PIPE, so only decode the streams that were captured.
    if stdout_data is not None:
        stdout_data = stdout_data.decode("utf-8", "replace")
    if stderr_data is not None:
        stderr_data = stderr_data.decode("utf-8", "replace")
    return stdout_data, stderr_data
class DockerRunner(object):
    """This class manages the logistics of running multiple nodes in Docker.

    This class is used for starting multiple Ray nodes within Docker,
    stopping Ray, running a workload, and determining the success or failure
    of the workload.

    Attributes:
        head_container_id: The ID of the docker container that runs the head
            node, or None if the head node has not been started.
        worker_container_ids: A list of the docker container IDs of the Ray
            worker nodes.
        head_container_ip: The IP address of the docker container that runs
            the head node, or None if the head node has not been started.
    """

    # A full 64-character hexadecimal container ID followed by a newline.
    _CONTAINER_ID_PATTERN = re.compile("([0-9a-f]{64})\n")
    # A dotted-quad IPv4 address at the start of the string.
    _IPV4_PATTERN = re.compile(
        r"([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})")

    def __init__(self):
        """Initialize the DockerRunner with no running containers."""
        self.head_container_id = None
        self.worker_container_ids = []
        self.head_container_ip = None

    def _get_container_id(self, stdout_data):
        """Parse the docker container ID from stdout_data.

        Args:
            stdout_data: This should be a string with the standard output of
                a call to a docker command.

        Returns:
            The container ID of the docker container, or None if
                stdout_data does not start with a container ID.
        """
        match = self._CONTAINER_ID_PATTERN.match(stdout_data)
        return None if match is None else match.group(1)

    def _get_container_ip(self, container_id):
        """Get the IP address of a specific docker container.

        Args:
            container_id: The docker container ID of the relevant docker
                container.

        Returns:
            The IP address of the container.

        Raises:
            RuntimeError: If `docker inspect` did not print an IP address.
        """
        proc = subprocess.Popen(
            ["docker", "inspect",
             "--format={{.NetworkSettings.Networks.bridge.IPAddress}}",
             container_id],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout_data, _ = wait_for_output(proc)
        match = self._IPV4_PATTERN.match(stdout_data)
        if match is None:
            raise RuntimeError("Container IP not found.")
        return match.group(1)

    def _docker_run_command(self, docker_image, mem_size, shm_size,
                            development_mode, ray_args):
        """Build the `docker run` command line that starts one Ray node.

        Args:
            docker_image: The docker image to run.
            mem_size: Value for the --memory flag, or None to omit the flag.
            shm_size: Value for the --shm-size flag, or None to omit the
                flag.
            development_mode: True to mount the directory containing this
                file at /ray/test/jenkins_tests inside the container.
            ray_args: A list of arguments to pass to start_ray.sh.

        Returns:
            The command as a list of strings suitable for subprocess.Popen.
        """
        command = ["docker", "run", "-d"]
        # Each optional flag is added at most once and only when a value was
        # actually provided.
        if mem_size:
            command.append("--memory=" + mem_size)
        if shm_size:
            command.append("--shm-size=" + shm_size)
        if development_mode:
            local_dir = os.path.dirname(os.path.realpath(__file__))
            command += ["-v",
                        "{}:{}".format(local_dir, "/ray/test/jenkins_tests")]
        command += [docker_image, "/ray/scripts/start_ray.sh"]
        command += ray_args
        return command

    def _start_head_node(self, docker_image, mem_size, shm_size, num_cpus,
                         num_gpus, development_mode):
        """Start the Ray head node inside a docker container.

        On success this records the head container's ID and IP address on
        the instance.

        Raises:
            RuntimeError: If the container ID or IP cannot be determined.
        """
        command = self._docker_run_command(
            docker_image, mem_size, shm_size, development_mode,
            ["--head",
             "--redis-port=6379",
             "--num-cpus={}".format(num_cpus),
             "--num-gpus={}".format(num_gpus)])
        print("Starting head node with command:{}".format(command))

        proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout_data, _ = wait_for_output(proc)
        container_id = self._get_container_id(stdout_data)
        if container_id is None:
            raise RuntimeError("Failed to find container ID.")
        self.head_container_id = container_id
        self.head_container_ip = self._get_container_ip(container_id)

    def _start_worker_node(self, docker_image, mem_size, shm_size, num_cpus,
                           num_gpus, development_mode):
        """Start a Ray worker node inside a docker container.

        The worker connects to the Redis server on the head node, so the
        head node must have been started first. On success the worker's
        container ID is appended to worker_container_ids.

        Raises:
            RuntimeError: If the container ID cannot be determined.
        """
        command = self._docker_run_command(
            docker_image, mem_size, shm_size, development_mode,
            ["--redis-address={:s}:6379".format(self.head_container_ip),
             "--num-cpus={}".format(num_cpus),
             "--num-gpus={}".format(num_gpus)])
        print("Starting worker node with command:{}".format(command))

        proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout_data, _ = wait_for_output(proc)
        container_id = self._get_container_id(stdout_data)
        if container_id is None:
            raise RuntimeError("Failed to find container id")
        self.worker_container_ids.append(container_id)

    def start_ray(self, docker_image=None, mem_size=None, shm_size=None,
                  num_nodes=None, num_cpus=None, num_gpus=None,
                  development_mode=None):
        """Start a Ray cluster within docker.

        This starts one docker container running the head node and
        num_nodes - 1 docker containers running the Ray worker nodes.

        Args:
            docker_image: The docker image to use for all of the nodes.
            mem_size: The amount of memory to start each docker container
                with. This will be passed into `docker run` as the --memory
                flag. If this is None, then no --memory flag will be used.
            shm_size: The amount of shared memory to start each docker
                container with. This will be passed into `docker run` as the
                `--shm-size` flag. If this is None, then no `--shm-size`
                flag will be used.
            num_nodes: The number of nodes to use in the cluster (this counts
                the head node as well).
            num_cpus: A list of the number of CPUs to start each node with.
            num_gpus: A list of the number of GPUs to start each node with.
            development_mode: True if you want to mount the local copy of
                test/jenkins_test on the nodes so we can avoid rebuilding
                docker images during development.
        """
        assert len(num_cpus) == num_nodes
        assert len(num_gpus) == num_nodes

        # Launch the head node. It gets the first entry of each list.
        self._start_head_node(docker_image, mem_size, shm_size, num_cpus[0],
                              num_gpus[0], development_mode)
        # Start the worker nodes with the remaining entries.
        for worker_cpus, worker_gpus in zip(num_cpus[1:], num_gpus[1:]):
            self._start_worker_node(docker_image, mem_size, shm_size,
                                    worker_cpus, worker_gpus,
                                    development_mode)

    def _stop_node(self, container_id):
        """Stop a node in the Ray cluster.

        The container is killed and then removed. Each docker command is
        expected to echo the container ID back on stdout.

        Args:
            container_id: The ID of the docker container to stop.

        Raises:
            Exception: If either docker command does not echo the ID back.
        """
        proc = subprocess.Popen(["docker", "kill", container_id],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout_data, _ = wait_for_output(proc)
        if self._get_container_id(stdout_data) != container_id:
            raise Exception(
                "Failed to stop container {}.".format(container_id))

        proc = subprocess.Popen(["docker", "rm", "-f", container_id],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout_data, _ = wait_for_output(proc)
        if self._get_container_id(stdout_data) != container_id:
            raise Exception(
                "Failed to remove container {}.".format(container_id))

        print("stop_node", {"container_id": container_id,
                            "is_head": container_id == self.head_container_id})

    def stop_ray(self):
        """Stop the Ray cluster.

        Every known container is attempted, head node first, even if
        stopping an earlier one fails, so that a single failure does not
        leave the remaining containers running. The head node is skipped if
        it was never started.

        Raises:
            Exception: The first error encountered while stopping a
                container, re-raised after all containers were attempted.
        """
        container_ids = list(self.worker_container_ids)
        if self.head_container_id is not None:
            container_ids.insert(0, self.head_container_id)

        first_error = None
        for container_id in container_ids:
            try:
                self._stop_node(container_id)
            except Exception as e:
                print("Failed to stop container {}: {}".format(container_id,
                                                               e))
                if first_error is None:
                    first_error = e
        if first_error is not None:
            raise first_error

    def run_test(self, test_script, num_drivers, driver_locations=None):
        """Run a test script.

        Run a test using the Ray cluster. All drivers are started before
        any of them is waited on, so they run concurrently.

        Args:
            test_script: The test script to run.
            num_drivers: The number of copies of the test script to run.
                This is only used when driver_locations is None.
            driver_locations: A list of the indices of the containers that
                the different copies of the test script should be run on
                (index 0 is the head node). If this is None, then the
                containers will be chosen randomly.

        Returns:
            A list with one dictionary per driver, in the order the drivers
                were started. Each dictionary has the keys "success" (True
                if the driver exited with return code 0) and "return_code".
        """
        all_container_ids = ([self.head_container_id] +
                             self.worker_container_ids)
        if driver_locations is None:
            driver_locations = [np.random.randint(0, len(all_container_ids))
                                for _ in range(num_drivers)]

        # Start the different drivers.
        driver_processes = []
        for i, location in enumerate(driver_locations):
            # Get the container ID to run the ith driver in.
            container_id = all_container_ids[location]
            driver_command = (
                "RAY_REDIS_ADDRESS={}:6379 RAY_DRIVER_INDEX={} python {}"
                .format(self.head_container_ip, i, test_script))
            command = ["docker", "exec", container_id, "/bin/bash", "-c",
                       driver_command]
            print("Starting driver with command {}.".format(test_script))
            # Start the driver.
            driver_processes.append(
                subprocess.Popen(command, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE))

        # Wait for the drivers to finish.
        results = []
        for p in driver_processes:
            stdout_data, stderr_data = wait_for_output(p)
            print("STDOUT:")
            print(stdout_data)
            print("STDERR:")
            print(stderr_data)
            results.append({"success": p.returncode == 0,
                            "return_code": p.returncode})
        return results
if __name__ == "__main__":
    # Command line entry point: start a Ray cluster in docker, run the test
    # script in one or more driver processes, tear the cluster down, and
    # exit with a non-zero status if any driver failed.
    parser = argparse.ArgumentParser(
        description="Run multinode tests in Docker.")
    parser.add_argument("--docker-image", default="ray-project/deploy",
                        help="docker image")
    parser.add_argument("--mem-size", help="memory size")
    parser.add_argument("--shm-size", default="1G",
                        help="shared memory size")
    parser.add_argument("--num-nodes", default=1, type=int,
                        help="number of nodes to use in the cluster")
    parser.add_argument("--num-cpus", type=str,
                        help=("a comma separated list of values representing "
                              "the number of CPUs to start each node with"))
    parser.add_argument("--num-gpus", type=str,
                        help=("a comma separated list of values representing "
                              "the number of GPUs to start each node with"))
    parser.add_argument("--num-drivers", default=1, type=int,
                        help="number of drivers to run")
    parser.add_argument("--driver-locations", type=str,
                        help=("a comma separated list of indices of the "
                              "containers to run the drivers in"))
    parser.add_argument("--test-script", required=True, help="test script")
    parser.add_argument("--development-mode", action="store_true",
                        help="use local copies of the test scripts")
    args = parser.parse_args()

    # Parse the number of CPUs and GPUs to use for each worker. When a list
    # is not given, every node gets 10 CPUs and 0 GPUs.
    num_nodes = args.num_nodes
    num_cpus = ([int(i) for i in args.num_cpus.split(",")]
                if args.num_cpus is not None else num_nodes * [10])
    num_gpus = ([int(i) for i in args.num_gpus.split(",")]
                if args.num_gpus is not None else num_nodes * [0])
    # The driver locations arrive as a comma separated string, but run_test
    # uses them as integer indices into the list of containers, so they
    # must be converted here. None lets run_test pick containers randomly.
    driver_locations = (
        [int(i) for i in args.driver_locations.split(",")]
        if args.driver_locations is not None else None)

    d = DockerRunner()
    d.start_ray(docker_image=args.docker_image, mem_size=args.mem_size,
                shm_size=args.shm_size, num_nodes=num_nodes,
                num_cpus=num_cpus, num_gpus=num_gpus,
                development_mode=args.development_mode)
    try:
        run_results = d.run_test(args.test_script, args.num_drivers,
                                 driver_locations=driver_locations)
    finally:
        # Always tear the cluster down, even if running the test raised.
        d.stop_ray()

    any_failed = False
    for run_result in run_results:
        if "success" in run_result and run_result["success"]:
            print("RESULT: Test {} succeeded.".format(args.test_script))
        else:
            print("RESULT: Test {} failed.".format(args.test_script))
            any_failed = True

    sys.exit(1 if any_failed else 0)