Application Stress Tests (#3612)

This commit is contained in:
Richard Liaw
2019-01-16 02:05:16 -08:00
committed by GitHub
parent c28e6d41f5
commit fa99fda2b4
5 changed files with 269 additions and 0 deletions
+3
View File
@@ -9,6 +9,7 @@ import hashlib
import logging
import math
import os
from six import string_types
from six.moves import queue
import subprocess
import threading
@@ -633,6 +634,8 @@ def check_extraneous(config, schema):
continue
elif isinstance(v, type):
if not isinstance(config[k], v):
if v is str and isinstance(config[k], string_types):
continue
raise ValueError(
"Config key `{}` has wrong type {}, expected {}".format(
k,
@@ -0,0 +1,22 @@
# Runs on a g3.16xl node with 5 m5.24xl workers
# Takes roughly 10 minutes.
atari-impala:
env:
grid_search:
- BreakoutNoFrameskip-v4
- BeamRiderNoFrameskip-v4
- QbertNoFrameskip-v4
- SpaceInvadersNoFrameskip-v4
run: IMPALA
stop:
timesteps_total: 3000000
config:
sample_batch_size: 50
train_batch_size: 500
num_workers: 128
num_envs_per_worker: 5
clip_rewards: True
lr_schedule: [
[0, 0.0005],
[20000000, 0.000000000001],
]
@@ -0,0 +1,112 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: <<<CLUSTER_NAME>>>
# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: <<<MIN_WORKERS>>>
# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: <<<MAX_WORKERS>>>
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
container_name: "" # e.g. ray_docker
# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
# can be decreased to increase the aggressiveness of upscaling.
# This value must be less than 1.0 for scaling to happen.
target_utilization_fraction: 0.8
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: <<<HEAD_TYPE>>>
ImageId: ami-0d0ff0945ae093aea # Amazon Deep Learning AMI (Ubuntu) 12/12/2018
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: <<<WORKER_TYPE>>>
ImageId: ami-0d0ff0945ae093aea # Amazon Deep Learning AMI (Ubuntu) 12/12/2018
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# List of shell commands to run to set up nodes.
setup_commands:
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow_<<<PYTHON_VERSION>>>/bin:$PATH"' >> ~/.bashrc
- ray || wget https://s3-us-west-2.amazonaws.com/ray-wheels/latest/<<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl
- rllib || pip install -U <<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl[rllib]
- pip install -U tensorflow-gpu
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
# - sudo dpkg --configure -a
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
+131
View File
@@ -0,0 +1,131 @@
#!/usr/bin/env bash
# This script runs all of the application tests.
# Currently includes an IMPALA stress test and an SGD stress test,
# each run on both Python 2.7 and 3.6.
# All tests use a separate cluster, and each cluster
# will be destroyed upon test completion (or failure).
# Note that if the environment variable DEBUG_MODE is detected,
# the clusters will not be automatically shut down after the test runs.
# This script will exit with code 1 if the test did not run successfully.
# Directory containing this script. The cluster template is read from here and
# the result log is written here. Abort if it cannot be resolved, since every
# later path depends on it.
ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")" && pwd) || exit 1
# Most recent tag reachable from the current checkout; substituted into the
# wheel file name in the cluster template.
RAY_VERSION=$(git describe --tags --abbrev=0)
# Timestamped log that collects one PASS/FAIL line per test run.
RESULT_FILE="$ROOT_DIR/results-$(date '+%Y-%m-%d_%H-%M-%S').log"
echo "Testing on latest version of Ray: $RAY_VERSION"
echo "Logging to" "$RESULT_FILE"
# Create the log up front so it exists even if no test writes to it.
touch "$RESULT_FILE"
# Prints the Ray wheel ABI tag that matches a Python version label.
# Arguments: $1 - Python version label used by this script ("p27" or "p36").
# Outputs:   "cp27-cp27mu" for "p27"; "cp36-cp36m" for any other value,
#            including an empty or missing argument.
_find_wheel_str() {
  local python_version=${1:-}
  local wheel_str
  # `case` matches the quoted value, so an empty or multi-word label cannot
  # make the comparison itself fail (as an unquoted `[ $x == ... ]` would).
  case "$python_version" in
    p27) wheel_str="cp27-cp27mu" ;;
    *) wheel_str="cp36-cp36m" ;;
  esac
  printf '%s\n' "$wheel_str"
}
# Runs the IMPALA stress test on its own cluster and records the outcome.
# Total time is roughly 25 minutes.
# Actual test runtime is roughly 10 minutes.
# Globals:   ROOT_DIR, RAY_VERSION, RESULT_FILE, DEBUG_MODE (all read)
# Arguments: $1 - Python version label ("p27" or "p36")
# Outputs:   appends "PASS: IMPALA Test for <label>" or
#            "FAIL: IMPALA Test for <label>" to $RESULT_FILE
test_impala() {
  local PYTHON_VERSION=$1
  # Declared separately so `local` does not mask the helper's exit status.
  local WHEEL_STR
  WHEEL_STR=$(_find_wheel_str "$PYTHON_VERSION")
  local TEST_NAME="rllib_impala_$PYTHON_VERSION"
  local CLUSTER="$TEST_NAME.yaml"
  # Relative to ROOT_DIR, which becomes the working directory below.
  local RLLIB_DIR=../../python/ray/rllib/

  # The template and the generated cluster file are addressed relative to
  # ROOT_DIR; without it nothing below can work, so record a failure.
  if ! pushd "$ROOT_DIR"; then
    echo "FAIL: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
    return 1
  fi

  echo "Creating IMPALA cluster YAML from template."
  sed -e "
    s/<<<CLUSTER_NAME>>>/$TEST_NAME/;
    s/<<<RAY_VERSION>>>/$RAY_VERSION/;
    s/<<<HEAD_TYPE>>>/g3.16xlarge/;
    s/<<<WORKER_TYPE>>>/m5.24xlarge/;
    s/<<<MIN_WORKERS>>>/5/;
    s/<<<MAX_WORKERS>>>/5/;
    s/<<<PYTHON_VERSION>>>/$PYTHON_VERSION/;
    s/<<<WHEEL_STR>>>/$WHEEL_STR/;" application_cluster_template.yaml > "$CLUSTER"

  echo "Try running IMPALA stress test."
  # Every step must succeed for the run to count as a pass.
  if ray up -y "$CLUSTER" &&
    ray rsync_up "$CLUSTER" "$RLLIB_DIR/tuned_examples/" tuned_examples/ &&
    sleep 1 &&
    ray exec "$CLUSTER" "
      rllib train -f tuned_examples/atari-impala-large.yaml --redis-address='localhost:6379' --queue-trials"; then
    echo "PASS: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
  else
    echo "FAIL: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
  fi

  # Tear down cluster (skipped when DEBUG_MODE is set, to allow inspection).
  if [ -z "${DEBUG_MODE:-}" ]; then
    ray down -y "$CLUSTER"
    rm -- "$CLUSTER"
  else
    echo "Not tearing down cluster" "$CLUSTER"
  fi
  popd
}
# Runs the SGD stress test on its own cluster and records the outcome.
# Total runtime is about 20 minutes (if the AWS spot instance order is fulfilled).
# Actual test runtime is roughly 10 minutes.
# Globals:   ROOT_DIR, RAY_VERSION, RESULT_FILE, DEBUG_MODE (all read)
# Arguments: $1 - Python version label ("p27" or "p36")
# Outputs:   appends "PASS: SGD Test for <label>" or
#            "FAIL: SGD Test for <label>" to $RESULT_FILE
test_sgd() {
  local PYTHON_VERSION=$1
  # Declared separately so `local` does not mask the helper's exit status.
  local WHEEL_STR
  WHEEL_STR=$(_find_wheel_str "$PYTHON_VERSION")
  local TEST_NAME="sgd_$PYTHON_VERSION"
  local CLUSTER="$TEST_NAME.yaml"
  local SGD_DIR="$ROOT_DIR/../../python/ray/experimental/sgd/"

  # The template and the generated cluster file are addressed relative to
  # ROOT_DIR; without it nothing below can work, so record a failure.
  if ! pushd "$ROOT_DIR"; then
    echo "FAIL: SGD Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
    return 1
  fi

  echo "Creating SGD cluster YAML from template."
  sed -e "
    s/<<<CLUSTER_NAME>>>/$TEST_NAME/;
    s/<<<RAY_VERSION>>>/$RAY_VERSION/;
    s/<<<HEAD_TYPE>>>/g3.16xlarge/;
    s/<<<WORKER_TYPE>>>/g3.16xlarge/;
    s/<<<MIN_WORKERS>>>/3/;
    s/<<<MAX_WORKERS>>>/3/;
    s/<<<PYTHON_VERSION>>>/$PYTHON_VERSION/;
    s/<<<WHEEL_STR>>>/$WHEEL_STR/;" application_cluster_template.yaml > "$CLUSTER"

  echo "Try running SGD stress test."
  # TODO: fix submit so that args work
  # Every step must succeed for the run to count as a pass.
  if ray up -y "$CLUSTER" &&
    ray rsync_up "$CLUSTER" "$SGD_DIR/mnist_example.py" mnist_example.py &&
    sleep 1 &&
    ray exec "$CLUSTER" "
      python mnist_example.py --redis-address=localhost:6379 --num-iters=2000 --num-workers=8 --devices-per-worker=2 --gpu"; then
    echo "PASS: SGD Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
  else
    echo "FAIL: SGD Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
  fi

  # Tear down cluster (skipped when DEBUG_MODE is set, to allow inspection).
  if [ -z "${DEBUG_MODE:-}" ]; then
    ray down -y "$CLUSTER"
    rm -- "$CLUSTER"
  else
    echo "Not tearing down cluster" "$CLUSTER"
  fi
  popd
}
# RUN TESTS
# Each Python version label gets its own IMPALA run and its own SGD run.
for PYTHON_VERSION in "p27" "p36"; do
  test_impala "$PYTHON_VERSION"
  test_sgd "$PYTHON_VERSION"
done

# Show the full PASS/FAIL summary.
cat "$RESULT_FILE"
# Collect the failure lines in test.log (current directory); a non-empty
# test.log means at least one test failed, so exit with code 1.
grep FAIL "$RESULT_FILE" > test.log
[ ! -s test.log ] || exit 1
+1
View File
@@ -33,3 +33,4 @@ pushd "$ROOT_DIR"
popd
cat $RESULT_FILE
[ ! -s $RESULT_FILE ] || exit 1