mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 19:16:19 +08:00
Application Stress Tests (#3612)
This commit is contained in:
@@ -9,6 +9,7 @@ import hashlib
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
from six import string_types
|
||||
from six.moves import queue
|
||||
import subprocess
|
||||
import threading
|
||||
@@ -633,6 +634,8 @@ def check_extraneous(config, schema):
|
||||
continue
|
||||
elif isinstance(v, type):
|
||||
if not isinstance(config[k], v):
|
||||
if v is str and isinstance(config[k], string_types):
|
||||
continue
|
||||
raise ValueError(
|
||||
"Config key `{}` has wrong type {}, expected {}".format(
|
||||
k,
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
# Runs on a g3.16xl node with 5 m5.24xl workers
|
||||
# Takes roughly 10 minutes.
|
||||
atari-impala:
|
||||
env:
|
||||
grid_search:
|
||||
- BreakoutNoFrameskip-v4
|
||||
- BeamRiderNoFrameskip-v4
|
||||
- QbertNoFrameskip-v4
|
||||
- SpaceInvadersNoFrameskip-v4
|
||||
run: IMPALA
|
||||
stop:
|
||||
timesteps_total: 3000000
|
||||
config:
|
||||
sample_batch_size: 50
|
||||
train_batch_size: 500
|
||||
num_workers: 128
|
||||
num_envs_per_worker: 5
|
||||
clip_rewards: True
|
||||
lr_schedule: [
|
||||
[0, 0.0005],
|
||||
[20000000, 0.000000000001],
|
||||
]
|
||||
@@ -0,0 +1,112 @@
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: <<<CLUSTER_NAME>>>
|
||||
|
||||
# The minimum number of worker nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: <<<MIN_WORKERS>>>
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: <<<MAX_WORKERS>>>
|
||||
|
||||
# This executes all commands on all nodes in the docker container,
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
|
||||
container_name: "" # e.g. ray_docker
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of resource
|
||||
# usage. For example, if a cluster of 10 nodes is 100% busy and
|
||||
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
|
||||
# can be decreased to increase the aggressiveness of upscaling.
|
||||
# This value must be less than 1.0 for scaling to happen.
|
||||
target_utilization_fraction: 0.8
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
# Availability zone(s), comma-separated, that nodes may be launched in.
|
||||
# Nodes are currently spread between zones by a round-robin approach,
|
||||
# however this implementation detail should not be relied upon.
|
||||
availability_zone: us-west-2a,us-west-2b
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
# By default Ray creates a new private keypair, but you can also use your own.
|
||||
# If you do so, make sure to also set "KeyName" in the head and worker node
|
||||
# configurations below.
|
||||
# ssh_private_key: /path/to/your/key.pem
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
head_node:
|
||||
InstanceType: <<<HEAD_TYPE>>>
|
||||
ImageId: ami-0d0ff0945ae093aea # Amazon Deep Learning AMI (Ubuntu) 12/12/2018
|
||||
|
||||
# You can provision additional disk space with a conf as follows
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 100
|
||||
|
||||
# Additional options in the boto docs.
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
worker_nodes:
|
||||
InstanceType: <<<WORKER_TYPE>>>
|
||||
ImageId: ami-0d0ff0945ae093aea # Amazon Deep Learning AMI (Ubuntu) 12/12/2018
|
||||
|
||||
# Run workers on spot by default. Comment this out to use on-demand.
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
# Additional options can be found in the boto docs, e.g.
|
||||
# SpotOptions:
|
||||
# MaxPrice: MAX_HOURLY_PRICE
|
||||
|
||||
# Additional options in the boto docs.
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
}
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow_<<<PYTHON_VERSION>>>/bin:$PATH"' >> ~/.bashrc
|
||||
- ray || wget https://s3-us-west-2.amazonaws.com/ray-wheels/latest/<<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl
|
||||
- rllib || pip install -U <<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl[rllib]
|
||||
- pip install -U tensorflow-gpu
|
||||
# Consider uncommenting these if you also want to run apt-get commands during setup
|
||||
# - sudo pkill -9 apt-get || true
|
||||
# - sudo pkill -9 dpkg || true
|
||||
# - sudo dpkg --configure -a
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
- pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
+131
@@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env bash
|
||||
# This script runs all of the application tests.
|
||||
# Currently includes an IMPALA stress test and an SGD stress test
|
||||
# on both Python 2.7 and 3.6.
|
||||
# All tests use a separate cluster, and each cluster
|
||||
# will be destroyed upon test completion (or failure).
|
||||
|
||||
# Note that if the environment variable DEBUG_MODE is detected,
|
||||
# the clusters will not be automatically shut down after the test runs.
|
||||
|
||||
# This script will exit with code 1 if the test did not run successfully.
|
||||
|
||||
|
||||
# Directory containing this script, so relative paths work from any cwd.
ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
# Most recent tag reachable from HEAD; substituted for <<<RAY_VERSION>>> in
# the cluster template.
RAY_VERSION=$(git describe --tags --abbrev=0)
# Timestamped log that collects one PASS/FAIL line per test run.
RESULT_FILE="$ROOT_DIR/results-$(date '+%Y-%m-%d_%H-%M-%S').log"

echo "Testing on latest version of Ray: $RAY_VERSION"
# Quote the path everywhere: it is derived from ROOT_DIR, which may contain
# whitespace.
echo "Logging to" "$RESULT_FILE"
touch "$RESULT_FILE"
|
||||
|
||||
# Print the wheel tag string for the Ray wheel matching a Python version label.
# Arguments:
#   $1 - Python version label. "p27" selects the CPython 2.7 tag; any other
#        value (including an empty or missing one) selects the CPython 3.6 tag.
# Outputs:
#   Writes "cp27-cp27mu" or "cp36-cp36m" to stdout.
_find_wheel_str(){
    local python_version=$1
    local wheel_str=""
    # The operand is quoted so that an empty/missing argument is compared as
    # an empty string instead of making `[` fail with a syntax error.
    if [ "$python_version" == "p27" ]; then
        wheel_str="cp27-cp27mu"
    else
        wheel_str="cp36-cp36m"
    fi
    echo "$wheel_str"
}
|
||||
|
||||
# Launch a dedicated cluster, run the RLlib IMPALA stress test on it, and
# append a PASS/FAIL line to $RESULT_FILE.
# Total time is roughly 25 minutes.
# Actual test runtime is roughly 10 minutes.
# Arguments:
#   $1 - Python version label ("p27" or "p36"); substituted into the cluster
#        template and used to pick the wheel string.
# Globals read: ROOT_DIR, RAY_VERSION, RESULT_FILE, DEBUG_MODE.
test_impala(){
    local PYTHON_VERSION=$1
    # Declare and assign separately so `local` does not mask the exit status
    # of the command substitution.
    local WHEEL_STR
    WHEEL_STR=$(_find_wheel_str "$PYTHON_VERSION")

    pushd "$ROOT_DIR"
    local TEST_NAME="rllib_impala_$PYTHON_VERSION"
    local CLUSTER="$TEST_NAME.yaml"
    echo "Creating IMPALA cluster YAML from template."

    # Fill in the <<<...>>> placeholders: a g3.16xlarge head node with exactly
    # five m5.24xlarge workers.
    sed -e "
        s/<<<CLUSTER_NAME>>>/$TEST_NAME/;
        s/<<<RAY_VERSION>>>/$RAY_VERSION/;
        s/<<<HEAD_TYPE>>>/g3.16xlarge/;
        s/<<<WORKER_TYPE>>>/m5.24xlarge/;
        s/<<<MIN_WORKERS>>>/5/;
        s/<<<MAX_WORKERS>>>/5/;
        s/<<<PYTHON_VERSION>>>/$PYTHON_VERSION/;
        s/<<<WHEEL_STR>>>/$WHEEL_STR/;" application_cluster_template.yaml > "$CLUSTER"

    echo "Try running IMPALA stress test."
    # Every step is chained with &&, so the first failure skips the PASS line
    # and falls through to the FAIL line after the group.
    {
        RLLIB_DIR=../../python/ray/rllib/
        ray up -y "$CLUSTER" &&
        ray rsync_up "$CLUSTER" "$RLLIB_DIR/tuned_examples/" tuned_examples/ &&
        sleep 1 &&
        ray exec "$CLUSTER" "
            rllib train -f tuned_examples/atari-impala-large.yaml --redis-address='localhost:6379' --queue-trials" &&
        echo "PASS: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
    } || echo "FAIL: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"

    # Tear down cluster (skipped when DEBUG_MODE is set to a non-empty value).
    if [ "$DEBUG_MODE" = "" ]; then
        ray down -y "$CLUSTER"
        rm "$CLUSTER"
    else
        echo "Not tearing down cluster" "$CLUSTER"
    fi
    popd
}
|
||||
|
||||
# Launch a dedicated cluster, run the distributed SGD (MNIST) stress test on
# it, and append a PASS/FAIL line to $RESULT_FILE.
# Total runtime is about 20 minutes (if the AWS spot instance order is fulfilled).
# Actual test runtime is roughly 10 minutes.
# Arguments:
#   $1 - Python version label ("p27" or "p36"); substituted into the cluster
#        template and used to pick the wheel string.
# Globals read: ROOT_DIR, RAY_VERSION, RESULT_FILE, DEBUG_MODE.
test_sgd(){
    local PYTHON_VERSION=$1
    # Declare and assign separately so `local` does not mask the exit status
    # of the command substitution.
    local WHEEL_STR
    WHEEL_STR=$(_find_wheel_str "$PYTHON_VERSION")

    pushd "$ROOT_DIR"
    local TEST_NAME="sgd_$PYTHON_VERSION"
    local CLUSTER="$TEST_NAME.yaml"
    echo "Creating SGD cluster YAML from template."

    # Fill in the <<<...>>> placeholders: a g3.16xlarge head node with exactly
    # three g3.16xlarge workers.
    sed -e "
        s/<<<CLUSTER_NAME>>>/$TEST_NAME/;
        s/<<<RAY_VERSION>>>/$RAY_VERSION/;
        s/<<<HEAD_TYPE>>>/g3.16xlarge/;
        s/<<<WORKER_TYPE>>>/g3.16xlarge/;
        s/<<<MIN_WORKERS>>>/3/;
        s/<<<MAX_WORKERS>>>/3/;
        s/<<<PYTHON_VERSION>>>/$PYTHON_VERSION/;
        s/<<<WHEEL_STR>>>/$WHEEL_STR/;" application_cluster_template.yaml > "$CLUSTER"

    echo "Try running SGD stress test."
    # Every step is chained with &&, so the first failure skips the PASS line
    # and falls through to the FAIL line after the group.
    {
        SGD_DIR=$ROOT_DIR/../../python/ray/experimental/sgd/
        ray up -y "$CLUSTER" &&
        # TODO: fix submit so that args work
        ray rsync_up "$CLUSTER" "$SGD_DIR/mnist_example.py" mnist_example.py &&
        sleep 1 &&
        ray exec "$CLUSTER" "
            python mnist_example.py --redis-address=localhost:6379 --num-iters=2000 --num-workers=8 --devices-per-worker=2 --gpu" &&
        echo "PASS: SGD Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
    } || echo "FAIL: SGD Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"

    # Tear down cluster (skipped when DEBUG_MODE is set to a non-empty value).
    if [ "$DEBUG_MODE" = "" ]; then
        ray down -y "$CLUSTER"
        rm "$CLUSTER"
    else
        echo "Not tearing down cluster" "$CLUSTER"
    fi
    popd
}
|
||||
|
||||
# RUN TESTS
# Run each stress test once per Python version label; every test appends its
# own PASS/FAIL line to $RESULT_FILE.
for PYTHON_VERSION in "p27" "p36"
do
    test_impala "$PYTHON_VERSION"
    test_sgd "$PYTHON_VERSION"
done

cat "$RESULT_FILE"
# Collect the FAIL lines into test.log (written to the current directory).
# A non-empty test.log means at least one test failed, so exit with code 1.
grep FAIL "$RESULT_FILE" > test.log
[ ! -s test.log ] || exit 1
|
||||
@@ -33,3 +33,4 @@ pushd "$ROOT_DIR"
|
||||
popd
|
||||
|
||||
cat $RESULT_FILE
|
||||
[ ! -s $RESULT_FILE ] || exit 1
|
||||
|
||||
Reference in New Issue
Block a user