From fa99fda2b46f5f01c09c62677d0b6538ed4ecc10 Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Wed, 16 Jan 2019 02:05:16 -0800 Subject: [PATCH] Application Stress Tests (#3612) --- python/ray/autoscaler/autoscaler.py | 3 + .../tuned_examples/atari-impala-large.yaml | 22 +++ .../application_cluster_template.yaml | 112 +++++++++++++++ .../run_application_stress_tests.sh | 131 ++++++++++++++++++ test/stress_tests/run_stress_tests.sh | 1 + 5 files changed, 269 insertions(+) create mode 100644 python/ray/rllib/tuned_examples/atari-impala-large.yaml create mode 100644 test/stress_tests/application_cluster_template.yaml create mode 100755 test/stress_tests/run_application_stress_tests.sh diff --git a/python/ray/autoscaler/autoscaler.py b/python/ray/autoscaler/autoscaler.py index 1b2f45ebf..aa4391827 100644 --- a/python/ray/autoscaler/autoscaler.py +++ b/python/ray/autoscaler/autoscaler.py @@ -9,6 +9,7 @@ import hashlib import logging import math import os +from six import string_types from six.moves import queue import subprocess import threading @@ -633,6 +634,8 @@ def check_extraneous(config, schema): continue elif isinstance(v, type): if not isinstance(config[k], v): + if v is str and isinstance(config[k], string_types): + continue raise ValueError( "Config key `{}` has wrong type {}, expected {}".format( k, diff --git a/python/ray/rllib/tuned_examples/atari-impala-large.yaml b/python/ray/rllib/tuned_examples/atari-impala-large.yaml new file mode 100644 index 000000000..c97238d37 --- /dev/null +++ b/python/ray/rllib/tuned_examples/atari-impala-large.yaml @@ -0,0 +1,22 @@ +# Runs on a g3.16xl node with 5 m5.24xl workers +# Takes roughly 10 minutes. 
+atari-impala: + env: + grid_search: + - BreakoutNoFrameskip-v4 + - BeamRiderNoFrameskip-v4 + - QbertNoFrameskip-v4 + - SpaceInvadersNoFrameskip-v4 + run: IMPALA + stop: + timesteps_total: 3000000 + config: + sample_batch_size: 50 + train_batch_size: 500 + num_workers: 128 + num_envs_per_worker: 5 + clip_rewards: True + lr_schedule: [ + [0, 0.0005], + [20000000, 0.000000000001], + ] diff --git a/test/stress_tests/application_cluster_template.yaml b/test/stress_tests/application_cluster_template.yaml new file mode 100644 index 000000000..5549fadd1 --- /dev/null +++ b/test/stress_tests/application_cluster_template.yaml @@ -0,0 +1,112 @@ +# A unique identifier for the head node and workers of this cluster. +cluster_name: <<>> + +# The minimum number of worker nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: <<>> + +# The maximum number of worker nodes to launch in addition to the head +# node. This takes precedence over min_workers. +max_workers: <<>> + +# This executes all commands on all nodes in the docker container, +# and opens all the necessary ports to support the Ray cluster. +# Empty string means disabled. +docker: + image: "" # e.g., tensorflow/tensorflow:1.5.0-py3 + container_name: "" # e.g. ray_docker + +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. +target_utilization_fraction: 0.8 + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 5 + +# Cloud-provider specific configuration. +provider: + type: aws + region: us-west-2 + # Availability zone(s), comma-separated, that nodes may be launched in. 
+ # Nodes are currently spread between zones by a round-robin approach, + # however this implementation detail should not be relied upon. + availability_zone: us-west-2a,us-west-2b + +# How Ray will authenticate with newly launched nodes. +auth: + ssh_user: ubuntu +# By default Ray creates a new private keypair, but you can also use your own. +# If you do so, make sure to also set "KeyName" in the head and worker node +# configurations below. +# ssh_private_key: /path/to/your/key.pem + +# Provider-specific config for the head node, e.g. instance type. By default +# Ray will auto-configure unspecified fields such as SubnetId and KeyName. +# For more documentation on available fields, see: +# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances +head_node: + InstanceType: <<>> + ImageId: ami-0d0ff0945ae093aea # Amazon Deep Learning AMI (Ubuntu) 12/12/2018 + + # You can provision additional disk space with a conf as follows + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 100 + + # Additional options in the boto docs. + +# Provider-specific config for worker nodes, e.g. instance type. By default +# Ray will auto-configure unspecified fields such as SubnetId and KeyName. +# For more documentation on available fields, see: +# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances +worker_nodes: + InstanceType: <<>> + ImageId: ami-0d0ff0945ae093aea # Amazon Deep Learning AMI (Ubuntu) 12/12/2018 + + # Run workers on spot by default. Comment this out to use on-demand. + InstanceMarketOptions: + MarketType: spot + # Additional options can be found in the boto docs, e.g. + # SpotOptions: + # MaxPrice: MAX_HOURLY_PRICE + + # Additional options in the boto docs. + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. 
+file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +# List of shell commands to run to set up nodes. +setup_commands: + - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_<<>>/bin:$PATH"' >> ~/.bashrc + - ray || wget https://s3-us-west-2.amazonaws.com/ray-wheels/latest/<<>>-<<>>-manylinux1_x86_64.whl + - rllib || pip install -U <<>>-<<>>-manylinux1_x86_64.whl[rllib] + - pip install -U tensorflow-gpu + # Consider uncommenting these if you also want to run apt-get commands during setup + # - sudo pkill -9 apt-get || true + # - sudo pkill -9 dpkg || true + # - sudo dpkg --configure -a + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: + - pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml + +# Command to start ray on worker nodes. You don't need to change this. +worker_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/test/stress_tests/run_application_stress_tests.sh b/test/stress_tests/run_application_stress_tests.sh new file mode 100755 index 000000000..0c4531f06 --- /dev/null +++ b/test/stress_tests/run_application_stress_tests.sh @@ -0,0 +1,131 @@ +#!/usr/bin/env bash +# This script runs all of the application tests. +# Currently includes an IMPALA stress test and an SGD stress test, +# on both Python 2.7 and 3.6. +# All tests use a separate cluster, and each cluster +# will be destroyed upon test completion (or failure). 
+ +# Note that if the environment variable DEBUG_MODE is detected, +# the clusters will not be automatically shut down after the test runs. + +# This script will exit with code 1 if the test did not run successfully. + + +ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) +RAY_VERSION=$(git describe --tags --abbrev=0) +RESULT_FILE=$ROOT_DIR/"results-$(date '+%Y-%m-%d_%H-%M-%S').log" + +echo "Testing on latest version of Ray: $RAY_VERSION" +echo "Logging to" $RESULT_FILE +touch $RESULT_FILE + +# This function identifies the right string for the Ray wheel. +_find_wheel_str(){ + local python_version=$1 + # echo "PYTHON_VERSION", $python_version + local wheel_str="" + if [ $python_version == "p27" ]; then + wheel_str="cp27-cp27mu" + else + wheel_str="cp36-cp36m" + fi + echo $wheel_str +} + +# Total time is roughly 25 minutes. +# Actual test runtime is roughly 10 minutes. +test_impala(){ + local PYTHON_VERSION=$1 + local WHEEL_STR=$(_find_wheel_str $PYTHON_VERSION) + + pushd "$ROOT_DIR" + local TEST_NAME="rllib_impala_$PYTHON_VERSION" + local CLUSTER="$TEST_NAME.yaml" + echo "Creating IMPALA cluster YAML from template." + + cat application_cluster_template.yaml | + sed -e " + s/<<>>/$TEST_NAME/; + s/<<>>/$RAY_VERSION/; + s/<<>>/g3.16xlarge/; + s/<<>>/m5.24xlarge/; + s/<<>>/5/; + s/<<>>/5/; + s/<<>>/$PYTHON_VERSION/; + s/<<>>/$WHEEL_STR/;" > $CLUSTER + + echo "Try running IMPALA stress test." + { + RLLIB_DIR=../../python/ray/rllib/ + ray up -y $CLUSTER && + ray rsync_up $CLUSTER $RLLIB_DIR/tuned_examples/ tuned_examples/ && + sleep 1 && + ray exec $CLUSTER " + rllib train -f tuned_examples/atari-impala-large.yaml --redis-address='localhost:6379' --queue-trials" && + echo "PASS: IMPALA Test for" $PYTHON_VERSION >> $RESULT_FILE + } || echo "FAIL: IMPALA Test for" $PYTHON_VERSION >> $RESULT_FILE + + # Tear down cluster. 
+ if [ "$DEBUG_MODE" = "" ]; then + ray down -y $CLUSTER + rm $CLUSTER + else + echo "Not tearing down cluster" $CLUSTER + fi + popd +} + +# Total runtime is about 20 minutes (if the AWS spot instance order is fulfilled). +# Actual test runtime is roughly 10 minutes. +test_sgd(){ + local PYTHON_VERSION=$1 + local WHEEL_STR=$(_find_wheel_str $PYTHON_VERSION) + + pushd "$ROOT_DIR" + local TEST_NAME="sgd_$PYTHON_VERSION" + local CLUSTER="$TEST_NAME.yaml" + echo "Creating SGD cluster YAML from template." + + cat application_cluster_template.yaml | + sed -e " + s/<<>>/$TEST_NAME/; + s/<<>>/$RAY_VERSION/; + s/<<>>/g3.16xlarge/; + s/<<>>/g3.16xlarge/; + s/<<>>/3/; + s/<<>>/3/; + s/<<>>/$PYTHON_VERSION/; + s/<<>>/$WHEEL_STR/;" > $CLUSTER + + echo "Try running SGD stress test." + { + SGD_DIR=$ROOT_DIR/../../python/ray/experimental/sgd/ + ray up -y $CLUSTER && + # TODO: fix submit so that args work + ray rsync_up $CLUSTER $SGD_DIR/mnist_example.py mnist_example.py && + sleep 1 && + ray exec $CLUSTER " + python mnist_example.py --redis-address=localhost:6379 --num-iters=2000 --num-workers=8 --devices-per-worker=2 --gpu" && + echo "PASS: SGD Test for" $PYTHON_VERSION >> $RESULT_FILE + } || echo "FAIL: SGD Test for" $PYTHON_VERSION >> $RESULT_FILE + + # Tear down cluster. + if [ "$DEBUG_MODE" = "" ]; then + ray down -y $CLUSTER + rm $CLUSTER + else + echo "Not tearing down cluster" $CLUSTER + fi + popd +} + +# RUN TESTS +for PYTHON_VERSION in "p27" "p36" +do + test_impala $PYTHON_VERSION + test_sgd $PYTHON_VERSION +done + +cat $RESULT_FILE +cat $RESULT_FILE | grep FAIL > test.log +[ ! -s test.log ] || exit 1 diff --git a/test/stress_tests/run_stress_tests.sh b/test/stress_tests/run_stress_tests.sh index eb3487a41..394f65deb 100755 --- a/test/stress_tests/run_stress_tests.sh +++ b/test/stress_tests/run_stress_tests.sh @@ -33,3 +33,4 @@ pushd "$ROOT_DIR" popd cat $RESULT_FILE +[ ! -s $RESULT_FILE ] || exit 1