Application Stress Tests (#3612)

2026-06-27 19:16:19 +08:00 · 2019-01-16 02:05:16 -08:00
parent c28e6d41f5
commit fa99fda2b4
5 changed files with 269 additions and 0 deletions
@@ -9,6 +9,7 @@ import hashlib
 import logging
 import math
 import os
+from six import string_types
 from six.moves import queue
 import subprocess
 import threading
@@ -633,6 +634,8 @@ def check_extraneous(config, schema):
            continue
        elif isinstance(v, type):
            if not isinstance(config[k], v):
+                if v is str and isinstance(config[k], string_types):
+                    continue
                raise ValueError(
                    "Config key `{}` has wrong type {}, expected {}".format(
                        k,
@@ -0,0 +1,22 @@
+# Runs on a g3.16xl node with 5 m5.24xl workers
+# Takes roughly 10 minutes.
+atari-impala:
+    env:
+        grid_search:
+            - BreakoutNoFrameskip-v4
+            - BeamRiderNoFrameskip-v4
+            - QbertNoFrameskip-v4
+            - SpaceInvadersNoFrameskip-v4
+    run: IMPALA
+    stop:
+        timesteps_total: 3000000
+    config:
+        sample_batch_size: 50
+        train_batch_size: 500
+        num_workers: 128
+        num_envs_per_worker: 5
+        clip_rewards: True
+        lr_schedule: [
+            [0, 0.0005],
+            [20000000, 0.000000000001],
+        ]
@@ -0,0 +1,112 @@
+# A unique identifier for the head node and workers of this cluster.
+cluster_name: <<<CLUSTER_NAME>>>
+
+# The minimum number of worker nodes to launch in addition to the head
+# node. This number should be >= 0.
+min_workers: <<<MIN_WORKERS>>>
+
+# The maximum number of worker nodes to launch in addition to the head
+# node. This takes precedence over min_workers.
+max_workers: <<<MAX_WORKERS>>>
+
+# This executes all commands on all nodes in the docker container,
+# and opens all the necessary ports to support the Ray cluster.
+# Empty string means disabled.
+docker:
+    image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
+    container_name: "" # e.g. ray_docker
+
+# The autoscaler will scale up the cluster to this target fraction of resource
+# usage. For example, if a cluster of 10 nodes is 100% busy and
+# target_utilization is 0.8, it would resize the cluster to 13. This fraction
+# can be decreased to increase the aggressiveness of upscaling.
+# This value must be less than 1.0 for scaling to happen.
+target_utilization_fraction: 0.8
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Cloud-provider specific configuration.
+provider:
+    type: aws
+    region: us-west-2
+    # Availability zone(s), comma-separated, that nodes may be launched in.
+    # Nodes are currently spread between zones by a round-robin approach,
+    # however this implementation detail should not be relied upon.
+    availability_zone: us-west-2a,us-west-2b
+
+# How Ray will authenticate with newly launched nodes.
+auth:
+    ssh_user: ubuntu
+# By default Ray creates a new private keypair, but you can also use your own.
+# If you do so, make sure to also set "KeyName" in the head and worker node
+# configurations below.
+#    ssh_private_key: /path/to/your/key.pem
+
+# Provider-specific config for the head node, e.g. instance type. By default
+# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+# For more documentation on available fields, see:
+# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+head_node:
+    InstanceType: <<<HEAD_TYPE>>>
+    ImageId: ami-0d0ff0945ae093aea  # Amazon Deep Learning AMI (Ubuntu) 12/12/2018
+
+    # You can provision additional disk space with a conf as follows
+    BlockDeviceMappings:
+        - DeviceName: /dev/sda1
+          Ebs:
+              VolumeSize: 100
+
+    # Additional options in the boto docs.
+
+# Provider-specific config for worker nodes, e.g. instance type. By default
+# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+# For more documentation on available fields, see:
+# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+worker_nodes:
+    InstanceType: <<<WORKER_TYPE>>>
+    ImageId: ami-0d0ff0945ae093aea  # Amazon Deep Learning AMI (Ubuntu) 12/12/2018
+
+    # Run workers on spot by default. Comment this out to use on-demand.
+    InstanceMarketOptions:
+        MarketType: spot
+        # Additional options can be found in the boto docs, e.g.
+        #   SpotOptions:
+        #       MaxPrice: MAX_HOURLY_PRICE
+
+    # Additional options in the boto docs.
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+#    "/path1/on/remote/machine": "/path1/on/local/machine",
+#    "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+# List of shell commands to run to set up nodes.
+setup_commands:
+    - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_<<<PYTHON_VERSION>>>/bin:$PATH"' >> ~/.bashrc
+    - ray || wget https://s3-us-west-2.amazonaws.com/ray-wheels/latest/<<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl
+    - rllib || pip install -U <<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl[rllib]
+    - pip install -U tensorflow-gpu
+    # Consider uncommenting these if you also want to run apt-get commands during setup
+    # - sudo pkill -9 apt-get || true
+    # - sudo pkill -9 dpkg || true
+    # - sudo dpkg --configure -a
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands:
+    - pip install boto3==1.4.8  # 1.4.8 adds InstanceMarketOptions
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+head_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
@@ -0,0 +1,131 @@
+#!/usr/bin/env bash
+# This script runs all of the application tests.
+# It currently includes an IMPALA stress test and an SGD stress test,
+# each run on both Python 2.7 and 3.6.
+# All tests use a separate cluster, and each cluster
+# will be destroyed upon test completion (or failure).
+
+# Note that if the environment variable DEBUG_MODE is detected,
+# the clusters will not be automatically shut down after the test runs.
+
+# This script will exit with code 1 if any test did not run successfully.
+
+
+# Directory containing this script; the tests below pushd into it before
+# reading the cluster template and writing the generated cluster YAMLs.
+ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
+# Most recent tag of the checkout that contains this script. Using -C makes
+# the lookup independent of the directory the script is invoked from.
+RAY_VERSION=$(git -C "$ROOT_DIR" describe --tags --abbrev=0)
+# Timestamped log that receives one PASS/FAIL line per test.
+RESULT_FILE="$ROOT_DIR/results-$(date '+%Y-%m-%d_%H-%M-%S').log"
+
+echo "Testing on latest version of Ray: $RAY_VERSION"
+echo "Logging to $RESULT_FILE"
+# Quoted so that a ROOT_DIR containing spaces does not split into several files.
+touch "$RESULT_FILE"
+
+# This function identifies the right string for the Ray wheel.
+_find_wheel_str(){
+    local python_version=$1
+    # echo "PYTHON_VERSION", $python_version
+    local wheel_str=""
+    if [ $python_version == "p27" ]; then
+        wheel_str="cp27-cp27mu"
+    else
+        wheel_str="cp36-cp36m"
+    fi
+    echo $wheel_str
+}
+
+# Total time is roughly 25 minutes.
+# Actual test runtime is roughly 10 minutes.
+test_impala(){
+    local PYTHON_VERSION=$1
+    local WHEEL_STR=$(_find_wheel_str $PYTHON_VERSION)
+
+    pushd "$ROOT_DIR"
+        local TEST_NAME="rllib_impala_$PYTHON_VERSION"
+        local CLUSTER="$TEST_NAME.yaml"
+        echo "Creating IMPALA cluster YAML from template."
+
+        cat application_cluster_template.yaml |
+            sed -e "
+                s/<<<CLUSTER_NAME>>>/$TEST_NAME/;
+                s/<<<RAY_VERSION>>>/$RAY_VERSION/;
+                s/<<<HEAD_TYPE>>>/g3.16xlarge/;
+                s/<<<WORKER_TYPE>>>/m5.24xlarge/;
+                s/<<<MIN_WORKERS>>>/5/;
+                s/<<<MAX_WORKERS>>>/5/;
+                s/<<<PYTHON_VERSION>>>/$PYTHON_VERSION/;
+                s/<<<WHEEL_STR>>>/$WHEEL_STR/;" > $CLUSTER
+
+        echo "Try running IMPALA stress test."
+        {
+            RLLIB_DIR=../../python/ray/rllib/
+            ray up -y $CLUSTER &&
+            ray rsync_up $CLUSTER $RLLIB_DIR/tuned_examples/ tuned_examples/ &&
+            sleep 1 &&
+            ray exec $CLUSTER "
+                rllib train -f tuned_examples/atari-impala-large.yaml --redis-address='localhost:6379' --queue-trials" &&
+            echo "PASS: IMPALA Test for" $PYTHON_VERSION >> $RESULT_FILE
+        } || echo "FAIL: IMPALA Test for" $PYTHON_VERSION >> $RESULT_FILE
+
+        # Tear down cluster.
+        if [ "$DEBUG_MODE" = "" ]; then
+            ray down -y $CLUSTER
+            rm $CLUSTER
+        else
+            echo "Not tearing down cluster" $CLUSTER
+        fi
+    popd
+}
+
+# Run the SGD stress test on its own cluster.
+#   $1: Python version label ("p27" or "p36"); it is substituted into the
+#       cluster template and used in the cluster name.
+# Appends a "PASS: ..." or "FAIL: ..." line to $RESULT_FILE. The cluster is
+# torn down afterwards unless DEBUG_MODE is set to a non-empty value.
+# Total runtime is about 20 minutes (if the AWS spot instance order is fulfilled).
+# Actual test runtime is roughly 10 minutes.
+test_sgd(){
+    local PYTHON_VERSION=$1
+    # Declared separately so `local` does not mask the substitution's status.
+    local WHEEL_STR
+    WHEEL_STR=$(_find_wheel_str "$PYTHON_VERSION")
+
+    pushd "$ROOT_DIR"
+        local TEST_NAME="sgd_$PYTHON_VERSION"
+        local CLUSTER="$TEST_NAME.yaml"
+        echo "Creating SGD cluster YAML from template."
+
+        # Fill in the template placeholders for this test's cluster.
+        sed -e "
+            s/<<<CLUSTER_NAME>>>/$TEST_NAME/;
+            s/<<<RAY_VERSION>>>/$RAY_VERSION/;
+            s/<<<HEAD_TYPE>>>/g3.16xlarge/;
+            s/<<<WORKER_TYPE>>>/g3.16xlarge/;
+            s/<<<MIN_WORKERS>>>/3/;
+            s/<<<MAX_WORKERS>>>/3/;
+            s/<<<PYTHON_VERSION>>>/$PYTHON_VERSION/;
+            s/<<<WHEEL_STR>>>/$WHEEL_STR/;" application_cluster_template.yaml > "$CLUSTER"
+
+        echo "Try running SGD stress test."
+        # Any failing step in the && chain skips the PASS line and falls
+        # through to the FAIL line after the group.
+        {
+            SGD_DIR="$ROOT_DIR/../../python/ray/experimental/sgd/"
+            ray up -y "$CLUSTER" &&
+            # TODO: fix submit so that args work
+            ray rsync_up "$CLUSTER" "$SGD_DIR/mnist_example.py" mnist_example.py &&
+            sleep 1 &&
+            ray exec "$CLUSTER" "
+                python mnist_example.py --redis-address=localhost:6379 --num-iters=2000 --num-workers=8 --devices-per-worker=2 --gpu" &&
+            echo "PASS: SGD Test for $PYTHON_VERSION" >> "$RESULT_FILE"
+        } || echo "FAIL: SGD Test for $PYTHON_VERSION" >> "$RESULT_FILE"
+
+        # Tear down cluster.
+        if [ "$DEBUG_MODE" = "" ]; then
+            ray down -y "$CLUSTER"
+            rm "$CLUSTER"
+        else
+            echo "Not tearing down cluster $CLUSTER"
+        fi
+    popd
+}
+
+# RUN TESTS
+# Each test is run once per Python version label.
+for PYTHON_VERSION in "p27" "p36"
+do
+    test_impala "$PYTHON_VERSION"
+    test_sgd "$PYTHON_VERSION"
+done
+
+# Print the full results, then exit with code 1 if any FAIL line was logged.
+cat "$RESULT_FILE"
+# The FAIL lines are also written to test.log in the current directory.
+grep FAIL "$RESULT_FILE" > test.log
+[ ! -s test.log ] || exit 1
@@ -33,3 +33,4 @@ pushd "$ROOT_DIR"
 popd

 cat $RESULT_FILE
+[ ! -s $RESULT_FILE ] || exit 1