mirror of
https://github.com/wassname/ray.git
synced 2026-07-01 11:10:02 +08:00
[rllib] Fix impala stress test (#5101)
* add copy * upgrade to tf 1.14 * update * reduce count to workaround https://github.com/ray-project/ray/issues/5125 * Update impala.py * placeholder * comments * update
This commit is contained in:
@@ -7,14 +7,14 @@ intended to run forever until they fail.
|
||||
Running the Workloads
|
||||
---------------------
|
||||
|
||||
To run the workloads, run
|
||||
To run the workloads, first edit the config.yaml and replace
|
||||
``RAY_WHEEL_TO_TEST_HERE`` with the URL of the Ray wheel to test, then run:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./start_workloads.sh <ray-branch> <ray-version> <ray-commit>
|
||||
./start_workloads.sh
|
||||
|
||||
using the appropriate values of ``<ray-branch>``, ``<ray-version>``, and
|
||||
``<ray-commit>``. This will start one EC2 instance per workload and will start
|
||||
This will start one EC2 instance per workload and will start
|
||||
the workloads running (one per instance). Running the ``./start_workloads.sh``
|
||||
script again will clean up any state from the previous runs and will start the
|
||||
workloads again.
|
||||
|
||||
@@ -34,9 +34,17 @@ worker_nodes:
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
# Install TensorFlow 1.14 (pinned)
|
||||
- source activate tensorflow_p36 && conda remove -y --force wrapt || true
|
||||
- source activate tensorflow_p36 && pip install -U tensorflow==1.14
|
||||
# Install the Ray wheel to test.
|
||||
- source activate tensorflow_p36 && pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/<<<RAY_BRANCH>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-cp36-cp36m-manylinux1_x86_64.whl
|
||||
# Example: https://s3-us-west-2.amazonaws.com/ray-wheels/<<<RAY_BRANCH>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-cp36-cp36m-manylinux1_x86_64.whl
|
||||
- source activate tensorflow_p36 && pip install -U RAY_WHEEL_TO_TEST_HERE
|
||||
- source activate tensorflow_p36 && pip install ray[rllib] ray[debug] gym[atari]
|
||||
- source activate tensorflow_p36 && pip install ray[debug]
|
||||
- echo set-window-option -g mouse on > ~/.tmux.conf
|
||||
- echo 'termcapinfo xterm* ti@:te@' > ~/.screenrc
|
||||
|
||||
# Uncomment the following if you wish to build Ray instead.
|
||||
# - sudo apt-get update
|
||||
# - sudo apt-get install -y build-essential curl unzip
|
||||
|
||||
@@ -3,40 +3,16 @@
|
||||
set -e
|
||||
|
||||
ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
|
||||
|
||||
if [[ -z "$1" ]]; then
|
||||
echo "ERROR: The first argument must be the Ray branch to test (e.g., 'master')."
|
||||
exit 1
|
||||
else
|
||||
RAY_BRANCH=$1
|
||||
fi
|
||||
|
||||
if [[ -z "$2" ]]; then
|
||||
echo "ERROR: The second argument must be the Ray version to test (e.g., '0.8.0.dev1')."
|
||||
exit 1
|
||||
else
|
||||
RAY_VERSION=$2
|
||||
fi
|
||||
|
||||
if [[ -z "$3" ]]; then
|
||||
echo "ERROR: The third argument must be the Ray commit to test (e.g., '62e4b591e3d6443ce25b0f05cc32b43d5e2ebb3d')."
|
||||
exit 1
|
||||
else
|
||||
RAY_COMMIT=$3
|
||||
fi
|
||||
|
||||
echo "Testing ray==$RAY_VERSION at commit $RAY_COMMIT."
|
||||
echo "The wheels used will live under https://s3-us-west-2.amazonaws.com/ray-wheels/$RAY_BRANCH/$RAY_VERSION/$RAY_COMMIT/"
|
||||
|
||||
|
||||
pushd "$ROOT_DIR"
|
||||
|
||||
# Substitute in the appropriate Ray version and commit in the config file and
|
||||
# store it in a temporary file.
|
||||
CLUSTER_CONFIG="config_temporary.yaml"
|
||||
sed -e "s/<<<RAY_BRANCH>>>/$RAY_BRANCH/g;
|
||||
s/<<<RAY_VERSION>>>/$RAY_VERSION/g;
|
||||
s/<<<RAY_COMMIT>>>/$RAY_COMMIT/;" config.yaml > "$CLUSTER_CONFIG"
|
||||
CLUSTER_CONFIG="config.yaml"
|
||||
|
||||
if grep -q RAY_WHEEL_TO_TEST_HERE $CLUSTER_CONFIG; then
|
||||
echo "You must replace the RAY_WHEEL_TO_TEST_HERE string in $CLUSTER_CONFIG."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Start one instance per workload.
|
||||
for workload_file in "$ROOT_DIR"/workloads/*; do
|
||||
@@ -47,6 +23,12 @@ done
|
||||
# Wait for all of the nodes to be up.
|
||||
wait
|
||||
|
||||
status=$?
|
||||
if [ $status != 0 ]; then
|
||||
echo "Some update processes failed with $status"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Start the workloads running.
|
||||
for workload_file in "$ROOT_DIR"/workloads/*; do
|
||||
file_name=$(basename -- "$workload_file")
|
||||
@@ -55,7 +37,7 @@ for workload_file in "$ROOT_DIR"/workloads/*; do
|
||||
# Copy the workload to the cluster.
|
||||
ray rsync_up $CLUSTER_CONFIG --cluster-name="$workload_name" "$workload_file" "$file_name"
|
||||
# Clean up previous runs if relevant.
|
||||
ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "ray stop; rm -r /tmp/ray; tmux kill-server | true"
|
||||
ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && ray stop; rm -r /tmp/ray; tmux kill-server | true"
|
||||
# Start the workload.
|
||||
ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && python $file_name" --tmux
|
||||
) &
|
||||
|
||||
@@ -105,8 +105,13 @@ class AggregationWorkerBase(object):
|
||||
self.batch_buffer.append(sample_batch)
|
||||
if sum(b.count
|
||||
for b in self.batch_buffer) >= self.train_batch_size:
|
||||
train_batch = self.batch_buffer[0].concat_samples(
|
||||
self.batch_buffer)
|
||||
if len(self.batch_buffer) == 1:
|
||||
# make a defensive copy to avoid sharing plasma memory
|
||||
# across multiple threads
|
||||
train_batch = self.batch_buffer[0].copy()
|
||||
else:
|
||||
train_batch = self.batch_buffer[0].concat_samples(
|
||||
self.batch_buffer)
|
||||
yield train_batch
|
||||
self.batch_buffer = []
|
||||
|
||||
|
||||
Reference in New Issue
Block a user