[rllib] Fix impala stress test (#5101)

* add copy * upgrade to tf 1.14 * update * reduce count to workaround https://github.com/ray-project/ray/issues/5125 * Update impala.py * placeholder * comments * update
2026-07-01 11:10:02 +08:00 · 2019-07-09 20:22:30 -07:00
parent 5733690aa6
commit 5ab5017c67
4 changed files with 33 additions and 38 deletions
@@ -7,14 +7,14 @@ intended to run forever until they fail.
 Running the Workloads
 ---------------------

-To run the workloads, run
+To run the workloads, first edit the config.yaml and replace
+``RAY_WHEEL_TO_TEST_HERE`` with the desired version to test, then run:

 .. code-block:: bash

-    ./start_workloads.sh <ray-branch> <ray-version> <ray-commit>
+    ./start_workloads.sh

-using the appropriate values of ``<ray-branch>``, ``<ray-version>``, and
-``<ray-commit>``. This will start one EC2 instance per  workload and will start
+This will start one EC2 instance per workload and will start
 the workloads running (one per instance). Running the ``./start_workloads.sh``
 script again will clean up any state from the previous runs and will start the
 workloads again.
@@ -34,9 +34,17 @@ worker_nodes:

 # List of shell commands to run to set up nodes.
 setup_commands:
+    # Install TensorFlow 1.14 (pinned), removing the conda-managed wrapt first
+    - source activate tensorflow_p36 && conda remove -y --force wrapt || true
+    - source activate tensorflow_p36 && pip install -U tensorflow==1.14
    # Install nightly Ray wheels.
-    - source activate tensorflow_p36 && pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/<<<RAY_BRANCH>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-cp36-cp36m-manylinux1_x86_64.whl
+    # Example: https://s3-us-west-2.amazonaws.com/ray-wheels/<<<RAY_BRANCH>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-cp36-cp36m-manylinux1_x86_64.whl
+    - source activate tensorflow_p36 && pip install -U RAY_WHEEL_TO_TEST_HERE
    - source activate tensorflow_p36 && pip install ray[rllib] ray[debug] gym[atari]
+    - source activate tensorflow_p36 && pip install ray[debug]
+    - echo set-window-option -g mouse on > ~/.tmux.conf
+    - echo 'termcapinfo xterm* ti@:te@' > ~/.screenrc
+
    # Uncomment the following if you wish to build Ray instead.
    # - sudo apt-get update
    # - sudo apt-get install -y build-essential curl unzip
@@ -3,40 +3,16 @@
 set -e

 ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
-
-if [[ -z  "$1" ]]; then
-  echo "ERROR: The first argument must be the Ray branch to test (e.g., 'master')."
-  exit 1
-else
-  RAY_BRANCH=$1
-fi
-
-if [[ -z  "$2" ]]; then
-  echo "ERROR: The second argument must be the Ray version to test (e.g., '0.8.0.dev1')."
-  exit 1
-else
-  RAY_VERSION=$2
-fi
-
-if [[ -z  "$3" ]]; then
-  echo "ERROR: The third argument must be the Ray commit to test (e.g., '62e4b591e3d6443ce25b0f05cc32b43d5e2ebb3d')."
-  exit 1
-else
-  RAY_COMMIT=$3
-fi
-
-echo "Testing ray==$RAY_VERSION at commit $RAY_COMMIT."
-echo "The wheels used will live under https://s3-us-west-2.amazonaws.com/ray-wheels/$RAY_BRANCH/$RAY_VERSION/$RAY_COMMIT/"
-
-
 pushd "$ROOT_DIR"

 # Use config.yaml directly as the cluster config. It must already have the
 # RAY_WHEEL_TO_TEST_HERE placeholder replaced with the wheel to test.
-CLUSTER_CONFIG="config_temporary.yaml"
-sed -e "s/<<<RAY_BRANCH>>>/$RAY_BRANCH/g;
-        s/<<<RAY_VERSION>>>/$RAY_VERSION/g;
-        s/<<<RAY_COMMIT>>>/$RAY_COMMIT/;" config.yaml > "$CLUSTER_CONFIG"
+CLUSTER_CONFIG="config.yaml"
+
+if grep -q RAY_WHEEL_TO_TEST_HERE $CLUSTER_CONFIG; then
+    echo "You must replace the RAY_WHEEL_TO_TEST_HERE string in $CLUSTER_CONFIG."
+    exit 1
+fi

 # Start one instance per workload.
 for workload_file in "$ROOT_DIR"/workloads/*; do
@@ -47,6 +23,12 @@ done
 # Wait for all of the nodes to be up.
 wait

+status=$?
+if [ $status != 0 ]; then
+    echo "Some update processes failed with $status"
+    exit 1
+fi
+
 # Start the workloads running.
 for workload_file in "$ROOT_DIR"/workloads/*; do
  file_name=$(basename -- "$workload_file")
@@ -55,7 +37,7 @@ for workload_file in "$ROOT_DIR"/workloads/*; do
      # Copy the workload to the cluster.
      ray rsync_up $CLUSTER_CONFIG --cluster-name="$workload_name" "$workload_file" "$file_name"
      # Clean up previous runs if relevant.
-      ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "ray stop; rm -r /tmp/ray; tmux kill-server | true"
+      ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && ray stop; rm -r /tmp/ray; tmux kill-server | true"
      # Start the workload.
      ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && python $file_name" --tmux
   ) &
@@ -105,8 +105,13 @@ class AggregationWorkerBase(object):
            self.batch_buffer.append(sample_batch)
            if sum(b.count
                   for b in self.batch_buffer) >= self.train_batch_size:
-                train_batch = self.batch_buffer[0].concat_samples(
-                    self.batch_buffer)
+                if len(self.batch_buffer) == 1:
+                    # make a defensive copy to avoid sharing plasma memory
+                    # across multiple threads
+                    train_batch = self.batch_buffer[0].copy()
+                else:
+                    train_batch = self.batch_buffer[0].concat_samples(
+                        self.batch_buffer)
                yield train_batch
                self.batch_buffer = []