diff --git a/ci/long_running_tests/README.rst b/ci/long_running_tests/README.rst
index 42526682c..736862abe 100644
--- a/ci/long_running_tests/README.rst
+++ b/ci/long_running_tests/README.rst
@@ -15,9 +15,9 @@ the previous runs and will start the workloads again.
 Check Workload Statuses
 -----------------------
 
-To check up on the workloads, run ``./check_workloads.sh``. This will print the
-tail of each workload, and from the output you might be able to see if something
-has failed.
+To check up on the workloads, run either ``./check_workloads.sh --load``, which
+will print the load on each machine, or ``./check_workloads.sh --logs``, which
+will print the tail of the output for each workload.
 
 To debug workloads that have failed, you may find it useful to ssh to the
 relevant machine, attach to the tmux session (usually ``tmux a -t 0``), inspect
diff --git a/ci/long_running_tests/check_workloads.sh b/ci/long_running_tests/check_workloads.sh
index 591b0d7f8..a7a613848 100755
--- a/ci/long_running_tests/check_workloads.sh
+++ b/ci/long_running_tests/check_workloads.sh
@@ -4,21 +4,33 @@
 
 ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
 
-pushd "$ROOT_DIR"
+if [ "$1" == "--load" ]; then
+  check_load=true
+elif [ "$1" == "--logs" ]; then
+  check_load=false
+else
+  echo "Usage: $0 [--load|--logs]"
+  exit 1
+fi
+
+cd "$ROOT_DIR"
 
 for workload_file in "$ROOT_DIR"/workloads/*; do
   file_name=$(basename -- $workload_file)
   workload_name="${file_name%.*}"
-  echo "======================================================================"
-  echo "WORKLOAD: $workload_name"
-  echo "======================================================================"
+  if $check_load; then
+    echo -n "$workload_name: "
+    ray --logging-level=WARNING exec config.yaml --cluster-name="$workload_name" uptime 2>/dev/null || echo ""
+  else
+    echo "======================================================================"
+    echo "WORKLOAD: $workload_name"
+    echo "======================================================================"
 
-  ray exec config.yaml --cluster-name="$workload_name" "tmux capture-pane -p"
-  echo ""
-  echo "ssh to this machine with:"
-  echo " ray attach $ROOT_DIR/config.yaml --cluster-name=$workload_name"
-  echo ""
-  echo ""
+    ray exec config.yaml --cluster-name="$workload_name" "tmux capture-pane -p"
+    echo ""
+    echo "ssh to this machine with:"
+    echo " ray attach $ROOT_DIR/config.yaml --cluster-name=$workload_name"
+    echo ""
+    echo ""
+  fi
 done
-
-popd
diff --git a/ci/long_running_tests/config.yaml b/ci/long_running_tests/config.yaml
index 80505f606..76254385f 100644
--- a/ci/long_running_tests/config.yaml
+++ b/ci/long_running_tests/config.yaml
@@ -42,6 +42,7 @@ setup_commands:
     - wget https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh || true
     - bash Anaconda3-5.0.1-Linux-x86_64.sh -b -p $HOME/anaconda3 || true
     - echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.bashrc
+    - echo 'termcapinfo xterm* ti@:te@' >> ~/.screenrc
     # Some Python dependencies.
     - pip install boto3==1.4.8 cython==0.29.0
     # # Uncomment the following if you wish to install Ray instead.
@@ -50,7 +51,9 @@ setup_commands:
     # - git clone https://github.com/ray-project/ray || true
     # - cd ray/python; git checkout master; git pull; pip install -e . --verbose
     # Install nightly Ray wheels.
-    - pip install https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp36-cp36m-manylinux1_x86_64.whl
+    - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp36-cp36m-manylinux1_x86_64.whl
+    - pip install ray[rllib] ray[debug] tensorflow
+    - pip install -U dask # fix error importing lz4
 
 # Custom commands that will be run on the head node after common setup.
 head_setup_commands: []
diff --git a/ci/long_running_tests/shut_down_workloads.sh b/ci/long_running_tests/shut_down_workloads.sh
index 44abe6824..a4573e1cf 100755
--- a/ci/long_running_tests/shut_down_workloads.sh
+++ b/ci/long_running_tests/shut_down_workloads.sh
@@ -12,10 +12,7 @@ for workload_file in "$ROOT_DIR"/workloads/*; do
   workload_name="${file_name%.*}"
   ray down -y config.yaml --cluster-name="$workload_name" &
 done
-
 # Wait for all of the ray down commands to finish.
-for pid in `jobs -p`; do
-  wait $pid
-done
+wait
 
 popd
diff --git a/ci/long_running_tests/start_workloads.sh b/ci/long_running_tests/start_workloads.sh
index 26b2bbca4..f68a774a2 100755
--- a/ci/long_running_tests/start_workloads.sh
+++ b/ci/long_running_tests/start_workloads.sh
@@ -12,23 +12,24 @@ for workload_file in "$ROOT_DIR"/workloads/*; do
   workload_name="${file_name%.*}"
   ray up -y config.yaml --cluster-name="$workload_name" &
 done
-
 # Wait for all of the nodes to be up.
-for pid in `jobs -p`; do
-  wait $pid
-done
+wait
 
 # Start the workloads running.
 for workload_file in "$ROOT_DIR"/workloads/*; do
   file_name=$(basename -- $workload_file)
   workload_name="${file_name%.*}"
-  # Copy the workload to the cluster.
-  ray rsync_up config.yaml --cluster-name="$workload_name" "$workload_file" "$file_name"
-  # Clean up previous runs if relevant.
-  ray exec config.yaml --cluster-name="$workload_name" "ray stop; rm -r /tmp/ray; tmux kill-server | true"
-  # Start the workload.
-  ray exec config.yaml --cluster-name="$workload_name" "python $file_name" --tmux
+  (
+    # Copy the workload to the cluster.
+    ray rsync_up config.yaml --cluster-name="$workload_name" "$workload_file" "$file_name"
+    # Clean up previous runs if relevant.
+    ray exec config.yaml --cluster-name="$workload_name" "ray stop; rm -r /tmp/ray; tmux kill-server | true"
+    # Start the workload.
+    ray exec config.yaml --cluster-name="$workload_name" "python $file_name" --tmux
+  ) &
 done
+# Wait for child processes to finish.
+wait
 
 popd
 
diff --git a/ci/long_running_tests/workloads/workload_apex.py b/ci/long_running_tests/workloads/workload_apex.py
new file mode 100644
index 000000000..2b177ab6f
--- /dev/null
+++ b/ci/long_running_tests/workloads/workload_apex.py
@@ -0,0 +1,52 @@
+# This workload tests running APEX
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ray
+from ray.tests.cluster_utils import Cluster
+from ray.tune import run_experiments
+
+num_redis_shards = 5
+redis_max_memory = 10**8
+object_store_memory = 10**9
+num_nodes = 3
+
+message = ("Make sure there is enough memory on this machine to run this "
+           "workload. We divide the system memory by 2 to provide a buffer.")
+assert (num_nodes * object_store_memory + num_redis_shards * redis_max_memory <
+        ray.utils.get_system_memory() / 2), message
+
+# Simulate a cluster on one machine.
+
+cluster = Cluster()
+for i in range(num_nodes):
+    cluster.add_node(
+        redis_port=6379 if i == 0 else None,
+        num_redis_shards=num_redis_shards if i == 0 else None,
+        num_cpus=20,
+        num_gpus=0,
+        resources={str(i): 2},
+        object_store_memory=object_store_memory,
+        redis_max_memory=redis_max_memory)
+ray.init(redis_address=cluster.redis_address)
+
+# Run the workload.
+
+run_experiments({
+    "apex": {
+        "run": "APEX",
+        "env": "Pong-v0",
+        "config": {
+            "num_workers": 8,
+            "num_gpus": 0,
+            "buffer_size": 10000,
+            "learning_starts": 0,
+            "sample_batch_size": 1,
+            "train_batch_size": 1,
+            "min_iter_time_s": 10,
+            "timesteps_per_iteration": 10,
+        },
+    }
+})
diff --git a/ci/long_running_tests/workloads/workload_impala.py b/ci/long_running_tests/workloads/workload_impala.py
new file mode 100644
index 000000000..dcc927010
--- /dev/null
+++ b/ci/long_running_tests/workloads/workload_impala.py
@@ -0,0 +1,50 @@
+# This workload tests running IMPALA with remote envs
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ray
+from ray.tune import run_experiments
+from ray.tests.cluster_utils import Cluster
+
+num_redis_shards = 5
+redis_max_memory = 10**8
+object_store_memory = 10**8
+num_nodes = 1
+
+message = ("Make sure there is enough memory on this machine to run this "
+           "workload. We divide the system memory by 2 to provide a buffer.")
+assert (num_nodes * object_store_memory + num_redis_shards * redis_max_memory <
+        ray.utils.get_system_memory() / 2), message
+
+# Simulate a cluster on one machine.
+
+cluster = Cluster()
+for i in range(num_nodes):
+    cluster.add_node(
+        redis_port=6379 if i == 0 else None,
+        num_redis_shards=num_redis_shards if i == 0 else None,
+        num_cpus=10,
+        num_gpus=0,
+        resources={str(i): 2},
+        object_store_memory=object_store_memory,
+        redis_max_memory=redis_max_memory)
+ray.init(redis_address=cluster.redis_address)
+
+# Run the workload.
+
+run_experiments({
+    "impala": {
+        "run": "IMPALA",
+        "env": "CartPole-v0",
+        "config": {
+            "num_workers": 8,
+            "num_gpus": 0,
+            "num_envs_per_worker": 5,
+            "remote_worker_envs": True,
+            "sample_batch_size": 50,
+            "train_batch_size": 100,
+        },
+    },
+})
diff --git a/ci/long_running_tests/workloads/workload_pbt.py b/ci/long_running_tests/workloads/workload_pbt.py
new file mode 100644
index 000000000..86473d86e
--- /dev/null
+++ b/ci/long_running_tests/workloads/workload_pbt.py
@@ -0,0 +1,58 @@
+# This workload tests running PBT
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ray
+from ray.tune import run_experiments
+from ray.tune.schedulers import PopulationBasedTraining
+from ray.tests.cluster_utils import Cluster
+
+num_redis_shards = 5
+redis_max_memory = 10**8
+object_store_memory = 10**8
+num_nodes = 3
+
+message = ("Make sure there is enough memory on this machine to run this "
+           "workload. We divide the system memory by 2 to provide a buffer.")
+assert (num_nodes * object_store_memory + num_redis_shards * redis_max_memory <
+        ray.utils.get_system_memory() / 2), message
+
+# Simulate a cluster on one machine.
+
+cluster = Cluster()
+for i in range(num_nodes):
+    cluster.add_node(
+        redis_port=6379 if i == 0 else None,
+        num_redis_shards=num_redis_shards if i == 0 else None,
+        num_cpus=10,
+        num_gpus=0,
+        resources={str(i): 2},
+        object_store_memory=object_store_memory,
+        redis_max_memory=redis_max_memory)
+ray.init(redis_address=cluster.redis_address)
+
+# Run the workload.
+
+pbt = PopulationBasedTraining(
+    time_attr="training_iteration",
+    reward_attr="episode_reward_mean",
+    perturbation_interval=10,
+    hyperparam_mutations={
+        "lr": [0.1, 0.01, 0.001, 0.0001],
+    })
+
+run_experiments(
+    {
+        "pbt_test": {
+            "run": "PG",
+            "env": "CartPole-v0",
+            "num_samples": 8,
+            "config": {
+                "lr": 0.01,
+            },
+        }
+    },
+    scheduler=pbt,
+    verbose=False)
diff --git a/doc/source/rllib-examples.rst b/doc/source/rllib-examples.rst
index 02092ac6c..45397bec0 100644
--- a/doc/source/rllib-examples.rst
+++ b/doc/source/rllib-examples.rst
@@ -65,5 +65,7 @@ Community Examples
    Example of training robotic control policies in SageMaker with RLlib.
 - `StarCraft2 `__:
    Example of training in StarCraft2 maps with RLlib / multi-agent.
+- `NeuroCuts `__:
+   Example of building packet classification trees using RLlib / multi-agent in a bandit-like setting.
 - `Sequential Social Dilemma Games `__:
    Example of using the multi-agent API to model several `social dilemma games `__.
diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py
index d6354b562..e27479b07 100644
--- a/python/ray/scripts/scripts.py
+++ b/python/ray/scripts/scripts.py
@@ -685,7 +685,7 @@ export IFS="
 # Call sudo to prompt for password before anything has been printed.
 sudo true
 workers=$(
-    ps aux | grep ' ray_' | grep -v grep
+    ps aux | grep -E ' ray_|default_worker.py' | grep -v grep
 )
 for worker in $workers; do
     echo "Stack dump for $worker";