From 9a0f0db070ece317df02ea65809315ebd6522ce3 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 3 Nov 2018 13:13:02 -0700 Subject: [PATCH] Add `ray stack` tool for debugging (#3213) --- .travis/install-dependencies.sh | 8 ++++---- doc/source/rllib.rst | 3 +++ doc/source/troubleshooting.rst | 5 +++++ python/ray/scripts/scripts.py | 27 +++++++++++++++++++++++++ test/runtest.py | 36 +++++++++++++++++++++++++++++++++ 5 files changed, 75 insertions(+), 4 deletions(-) diff --git a/.travis/install-dependencies.sh b/.travis/install-dependencies.sh index 1c6c3a342..31fbf18c8 100755 --- a/.travis/install-dependencies.sh +++ b/.travis/install-dependencies.sh @@ -25,7 +25,7 @@ if [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "linux" ]]; then bash miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \ - feather-format lxml openpyxl xlrd + feather-format lxml openpyxl xlrd py-spy elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then sudo apt-get update sudo apt-get install -y cmake pkg-config python-dev python-numpy build-essential autoconf curl libtool unzip @@ -34,7 +34,7 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then bash miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \ - feather-format lxml openpyxl xlrd + feather-format lxml openpyxl xlrd py-spy elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then # check that brew is installed which -s brew @@ -51,7 +51,7 @@ elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then bash miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \ - feather-format lxml openpyxl xlrd + feather-format lxml openpyxl xlrd py-spy elif [[ 
"$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then # check that brew is installed which -s brew @@ -68,7 +68,7 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then bash miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \ - feather-format lxml openpyxl xlrd + feather-format lxml openpyxl xlrd py-spy elif [[ "$LINT" == "1" ]]; then sudo apt-get update sudo apt-get install -y cmake build-essential autoconf curl libtool unzip diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 8979e702a..8caa21d9a 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -99,3 +99,6 @@ If you encounter errors like `blas_thread_init: pthread_create: Resource temporarily unavailable` when using many workers, try setting ``OMP_NUM_THREADS=1``. Similarly, check configured system limits with `ulimit -a` for other resource limit errors. + +For debugging unexpected hangs or performance problems, you can run ``ray stack`` to dump +the stack traces of all Ray workers on the current node. This requires py-spy to be installed. diff --git a/doc/source/troubleshooting.rst b/doc/source/troubleshooting.rst index 46c20edc8..86f56e775 100644 --- a/doc/source/troubleshooting.rst +++ b/doc/source/troubleshooting.rst @@ -92,6 +92,11 @@ of the following reasons. Hanging ------- +.. tip:: + + You can run ``ray stack`` to dump the stack traces of all Ray workers on + the current node. This requires py-spy to be installed. + If a workload is hanging and not progressing, the problem may be one of the following. 
# --- python/ray/scripts/scripts.py ---


@cli.command()
def stack():
    """Dump the stack traces of all Ray workers on the current node.

    Worker processes are found by matching ``default_worker.py`` in the
    output of ``ps aux``; each one is then dumped with py-spy under sudo.
    py-spy must be installed (``pip install py-spy``).

    Raises:
        SystemExit: with the shell script's exit status when it is non-zero
            (for example when py-spy cannot be found), so that callers and
            CI see the failure instead of a silent success.
    """
    command = """
pyspy=`which py-spy`
if [ ! -e "$pyspy" ]; then
    echo "ERROR: Please 'pip install py-spy' first"
    exit 1
fi
# Set IFS to iterate over lines instead of over words.
export IFS="
"
# Call sudo to prompt for password before anything has been printed.
sudo true
workers=$(
    ps aux | grep default_worker.py | grep -v grep | grep -v raylet/raylet
)
for worker in $workers; do
    echo "Stack dump for $worker";
    pid=`echo $worker | awk '{print $2}'`;
    sudo $pyspy --pid $pid --dump;
    echo;
done
    """
    # The script signals "py-spy missing" via `exit 1`; forward that status
    # rather than discarding it, otherwise `ray stack` always reports success.
    exit_code = subprocess.call(command, shell=True)
    if exit_code != 0:
        raise SystemExit(exit_code)


cli.add_command(stack)


# --- test/runtest.py ---


@pytest.mark.skipif(
    os.getenv("TRAVIS") is None,
    reason="This test should only be run on Travis.")
def test_ray_stack(shutdown_only):
    """Check that ``ray stack`` shows the frames of running remote tasks.

    Two long-sleeping remote tasks are launched (one of them sleeping inside
    a plain nested helper), then ``ray stack`` is polled for up to 30 seconds
    until all three function names appear in its output.
    """
    ray.init(num_cpus=2)

    def unique_name_1():
        time.sleep(1000)

    @ray.remote
    def unique_name_2():
        time.sleep(1000)

    @ray.remote
    def unique_name_3():
        # Calls the plain helper so that it shows up as a nested frame.
        unique_name_1()

    unique_name_2.remote()
    unique_name_3.remote()

    expected_names = ("unique_name_1", "unique_name_2", "unique_name_3")
    output = ""
    deadline = time.time() + 30
    while time.time() < deadline:
        # Attempt to parse the "ray stack" call.
        output = ray.utils.decode(subprocess.check_output(["ray", "stack"]))
        if all(name in output for name in expected_names):
            break
        # The tasks may not have started yet; pause briefly before retrying.
        time.sleep(0.5)
    else:
        # Include the last dump so a CI failure can be diagnosed from the log.
        raise Exception("Failed to find necessary information with "
                        "'ray stack'. Last output:\n" + output)