Add ray stack tool for debugging (#3213)

This commit is contained in:
Eric Liang
2018-11-03 13:13:02 -07:00
committed by Robert Nishihara
parent ca7d4c2cf5
commit 9a0f0db070
5 changed files with 75 additions and 4 deletions
+4 -4
View File
@@ -25,7 +25,7 @@ if [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "linux" ]]; then
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \
feather-format lxml openpyxl xlrd
feather-format lxml openpyxl xlrd py-spy
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
sudo apt-get update
sudo apt-get install -y cmake pkg-config python-dev python-numpy build-essential autoconf curl libtool unzip
@@ -34,7 +34,7 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \
feather-format lxml openpyxl xlrd
feather-format lxml openpyxl xlrd py-spy
elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
# check that brew is installed
which -s brew
@@ -51,7 +51,7 @@ elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \
feather-format lxml openpyxl xlrd
feather-format lxml openpyxl xlrd py-spy
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
# check that brew is installed
which -s brew
@@ -68,7 +68,7 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \
feather-format lxml openpyxl xlrd
feather-format lxml openpyxl xlrd py-spy
elif [[ "$LINT" == "1" ]]; then
sudo apt-get update
sudo apt-get install -y cmake build-essential autoconf curl libtool unzip
+3
View File
@@ -99,3 +99,6 @@ If you encounter errors like
``blas_thread_init: pthread_create: Resource temporarily unavailable`` when using many workers,
try setting ``OMP_NUM_THREADS=1``. Similarly, check configured system limits with
``ulimit -a`` for other resource limit errors.
For debugging unexpected hangs or performance problems, you can run ``ray stack`` to dump
the stack traces of all Ray workers on the current node. This requires py-spy to be installed
(``pip install py-spy``), and the command may prompt for your sudo password.
+5
View File
@@ -92,6 +92,11 @@ of the following reasons.
Hanging
-------
.. tip::

    You can run ``ray stack`` to dump the stack traces of all Ray workers on
    the current node. This requires py-spy to be installed
    (``pip install py-spy``), and the command may prompt for your sudo password.

If a workload is hanging and not progressing, the problem may be one of the
following.
+27
View File
@@ -577,6 +577,32 @@ def get_head_ip(cluster_config_file, cluster_name):
click.echo(get_head_node_ip(cluster_config_file, cluster_name))
@cli.command()
def stack():
    """Dump the stack traces of all Ray workers on the current node.

    Requires py-spy to be installed ('pip install py-spy'). The command uses
    sudo and may prompt for a password.
    """
    # The script locates py-spy, lists the processes whose command line
    # mentions default_worker.py (excluding the grep itself and the raylet),
    # and runs `py-spy --dump` against each PID.
    COMMAND = """
pyspy=`which py-spy`
if [ ! -e "$pyspy" ]; then
echo "ERROR: Please 'pip install py-spy' first"
exit 1
fi
# Set IFS to iterate over lines instead of over words.
export IFS="
"
# Call sudo to prompt for password before anything has been printed.
sudo true
workers=$(
ps aux | grep default_worker.py | grep -v grep | grep -v raylet/raylet
)
for worker in $workers; do
echo "Stack dump for $worker";
pid=`echo $worker | awk '{print $2}'`;
sudo $pyspy --pid $pid --dump;
echo;
done
"""
    exit_code = subprocess.call(COMMAND, shell=True)
    if exit_code != 0:
        # Propagate the script's failure (e.g. py-spy is not installed) so
        # callers and scripts do not see a successful exit status.
        raise SystemExit(exit_code)
# Register the subcommands on `cli`.
cli.add_command(start)
cli.add_command(stop)
cli.add_command(create_or_update)
@@ -588,6 +614,7 @@ cli.add_command(rsync_up)
# `teardown` is registered twice: once without an explicit name and once
# under the name "down".
cli.add_command(teardown)
cli.add_command(teardown, name="down")
cli.add_command(get_head_ip)
# NOTE(review): `stack` is also decorated with @cli.command() where it is
# defined, so this explicit registration looks redundant -- confirm.
cli.add_command(stack)
def main():
+36
View File
@@ -5,6 +5,7 @@ from __future__ import print_function
import os
import re
import string
import subprocess
import sys
import threading
import time
@@ -2296,3 +2297,38 @@ def test_wait_reconstruction(shutdown_only):
ray.pyarrow.plasma.ObjectID(x_id.id()))
ready_ids, _ = ray.wait([x_id])
assert len(ready_ids) == 1
@pytest.mark.skipif(
    os.getenv("TRAVIS") is None,
    reason="This test should only be run on Travis.")
def test_ray_stack(shutdown_only):
    """Check that 'ray stack' output names the functions currently running.

    Two remote tasks are launched that sleep for a long time; one of them
    sleeps inside a plain (non-remote) helper. The test then polls
    'ray stack' for up to 30 seconds until all three function names show up
    in a single dump.
    """
    ray.init(num_cpus=2)

    def unique_name_1():
        time.sleep(1000)

    @ray.remote
    def unique_name_2():
        time.sleep(1000)

    @ray.remote
    def unique_name_3():
        unique_name_1()

    unique_name_2.remote()
    unique_name_3.remote()

    # Function names that must all appear in one stack dump.
    expected_names = ("unique_name_1", "unique_name_2", "unique_name_3")

    began = time.time()
    while time.time() - began < 30:
        # Attempt to parse the "ray stack" call.
        dump = ray.utils.decode(subprocess.check_output(["ray", "stack"]))
        if all(name in dump for name in expected_names):
            break
    else:
        # The loop ran out of time without ever seeing all three names.
        raise Exception("Failed to find necessary information with "
                        "'ray stack'")