From 29e33629058ae715e6aba14040489d2e8944d8ad Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 7 Nov 2018 14:08:16 -0800 Subject: [PATCH] Better errors on process deaths (#3252) --- .travis/install-dependencies.sh | 4 ++-- python/ray/worker.py | 7 ++++++- python/setup.py | 1 + .../object_store_notification_manager.cc | 4 +++- src/ray/raylet/local_scheduler_client.cc | 12 +++++++++--- 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/.travis/install-dependencies.sh b/.travis/install-dependencies.sh index e1db5ce97..d3662c157 100755 --- a/.travis/install-dependencies.sh +++ b/.travis/install-dependencies.sh @@ -25,7 +25,7 @@ if [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "linux" ]]; then bash miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \ - feather-format lxml openpyxl xlrd py-spy setproctitle + feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then sudo apt-get update sudo apt-get install -y cmake pkg-config python-dev python-numpy build-essential autoconf curl libtool unzip @@ -51,7 +51,7 @@ elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then bash miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \ - feather-format lxml openpyxl xlrd py-spy setproctitle + feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then # check that brew is installed which -s brew diff --git a/python/ray/worker.py b/python/ray/worker.py index fe65e290a..28fa47058 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -5,6 +5,7 @@ from __future__ import print_function from contextlib import contextmanager import atexit import colorama +import faulthandler import hashlib import inspect import logging @@ -404,7 +405,8 @@ class Worker(object): invalid_error = RayTaskError( "", None, "Invalid return value: likely worker died or was killed " - "while executing the task.") + "while executing the task; check previous logs or dmesg " + "for errors.") return [invalid_error] * len(object_ids) except pyarrow.DeserializationCallbackError: # Wait a little bit for the import thread to import the class. @@ -1850,6 +1852,9 @@ def connect(info, assert not worker.connected, error_message assert worker.cached_functions_to_run is not None, error_message + # Enable nice stack traces on SIGSEGV etc. + faulthandler.enable(all_threads=False) + # Initialize some fields. if mode is WORKER_MODE: worker.worker_id = random_string() diff --git a/python/setup.py b/python/setup.py index 1a5719e2d..7fb366a40 100644 --- a/python/setup.py +++ b/python/setup.py @@ -151,6 +151,7 @@ setup( "pytest", "pyyaml", "redis", + "faulthandler;python_version<'3'", "setproctitle", # The six module is required by pyarrow. "six >= 1.0.0", diff --git a/src/ray/object_manager/object_store_notification_manager.cc b/src/ray/object_manager/object_store_notification_manager.cc index f497a7fc6..361dba49a 100644 --- a/src/ray/object_manager/object_store_notification_manager.cc +++ b/src/ray/object_manager/object_store_notification_manager.cc @@ -47,7 +47,9 @@ void ObjectStoreNotificationManager::ProcessStoreLength( void ObjectStoreNotificationManager::ProcessStoreNotification( const boost::system::error_code &error) { if (error.value() != boost::system::errc::success) { - RAY_LOG(FATAL) << boost_to_ray_status(error).ToString(); + RAY_LOG(FATAL) + << "Problem communicating with the object store from raylet, check logs or " + << "dmesg for previous errors: " << boost_to_ray_status(error).ToString(); } const auto &object_info = diff --git a/src/ray/raylet/local_scheduler_client.cc b/src/ray/raylet/local_scheduler_client.cc index 591fedb88..1f6c59300 100644 --- a/src/ray/raylet/local_scheduler_client.cc +++ b/src/ray/raylet/local_scheduler_client.cc @@ -258,7 +258,10 @@ ray::raylet::TaskSpecification *local_scheduler_get_task_raylet( RAY_LOG(DEBUG) << "Exiting because local scheduler closed connection."; exit(1); } - RAY_CHECK(type == static_cast(MessageType::ExecuteTask)); + if (type != static_cast(MessageType::ExecuteTask)) { + RAY_LOG(FATAL) << "Problem communicating with raylet from worker: check logs or " + "dmesg for previous errors."; + } // Parse the flatbuffer object. auto reply_message = flatbuffers::GetRoot(reply); @@ -338,8 +341,11 @@ std::pair, std::vector> local_scheduler_wait( // Read result. read_message(conn->conn, &type, &reply_size, &reply); } - RAY_CHECK(static_cast(type) == - ray::protocol::MessageType::WaitReply); + if (static_cast(type) != + ray::protocol::MessageType::WaitReply) { + RAY_LOG(FATAL) << "Problem communicating with raylet from worker: check logs or " + "dmesg for previous errors."; + } auto reply_message = flatbuffers::GetRoot(reply); // Convert result. std::pair, std::vector> result;