[xray] Add error table and push error messages to driver through node manager. (#2256)

* Fix documentation indentation.

* Add error table to GCS and push error messages through node manager.

* Add type to error data.

* Linting

* Fix failure_test bug.

* Linting.

* Enable one more test.

* Attempt to fix doc building.

* Restructuring

* Fixes

* More fixes.

* Move current_time_ms function into util.h.
This commit is contained in:
Robert Nishihara
2018-06-20 21:29:28 -07:00
committed by Philipp Moritz
parent 6bf48f47bc
commit ff2217251f
27 changed files with 610 additions and 204 deletions
+91 -21
View File
@@ -22,6 +22,7 @@ import pyarrow
import pyarrow.plasma as plasma
import ray.cloudpickle as pickle
import ray.experimental.state as state
import ray.gcs_utils
import ray.remote_function
import ray.serialization as serialization
import ray.services as services
@@ -31,9 +32,6 @@ import ray.plasma
import ray.ray_constants as ray_constants
from ray.utils import random_string, binary_to_hex, is_cython
# Import flatbuffer bindings.
from ray.core.generated.ClientTableData import ClientTableData
SCRIPT_MODE = 0
WORKER_MODE = 1
PYTHON_MODE = 2
@@ -415,7 +413,7 @@ class Worker(object):
"may be a bug.")
if not warning_sent:
ray.utils.push_error_to_driver(
self.redis_client,
self,
ray_constants.WAIT_FOR_CLASS_PUSH_ERROR,
warning_message,
driver_id=self.task_driver_id.id())
@@ -663,7 +661,7 @@ class Worker(object):
"large array or other object.".format(
function_name, len(pickled_function)))
ray.utils.push_error_to_driver(
self.redis_client,
self,
ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR,
warning_message,
driver_id=self.task_driver_id.id())
@@ -726,7 +724,7 @@ class Worker(object):
.format(function.__name__,
len(pickled_function)))
ray.utils.push_error_to_driver(
self.redis_client,
self,
ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR,
warning_message,
driver_id=self.task_driver_id.id())
@@ -781,7 +779,7 @@ class Worker(object):
"Ray.")
if not warning_sent:
ray.utils.push_error_to_driver(
self.redis_client,
self,
ray_constants.WAIT_FOR_FUNCTION_PUSH_ERROR,
warning_message,
driver_id=driver_id)
@@ -942,7 +940,7 @@ class Worker(object):
self._store_outputs_in_objstore(return_object_ids, failure_objects)
# Log the error message.
ray.utils.push_error_to_driver(
self.redis_client,
self,
ray_constants.TASK_PUSH_ERROR,
str(failure_object),
driver_id=self.task_driver_id.id(),
@@ -1200,6 +1198,11 @@ def error_info(worker=global_worker):
"""Return information about failed tasks."""
worker.check_connected()
check_main_thread()
if worker.use_raylet:
return (global_state.error_messages(job_id=worker.task_driver_id) +
global_state.error_messages(job_id=ray_constants.NIL_JOB_ID))
error_keys = worker.redis_client.lrange("ErrorKeys", 0, -1)
errors = []
for error_key in error_keys:
@@ -1291,9 +1294,8 @@ def get_address_info_from_redis_helper(redis_address,
if not use_raylet:
# The client table prefix must be kept in sync with the file
# "src/common/redis_module/ray_redis_module.cc" where it is defined.
REDIS_CLIENT_TABLE_PREFIX = "CL:"
client_keys = redis_client.keys(
"{}*".format(REDIS_CLIENT_TABLE_PREFIX))
client_keys = redis_client.keys("{}*".format(
ray.gcs_utils.DB_CLIENT_PREFIX))
# Filter to live clients on the same node and do some basic checking.
plasma_managers = []
local_schedulers = []
@@ -1350,11 +1352,11 @@ def get_address_info_from_redis_helper(redis_address,
else:
# In the raylet code path, all client data is stored in a zset at the
# key for the nil client.
client_key = b"CLIENT:" + NIL_CLIENT_ID
client_key = b"CLIENT" + NIL_CLIENT_ID
clients = redis_client.zrange(client_key, 0, -1)
raylets = []
for client_message in clients:
client = ClientTableData.GetRootAsClientTableData(
client = ray.gcs_utils.ClientTableData.GetRootAsClientTableData(
client_message, 0)
client_node_ip_address = client.NodeManagerAddress().decode(
"ascii")
@@ -1819,6 +1821,71 @@ def custom_excepthook(type, value, tb):
sys.excepthook = custom_excepthook
def print_error_messages_raylet(worker):
"""Print error messages in the background on the driver.
This runs in a separate thread on the driver and prints error messages in
the background.
"""
if not worker.use_raylet:
raise Exception("This function is specific to the raylet code path.")
worker.error_message_pubsub_client = worker.redis_client.pubsub(
ignore_subscribe_messages=True)
# Exports that are published after the call to
# error_message_pubsub_client.subscribe and before the call to
# error_message_pubsub_client.listen will still be processed in the loop.
# Really we should just subscribe to the errors for this specific job.
# However, currently all errors seem to be published on the same channel.
error_pubsub_channel = str(
ray.gcs_utils.TablePubsub.ERROR_INFO).encode("ascii")
worker.error_message_pubsub_client.subscribe(error_pubsub_channel)
# worker.error_message_pubsub_client.psubscribe("*")
# Keep a set of all the error messages that we've seen so far in order to
# avoid printing the same error message repeatedly. This is especially
# important when running a script inside of a tool like screen where
# scrolling is difficult.
old_error_messages = set()
# Get the exports that occurred before the call to subscribe.
with worker.lock:
error_messages = global_state.error_messages(worker.task_driver_id)
for error_message in error_messages:
if error_message not in old_error_messages:
print(error_message)
old_error_messages.add(error_message)
else:
print("Suppressing duplicate error message.")
try:
for msg in worker.error_message_pubsub_client.listen():
gcs_entry = state.GcsTableEntry.GetRootAsGcsTableEntry(
msg["data"], 0)
assert gcs_entry.EntriesLength() == 1
error_data = state.ErrorTableData.GetRootAsErrorTableData(
gcs_entry.Entries(0), 0)
NIL_JOB_ID = 20 * b"\x00"
job_id = error_data.JobId()
if job_id not in [worker.task_driver_id.id(), NIL_JOB_ID]:
continue
error_message = error_data.ErrorMessage().decode("ascii")
if error_message not in old_error_messages:
print(error_message)
old_error_messages.add(error_message)
else:
print("Suppressing duplicate error message.")
except redis.ConnectionError:
# When Redis terminates the listen call will throw a ConnectionError,
# which we catch here.
pass
def print_error_messages(worker):
"""Print error messages in the background on the driver.
@@ -1907,7 +1974,7 @@ def fetch_and_register_remote_function(key, worker=global_worker):
traceback_str = ray.utils.format_error_message(traceback.format_exc())
# Log the error message.
ray.utils.push_error_to_driver(
worker.redis_client,
worker,
ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR,
traceback_str,
driver_id=driver_id,
@@ -1952,7 +2019,7 @@ def fetch_and_execute_function_to_run(key, worker=global_worker):
name = function.__name__ if ("function" in locals()
and hasattr(function, "__name__")) else ""
ray.utils.push_error_to_driver(
worker.redis_client,
worker,
ray_constants.FUNCTION_TO_RUN_PUSH_ERROR,
traceback_str,
driver_id=driver_id,
@@ -2111,8 +2178,9 @@ def connect(info,
raise e
elif mode == WORKER_MODE:
traceback_str = traceback.format_exc()
ray.utils.push_error_to_driver(
ray.utils.push_error_to_driver_through_redis(
worker.redis_client,
worker.use_raylet,
ray_constants.VERSION_MISMATCH_PUSH_ERROR,
traceback_str,
driver_id=None)
@@ -2237,13 +2305,11 @@ def connect(info,
driver_task.execution_dependencies_string(), 0,
ray.local_scheduler.task_to_string(driver_task))
else:
TablePubsub_RAYLET_TASK = 2
# TODO(rkn): When we shard the GCS in xray, we will need to change
# this to use _execute_command.
global_state.redis_client.execute_command(
"RAY.TABLE_ADD", state.TablePrefix_RAYLET_TASK,
TablePubsub_RAYLET_TASK,
"RAY.TABLE_ADD", ray.gcs_utils.TablePrefix.RAYLET_TASK,
ray.gcs_utils.TablePubsub.RAYLET_TASK,
driver_task.task_id().id(),
driver_task._serialized_raylet_task())
@@ -2271,7 +2337,11 @@ def connect(info,
# temporarily using this implementation which constantly queries the
# scheduler for new error messages.
if mode == SCRIPT_MODE:
t = threading.Thread(target=print_error_messages, args=(worker, ))
if not worker.use_raylet:
t = threading.Thread(target=print_error_messages, args=(worker, ))
else:
t = threading.Thread(
target=print_error_messages_raylet, args=(worker, ))
# Making the thread a daemon causes it to exit when the main thread
# exits.
t.daemon = True