mirror of
https://github.com/wassname/ray.git
synced 2026-07-03 11:45:17 +08:00
[xray] Add error table and push error messages to driver through node manager. (#2256)
* Fix documentation indentation. * Add error table to GCS and push error messages through node manager. * Add type to error data. * Linting * Fix failure_test bug. * Linting. * Enable one more test. * Attempt to fix doc building. * Restructuring * Fixes * More fixes. * Move current_time_ms function into util.h.
This commit is contained in:
committed by
Philipp Moritz
parent
6bf48f47bc
commit
ff2217251f
+91
-21
@@ -22,6 +22,7 @@ import pyarrow
|
||||
import pyarrow.plasma as plasma
|
||||
import ray.cloudpickle as pickle
|
||||
import ray.experimental.state as state
|
||||
import ray.gcs_utils
|
||||
import ray.remote_function
|
||||
import ray.serialization as serialization
|
||||
import ray.services as services
|
||||
@@ -31,9 +32,6 @@ import ray.plasma
|
||||
import ray.ray_constants as ray_constants
|
||||
from ray.utils import random_string, binary_to_hex, is_cython
|
||||
|
||||
# Import flatbuffer bindings.
|
||||
from ray.core.generated.ClientTableData import ClientTableData
|
||||
|
||||
SCRIPT_MODE = 0
|
||||
WORKER_MODE = 1
|
||||
PYTHON_MODE = 2
|
||||
@@ -415,7 +413,7 @@ class Worker(object):
|
||||
"may be a bug.")
|
||||
if not warning_sent:
|
||||
ray.utils.push_error_to_driver(
|
||||
self.redis_client,
|
||||
self,
|
||||
ray_constants.WAIT_FOR_CLASS_PUSH_ERROR,
|
||||
warning_message,
|
||||
driver_id=self.task_driver_id.id())
|
||||
@@ -663,7 +661,7 @@ class Worker(object):
|
||||
"large array or other object.".format(
|
||||
function_name, len(pickled_function)))
|
||||
ray.utils.push_error_to_driver(
|
||||
self.redis_client,
|
||||
self,
|
||||
ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR,
|
||||
warning_message,
|
||||
driver_id=self.task_driver_id.id())
|
||||
@@ -726,7 +724,7 @@ class Worker(object):
|
||||
.format(function.__name__,
|
||||
len(pickled_function)))
|
||||
ray.utils.push_error_to_driver(
|
||||
self.redis_client,
|
||||
self,
|
||||
ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR,
|
||||
warning_message,
|
||||
driver_id=self.task_driver_id.id())
|
||||
@@ -781,7 +779,7 @@ class Worker(object):
|
||||
"Ray.")
|
||||
if not warning_sent:
|
||||
ray.utils.push_error_to_driver(
|
||||
self.redis_client,
|
||||
self,
|
||||
ray_constants.WAIT_FOR_FUNCTION_PUSH_ERROR,
|
||||
warning_message,
|
||||
driver_id=driver_id)
|
||||
@@ -942,7 +940,7 @@ class Worker(object):
|
||||
self._store_outputs_in_objstore(return_object_ids, failure_objects)
|
||||
# Log the error message.
|
||||
ray.utils.push_error_to_driver(
|
||||
self.redis_client,
|
||||
self,
|
||||
ray_constants.TASK_PUSH_ERROR,
|
||||
str(failure_object),
|
||||
driver_id=self.task_driver_id.id(),
|
||||
@@ -1200,6 +1198,11 @@ def error_info(worker=global_worker):
|
||||
"""Return information about failed tasks."""
|
||||
worker.check_connected()
|
||||
check_main_thread()
|
||||
|
||||
if worker.use_raylet:
|
||||
return (global_state.error_messages(job_id=worker.task_driver_id) +
|
||||
global_state.error_messages(job_id=ray_constants.NIL_JOB_ID))
|
||||
|
||||
error_keys = worker.redis_client.lrange("ErrorKeys", 0, -1)
|
||||
errors = []
|
||||
for error_key in error_keys:
|
||||
@@ -1291,9 +1294,8 @@ def get_address_info_from_redis_helper(redis_address,
|
||||
if not use_raylet:
|
||||
# The client table prefix must be kept in sync with the file
|
||||
# "src/common/redis_module/ray_redis_module.cc" where it is defined.
|
||||
REDIS_CLIENT_TABLE_PREFIX = "CL:"
|
||||
client_keys = redis_client.keys(
|
||||
"{}*".format(REDIS_CLIENT_TABLE_PREFIX))
|
||||
client_keys = redis_client.keys("{}*".format(
|
||||
ray.gcs_utils.DB_CLIENT_PREFIX))
|
||||
# Filter to live clients on the same node and do some basic checking.
|
||||
plasma_managers = []
|
||||
local_schedulers = []
|
||||
@@ -1350,11 +1352,11 @@ def get_address_info_from_redis_helper(redis_address,
|
||||
else:
|
||||
# In the raylet code path, all client data is stored in a zset at the
|
||||
# key for the nil client.
|
||||
client_key = b"CLIENT:" + NIL_CLIENT_ID
|
||||
client_key = b"CLIENT" + NIL_CLIENT_ID
|
||||
clients = redis_client.zrange(client_key, 0, -1)
|
||||
raylets = []
|
||||
for client_message in clients:
|
||||
client = ClientTableData.GetRootAsClientTableData(
|
||||
client = ray.gcs_utils.ClientTableData.GetRootAsClientTableData(
|
||||
client_message, 0)
|
||||
client_node_ip_address = client.NodeManagerAddress().decode(
|
||||
"ascii")
|
||||
@@ -1819,6 +1821,71 @@ def custom_excepthook(type, value, tb):
|
||||
sys.excepthook = custom_excepthook
|
||||
|
||||
|
||||
def print_error_messages_raylet(worker):
    """Print error messages in the background on the driver.

    This runs in a separate thread on the driver and prints error messages in
    the background. It first prints the errors already recorded for this
    driver, then subscribes to the error pubsub channel and prints new errors
    as they are published. Each distinct message is printed only once.

    Args:
        worker: The worker object for the driver. Its Redis client is used to
            create the pubsub client, which is stored on the worker as
            error_message_pubsub_client.

    Raises:
        Exception: If the worker is not using the raylet code path.
    """
    if not worker.use_raylet:
        raise Exception("This function is specific to the raylet code path.")

    worker.error_message_pubsub_client = worker.redis_client.pubsub(
        ignore_subscribe_messages=True)
    # Errors that are published after the call to
    # error_message_pubsub_client.subscribe and before the call to
    # error_message_pubsub_client.listen will still be processed in the loop.

    # Really we should just subscribe to the errors for this specific job.
    # However, currently all errors seem to be published on the same channel.
    error_pubsub_channel = str(
        ray.gcs_utils.TablePubsub.ERROR_INFO).encode("ascii")
    worker.error_message_pubsub_client.subscribe(error_pubsub_channel)

    # Keep a set of all the error messages that we've seen so far in order to
    # avoid printing the same error message repeatedly. This is especially
    # important when running a script inside of a tool like screen where
    # scrolling is difficult.
    old_error_messages = set()

    def print_if_new(error_message):
        """Print error_message unless an identical one was already shown."""
        if error_message not in old_error_messages:
            print(error_message)
            old_error_messages.add(error_message)
        else:
            print("Suppressing duplicate error message.")

    # The all-zero job ID. Errors published under it are shown in addition to
    # the errors for this driver (presumably these are errors that are not
    # tied to a specific job -- confirm against the code that pushes them).
    NIL_JOB_ID = 20 * b"\x00"

    # Get the errors that occurred before the call to subscribe.
    with worker.lock:
        error_messages = global_state.error_messages(worker.task_driver_id)
        for error_message in error_messages:
            print_if_new(error_message)

    try:
        for msg in worker.error_message_pubsub_client.listen():
            gcs_entry = state.GcsTableEntry.GetRootAsGcsTableEntry(
                msg["data"], 0)
            assert gcs_entry.EntriesLength() == 1
            error_data = state.ErrorTableData.GetRootAsErrorTableData(
                gcs_entry.Entries(0), 0)

            # All errors arrive on the same channel, so skip the ones that
            # belong to other jobs.
            job_id = error_data.JobId()
            if job_id not in (worker.task_driver_id.id(), NIL_JOB_ID):
                continue

            # Decode as UTF-8 (a superset of ASCII) and replace undecodable
            # bytes so that a message containing non-ASCII characters cannot
            # raise UnicodeDecodeError and kill this background thread.
            error_message = error_data.ErrorMessage().decode(
                "utf-8", "replace")
            print_if_new(error_message)

    except redis.ConnectionError:
        # When Redis terminates the listen call will throw a ConnectionError,
        # which we catch here.
        pass
|
||||
|
||||
|
||||
def print_error_messages(worker):
|
||||
"""Print error messages in the background on the driver.
|
||||
|
||||
@@ -1907,7 +1974,7 @@ def fetch_and_register_remote_function(key, worker=global_worker):
|
||||
traceback_str = ray.utils.format_error_message(traceback.format_exc())
|
||||
# Log the error message.
|
||||
ray.utils.push_error_to_driver(
|
||||
worker.redis_client,
|
||||
worker,
|
||||
ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR,
|
||||
traceback_str,
|
||||
driver_id=driver_id,
|
||||
@@ -1952,7 +2019,7 @@ def fetch_and_execute_function_to_run(key, worker=global_worker):
|
||||
name = function.__name__ if ("function" in locals()
|
||||
and hasattr(function, "__name__")) else ""
|
||||
ray.utils.push_error_to_driver(
|
||||
worker.redis_client,
|
||||
worker,
|
||||
ray_constants.FUNCTION_TO_RUN_PUSH_ERROR,
|
||||
traceback_str,
|
||||
driver_id=driver_id,
|
||||
@@ -2111,8 +2178,9 @@ def connect(info,
|
||||
raise e
|
||||
elif mode == WORKER_MODE:
|
||||
traceback_str = traceback.format_exc()
|
||||
ray.utils.push_error_to_driver(
|
||||
ray.utils.push_error_to_driver_through_redis(
|
||||
worker.redis_client,
|
||||
worker.use_raylet,
|
||||
ray_constants.VERSION_MISMATCH_PUSH_ERROR,
|
||||
traceback_str,
|
||||
driver_id=None)
|
||||
@@ -2237,13 +2305,11 @@ def connect(info,
|
||||
driver_task.execution_dependencies_string(), 0,
|
||||
ray.local_scheduler.task_to_string(driver_task))
|
||||
else:
|
||||
TablePubsub_RAYLET_TASK = 2
|
||||
|
||||
# TODO(rkn): When we shard the GCS in xray, we will need to change
|
||||
# this to use _execute_command.
|
||||
global_state.redis_client.execute_command(
|
||||
"RAY.TABLE_ADD", state.TablePrefix_RAYLET_TASK,
|
||||
TablePubsub_RAYLET_TASK,
|
||||
"RAY.TABLE_ADD", ray.gcs_utils.TablePrefix.RAYLET_TASK,
|
||||
ray.gcs_utils.TablePubsub.RAYLET_TASK,
|
||||
driver_task.task_id().id(),
|
||||
driver_task._serialized_raylet_task())
|
||||
|
||||
@@ -2271,7 +2337,11 @@ def connect(info,
|
||||
# temporarily using this implementation which constantly queries the
|
||||
# scheduler for new error messages.
|
||||
if mode == SCRIPT_MODE:
|
||||
t = threading.Thread(target=print_error_messages, args=(worker, ))
|
||||
if not worker.use_raylet:
|
||||
t = threading.Thread(target=print_error_messages, args=(worker, ))
|
||||
else:
|
||||
t = threading.Thread(
|
||||
target=print_error_messages_raylet, args=(worker, ))
|
||||
# Making the thread a daemon causes it to exit when the main thread
|
||||
# exits.
|
||||
t.daemon = True
|
||||
|
||||
Reference in New Issue
Block a user