[xray] Add error table and push error messages to driver through node manager. (#2256)

* Fix documentation indentation.

* Add error table to GCS and push error messages through node manager.

* Add type to error data.

* Linting

* Fix failure_test bug.

* Linting.

* Enable one more test.

* Attempt to fix doc building.

* Restructuring

* Fixes

* More fixes.

* Move current_time_ms function into util.h.
This commit is contained in:
Robert Nishihara
2018-06-20 21:29:28 -07:00
committed by Philipp Moritz
parent 6bf48f47bc
commit ff2217251f
27 changed files with 610 additions and 204 deletions
+104 -59
View File
@@ -12,41 +12,10 @@ import sys
import time
import ray
import ray.gcs_utils
from ray.utils import (decode, binary_to_object_id, binary_to_hex,
hex_to_binary)
# Import flatbuffer bindings.
from ray.core.generated.TaskReply import TaskReply
from ray.core.generated.ResultTableReply import ResultTableReply
from ray.core.generated.TaskExecutionDependencies import \
TaskExecutionDependencies
from ray.core.generated.ClientTableData import ClientTableData
from ray.core.generated.GcsTableEntry import GcsTableEntry
from ray.core.generated.ObjectTableData import ObjectTableData
from ray.core.generated.ray.protocol.Task import Task
# These prefixes must be kept up-to-date with the definitions in
# ray_redis_module.cc.
DB_CLIENT_PREFIX = "CL:"
OBJECT_INFO_PREFIX = "OI:"
OBJECT_LOCATION_PREFIX = "OL:"
OBJECT_SUBSCRIBE_PREFIX = "OS:"
TASK_PREFIX = "TT:"
FUNCTION_PREFIX = "RemoteFunction:"
OBJECT_CHANNEL_PREFIX = "OC:"
# These prefixes must be kept up-to-date with the TablePrefix enum in gcs.fbs.
# TODO(rkn): We should use scoped enums, in which case we should be able to
# just access the flatbuffer generated values.
TablePrefix_RAYLET_TASK = 2
TablePrefix_RAYLET_TASK_string = "TASK"
TablePrefix_CLIENT = 3
TablePrefix_CLIENT_string = "CLIENT"
TablePrefix_OBJECT = 4
TablePrefix_OBJECT_string = "OBJECT"
# This mapping from integer to task state string must be kept up-to-date with
# the scheduling_state enum in task.h.
TASK_STATUS_WAITING = 1
@@ -231,8 +200,9 @@ class GlobalState(object):
result_table_response = self._execute_command(
object_id, "RAY.RESULT_TABLE_LOOKUP", object_id.id())
result_table_message = ResultTableReply.GetRootAsResultTableReply(
result_table_response, 0)
result_table_message = (
ray.gcs_utils.ResultTableReply.GetRootAsResultTableReply(
result_table_response, 0))
result = {
"ManagerIDs": manager_ids,
@@ -245,12 +215,14 @@ class GlobalState(object):
else:
# Use the raylet code path.
message = self.redis_client.execute_command(
"RAY.TABLE_LOOKUP", TablePrefix_OBJECT, "", object_id.id())
"RAY.TABLE_LOOKUP", ray.gcs_utils.TablePrefix.OBJECT, "",
object_id.id())
result = []
gcs_entry = GcsTableEntry.GetRootAsGcsTableEntry(message, 0)
gcs_entry = ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
message, 0)
for i in range(gcs_entry.EntriesLength()):
entry = ObjectTableData.GetRootAsObjectTableData(
entry = ray.gcs_utils.ObjectTableData.GetRootAsObjectTableData(
gcs_entry.Entries(i), 0)
object_info = {
"DataSize": entry.ObjectSize(),
@@ -279,19 +251,22 @@ class GlobalState(object):
else:
# Return the entire object table.
if not self.use_raylet:
object_info_keys = self._keys(OBJECT_INFO_PREFIX + "*")
object_location_keys = self._keys(OBJECT_LOCATION_PREFIX + "*")
object_info_keys = self._keys(
ray.gcs_utils.OBJECT_INFO_PREFIX + "*")
object_location_keys = self._keys(
ray.gcs_utils.OBJECT_LOCATION_PREFIX + "*")
object_ids_binary = set([
key[len(OBJECT_INFO_PREFIX):] for key in object_info_keys
key[len(ray.gcs_utils.OBJECT_INFO_PREFIX):]
for key in object_info_keys
] + [
key[len(OBJECT_LOCATION_PREFIX):]
key[len(ray.gcs_utils.OBJECT_LOCATION_PREFIX):]
for key in object_location_keys
])
else:
object_keys = self.redis_client.keys(
TablePrefix_OBJECT_string + ":*")
ray.gcs_utils.TablePrefix_OBJECT_string + "*")
object_ids_binary = {
key[len(TablePrefix_OBJECT_string + ":"):]
key[len(ray.gcs_utils.TablePrefix_OBJECT_string):]
for key in object_keys
}
@@ -320,7 +295,7 @@ class GlobalState(object):
if task_table_response is None:
raise Exception("There is no entry for task ID {} in the task "
"table.".format(binary_to_hex(task_id.id())))
task_table_message = TaskReply.GetRootAsTaskReply(
task_table_message = ray.gcs_utils.TaskReply.GetRootAsTaskReply(
task_table_response, 0)
task_spec = task_table_message.TaskSpec()
task_spec = ray.local_scheduler.task_from_string(task_spec)
@@ -343,7 +318,8 @@ class GlobalState(object):
}
execution_dependencies_message = (
TaskExecutionDependencies.GetRootAsTaskExecutionDependencies(
ray.gcs_utils.TaskExecutionDependencies.
GetRootAsTaskExecutionDependencies(
task_table_message.ExecutionDependencies(), 0))
execution_dependencies = [
ray.ObjectID(
@@ -371,15 +347,17 @@ class GlobalState(object):
else:
# Use the raylet code path.
message = self.redis_client.execute_command(
"RAY.TABLE_LOOKUP", TablePrefix_RAYLET_TASK, "", task_id.id())
gcs_entries = GcsTableEntry.GetRootAsGcsTableEntry(message, 0)
"RAY.TABLE_LOOKUP", ray.gcs_utils.TablePrefix.RAYLET_TASK, "",
task_id.id())
gcs_entries = ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
message, 0)
info = []
for i in range(gcs_entries.EntriesLength()):
task_table_message = Task.GetRootAsTask(
task_table_message = ray.gcs_utils.Task.GetRootAsTask(
gcs_entries.Entries(i), 0)
task_table_message = Task.GetRootAsTask(
task_table_message = ray.gcs_utils.Task.GetRootAsTask(
gcs_entries.Entries(0), 0)
execution_spec = task_table_message.TaskExecutionSpec()
task_spec = task_table_message.TaskSpecification()
@@ -432,15 +410,16 @@ class GlobalState(object):
return self._task_table(task_id)
else:
if not self.use_raylet:
task_table_keys = self._keys(TASK_PREFIX + "*")
task_table_keys = self._keys(ray.gcs_utils.TASK_PREFIX + "*")
task_ids_binary = [
key[len(TASK_PREFIX):] for key in task_table_keys
key[len(ray.gcs_utils.TASK_PREFIX):]
for key in task_table_keys
]
else:
task_table_keys = self.redis_client.keys(
TablePrefix_RAYLET_TASK_string + ":*")
ray.gcs_utils.TablePrefix_RAYLET_TASK_string + "*")
task_ids_binary = [
key[len(TablePrefix_RAYLET_TASK_string + ":"):]
key[len(ray.gcs_utils.TablePrefix_RAYLET_TASK_string):]
for key in task_table_keys
]
@@ -458,7 +437,8 @@ class GlobalState(object):
function.
"""
self._check_connected()
function_table_keys = self.redis_client.keys(FUNCTION_PREFIX + "*")
function_table_keys = self.redis_client.keys(
ray.gcs_utils.FUNCTION_PREFIX + "*")
results = {}
for key in function_table_keys:
info = self.redis_client.hgetall(key)
@@ -478,7 +458,8 @@ class GlobalState(object):
"""
self._check_connected()
if not self.use_raylet:
db_client_keys = self.redis_client.keys(DB_CLIENT_PREFIX + "*")
db_client_keys = self.redis_client.keys(
ray.gcs_utils.DB_CLIENT_PREFIX + "*")
node_info = {}
for key in db_client_keys:
client_info = self.redis_client.hgetall(key)
@@ -520,13 +501,16 @@ class GlobalState(object):
# This is the raylet code path.
NIL_CLIENT_ID = 20 * b"\xff"
message = self.redis_client.execute_command(
"RAY.TABLE_LOOKUP", TablePrefix_CLIENT, "", NIL_CLIENT_ID)
"RAY.TABLE_LOOKUP", ray.gcs_utils.TablePrefix.CLIENT, "",
NIL_CLIENT_ID)
node_info = []
gcs_entry = GcsTableEntry.GetRootAsGcsTableEntry(message, 0)
gcs_entry = ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
message, 0)
for i in range(gcs_entry.EntriesLength()):
client = ClientTableData.GetRootAsClientTableData(
gcs_entry.Entries(i), 0)
client = (
ray.gcs_utils.ClientTableData.GetRootAsClientTableData(
gcs_entry.Entries(i), 0))
resources = {
client.ResourcesTotalLabel(i).decode("ascii"):
@@ -1146,3 +1130,64 @@ class GlobalState(object):
resources[key] += value
return dict(resources)
def _error_messages(self, job_id):
"""Get the error messages for a specific job.
Args:
job_id: The ID of the job to get the errors for.
Returns:
A list of the error messages for this job.
"""
message = self.redis_client.execute_command(
"RAY.TABLE_LOOKUP", ray.gcs_utils.TablePrefix.ERROR_INFO, "",
job_id.id())
# If there are no errors, return early.
if message is None:
return []
gcs_entries = ray.gcs_utils.GcsTableEntry.GetRootAsGcsTableEntry(
message, 0)
error_messages = []
for i in range(gcs_entries.EntriesLength()):
error_data = ray.gcs_utils.ErrorTableData.GetRootAsErrorTableData(
gcs_entries.Entries(i), 0)
error_message = {
"type": error_data.Type().decode("ascii"),
"message": error_data.ErrorMessage().decode("ascii"),
"timestamp": error_data.Timestamp(),
}
error_messages.append(error_message)
return error_messages
def error_messages(self, job_id=None):
"""Get the error messages for all jobs or a specific job.
Args:
job_id: The specific job to get the errors for. If this is None,
then this method retrieves the errors for all jobs.
Returns:
A dictionary mapping job ID to a list of the error messages for
that job.
"""
if not self.use_raylet:
raise Exception("The error_messages method is only supported in "
"the raylet code path.")
if job_id is not None:
return self._error_messages(job_id)
error_table_keys = self.redis_client.keys(
ray.gcs_utils.TablePrefix_ERROR_INFO_string + "*")
job_ids = [
key[len(ray.gcs_utils.TablePrefix_ERROR_INFO_string):]
for key in error_table_keys
]
return {
binary_to_hex(job_id): self._error_messages(ray.ObjectID(job_id))
for job_id in job_ids
}