GCS server use worker table to handle RegisterWorker instead of redis accessor (#9168)

This commit is contained in:
ChenZhilei
2020-07-06 10:37:25 +08:00
committed by GitHub
parent dcf989292e
commit 6f3d993681
40 changed files with 653 additions and 299 deletions
+5
View File
@@ -17,6 +17,7 @@ from ray.core.generated.gcs_pb2 import (
ResourceTableData,
ObjectLocationInfo,
PubSubMessage,
WorkerTableData,
)
__all__ = [
@@ -39,6 +40,7 @@ __all__ = [
"construct_error_message",
"ObjectLocationInfo",
"PubSubMessage",
"WorkerTableData",
]
FUNCTION_PREFIX = "RemoteFunction:"
@@ -69,6 +71,9 @@ TablePrefix_PROFILE_string = "PROFILE"
TablePrefix_JOB_string = "JOB"
TablePrefix_ACTOR_string = "ACTOR"
# Worker-type values. Callers compare them against, and assign them to,
# WorkerTableData.worker_type.
# NOTE(review): presumably these mirror the worker-type enum in the GCS
# protobuf definition -- confirm the numeric values stay in sync.
WORKER = 0
DRIVER = 1
def construct_error_message(job_id, error_type, message, timestamp):
"""Construct a serialized ErrorTableData object.
@@ -6,6 +6,7 @@ from ray.includes.unique_ids cimport (
CActorID,
CClientID,
CObjectID,
CWorkerID,
)
cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
@@ -23,3 +24,6 @@ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
c_vector[c_string] GetAllActorInfo()
unique_ptr[c_string] GetActorInfo(const CActorID &actor_id)
c_string GetNodeResourceInfo(const CClientID &node_id)
unique_ptr[c_string] GetWorkerInfo(const CWorkerID &worker_id)
c_vector[c_string] GetAllWorkerInfo()
c_bool AddWorkerInfo(const c_string &serialized_string)
@@ -2,6 +2,7 @@ from ray.includes.unique_ids cimport (
CActorID,
CClientID,
CObjectID,
CWorkerID,
)
from ray.includes.global_state_accessor cimport (
@@ -57,3 +58,15 @@ cdef class GlobalStateAccessor:
def get_node_resource_info(self, node_id):
return self.inner.get().GetNodeResourceInfo(CClientID.FromBinary(node_id.binary()))
# Return every entry of the worker table as reported by the C++
# GlobalStateAccessor (GetAllWorkerInfo is declared to return a vector of
# strings). The Python caller parses each entry with
# WorkerTableData.FromString.
def get_worker_table(self):
return self.inner.get().GetAllWorkerInfo()
# Look up one worker's record through the C++ GlobalStateAccessor.
# worker_id must expose .binary(); that value is converted to a CWorkerID.
# Returns the bytes copied out of the returned pointer, or None when the
# accessor hands back an empty pointer.
# NOTE(review): the bytes are presumably a serialized WorkerTableData --
# confirm against the C++ accessor.
def get_worker_info(self, worker_id):
worker_info = self.inner.get().GetWorkerInfo(CWorkerID.FromBinary(worker_id.binary()))
if worker_info:
# Copy data()/size() explicitly into a new string before returning it.
return c_string(worker_info.get().data(), worker_info.get().size())
return None
# Hand an already-serialized worker record to the C++ GlobalStateAccessor
# and return its boolean result (AddWorkerInfo is declared as c_bool).
# The Python caller passes WorkerTableData.SerializeToString() output.
def add_worker_info(self, serialized_string):
return self.inner.get().AddWorkerInfo(serialized_string)
+42 -16
View File
@@ -602,26 +602,52 @@ class GlobalState:
"""Get a dictionary mapping worker ID to worker information."""
self._check_connected()
worker_keys = self.redis_client.keys("Worker*")
# Get all data in worker table
worker_table = self.global_state_accessor.get_worker_table()
workers_data = {}
for i in range(len(worker_table)):
worker_table_data = gcs_utils.WorkerTableData.FromString(
worker_table[i])
if worker_table_data.is_alive and \
worker_table_data.worker_type == gcs_utils.WORKER:
worker_id = binary_to_hex(
worker_table_data.worker_address.worker_id)
worker_info = worker_table_data.worker_info
for worker_key in worker_keys:
worker_info = self.redis_client.hgetall(worker_key)
worker_id = binary_to_hex(worker_key[len("Workers:"):])
workers_data[worker_id] = {
"node_ip_address": decode(worker_info[b"node_ip_address"]),
"plasma_store_socket": decode(
worker_info[b"plasma_store_socket"])
}
if b"stderr_file" in worker_info:
workers_data[worker_id]["stderr_file"] = decode(
worker_info[b"stderr_file"])
if b"stdout_file" in worker_info:
workers_data[worker_id]["stdout_file"] = decode(
worker_info[b"stdout_file"])
workers_data[worker_id] = {
"node_ip_address": decode(worker_info[b"node_ip_address"]),
"plasma_store_socket": decode(
worker_info[b"plasma_store_socket"])
}
if b"stderr_file" in worker_info:
workers_data[worker_id]["stderr_file"] = decode(
worker_info[b"stderr_file"])
if b"stdout_file" in worker_info:
workers_data[worker_id]["stdout_file"] = decode(
worker_info[b"stdout_file"])
return workers_data
def add_worker(self, worker_id, worker_type, worker_info):
    """Publish a worker record through the global state accessor.

    Args:
        worker_id: ID of the worker, as bytes.
        worker_type: Either ray.gcs_utils.DRIVER or ray.gcs_utils.WORKER.
        worker_info: Extra details about the worker, as a dict mapping
            str to str. Each value is stored UTF-8 encoded.

    Returns:
        The result reported by the accessor's add_worker_info call.
    """
    record = ray.gcs_utils.WorkerTableData()
    record.worker_type = worker_type
    record.worker_address.worker_id = worker_id
    # The record is always written as alive.
    record.is_alive = True
    for key, value in worker_info.items():
        record.worker_info[key] = bytes(value, encoding="utf-8")
    serialized = record.SerializeToString()
    return self.global_state_accessor.add_worker_info(serialized)
def _job_length(self):
event_log_sets = self.redis_client.keys("event_log*")
overall_smallest = sys.maxsize
+6 -7
View File
@@ -872,15 +872,14 @@ normal_excepthook = sys.excepthook
# Exception hook: for a driver (SCRIPT_MODE) it records the formatted
# traceback, then always defers to the previously saved normal_excepthook.
# NOTE(review): this span is a diff hunk with its +/- markers stripped, so
# the removed redis path (the try/hmset/except lines) and the added GCS
# worker-table path (the worker_id ... add_worker lines) appear together.
# NOTE(review): the added path has no try/except around add_worker, unlike
# the redis path it replaces -- confirm a failure there cannot prevent
# normal_excepthook from running.
def custom_excepthook(type, value, tb):
# If this is a driver, push the exception to redis.
# If this is a driver, push the exception to GCS worker table.
if global_worker.mode == SCRIPT_MODE:
error_message = "".join(traceback.format_tb(tb))
try:
global_worker.redis_client.hmset(
b"Drivers:" + global_worker.worker_id,
{"exception": error_message})
except (ConnectionRefusedError, redis.exceptions.ConnectionError):
logger.warning("Could not push exception to redis.")
worker_id = global_worker.worker_id
worker_type = ray.gcs_utils.DRIVER
worker_info = {"exception": error_message}
ray.state.state.add_worker(worker_id, worker_type, worker_info)
# Call the normal excepthook.
normal_excepthook(type, value, tb)