mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 21:23:10 +08:00
GCS server use worker table to handle RegisterWorker instead of redis accessor (#9168)
This commit is contained in:
@@ -17,6 +17,7 @@ from ray.core.generated.gcs_pb2 import (
|
||||
ResourceTableData,
|
||||
ObjectLocationInfo,
|
||||
PubSubMessage,
|
||||
WorkerTableData,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
@@ -39,6 +40,7 @@ __all__ = [
|
||||
"construct_error_message",
|
||||
"ObjectLocationInfo",
|
||||
"PubSubMessage",
|
||||
"WorkerTableData",
|
||||
]
|
||||
|
||||
FUNCTION_PREFIX = "RemoteFunction:"
|
||||
@@ -69,6 +71,9 @@ TablePrefix_PROFILE_string = "PROFILE"
|
||||
TablePrefix_JOB_string = "JOB"
|
||||
TablePrefix_ACTOR_string = "ACTOR"
|
||||
|
||||
WORKER = 0
|
||||
DRIVER = 1
|
||||
|
||||
|
||||
def construct_error_message(job_id, error_type, message, timestamp):
|
||||
"""Construct a serialized ErrorTableData object.
|
||||
|
||||
@@ -6,6 +6,7 @@ from ray.includes.unique_ids cimport (
|
||||
CActorID,
|
||||
CClientID,
|
||||
CObjectID,
|
||||
CWorkerID,
|
||||
)
|
||||
|
||||
cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
|
||||
@@ -23,3 +24,6 @@ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
|
||||
c_vector[c_string] GetAllActorInfo()
|
||||
unique_ptr[c_string] GetActorInfo(const CActorID &actor_id)
|
||||
c_string GetNodeResourceInfo(const CClientID &node_id)
|
||||
unique_ptr[c_string] GetWorkerInfo(const CWorkerID &worker_id)
|
||||
c_vector[c_string] GetAllWorkerInfo()
|
||||
c_bool AddWorkerInfo(const c_string &serialized_string)
|
||||
|
||||
@@ -2,6 +2,7 @@ from ray.includes.unique_ids cimport (
|
||||
CActorID,
|
||||
CClientID,
|
||||
CObjectID,
|
||||
CWorkerID,
|
||||
)
|
||||
|
||||
from ray.includes.global_state_accessor cimport (
|
||||
@@ -57,3 +58,15 @@ cdef class GlobalStateAccessor:
|
||||
|
||||
def get_node_resource_info(self, node_id):
    """Fetch the resource info of one node from the wrapped accessor.

    Args:
        node_id: Object exposing ``binary()``; its bytes are converted
            with ``CClientID.FromBinary`` before the lookup.

    Returns:
        The value produced by the accessor's ``GetNodeResourceInfo``.
    """
    return self.inner.get().GetNodeResourceInfo(
        CClientID.FromBinary(node_id.binary()))
|
||||
|
||||
def get_worker_table(self):
    """Return every worker entry reported by the wrapped accessor.

    Returns:
        The result of the accessor's ``GetAllWorkerInfo`` call.
    """
    all_worker_info = self.inner.get().GetAllWorkerInfo()
    return all_worker_info
|
||||
|
||||
def get_worker_info(self, worker_id):
    """Look up the info of a single worker by its ID.

    Args:
        worker_id: Object exposing ``binary()``; its bytes are converted
            with ``CWorkerID.FromBinary`` before the lookup.

    Returns:
        The string held by the pointer that ``GetWorkerInfo`` returns,
        or None when that pointer is empty.
    """
    info_ptr = self.inner.get().GetWorkerInfo(
        CWorkerID.FromBinary(worker_id.binary()))
    if info_ptr:
        # Build the result from the pointed-to data and its explicit size.
        return c_string(info_ptr.get().data(), info_ptr.get().size())
    return None
|
||||
|
||||
def add_worker_info(self, serialized_string):
    """Hand a serialized worker entry to the wrapped accessor.

    Args:
        serialized_string: The serialized worker data to add.

    Returns:
        The boolean result of the accessor's ``AddWorkerInfo`` call.
    """
    added = self.inner.get().AddWorkerInfo(serialized_string)
    return added
|
||||
|
||||
+42
-16
@@ -602,26 +602,52 @@ class GlobalState:
|
||||
"""Get a dictionary mapping worker ID to worker information."""
|
||||
self._check_connected()
|
||||
|
||||
worker_keys = self.redis_client.keys("Worker*")
|
||||
# Get all data in worker table
|
||||
worker_table = self.global_state_accessor.get_worker_table()
|
||||
workers_data = {}
|
||||
for i in range(len(worker_table)):
|
||||
worker_table_data = gcs_utils.WorkerTableData.FromString(
|
||||
worker_table[i])
|
||||
if worker_table_data.is_alive and \
|
||||
worker_table_data.worker_type == gcs_utils.WORKER:
|
||||
worker_id = binary_to_hex(
|
||||
worker_table_data.worker_address.worker_id)
|
||||
worker_info = worker_table_data.worker_info
|
||||
|
||||
for worker_key in worker_keys:
|
||||
worker_info = self.redis_client.hgetall(worker_key)
|
||||
worker_id = binary_to_hex(worker_key[len("Workers:"):])
|
||||
|
||||
workers_data[worker_id] = {
|
||||
"node_ip_address": decode(worker_info[b"node_ip_address"]),
|
||||
"plasma_store_socket": decode(
|
||||
worker_info[b"plasma_store_socket"])
|
||||
}
|
||||
if b"stderr_file" in worker_info:
|
||||
workers_data[worker_id]["stderr_file"] = decode(
|
||||
worker_info[b"stderr_file"])
|
||||
if b"stdout_file" in worker_info:
|
||||
workers_data[worker_id]["stdout_file"] = decode(
|
||||
worker_info[b"stdout_file"])
|
||||
workers_data[worker_id] = {
|
||||
"node_ip_address": decode(worker_info[b"node_ip_address"]),
|
||||
"plasma_store_socket": decode(
|
||||
worker_info[b"plasma_store_socket"])
|
||||
}
|
||||
if b"stderr_file" in worker_info:
|
||||
workers_data[worker_id]["stderr_file"] = decode(
|
||||
worker_info[b"stderr_file"])
|
||||
if b"stdout_file" in worker_info:
|
||||
workers_data[worker_id]["stdout_file"] = decode(
|
||||
worker_info[b"stdout_file"])
|
||||
return workers_data
|
||||
|
||||
def add_worker(self, worker_id, worker_type, worker_info):
    """Register a worker entry through the global state accessor.

    Args:
        worker_id: ID of the worker, as bytes.
        worker_type: Either ray.gcs_utils.DRIVER or ray.gcs_utils.WORKER.
        worker_info: Extra details about the worker, as a dict mapping
            str keys to str values.

    Returns:
        The result of the accessor's add_worker_info call, i.e. whether
        the operation succeeded.
    """
    entry = ray.gcs_utils.WorkerTableData()
    entry.is_alive = True
    entry.worker_address.worker_id = worker_id
    entry.worker_type = worker_type
    # Each info value is stored as UTF-8 encoded bytes.
    for key, value in worker_info.items():
        entry.worker_info[key] = bytes(value, encoding="utf-8")

    serialized_entry = entry.SerializeToString()
    return self.global_state_accessor.add_worker_info(serialized_entry)
|
||||
|
||||
def _job_length(self):
|
||||
event_log_sets = self.redis_client.keys("event_log*")
|
||||
overall_smallest = sys.maxsize
|
||||
|
||||
@@ -872,15 +872,14 @@ normal_excepthook = sys.excepthook
|
||||
|
||||
|
||||
def custom_excepthook(type, value, tb):
|
||||
# If this is a driver, push the exception to redis.
|
||||
# If this is a driver, push the exception to GCS worker table.
|
||||
if global_worker.mode == SCRIPT_MODE:
|
||||
error_message = "".join(traceback.format_tb(tb))
|
||||
try:
|
||||
global_worker.redis_client.hmset(
|
||||
b"Drivers:" + global_worker.worker_id,
|
||||
{"exception": error_message})
|
||||
except (ConnectionRefusedError, redis.exceptions.ConnectionError):
|
||||
logger.warning("Could not push exception to redis.")
|
||||
worker_id = global_worker.worker_id
|
||||
worker_type = ray.gcs_utils.DRIVER
|
||||
worker_info = {"exception": error_message}
|
||||
|
||||
ray.state.state.add_worker(worker_id, worker_type, worker_info)
|
||||
# Call the normal excepthook.
|
||||
normal_excepthook(type, value, tb)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user