mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 20:06:31 +08:00
GCS server use worker table to handle RegisterWorker instead of redis accessor (#9168)
This commit is contained in:
@@ -17,6 +17,7 @@ from ray.core.generated.gcs_pb2 import (
|
||||
ResourceTableData,
|
||||
ObjectLocationInfo,
|
||||
PubSubMessage,
|
||||
WorkerTableData,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
@@ -39,6 +40,7 @@ __all__ = [
|
||||
"construct_error_message",
|
||||
"ObjectLocationInfo",
|
||||
"PubSubMessage",
|
||||
"WorkerTableData",
|
||||
]
|
||||
|
||||
FUNCTION_PREFIX = "RemoteFunction:"
|
||||
@@ -69,6 +71,9 @@ TablePrefix_PROFILE_string = "PROFILE"
|
||||
TablePrefix_JOB_string = "JOB"
|
||||
TablePrefix_ACTOR_string = "ACTOR"
|
||||
|
||||
WORKER = 0
|
||||
DRIVER = 1
|
||||
|
||||
|
||||
def construct_error_message(job_id, error_type, message, timestamp):
|
||||
"""Construct a serialized ErrorTableData object.
|
||||
|
||||
@@ -6,6 +6,7 @@ from ray.includes.unique_ids cimport (
|
||||
CActorID,
|
||||
CClientID,
|
||||
CObjectID,
|
||||
CWorkerID,
|
||||
)
|
||||
|
||||
cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
|
||||
@@ -23,3 +24,6 @@ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
|
||||
c_vector[c_string] GetAllActorInfo()
|
||||
unique_ptr[c_string] GetActorInfo(const CActorID &actor_id)
|
||||
c_string GetNodeResourceInfo(const CClientID &node_id)
|
||||
unique_ptr[c_string] GetWorkerInfo(const CWorkerID &worker_id)
|
||||
c_vector[c_string] GetAllWorkerInfo()
|
||||
c_bool AddWorkerInfo(const c_string &serialized_string)
|
||||
|
||||
@@ -2,6 +2,7 @@ from ray.includes.unique_ids cimport (
|
||||
CActorID,
|
||||
CClientID,
|
||||
CObjectID,
|
||||
CWorkerID,
|
||||
)
|
||||
|
||||
from ray.includes.global_state_accessor cimport (
|
||||
@@ -57,3 +58,15 @@ cdef class GlobalStateAccessor:
|
||||
|
||||
def get_node_resource_info(self, node_id):
|
||||
return self.inner.get().GetNodeResourceInfo(CClientID.FromBinary(node_id.binary()))
|
||||
|
||||
def get_worker_table(self):
|
||||
return self.inner.get().GetAllWorkerInfo()
|
||||
|
||||
def get_worker_info(self, worker_id):
|
||||
worker_info = self.inner.get().GetWorkerInfo(CWorkerID.FromBinary(worker_id.binary()))
|
||||
if worker_info:
|
||||
return c_string(worker_info.get().data(), worker_info.get().size())
|
||||
return None
|
||||
|
||||
def add_worker_info(self, serialized_string):
|
||||
return self.inner.get().AddWorkerInfo(serialized_string)
|
||||
|
||||
+42
-16
@@ -602,26 +602,52 @@ class GlobalState:
|
||||
"""Get a dictionary mapping worker ID to worker information."""
|
||||
self._check_connected()
|
||||
|
||||
worker_keys = self.redis_client.keys("Worker*")
|
||||
# Get all data in worker table
|
||||
worker_table = self.global_state_accessor.get_worker_table()
|
||||
workers_data = {}
|
||||
for i in range(len(worker_table)):
|
||||
worker_table_data = gcs_utils.WorkerTableData.FromString(
|
||||
worker_table[i])
|
||||
if worker_table_data.is_alive and \
|
||||
worker_table_data.worker_type == gcs_utils.WORKER:
|
||||
worker_id = binary_to_hex(
|
||||
worker_table_data.worker_address.worker_id)
|
||||
worker_info = worker_table_data.worker_info
|
||||
|
||||
for worker_key in worker_keys:
|
||||
worker_info = self.redis_client.hgetall(worker_key)
|
||||
worker_id = binary_to_hex(worker_key[len("Workers:"):])
|
||||
|
||||
workers_data[worker_id] = {
|
||||
"node_ip_address": decode(worker_info[b"node_ip_address"]),
|
||||
"plasma_store_socket": decode(
|
||||
worker_info[b"plasma_store_socket"])
|
||||
}
|
||||
if b"stderr_file" in worker_info:
|
||||
workers_data[worker_id]["stderr_file"] = decode(
|
||||
worker_info[b"stderr_file"])
|
||||
if b"stdout_file" in worker_info:
|
||||
workers_data[worker_id]["stdout_file"] = decode(
|
||||
worker_info[b"stdout_file"])
|
||||
workers_data[worker_id] = {
|
||||
"node_ip_address": decode(worker_info[b"node_ip_address"]),
|
||||
"plasma_store_socket": decode(
|
||||
worker_info[b"plasma_store_socket"])
|
||||
}
|
||||
if b"stderr_file" in worker_info:
|
||||
workers_data[worker_id]["stderr_file"] = decode(
|
||||
worker_info[b"stderr_file"])
|
||||
if b"stdout_file" in worker_info:
|
||||
workers_data[worker_id]["stdout_file"] = decode(
|
||||
worker_info[b"stdout_file"])
|
||||
return workers_data
|
||||
|
||||
def add_worker(self, worker_id, worker_type, worker_info):
|
||||
""" Add a worker to the cluster.
|
||||
|
||||
Args:
|
||||
worker_id: ID of this worker. Type is bytes.
|
||||
worker_type: Type of this worker. Value is ray.gcs_utils.DRIVER or
|
||||
ray.gcs_utils.WORKER.
|
||||
worker_info: Info of this worker. Type is dict{str: str}.
|
||||
|
||||
Returns:
|
||||
Is operation success
|
||||
"""
|
||||
worker_data = ray.gcs_utils.WorkerTableData()
|
||||
worker_data.is_alive = True
|
||||
worker_data.worker_address.worker_id = worker_id
|
||||
worker_data.worker_type = worker_type
|
||||
for k, v in worker_info.items():
|
||||
worker_data.worker_info[k] = bytes(v, encoding="utf-8")
|
||||
return self.global_state_accessor.add_worker_info(
|
||||
worker_data.SerializeToString())
|
||||
|
||||
def _job_length(self):
|
||||
event_log_sets = self.redis_client.keys("event_log*")
|
||||
overall_smallest = sys.maxsize
|
||||
|
||||
@@ -872,15 +872,14 @@ normal_excepthook = sys.excepthook
|
||||
|
||||
|
||||
def custom_excepthook(type, value, tb):
|
||||
# If this is a driver, push the exception to redis.
|
||||
# If this is a driver, push the exception to GCS worker table.
|
||||
if global_worker.mode == SCRIPT_MODE:
|
||||
error_message = "".join(traceback.format_tb(tb))
|
||||
try:
|
||||
global_worker.redis_client.hmset(
|
||||
b"Drivers:" + global_worker.worker_id,
|
||||
{"exception": error_message})
|
||||
except (ConnectionRefusedError, redis.exceptions.ConnectionError):
|
||||
logger.warning("Could not push exception to redis.")
|
||||
worker_id = global_worker.worker_id
|
||||
worker_type = ray.gcs_utils.DRIVER
|
||||
worker_info = {"exception": error_message}
|
||||
|
||||
ray.state.state.add_worker(worker_id, worker_type, worker_info)
|
||||
# Call the normal excepthook.
|
||||
normal_excepthook(type, value, tb)
|
||||
|
||||
|
||||
@@ -485,7 +485,7 @@ int64_t ClusterResourceScheduler::IsSchedulable(const TaskRequest &task_req,
|
||||
}
|
||||
}
|
||||
|
||||
// No check custom resources.
|
||||
// Now check custom resources.
|
||||
for (const auto task_req_custom_resource : task_req.custom_resources) {
|
||||
auto it = resources.custom_resources.find(task_req_custom_resource.id);
|
||||
|
||||
|
||||
@@ -138,7 +138,7 @@ class TaskRequest {
|
||||
public:
|
||||
/// List of predefined resources required by the task.
|
||||
std::vector<ResourceRequest> predefined_resources;
|
||||
/// List of custom resources required by the tasl.
|
||||
/// List of custom resources required by the task.
|
||||
std::vector<ResourceRequestWithId> custom_resources;
|
||||
/// List of placement hints. A placement hint is a node on which
|
||||
/// we desire to run this task. This is a soft constraint in that
|
||||
|
||||
@@ -646,8 +646,12 @@ void CoreWorker::RegisterToGcs() {
|
||||
worker_info.emplace("stderr_file", options_.stderr_file);
|
||||
}
|
||||
|
||||
RAY_CHECK_OK(gcs_client_->Workers().AsyncRegisterWorker(options_.worker_type, worker_id,
|
||||
worker_info, nullptr));
|
||||
auto worker_data = std::make_shared<rpc::WorkerTableData>();
|
||||
worker_data->mutable_worker_address()->set_worker_id(worker_id.Binary());
|
||||
worker_data->set_worker_type(options_.worker_type);
|
||||
worker_data->mutable_worker_info()->insert(worker_info.begin(), worker_info.end());
|
||||
|
||||
RAY_CHECK_OK(gcs_client_->Workers().AsyncAdd(worker_data, nullptr));
|
||||
}
|
||||
void CoreWorker::CheckForRayletFailure(const boost::system::error_code &error) {
|
||||
if (error == boost::asio::error::operation_aborted) {
|
||||
|
||||
+23
-11
@@ -647,7 +647,7 @@ class WorkerInfoAccessor {
|
||||
/// \param done Callback that will be called when subscription is complete.
|
||||
/// \return Status
|
||||
virtual Status AsyncSubscribeToWorkerFailures(
|
||||
const SubscribeCallback<WorkerID, rpc::WorkerFailureData> &subscribe,
|
||||
const SubscribeCallback<WorkerID, rpc::WorkerTableData> &subscribe,
|
||||
const StatusCallback &done) = 0;
|
||||
|
||||
/// Report a worker failure to GCS asynchronously.
|
||||
@@ -656,19 +656,31 @@ class WorkerInfoAccessor {
|
||||
/// \param callback Callback that will be called when report is complate.
|
||||
/// \param Status
|
||||
virtual Status AsyncReportWorkerFailure(
|
||||
const std::shared_ptr<rpc::WorkerFailureData> &data_ptr,
|
||||
const std::shared_ptr<rpc::WorkerTableData> &data_ptr,
|
||||
const StatusCallback &callback) = 0;
|
||||
|
||||
/// Register a worker to GCS asynchronously.
|
||||
/// Get worker specification from GCS asynchronously.
|
||||
///
|
||||
/// \param worker_type The type of the worker.
|
||||
/// \param worker_id The ID of the worker.
|
||||
/// \param worker_info The information of the worker.
|
||||
/// \return Status.
|
||||
virtual Status AsyncRegisterWorker(
|
||||
rpc::WorkerType worker_type, const WorkerID &worker_id,
|
||||
const std::unordered_map<std::string, std::string> &worker_info,
|
||||
const StatusCallback &callback) = 0;
|
||||
/// \param worker_id The ID of worker to look up in the GCS.
|
||||
/// \param callback Callback that will be called after lookup finishes.
|
||||
/// \return Status
|
||||
virtual Status AsyncGet(const WorkerID &worker_id,
|
||||
const OptionalItemCallback<rpc::WorkerTableData> &callback) = 0;
|
||||
|
||||
/// Get all worker info from GCS asynchronously.
|
||||
///
|
||||
/// \param callback Callback that will be called after lookup finished.
|
||||
/// \return Status
|
||||
virtual Status AsyncGetAll(const MultiItemCallback<rpc::WorkerTableData> &callback) = 0;
|
||||
|
||||
/// Add worker information to GCS asynchronously.
|
||||
///
|
||||
/// \param data_ptr The worker that will be add to GCS.
|
||||
/// \param callback Callback that will be called after worker information has been added
|
||||
/// to GCS.
|
||||
/// \return Status
|
||||
virtual Status AsyncAdd(const std::shared_ptr<rpc::WorkerTableData> &data_ptr,
|
||||
const StatusCallback &callback) = 0;
|
||||
|
||||
/// Reestablish subscription.
|
||||
/// This should be called when GCS server restarts from a failure.
|
||||
|
||||
@@ -178,5 +178,38 @@ std::unique_ptr<std::string> GlobalStateAccessor::GetActorCheckpointId(
|
||||
return actor_checkpoint_id_data;
|
||||
}
|
||||
|
||||
std::unique_ptr<std::string> GlobalStateAccessor::GetWorkerInfo(
|
||||
const WorkerID &worker_id) {
|
||||
std::unique_ptr<std::string> worker_table_data;
|
||||
std::promise<bool> promise;
|
||||
RAY_CHECK_OK(gcs_client_->Workers().AsyncGet(
|
||||
worker_id, TransformForOptionalItemCallback<rpc::WorkerTableData>(worker_table_data,
|
||||
promise)));
|
||||
promise.get_future().get();
|
||||
return worker_table_data;
|
||||
}
|
||||
|
||||
std::vector<std::string> GlobalStateAccessor::GetAllWorkerInfo() {
|
||||
std::vector<std::string> worker_table_data;
|
||||
std::promise<bool> promise;
|
||||
RAY_CHECK_OK(gcs_client_->Workers().AsyncGetAll(
|
||||
TransformForMultiItemCallback<rpc::WorkerTableData>(worker_table_data, promise)));
|
||||
promise.get_future().get();
|
||||
return worker_table_data;
|
||||
}
|
||||
|
||||
bool GlobalStateAccessor::AddWorkerInfo(const std::string &serialized_string) {
|
||||
auto data_ptr = std::make_shared<WorkerTableData>();
|
||||
data_ptr->ParseFromString(serialized_string);
|
||||
std::promise<bool> promise;
|
||||
RAY_CHECK_OK(
|
||||
gcs_client_->Workers().AsyncAdd(data_ptr, [&promise](const Status &status) {
|
||||
RAY_CHECK_OK(status);
|
||||
promise.set_value(true);
|
||||
}));
|
||||
promise.get_future().get();
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace gcs
|
||||
} // namespace ray
|
||||
|
||||
@@ -109,6 +109,28 @@ class GlobalStateAccessor {
|
||||
/// deserialized with protobuf function.
|
||||
std::unique_ptr<std::string> GetActorCheckpointId(const ActorID &actor_id);
|
||||
|
||||
/// Get information of a worker from GCS Service.
|
||||
///
|
||||
/// \param worker_id The ID of worker to look up in the GCS Service.
|
||||
/// \return Worker info. To support multi-language, we serialize each WorkerTableData
|
||||
/// and return the serialized string. Where used, it needs to be deserialized with
|
||||
/// protobuf function.
|
||||
std::unique_ptr<std::string> GetWorkerInfo(const WorkerID &worker_id);
|
||||
|
||||
/// Get information of all workers from GCS Service.
|
||||
///
|
||||
/// \return All worker info. To support multi-language, we serialize each
|
||||
/// WorkerTableData and return the serialized string. Where used, it needs to be
|
||||
/// deserialized with protobuf function.
|
||||
std::vector<std::string> GetAllWorkerInfo();
|
||||
|
||||
/// Add information of a worker to GCS Service.
|
||||
///
|
||||
/// \param serialized_string The serialized data of worker to be added in the GCS
|
||||
/// Service, use string is convenient for python to use.
|
||||
/// \return Is operation success.
|
||||
bool AddWorkerInfo(const std::string &serialized_string);
|
||||
|
||||
private:
|
||||
/// MultiItem transformation helper in template style.
|
||||
///
|
||||
|
||||
@@ -1252,17 +1252,16 @@ ServiceBasedWorkerInfoAccessor::ServiceBasedWorkerInfoAccessor(
|
||||
: client_impl_(client_impl) {}
|
||||
|
||||
Status ServiceBasedWorkerInfoAccessor::AsyncSubscribeToWorkerFailures(
|
||||
const SubscribeCallback<WorkerID, rpc::WorkerFailureData> &subscribe,
|
||||
const SubscribeCallback<WorkerID, rpc::WorkerTableData> &subscribe,
|
||||
const StatusCallback &done) {
|
||||
RAY_CHECK(subscribe != nullptr);
|
||||
subscribe_operation_ = [this, subscribe](const StatusCallback &done) {
|
||||
auto on_subscribe = [subscribe](const std::string &id, const std::string &data) {
|
||||
rpc::WorkerFailureData worker_failure_data;
|
||||
rpc::WorkerTableData worker_failure_data;
|
||||
worker_failure_data.ParseFromString(data);
|
||||
subscribe(WorkerID::FromBinary(id), worker_failure_data);
|
||||
};
|
||||
return client_impl_->GetGcsPubSub().SubscribeAll(WORKER_FAILURE_CHANNEL, on_subscribe,
|
||||
done);
|
||||
return client_impl_->GetGcsPubSub().SubscribeAll(WORKER_CHANNEL, on_subscribe, done);
|
||||
};
|
||||
return subscribe_operation_(done);
|
||||
}
|
||||
@@ -1276,7 +1275,7 @@ void ServiceBasedWorkerInfoAccessor::AsyncResubscribe(bool is_pubsub_server_rest
|
||||
}
|
||||
|
||||
Status ServiceBasedWorkerInfoAccessor::AsyncReportWorkerFailure(
|
||||
const std::shared_ptr<rpc::WorkerFailureData> &data_ptr,
|
||||
const std::shared_ptr<rpc::WorkerTableData> &data_ptr,
|
||||
const StatusCallback &callback) {
|
||||
rpc::Address worker_address = data_ptr->worker_address();
|
||||
RAY_LOG(DEBUG) << "Reporting worker failure, " << worker_address.DebugString();
|
||||
@@ -1294,22 +1293,48 @@ Status ServiceBasedWorkerInfoAccessor::AsyncReportWorkerFailure(
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ServiceBasedWorkerInfoAccessor::AsyncRegisterWorker(
|
||||
rpc::WorkerType worker_type, const WorkerID &worker_id,
|
||||
const std::unordered_map<std::string, std::string> &worker_info,
|
||||
const StatusCallback &callback) {
|
||||
RAY_LOG(DEBUG) << "Registering the worker. worker id = " << worker_id;
|
||||
rpc::RegisterWorkerRequest request;
|
||||
request.set_worker_type(worker_type);
|
||||
Status ServiceBasedWorkerInfoAccessor::AsyncGet(
|
||||
const WorkerID &worker_id,
|
||||
const OptionalItemCallback<rpc::WorkerTableData> &callback) {
|
||||
RAY_LOG(DEBUG) << "Getting worker info, worker id = " << worker_id;
|
||||
rpc::GetWorkerInfoRequest request;
|
||||
request.set_worker_id(worker_id.Binary());
|
||||
request.mutable_worker_info()->insert(worker_info.begin(), worker_info.end());
|
||||
client_impl_->GetGcsRpcClient().RegisterWorker(
|
||||
client_impl_->GetGcsRpcClient().GetWorkerInfo(
|
||||
request,
|
||||
[worker_id, callback](const Status &status, const rpc::RegisterWorkerReply &reply) {
|
||||
[worker_id, callback](const Status &status, const rpc::GetWorkerInfoReply &reply) {
|
||||
if (reply.has_worker_table_data()) {
|
||||
callback(status, reply.worker_table_data());
|
||||
} else {
|
||||
callback(status, boost::none);
|
||||
}
|
||||
RAY_LOG(DEBUG) << "Finished getting worker info, worker id = " << worker_id;
|
||||
});
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ServiceBasedWorkerInfoAccessor::AsyncGetAll(
|
||||
const MultiItemCallback<rpc::WorkerTableData> &callback) {
|
||||
RAY_LOG(DEBUG) << "Getting all worker info.";
|
||||
rpc::GetAllWorkerInfoRequest request;
|
||||
client_impl_->GetGcsRpcClient().GetAllWorkerInfo(
|
||||
request, [callback](const Status &status, const rpc::GetAllWorkerInfoReply &reply) {
|
||||
auto result = VectorFromProtobuf(reply.worker_table_data());
|
||||
callback(status, result);
|
||||
RAY_LOG(DEBUG) << "Finished getting all worker info, status = " << status;
|
||||
});
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ServiceBasedWorkerInfoAccessor::AsyncAdd(
|
||||
const std::shared_ptr<rpc::WorkerTableData> &data_ptr,
|
||||
const StatusCallback &callback) {
|
||||
rpc::AddWorkerInfoRequest request;
|
||||
request.mutable_worker_data()->CopyFrom(*data_ptr);
|
||||
client_impl_->GetGcsRpcClient().AddWorkerInfo(
|
||||
request, [callback](const Status &status, const rpc::AddWorkerInfoReply &reply) {
|
||||
if (callback) {
|
||||
callback(status);
|
||||
}
|
||||
RAY_LOG(DEBUG) << "Finished registering worker. worker id = " << worker_id;
|
||||
});
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
@@ -374,16 +374,19 @@ class ServiceBasedWorkerInfoAccessor : public WorkerInfoAccessor {
|
||||
virtual ~ServiceBasedWorkerInfoAccessor() = default;
|
||||
|
||||
Status AsyncSubscribeToWorkerFailures(
|
||||
const SubscribeCallback<WorkerID, rpc::WorkerFailureData> &subscribe,
|
||||
const SubscribeCallback<WorkerID, rpc::WorkerTableData> &subscribe,
|
||||
const StatusCallback &done) override;
|
||||
|
||||
Status AsyncReportWorkerFailure(const std::shared_ptr<rpc::WorkerFailureData> &data_ptr,
|
||||
Status AsyncReportWorkerFailure(const std::shared_ptr<rpc::WorkerTableData> &data_ptr,
|
||||
const StatusCallback &callback) override;
|
||||
|
||||
Status AsyncRegisterWorker(
|
||||
rpc::WorkerType worker_type, const WorkerID &worker_id,
|
||||
const std::unordered_map<std::string, std::string> &worker_info,
|
||||
const StatusCallback &callback) override;
|
||||
Status AsyncGet(const WorkerID &worker_id,
|
||||
const OptionalItemCallback<rpc::WorkerTableData> &callback) override;
|
||||
|
||||
Status AsyncGetAll(const MultiItemCallback<rpc::WorkerTableData> &callback) override;
|
||||
|
||||
Status AsyncAdd(const std::shared_ptr<rpc::WorkerTableData> &data_ptr,
|
||||
const StatusCallback &callback) override;
|
||||
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override;
|
||||
|
||||
|
||||
@@ -226,6 +226,26 @@ TEST_F(GlobalStateAccessorTest, TestActorTable) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(GlobalStateAccessorTest, TestWorkerTable) {
|
||||
ASSERT_EQ(global_state_->GetAllWorkerInfo().size(), 0);
|
||||
// Add worker info
|
||||
auto worker_table_data = Mocker::GenWorkerTableData();
|
||||
worker_table_data->mutable_worker_address()->set_worker_id(
|
||||
WorkerID::FromRandom().Binary());
|
||||
ASSERT_TRUE(global_state_->AddWorkerInfo(worker_table_data->SerializeAsString()));
|
||||
|
||||
// Get worker info
|
||||
auto worker_id = WorkerID::FromBinary(worker_table_data->worker_address().worker_id());
|
||||
ASSERT_TRUE(global_state_->GetWorkerInfo(worker_id));
|
||||
|
||||
// Add another worker info
|
||||
auto another_worker_data = Mocker::GenWorkerTableData();
|
||||
another_worker_data->mutable_worker_address()->set_worker_id(
|
||||
WorkerID::FromRandom().Binary());
|
||||
ASSERT_TRUE(global_state_->AddWorkerInfo(another_worker_data->SerializeAsString()));
|
||||
ASSERT_EQ(global_state_->GetAllWorkerInfo().size(), 2);
|
||||
}
|
||||
|
||||
} // namespace ray
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
|
||||
@@ -455,7 +455,7 @@ class ServiceBasedGcsClientTest : public ::testing::Test {
|
||||
}
|
||||
|
||||
bool SubscribeToWorkerFailures(
|
||||
const gcs::SubscribeCallback<WorkerID, rpc::WorkerFailureData> &subscribe) {
|
||||
const gcs::SubscribeCallback<WorkerID, rpc::WorkerTableData> &subscribe) {
|
||||
std::promise<bool> promise;
|
||||
RAY_CHECK_OK(gcs_client_->Workers().AsyncSubscribeToWorkerFailures(
|
||||
subscribe, [&promise](Status status) { promise.set_value(status.ok()); }));
|
||||
@@ -463,7 +463,7 @@ class ServiceBasedGcsClientTest : public ::testing::Test {
|
||||
}
|
||||
|
||||
bool ReportWorkerFailure(
|
||||
const std::shared_ptr<rpc::WorkerFailureData> &worker_failure_data) {
|
||||
const std::shared_ptr<rpc::WorkerTableData> &worker_failure_data) {
|
||||
std::promise<bool> promise;
|
||||
RAY_CHECK_OK(gcs_client_->Workers().AsyncReportWorkerFailure(
|
||||
worker_failure_data,
|
||||
@@ -471,6 +471,13 @@ class ServiceBasedGcsClientTest : public ::testing::Test {
|
||||
return WaitReady(promise.get_future(), timeout_ms_);
|
||||
}
|
||||
|
||||
bool AddWorker(const std::shared_ptr<rpc::WorkerTableData> &worker_data) {
|
||||
std::promise<bool> promise;
|
||||
RAY_CHECK_OK(gcs_client_->Workers().AsyncAdd(
|
||||
worker_data, [&promise](Status status) { promise.set_value(status.ok()); }));
|
||||
return WaitReady(promise.get_future(), timeout_ms_);
|
||||
}
|
||||
|
||||
bool WaitReady(std::future<bool> future, const std::chrono::milliseconds &timeout_ms) {
|
||||
auto status = future.wait_for(timeout_ms);
|
||||
return status == std::future_status::ready && future.get();
|
||||
@@ -824,14 +831,22 @@ TEST_F(ServiceBasedGcsClientTest, TestWorkerInfo) {
|
||||
// Subscribe to all unexpected failure of workers from GCS.
|
||||
std::atomic<int> worker_failure_count(0);
|
||||
auto on_subscribe = [&worker_failure_count](const WorkerID &worker_id,
|
||||
const rpc::WorkerFailureData &result) {
|
||||
const rpc::WorkerTableData &result) {
|
||||
++worker_failure_count;
|
||||
};
|
||||
ASSERT_TRUE(SubscribeToWorkerFailures(on_subscribe));
|
||||
|
||||
// Report a worker failure to GCS.
|
||||
auto worker_failure_data = Mocker::GenWorkerFailureData();
|
||||
ASSERT_TRUE(ReportWorkerFailure(worker_failure_data));
|
||||
// Report a worker failure to GCS when this worker doesn't exist.
|
||||
auto worker_data = Mocker::GenWorkerTableData();
|
||||
worker_data->mutable_worker_address()->set_worker_id(WorkerID::FromRandom().Binary());
|
||||
ASSERT_TRUE(ReportWorkerFailure(worker_data));
|
||||
WaitPendingDone(worker_failure_count, 0);
|
||||
|
||||
// Add a worker to GCS.
|
||||
ASSERT_TRUE(AddWorker(worker_data));
|
||||
|
||||
// Report a worker failure to GCS when this worker is actually exist.
|
||||
ASSERT_TRUE(ReportWorkerFailure(worker_data));
|
||||
WaitPendingDone(worker_failure_count, 1);
|
||||
}
|
||||
|
||||
@@ -1065,7 +1080,7 @@ TEST_F(ServiceBasedGcsClientTest, TestWorkerTableResubscribe) {
|
||||
// Subscribe to all unexpected failure of workers from GCS.
|
||||
std::atomic<int> worker_failure_count(0);
|
||||
auto on_subscribe = [&worker_failure_count](const WorkerID &worker_id,
|
||||
const rpc::WorkerFailureData &result) {
|
||||
const rpc::WorkerTableData &result) {
|
||||
++worker_failure_count;
|
||||
};
|
||||
ASSERT_TRUE(SubscribeToWorkerFailures(on_subscribe));
|
||||
@@ -1073,9 +1088,13 @@ TEST_F(ServiceBasedGcsClientTest, TestWorkerTableResubscribe) {
|
||||
// Restart GCS
|
||||
RestartGcsServer();
|
||||
|
||||
// Add a worker before report worker failure to GCS.
|
||||
auto worker_data = Mocker::GenWorkerTableData();
|
||||
worker_data->mutable_worker_address()->set_worker_id(WorkerID::FromRandom().Binary());
|
||||
ASSERT_TRUE(AddWorker(worker_data));
|
||||
|
||||
// Report a worker failure to GCS and check if resubscribe works.
|
||||
auto worker_failure_data = Mocker::GenWorkerFailureData();
|
||||
ASSERT_TRUE(ReportWorkerFailure(worker_failure_data));
|
||||
ASSERT_TRUE(ReportWorkerFailure(worker_data));
|
||||
WaitPendingDone(worker_failure_count, 1);
|
||||
}
|
||||
|
||||
|
||||
@@ -19,11 +19,11 @@
|
||||
#include "gcs_job_manager.h"
|
||||
#include "gcs_node_manager.h"
|
||||
#include "gcs_object_manager.h"
|
||||
#include "gcs_worker_manager.h"
|
||||
#include "ray/common/network_util.h"
|
||||
#include "ray/common/ray_config.h"
|
||||
#include "stats_handler_impl.h"
|
||||
#include "task_info_handler_impl.h"
|
||||
#include "worker_info_handler_impl.h"
|
||||
|
||||
namespace ray {
|
||||
namespace gcs {
|
||||
@@ -92,9 +92,9 @@ void GcsServer::Start() {
|
||||
new rpc::ErrorInfoGrpcService(main_service_, *error_info_handler_));
|
||||
rpc_server_.RegisterService(*error_info_service_);
|
||||
|
||||
worker_info_handler_ = InitWorkerInfoHandler();
|
||||
gcs_worker_manager_ = InitGcsWorkerManager();
|
||||
worker_info_service_.reset(
|
||||
new rpc::WorkerInfoGrpcService(main_service_, *worker_info_handler_));
|
||||
new rpc::WorkerInfoGrpcService(main_service_, *gcs_worker_manager_));
|
||||
rpc_server_.RegisterService(*worker_info_service_);
|
||||
|
||||
auto load_completed_count = std::make_shared<int>(0);
|
||||
@@ -191,7 +191,7 @@ void GcsServer::InitGcsActorManager() {
|
||||
});
|
||||
|
||||
auto on_subscribe = [this](const std::string &id, const std::string &data) {
|
||||
rpc::WorkerFailureData worker_failure_data;
|
||||
rpc::WorkerTableData worker_failure_data;
|
||||
worker_failure_data.ParseFromString(data);
|
||||
auto &worker_address = worker_failure_data.worker_address();
|
||||
WorkerID worker_id = WorkerID::FromBinary(id);
|
||||
@@ -199,7 +199,7 @@ void GcsServer::InitGcsActorManager() {
|
||||
gcs_actor_manager_->OnWorkerDead(node_id, worker_id,
|
||||
worker_failure_data.intentional_disconnect());
|
||||
};
|
||||
RAY_CHECK_OK(gcs_pub_sub_->SubscribeAll(WORKER_FAILURE_CHANNEL, on_subscribe, nullptr));
|
||||
RAY_CHECK_OK(gcs_pub_sub_->SubscribeAll(WORKER_CHANNEL, on_subscribe, nullptr));
|
||||
}
|
||||
|
||||
void GcsServer::InitGcsJobManager() {
|
||||
@@ -243,9 +243,9 @@ std::unique_ptr<rpc::ErrorInfoHandler> GcsServer::InitErrorInfoHandler() {
|
||||
new rpc::DefaultErrorInfoHandler(*redis_gcs_client_));
|
||||
}
|
||||
|
||||
std::unique_ptr<rpc::WorkerInfoHandler> GcsServer::InitWorkerInfoHandler() {
|
||||
return std::unique_ptr<rpc::DefaultWorkerInfoHandler>(new rpc::DefaultWorkerInfoHandler(
|
||||
*redis_gcs_client_, gcs_table_storage_, gcs_pub_sub_));
|
||||
std::unique_ptr<GcsWorkerManager> GcsServer::InitGcsWorkerManager() {
|
||||
return std::unique_ptr<GcsWorkerManager>(
|
||||
new GcsWorkerManager(gcs_table_storage_, gcs_pub_sub_));
|
||||
}
|
||||
|
||||
} // namespace gcs
|
||||
|
||||
@@ -39,6 +39,7 @@ struct GcsServerConfig {
|
||||
class GcsNodeManager;
|
||||
class GcsActorManager;
|
||||
class GcsJobManager;
|
||||
class GcsWorkerManager;
|
||||
|
||||
/// The GcsServer will take over all requests from ServiceBasedGcsClient and transparent
|
||||
/// transmit the command to the backend reliable storage for the time being.
|
||||
@@ -96,8 +97,8 @@ class GcsServer {
|
||||
/// The error info handler
|
||||
virtual std::unique_ptr<rpc::ErrorInfoHandler> InitErrorInfoHandler();
|
||||
|
||||
/// The worker info handler
|
||||
virtual std::unique_ptr<rpc::WorkerInfoHandler> InitWorkerInfoHandler();
|
||||
/// The worker manager
|
||||
virtual std::unique_ptr<GcsWorkerManager> InitGcsWorkerManager();
|
||||
|
||||
private:
|
||||
/// Store the address of GCS server in Redis.
|
||||
@@ -140,8 +141,9 @@ class GcsServer {
|
||||
/// Error info handler and service
|
||||
std::unique_ptr<rpc::ErrorInfoHandler> error_info_handler_;
|
||||
std::unique_ptr<rpc::ErrorInfoGrpcService> error_info_service_;
|
||||
/// Worker info handler and service
|
||||
std::unique_ptr<rpc::WorkerInfoHandler> worker_info_handler_;
|
||||
/// The gcs worker manager
|
||||
std::unique_ptr<GcsWorkerManager> gcs_worker_manager_;
|
||||
/// Worker info service
|
||||
std::unique_ptr<rpc::WorkerInfoGrpcService> worker_info_service_;
|
||||
/// Backend client
|
||||
std::shared_ptr<RedisGcsClient> redis_gcs_client_;
|
||||
|
||||
@@ -112,7 +112,7 @@ template class GcsTable<ClientID, HeartbeatTableData>;
|
||||
template class GcsTable<ClientID, HeartbeatBatchTableData>;
|
||||
template class GcsTable<JobID, ErrorTableData>;
|
||||
template class GcsTable<UniqueID, ProfileTableData>;
|
||||
template class GcsTable<WorkerID, WorkerFailureData>;
|
||||
template class GcsTable<WorkerID, WorkerTableData>;
|
||||
template class GcsTable<ActorID, ActorTableData>;
|
||||
template class GcsTable<ActorCheckpointID, ActorCheckpointData>;
|
||||
template class GcsTable<ActorID, ActorCheckpointIdData>;
|
||||
|
||||
@@ -39,7 +39,7 @@ using rpc::ResourceTableData;
|
||||
using rpc::TaskLeaseData;
|
||||
using rpc::TaskReconstructionData;
|
||||
using rpc::TaskTableData;
|
||||
using rpc::WorkerFailureData;
|
||||
using rpc::WorkerTableData;
|
||||
|
||||
/// \class GcsTable
|
||||
///
|
||||
@@ -265,11 +265,11 @@ class GcsProfileTable : public GcsTable<UniqueID, ProfileTableData> {
|
||||
}
|
||||
};
|
||||
|
||||
class GcsWorkerFailureTable : public GcsTable<WorkerID, WorkerFailureData> {
|
||||
class GcsWorkerTable : public GcsTable<WorkerID, WorkerTableData> {
|
||||
public:
|
||||
explicit GcsWorkerFailureTable(std::shared_ptr<StoreClient> &store_client)
|
||||
explicit GcsWorkerTable(std::shared_ptr<StoreClient> &store_client)
|
||||
: GcsTable(store_client) {
|
||||
table_name_ = TablePrefix_Name(TablePrefix::WORKER_FAILURE);
|
||||
table_name_ = TablePrefix_Name(TablePrefix::WORKERS);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -349,9 +349,9 @@ class GcsTableStorage {
|
||||
return *profile_table_;
|
||||
}
|
||||
|
||||
GcsWorkerFailureTable &WorkerFailureTable() {
|
||||
RAY_CHECK(worker_failure_table_ != nullptr);
|
||||
return *worker_failure_table_;
|
||||
GcsWorkerTable &WorkerTable() {
|
||||
RAY_CHECK(worker_table_ != nullptr);
|
||||
return *worker_table_;
|
||||
}
|
||||
|
||||
protected:
|
||||
@@ -370,7 +370,7 @@ class GcsTableStorage {
|
||||
std::unique_ptr<GcsHeartbeatBatchTable> heartbeat_batch_table_;
|
||||
std::unique_ptr<GcsErrorInfoTable> error_info_table_;
|
||||
std::unique_ptr<GcsProfileTable> profile_table_;
|
||||
std::unique_ptr<GcsWorkerFailureTable> worker_failure_table_;
|
||||
std::unique_ptr<GcsWorkerTable> worker_table_;
|
||||
};
|
||||
|
||||
/// \class RedisGcsTableStorage
|
||||
@@ -394,7 +394,7 @@ class RedisGcsTableStorage : public GcsTableStorage {
|
||||
heartbeat_batch_table_.reset(new GcsHeartbeatBatchTable(store_client_));
|
||||
error_info_table_.reset(new GcsErrorInfoTable(store_client_));
|
||||
profile_table_.reset(new GcsProfileTable(store_client_));
|
||||
worker_failure_table_.reset(new GcsWorkerFailureTable(store_client_));
|
||||
worker_table_.reset(new GcsWorkerTable(store_client_));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -419,7 +419,7 @@ class InMemoryGcsTableStorage : public GcsTableStorage {
|
||||
heartbeat_batch_table_.reset(new GcsHeartbeatBatchTable(store_client_));
|
||||
error_info_table_.reset(new GcsErrorInfoTable(store_client_));
|
||||
profile_table_.reset(new GcsProfileTable(store_client_));
|
||||
worker_failure_table_.reset(new GcsWorkerFailureTable(store_client_));
|
||||
worker_table_.reset(new GcsWorkerTable(store_client_));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -0,0 +1,133 @@
|
||||
// Copyright 2017 The Ray Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "gcs_worker_manager.h"
|
||||
|
||||
namespace ray {
|
||||
namespace gcs {
|
||||
|
||||
void GcsWorkerManager::HandleReportWorkerFailure(
|
||||
const rpc::ReportWorkerFailureRequest &request, rpc::ReportWorkerFailureReply *reply,
|
||||
rpc::SendReplyCallback send_reply_callback) {
|
||||
const rpc::Address worker_address = request.worker_failure().worker_address();
|
||||
RAY_LOG(DEBUG) << "Reporting worker failure, " << worker_address.DebugString();
|
||||
auto worker_failure_data = std::make_shared<WorkerTableData>();
|
||||
worker_failure_data->CopyFrom(request.worker_failure());
|
||||
worker_failure_data->set_is_alive(false);
|
||||
const auto worker_id = WorkerID::FromBinary(worker_address.worker_id());
|
||||
|
||||
// Before handle ReportWorkerFailureRequest, you should check if the worker is exists.
|
||||
auto on_get_done =
|
||||
[this, worker_address, worker_id, worker_failure_data, reply, send_reply_callback](
|
||||
const Status &status, const boost::optional<WorkerTableData> &result) {
|
||||
if (result) {
|
||||
auto on_put_done = [this, worker_address, worker_id, worker_failure_data, reply,
|
||||
send_reply_callback](const Status &status) {
|
||||
if (!status.ok()) {
|
||||
RAY_LOG(ERROR) << "Failed to report worker failure, "
|
||||
<< worker_address.DebugString();
|
||||
} else {
|
||||
RAY_CHECK_OK(gcs_pub_sub_->Publish(WORKER_CHANNEL, worker_id.Binary(),
|
||||
worker_failure_data->SerializeAsString(),
|
||||
nullptr));
|
||||
}
|
||||
GCS_RPC_SEND_REPLY(send_reply_callback, reply, status);
|
||||
};
|
||||
|
||||
// The worker exists in worker table, you can update the info of this worker.
|
||||
Status report_status = gcs_table_storage_->WorkerTable().Put(
|
||||
worker_id, *worker_failure_data, on_put_done);
|
||||
if (!report_status.ok()) {
|
||||
on_put_done(report_status);
|
||||
}
|
||||
} else {
|
||||
// The worker doesn't exists in worker table.
|
||||
RAY_LOG(WARNING) << "Failed to report worker failure, the worker doesn't "
|
||||
"exist, "
|
||||
<< worker_address.DebugString();
|
||||
GCS_RPC_SEND_REPLY(send_reply_callback, reply, status);
|
||||
}
|
||||
};
|
||||
Status status = gcs_table_storage_->WorkerTable().Get(worker_id, on_get_done);
|
||||
if (!status.ok()) {
|
||||
on_get_done(status, boost::none);
|
||||
}
|
||||
}
|
||||
|
||||
void GcsWorkerManager::HandleGetWorkerInfo(const rpc::GetWorkerInfoRequest &request,
|
||||
rpc::GetWorkerInfoReply *reply,
|
||||
rpc::SendReplyCallback send_reply_callback) {
|
||||
WorkerID worker_id = WorkerID::FromBinary(request.worker_id());
|
||||
RAY_LOG(DEBUG) << "Getting worker info, worker id = " << worker_id;
|
||||
|
||||
auto on_done = [worker_id, reply, send_reply_callback](
|
||||
const Status &status,
|
||||
const boost::optional<WorkerTableData> &result) {
|
||||
if (result) {
|
||||
reply->mutable_worker_table_data()->CopyFrom(*result);
|
||||
}
|
||||
RAY_LOG(DEBUG) << "Finished getting worker info, worker id = " << worker_id;
|
||||
GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK());
|
||||
};
|
||||
|
||||
Status status = gcs_table_storage_->WorkerTable().Get(worker_id, on_done);
|
||||
if (!status.ok()) {
|
||||
on_done(status, boost::none);
|
||||
}
|
||||
}
|
||||
|
||||
void GcsWorkerManager::HandleGetAllWorkerInfo(
|
||||
const rpc::GetAllWorkerInfoRequest &request, rpc::GetAllWorkerInfoReply *reply,
|
||||
rpc::SendReplyCallback send_reply_callback) {
|
||||
RAY_LOG(DEBUG) << "Getting all worker info.";
|
||||
auto on_done = [reply, send_reply_callback](
|
||||
const std::unordered_map<WorkerID, WorkerTableData> &result) {
|
||||
for (auto &data : result) {
|
||||
reply->add_worker_table_data()->CopyFrom(data.second);
|
||||
}
|
||||
RAY_LOG(DEBUG) << "Finished getting all worker info.";
|
||||
GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK());
|
||||
};
|
||||
Status status = gcs_table_storage_->WorkerTable().GetAll(on_done);
|
||||
if (!status.ok()) {
|
||||
on_done(std::unordered_map<WorkerID, WorkerTableData>());
|
||||
}
|
||||
}
|
||||
|
||||
void GcsWorkerManager::HandleAddWorkerInfo(const rpc::AddWorkerInfoRequest &request,
|
||||
rpc::AddWorkerInfoReply *reply,
|
||||
rpc::SendReplyCallback send_reply_callback) {
|
||||
auto worker_data = std::make_shared<WorkerTableData>();
|
||||
worker_data->CopyFrom(request.worker_data());
|
||||
auto worker_id = WorkerID::FromBinary(worker_data->worker_address().worker_id());
|
||||
RAY_LOG(DEBUG) << "Adding worker " << worker_id;
|
||||
|
||||
auto on_done = [worker_id, worker_data, reply,
|
||||
send_reply_callback](const Status &status) {
|
||||
if (!status.ok()) {
|
||||
RAY_LOG(ERROR) << "Failed to add worker information, "
|
||||
<< worker_data->DebugString();
|
||||
}
|
||||
RAY_LOG(DEBUG) << "Finished adding worker " << worker_id;
|
||||
GCS_RPC_SEND_REPLY(send_reply_callback, reply, status);
|
||||
};
|
||||
|
||||
Status status = gcs_table_storage_->WorkerTable().Put(worker_id, *worker_data, on_done);
|
||||
if (!status.ok()) {
|
||||
on_done(status);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace gcs
|
||||
} // namespace ray
|
||||
@@ -0,0 +1,54 @@
|
||||
// Copyright 2017 The Ray Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "gcs_table_storage.h"
|
||||
#include "ray/gcs/pubsub/gcs_pub_sub.h"
|
||||
#include "ray/gcs/redis_gcs_client.h"
|
||||
#include "ray/rpc/gcs_server/gcs_rpc_server.h"
|
||||
|
||||
namespace ray {
|
||||
namespace gcs {
|
||||
|
||||
/// This implementation class of `WorkerInfoHandler`.
|
||||
class GcsWorkerManager : public rpc::WorkerInfoHandler {
|
||||
public:
|
||||
explicit GcsWorkerManager(std::shared_ptr<gcs::GcsTableStorage> gcs_table_storage,
|
||||
std::shared_ptr<gcs::GcsPubSub> &gcs_pub_sub)
|
||||
: gcs_table_storage_(gcs_table_storage), gcs_pub_sub_(gcs_pub_sub) {}
|
||||
|
||||
void HandleReportWorkerFailure(const rpc::ReportWorkerFailureRequest &request,
|
||||
rpc::ReportWorkerFailureReply *reply,
|
||||
rpc::SendReplyCallback send_reply_callback) override;
|
||||
|
||||
void HandleGetWorkerInfo(const rpc::GetWorkerInfoRequest &request,
|
||||
rpc::GetWorkerInfoReply *reply,
|
||||
rpc::SendReplyCallback send_reply_callback) override;
|
||||
|
||||
void HandleGetAllWorkerInfo(const rpc::GetAllWorkerInfoRequest &request,
|
||||
rpc::GetAllWorkerInfoReply *reply,
|
||||
rpc::SendReplyCallback send_reply_callback) override;
|
||||
|
||||
void HandleAddWorkerInfo(const rpc::AddWorkerInfoRequest &request,
|
||||
rpc::AddWorkerInfoReply *reply,
|
||||
rpc::SendReplyCallback send_reply_callback) override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<gcs::GcsTableStorage> gcs_table_storage_;
|
||||
std::shared_ptr<gcs::GcsPubSub> gcs_pub_sub_;
|
||||
};
|
||||
|
||||
} // namespace gcs
|
||||
} // namespace ray
|
||||
@@ -390,6 +390,53 @@ class GcsServerTest : public ::testing::Test {
|
||||
client_->ReportWorkerFailure(
|
||||
request,
|
||||
[&promise](const Status &status, const rpc::ReportWorkerFailureReply &reply) {
|
||||
RAY_CHECK_OK(status);
|
||||
promise.set_value(status.ok());
|
||||
});
|
||||
return WaitReady(promise.get_future(), timeout_ms_);
|
||||
}
|
||||
|
||||
boost::optional<rpc::WorkerTableData> GetWorkerInfo(const std::string &worker_id) {
|
||||
rpc::GetWorkerInfoRequest request;
|
||||
request.set_worker_id(worker_id);
|
||||
boost::optional<rpc::WorkerTableData> worker_table_data_opt;
|
||||
std::promise<bool> promise;
|
||||
client_->GetWorkerInfo(
|
||||
request, [&worker_table_data_opt, &promise](
|
||||
const Status &status, const rpc::GetWorkerInfoReply &reply) {
|
||||
RAY_CHECK_OK(status);
|
||||
if (reply.has_worker_table_data()) {
|
||||
worker_table_data_opt = reply.worker_table_data();
|
||||
} else {
|
||||
worker_table_data_opt = boost::none;
|
||||
}
|
||||
promise.set_value(true);
|
||||
});
|
||||
EXPECT_TRUE(WaitReady(promise.get_future(), timeout_ms_));
|
||||
return worker_table_data_opt;
|
||||
}
|
||||
|
||||
std::vector<rpc::WorkerTableData> GetAllWorkerInfo() {
|
||||
std::vector<rpc::WorkerTableData> worker_table_data;
|
||||
rpc::GetAllWorkerInfoRequest request;
|
||||
std::promise<bool> promise;
|
||||
client_->GetAllWorkerInfo(
|
||||
request, [&worker_table_data, &promise](const Status &status,
|
||||
const rpc::GetAllWorkerInfoReply &reply) {
|
||||
RAY_CHECK_OK(status);
|
||||
for (int index = 0; index < reply.worker_table_data_size(); ++index) {
|
||||
worker_table_data.push_back(reply.worker_table_data(index));
|
||||
}
|
||||
promise.set_value(true);
|
||||
});
|
||||
EXPECT_TRUE(WaitReady(promise.get_future(), timeout_ms_));
|
||||
return worker_table_data;
|
||||
}
|
||||
|
||||
bool AddWorkerInfo(const rpc::AddWorkerInfoRequest &request) {
|
||||
std::promise<bool> promise;
|
||||
client_->AddWorkerInfo(
|
||||
request, [&promise](const Status &status, const rpc::AddWorkerInfoReply &reply) {
|
||||
RAY_CHECK_OK(status);
|
||||
promise.set_value(true);
|
||||
});
|
||||
@@ -725,12 +772,29 @@ TEST_F(GcsServerTest, TestErrorInfo) {
|
||||
}
|
||||
|
||||
TEST_F(GcsServerTest, TestWorkerInfo) {
|
||||
rpc::WorkerFailureData worker_failure_data;
|
||||
worker_failure_data.mutable_worker_address()->set_ip_address("127.0.0.1");
|
||||
worker_failure_data.mutable_worker_address()->set_port(5566);
|
||||
// Report worker failure
|
||||
auto worker_failure_data = Mocker::GenWorkerTableData();
|
||||
worker_failure_data->mutable_worker_address()->set_ip_address("127.0.0.1");
|
||||
worker_failure_data->mutable_worker_address()->set_port(5566);
|
||||
rpc::ReportWorkerFailureRequest report_worker_failure_request;
|
||||
report_worker_failure_request.mutable_worker_failure()->CopyFrom(worker_failure_data);
|
||||
report_worker_failure_request.mutable_worker_failure()->CopyFrom(*worker_failure_data);
|
||||
ASSERT_TRUE(ReportWorkerFailure(report_worker_failure_request));
|
||||
std::vector<rpc::WorkerTableData> worker_table_data = GetAllWorkerInfo();
|
||||
ASSERT_TRUE(worker_table_data.size() == 0);
|
||||
|
||||
// Add worker info
|
||||
auto worker_data = Mocker::GenWorkerTableData();
|
||||
worker_data->mutable_worker_address()->set_worker_id(WorkerID::FromRandom().Binary());
|
||||
rpc::AddWorkerInfoRequest add_worker_request;
|
||||
add_worker_request.mutable_worker_data()->CopyFrom(*worker_data);
|
||||
ASSERT_TRUE(AddWorkerInfo(add_worker_request));
|
||||
ASSERT_TRUE(GetAllWorkerInfo().size() == 1);
|
||||
|
||||
// Get worker info
|
||||
boost::optional<rpc::WorkerTableData> result =
|
||||
GetWorkerInfo(worker_data->worker_address().worker_id());
|
||||
ASSERT_TRUE(result->worker_address().worker_id() ==
|
||||
worker_data->worker_address().worker_id());
|
||||
}
|
||||
|
||||
} // namespace ray
|
||||
|
||||
@@ -1,72 +0,0 @@
|
||||
// Copyright 2017 The Ray Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "worker_info_handler_impl.h"
|
||||
|
||||
namespace ray {
|
||||
namespace rpc {
|
||||
|
||||
void DefaultWorkerInfoHandler::HandleReportWorkerFailure(
|
||||
const ReportWorkerFailureRequest &request, ReportWorkerFailureReply *reply,
|
||||
SendReplyCallback send_reply_callback) {
|
||||
const Address worker_address = request.worker_failure().worker_address();
|
||||
RAY_LOG(DEBUG) << "Reporting worker failure, " << worker_address.DebugString();
|
||||
auto worker_failure_data = std::make_shared<WorkerFailureData>();
|
||||
worker_failure_data->CopyFrom(request.worker_failure());
|
||||
const auto worker_id = WorkerID::FromBinary(worker_address.worker_id());
|
||||
auto on_done = [this, worker_address, worker_id, worker_failure_data, reply,
|
||||
send_reply_callback](const Status &status) {
|
||||
if (!status.ok()) {
|
||||
RAY_LOG(ERROR) << "Failed to report worker failure, "
|
||||
<< worker_address.DebugString();
|
||||
} else {
|
||||
RAY_CHECK_OK(gcs_pub_sub_->Publish(WORKER_FAILURE_CHANNEL, worker_id.Binary(),
|
||||
worker_failure_data->SerializeAsString(),
|
||||
nullptr));
|
||||
}
|
||||
GCS_RPC_SEND_REPLY(send_reply_callback, reply, status);
|
||||
};
|
||||
|
||||
Status status = gcs_table_storage_->WorkerFailureTable().Put(
|
||||
worker_id, *worker_failure_data, on_done);
|
||||
if (!status.ok()) {
|
||||
on_done(status);
|
||||
}
|
||||
}
|
||||
|
||||
void DefaultWorkerInfoHandler::HandleRegisterWorker(
|
||||
const RegisterWorkerRequest &request, RegisterWorkerReply *reply,
|
||||
SendReplyCallback send_reply_callback) {
|
||||
auto worker_type = request.worker_type();
|
||||
auto worker_id = WorkerID::FromBinary(request.worker_id());
|
||||
auto worker_info = MapFromProtobuf(request.worker_info());
|
||||
|
||||
auto on_done = [worker_id, reply, send_reply_callback](const Status &status) {
|
||||
if (!status.ok()) {
|
||||
RAY_LOG(ERROR) << "Failed to register worker " << worker_id;
|
||||
} else {
|
||||
RAY_LOG(DEBUG) << "Finished registering worker " << worker_id;
|
||||
}
|
||||
GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK());
|
||||
};
|
||||
|
||||
Status status = gcs_client_.Workers().AsyncRegisterWorker(worker_type, worker_id,
|
||||
worker_info, on_done);
|
||||
if (!status.ok()) {
|
||||
on_done(status);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rpc
|
||||
} // namespace ray
|
||||
@@ -1,51 +0,0 @@
|
||||
// Copyright 2017 The Ray Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "gcs_table_storage.h"
|
||||
#include "ray/gcs/pubsub/gcs_pub_sub.h"
|
||||
#include "ray/gcs/redis_gcs_client.h"
|
||||
#include "ray/rpc/gcs_server/gcs_rpc_server.h"
|
||||
|
||||
namespace ray {
|
||||
namespace rpc {
|
||||
|
||||
/// This implementation class of `WorkerInfoHandler`.
|
||||
class DefaultWorkerInfoHandler : public rpc::WorkerInfoHandler {
|
||||
public:
|
||||
explicit DefaultWorkerInfoHandler(
|
||||
gcs::RedisGcsClient &gcs_client,
|
||||
std::shared_ptr<gcs::GcsTableStorage> gcs_table_storage,
|
||||
std::shared_ptr<gcs::GcsPubSub> &gcs_pub_sub)
|
||||
: gcs_client_(gcs_client),
|
||||
gcs_table_storage_(gcs_table_storage),
|
||||
gcs_pub_sub_(gcs_pub_sub) {}
|
||||
|
||||
void HandleReportWorkerFailure(const ReportWorkerFailureRequest &request,
|
||||
ReportWorkerFailureReply *reply,
|
||||
SendReplyCallback send_reply_callback) override;
|
||||
|
||||
void HandleRegisterWorker(const RegisterWorkerRequest &request,
|
||||
RegisterWorkerReply *reply,
|
||||
SendReplyCallback send_reply_callback) override;
|
||||
|
||||
private:
|
||||
gcs::RedisGcsClient &gcs_client_;
|
||||
std::shared_ptr<gcs::GcsTableStorage> gcs_table_storage_;
|
||||
std::shared_ptr<gcs::GcsPubSub> gcs_pub_sub_;
|
||||
};
|
||||
|
||||
} // namespace rpc
|
||||
} // namespace ray
|
||||
@@ -82,11 +82,11 @@ inline std::shared_ptr<ray::rpc::ActorTableData> CreateActorTableData(
|
||||
}
|
||||
|
||||
/// Helper function to produce worker failure data.
|
||||
inline std::shared_ptr<ray::rpc::WorkerFailureData> CreateWorkerFailureData(
|
||||
inline std::shared_ptr<ray::rpc::WorkerTableData> CreateWorkerFailureData(
|
||||
const ClientID &raylet_id, const WorkerID &worker_id, const std::string &address,
|
||||
int32_t port, int64_t timestamp = std::time(nullptr),
|
||||
bool intentional_disconnect = false) {
|
||||
auto worker_failure_info_ptr = std::make_shared<ray::rpc::WorkerFailureData>();
|
||||
auto worker_failure_info_ptr = std::make_shared<ray::rpc::WorkerTableData>();
|
||||
worker_failure_info_ptr->mutable_worker_address()->set_raylet_id(raylet_id.Binary());
|
||||
worker_failure_info_ptr->mutable_worker_address()->set_worker_id(worker_id.Binary());
|
||||
worker_failure_info_ptr->mutable_worker_address()->set_ip_address(address);
|
||||
|
||||
@@ -29,7 +29,7 @@ namespace gcs {
|
||||
#define NODE_CHANNEL "NODE"
|
||||
#define NODE_RESOURCE_CHANNEL "NODE_RESOURCE"
|
||||
#define ACTOR_CHANNEL "ACTOR"
|
||||
#define WORKER_FAILURE_CHANNEL "WORKER_FAILURE"
|
||||
#define WORKER_CHANNEL "WORKER"
|
||||
#define OBJECT_CHANNEL "OBJECT"
|
||||
#define TASK_CHANNEL "TASK"
|
||||
#define TASK_LEASE_CHANNEL "TASK_LEASE"
|
||||
|
||||
@@ -792,50 +792,43 @@ Status RedisStatsInfoAccessor::AsyncAddProfileData(
|
||||
|
||||
RedisWorkerInfoAccessor::RedisWorkerInfoAccessor(RedisGcsClient *client_impl)
|
||||
: client_impl_(client_impl),
|
||||
worker_failure_sub_executor_(client_impl->worker_failure_table()) {}
|
||||
worker_failure_sub_executor_(client_impl->worker_table()) {}
|
||||
|
||||
Status RedisWorkerInfoAccessor::AsyncSubscribeToWorkerFailures(
|
||||
const SubscribeCallback<WorkerID, WorkerFailureData> &subscribe,
|
||||
const SubscribeCallback<WorkerID, WorkerTableData> &subscribe,
|
||||
const StatusCallback &done) {
|
||||
RAY_CHECK(subscribe != nullptr);
|
||||
return worker_failure_sub_executor_.AsyncSubscribeAll(ClientID::Nil(), subscribe, done);
|
||||
}
|
||||
|
||||
Status RedisWorkerInfoAccessor::AsyncReportWorkerFailure(
|
||||
const std::shared_ptr<WorkerFailureData> &data_ptr, const StatusCallback &callback) {
|
||||
WorkerFailureTable::WriteCallback on_done = nullptr;
|
||||
const std::shared_ptr<WorkerTableData> &data_ptr, const StatusCallback &callback) {
|
||||
WorkerTable::WriteCallback on_done = nullptr;
|
||||
if (callback != nullptr) {
|
||||
on_done = [callback](RedisGcsClient *client, const WorkerID &id,
|
||||
const WorkerFailureData &data) { callback(Status::OK()); };
|
||||
const WorkerTableData &data) { callback(Status::OK()); };
|
||||
}
|
||||
|
||||
WorkerID worker_id = WorkerID::FromBinary(data_ptr->worker_address().worker_id());
|
||||
WorkerFailureTable &worker_failure_table = client_impl_->worker_failure_table();
|
||||
WorkerTable &worker_failure_table = client_impl_->worker_table();
|
||||
return worker_failure_table.Add(JobID::Nil(), worker_id, data_ptr, on_done);
|
||||
}
|
||||
|
||||
Status RedisWorkerInfoAccessor::AsyncRegisterWorker(
|
||||
rpc::WorkerType worker_type, const WorkerID &worker_id,
|
||||
const std::unordered_map<std::string, std::string> &worker_info,
|
||||
const StatusCallback &callback) {
|
||||
std::vector<std::string> args;
|
||||
args.emplace_back("HMSET");
|
||||
if (worker_type == rpc::WorkerType::DRIVER) {
|
||||
args.emplace_back("Drivers:" + worker_id.Binary());
|
||||
} else {
|
||||
args.emplace_back("Workers:" + worker_id.Binary());
|
||||
}
|
||||
for (const auto &entry : worker_info) {
|
||||
args.push_back(entry.first);
|
||||
args.push_back(entry.second);
|
||||
}
|
||||
Status RedisWorkerInfoAccessor::AsyncGet(
|
||||
const WorkerID &worker_id,
|
||||
const OptionalItemCallback<rpc::WorkerTableData> &callback) {
|
||||
return Status::Invalid("Not implemented");
|
||||
}
|
||||
|
||||
auto status = client_impl_->primary_context()->RunArgvAsync(args);
|
||||
if (callback) {
|
||||
// TODO (kfstorm): Invoke the callback asynchronously.
|
||||
callback(status);
|
||||
}
|
||||
return status;
|
||||
Status RedisWorkerInfoAccessor::AsyncGetAll(
|
||||
const MultiItemCallback<rpc::WorkerTableData> &callback) {
|
||||
return Status::Invalid("Not implemented");
|
||||
}
|
||||
|
||||
Status RedisWorkerInfoAccessor::AsyncAdd(
|
||||
const std::shared_ptr<rpc::WorkerTableData> &data_ptr,
|
||||
const StatusCallback &callback) {
|
||||
return Status::Invalid("Not implemented");
|
||||
}
|
||||
|
||||
} // namespace gcs
|
||||
|
||||
@@ -436,23 +436,26 @@ class RedisWorkerInfoAccessor : public WorkerInfoAccessor {
|
||||
virtual ~RedisWorkerInfoAccessor() = default;
|
||||
|
||||
Status AsyncSubscribeToWorkerFailures(
|
||||
const SubscribeCallback<WorkerID, WorkerFailureData> &subscribe,
|
||||
const SubscribeCallback<WorkerID, WorkerTableData> &subscribe,
|
||||
const StatusCallback &done) override;
|
||||
|
||||
Status AsyncReportWorkerFailure(const std::shared_ptr<WorkerFailureData> &data_ptr,
|
||||
Status AsyncReportWorkerFailure(const std::shared_ptr<WorkerTableData> &data_ptr,
|
||||
const StatusCallback &callback) override;
|
||||
|
||||
Status AsyncRegisterWorker(
|
||||
rpc::WorkerType worker_type, const WorkerID &worker_id,
|
||||
const std::unordered_map<std::string, std::string> &worker_info,
|
||||
const StatusCallback &callback) override;
|
||||
Status AsyncGet(const WorkerID &worker_id,
|
||||
const OptionalItemCallback<rpc::WorkerTableData> &callback) override;
|
||||
|
||||
Status AsyncGetAll(const MultiItemCallback<rpc::WorkerTableData> &callback) override;
|
||||
|
||||
Status AsyncAdd(const std::shared_ptr<rpc::WorkerTableData> &data_ptr,
|
||||
const StatusCallback &callback) override;
|
||||
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override {}
|
||||
|
||||
private:
|
||||
RedisGcsClient *client_impl_{nullptr};
|
||||
|
||||
typedef SubscriptionExecutor<WorkerID, WorkerFailureData, WorkerFailureTable>
|
||||
typedef SubscriptionExecutor<WorkerID, WorkerTableData, WorkerTable>
|
||||
WorkerFailureSubscriptionExecutor;
|
||||
WorkerFailureSubscriptionExecutor worker_failure_sub_executor_;
|
||||
};
|
||||
|
||||
@@ -68,13 +68,14 @@ Status RedisGcsClient::Connect(boost::asio::io_service &io_service) {
|
||||
actor_checkpoint_table_.reset(new ActorCheckpointTable(shard_contexts, this));
|
||||
actor_checkpoint_id_table_.reset(new ActorCheckpointIdTable(shard_contexts, this));
|
||||
resource_table_.reset(new DynamicResourceTable({primary_context}, this));
|
||||
worker_failure_table_.reset(new WorkerFailureTable(shard_contexts, this));
|
||||
worker_table_.reset(new WorkerTable(shard_contexts, this));
|
||||
|
||||
if (RayConfig::instance().gcs_actor_service_enabled()) {
|
||||
actor_accessor_.reset(new RedisActorInfoAccessor(this));
|
||||
} else {
|
||||
actor_accessor_.reset(new RedisLogBasedActorInfoAccessor(this));
|
||||
}
|
||||
|
||||
job_accessor_.reset(new RedisJobInfoAccessor(this));
|
||||
object_accessor_.reset(new RedisObjectInfoAccessor(this));
|
||||
node_accessor_.reset(new RedisNodeInfoAccessor(this));
|
||||
@@ -123,9 +124,7 @@ LogBasedActorTable &RedisGcsClient::log_based_actor_table() {
|
||||
|
||||
ActorTable &RedisGcsClient::actor_table() { return *actor_table_; }
|
||||
|
||||
WorkerFailureTable &RedisGcsClient::worker_failure_table() {
|
||||
return *worker_failure_table_;
|
||||
}
|
||||
WorkerTable &RedisGcsClient::worker_table() { return *worker_table_; }
|
||||
|
||||
TaskReconstructionLog &RedisGcsClient::task_reconstruction_log() {
|
||||
return *task_reconstruction_log_;
|
||||
|
||||
@@ -107,7 +107,7 @@ class RAY_EXPORT RedisGcsClient : public GcsClient {
|
||||
/// Implements the Stats() interface.
|
||||
ProfileTable &profile_table();
|
||||
/// Implements the Workers() interface.
|
||||
WorkerFailureTable &worker_failure_table();
|
||||
WorkerTable &worker_table();
|
||||
|
||||
private:
|
||||
// GCS command type. If CommandType::kChain, chain-replicated versions of the tables
|
||||
@@ -130,7 +130,7 @@ class RAY_EXPORT RedisGcsClient : public GcsClient {
|
||||
std::unique_ptr<ActorCheckpointTable> actor_checkpoint_table_;
|
||||
std::unique_ptr<ActorCheckpointIdTable> actor_checkpoint_id_table_;
|
||||
std::unique_ptr<DynamicResourceTable> resource_table_;
|
||||
std::unique_ptr<WorkerFailureTable> worker_failure_table_;
|
||||
std::unique_ptr<WorkerTable> worker_table_;
|
||||
std::unique_ptr<JobTable> job_table_;
|
||||
};
|
||||
|
||||
|
||||
@@ -210,7 +210,7 @@ template class SubscriptionExecutor<ClientID, ResourceChangeNotification,
|
||||
template class SubscriptionExecutor<ClientID, HeartbeatTableData, HeartbeatTable>;
|
||||
template class SubscriptionExecutor<ClientID, HeartbeatBatchTableData,
|
||||
HeartbeatBatchTable>;
|
||||
template class SubscriptionExecutor<WorkerID, WorkerFailureData, WorkerFailureTable>;
|
||||
template class SubscriptionExecutor<WorkerID, WorkerTableData, WorkerTable>;
|
||||
|
||||
} // namespace gcs
|
||||
|
||||
|
||||
@@ -872,10 +872,10 @@ template class Log<JobID, JobTableData>;
|
||||
template class Log<UniqueID, ProfileTableData>;
|
||||
template class Log<ClientID, HeartbeatTableData>;
|
||||
template class Log<ClientID, HeartbeatBatchTableData>;
|
||||
template class Log<WorkerID, WorkerFailureData>;
|
||||
template class Log<WorkerID, WorkerTableData>;
|
||||
template class Table<ActorCheckpointID, ActorCheckpointData>;
|
||||
template class Table<ActorID, ActorCheckpointIdData>;
|
||||
template class Table<WorkerID, WorkerFailureData>;
|
||||
template class Table<WorkerID, WorkerTableData>;
|
||||
template class Table<ActorID, ActorTableData>;
|
||||
|
||||
template class Log<ClientID, ResourceTableData>;
|
||||
|
||||
@@ -52,7 +52,7 @@ using rpc::TablePubsub;
|
||||
using rpc::TaskLeaseData;
|
||||
using rpc::TaskReconstructionData;
|
||||
using rpc::TaskTableData;
|
||||
using rpc::WorkerFailureData;
|
||||
using rpc::WorkerTableData;
|
||||
|
||||
class RedisContext;
|
||||
|
||||
@@ -754,15 +754,15 @@ class ActorTable : public Table<ActorID, ActorTableData> {
|
||||
Status Get(const ActorID &actor_id, ActorTableData *actor_table_data);
|
||||
};
|
||||
|
||||
class WorkerFailureTable : public Table<WorkerID, WorkerFailureData> {
|
||||
class WorkerTable : public Table<WorkerID, WorkerTableData> {
|
||||
public:
|
||||
WorkerFailureTable(const std::vector<std::shared_ptr<RedisContext>> &contexts,
|
||||
RedisGcsClient *client)
|
||||
WorkerTable(const std::vector<std::shared_ptr<RedisContext>> &contexts,
|
||||
RedisGcsClient *client)
|
||||
: Table(contexts, client) {
|
||||
pubsub_channel_ = TablePubsub::WORKER_FAILURE_PUBSUB;
|
||||
prefix_ = TablePrefix::WORKER_FAILURE;
|
||||
prefix_ = TablePrefix::WORKERS;
|
||||
}
|
||||
virtual ~WorkerFailureTable() {}
|
||||
virtual ~WorkerTable() {}
|
||||
};
|
||||
|
||||
class TaskReconstructionLog : public Log<TaskID, TaskReconstructionData> {
|
||||
|
||||
@@ -127,10 +127,10 @@ struct Mocker {
|
||||
return error_table_data;
|
||||
}
|
||||
|
||||
static std::shared_ptr<rpc::WorkerFailureData> GenWorkerFailureData() {
|
||||
auto worker_failure_data = std::make_shared<rpc::WorkerFailureData>();
|
||||
worker_failure_data->set_timestamp(std::time(nullptr));
|
||||
return worker_failure_data;
|
||||
static std::shared_ptr<rpc::WorkerTableData> GenWorkerTableData() {
|
||||
auto worker_table_data = std::make_shared<rpc::WorkerTableData>();
|
||||
worker_table_data->set_timestamp(std::time(nullptr));
|
||||
return worker_table_data;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -41,7 +41,8 @@ enum TablePrefix {
|
||||
ACTOR_CHECKPOINT_ID = 16;
|
||||
NODE_RESOURCE = 17;
|
||||
DIRECT_ACTOR = 18;
|
||||
WORKER_FAILURE = 19;
|
||||
// WORKER is already used in WorkerType, so use WORKERS here.
|
||||
WORKERS = 19;
|
||||
TABLE_PREFIX_MAX = 20;
|
||||
}
|
||||
|
||||
@@ -296,13 +297,19 @@ message ActorCheckpointIdData {
|
||||
repeated uint64 timestamps = 3;
|
||||
}
|
||||
|
||||
message WorkerFailureData {
|
||||
message WorkerTableData {
|
||||
// Is this worker alive.
|
||||
bool is_alive = 1;
|
||||
// Address of the worker that failed.
|
||||
Address worker_address = 1;
|
||||
// The UNIX timestamp at which the worker failed.
|
||||
Address worker_address = 2;
|
||||
// The UNIX timestamp at which this worker's state was updated.
|
||||
int64 timestamp = 3;
|
||||
// Is intentional disconnect
|
||||
// Whether it's an intentional disconnect, only applies then `is_alive` is false.
|
||||
bool intentional_disconnect = 4;
|
||||
// Type of this worker.
|
||||
WorkerType worker_type = 5;
|
||||
// This is for AddWorker.
|
||||
map<string, bytes> worker_info = 6;
|
||||
}
|
||||
|
||||
message ResourceMap {
|
||||
|
||||
@@ -411,23 +411,38 @@ service ErrorInfoGcsService {
|
||||
}
|
||||
|
||||
message ReportWorkerFailureRequest {
|
||||
WorkerFailureData worker_failure = 1;
|
||||
WorkerTableData worker_failure = 1;
|
||||
}
|
||||
|
||||
message ReportWorkerFailureReply {
|
||||
GcsStatus status = 1;
|
||||
}
|
||||
|
||||
message RegisterWorkerRequest {
|
||||
/// The type of the worker.
|
||||
WorkerType worker_type = 1;
|
||||
/// The ID of the worker.
|
||||
bytes worker_id = 2;
|
||||
/// The information of the worker in a dictionary.
|
||||
map<string, bytes> worker_info = 3;
|
||||
message GetWorkerInfoRequest {
|
||||
// ID of this worker.
|
||||
bytes worker_id = 1;
|
||||
}
|
||||
|
||||
message RegisterWorkerReply {
|
||||
message GetWorkerInfoReply {
|
||||
GcsStatus status = 1;
|
||||
// Data of worker.
|
||||
WorkerTableData worker_table_data = 2;
|
||||
}
|
||||
|
||||
message GetAllWorkerInfoRequest {
|
||||
}
|
||||
|
||||
message GetAllWorkerInfoReply {
|
||||
GcsStatus status = 1;
|
||||
// Data of worker
|
||||
repeated WorkerTableData worker_table_data = 2;
|
||||
}
|
||||
|
||||
message AddWorkerInfoRequest {
|
||||
WorkerTableData worker_data = 1;
|
||||
}
|
||||
|
||||
message AddWorkerInfoReply {
|
||||
GcsStatus status = 1;
|
||||
}
|
||||
|
||||
@@ -435,8 +450,12 @@ message RegisterWorkerReply {
|
||||
service WorkerInfoGcsService {
|
||||
// Report a worker failure to GCS Service.
|
||||
rpc ReportWorkerFailure(ReportWorkerFailureRequest) returns (ReportWorkerFailureReply);
|
||||
// Register a worker to GCS Service.
|
||||
rpc RegisterWorker(RegisterWorkerRequest) returns (RegisterWorkerReply);
|
||||
// Get worker information from GCS Service by worker id.
|
||||
rpc GetWorkerInfo(GetWorkerInfoRequest) returns (GetWorkerInfoReply);
|
||||
// Get information of all workers from GCS Service.
|
||||
rpc GetAllWorkerInfo(GetAllWorkerInfoRequest) returns (GetAllWorkerInfoReply);
|
||||
// Add worker information to GCS Service.
|
||||
rpc AddWorkerInfo(AddWorkerInfoRequest) returns (AddWorkerInfoReply);
|
||||
}
|
||||
|
||||
message CreateActorRequest {
|
||||
|
||||
@@ -254,7 +254,7 @@ ray::Status NodeManager::RegisterGcs() {
|
||||
// node failure. These workers can be identified by comparing the raylet_id
|
||||
// in their rpc::Address to the ID of a failed raylet.
|
||||
const auto &worker_failure_handler =
|
||||
[this](const WorkerID &id, const gcs::WorkerFailureData &worker_failure_data) {
|
||||
[this](const WorkerID &id, const gcs::WorkerTableData &worker_failure_data) {
|
||||
HandleUnexpectedWorkerFailure(worker_failure_data.worker_address());
|
||||
};
|
||||
RAY_CHECK_OK(gcs_client_->Workers().AsyncSubscribeToWorkerFailures(
|
||||
|
||||
@@ -771,7 +771,7 @@ class NodeManager : public rpc::NodeManagerServiceHandler {
|
||||
void WaitForTaskArgsRequests(std::pair<ScheduleFn, Task> &work);
|
||||
|
||||
// TODO(swang): Evict entries from these caches.
|
||||
/// Cache for the WorkerFailureTable in the GCS.
|
||||
/// Cache for the WorkerTable in the GCS.
|
||||
absl::flat_hash_set<WorkerID> failed_workers_cache_;
|
||||
/// Cache for the ClientTable in the GCS.
|
||||
absl::flat_hash_set<ClientID> failed_nodes_cache_;
|
||||
|
||||
@@ -227,8 +227,16 @@ class GcsRpcClient {
|
||||
VOID_GCS_RPC_CLIENT_METHOD(WorkerInfoGcsService, ReportWorkerFailure,
|
||||
worker_info_grpc_client_, )
|
||||
|
||||
/// Register a worker to GCS Service.
|
||||
VOID_GCS_RPC_CLIENT_METHOD(WorkerInfoGcsService, RegisterWorker,
|
||||
/// Get worker information from GCS Service.
|
||||
VOID_GCS_RPC_CLIENT_METHOD(WorkerInfoGcsService, GetWorkerInfo,
|
||||
worker_info_grpc_client_, )
|
||||
|
||||
/// Get information of all workers from GCS Service.
|
||||
VOID_GCS_RPC_CLIENT_METHOD(WorkerInfoGcsService, GetAllWorkerInfo,
|
||||
worker_info_grpc_client_, )
|
||||
|
||||
/// Add worker information to GCS Service.
|
||||
VOID_GCS_RPC_CLIENT_METHOD(WorkerInfoGcsService, AddWorkerInfo,
|
||||
worker_info_grpc_client_, )
|
||||
|
||||
private:
|
||||
|
||||
@@ -427,9 +427,17 @@ class WorkerInfoGcsServiceHandler {
|
||||
ReportWorkerFailureReply *reply,
|
||||
SendReplyCallback send_reply_callback) = 0;
|
||||
|
||||
virtual void HandleRegisterWorker(const RegisterWorkerRequest &request,
|
||||
RegisterWorkerReply *reply,
|
||||
SendReplyCallback send_reply_callback) = 0;
|
||||
virtual void HandleGetWorkerInfo(const GetWorkerInfoRequest &request,
|
||||
GetWorkerInfoReply *reply,
|
||||
SendReplyCallback send_reply_callback) = 0;
|
||||
|
||||
virtual void HandleGetAllWorkerInfo(const GetAllWorkerInfoRequest &request,
|
||||
GetAllWorkerInfoReply *reply,
|
||||
SendReplyCallback send_reply_callback) = 0;
|
||||
|
||||
virtual void HandleAddWorkerInfo(const AddWorkerInfoRequest &request,
|
||||
AddWorkerInfoReply *reply,
|
||||
SendReplyCallback send_reply_callback) = 0;
|
||||
};
|
||||
|
||||
/// The `GrpcService` for `WorkerInfoGcsService`.
|
||||
@@ -449,7 +457,9 @@ class WorkerInfoGrpcService : public GrpcService {
|
||||
const std::unique_ptr<grpc::ServerCompletionQueue> &cq,
|
||||
std::vector<std::unique_ptr<ServerCallFactory>> *server_call_factories) override {
|
||||
WORKER_INFO_SERVICE_RPC_HANDLER(ReportWorkerFailure);
|
||||
WORKER_INFO_SERVICE_RPC_HANDLER(RegisterWorker);
|
||||
WORKER_INFO_SERVICE_RPC_HANDLER(GetWorkerInfo);
|
||||
WORKER_INFO_SERVICE_RPC_HANDLER(GetAllWorkerInfo);
|
||||
WORKER_INFO_SERVICE_RPC_HANDLER(AddWorkerInfo);
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
Reference in New Issue
Block a user