mirror of
https://github.com/wassname/ray.git
synced 2026-07-02 23:38:26 +08:00
[GCS]GCS client support multi-thread subscribe&resubscribe&unsubscribe (#9718)
This commit is contained in:
@@ -89,11 +89,8 @@ bool ActorManager::AddActorHandle(std::unique_ptr<ActorHandle> actor_handle,
|
||||
auto actor_notification_callback =
|
||||
std::bind(&ActorManager::HandleActorStateNotification, this,
|
||||
std::placeholders::_1, std::placeholders::_2);
|
||||
{
|
||||
absl::MutexLock lock(&gcs_client_mutex_);
|
||||
RAY_CHECK_OK(gcs_client_->Actors().AsyncSubscribe(
|
||||
actor_id, actor_notification_callback, nullptr));
|
||||
}
|
||||
RAY_CHECK_OK(gcs_client_->Actors().AsyncSubscribe(
|
||||
actor_id, actor_notification_callback, nullptr));
|
||||
|
||||
if (!RayConfig::instance().gcs_actor_service_enabled()) {
|
||||
RAY_CHECK(reference_counter_->SetDeleteCallback(
|
||||
|
||||
@@ -174,11 +174,8 @@ class ActorManager {
|
||||
void HandleActorStateNotification(const ActorID &actor_id,
|
||||
const gcs::ActorTableData &actor_data);
|
||||
|
||||
/// Mutex to protect the gcs_client_ field.
|
||||
/// NOTE: Now gcs client is not thread safe, so we add lock protection.
|
||||
mutable absl::Mutex gcs_client_mutex_;
|
||||
/// GCS client.
|
||||
std::shared_ptr<gcs::GcsClient> gcs_client_ GUARDED_BY(gcs_client_mutex_);
|
||||
std::shared_ptr<gcs::GcsClient> gcs_client_;
|
||||
|
||||
/// Interface to submit tasks directly to other actors.
|
||||
std::shared_ptr<CoreWorkerDirectActorTaskSubmitterInterface> direct_actor_submitter_;
|
||||
|
||||
@@ -326,8 +326,11 @@ Status ServiceBasedActorInfoAccessor::AsyncSubscribe(
|
||||
on_subscribe, subscribe_done);
|
||||
};
|
||||
|
||||
subscribe_operations_[actor_id] = subscribe_operation;
|
||||
fetch_data_operations_[actor_id] = fetch_data_operation;
|
||||
{
|
||||
absl::MutexLock lock(&mutex_);
|
||||
subscribe_operations_[actor_id] = subscribe_operation;
|
||||
fetch_data_operations_[actor_id] = fetch_data_operation;
|
||||
}
|
||||
return subscribe_operation(
|
||||
[fetch_data_operation, done](const Status &status) { fetch_data_operation(done); });
|
||||
}
|
||||
@@ -335,6 +338,7 @@ Status ServiceBasedActorInfoAccessor::AsyncSubscribe(
|
||||
Status ServiceBasedActorInfoAccessor::AsyncUnsubscribe(const ActorID &actor_id) {
|
||||
RAY_LOG(DEBUG) << "Cancelling subscription to an actor, actor id = " << actor_id;
|
||||
auto status = client_impl_->GetGcsPubSub().Unsubscribe(ACTOR_CHANNEL, actor_id.Hex());
|
||||
absl::MutexLock lock(&mutex_);
|
||||
subscribe_operations_.erase(actor_id);
|
||||
fetch_data_operations_.erase(actor_id);
|
||||
RAY_LOG(DEBUG) << "Finished cancelling subscription to an actor, actor id = "
|
||||
@@ -418,6 +422,7 @@ void ServiceBasedActorInfoAccessor::AsyncResubscribe(bool is_pubsub_server_resta
|
||||
// If only the GCS sever has restarted, we only need to fetch data from the GCS server.
|
||||
// If the pub-sub server has also restarted, we need to resubscribe to the pub-sub
|
||||
// server first, then fetch data from the GCS server.
|
||||
absl::MutexLock lock(&mutex_);
|
||||
if (is_pubsub_server_restarted) {
|
||||
if (subscribe_all_operation_ != nullptr) {
|
||||
RAY_CHECK_OK(subscribe_all_operation_(
|
||||
@@ -426,7 +431,14 @@ void ServiceBasedActorInfoAccessor::AsyncResubscribe(bool is_pubsub_server_resta
|
||||
for (auto &item : subscribe_operations_) {
|
||||
auto &actor_id = item.first;
|
||||
RAY_CHECK_OK(item.second([this, actor_id](const Status &status) {
|
||||
fetch_data_operations_[actor_id](nullptr);
|
||||
absl::MutexLock lock(&mutex_);
|
||||
auto fetch_data_operation = fetch_data_operations_[actor_id];
|
||||
// `fetch_data_operation` is called in the callback function of subscribe.
|
||||
// Before that, if the user calls `AsyncUnsubscribe` function, the corresponding
|
||||
// fetch function will be deleted, so we need to check if it's null.
|
||||
if (fetch_data_operation != nullptr) {
|
||||
fetch_data_operation(nullptr);
|
||||
}
|
||||
}));
|
||||
}
|
||||
} else {
|
||||
@@ -1218,8 +1230,11 @@ Status ServiceBasedObjectInfoAccessor::AsyncSubscribeToLocations(
|
||||
on_subscribe, subscribe_done);
|
||||
};
|
||||
|
||||
subscribe_object_operations_[object_id] = subscribe_operation;
|
||||
fetch_object_data_operations_[object_id] = fetch_data_operation;
|
||||
{
|
||||
absl::MutexLock lock(&mutex_);
|
||||
subscribe_object_operations_[object_id] = subscribe_operation;
|
||||
fetch_object_data_operations_[object_id] = fetch_data_operation;
|
||||
}
|
||||
return subscribe_operation(
|
||||
[fetch_data_operation, done](const Status &status) { fetch_data_operation(done); });
|
||||
}
|
||||
@@ -1229,10 +1244,18 @@ void ServiceBasedObjectInfoAccessor::AsyncResubscribe(bool is_pubsub_server_rest
|
||||
// If only the GCS sever has restarted, we only need to fetch data from the GCS server.
|
||||
// If the pub-sub server has also restarted, we need to resubscribe to the pub-sub
|
||||
// server first, then fetch data from the GCS server.
|
||||
absl::MutexLock lock(&mutex_);
|
||||
if (is_pubsub_server_restarted) {
|
||||
for (auto &item : subscribe_object_operations_) {
|
||||
RAY_CHECK_OK(item.second([this, item](const Status &status) {
|
||||
fetch_object_data_operations_[item.first](nullptr);
|
||||
absl::MutexLock lock(&mutex_);
|
||||
auto fetch_object_data_operation = fetch_object_data_operations_[item.first];
|
||||
// `fetch_object_data_operation` is called in the callback function of subscribe.
|
||||
// Before that, if the user calls `AsyncUnsubscribeToLocations` function, the
|
||||
// corresponding fetch function will be deleted, so we need to check if it's null.
|
||||
if (fetch_object_data_operation != nullptr) {
|
||||
fetch_object_data_operation(nullptr);
|
||||
}
|
||||
}));
|
||||
}
|
||||
} else {
|
||||
@@ -1246,6 +1269,7 @@ Status ServiceBasedObjectInfoAccessor::AsyncUnsubscribeToLocations(
|
||||
const ObjectID &object_id) {
|
||||
RAY_LOG(DEBUG) << "Unsubscribing object location, object id = " << object_id;
|
||||
auto status = client_impl_->GetGcsPubSub().Unsubscribe(OBJECT_CHANNEL, object_id.Hex());
|
||||
absl::MutexLock lock(&mutex_);
|
||||
subscribe_object_operations_.erase(object_id);
|
||||
fetch_object_data_operations_.erase(object_id);
|
||||
RAY_LOG(DEBUG) << "Finished unsubscribing object location, object id = " << object_id;
|
||||
|
||||
@@ -126,11 +126,16 @@ class ServiceBasedActorInfoAccessor : public ActorInfoAccessor {
|
||||
/// server restarts from a failure.
|
||||
FetchDataOperation fetch_all_data_operation_;
|
||||
|
||||
// Mutex to protect the subscribe_operations_ field and fetch_data_operations_ field.
|
||||
absl::Mutex mutex_;
|
||||
|
||||
/// Save the subscribe operation of actors.
|
||||
std::unordered_map<ActorID, SubscribeOperation> subscribe_operations_;
|
||||
std::unordered_map<ActorID, SubscribeOperation> subscribe_operations_
|
||||
GUARDED_BY(mutex_);
|
||||
|
||||
/// Save the fetch data operation of actors.
|
||||
std::unordered_map<ActorID, FetchDataOperation> fetch_data_operations_;
|
||||
std::unordered_map<ActorID, FetchDataOperation> fetch_data_operations_
|
||||
GUARDED_BY(mutex_);
|
||||
|
||||
ServiceBasedGcsClient *client_impl_;
|
||||
|
||||
@@ -330,13 +335,19 @@ class ServiceBasedObjectInfoAccessor : public ObjectInfoAccessor {
|
||||
void AsyncResubscribe(bool is_pubsub_server_restarted) override;
|
||||
|
||||
private:
|
||||
// Mutex to protect the subscribe_object_operations_ field and
|
||||
// fetch_object_data_operations_ field.
|
||||
absl::Mutex mutex_;
|
||||
|
||||
/// Save the subscribe operations, so we can call them again when PubSub
|
||||
/// server restarts from a failure.
|
||||
std::unordered_map<ObjectID, SubscribeOperation> subscribe_object_operations_;
|
||||
std::unordered_map<ObjectID, SubscribeOperation> subscribe_object_operations_
|
||||
GUARDED_BY(mutex_);
|
||||
|
||||
/// Save the fetch data operation in this function, so we can call it again when GCS
|
||||
/// server restarts from a failure.
|
||||
std::unordered_map<ObjectID, FetchDataOperation> fetch_object_data_operations_;
|
||||
std::unordered_map<ObjectID, FetchDataOperation> fetch_object_data_operations_
|
||||
GUARDED_BY(mutex_);
|
||||
|
||||
ServiceBasedGcsClient *client_impl_;
|
||||
|
||||
|
||||
@@ -1133,6 +1133,52 @@ TEST_F(ServiceBasedGcsClientTest, TestGcsRedisFailureDetector) {
|
||||
RAY_CHECK(gcs_server_->IsStopped());
|
||||
}
|
||||
|
||||
TEST_F(ServiceBasedGcsClientTest, TestMultiThreadSubAndUnsub) {
|
||||
auto sub_finished_count = std::make_shared<std::atomic<int>>(0);
|
||||
int size = 5;
|
||||
std::vector<std::unique_ptr<std::thread>> threads;
|
||||
threads.resize(size);
|
||||
|
||||
// The number of times each thread executes subscribe & resubscribe & unsubscribe.
|
||||
const int sub_and_unsub_loop_count = 20;
|
||||
|
||||
// Multithreading subscribe/resubscribe/unsubscribe actors.
|
||||
auto job_id = JobID::FromInt(1);
|
||||
for (int index = 0; index < size; ++index) {
|
||||
threads[index].reset(new std::thread([this, job_id] {
|
||||
for (int index = 0; index < sub_and_unsub_loop_count; ++index) {
|
||||
auto actor_id = ActorID::Of(job_id, RandomTaskId(), 0);
|
||||
ASSERT_TRUE(SubscribeActor(
|
||||
actor_id, [](const ActorID &id, const rpc::ActorTableData &result) {}));
|
||||
gcs_client_->Actors().AsyncResubscribe(false);
|
||||
UnsubscribeActor(actor_id);
|
||||
}
|
||||
}));
|
||||
}
|
||||
for (auto &thread : threads) {
|
||||
thread->join();
|
||||
thread.reset();
|
||||
}
|
||||
|
||||
// Multithreading subscribe/resubscribe/unsubscribe objects.
|
||||
for (int index = 0; index < size; ++index) {
|
||||
threads[index].reset(new std::thread([this] {
|
||||
for (int index = 0; index < sub_and_unsub_loop_count; ++index) {
|
||||
auto object_id = ObjectID::FromRandom();
|
||||
ASSERT_TRUE(SubscribeToLocations(
|
||||
object_id,
|
||||
[](const ObjectID &id, const gcs::ObjectChangeNotification &result) {}));
|
||||
gcs_client_->Objects().AsyncResubscribe(false);
|
||||
UnsubscribeToLocations(object_id);
|
||||
}
|
||||
}));
|
||||
}
|
||||
for (auto &thread : threads) {
|
||||
thread->join();
|
||||
thread.reset();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ray
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
|
||||
Reference in New Issue
Block a user