From 46af99ee253e94267592a4c605a605f887441d0c Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Tue, 11 Aug 2020 12:31:27 -0700 Subject: [PATCH] [GCS Actor Management] Race condition around creating -> created phase. (#10035) * Fix the issue. * Address a code review. --- src/ray/gcs/gcs_server/gcs_actor_manager.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.cc b/src/ray/gcs/gcs_server/gcs_actor_manager.cc index 9d5a81b0e..4c45b3aed 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.cc @@ -891,6 +891,14 @@ void GcsActorManager::OnActorCreationSuccess(const std::shared_ptr &ac } actor->UpdateState(rpc::ActorTableData::ALIVE); auto actor_table_data = actor->GetActorTableData(); + + // Register the actor in the in-memory index before flushing its table data to + // GCS storage; otherwise the asynchronous Put could cause timing problems. + auto worker_id = actor->GetWorkerID(); + auto node_id = actor->GetNodeID(); + RAY_CHECK(!worker_id.IsNil()); + RAY_CHECK(!node_id.IsNil()); + RAY_CHECK(created_actors_[node_id].emplace(worker_id, actor_id).second); // The backend storage is reliable in the future, so the status must be ok. RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor_id, actor_table_data, @@ -898,7 +906,6 @@ void GcsActorManager::OnActorCreationSuccess(const std::shared_ptr &ac RAY_CHECK_OK(gcs_pub_sub_->Publish(ACTOR_CHANNEL, actor_id.Hex(), actor_table_data.SerializeAsString(), nullptr)); - // Invoke all callbacks for all registration requests of this actor (duplicated // requests are included) and remove all of them from // actor_to_create_callbacks_. 
@@ -909,12 +916,6 @@ void GcsActorManager::OnActorCreationSuccess(const std::shared_ptr &ac } actor_to_create_callbacks_.erase(iter); } - - auto worker_id = actor->GetWorkerID(); - auto node_id = actor->GetNodeID(); - RAY_CHECK(!worker_id.IsNil()); - RAY_CHECK(!node_id.IsNil()); - RAY_CHECK(created_actors_[node_id].emplace(worker_id, actor_id).second); })); }