From 30f82329e39b5cfae84589231eacbb8f84dcd2d9 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 29 Jan 2021 17:55:46 -0800 Subject: [PATCH] [core] Add debug information for the PullManager and LocalObjectManager (#13782) * Add debug info * Formatting. Co-authored-by: SangBin Cho --- src/ray/object_manager/object_manager.cc | 1 + src/ray/object_manager/pull_manager.cc | 12 ++++++++++++ src/ray/object_manager/pull_manager.h | 2 ++ src/ray/raylet/local_object_manager.cc | 18 +++++++++++++++++- src/ray/raylet/local_object_manager.h | 5 +++++ src/ray/raylet/node_manager.cc | 1 + src/ray/raylet/worker_pool.cc | 4 ++++ 7 files changed, 42 insertions(+), 1 deletion(-) diff --git a/src/ray/object_manager/object_manager.cc b/src/ray/object_manager/object_manager.cc index ddd71c766..448245e01 100644 --- a/src/ray/object_manager/object_manager.cc +++ b/src/ray/object_manager/object_manager.cc @@ -818,6 +818,7 @@ std::string ObjectManager::DebugString() const { result << "\n" << object_directory_->DebugString(); result << "\n" << store_notification_->DebugString(); result << "\n" << buffer_pool_.DebugString(); + result << "\n" << pull_manager_->DebugString(); return result.str(); } diff --git a/src/ray/object_manager/pull_manager.cc b/src/ray/object_manager/pull_manager.cc index f4920a8de..9be63c7e1 100644 --- a/src/ray/object_manager/pull_manager.cc +++ b/src/ray/object_manager/pull_manager.cc @@ -424,4 +424,16 @@ void PullManager::Tick() { int PullManager::NumActiveRequests() const { return object_pull_requests_.size(); } +std::string PullManager::DebugString() const { + std::stringstream result; + result << "PullManager:"; + result << "\n- num bytes available for pulled objects: " << num_bytes_available_; + result << "\n- num bytes being pulled: " << num_bytes_being_pulled_; + result << "\n- num pull request bundles: " << pull_request_bundles_.size(); + result << "\n- num objects requested pull: " << object_pull_requests_.size(); + result << "\n- num objects actively 
being pulled: " + << active_object_pull_requests_.size(); + return result.str(); +} + } // namespace ray diff --git a/src/ray/object_manager/pull_manager.h b/src/ray/object_manager/pull_manager.h index 3a542fef7..b0c80e338 100644 --- a/src/ray/object_manager/pull_manager.h +++ b/src/ray/object_manager/pull_manager.h @@ -100,6 +100,8 @@ class PullManager { /// The number of ongoing object pulls. int NumActiveRequests() const; + std::string DebugString() const; + private: /// A helper structure for tracking information about each ongoing object pull. struct ObjectPullRequest { diff --git a/src/ray/raylet/local_object_manager.cc b/src/ray/raylet/local_object_manager.cc index 9909beb76..9ebaf75a8 100644 --- a/src/ray/raylet/local_object_manager.cc +++ b/src/ray/raylet/local_object_manager.cc @@ -32,6 +32,7 @@ void LocalObjectManager::PinObjects(const std::vector<ObjectID> &object_ids, continue; } RAY_LOG(DEBUG) << "Pinning object " << object_id; + pinned_objects_size_ += object->GetSize(); pinned_objects_.emplace(object_id, std::move(object)); } } @@ -69,7 +70,10 @@ void LocalObjectManager::ReleaseFreedObject(const ObjectID &object_id) { if (automatic_object_deletion_enabled_) { spilled_object_pending_delete_.push(object_id); } - pinned_objects_.erase(object_id); + if (pinned_objects_.count(object_id)) { + pinned_objects_size_ -= pinned_objects_[object_id]->GetSize(); + pinned_objects_.erase(object_id); + } } // Try to evict all copies of the object from the cluster. 
@@ -237,6 +241,7 @@ void LocalObjectManager::SpillObjectsInternal( for (const auto &object_id : objects_to_spill) { auto it = objects_pending_spill_.find(object_id); RAY_CHECK(it != objects_pending_spill_.end()); + pinned_objects_size_ += it->second->GetSize(); pinned_objects_.emplace(object_id, std::move(it->second)); objects_pending_spill_.erase(it); } @@ -454,6 +459,17 @@ void LocalObjectManager::FillObjectSpillingStats(rpc::GetNodeStatsReply *reply) stats->set_restored_objects_total(restored_objects_total_); } +std::string LocalObjectManager::DebugString() const { + std::stringstream result; + result << "LocalObjectManager:\n"; + result << "- num pinned objects: " << pinned_objects_.size() << "\n"; + result << "- pinned objects size: " << pinned_objects_size_ << "\n"; + result << "- num objects pending restore: " << objects_pending_restore_.size() << "\n"; + result << "- num objects pending spill: " << objects_pending_spill_.size() << "\n"; + result << "- num bytes pending spill: " << num_bytes_pending_spill_ << "\n"; + return result.str(); +} + }; // namespace raylet }; // namespace ray diff --git a/src/ray/raylet/local_object_manager.h b/src/ray/raylet/local_object_manager.h index c4f157d58..57ef8d3a1 100644 --- a/src/ray/raylet/local_object_manager.h +++ b/src/ray/raylet/local_object_manager.h @@ -136,6 +136,8 @@ class LocalObjectManager { /// \param Output parameter. void FillObjectSpillingStats(rpc::GetNodeStatsReply *reply) const; + std::string DebugString() const; + private: FRIEND_TEST(LocalObjectManagerTest, TestSpillObjectsOfSize); FRIEND_TEST(LocalObjectManagerTest, @@ -203,6 +205,9 @@ class LocalObjectManager { // Objects that are pinned on this node. absl::flat_hash_map<ObjectID, std::unique_ptr<RayObject>> pinned_objects_; + // Total size of objects pinned on this node. + size_t pinned_objects_size_ = 0; + // Objects that were pinned on this node but that are being spilled. // These objects will be released once spilling is complete and the URL is // written to the object directory. 
diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 251e28e26..cbe287ef7 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -2334,6 +2334,7 @@ std::string NodeManager::DebugString() const { for (auto &pair : cluster_resource_map_) { result << "\n" << pair.first.Hex() << ": " << pair.second.DebugString(); } + result << "\n" << local_object_manager_.DebugString(); result << "\n" << object_manager_.DebugString(); result << "\n" << gcs_client_->DebugString(); result << "\n" << worker_pool_.DebugString(); diff --git a/src/ray/raylet/worker_pool.cc b/src/ray/raylet/worker_pool.cc index 4ed257f46..ff6083199 100644 --- a/src/ray/raylet/worker_pool.cc +++ b/src/ray/raylet/worker_pool.cc @@ -1037,6 +1037,10 @@ std::string WorkerPool::DebugString() const { << " workers: " << entry.second.registered_workers.size(); result << "\n- num " << Language_Name(entry.first) << " drivers: " << entry.second.registered_drivers.size(); + result << "\n- num object spill callbacks queued: " + << entry.second.spill_io_worker_state.pending_io_tasks.size(); + result << "\n- num object restore queued: " + << entry.second.restore_io_worker_state.pending_io_tasks.size(); } result << "\n- num idle workers: " << idle_of_all_languages_.size(); return result.str();