From de86d5aff7fcd6f43ce5fc480b00d16dc1c8ab01 Mon Sep 17 00:00:00 2001
From: Eric Liang
Date: Thu, 19 Nov 2020 11:38:44 -0800
Subject: [PATCH] ActorStatisticalData() debug metrics bog down raylet with
 100% CPU (#12148)

* comment out bad

* update
---
 doc/dev/RELEASE_PROCESS.rst    | 3 ++-
 src/ray/raylet/node_manager.cc | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/doc/dev/RELEASE_PROCESS.rst b/doc/dev/RELEASE_PROCESS.rst
index 838c2b447..b14a3ef4e 100644
--- a/doc/dev/RELEASE_PROCESS.rst
+++ b/doc/dev/RELEASE_PROCESS.rst
@@ -51,7 +51,8 @@ This document describes the process for creating new releases.
 
    **IMPORTANT**: check that the test are actually running (printing output regularly) and aren't
    just stuck at an iteration. You must also check that the node CPU usage is stable
-   (and not increasing or decreasing over time, which indicates a leak).
+   (and not increasing or decreasing over time, which indicates a leak). You can see the head node
+   and worker node CPU utilizations in the AWS console.
 
 3. Multi-node regression tests
 
diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc
index 014aad929..13e0b2db0 100644
--- a/src/ray/raylet/node_manager.cc
+++ b/src/ray/raylet/node_manager.cc
@@ -44,6 +44,8 @@ struct ActorStats {
 ActorStats GetActorStatisticalData(
     std::unordered_map actor_registry) {
   ActorStats item;
+  /* TODO(ekl) this gets slower and slower over time since we never clean up dead actors.
+   * https://github.com/ray-project/ray/issues/11239
   for (auto &pair : actor_registry) {
     if (pair.second.GetState() == ray::rpc::ActorTableData::ALIVE) {
       item.live_actors += 1;
@@ -53,6 +55,7 @@ ActorStats GetActorStatisticalData(
       item.dead_actors += 1;
     }
   }
+  */
   return item;
 }
 