[Autoscaler/Core] Remove autoscaler spam (#12952)

This commit is contained in:
Alex Wu
2020-12-18 18:22:45 -08:00
committed by GitHub
parent ac5ea2c13d
commit 404161a3ff
3 changed files with 45 additions and 12 deletions
+12 -12
View File
@@ -79,27 +79,27 @@ class LoadMetrics:
active_ips = set(active_ips)
active_ips.add(self.local_ip)
def prune(mapping):
def prune(mapping, should_log):
unwanted = set(mapping) - active_ips
for unwanted_key in unwanted:
# TODO (Alex): Change this back to info after #12138.
logger.debug("LoadMetrics: "
"Removed mapping: {} - {}".format(
unwanted_key, mapping[unwanted_key]))
if should_log:
logger.info("LoadMetrics: "
"Removed mapping: {} - {}".format(
unwanted_key, mapping[unwanted_key]))
del mapping[unwanted_key]
if unwanted:
if unwanted and should_log:
# TODO (Alex): Change this back to info after #12138.
logger.debug(
logger.info(
"LoadMetrics: "
"Removed {} stale ip mappings: {} not in {}".format(
len(unwanted), unwanted, active_ips))
assert not (unwanted & set(mapping))
prune(self.last_used_time_by_ip)
prune(self.static_resources_by_ip)
prune(self.dynamic_resources_by_ip)
prune(self.resource_load_by_ip)
prune(self.last_heartbeat_time_by_ip)
prune(self.last_used_time_by_ip, should_log=True)
prune(self.static_resources_by_ip, should_log=False)
prune(self.dynamic_resources_by_ip, should_log=False)
prune(self.resource_load_by_ip, should_log=False)
prune(self.last_heartbeat_time_by_ip, should_log=False)
def get_node_resources(self):
"""Return a list of node resources (static resource sizes).
@@ -258,6 +258,7 @@ std::shared_ptr<rpc::GcsNodeInfo> GcsNodeManager::RemoveNode(
// Remove from cluster resources.
gcs_resource_manager_->OnNodeDead(node_id);
resources_buffer_.erase(node_id);
node_resource_usages_.erase(node_id);
if (!is_intended) {
// Broadcast a warning to all of the drivers indicating that the node
// has been marked as dead.
@@ -41,11 +41,43 @@ TEST_F(GcsNodeManagerTest, TestManagement) {
auto node = Mocker::GenNodeInfo();
auto node_id = NodeID::FromBinary(node->node_id());
{
rpc::GetAllResourceUsageRequest request;
rpc::GetAllResourceUsageReply reply;
auto send_reply_callback = [](ray::Status status, std::function<void()> f1,
std::function<void()> f2) {};
node_manager.HandleGetAllResourceUsage(request, &reply, send_reply_callback);
ASSERT_EQ(reply.resource_usage_data().batch().size(), 0);
}
node_manager.AddNode(node);
ASSERT_EQ(node, node_manager.GetAliveNode(node_id).value());
rpc::ReportResourceUsageRequest report_request;
(*report_request.mutable_resources()->mutable_resources_available())["CPU"] = 2;
(*report_request.mutable_resources()->mutable_resources_total())["CPU"] = 2;
node_manager.UpdateNodeResourceUsage(node_id, report_request);
{
rpc::GetAllResourceUsageRequest request;
rpc::GetAllResourceUsageReply reply;
auto send_reply_callback = [](ray::Status status, std::function<void()> f1,
std::function<void()> f2) {};
node_manager.HandleGetAllResourceUsage(request, &reply, send_reply_callback);
ASSERT_EQ(reply.resource_usage_data().batch().size(), 1);
}
node_manager.RemoveNode(node_id);
ASSERT_TRUE(!node_manager.GetAliveNode(node_id).has_value());
{
rpc::GetAllResourceUsageRequest request;
rpc::GetAllResourceUsageReply reply;
auto send_reply_callback = [](ray::Status status, std::function<void()> f1,
std::function<void()> f2) {};
node_manager.HandleGetAllResourceUsage(request, &reply, send_reply_callback);
ASSERT_EQ(reply.resource_usage_data().batch().size(), 0);
}
}
TEST_F(GcsNodeManagerTest, TestListener) {