From 166560e4287332eef8bdf3fc65e771d6df4e7ec3 Mon Sep 17 00:00:00 2001 From: Yunzhi Zhang <35828389+zzyunzhi@users.noreply.github.com> Date: Tue, 17 Dec 2019 01:05:57 -0800 Subject: [PATCH] [Dashboard] displays resources row (#6516) --- python/ray/dashboard/dashboard.py | 26 ++++++++++++++++++++++++++ src/ray/protobuf/node_manager.proto | 2 ++ src/ray/raylet/node_manager.cc | 17 +++++++++++++++++ 3 files changed, 45 insertions(+) diff --git a/python/ray/dashboard/dashboard.py b/python/ray/dashboard/dashboard.py index ba90ca4a4..0130513aa 100644 --- a/python/ray/dashboard/dashboard.py +++ b/python/ray/dashboard/dashboard.py @@ -43,6 +43,21 @@ def to_unix_time(dt): return (dt - datetime.datetime(1970, 1, 1)).total_seconds() +def round_resource_value(quantity): + if quantity.is_integer(): + return int(quantity) + else: + return round(quantity, 2) + + +def format_resource(resource_name, quantity): + if resource_name == "object_store_memory" or resource_name == "memory": + # Convert to 100MiB chunks and then to GiB + quantity = quantity * (50 * 1024 * 1024) / (1024 * 1024 * 1024) + return f"{round_resource_value(quantity)} GiB" + return f"{round_resource_value(quantity)}" + + class Dashboard(object): """A dashboard process for monitoring Ray nodes. @@ -144,6 +159,17 @@ class Dashboard(object): async def raylet_info(req) -> aiohttp.web.Response: D = self.raylet_stats.get_raylet_stats() + for address, data in D.items(): + available_resources = data["availableResources"] + total_resources = data["totalResources"] + extra_info = "" + for resource_name in sorted(available_resources.keys()): + total = total_resources[resource_name] + occupied = total - available_resources[resource_name] + total = format_resource(resource_name, total) + occupied = format_resource(resource_name, occupied) + extra_info += f"{resource_name}: {occupied} / {total}, " + data["extraInfo"] = extra_info[:-2] return await json_response(result=D) async def logs(req) -> aiohttp.web.Response: diff --git a/src/ray/protobuf/node_manager.proto b/src/ray/protobuf/node_manager.proto index bea723e12..e7a5987c9 100644 --- a/src/ray/protobuf/node_manager.proto +++ b/src/ray/protobuf/node_manager.proto @@ -82,6 +82,8 @@ message ViewData { message NodeStatsReply { repeated WorkerStats workers_stats = 1; repeated ViewData view_data = 2; + map available_resources = 3; + map total_resources = 4; } // Service for inter-node-manager communication. diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index e480da84c..8d076b49a 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -2984,6 +2984,23 @@ void NodeManager::HandleNodeStatsRequest(const rpc::NodeStatsRequest &request, worker_stats->set_pid(driver->Pid()); worker_stats->set_is_driver(true); } + // Record available resources of this node. + const auto &available_resources = + cluster_resource_map_.at(client_id_).GetAvailableResources().GetResourceMap(); + // Record total resources of this node. + const auto &total_resources = + cluster_resource_map_.at(client_id_).GetTotalResources().GetResourceMap(); + auto available_resources_map = reply->mutable_available_resources(); + auto total_resources_map = reply->mutable_total_resources(); + for (const auto &pair : total_resources) { + (*total_resources_map)[pair.first] = pair.second; + auto it = available_resources.find(pair.first); + if (it != available_resources.end()) { + (*available_resources_map)[pair.first] = it->second; + } else { + (*available_resources_map)[pair.first] = 0.0; + } + } // Ensure we never report an empty set of metrics. if (!recorded_metrics_) { RecordMetrics();