Export additional metrics to Prometheus (#14061)

This commit is contained in:
Kathryn Zhou
2021-02-14 23:16:26 -08:00
committed by GitHub
parent b45ae76765
commit 82539f2da4
2 changed files with 74 additions and 8 deletions
+67 -7
View File
@@ -77,7 +77,25 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
"node_cpu": Gauge("node_cpu", "Total CPU usage on a ray node",
"percentage", ["ip"]),
"node_mem": Gauge("node_mem", "Total memory usage on a ray node",
"mb", ["ip"]),
"bytes", ["ip"]),
"node_disk_usage": Gauge("node_disk_usage",
"Total disk usage (bytes) on a ray node",
"bytes", ["ip"]),
"node_disk_utilization_percentage": Gauge(
"node_disk_utilization_percentage",
"Total disk utilization (percentage) on a ray node",
"percentage", ["ip"]),
"node_network_sent": Gauge("node_network_sent",
"Total network sent", "bytes", ["ip"]),
"node_network_received": Gauge("node_network_received",
"Total network received", "bytes",
["ip"]),
"node_network_send_speed": Gauge("node_network_send_speed",
"Network send speed", "bytes/sec",
["ip"]),
"node_network_receive_speed": Gauge("node_network_receive_speed",
"Network receive speed",
"bytes/sec", ["ip"]),
"raylet_cpu": Gauge("raylet_cpu",
"CPU usage of the raylet on a node.",
"percentage", ["ip", "pid"]),
@@ -237,8 +255,10 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
self._network_stats_hist.append((now, network_stats))
self._network_stats_hist = self._network_stats_hist[-7:]
then, prev_network_stats = self._network_stats_hist[0]
netstats = ((network_stats[0] - prev_network_stats[0]) / (now - then),
(network_stats[1] - prev_network_stats[1]) / (now - then))
prev_send, prev_recv = prev_network_stats
now_send, now_recv = network_stats
network_speed_stats = ((now_send - prev_send) / (now - then),
(now_recv - prev_recv) / (now - then))
return {
"now": now,
"hostname": self._hostname,
@@ -251,7 +271,8 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
"loadAvg": self._get_load_avg(),
"disk": self._get_disk_usage(),
"gpus": self._get_gpu_usage(),
"net": netstats,
"network": network_stats,
"network_speed": network_speed_stats,
"cmdline": self._get_raylet_cmdline(),
}
@@ -264,10 +285,45 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
# -- Mem per node --
total, avail, _ = stats["mem"]
mem_usage = float(total - avail) / 1e6
mem_usage = float(total - avail)
mem_record = Record(
gauge=self._gauges["node_mem"], value=mem_usage, tags={"ip": ip})
# -- Disk per node --
used, free = 0, 0
for entry in stats["disk"].values():
used += entry.used
free += entry.free
disk_utilization = float(used / (used + free)) * 100
disk_usage_record = Record(
gauge=self._gauges["node_disk_usage"], value=used, tags={"ip": ip})
disk_utilization_percentage_record = Record(
gauge=self._gauges["node_disk_utilization_percentage"],
value=disk_utilization,
tags={"ip": ip})
# -- Network total (sent/received) stats per node --
network_stats = stats["network"]
network_sent_record = Record(
gauge=self._gauges["node_network_sent"],
value=network_stats[0],
tags={"ip": ip})
network_received_record = Record(
gauge=self._gauges["node_network_received"],
value=network_stats[1],
tags={"ip": ip})
# -- Network speed (send/receive) per node --
network_speed_stats = stats["network_speed"]
network_send_speed_record = Record(
gauge=self._gauges["node_network_send_speed"],
value=network_speed_stats[0],
tags={"ip": ip})
network_receive_speed_record = Record(
gauge=self._gauges["node_network_receive_speed"],
value=network_speed_stats[1],
tags={"ip": ip})
raylet_stats = self._get_raylet_stats()
raylet_pid = str(raylet_stats["pid"])
# -- raylet CPU --
@@ -290,8 +346,12 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
"pid": raylet_pid
})
self._metrics_agent.record_reporter_stats(
[cpu_record, mem_record, raylet_cpu_record, raylet_mem_record])
self._metrics_agent.record_reporter_stats([
cpu_record, mem_record, disk_usage_record,
disk_utilization_percentage_record, network_sent_record,
network_received_record, network_send_speed_record,
network_receive_speed_record, raylet_cpu_record, raylet_mem_record
])
async def _perform_iteration(self, aioredis_client):
"""Get any changes to the log files and push updates to Redis."""
@@ -105,7 +105,13 @@ def test_prometheus_physical_stats_record(enable_test_module, shutdown_only):
prom_addresses)
return all([
"ray_node_cpu" in metric_names, "ray_node_mem" in metric_names,
"ray_raylet_cpu" in metric_names, "ray_raylet_mem" in metric_names
"ray_raylet_cpu" in metric_names, "ray_raylet_mem" in metric_names,
"ray_node_disk_usage" in metric_names,
"ray_node_disk_utilization_percentage" in metric_names,
"ray_node_network_sent" in metric_names,
"ray_node_network_received" in metric_names,
"ray_node_network_send_speed" in metric_names,
"ray_node_network_receive_speed" in metric_names
])
def test_case_ip_correct():