mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 16:31:25 +08:00
Export additional metrics to Prometheus (#14061)
This commit is contained in:
@@ -77,7 +77,25 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
|
||||
"node_cpu": Gauge("node_cpu", "Total CPU usage on a ray node",
|
||||
"percentage", ["ip"]),
|
||||
"node_mem": Gauge("node_mem", "Total memory usage on a ray node",
|
||||
"mb", ["ip"]),
|
||||
"bytes", ["ip"]),
|
||||
"node_disk_usage": Gauge("node_disk_usage",
|
||||
"Total disk usage (bytes) on a ray node",
|
||||
"bytes", ["ip"]),
|
||||
"node_disk_utilization_percentage": Gauge(
|
||||
"node_disk_utilization_percentage",
|
||||
"Total disk utilization (percentage) on a ray node",
|
||||
"percentage", ["ip"]),
|
||||
"node_network_sent": Gauge("node_network_sent",
|
||||
"Total network sent", "bytes", ["ip"]),
|
||||
"node_network_received": Gauge("node_network_received",
|
||||
"Total network received", "bytes",
|
||||
["ip"]),
|
||||
"node_network_send_speed": Gauge("node_network_send_speed",
|
||||
"Network send speed", "bytes/sec",
|
||||
["ip"]),
|
||||
"node_network_receive_speed": Gauge("node_network_receive_speed",
|
||||
"Network receive speed",
|
||||
"bytes/sec", ["ip"]),
|
||||
"raylet_cpu": Gauge("raylet_cpu",
|
||||
"CPU usage of the raylet on a node.",
|
||||
"percentage", ["ip", "pid"]),
|
||||
@@ -237,8 +255,10 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
|
||||
self._network_stats_hist.append((now, network_stats))
|
||||
self._network_stats_hist = self._network_stats_hist[-7:]
|
||||
then, prev_network_stats = self._network_stats_hist[0]
|
||||
netstats = ((network_stats[0] - prev_network_stats[0]) / (now - then),
|
||||
(network_stats[1] - prev_network_stats[1]) / (now - then))
|
||||
prev_send, prev_recv = prev_network_stats
|
||||
now_send, now_recv = network_stats
|
||||
network_speed_stats = ((now_send - prev_send) / (now - then),
|
||||
(now_recv - prev_recv) / (now - then))
|
||||
return {
|
||||
"now": now,
|
||||
"hostname": self._hostname,
|
||||
@@ -251,7 +271,8 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
|
||||
"loadAvg": self._get_load_avg(),
|
||||
"disk": self._get_disk_usage(),
|
||||
"gpus": self._get_gpu_usage(),
|
||||
"net": netstats,
|
||||
"network": network_stats,
|
||||
"network_speed": network_speed_stats,
|
||||
"cmdline": self._get_raylet_cmdline(),
|
||||
}
|
||||
|
||||
@@ -264,10 +285,45 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
|
||||
|
||||
# -- Mem per node --
|
||||
total, avail, _ = stats["mem"]
|
||||
mem_usage = float(total - avail) / 1e6
|
||||
mem_usage = float(total - avail)
|
||||
mem_record = Record(
|
||||
gauge=self._gauges["node_mem"], value=mem_usage, tags={"ip": ip})
|
||||
|
||||
# -- Disk per node --
|
||||
used, free = 0, 0
|
||||
for entry in stats["disk"].values():
|
||||
used += entry.used
|
||||
free += entry.free
|
||||
disk_utilization = float(used / (used + free)) * 100
|
||||
disk_usage_record = Record(
|
||||
gauge=self._gauges["node_disk_usage"], value=used, tags={"ip": ip})
|
||||
disk_utilization_percentage_record = Record(
|
||||
gauge=self._gauges["node_disk_utilization_percentage"],
|
||||
value=disk_utilization,
|
||||
tags={"ip": ip})
|
||||
|
||||
# -- Network totals (bytes sent/received) per node --
|
||||
network_stats = stats["network"]
|
||||
network_sent_record = Record(
|
||||
gauge=self._gauges["node_network_sent"],
|
||||
value=network_stats[0],
|
||||
tags={"ip": ip})
|
||||
network_received_record = Record(
|
||||
gauge=self._gauges["node_network_received"],
|
||||
value=network_stats[1],
|
||||
tags={"ip": ip})
|
||||
|
||||
# -- Network speed (send/receive) per node --
|
||||
network_speed_stats = stats["network_speed"]
|
||||
network_send_speed_record = Record(
|
||||
gauge=self._gauges["node_network_send_speed"],
|
||||
value=network_speed_stats[0],
|
||||
tags={"ip": ip})
|
||||
network_receive_speed_record = Record(
|
||||
gauge=self._gauges["node_network_receive_speed"],
|
||||
value=network_speed_stats[1],
|
||||
tags={"ip": ip})
|
||||
|
||||
raylet_stats = self._get_raylet_stats()
|
||||
raylet_pid = str(raylet_stats["pid"])
|
||||
# -- raylet CPU --
|
||||
@@ -290,8 +346,12 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
|
||||
"pid": raylet_pid
|
||||
})
|
||||
|
||||
self._metrics_agent.record_reporter_stats(
|
||||
[cpu_record, mem_record, raylet_cpu_record, raylet_mem_record])
|
||||
self._metrics_agent.record_reporter_stats([
|
||||
cpu_record, mem_record, disk_usage_record,
|
||||
disk_utilization_percentage_record, network_sent_record,
|
||||
network_received_record, network_send_speed_record,
|
||||
network_receive_speed_record, raylet_cpu_record, raylet_mem_record
|
||||
])
|
||||
|
||||
async def _perform_iteration(self, aioredis_client):
|
||||
"""Get any changes to the log files and push updates to Redis."""
|
||||
|
||||
@@ -105,7 +105,13 @@ def test_prometheus_physical_stats_record(enable_test_module, shutdown_only):
|
||||
prom_addresses)
|
||||
return all([
|
||||
"ray_node_cpu" in metric_names, "ray_node_mem" in metric_names,
|
||||
"ray_raylet_cpu" in metric_names, "ray_raylet_mem" in metric_names
|
||||
"ray_raylet_cpu" in metric_names, "ray_raylet_mem" in metric_names,
|
||||
"ray_node_disk_usage" in metric_names,
|
||||
"ray_node_disk_utilization_percentage" in metric_names,
|
||||
"ray_node_network_sent" in metric_names,
|
||||
"ray_node_network_received" in metric_names,
|
||||
"ray_node_network_send_speed" in metric_names,
|
||||
"ray_node_network_receive_speed" in metric_names
|
||||
])
|
||||
|
||||
def test_case_ip_correct():
|
||||
|
||||
Reference in New Issue
Block a user