diff --git a/dashboard/modules/reporter/reporter_agent.py b/dashboard/modules/reporter/reporter_agent.py
index 3d9472a3d..e604f7463 100644
--- a/dashboard/modules/reporter/reporter_agent.py
+++ b/dashboard/modules/reporter/reporter_agent.py
@@ -77,7 +77,25 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
             "node_cpu": Gauge("node_cpu", "Total CPU usage on a ray node",
                               "percentage", ["ip"]),
             "node_mem": Gauge("node_mem", "Total memory usage on a ray node",
-                              "mb", ["ip"]),
+                              "bytes", ["ip"]),
+            "node_disk_usage": Gauge("node_disk_usage",
+                                     "Total disk usage (bytes) on a ray node",
+                                     "bytes", ["ip"]),
+            "node_disk_utilization_percentage": Gauge(
+                "node_disk_utilization_percentage",
+                "Total disk utilization (percentage) on a ray node",
+                "percentage", ["ip"]),
+            "node_network_sent": Gauge("node_network_sent",
+                                       "Total network sent", "bytes", ["ip"]),
+            "node_network_received": Gauge("node_network_received",
+                                           "Total network received", "bytes",
+                                           ["ip"]),
+            "node_network_send_speed": Gauge("node_network_send_speed",
+                                             "Network send speed", "bytes/sec",
+                                             ["ip"]),
+            "node_network_receive_speed": Gauge("node_network_receive_speed",
+                                                "Network receive speed",
+                                                "bytes/sec", ["ip"]),
             "raylet_cpu": Gauge("raylet_cpu",
                                 "CPU usage of the raylet on a node.",
                                 "percentage", ["ip", "pid"]),
@@ -237,8 +255,10 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
         self._network_stats_hist.append((now, network_stats))
         self._network_stats_hist = self._network_stats_hist[-7:]
         then, prev_network_stats = self._network_stats_hist[0]
-        netstats = ((network_stats[0] - prev_network_stats[0]) / (now - then),
-                    (network_stats[1] - prev_network_stats[1]) / (now - then))
+        prev_send, prev_recv = prev_network_stats
+        now_send, now_recv = network_stats
+        network_speed_stats = ((now_send - prev_send) / (now - then),
+                               (now_recv - prev_recv) / (now - then))
         return {
             "now": now,
             "hostname": self._hostname,
@@ -251,7 +271,8 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
             "loadAvg": self._get_load_avg(),
             "disk": self._get_disk_usage(),
             "gpus": self._get_gpu_usage(),
-            "net": netstats,
+            "network": network_stats,
+            "network_speed": network_speed_stats,
             "cmdline": self._get_raylet_cmdline(),
         }
 
@@ -264,10 +285,48 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
 
         # -- Mem per node --
         total, avail, _ = stats["mem"]
-        mem_usage = float(total - avail) / 1e6
+        mem_usage = float(total - avail)
         mem_record = Record(
             gauge=self._gauges["node_mem"], value=mem_usage, tags={"ip": ip})
 
+        # -- Disk per node --
+        used, free = 0, 0
+        for entry in stats["disk"].values():
+            used += entry.used
+            free += entry.free
+        # Guard the ratio: with no disk entries (or zero reported capacity)
+        # the denominator is 0 and the division would raise.
+        disk_total = used + free
+        disk_utilization = used / disk_total * 100 if disk_total else 0.0
+        disk_usage_record = Record(
+            gauge=self._gauges["node_disk_usage"], value=used, tags={"ip": ip})
+        disk_utilization_percentage_record = Record(
+            gauge=self._gauges["node_disk_utilization_percentage"],
+            value=disk_utilization,
+            tags={"ip": ip})
+
+        # -- Network stats (sent/received) per node --
+        network_stats = stats["network"]
+        network_sent_record = Record(
+            gauge=self._gauges["node_network_sent"],
+            value=network_stats[0],
+            tags={"ip": ip})
+        network_received_record = Record(
+            gauge=self._gauges["node_network_received"],
+            value=network_stats[1],
+            tags={"ip": ip})
+
+        # -- Network speed (send/receive) per node --
+        network_speed_stats = stats["network_speed"]
+        network_send_speed_record = Record(
+            gauge=self._gauges["node_network_send_speed"],
+            value=network_speed_stats[0],
+            tags={"ip": ip})
+        network_receive_speed_record = Record(
+            gauge=self._gauges["node_network_receive_speed"],
+            value=network_speed_stats[1],
+            tags={"ip": ip})
+
         raylet_stats = self._get_raylet_stats()
         raylet_pid = str(raylet_stats["pid"])
         # -- raylet CPU --
@@ -290,8 +349,12 @@ class ReporterAgent(dashboard_utils.DashboardAgentModule,
                 "pid": raylet_pid
             })
 
-        self._metrics_agent.record_reporter_stats(
-            [cpu_record, mem_record, raylet_cpu_record, raylet_mem_record])
+        self._metrics_agent.record_reporter_stats([
+            cpu_record, mem_record, disk_usage_record,
+            disk_utilization_percentage_record, network_sent_record,
+            network_received_record, network_send_speed_record,
+            network_receive_speed_record, raylet_cpu_record, raylet_mem_record
+        ])
 
     async def _perform_iteration(self, aioredis_client):
         """Get any changes to the log files and push updates to Redis."""
diff --git a/dashboard/modules/reporter/tests/test_reporter.py b/dashboard/modules/reporter/tests/test_reporter.py
index 001ea42a5..72617562f 100644
--- a/dashboard/modules/reporter/tests/test_reporter.py
+++ b/dashboard/modules/reporter/tests/test_reporter.py
@@ -105,7 +105,13 @@ def test_prometheus_physical_stats_record(enable_test_module, shutdown_only):
                 prom_addresses)
         return all([
             "ray_node_cpu" in metric_names, "ray_node_mem" in metric_names,
-            "ray_raylet_cpu" in metric_names, "ray_raylet_mem" in metric_names
+            "ray_raylet_cpu" in metric_names, "ray_raylet_mem" in metric_names,
+            "ray_node_disk_usage" in metric_names,
+            "ray_node_disk_utilization_percentage" in metric_names,
+            "ray_node_network_sent" in metric_names,
+            "ray_node_network_received" in metric_names,
+            "ray_node_network_send_speed" in metric_names,
+            "ray_node_network_receive_speed" in metric_names
         ])
 
     def test_case_ip_correct():