[autoscaler/dashboard] Publish resource usage in units of bytes (#14002)

2026-06-27 20:06:31 +08:00 · 2021-02-09 10:27:26 -08:00
parent 43083b9653
commit 1dcdfe9101
3 changed files with 42 additions and 18 deletions
@@ -5,6 +5,7 @@ import time
 from typing import Dict, List

 import numpy as np
+import ray.ray_constants
 import ray._private.services as services
 from ray.autoscaler._private.constants import MEMORY_RESOURCE_UNIT_BYTES,\
    AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE
@@ -212,8 +213,15 @@ class LoadMetrics:
                                 ) if self.static_resources_by_ip else {}
        usage_dict = {}
        for key in total_resources:
-            total = total_resources[key]
-            usage_dict[key] = (total - available_resources[key], total)
+            if key in ["memory", "object_store_memory"]:
+                total = total_resources[key] * \
+                    ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES
+                available = available_resources[key] * \
+                    ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES
+                usage_dict[key] = (total - available, total)
+            else:
+                total = total_resources[key]
+                usage_dict[key] = (total - available_resources[key], total)

        summarized_demand_vector = freq_of_dicts(
            self.get_resource_demand_vector(clip=False))
@@ -313,12 +313,12 @@ def format_pg(pg):

 def get_usage_report(lm_summary) -> str:
    usage_lines = []
-    for resource, (used, total) in lm_summary.usage.items():
+    for resource, (used, total) in sorted(lm_summary.usage.items()):
        if "node:" in resource:
            continue  # Skip the auto-added per-node "node:<ip>" resource.
        line = f" {used}/{total} {resource}"
        if resource in ["memory", "object_store_memory"]:
-            to_GiB = ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES / 2**30
+            to_GiB = 1 / 2**30
            used *= to_GiB
            total *= to_GiB
            line = f" {used:.2f}/{total:.3f} GiB {resource}"
@@ -8,6 +8,7 @@ import unittest
 import copy

 import ray
+import ray.ray_constants
 from ray.autoscaler._private.util import \
    rewrite_legacy_yaml_to_available_node_types, format_info_string, \
    format_info_string_no_node_types
@@ -1215,15 +1216,27 @@ class LoadMetricsTest(unittest.TestCase):
                strategy=PlacementStrategy.PACK,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
        ]
-        lm.update("1.1.1.1", {"CPU": 64}, {"CPU": 2}, {})
+        lm.update(
+            "1.1.1.1",
+            {
+                "CPU": 64,
+                "memory": 20,  # 1000 MiB
+                "object_store_memory": 40  # 2000 MiB
+            },
+            {
+                "CPU": 2,
+                "memory": 10,  # 500 MiB
+                "object_store_memory": 20  # 1000 MiB
+            },
+            {})
        lm.update("1.1.1.2", {
            "CPU": 64,
            "GPU": 8,
-            "accelerator_type:V100": 1
+            "accelerator_type:V100": 1,
        }, {
            "CPU": 0,
            "GPU": 1,
-            "accelerator_type:V100": 1
+            "accelerator_type:V100": 1,
        }, {})
        lm.update("1.1.1.3", {
            "CPU": 64,
@@ -1257,6 +1270,9 @@ class LoadMetricsTest(unittest.TestCase):

        assert summary.usage["CPU"] == (190, 194)
        assert summary.usage["GPU"] == (15, 16)
+        assert summary.usage["memory"] == (500 * 2**20, 1000 * 2**20)
+        assert summary.usage["object_store_memory"] == \
+            (1000 * 2**20, 2000 * 2**20)
        assert summary.usage["accelerator_type:V100"][1] == 2, \
            "Not comparing the usage value due to floating point error."

@@ -1280,7 +1296,7 @@ class LoadMetricsTest(unittest.TestCase):
        # TODO (Alex): This set of nodes won't be very useful in practice
        # because the node:xxx.xxx.xxx.xxx resources means that no 2 nodes
        # should ever have the same set of resources.
-        assert len(summary.node_types) == 3
+        assert len(summary.node_types) == 3, summary.node_types


 class AutoscalingTest(unittest.TestCase):
@@ -2413,8 +2429,8 @@ def test_info_string():
            "CPU": (530, 544),
            "GPU": (2, 2),
            "AcceleratorType:V100": (0, 2),
-            "memory": (0, 1583.19),
-            "object_store_memory": (0, 471.02)
+            "memory": (2 * 2**30, 2**33),
+            "object_store_memory": (3.14 * 2**30, 2**34)
        },
        resource_demand=[({
            "CPU": 1
@@ -2457,11 +2473,11 @@ Resources
 --------------------------------------------------------

 Usage:
+ 0/2 AcceleratorType:V100
 530/544 CPU
 2/2 GPU
- 0/2 AcceleratorType:V100
- 0.00/77.304 GiB memory
- 0.00/22.999 GiB object_store_memory
+ 2.00/8.000 GiB memory
+ 3.14/16.000 GiB object_store_memory

 Demands:
 {'CPU': 1}: 150+ pending tasks/actors
@@ -2484,8 +2500,8 @@ def test_info_string_no_node_type():
            "CPU": (530, 544),
            "GPU": (2, 2),
            "AcceleratorType:V100": (0, 2),
-            "memory": (0, 1583.19),
-            "object_store_memory": (0, 471.02)
+            "memory": (2 * 2**30, 2**33),
+            "object_store_memory": (3.14 * 2**30, 2**34)
        },
        resource_demand=[({
            "CPU": 1
@@ -2512,11 +2528,11 @@ Node status
 Resources
 -----------------------------------------------------
 Usage:
+ 0/2 AcceleratorType:V100
 530/544 CPU
 2/2 GPU
- 0/2 AcceleratorType:V100
- 0.00/77.304 GiB memory
- 0.00/22.999 GiB object_store_memory
+ 2.00/8.000 GiB memory
+ 3.14/16.000 GiB object_store_memory

 Demands:
 {'CPU': 1}: 150+ pending tasks/actors