diff --git a/python/ray/autoscaler/_private/load_metrics.py b/python/ray/autoscaler/_private/load_metrics.py index bf9dc564b..09ea11238 100644 --- a/python/ray/autoscaler/_private/load_metrics.py +++ b/python/ray/autoscaler/_private/load_metrics.py @@ -5,6 +5,7 @@ import time from typing import Dict, List import numpy as np +import ray.ray_constants import ray._private.services as services from ray.autoscaler._private.constants import MEMORY_RESOURCE_UNIT_BYTES,\ AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE @@ -212,8 +213,15 @@ class LoadMetrics: ) if self.static_resources_by_ip else {} usage_dict = {} for key in total_resources: - total = total_resources[key] - usage_dict[key] = (total - available_resources[key], total) + if key in ["memory", "object_store_memory"]: + total = total_resources[key] * \ + ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES + available = available_resources[key] * \ + ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES + usage_dict[key] = (total - available, total) + else: + total = total_resources[key] + usage_dict[key] = (total - available_resources[key], total) summarized_demand_vector = freq_of_dicts( self.get_resource_demand_vector(clip=False)) diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py index 39ebd5e79..788da5cc2 100644 --- a/python/ray/autoscaler/_private/util.py +++ b/python/ray/autoscaler/_private/util.py @@ -313,12 +313,12 @@ def format_pg(pg): def get_usage_report(lm_summary) -> str: usage_lines = [] - for resource, (used, total) in lm_summary.usage.items(): + for resource, (used, total) in sorted(lm_summary.usage.items()): if "node:" in resource: continue # Skip the auto-added per-node "node:" resource. line = f" {used}/{total} {resource}" if resource in ["memory", "object_store_memory"]: - to_GiB = ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES / 2**30 + to_GiB = 1 / 2**30 used *= to_GiB total *= to_GiB line = f" {used:.2f}/{total:.3f} GiB {resource}" diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index 977c2f2b8..d753ffcab 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -8,6 +8,7 @@ import unittest import copy import ray +import ray.ray_constants from ray.autoscaler._private.util import \ rewrite_legacy_yaml_to_available_node_types, format_info_string, \ format_info_string_no_node_types @@ -1215,15 +1216,27 @@ class LoadMetricsTest(unittest.TestCase): strategy=PlacementStrategy.PACK, bundles=([Bundle(unit_resources={"GPU": 2})] * 2)), ] - lm.update("1.1.1.1", {"CPU": 64}, {"CPU": 2}, {}) + lm.update( + "1.1.1.1", + { + "CPU": 64, + "memory": 20, # 1000 MiB + "object_store_memory": 40 # 2000 MiB + }, + { + "CPU": 2, + "memory": 10, # 500 MiB + "object_store_memory": 20 # 1000 MiB + }, + {}) lm.update("1.1.1.2", { "CPU": 64, "GPU": 8, - "accelerator_type:V100": 1 + "accelerator_type:V100": 1, }, { "CPU": 0, "GPU": 1, - "accelerator_type:V100": 1 + "accelerator_type:V100": 1, }, {}) lm.update("1.1.1.3", { "CPU": 64, @@ -1257,6 +1270,9 @@ class LoadMetricsTest(unittest.TestCase): assert summary.usage["CPU"] == (190, 194) assert summary.usage["GPU"] == (15, 16) + assert summary.usage["memory"] == (500 * 2**20, 1000 * 2**20) + assert summary.usage["object_store_memory"] == \ + (1000 * 2**20, 2000 * 2**20) assert summary.usage["accelerator_type:V100"][1] == 2, \ "Not comparing the usage value due to floating point error." @@ -1280,7 +1296,7 @@ class LoadMetricsTest(unittest.TestCase): # TODO (Alex): This set of nodes won't be very useful in practice # because the node:xxx.xxx.xxx.xxx resources means that no 2 nodes # should ever have the same set of resources. - assert len(summary.node_types) == 3 + assert len(summary.node_types) == 3, summary.node_types class AutoscalingTest(unittest.TestCase): @@ -2413,8 +2429,8 @@ def test_info_string(): "CPU": (530, 544), "GPU": (2, 2), "AcceleratorType:V100": (0, 2), - "memory": (0, 1583.19), - "object_store_memory": (0, 471.02) + "memory": (2 * 2**30, 2**33), + "object_store_memory": (3.14 * 2**30, 2**34) }, resource_demand=[({ "CPU": 1 @@ -2457,11 +2473,11 @@ Resources -------------------------------------------------------- Usage: + 0/2 AcceleratorType:V100 530/544 CPU 2/2 GPU - 0/2 AcceleratorType:V100 - 0.00/77.304 GiB memory - 0.00/22.999 GiB object_store_memory + 2.00/8.000 GiB memory + 3.14/16.000 GiB object_store_memory Demands: {'CPU': 1}: 150+ pending tasks/actors @@ -2484,8 +2500,8 @@ def test_info_string_no_node_type(): "CPU": (530, 544), "GPU": (2, 2), "AcceleratorType:V100": (0, 2), - "memory": (0, 1583.19), - "object_store_memory": (0, 471.02) + "memory": (2 * 2**30, 2**33), + "object_store_memory": (3.14 * 2**30, 2**34) }, resource_demand=[({ "CPU": 1 @@ -2512,11 +2528,11 @@ Node status Resources ----------------------------------------------------- Usage: + 0/2 AcceleratorType:V100 530/544 CPU 2/2 GPU - 0/2 AcceleratorType:V100 - 0.00/77.304 GiB memory - 0.00/22.999 GiB object_store_memory + 2.00/8.000 GiB memory + 3.14/16.000 GiB object_store_memory Demands: {'CPU': 1}: 150+ pending tasks/actors