[autoscaler/dashboard] Publish resource usage in units of bytes (#14002)

This commit is contained in:
Alex Wu
2021-02-09 10:27:26 -08:00
committed by GitHub
parent 43083b9653
commit 1dcdfe9101
3 changed files with 42 additions and 18 deletions
+10 -2
View File
@@ -5,6 +5,7 @@ import time
from typing import Dict, List
import numpy as np
import ray.ray_constants
import ray._private.services as services
from ray.autoscaler._private.constants import MEMORY_RESOURCE_UNIT_BYTES,\
AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE
@@ -212,8 +213,15 @@ class LoadMetrics:
) if self.static_resources_by_ip else {}
usage_dict = {}
for key in total_resources:
total = total_resources[key]
usage_dict[key] = (total - available_resources[key], total)
if key in ["memory", "object_store_memory"]:
total = total_resources[key] * \
ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES
available = available_resources[key] * \
ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES
usage_dict[key] = (total - available, total)
else:
total = total_resources[key]
usage_dict[key] = (total - available_resources[key], total)
summarized_demand_vector = freq_of_dicts(
self.get_resource_demand_vector(clip=False))
+2 -2
View File
@@ -313,12 +313,12 @@ def format_pg(pg):
def get_usage_report(lm_summary) -> str:
usage_lines = []
for resource, (used, total) in lm_summary.usage.items():
for resource, (used, total) in sorted(lm_summary.usage.items()):
if "node:" in resource:
continue # Skip the auto-added per-node "node:<ip>" resource.
line = f" {used}/{total} {resource}"
if resource in ["memory", "object_store_memory"]:
to_GiB = ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES / 2**30
to_GiB = 1 / 2**30
used *= to_GiB
total *= to_GiB
line = f" {used:.2f}/{total:.3f} GiB {resource}"
@@ -8,6 +8,7 @@ import unittest
import copy
import ray
import ray.ray_constants
from ray.autoscaler._private.util import \
rewrite_legacy_yaml_to_available_node_types, format_info_string, \
format_info_string_no_node_types
@@ -1215,15 +1216,27 @@ class LoadMetricsTest(unittest.TestCase):
strategy=PlacementStrategy.PACK,
bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
]
lm.update("1.1.1.1", {"CPU": 64}, {"CPU": 2}, {})
lm.update(
"1.1.1.1",
{
"CPU": 64,
"memory": 20, # 1000 MiB
"object_store_memory": 40 # 2000 MiB
},
{
"CPU": 2,
"memory": 10, # 500 MiB
"object_store_memory": 20 # 1000 MiB
},
{})
lm.update("1.1.1.2", {
"CPU": 64,
"GPU": 8,
"accelerator_type:V100": 1
"accelerator_type:V100": 1,
}, {
"CPU": 0,
"GPU": 1,
"accelerator_type:V100": 1
"accelerator_type:V100": 1,
}, {})
lm.update("1.1.1.3", {
"CPU": 64,
@@ -1257,6 +1270,9 @@ class LoadMetricsTest(unittest.TestCase):
assert summary.usage["CPU"] == (190, 194)
assert summary.usage["GPU"] == (15, 16)
assert summary.usage["memory"] == (500 * 2**20, 1000 * 2**20)
assert summary.usage["object_store_memory"] == \
(1000 * 2**20, 2000 * 2**20)
assert summary.usage["accelerator_type:V100"][1] == 2, \
"Not comparing the usage value due to floating point error."
@@ -1280,7 +1296,7 @@ class LoadMetricsTest(unittest.TestCase):
# TODO (Alex): This set of nodes won't be very useful in practice
# because the node:xxx.xxx.xxx.xxx resources mean that no two nodes
# should ever have the same set of resources.
assert len(summary.node_types) == 3
assert len(summary.node_types) == 3, summary.node_types
class AutoscalingTest(unittest.TestCase):
@@ -2413,8 +2429,8 @@ def test_info_string():
"CPU": (530, 544),
"GPU": (2, 2),
"AcceleratorType:V100": (0, 2),
"memory": (0, 1583.19),
"object_store_memory": (0, 471.02)
"memory": (2 * 2**30, 2**33),
"object_store_memory": (3.14 * 2**30, 2**34)
},
resource_demand=[({
"CPU": 1
@@ -2457,11 +2473,11 @@ Resources
--------------------------------------------------------
Usage:
0/2 AcceleratorType:V100
530/544 CPU
2/2 GPU
0/2 AcceleratorType:V100
0.00/77.304 GiB memory
0.00/22.999 GiB object_store_memory
2.00/8.000 GiB memory
3.14/16.000 GiB object_store_memory
Demands:
{'CPU': 1}: 150+ pending tasks/actors
@@ -2484,8 +2500,8 @@ def test_info_string_no_node_type():
"CPU": (530, 544),
"GPU": (2, 2),
"AcceleratorType:V100": (0, 2),
"memory": (0, 1583.19),
"object_store_memory": (0, 471.02)
"memory": (2 * 2**30, 2**33),
"object_store_memory": (3.14 * 2**30, 2**34)
},
resource_demand=[({
"CPU": 1
@@ -2512,11 +2528,11 @@ Node status
Resources
-----------------------------------------------------
Usage:
0/2 AcceleratorType:V100
530/544 CPU
2/2 GPU
0/2 AcceleratorType:V100
0.00/77.304 GiB memory
0.00/22.999 GiB object_store_memory
2.00/8.000 GiB memory
3.14/16.000 GiB object_store_memory
Demands:
{'CPU': 1}: 150+ pending tasks/actors