mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 20:06:31 +08:00
[autoscaler/dashboard] Publish resource usage in units of bytes (#14002)
This commit is contained in:
@@ -5,6 +5,7 @@ import time
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
import ray.ray_constants
|
||||
import ray._private.services as services
|
||||
from ray.autoscaler._private.constants import MEMORY_RESOURCE_UNIT_BYTES,\
|
||||
AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE
|
||||
@@ -212,8 +213,15 @@ class LoadMetrics:
|
||||
) if self.static_resources_by_ip else {}
|
||||
usage_dict = {}
|
||||
for key in total_resources:
|
||||
total = total_resources[key]
|
||||
usage_dict[key] = (total - available_resources[key], total)
|
||||
if key in ["memory", "object_store_memory"]:
|
||||
total = total_resources[key] * \
|
||||
ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES
|
||||
available = available_resources[key] * \
|
||||
ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES
|
||||
usage_dict[key] = (total - available, total)
|
||||
else:
|
||||
total = total_resources[key]
|
||||
usage_dict[key] = (total - available_resources[key], total)
|
||||
|
||||
summarized_demand_vector = freq_of_dicts(
|
||||
self.get_resource_demand_vector(clip=False))
|
||||
|
||||
@@ -313,12 +313,12 @@ def format_pg(pg):
|
||||
|
||||
def get_usage_report(lm_summary) -> str:
|
||||
usage_lines = []
|
||||
for resource, (used, total) in lm_summary.usage.items():
|
||||
for resource, (used, total) in sorted(lm_summary.usage.items()):
|
||||
if "node:" in resource:
|
||||
continue # Skip the auto-added per-node "node:<ip>" resource.
|
||||
line = f" {used}/{total} {resource}"
|
||||
if resource in ["memory", "object_store_memory"]:
|
||||
to_GiB = ray.ray_constants.MEMORY_RESOURCE_UNIT_BYTES / 2**30
|
||||
to_GiB = 1 / 2**30
|
||||
used *= to_GiB
|
||||
total *= to_GiB
|
||||
line = f" {used:.2f}/{total:.3f} GiB {resource}"
|
||||
|
||||
@@ -8,6 +8,7 @@ import unittest
|
||||
import copy
|
||||
|
||||
import ray
|
||||
import ray.ray_constants
|
||||
from ray.autoscaler._private.util import \
|
||||
rewrite_legacy_yaml_to_available_node_types, format_info_string, \
|
||||
format_info_string_no_node_types
|
||||
@@ -1215,15 +1216,27 @@ class LoadMetricsTest(unittest.TestCase):
|
||||
strategy=PlacementStrategy.PACK,
|
||||
bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
|
||||
]
|
||||
lm.update("1.1.1.1", {"CPU": 64}, {"CPU": 2}, {})
|
||||
lm.update(
|
||||
"1.1.1.1",
|
||||
{
|
||||
"CPU": 64,
|
||||
"memory": 20, # 1000 MiB
|
||||
"object_store_memory": 40 # 2000 MiB
|
||||
},
|
||||
{
|
||||
"CPU": 2,
|
||||
"memory": 10, # 500 MiB
|
||||
"object_store_memory": 20 # 1000 MiB
|
||||
},
|
||||
{})
|
||||
lm.update("1.1.1.2", {
|
||||
"CPU": 64,
|
||||
"GPU": 8,
|
||||
"accelerator_type:V100": 1
|
||||
"accelerator_type:V100": 1,
|
||||
}, {
|
||||
"CPU": 0,
|
||||
"GPU": 1,
|
||||
"accelerator_type:V100": 1
|
||||
"accelerator_type:V100": 1,
|
||||
}, {})
|
||||
lm.update("1.1.1.3", {
|
||||
"CPU": 64,
|
||||
@@ -1257,6 +1270,9 @@ class LoadMetricsTest(unittest.TestCase):
|
||||
|
||||
assert summary.usage["CPU"] == (190, 194)
|
||||
assert summary.usage["GPU"] == (15, 16)
|
||||
assert summary.usage["memory"] == (500 * 2**20, 1000 * 2**20)
|
||||
assert summary.usage["object_store_memory"] == \
|
||||
(1000 * 2**20, 2000 * 2**20)
|
||||
assert summary.usage["accelerator_type:V100"][1] == 2, \
|
||||
"Not comparing the usage value due to floating point error."
|
||||
|
||||
@@ -1280,7 +1296,7 @@ class LoadMetricsTest(unittest.TestCase):
|
||||
# TODO (Alex): This set of nodes won't be very useful in practice
|
||||
# because the node:xxx.xxx.xxx.xxx resources means that no 2 nodes
|
||||
# should ever have the same set of resources.
|
||||
assert len(summary.node_types) == 3
|
||||
assert len(summary.node_types) == 3, summary.node_types
|
||||
|
||||
|
||||
class AutoscalingTest(unittest.TestCase):
|
||||
@@ -2413,8 +2429,8 @@ def test_info_string():
|
||||
"CPU": (530, 544),
|
||||
"GPU": (2, 2),
|
||||
"AcceleratorType:V100": (0, 2),
|
||||
"memory": (0, 1583.19),
|
||||
"object_store_memory": (0, 471.02)
|
||||
"memory": (2 * 2**30, 2**33),
|
||||
"object_store_memory": (3.14 * 2**30, 2**34)
|
||||
},
|
||||
resource_demand=[({
|
||||
"CPU": 1
|
||||
@@ -2457,11 +2473,11 @@ Resources
|
||||
--------------------------------------------------------
|
||||
|
||||
Usage:
|
||||
0/2 AcceleratorType:V100
|
||||
530/544 CPU
|
||||
2/2 GPU
|
||||
0/2 AcceleratorType:V100
|
||||
0.00/77.304 GiB memory
|
||||
0.00/22.999 GiB object_store_memory
|
||||
2.00/8.000 GiB memory
|
||||
3.14/16.000 GiB object_store_memory
|
||||
|
||||
Demands:
|
||||
{'CPU': 1}: 150+ pending tasks/actors
|
||||
@@ -2484,8 +2500,8 @@ def test_info_string_no_node_type():
|
||||
"CPU": (530, 544),
|
||||
"GPU": (2, 2),
|
||||
"AcceleratorType:V100": (0, 2),
|
||||
"memory": (0, 1583.19),
|
||||
"object_store_memory": (0, 471.02)
|
||||
"memory": (2 * 2**30, 2**33),
|
||||
"object_store_memory": (3.14 * 2**30, 2**34)
|
||||
},
|
||||
resource_demand=[({
|
||||
"CPU": 1
|
||||
@@ -2512,11 +2528,11 @@ Node status
|
||||
Resources
|
||||
-----------------------------------------------------
|
||||
Usage:
|
||||
0/2 AcceleratorType:V100
|
||||
530/544 CPU
|
||||
2/2 GPU
|
||||
0/2 AcceleratorType:V100
|
||||
0.00/77.304 GiB memory
|
||||
0.00/22.999 GiB object_store_memory
|
||||
2.00/8.000 GiB memory
|
||||
3.14/16.000 GiB object_store_memory
|
||||
|
||||
Demands:
|
||||
{'CPU': 1}: 150+ pending tasks/actors
|
||||
|
||||
Reference in New Issue
Block a user