[Autoscaler] Fix utilization calc (#10728)

This commit is contained in:
Alex Wu
2020-09-11 12:41:08 -07:00
committed by GitHub
parent 3eed3eca09
commit 1ff56765f0
2 changed files with 10 additions and 7 deletions
@@ -88,16 +88,18 @@ class ResourceDemandScheduler:
node_resources = []
node_type_counts = collections.defaultdict(int)
def add_node(node_type, existing_resource_usages=None):
def add_node(node_type, available_resources=None):
if node_type not in self.node_types:
raise RuntimeError("Missing entry for node_type {} in "
"available_node_types config: {}".format(
node_type, self.node_types))
# Careful not to include the same dict object multiple times.
available = copy.deepcopy(self.node_types[node_type]["resources"])
if existing_resource_usages:
for resource, used in existing_resource_usages.items():
available[resource] -= used
# If available_resources is None, this might be because the node is
# no longer pending but the raylet hasn't sent a heartbeat to the GCS
# yet.
if available_resources is not None:
available = copy.deepcopy(available_resources)
node_resources.append(available)
node_type_counts[node_type] += 1
@@ -106,9 +108,10 @@ class ResourceDemandScheduler:
tags = self.provider.node_tags(node_id)
if TAG_RAY_USER_NODE_TYPE in tags:
node_type = tags[TAG_RAY_USER_NODE_TYPE]
node_type_counts[node_type] += 1
ip = self.provider.internal_ip(node_id)
resources = usage_by_ip.get(ip, {})
add_node(node_type, resources)
available_resources = usage_by_ip.get(ip)
add_node(node_type, available_resources)
for node_type, count in pending_nodes.items():
for _ in range(count):
@@ -307,7 +307,7 @@ class AutoscalingTest(unittest.TestCase):
head_ip, {
"CPU": 4,
"GPU": 1
}, {"GPU": 1}, {},
}, {"GPU": 0}, {},
waiting_bundles=[{
"GPU": 1
}])