From e56f2b85869ea4cecd2187e4a1ad84eedfbd2371 Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Fri, 18 Sep 2020 13:39:00 -0700 Subject: [PATCH] [autoscaler] hotfix calculate_node_resources (#10874) --- .../autoscaler/resource_demand_scheduler.py | 2 +- .../tests/test_resource_demand_scheduler.py | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/python/ray/autoscaler/resource_demand_scheduler.py b/python/ray/autoscaler/resource_demand_scheduler.py index 00acd2f01..3b3bd20ba 100644 --- a/python/ray/autoscaler/resource_demand_scheduler.py +++ b/python/ray/autoscaler/resource_demand_scheduler.py @@ -56,6 +56,7 @@ class ResourceDemandScheduler: nodes: List of existing nodes in the cluster. pending_nodes: Summary of node types currently being launched. resource_demands: Vector of resource demands from the scheduler. + usage_by_ip: Mapping from ip to available resources. """ if resource_demands is None: @@ -113,7 +114,6 @@ class ResourceDemandScheduler: tags = self.provider.node_tags(node_id) if TAG_RAY_USER_NODE_TYPE in tags: node_type = tags[TAG_RAY_USER_NODE_TYPE] - node_type_counts[node_type] += 1 ip = self.provider.internal_ip(node_id) available_resources = usage_by_ip.get(ip) add_node(node_type, available_resources) diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index dea49e821..f6200b0e5 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -180,6 +180,27 @@ def test_get_nodes_to_launch_limits(): assert to_launch == [] +def test_calculate_node_resources(): + provider = MockProvider() + scheduler = ResourceDemandScheduler(provider, TYPES_A, 10) + + provider.create_node({}, {TAG_RAY_USER_NODE_TYPE: "p2.8xlarge"}, 2) + + nodes = provider.non_terminated_nodes({}) + + ips = provider.non_terminated_node_ips({}) + # 2 free p2.8xls + utilizations = {ip: {"GPU": 8} for ip in ips} + # 1 more on the way + pending_nodes = {"p2.8xlarge": 1} + # requires 4 p2.8xls (only 3 are in cluster/pending) + demands = [{"GPU": 8}] * (len(utilizations) + 2) + to_launch = scheduler.get_nodes_to_launch(nodes, pending_nodes, demands, + utilizations) + + assert to_launch == [("p2.8xlarge", 1)] + + class LoadMetricsTest(unittest.TestCase): def testResourceDemandVector(self): lm = LoadMetrics()