[autoscaler] Avoid launching GPU nodes when the workload only has CPU tasks. (#13776)

* wip * avoid gpus * update * update
2026-06-27 19:16:19 +08:00 · 2021-01-29 09:50:28 -08:00
parent 4d6817c683
commit b20a38febb
3 changed files with 48 additions and 5 deletions
@@ -15,6 +15,9 @@ def env_integer(key, default):
 # Whether event logging to driver is enabled. Set to 0 to disable.
 AUTOSCALER_EVENTS = env_integer("AUTOSCALER_EVENTS", 1)

+# Whether to avoid launching GPU nodes for CPU-only tasks.
+AUTOSCALER_CONSERVE_GPU_NODES = env_integer("AUTOSCALER_CONSERVE_GPU_NODES", 1)
+
 # How long to wait for a node to start, in seconds
 NODE_START_WAIT_S = env_integer("AUTOSCALER_NODE_START_WAIT_S", 900)

@@ -17,6 +17,7 @@ from typing import List, Dict
 from ray.autoscaler.node_provider import NodeProvider
 from ray.gcs_utils import PlacementGroupTableData
 from ray.core.generated.common_pb2 import PlacementStrategy
+from ray.autoscaler._private.constants import AUTOSCALER_CONSERVE_GPU_NODES
 from ray.autoscaler.tags import (
    TAG_RAY_USER_NODE_TYPE, NODE_KIND_UNMANAGED, NODE_TYPE_LEGACY_WORKER,
    NODE_KIND_WORKER, NODE_TYPE_LEGACY_HEAD, TAG_RAY_NODE_KIND, NODE_KIND_HEAD)
@@ -639,7 +640,7 @@ def get_nodes_for(node_types: Dict[NodeType, NodeTypeConfigDict],
            # resources. This will behave properly with the current utilization
            # score heuristic, but it's a little dangerous and misleading.
            logger.warning(
-                f"The autoscaler could not find a node type to satisfy the"
+                f"The autoscaler could not find a node type to satisfy the "
                f"request: {resources}. If this request is related to "
                f"placement groups the resource request will resolve itself, "
                f"otherwise please specify a node type with the necessary "
@@ -664,8 +665,16 @@ def get_nodes_for(node_types: Dict[NodeType, NodeTypeConfigDict],


 def _utilization_score(node_resources: ResourceDict,
-                       resources: ResourceDict) -> float:
+                       resources: List[ResourceDict]) -> float:
    remaining = copy.deepcopy(node_resources)
+    is_gpu_node = "GPU" in node_resources
+    any_gpu_task = any("GPU" in r for r in resources)
+
+    # Avoid launching GPU nodes if there aren't any GPU tasks at all. Note that
+    # if there *is* a GPU task, then CPU tasks can be scheduled as well.
+    if AUTOSCALER_CONSERVE_GPU_NODES:
+        if is_gpu_node and not any_gpu_task:
+            return None

    fittable = []
    for r in resources:
@@ -105,6 +105,14 @@ def test_util_score():
        (8, 8)


+def test_gpu_node_util_score():
+    # A GPU node must score None when no request asks for a GPU.
+    assert _utilization_score({"GPU": 1, "CPU": 1}, [{"CPU": 1}]) is None
+    assert _utilization_score({"GPU": 1, "CPU": 1}, [{"CPU": 1, "GPU": 1}]) \
+        == (1.0, 1.0)
+    assert _utilization_score({"GPU": 1, "CPU": 1}, [{"GPU": 1}]) == (0.0, 0.5)
+
+
 def test_bin_pack():
    assert get_bin_pack_residual([], [{"GPU": 2}, {"GPU": 2}])[0] == \
        [{"GPU": 2}, {"GPU": 2}]
@@ -247,6 +255,32 @@ def test_get_nodes_packing_heuristic():
        }


+def test_gpu_node_avoid_cpu_task():
+    types = {
+        "cpu": {
+            "resources": {
+                "CPU": 1
+            },
+            "max_workers": 10,
+        },
+        "gpu": {
+            "resources": {
+                "GPU": 1,
+                "CPU": 100,
+            },
+            "max_workers": 10,
+        },
+    }
+    r1 = [{"CPU": 1}] * 100  # CPU-only demand
+    assert get_nodes_for(types, {}, "empty_node", 100, r1) == {"cpu": 10}
+    r2 = [{"GPU": 1}] + [{"CPU": 1}] * 100  # one GPU task plus 100 CPU tasks
+    assert get_nodes_for(types, {}, "empty_node", 100, r2) == \
+        {"gpu": 1}
+    r3 = [{"GPU": 1}] * 4 + [{"CPU": 1}] * 404  # 4 GPU tasks plus 404 CPU tasks
+    assert get_nodes_for(types, {}, "empty_node", 100, r3) == \
+        {"gpu": 4, "cpu": 4}
+
+
 def test_get_nodes_respects_max_limit():
    types = {
        "m4.large": {
@@ -2029,7 +2063,6 @@ class AutoscalingTest(unittest.TestCase):
                "node_config": {},
                "resources": {
                    "CPU": 2,
-                    "GPU": 1,
                    "WORKER": 1
                },
                "max_workers": 3
@@ -2146,7 +2179,6 @@ class AutoscalingTest(unittest.TestCase):
                "node_config": {},
                "resources": {
                    "CPU": 2,
-                    "GPU": 1,
                    "WORKER": 1
                },
                "max_workers": 3,
@@ -2260,7 +2292,6 @@ class AutoscalingTest(unittest.TestCase):
                "node_config": {},
                "resources": {
                    "CPU": 2,
-                    "GPU": 1,
                    "WORKER": 1
                },
                "max_workers": 3,