diff --git a/python/ray/autoscaler/_private/constants.py b/python/ray/autoscaler/_private/constants.py index 3fd3ec65e..2fbf6ec32 100644 --- a/python/ray/autoscaler/_private/constants.py +++ b/python/ray/autoscaler/_private/constants.py @@ -15,6 +15,9 @@ def env_integer(key, default): # Whether event logging to driver is enabled. Set to 0 to disable. AUTOSCALER_EVENTS = env_integer("AUTOSCALER_EVENTS", 1) +# Whether to avoid launching GPU nodes for CPU only tasks. +AUTOSCALER_CONSERVE_GPU_NODES = env_integer("AUTOSCALER_CONSERVE_GPU_NODES", 1) + # How long to wait for a node to start, in seconds NODE_START_WAIT_S = env_integer("AUTOSCALER_NODE_START_WAIT_S", 900) diff --git a/python/ray/autoscaler/_private/resource_demand_scheduler.py b/python/ray/autoscaler/_private/resource_demand_scheduler.py index 523fd7d2f..0a08e0579 100644 --- a/python/ray/autoscaler/_private/resource_demand_scheduler.py +++ b/python/ray/autoscaler/_private/resource_demand_scheduler.py @@ -17,6 +17,7 @@ from typing import List, Dict from ray.autoscaler.node_provider import NodeProvider from ray.gcs_utils import PlacementGroupTableData from ray.core.generated.common_pb2 import PlacementStrategy +from ray.autoscaler._private.constants import AUTOSCALER_CONSERVE_GPU_NODES from ray.autoscaler.tags import ( TAG_RAY_USER_NODE_TYPE, NODE_KIND_UNMANAGED, NODE_TYPE_LEGACY_WORKER, NODE_KIND_WORKER, NODE_TYPE_LEGACY_HEAD, TAG_RAY_NODE_KIND, NODE_KIND_HEAD) @@ -639,7 +640,7 @@ def get_nodes_for(node_types: Dict[NodeType, NodeTypeConfigDict], # resources. This will behave properly with the current utilization # score heuristic, but it's a little dangerous and misleading. logger.warning( - f"The autoscaler could not find a node type to satisfy the" + f"The autoscaler could not find a node type to satisfy the " f"request: {resources}. If this request is related to " f"placement groups the resource request will resolve itself, " f"otherwise please specify a node type with the necessary " @@ -664,8 +665,16 @@ def get_nodes_for(node_types: Dict[NodeType, NodeTypeConfigDict], def _utilization_score(node_resources: ResourceDict, - resources: ResourceDict) -> float: + resources: List[ResourceDict]) -> float: remaining = copy.deepcopy(node_resources) + is_gpu_node = "GPU" in node_resources + any_gpu_task = any("GPU" in r for r in resources) + + # Avoid launching GPU nodes if there aren't any GPU tasks at all. Note that + # if there *is* a GPU task, then CPU tasks can be scheduled as well. + if AUTOSCALER_CONSERVE_GPU_NODES: + if is_gpu_node and not any_gpu_task: + return None fittable = [] for r in resources: diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index 536cbe18b..977c2f2b8 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -105,6 +105,14 @@ def test_util_score(): (8, 8) +def test_gpu_node_util_score(): + # Avoid scheduling CPU tasks on GPU node. + assert _utilization_score({"GPU": 1, "CPU": 1}, [{"CPU": 1}]) is None + assert _utilization_score({"GPU": 1, "CPU": 1}, [{"CPU": 1, "GPU": 1}]) \ + == (1.0, 1.0) + assert _utilization_score({"GPU": 1, "CPU": 1}, [{"GPU": 1}]) == (0.0, 0.5) + + def test_bin_pack(): assert get_bin_pack_residual([], [{"GPU": 2}, {"GPU": 2}])[0] == \ [{"GPU": 2}, {"GPU": 2}] @@ -247,6 +255,32 @@ def test_get_nodes_packing_heuristic(): } +def test_gpu_node_avoid_cpu_task(): + types = { + "cpu": { + "resources": { + "CPU": 1 + }, + "max_workers": 10, + }, + "gpu": { + "resources": { + "GPU": 1, + "CPU": 100, + }, + "max_workers": 10, + }, + } + r1 = [{"CPU": 1}] * 100 + assert get_nodes_for(types, {}, "empty_node", 100, r1) == {"cpu": 10} + r2 = [{"GPU": 1}] + [{"CPU": 1}] * 100 + assert get_nodes_for(types, {}, "empty_node", 100, r2) == \ + {"gpu": 1} + r3 = [{"GPU": 1}] * 4 + [{"CPU": 1}] * 404 + assert get_nodes_for(types, {}, "empty_node", 100, r3) == \ + {"gpu": 4, "cpu": 4} + + def test_get_nodes_respects_max_limit(): types = { "m4.large": { @@ -2029,7 +2063,6 @@ class AutoscalingTest(unittest.TestCase): "node_config": {}, "resources": { "CPU": 2, - "GPU": 1, "WORKER": 1 }, "max_workers": 3 @@ -2146,7 +2179,6 @@ class AutoscalingTest(unittest.TestCase): "node_config": {}, "resources": { "CPU": 2, - "GPU": 1, "WORKER": 1 }, "max_workers": 3, @@ -2260,7 +2292,6 @@ class AutoscalingTest(unittest.TestCase): "node_config": {}, "resources": { "CPU": 2, - "GPU": 1, "WORKER": 1 }, "max_workers": 3,