mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 19:16:19 +08:00
[autoscaler] Avoid launching GPU nodes when the workload only has CPU tasks. (#13776)
* wip * avoid gpus * update * update
This commit is contained in:
@@ -15,6 +15,9 @@ def env_integer(key, default):
|
||||
# Whether event logging to driver is enabled. Set to 0 to disable.
|
||||
AUTOSCALER_EVENTS = env_integer("AUTOSCALER_EVENTS", 1)
|
||||
|
||||
# Whether to avoid launching GPU nodes for CPU-only tasks. Set to 0 to disable.
|
||||
AUTOSCALER_CONSERVE_GPU_NODES = env_integer("AUTOSCALER_CONSERVE_GPU_NODES", 1)
|
||||
|
||||
# How long to wait for a node to start, in seconds
|
||||
NODE_START_WAIT_S = env_integer("AUTOSCALER_NODE_START_WAIT_S", 900)
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ from typing import List, Dict
|
||||
from ray.autoscaler.node_provider import NodeProvider
|
||||
from ray.gcs_utils import PlacementGroupTableData
|
||||
from ray.core.generated.common_pb2 import PlacementStrategy
|
||||
from ray.autoscaler._private.constants import AUTOSCALER_CONSERVE_GPU_NODES
|
||||
from ray.autoscaler.tags import (
|
||||
TAG_RAY_USER_NODE_TYPE, NODE_KIND_UNMANAGED, NODE_TYPE_LEGACY_WORKER,
|
||||
NODE_KIND_WORKER, NODE_TYPE_LEGACY_HEAD, TAG_RAY_NODE_KIND, NODE_KIND_HEAD)
|
||||
@@ -639,7 +640,7 @@ def get_nodes_for(node_types: Dict[NodeType, NodeTypeConfigDict],
|
||||
# resources. This will behave properly with the current utilization
|
||||
# score heuristic, but it's a little dangerous and misleading.
|
||||
logger.warning(
|
||||
f"The autoscaler could not find a node type to satisfy the"
|
||||
f"The autoscaler could not find a node type to satisfy the "
|
||||
f"request: {resources}. If this request is related to "
|
||||
f"placement groups the resource request will resolve itself, "
|
||||
f"otherwise please specify a node type with the necessary "
|
||||
@@ -664,8 +665,16 @@ def get_nodes_for(node_types: Dict[NodeType, NodeTypeConfigDict],
|
||||
|
||||
|
||||
def _utilization_score(node_resources: ResourceDict,
|
||||
resources: ResourceDict) -> float:
|
||||
resources: List[ResourceDict]) -> float:
|
||||
remaining = copy.deepcopy(node_resources)
|
||||
is_gpu_node = "GPU" in node_resources
|
||||
any_gpu_task = any("GPU" in r for r in resources)
|
||||
|
||||
# Avoid launching GPU nodes if there aren't any GPU tasks at all. Note that
|
||||
# if there *is* a GPU task, then CPU tasks can be scheduled as well.
|
||||
if AUTOSCALER_CONSERVE_GPU_NODES:
|
||||
if is_gpu_node and not any_gpu_task:
|
||||
return None
|
||||
|
||||
fittable = []
|
||||
for r in resources:
|
||||
|
||||
@@ -105,6 +105,14 @@ def test_util_score():
|
||||
(8, 8)
|
||||
|
||||
|
||||
def test_gpu_node_util_score():
    """Check `_utilization_score` for a node offering one GPU and one CPU."""

    def gpu_node():
        # Build a fresh resource dict for every call, as the inline
        # literals in each assertion would.
        return {"GPU": 1, "CPU": 1}

    # A CPU-only request list gets no score at all on a GPU node.
    cpu_only_score = _utilization_score(gpu_node(), [{"CPU": 1}])
    assert cpu_only_score is None

    # A request using both the CPU and the GPU scores (1.0, 1.0).
    cpu_and_gpu_score = _utilization_score(gpu_node(),
                                           [{"CPU": 1, "GPU": 1}])
    assert cpu_and_gpu_score == (1.0, 1.0)

    # A GPU-only request leaves the CPU unused and scores (0.0, 0.5).
    gpu_only_score = _utilization_score(gpu_node(), [{"GPU": 1}])
    assert gpu_only_score == (0.0, 0.5)
|
||||
|
||||
|
||||
def test_bin_pack():
|
||||
assert get_bin_pack_residual([], [{"GPU": 2}, {"GPU": 2}])[0] == \
|
||||
[{"GPU": 2}, {"GPU": 2}]
|
||||
@@ -247,6 +255,32 @@ def test_get_nodes_packing_heuristic():
|
||||
}
|
||||
|
||||
|
||||
def test_gpu_node_avoid_cpu_task():
    """Check which node types `get_nodes_for` picks for CPU/GPU demand mixes.

    Two node types are offered: "cpu" (1 CPU) and "gpu" (1 GPU, 100 CPUs),
    each capped at 10 workers.
    """
    cpu_node_type = {
        "resources": {
            "CPU": 1
        },
        "max_workers": 10,
    }
    gpu_node_type = {
        "resources": {
            "GPU": 1,
            "CPU": 100,
        },
        "max_workers": 10,
    }
    node_types = {"cpu": cpu_node_type, "gpu": gpu_node_type}

    # Only CPU demand: just "cpu" nodes are expected, up to max_workers.
    cpu_only_demand = [{"CPU": 1}] * 100
    launched = get_nodes_for(node_types, {}, "empty_node", 100,
                             cpu_only_demand)
    assert launched == {"cpu": 10}

    # One GPU demand plus 100 CPU demands: a single "gpu" node is expected.
    one_gpu_demand = [{"GPU": 1}] + [{"CPU": 1}] * 100
    launched = get_nodes_for(node_types, {}, "empty_node", 100,
                             one_gpu_demand)
    assert launched == {"gpu": 1}

    # Four GPU demands plus 404 CPU demands: four "gpu" nodes and four
    # "cpu" nodes are expected.
    mixed_demand = [{"GPU": 1}] * 4 + [{"CPU": 1}] * 404
    launched = get_nodes_for(node_types, {}, "empty_node", 100,
                             mixed_demand)
    assert launched == {"gpu": 4, "cpu": 4}
|
||||
|
||||
|
||||
def test_get_nodes_respects_max_limit():
|
||||
types = {
|
||||
"m4.large": {
|
||||
@@ -2029,7 +2063,6 @@ class AutoscalingTest(unittest.TestCase):
|
||||
"node_config": {},
|
||||
"resources": {
|
||||
"CPU": 2,
|
||||
"GPU": 1,
|
||||
"WORKER": 1
|
||||
},
|
||||
"max_workers": 3
|
||||
@@ -2146,7 +2179,6 @@ class AutoscalingTest(unittest.TestCase):
|
||||
"node_config": {},
|
||||
"resources": {
|
||||
"CPU": 2,
|
||||
"GPU": 1,
|
||||
"WORKER": 1
|
||||
},
|
||||
"max_workers": 3,
|
||||
@@ -2260,7 +2292,6 @@ class AutoscalingTest(unittest.TestCase):
|
||||
"node_config": {},
|
||||
"resources": {
|
||||
"CPU": 2,
|
||||
"GPU": 1,
|
||||
"WORKER": 1
|
||||
},
|
||||
"max_workers": 3,
|
||||
|
||||
Reference in New Issue
Block a user