[autoscaler] Avoid launching GPU nodes when the workload only has CPU tasks. (#13776)

* wip

* avoid gpus

* update

* update
This commit is contained in:
Eric Liang
2021-01-29 09:50:28 -08:00
committed by GitHub
parent 4d6817c683
commit b20a38febb
3 changed files with 48 additions and 5 deletions
@@ -15,6 +15,9 @@ def env_integer(key, default):
# Whether event logging to driver is enabled. Set to 0 to disable.
AUTOSCALER_EVENTS = env_integer("AUTOSCALER_EVENTS", 1)
# Whether to avoid launching GPU nodes for CPU-only tasks. Defaults to 1
# (enabled); a value of 0 is falsy and turns the behavior off.
AUTOSCALER_CONSERVE_GPU_NODES = env_integer("AUTOSCALER_CONSERVE_GPU_NODES", 1)
# How long to wait for a node to start, in seconds (default 900).
NODE_START_WAIT_S = env_integer("AUTOSCALER_NODE_START_WAIT_S", 900)
@@ -17,6 +17,7 @@ from typing import List, Dict
from ray.autoscaler.node_provider import NodeProvider
from ray.gcs_utils import PlacementGroupTableData
from ray.core.generated.common_pb2 import PlacementStrategy
from ray.autoscaler._private.constants import AUTOSCALER_CONSERVE_GPU_NODES
from ray.autoscaler.tags import (
TAG_RAY_USER_NODE_TYPE, NODE_KIND_UNMANAGED, NODE_TYPE_LEGACY_WORKER,
NODE_KIND_WORKER, NODE_TYPE_LEGACY_HEAD, TAG_RAY_NODE_KIND, NODE_KIND_HEAD)
@@ -639,7 +640,7 @@ def get_nodes_for(node_types: Dict[NodeType, NodeTypeConfigDict],
# resources. This will behave properly with the current utilization
# score heuristic, but it's a little dangerous and misleading.
logger.warning(
f"The autoscaler could not find a node type to satisfy the"
f"The autoscaler could not find a node type to satisfy the "
f"request: {resources}. If this request is related to "
f"placement groups the resource request will resolve itself, "
f"otherwise please specify a node type with the necessary "
@@ -664,8 +665,16 @@ def get_nodes_for(node_types: Dict[NodeType, NodeTypeConfigDict],
def _utilization_score(node_resources: ResourceDict,
resources: ResourceDict) -> float:
resources: List[ResourceDict]) -> float:
remaining = copy.deepcopy(node_resources)
is_gpu_node = "GPU" in node_resources
any_gpu_task = any("GPU" in r for r in resources)
# Avoid launching GPU nodes if there aren't any GPU tasks at all. Note that
# if there *is* a GPU task, then CPU tasks can be scheduled as well.
if AUTOSCALER_CONSERVE_GPU_NODES:
if is_gpu_node and not any_gpu_task:
return None
fittable = []
for r in resources:
@@ -105,6 +105,14 @@ def test_util_score():
(8, 8)
def test_gpu_node_util_score():
    """Check how a GPU node is scored against CPU-only and GPU demands."""
    # A demand list without any GPU request yields no score for a GPU node.
    cpu_only_score = _utilization_score({"GPU": 1, "CPU": 1}, [{"CPU": 1}])
    assert cpu_only_score is None

    # A single request using both the CPU and the GPU scores (1.0, 1.0).
    mixed_score = _utilization_score(
        {"GPU": 1, "CPU": 1}, [{"CPU": 1, "GPU": 1}])
    assert mixed_score == (1.0, 1.0)

    # A GPU-only request leaves the CPU unused and scores (0.0, 0.5).
    gpu_only_score = _utilization_score({"GPU": 1, "CPU": 1}, [{"GPU": 1}])
    assert gpu_only_score == (0.0, 0.5)
def test_bin_pack():
assert get_bin_pack_residual([], [{"GPU": 2}, {"GPU": 2}])[0] == \
[{"GPU": 2}, {"GPU": 2}]
@@ -247,6 +255,32 @@ def test_get_nodes_packing_heuristic():
}
def test_gpu_node_avoid_cpu_task():
    """Verify GPU nodes are only requested when a GPU task is pending."""
    node_types = {
        "cpu": {"resources": {"CPU": 1}, "max_workers": 10},
        "gpu": {"resources": {"GPU": 1, "CPU": 100}, "max_workers": 10},
    }

    # CPU-only demand: only "cpu" nodes are expected, even though a single
    # "gpu" node has enough CPUs for every task.
    cpu_only_demand = [{"CPU": 1}] * 100
    launched = get_nodes_for(
        node_types, {}, "empty_node", 100, cpu_only_demand)
    assert launched == {"cpu": 10}

    # One GPU task in the mix: a single "gpu" node is expected for all of it.
    one_gpu_demand = [{"GPU": 1}] + [{"CPU": 1}] * 100
    launched = get_nodes_for(
        node_types, {}, "empty_node", 100, one_gpu_demand)
    assert launched == {"gpu": 1}

    # Four GPU tasks plus 404 CPU tasks: four "gpu" nodes and four "cpu"
    # nodes are expected.
    overflow_demand = [{"GPU": 1}] * 4 + [{"CPU": 1}] * 404
    launched = get_nodes_for(
        node_types, {}, "empty_node", 100, overflow_demand)
    assert launched == {"gpu": 4, "cpu": 4}
def test_get_nodes_respects_max_limit():
types = {
"m4.large": {
@@ -2029,7 +2063,6 @@ class AutoscalingTest(unittest.TestCase):
"node_config": {},
"resources": {
"CPU": 2,
"GPU": 1,
"WORKER": 1
},
"max_workers": 3
@@ -2146,7 +2179,6 @@ class AutoscalingTest(unittest.TestCase):
"node_config": {},
"resources": {
"CPU": 2,
"GPU": 1,
"WORKER": 1
},
"max_workers": 3,
@@ -2260,7 +2292,6 @@ class AutoscalingTest(unittest.TestCase):
"node_config": {},
"resources": {
"CPU": 2,
"GPU": 1,
"WORKER": 1
},
"max_workers": 3,