[k8s] Read gpu resources properly (#12942)

* Read gpu resources properly

* Comments and docstrings

* Comment formatting
This commit is contained in:
Gekho457
2020-12-18 01:32:12 -08:00
committed by GitHub
parent 426f8a8d15
commit bff50cfc37
2 changed files with 42 additions and 7 deletions
+4 -3
View File
@@ -280,9 +280,10 @@ def _bootstrap_config(config: Dict[str, Any],
f"Failed to autodetect node resources: {str(exc)}. "
"You can see full stack trace with higher verbosity.")
# NOTE: if `resources` field is missing, validate_config for non-AWS will
# fail (the schema error will ask the user to manually fill the resources)
# as we currently support autofilling resources for AWS instances only.
# NOTE: if `resources` field is missing, validate_config for providers
# other than AWS and Kubernetes will fail (the schema error will ask the
# user to manually fill the resources) as we currently support autofilling
# resources for AWS and Kubernetes only.
validate_config(config)
resolved_config = provider_cls.bootstrap_config(config)
@@ -60,6 +60,13 @@ def bootstrap_kubernetes(config):
def fillout_resources_kubernetes(config):
"""Fills CPU and GPU resources by reading pod spec of each available node
type.
For each node type and each of CPU/GPU, looks at container's resources
and limits, takes min of the two. The result is rounded up, as Ray does
not currently support fractional CPU.
"""
if "available_node_types" not in config:
return config["available_node_types"]
node_types = copy.deepcopy(config["available_node_types"])
@@ -96,20 +103,47 @@ def get_resource(container_resources, resource_name):
limit = _get_resource(
container_resources, resource_name, field_name="limits")
resource = min(request, limit)
# float("inf") value means the resource wasn't detected in either
# requests or limits
return 0 if resource == float("inf") else int(resource)
def _get_resource(container_resources, resource_name, field_name):
if (field_name in container_resources
and resource_name in container_resources[field_name]):
return _parse_resource(container_resources[field_name][resource_name])
else:
"""Returns the resource quantity.
The amount of resource is rounded up to nearest integer.
Returns float("inf") if the resource is not present.
Args:
container_resources (dict): Container's resource field.
resource_name (str): One of 'cpu' or 'gpu'.
field_name (str): One of 'requests' or 'limits'.
Returns:
Union[int, float]: Detected resource quantity.
"""
if field_name not in container_resources:
# No limit/resource field.
return float("inf")
resources = container_resources[field_name]
# Look for keys containing the resource_name. For example,
# the key 'nvidia.com/gpu' contains the key 'gpu'.
matching_keys = [key for key in resources if resource_name in key.lower()]
if len(matching_keys) == 0:
return float("inf")
if len(matching_keys) > 1:
# Should have only one match -- mostly relevant for gpu.
raise ValueError(f"Multiple {resource_name} types not supported.")
# E.g. 'nvidia.com/gpu' or 'cpu'.
resource_key = matching_keys.pop()
resource_quantity = resources[resource_key]
return _parse_resource(resource_quantity)
def _parse_resource(resource):
resource_str = str(resource)
if resource_str[-1] == "m":
# For example, '500m' rounds up to 1.
return math.ceil(int(resource_str[:-1]) / 1000)
else:
return int(resource_str)