mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 21:53:18 +08:00
[k8s] Read gpu resources properly (#12942)
* Read gpu resources properly * Comments and docstrings * Comment formatting
This commit is contained in:
@@ -280,9 +280,10 @@ def _bootstrap_config(config: Dict[str, Any],
|
||||
f"Failed to autodetect node resources: {str(exc)}. "
|
||||
"You can see full stack trace with higher verbosity.")
|
||||
|
||||
# NOTE: if `resources` field is missing, validate_config for non-AWS will
|
||||
# fail (the schema error will ask the user to manually fill the resources)
|
||||
# as we currently support autofilling resources for AWS instances only.
|
||||
# NOTE: if `resources` field is missing, validate_config for providers
|
||||
# other than AWS and Kubernetes will fail (the schema error will ask the
|
||||
# user to manually fill the resources) as we currently support autofilling
|
||||
# resources for AWS and Kubernetes only.
|
||||
validate_config(config)
|
||||
resolved_config = provider_cls.bootstrap_config(config)
|
||||
|
||||
|
||||
@@ -60,6 +60,13 @@ def bootstrap_kubernetes(config):
|
||||
|
||||
|
||||
def fillout_resources_kubernetes(config):
|
||||
"""Fills CPU and GPU resources by reading pod spec of each available node
|
||||
type.
|
||||
|
||||
For each node type and each of CPU/GPU, looks at container's resources
|
||||
and limits, takes min of the two. The result is rounded up, as Ray does
|
||||
not currently support fractional CPU.
|
||||
"""
|
||||
if "available_node_types" not in config:
|
||||
return config["available_node_types"]
|
||||
node_types = copy.deepcopy(config["available_node_types"])
|
||||
@@ -96,20 +103,47 @@ def get_resource(container_resources, resource_name):
|
||||
limit = _get_resource(
|
||||
container_resources, resource_name, field_name="limits")
|
||||
resource = min(request, limit)
|
||||
# float("inf") value means the resource wasn't detected in either
|
||||
# requests or limits
|
||||
return 0 if resource == float("inf") else int(resource)
|
||||
|
||||
|
||||
def _get_resource(container_resources, resource_name, field_name):
|
||||
if (field_name in container_resources
|
||||
and resource_name in container_resources[field_name]):
|
||||
return _parse_resource(container_resources[field_name][resource_name])
|
||||
else:
|
||||
"""Returns the resource quantity.
|
||||
|
||||
The amount of resource is rounded up to nearest integer.
|
||||
Returns float("inf") if the resource is not present.
|
||||
|
||||
Args:
|
||||
container_resources (dict): Container's resource field.
|
||||
resource_name (str): One of 'cpu' or 'gpu'.
|
||||
field_name (str): One of 'requests' or 'limits'.
|
||||
|
||||
Returns:
|
||||
Union[int, float]: Detected resource quantity.
|
||||
"""
|
||||
if field_name not in container_resources:
|
||||
# No limit/resource field.
|
||||
return float("inf")
|
||||
resources = container_resources[field_name]
|
||||
# Look for keys containing the resource_name. For example,
|
||||
# the key 'nvidia.com/gpu' contains the key 'gpu'.
|
||||
matching_keys = [key for key in resources if resource_name in key.lower()]
|
||||
if len(matching_keys) == 0:
|
||||
return float("inf")
|
||||
if len(matching_keys) > 1:
|
||||
# Should have only one match -- mostly relevant for gpu.
|
||||
raise ValueError(f"Multiple {resource_name} types not supported.")
|
||||
# E.g. 'nvidia.com/gpu' or 'cpu'.
|
||||
resource_key = matching_keys.pop()
|
||||
resource_quantity = resources[resource_key]
|
||||
return _parse_resource(resource_quantity)
|
||||
|
||||
|
||||
def _parse_resource(resource):
|
||||
resource_str = str(resource)
|
||||
if resource_str[-1] == "m":
|
||||
# For example, '500m' rounds up to 1.
|
||||
return math.ceil(int(resource_str[:-1]) / 1000)
|
||||
else:
|
||||
return int(resource_str)
|
||||
|
||||
Reference in New Issue
Block a user