diff --git a/python/ray/autoscaler/_private/commands.py b/python/ray/autoscaler/_private/commands.py index 6f70c4c7e..0c7e3abbd 100644 --- a/python/ray/autoscaler/_private/commands.py +++ b/python/ray/autoscaler/_private/commands.py @@ -280,9 +280,10 @@ def _bootstrap_config(config: Dict[str, Any], f"Failed to autodetect node resources: {str(exc)}. " "You can see full stack trace with higher verbosity.") - # NOTE: if `resources` field is missing, validate_config for non-AWS will - # fail (the schema error will ask the user to manually fill the resources) - # as we currently support autofilling resources for AWS instances only. + # NOTE: if `resources` field is missing, validate_config for providers + # other than AWS and Kubernetes will fail (the schema error will ask the + # user to manually fill the resources) as we currently support autofilling + # resources for AWS and Kubernetes only. validate_config(config) resolved_config = provider_cls.bootstrap_config(config) diff --git a/python/ray/autoscaler/_private/kubernetes/config.py b/python/ray/autoscaler/_private/kubernetes/config.py index 20173119b..b285e7701 100644 --- a/python/ray/autoscaler/_private/kubernetes/config.py +++ b/python/ray/autoscaler/_private/kubernetes/config.py @@ -60,6 +60,13 @@ def bootstrap_kubernetes(config): def fillout_resources_kubernetes(config): + """Fills CPU and GPU resources by reading pod spec of each available node + type. + + For each node type and each of CPU/GPU, looks at container's resources + and limits, takes min of the two. The result is rounded up, as Ray does + not currently support fractional CPU. + """ if "available_node_types" not in config: return config["available_node_types"] node_types = copy.deepcopy(config["available_node_types"]) @@ -96,20 +103,47 @@ def get_resource(container_resources, resource_name): limit = _get_resource( container_resources, resource_name, field_name="limits") resource = min(request, limit) + # float("inf") value means the resource wasn't detected in either + # requests or limits return 0 if resource == float("inf") else int(resource) def _get_resource(container_resources, resource_name, field_name): - if (field_name in container_resources - and resource_name in container_resources[field_name]): - return _parse_resource(container_resources[field_name][resource_name]) - else: + """Returns the resource quantity. + + The amount of resource is rounded up to nearest integer. + Returns float("inf") if the resource is not present. + + Args: + container_resources (dict): Container's resource field. + resource_name (str): One of 'cpu' or 'gpu'. + field_name (str): One of 'requests' or 'limits'. + + Returns: + Union[int, float]: Detected resource quantity. + """ + if field_name not in container_resources: + # No limit/resource field. return float("inf") + resources = container_resources[field_name] + # Look for keys containing the resource_name. For example, + # the key 'nvidia.com/gpu' contains the key 'gpu'. + matching_keys = [key for key in resources if resource_name in key.lower()] + if len(matching_keys) == 0: + return float("inf") + if len(matching_keys) > 1: + # Should have only one match -- mostly relevant for gpu. + raise ValueError(f"Multiple {resource_name} types not supported.") + # E.g. 'nvidia.com/gpu' or 'cpu'. + resource_key = matching_keys.pop() + resource_quantity = resources[resource_key] + return _parse_resource(resource_quantity) def _parse_resource(resource): resource_str = str(resource) if resource_str[-1] == "m": + # For example, '500m' rounds up to 1. return math.ceil(int(resource_str[:-1]) / 1000) else: return int(resource_str)