diff --git a/python/ray/autoscaler/_private/aws/node_provider.py b/python/ray/autoscaler/_private/aws/node_provider.py
index 7288dc2d4..c49c9046e 100644
--- a/python/ray/autoscaler/_private/aws/node_provider.py
+++ b/python/ray/autoscaler/_private/aws/node_provider.py
@@ -4,7 +4,7 @@ import threading
 from collections import defaultdict
 import logging
 import time
-from typing import Any, Dict
+from typing import Any, Dict, List
 
 import boto3
 import botocore
@@ -52,6 +52,39 @@ def make_ec2_client(region, max_retries, aws_credentials=None):
         "ec2", region_name=region, config=config, **aws_credentials)
 
 
+def list_ec2_instances(region: str) -> List[Dict[str, Any]]:
+    """Get all instance-types/resources available in the user's AWS region.
+
+    DescribeInstanceTypes responses are paginated, so pages are requested
+    until a response no longer contains a "NextToken".
+
+    Args:
+        region (str): the region of the AWS provider. e.g., "us-west-2".
+    Returns:
+        final_instance_types: a list of instances. An example of one element in
+        the list:
+        {'InstanceType': 'm5a.xlarge', 'ProcessorInfo':
+        {'SupportedArchitectures': ['x86_64'], 'SustainedClockSpeedInGhz':
+        2.5},'VCpuInfo': {'DefaultVCpus': 4, 'DefaultCores': 2,
+        'DefaultThreadsPerCore': 2, 'ValidCores': [2],
+        'ValidThreadsPerCore': [1, 2]}, 'MemoryInfo': {'SizeInMiB': 16384},
+        ...}
+
+    """
+    # Reuse a single client for every page of results.
+    ec2_client = boto3.client("ec2", region_name=region)
+    final_instance_types = []
+    instance_types = ec2_client.describe_instance_types()
+    final_instance_types.extend(copy.deepcopy(instance_types["InstanceTypes"]))
+    while "NextToken" in instance_types:
+        instance_types = ec2_client.describe_instance_types(
+            NextToken=instance_types["NextToken"])
+        final_instance_types.extend(
+            copy.deepcopy(instance_types["InstanceTypes"]))
+
+    return final_instance_types
+
+
 class AWSNodeProvider(NodeProvider):
     def __init__(self, provider_config, cluster_name):
         NodeProvider.__init__(self, provider_config, cluster_name)
@@ -456,8 +489,8 @@ class AWSNodeProvider(NodeProvider):
             return cluster_config
 
         cluster_config = copy.deepcopy(cluster_config)
-        instances_list = boto3.client("ec2").describe_instance_types()[
-            "InstanceTypes"]
+        instances_list = list_ec2_instances(
+            cluster_config["provider"]["region"])
         instances_dict = {
             instance["InstanceType"]: instance
             for instance in instances_list
@@ -488,5 +521,8 @@ class AWSNodeProvider(NodeProvider):
                         "resources"] = autodetected_resources
                     cli_logger.print("Updating the resources of {} to {}.",
                                      node_type, autodetected_resources)
-
+            else:
+                raise ValueError("Instance type " + instance_type +
+                                 " is not available in AWS region: " +
+                                 cluster_config["provider"]["region"] + ".")
         return cluster_config
diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py
index f3f28d549..6aa6a7bb6 100644
--- a/python/ray/autoscaler/_private/util.py
+++ b/python/ray/autoscaler/_private/util.py
@@ -105,10 +105,8 @@ def fillout_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
     try:
         defaults = _fillout_available_node_types_resources(defaults)
     except Exception:
-        # We don't want to introduce new errors with filling available node
-        # types resources feature.
-        logger.exception("Failed to autodetect node resources")
-
+        logger.exception("Failed to autodetect node resources.")
+        raise
     return defaults
 
 
diff --git a/python/ray/tests/test_autoscaler_yaml.py b/python/ray/tests/test_autoscaler_yaml.py
index 8d31fc076..6f70bba70 100644
--- a/python/ray/tests/test_autoscaler_yaml.py
+++ b/python/ray/tests/test_autoscaler_yaml.py
@@ -23,8 +23,10 @@ CONFIG_PATHS += recursive_fnmatch(
 class AutoscalingConfigTest(unittest.TestCase):
     def testValidateDefaultConfig(self):
         for config_path in CONFIG_PATHS:
-            if "aws/example-multi-node-type.yaml" in config_path:
-                # This is tested in testValidateDefaultConfigAWSMultiNodeTypes.
+            if ("aws/example-multi-node-type.yaml" in config_path
+                    or "staroid/example-multi-node-type.yaml" in config_path):
+                # aws is tested in testValidateDefaultConfigAWSMultiNodeTypes.
+                # staroid fails as it requires an installation of staroid.
                 continue
             with open(config_path) as f:
                 config = yaml.safe_load(f)