diff --git a/python/ray/ray_constants.py b/python/ray/ray_constants.py
index 4988a1927..d17eb8f19 100644
--- a/python/ray/ray_constants.py
+++ b/python/ray/ray_constants.py
@@ -127,6 +127,9 @@ DASHBOARD_AGENT_DIED_ERROR = "dashboard_agent_died"
 DASHBOARD_DIED_ERROR = "dashboard_died"
 RAYLET_CONNECTION_ERROR = "raylet_connection_error"
 
+# Prefix of the custom resource name that encodes a node's GPU type.
+RESOURCE_CONSTRAINT_PREFIX = "GPUType:"
+
 # Abort autoscaling if more than this number of errors are encountered. This
 # is a safety feature to prevent e.g. runaway node launches.
 AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)
diff --git a/python/ray/resource_spec.py b/python/ray/resource_spec.py
index 0399952ae..9779baa34 100644
--- a/python/ray/resource_spec.py
+++ b/python/ray/resource_spec.py
@@ -3,6 +3,7 @@ from collections import namedtuple
 import logging
 import multiprocessing
 import os
+import re
 import subprocess
 import sys
 
@@ -165,6 +166,13 @@ class ResourceSpec(
         if gpu_ids is not None:
             num_gpus = min(num_gpus, len(gpu_ids))
 
+        try:
+            info_string = _get_gpu_info_string()
+            gpu_types = _constraints_from_gpu_info(info_string)
+            resources.update(gpu_types)
+        except Exception:
+            logger.exception("Could not parse gpu information.")
+
         # Choose a default object store size.
         system_memory = ray.utils.get_system_memory()
         avail_memory = ray.utils.estimate_available_memory()
@@ -251,3 +259,81 @@ def _autodetect_num_gpus():
         lines = subprocess.check_output(cmdargs).splitlines()[1:]
         result = len([l.rstrip() for l in lines if l.startswith(b"NVIDIA")])
     return result
+
+
+def _constraints_from_gpu_info(info_str):
+    """Build a GPU-type resource constraint from an NVIDIA info file.
+
+    Parses the contents of a /proc/driver/nvidia/gpus/*/information file to
+    find the GPU model name.
+
+    Args:
+        info_str (str): The contents of the file, or None.
+
+    Returns:
+        (dict) A single-entry dict mapping the constraint name
+            (RESOURCE_CONSTRAINT_PREFIX followed by the short model name)
+            to 1, or an empty dict if no model name could be extracted.
+    """
+    if info_str is None:
+        return {}
+    full_model_name = None
+    for line in info_str.split("\n"):
+        # Split on the first colon only, so that a value which itself
+        # contains a colon is not discarded.
+        k, sep, v = line.partition(":")
+        if sep and k.strip() == "Model":
+            full_model_name = v.strip()
+            break
+    pretty_name = _pretty_gpu_name(full_model_name)
+    if pretty_name:
+        constraint_name = "{}{}".format(
+            ray_constants.RESOURCE_CONSTRAINT_PREFIX, pretty_name)
+        return {constraint_name: 1}
+    return {}
+
+
+def _get_gpu_info_string():
+    """Read the driver information file of one NVIDIA GPU on this machine.
+
+    TODO(Alex): All the caveats of _autodetect_num_gpus and we assume only one
+    gpu type.
+
+    Returns:
+        (str) The contents of the "information" file of the first entry
+            that os.listdir returns for /proc/driver/nvidia/gpus, or None
+            when not on Linux or when no such entry exists.
+    """
+    if sys.platform.startswith("linux"):
+        proc_gpus_path = "/proc/driver/nvidia/gpus"
+        if os.path.isdir(proc_gpus_path):
+            gpu_dirs = os.listdir(proc_gpus_path)
+            if len(gpu_dirs) > 0:
+                gpu_info_path = "{}/{}/information".format(
+                    proc_gpus_path, gpu_dirs[0])
+                # Close the file handle deterministically.
+                with open(gpu_info_path) as f:
+                    return f.read()
+    return None
+
+
+# TODO(Alex): This pattern is written for NVIDIA Tesla GPUs (whose names have
+# the form "Tesla V100-SXM2-16GB" or "Tesla K80") and may not work for other
+# GPUs.
+GPU_NAME_PATTERN = re.compile(r"\w+\s+([A-Z0-9]+)")
+
+
+def _pretty_gpu_name(name):
+    """Extract the short model name (e.g. "V100") from a full GPU name.
+
+    Args:
+        name (str): The full model name, e.g. "Tesla V100-SXM2-16GB", or None.
+
+    Returns:
+        (str) The run of uppercase letters and digits that follows the first
+            word, or None if name is None or does not match the pattern.
+    """
+    if name is None:
+        return None
+    match = GPU_NAME_PATTERN.match(name)
+    return match.group(1) if match else None
diff --git a/python/ray/tests/test_advanced_3.py b/python/ray/tests/test_advanced_3.py
index fbab8c8c0..1ce9e826e 100644
--- a/python/ray/tests/test_advanced_3.py
+++ b/python/ray/tests/test_advanced_3.py
@@ -15,6 +15,7 @@ import ray
 import ray.ray_constants as ray_constants
 import ray.cluster_utils
 import ray.test_utils
+from ray import resource_spec
 import setproctitle
 
 from ray.test_utils import (check_call_ray, RayTestTimeoutException,
@@ -683,6 +684,44 @@ def test_ray_address_environment_variable(ray_start_cluster):
     ray.shutdown()
 
 
+def test_gpu_info_parsing():
+    info_string = """Model: Tesla V100-SXM2-16GB
+IRQ: 107
+GPU UUID: GPU-8eaaebb8-bb64-8489-fda2-62256e821983
+Video BIOS: 88.00.4f.00.09
+Bus Type: PCIe
+DMA Size: 47 bits
+DMA Mask: 0x7fffffffffff
+Bus Location: 0000:00:1e.0
+Device Minor: 0
+Blacklisted: No
+    """
+    constraints_dict = resource_spec._constraints_from_gpu_info(info_string)
+    expected_dict = {
+        "{}V100".format(ray_constants.RESOURCE_CONSTRAINT_PREFIX): 1
+    }
+    assert constraints_dict == expected_dict
+
+    info_string = """Model: Tesla T4
+IRQ: 10
+GPU UUID: GPU-415fe7a8-f784-6e3d-a958-92ecffacafe2
+Video BIOS: 90.04.84.00.06
+Bus Type: PCIe
+DMA Size: 47 bits
+DMA Mask: 0x7fffffffffff
+Bus Location: 0000:00:1b.0
+Device Minor: 0
+Blacklisted: No
+    """
+    constraints_dict = resource_spec._constraints_from_gpu_info(info_string)
+    expected_dict = {
+        "{}T4".format(ray_constants.RESOURCE_CONSTRAINT_PREFIX): 1
+    }
+    assert constraints_dict == expected_dict
+
+    assert resource_spec._constraints_from_gpu_info(None) == {}
+
+
 if __name__ == "__main__":
     import pytest
     sys.exit(pytest.main(["-v", __file__]))