[Core] Gpu type detection (#9695)

* .

* .

* .

* .

* .

* .

* .

* .

* Test cases

* detection only

* .

* Done?

* .

* .

* Done

* added test case

* .

* .

* .

* .

* .

* .

* Update python/ray/ray_constants.py

Co-authored-by: Eric Liang <ekhliang@gmail.com>

* .

* .

Co-authored-by: Eric Liang <ekhliang@gmail.com>
This commit is contained in:
Alex Wu
2020-08-01 11:43:56 -07:00
committed by SangBin Cho
parent f8f6f342f6
commit ea1ac15da0
3 changed files with 113 additions and 0 deletions
+3
View File
@@ -127,6 +127,9 @@ DASHBOARD_AGENT_DIED_ERROR = "dashboard_agent_died"
DASHBOARD_DIED_ERROR = "dashboard_died"
RAYLET_CONNECTION_ERROR = "raylet_connection_error"
# Prefix of the custom resource name that advertises the detected GPU model
# (e.g. "GPUType:V100").
RESOURCE_CONSTRAINT_PREFIX = "GPUType:"
# Abort autoscaling if more than this number of errors are encountered. This
# is a safety feature to prevent e.g. runaway node launches.
AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)
+71
View File
@@ -3,6 +3,7 @@ from collections import namedtuple
import logging
import multiprocessing
import os
import re
import subprocess
import sys
@@ -165,6 +166,13 @@ class ResourceSpec(
if gpu_ids is not None:
num_gpus = min(num_gpus, len(gpu_ids))
try:
info_string = _get_gpu_info_string()
gpu_types = _constraints_from_gpu_info(info_string)
resources.update(gpu_types)
except Exception:
logger.exception("Could not parse gpu information.")
# Choose a default object store size.
system_memory = ray.utils.get_system_memory()
avail_memory = ray.utils.estimate_available_memory()
@@ -251,3 +259,66 @@ def _autodetect_num_gpus():
lines = subprocess.check_output(cmdargs).splitlines()[1:]
result = len([l.rstrip() for l in lines if l.startswith(b"NVIDIA")])
return result
def _constraints_from_gpu_info(info_str):
"""Parse the contents of a /proc/driver/nvidia/gpus/*/information to get the
gpu model type.
Args:
info_str (str): The contents of the file.
Returns:
(str) The full model name.
"""
if info_str is None:
return {}
lines = info_str.split("\n")
full_model_name = None
for line in lines:
split = line.split(":")
if len(split) != 2:
continue
k, v = split
if k.strip() == "Model":
full_model_name = v.strip()
break
pretty_name = _pretty_gpu_name(full_model_name)
if pretty_name:
constraint_name = "{}{}".format(
ray_constants.RESOURCE_CONSTRAINT_PREFIX, pretty_name)
return {constraint_name: 1}
return {}
def _get_gpu_info_string():
"""Get the gpu type for this machine.
TODO(Alex): All the caveats of _autodetect_num_gpus and we assume only one
gpu type.
Returns:
(str) The gpu's model name.
"""
if sys.platform.startswith("linux"):
proc_gpus_path = "/proc/driver/nvidia/gpus"
if os.path.isdir(proc_gpus_path):
gpu_dirs = os.listdir(proc_gpus_path)
if len(gpu_dirs) > 0:
gpu_info_path = "{}/{}/information".format(
proc_gpus_path, gpu_dirs[0])
info_str = open(gpu_info_path).read()
return info_str
return None
# TODO(Alex): This pattern may not work for non NVIDIA Tesla GPUs (which have
# the form "Tesla V100-SXM2-16GB" or "Tesla K80").
GPU_NAME_PATTERN = re.compile("\w+\s+([A-Z0-9]+)")
def _pretty_gpu_name(name):
if name is None:
return None
match = GPU_NAME_PATTERN.match(name)
return match.group(1) if match else None
+39
View File
@@ -15,6 +15,7 @@ import ray
import ray.ray_constants as ray_constants
import ray.cluster_utils
import ray.test_utils
from ray import resource_spec
import setproctitle
from ray.test_utils import (check_call_ray, RayTestTimeoutException,
@@ -683,6 +684,44 @@ def test_ray_address_environment_variable(ray_start_cluster):
ray.shutdown()
def test_gpu_info_parsing():
    """Check that driver information text maps to the expected gpu resource."""
    v100_info = """Model: Tesla V100-SXM2-16GB
IRQ: 107
GPU UUID: GPU-8eaaebb8-bb64-8489-fda2-62256e821983
Video BIOS: 88.00.4f.00.09
Bus Type: PCIe
DMA Size: 47 bits
DMA Mask: 0x7fffffffffff
Bus Location: 0000:00:1e.0
Device Minor: 0
Blacklisted: No
"""
    t4_info = """Model: Tesla T4
IRQ: 10
GPU UUID: GPU-415fe7a8-f784-6e3d-a958-92ecffacafe2
Video BIOS: 90.04.84.00.06
Bus Type: PCIe
DMA Size: 47 bits
DMA Mask: 0x7fffffffffff
Bus Location: 0000:00:1b.0
Device Minor: 0
Blacklisted: No
"""
    # Each case pairs raw driver text with the short model name it should
    # produce.
    cases = ((v100_info, "V100"), (t4_info, "T4"))
    for raw_info, model in cases:
        parsed = resource_spec._constraints_from_gpu_info(raw_info)
        resource_name = "{}{}".format(
            ray_constants.RESOURCE_CONSTRAINT_PREFIX, model)
        assert parsed == {resource_name: 1}

    # No information at all must yield no constraints.
    assert resource_spec._constraints_from_gpu_info(None) == {}
if __name__ == "__main__":
    # Allow running this test module directly; propagate pytest's exit status.
    import pytest

    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)