mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 15:22:56 +08:00
[Core] Gpu type detection (#9695)
* . * . * . * . * . * . * . * . * Test cases * detection only * . * Done? * . * . * Done * added test case * . * . * . * . * . * . * Update python/ray/ray_constants.py Co-authored-by: Eric Liang <ekhliang@gmail.com> * . * . Co-authored-by: Eric Liang <ekhliang@gmail.com>
This commit is contained in:
@@ -127,6 +127,9 @@ DASHBOARD_AGENT_DIED_ERROR = "dashboard_agent_died"
|
||||
DASHBOARD_DIED_ERROR = "dashboard_died"
|
||||
RAYLET_CONNECTION_ERROR = "raylet_connection_error"
|
||||
|
||||
# Prefix of the resource name used to advertise the detected GPU type.
|
||||
RESOURCE_CONSTRAINT_PREFIX = "GPUType:"
|
||||
|
||||
# Abort autoscaling if more than this number of errors are encountered. This
|
||||
# is a safety feature to prevent e.g. runaway node launches.
|
||||
AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)
|
||||
|
||||
@@ -3,6 +3,7 @@ from collections import namedtuple
|
||||
import logging
|
||||
import multiprocessing
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
@@ -165,6 +166,13 @@ class ResourceSpec(
|
||||
if gpu_ids is not None:
|
||||
num_gpus = min(num_gpus, len(gpu_ids))
|
||||
|
||||
try:
|
||||
info_string = _get_gpu_info_string()
|
||||
gpu_types = _constraints_from_gpu_info(info_string)
|
||||
resources.update(gpu_types)
|
||||
except Exception:
|
||||
logger.exception("Could not parse gpu information.")
|
||||
|
||||
# Choose a default object store size.
|
||||
system_memory = ray.utils.get_system_memory()
|
||||
avail_memory = ray.utils.estimate_available_memory()
|
||||
@@ -251,3 +259,66 @@ def _autodetect_num_gpus():
|
||||
lines = subprocess.check_output(cmdargs).splitlines()[1:]
|
||||
result = len([l.rstrip() for l in lines if l.startswith(b"NVIDIA")])
|
||||
return result
|
||||
|
||||
|
||||
def _constraints_from_gpu_info(info_str):
|
||||
"""Parse the contents of a /proc/driver/nvidia/gpus/*/information to get the
|
||||
gpu model type.
|
||||
|
||||
Args:
|
||||
info_str (str): The contents of the file.
|
||||
|
||||
Returns:
|
||||
(str) The full model name.
|
||||
"""
|
||||
if info_str is None:
|
||||
return {}
|
||||
lines = info_str.split("\n")
|
||||
full_model_name = None
|
||||
for line in lines:
|
||||
split = line.split(":")
|
||||
if len(split) != 2:
|
||||
continue
|
||||
k, v = split
|
||||
if k.strip() == "Model":
|
||||
full_model_name = v.strip()
|
||||
break
|
||||
pretty_name = _pretty_gpu_name(full_model_name)
|
||||
if pretty_name:
|
||||
constraint_name = "{}{}".format(
|
||||
ray_constants.RESOURCE_CONSTRAINT_PREFIX, pretty_name)
|
||||
return {constraint_name: 1}
|
||||
return {}
|
||||
|
||||
|
||||
def _get_gpu_info_string():
|
||||
"""Get the gpu type for this machine.
|
||||
|
||||
TODO(Alex): All the caveats of _autodetect_num_gpus and we assume only one
|
||||
gpu type.
|
||||
|
||||
Returns:
|
||||
(str) The gpu's model name.
|
||||
"""
|
||||
if sys.platform.startswith("linux"):
|
||||
proc_gpus_path = "/proc/driver/nvidia/gpus"
|
||||
if os.path.isdir(proc_gpus_path):
|
||||
gpu_dirs = os.listdir(proc_gpus_path)
|
||||
if len(gpu_dirs) > 0:
|
||||
gpu_info_path = "{}/{}/information".format(
|
||||
proc_gpus_path, gpu_dirs[0])
|
||||
info_str = open(gpu_info_path).read()
|
||||
return info_str
|
||||
return None
|
||||
|
||||
|
||||
# TODO(Alex): This pattern is written for NVIDIA Tesla GPUs, whose model names
# have the form "Tesla V100-SXM2-16GB" or "Tesla K80". It may not work for
# other GPUs.
# A raw string is used so that "\w" and "\s" reach the regex engine as written
# instead of being treated as (invalid) string escape sequences.
GPU_NAME_PATTERN = re.compile(r"\w+\s+([A-Z0-9]+)")
|
||||
|
||||
|
||||
def _pretty_gpu_name(name):
|
||||
if name is None:
|
||||
return None
|
||||
match = GPU_NAME_PATTERN.match(name)
|
||||
return match.group(1) if match else None
|
||||
|
||||
@@ -15,6 +15,7 @@ import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
import ray.cluster_utils
|
||||
import ray.test_utils
|
||||
from ray import resource_spec
|
||||
import setproctitle
|
||||
|
||||
from ray.test_utils import (check_call_ray, RayTestTimeoutException,
|
||||
@@ -683,6 +684,44 @@ def test_ray_address_environment_variable(ray_start_cluster):
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
def test_gpu_info_parsing():
    """Check that gpu information file contents map to the expected
    GPU type constraint, and that missing information yields no constraint.
    """
    v100_info = """Model: Tesla V100-SXM2-16GB
IRQ: 107
GPU UUID: GPU-8eaaebb8-bb64-8489-fda2-62256e821983
Video BIOS: 88.00.4f.00.09
Bus Type: PCIe
DMA Size: 47 bits
DMA Mask: 0x7fffffffffff
Bus Location: 0000:00:1e.0
Device Minor: 0
Blacklisted: No
"""
    t4_info = """Model: Tesla T4
IRQ: 10
GPU UUID: GPU-415fe7a8-f784-6e3d-a958-92ecffacafe2
Video BIOS: 90.04.84.00.06
Bus Type: PCIe
DMA Size: 47 bits
DMA Mask: 0x7fffffffffff
Bus Location: 0000:00:1b.0
Device Minor: 0
Blacklisted: No
"""
    prefix = ray_constants.RESOURCE_CONSTRAINT_PREFIX

    # Each case pairs a sample information file with its short model name.
    for raw_info, short_name in ((v100_info, "V100"), (t4_info, "T4")):
        parsed = resource_spec._constraints_from_gpu_info(raw_info)
        assert parsed == {"{}{}".format(prefix, short_name): 1}

    # With no information available, no constraint is produced.
    assert resource_spec._constraints_from_gpu_info(None) == {}
|
||||
|
||||
|
||||
# Run this file's tests verbosely when it is executed as a script.
if __name__ == "__main__":
    import pytest

    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)
|
||||
|
||||
Reference in New Issue
Block a user