From 7422f64ea9031339ad870f03fb4967203d64dfd6 Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Wed, 16 Sep 2020 21:02:52 -0700 Subject: [PATCH] [hotfix] CPU Detection (#10821) --- python/ray/tests/test_advanced_3.py | 51 +++++++++++++++++++++ python/ray/utils.py | 71 +++++++++++++++++++++++------ 2 files changed, 107 insertions(+), 15 deletions(-) diff --git a/python/ray/tests/test_advanced_3.py b/python/ray/tests/test_advanced_3.py index 6b996f9a1..982dd96aa 100644 --- a/python/ray/tests/test_advanced_3.py +++ b/python/ray/tests/test_advanced_3.py @@ -4,6 +4,7 @@ import logging import os import sys import socket +import tempfile import time import numpy as np @@ -736,6 +737,56 @@ def test_accelerator_type_api(shutdown_only): assert ray.available_resources()[resource_name] < quantity +def test_detect_docker_cpus(): + # No limits set + with tempfile.NamedTemporaryFile( + "w") as quota_file, tempfile.NamedTemporaryFile( + "w") as period_file, tempfile.NamedTemporaryFile( + "w") as cpuset_file: + quota_file.write("-1") + period_file.write("100000") + cpuset_file.write("0-63") + quota_file.flush() + period_file.flush() + cpuset_file.flush() + assert ray.utils._get_docker_cpus( + cpu_quota_file_name=quota_file.name, + cpu_share_file_name=period_file.name, + cpuset_file_name=cpuset_file.name) == 64 + + # No cpuset used + with tempfile.NamedTemporaryFile( + "w") as quota_file, tempfile.NamedTemporaryFile( + "w") as period_file, tempfile.NamedTemporaryFile( + "w") as cpuset_file: + quota_file.write("-1") + period_file.write("100000") + cpuset_file.write("0-10,20,50-63") + quota_file.flush() + period_file.flush() + cpuset_file.flush() + assert ray.utils._get_docker_cpus( + cpu_quota_file_name=quota_file.name, + cpu_share_file_name=period_file.name, + cpuset_file_name=cpuset_file.name) == 26 + + # Quota set + with tempfile.NamedTemporaryFile( + "w") as quota_file, tempfile.NamedTemporaryFile( + "w") as period_file, tempfile.NamedTemporaryFile( + "w") as cpuset_file: + quota_file.write("42") + period_file.write("100") + cpuset_file.write("0-63") + quota_file.flush() + period_file.flush() + cpuset_file.flush() + assert ray.utils._get_docker_cpus( + cpu_quota_file_name=quota_file.name, + cpu_share_file_name=period_file.name, + cpuset_file_name=cpuset_file.name) == 0.42 + + if __name__ == "__main__": import pytest sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/utils.py b/python/ray/utils.py index bf3d2d554..e5d400ece 100644 --- a/python/ray/utils.py +++ b/python/ray/utils.py @@ -499,22 +499,55 @@ def get_system_memory(): return psutil_memory_in_bytes -def _get_docker_cpus(): - # 1. Try using CFS Quota (https://bugs.openjdk.java.net/browse/JDK-8146115) - # 2. Try Nproc (CPU sets) - cpu_quota_file_name = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us" - cpu_share_file_name = "/sys/fs/cgroup/cpu/cpu.cfs_period_us" - num_cpus = 0 +def _get_docker_cpus( + cpu_quota_file_name="/sys/fs/cgroup/cpu/cpu.cfs_quota_us", + cpu_share_file_name="/sys/fs/cgroup/cpu/cpu.cfs_period_us", + cpuset_file_name="/sys/fs/cgroup/cpuset/cpuset.cpus"): + # TODO (Alex): Don't implement this logic oursleves. + # Docker has 2 underyling ways of implementing CPU limits: + # https://docs.docker.com/config/containers/resource_constraints/#configure-the-default-cfs-scheduler + # 1. --cpuset-cpus 2. --cpus or --cpu-quota/--cpu-period (--cpu-shares is a + # soft limit so we don't worry about it). For Ray's purposes, if we use + # docker, the number of vCPUs on a machine is whichever is set (ties broken + # by smaller value). + + cpu_quota = None + # See: https://bugs.openjdk.java.net/browse/JDK-8146115 if os.path.exists(cpu_quota_file_name) and os.path.exists( cpu_quota_file_name): - with open(cpu_quota_file_name, "r") as f: - num_cpus = int(f.read()) - if num_cpus != -1: - with open(cpu_share_file_name, "r") as f: - num_cpus /= int(f.read()) - return num_cpus + try: + with open(cpu_quota_file_name, "r") as quota_file, open( + cpu_share_file_name, "r") as period_file: + cpu_quota = float(quota_file.read()) / float( + period_file.read()) + except Exception as e: + logger.exception("Unexpected error calculating docker cpu quota.", + e) + if cpu_quota < 0: + cpu_quota = None - return int(subprocess.check_output("nproc")) + cpuset_num = None + if os.path.exists(cpuset_file_name): + try: + with open(cpuset_file_name) as cpuset_file: + ranges_as_string = cpuset_file.read() + ranges = ranges_as_string.split(",") + cpu_ids = [] + for num_or_range in ranges: + if "-" in num_or_range: + start, end = num_or_range.split("-") + cpu_ids.extend(list(range(int(start), int(end) + 1))) + else: + cpu_ids.append(int(num_or_range)) + cpuset_num = len(cpu_ids) + except Exception as e: + logger.exception("Unexpected error calculating docker cpuset ids.", + e) + + if cpu_quota and cpuset_num: + return min(cpu_quota, cpuset_num) + else: + return cpu_quota or cpuset_num def get_num_cpus(): @@ -531,8 +564,7 @@ def get_num_cpus(): # Not easy to get cpu count in docker, see: # https://bugs.python.org/issue36054 docker_count = _get_docker_cpus() - if docker_count != cpu_count: - cpu_count = docker_count + if docker_count is not None and docker_count != cpu_count: if "RAY_DISABLE_DOCKER_CPU_WARNING" not in os.environ: logger.warning( "Detecting docker specified CPUs. In " @@ -543,6 +575,15 @@ def get_num_cpus(): "`RAY_USE_MULTIPROCESSING_CPU_COUNT=1` as an env var " "before starting Ray. Set the env var: " "`RAY_DISABLE_DOCKER_CPU_WARNING=1` to mute this warning.") + # TODO (Alex): We should probably add support for fractional cpus. + if int(docker_count) != float(docker_count): + logger.warning( + f"Ray currently does not support initializing Ray" + f"with fractional cpus. Your num_cpus will be " + f"truncated from {docker_count} to " + f"{int(docker_count)}.") + docker_count = int(docker_count) + cpu_count = docker_count except Exception: # `nproc` and cgroup are linux-only. If docker only works on linux