Fix memory miscounting in memory monitor for container environment (#8113)

Co-authored-by: weich <weich@nvidia.com>
This commit is contained in:
yncxcw
2020-04-22 15:32:35 -06:00
committed by GitHub
parent 0bb918f2b1
commit 51559c08b9
+7
View File
@@ -114,6 +114,13 @@ class MemoryMonitor:
with open("/sys/fs/cgroup/memory/memory.usage_in_bytes",
"rb") as f:
used_gb = int(f.read()) / (1024**3)
# Exclude the page cache
with open("/sys/fs/cgroup/memory/memory.stat", "r") as f:
for line in f.readlines():
if line.split(" ")[0] == "cache":
used_gb = \
used_gb - int(line.split(" ")[1]) / (1024**3)
assert used_gb >= 0
if used_gb > total_gb * self.error_threshold:
raise RayOutOfMemoryError(
RayOutOfMemoryError.get_message(used_gb, total_gb,