First pass at ray memory command for memory debugging (#7589)

This commit is contained in:
Eric Liang
2020-03-17 20:45:07 -07:00
committed by GitHub
parent e6a045df48
commit 745b9d643d
32 changed files with 794 additions and 89 deletions
+43 -1
View File
@@ -11,6 +11,7 @@ import numpy
import gc
import inspect
import threading
import traceback
import time
import logging
import os
@@ -558,6 +559,46 @@ cdef void gc_collect() nogil:
num_freed, end - start))
# This function introduces ~2-7us of overhead per call (i.e., it can be called
# up to hundreds of thousands of times per second).
cdef void get_py_stack(c_string* stack_out) nogil:
"""Get the Python call site.
This can be called from within C++ code to retrieve the file name and line
number of the Python code that is calling into the core worker.
"""
with gil:
frame = inspect.currentframe()
msg = ""
while frame:
filename = frame.f_code.co_filename
# Decode Ray internal frames to add annotations.
if filename.endswith("ray/worker.py"):
if frame.f_code.co_name == "put":
msg = "(put object) "
elif filename.endswith("ray/workers/default_worker.py"):
pass
elif filename.endswith("ray/remote_function.py"):
# TODO(ekl) distinguish between task return objects and
# arguments. This can only be done in the core worker.
msg = "(task call) "
elif filename.endswith("ray/actor.py"):
# TODO(ekl) distinguish between actor return objects and
# arguments. This can only be done in the core worker.
msg = "(actor call) "
elif filename.endswith("ray/serialization.py"):
if frame.f_code.co_name == "id_deserializer":
msg = "(deserialize task arg) "
else:
msg += "{}:{}:{}".format(
frame.f_code.co_filename, frame.f_code.co_name,
frame.f_lineno)
break
frame = frame.f_back
stack_out[0] = msg.encode("ascii")
cdef shared_ptr[CBuffer] string_to_buffer(c_string& c_str):
cdef shared_ptr[CBuffer] empty_metadata
if c_str.size() == 0:
@@ -603,7 +644,8 @@ cdef class CoreWorker:
raylet_socket.encode("ascii"), job_id.native(),
gcs_options.native()[0], log_dir.encode("utf-8"),
node_ip_address.encode("utf-8"), node_manager_port,
task_execution_handler, check_signals, gc_collect, True))
task_execution_handler, check_signals, gc_collect,
get_py_stack, True))
def run_task_loop(self):
with nogil:
+2 -1
View File
@@ -98,6 +98,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
const CWorkerID &worker_id) nogil,
CRayStatus() nogil,
void() nogil,
void(c_string *stack_out) nogil,
c_bool ref_counting_enabled)
CWorkerType &GetWorkerType()
CLanguage &GetLanguage()
@@ -188,7 +189,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
unordered_map[CObjectID, pair[size_t, size_t]] GetAllReferenceCounts()
void GetAsync(const CObjectID &object_id,
ray_callback_function successs_callback,
ray_callback_function success_callback,
ray_callback_function fallback_callback,
void* python_future)
+18
View File
@@ -11,6 +11,24 @@ def global_gc():
worker.core_worker.global_gc()
def memory_summary():
"""Returns a formatted string describing memory usage in the cluster."""
import grpc
from ray.core.generated import node_manager_pb2
from ray.core.generated import node_manager_pb2_grpc
# We can ask any Raylet for the global memory info.
raylet = ray.nodes()[0]
raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
ray.nodes()[0]["NodeManagerPort"])
channel = grpc.insecure_channel(raylet_address)
stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
reply = stub.FormatGlobalMemoryInfo(
node_manager_pb2.FormatGlobalMemoryInfoRequest(), timeout=30.0)
return reply.memory_summary
def free(object_ids, local_only=False, delete_creating_tasks=False):
"""Free a list of IDs from object stores.
+3 -5
View File
@@ -53,11 +53,9 @@ class RayOutOfMemoryError(Exception):
round(get_shared(psutil.virtual_memory()) / (1024**3), 2))
+ "currently being used by the Ray object store. You can set "
"the object store size with the `object_store_memory` "
"parameter when starting Ray, and the max Redis size with "
"`redis_max_memory`. Note that Ray assumes all system "
"memory is available for use by workers. If your system "
"has other applications running, you should manually set "
"these memory limits to a lower value.")
"parameter when starting Ray.\n---\n"
"--- Tip: Use the `ray memory` command to list active "
"objects in the cluster.\n---\n")
class MemoryMonitor:
+33 -1
View File
@@ -945,10 +945,40 @@ def stat(address):
channel = grpc.insecure_channel(raylet_address)
stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
reply = stub.GetNodeStats(
node_manager_pb2.GetNodeStatsRequest(), timeout=2.0)
node_manager_pb2.GetNodeStatsRequest(include_memory_info=False),
timeout=2.0)
print(reply)
@cli.command()
@click.option(
"--address",
required=False,
type=str,
help="Override the address to connect to.")
def memory(address):
if not address:
address = services.find_redis_address_or_die()
logger.info("Connecting to Ray instance at {}.".format(address))
ray.init(address=address)
print(ray.internal.internal_api.memory_summary())
@cli.command()
@click.option(
"--address",
required=False,
type=str,
help="Override the address to connect to.")
def globalgc(address):
if not address:
address = services.find_redis_address_or_die()
logger.info("Connecting to Ray instance at {}.".format(address))
ray.init(address=address)
ray.internal.internal_api.global_gc()
print("Triggered gc.collect() on all workers.")
cli.add_command(dashboard)
cli.add_command(start)
cli.add_command(stop)
@@ -966,6 +996,8 @@ cli.add_command(get_worker_ips)
cli.add_command(microbenchmark)
cli.add_command(stack)
cli.add_command(stat)
cli.add_command(memory)
cli.add_command(globalgc)
cli.add_command(timeline)
cli.add_command(project_cli)
cli.add_command(session_cli)
+8
View File
@@ -22,6 +22,14 @@ py_test(
deps = ["//:ray_lib"],
)
py_test(
name = "test_memstat",
size = "small",
srcs = ["test_memstat.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_iter",
size = "medium",
+195
View File
@@ -0,0 +1,195 @@
import ray
import numpy as np
import time
from ray.internal.internal_api import memory_summary
# Unique strings.
DRIVER_PID = "driver pid"
WORKER_PID = "worker pid"
UNKNOWN_SIZE = " ? "
# Reference status.
PINNED_IN_MEMORY = "PINNED_IN_MEMORY"
LOCAL_REF = "LOCAL_REFERENCE"
USED_BY_PENDING_TASK = "USED_BY_PENDING_TASK"
CAPTURED_IN_OBJECT = "CAPTURED_IN_OBJECT"
# Call sites.
PUT_OBJ = "(put object)"
TASK_CALL_OBJ = "(task call)"
ACTOR_TASK_CALL_OBJ = "(actor call)"
DESER_TASK_ARG = "(deserialize task arg)"
DESER_ACTOR_TASK_ARG = "(deserialize actor task arg)"
def data_lines(memory_str):
for line in memory_str.split("\n"):
if (not line or "---" in line or "===" in line or "Object ID" in line
or "pid=" in line):
continue
yield line
def num_objects(memory_str):
n = 0
for line in data_lines(memory_str):
n += 1
return n
def count(memory_str, substr):
n = 0
for line in memory_str.split("\n"):
if substr in line:
n += 1
return n
def test_driver_put_ref(ray_start_regular):
info = memory_summary()
assert num_objects(info) == 0, info
x_id = ray.put("HI")
info = memory_summary()
print(info)
assert num_objects(info) == 1, info
assert count(info, DRIVER_PID) == 1, info
assert count(info, WORKER_PID) == 0, info
del x_id
info = memory_summary()
assert num_objects(info) == 0, info
def test_worker_task_refs(ray_start_regular):
@ray.remote
def f(y):
x_id = ray.put("HI")
info = memory_summary()
del x_id
return info
x_id = f.remote(np.zeros(100000))
info = ray.get(x_id)
print(info)
assert num_objects(info) == 4, info
# Task argument plus task return ids.
assert count(info, TASK_CALL_OBJ) == 2, info
assert count(info, DRIVER_PID) == 1, info
assert count(info, WORKER_PID) == 1, info
assert count(info, LOCAL_REF) == 2, info
assert count(info, PINNED_IN_MEMORY) == 1, info
assert count(info, PUT_OBJ) == 1, info
assert count(info, DESER_TASK_ARG) == 1, info
assert count(info, UNKNOWN_SIZE) == 1, info
assert count(info, "test_memstat.py:f") == 1, info
assert count(info, "test_memstat.py:test_worker_task_refs") == 2, info
info = memory_summary()
print(info)
assert num_objects(info) == 1, info
assert count(info, DRIVER_PID) == 1, info
assert count(info, TASK_CALL_OBJ) == 1, info
assert count(info, UNKNOWN_SIZE) == 0, info
assert count(info, x_id.hex()) == 1, info
del x_id
info = memory_summary()
assert num_objects(info) == 0, info
def test_actor_task_refs(ray_start_regular):
@ray.remote
class Actor:
def __init__(self):
self.refs = []
def f(self, x):
self.refs.append(x)
return memory_summary()
def make_actor():
return Actor.remote()
actor = make_actor()
x_id = actor.f.remote(np.zeros(100000))
info = ray.get(x_id)
print(info)
assert num_objects(info) == 4, info
# Actor handle, task argument id, task return id.
assert count(info, ACTOR_TASK_CALL_OBJ) == 3, info
assert count(info, DRIVER_PID) == 1, info
assert count(info, WORKER_PID) == 1, info
assert count(info, LOCAL_REF) == 1, info
assert count(info, PINNED_IN_MEMORY) == 1, info
assert count(info, USED_BY_PENDING_TASK) == 2, info
assert count(info, DESER_ACTOR_TASK_ARG) == 1, info
assert count(info, "test_memstat.py:test_actor_task_refs") == 2, info
assert count(info, "test_memstat.py:make_actor") == 1, info
del x_id
# These should accumulate in the actor.
for _ in range(5):
ray.get(actor.f.remote([ray.put(np.zeros(100000))]))
info = memory_summary()
print(info)
assert count(info, DESER_ACTOR_TASK_ARG) == 5, info
assert count(info, ACTOR_TASK_CALL_OBJ) == 1, info
# Cleanup.
del actor
time.sleep(1)
info = memory_summary()
assert num_objects(info) == 0, info
def test_nested_object_refs(ray_start_regular):
x_id = ray.put(np.zeros(100000))
y_id = ray.put([x_id])
z_id = ray.put([y_id])
del x_id, y_id
info = memory_summary()
print(info)
assert num_objects(info) == 3, info
assert count(info, LOCAL_REF) == 1, info
assert count(info, CAPTURED_IN_OBJECT) == 2, info
del z_id
def test_pinned_object_call_site(ray_start_regular):
# Local ref only.
x_id = ray.put(np.zeros(100000))
info = memory_summary()
print(info)
assert num_objects(info) == 1, info
assert count(info, LOCAL_REF) == 1, info
assert count(info, PINNED_IN_MEMORY) == 0, info
assert count(info, "test_memstat.py") == 1, info
# Local ref + pinned buffer.
buf = ray.get(x_id)
info = memory_summary()
print(info)
assert num_objects(info) == 1, info
assert count(info, LOCAL_REF) == 0, info
assert count(info, PINNED_IN_MEMORY) == 1, info
assert count(info, "test_memstat.py") == 1, info
# Just pinned buffer.
del x_id
info = memory_summary()
print(info)
assert num_objects(info) == 1, info
assert count(info, LOCAL_REF) == 0, info
assert count(info, PINNED_IN_MEMORY) == 1, info
assert count(info, "test_memstat.py") == 1, info
# Nothing.
del buf
info = memory_summary()
print(info)
assert num_objects(info) == 0, info
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+1 -4
View File
@@ -1549,10 +1549,7 @@ def put(value, weakref=False):
except ObjectStoreFullError:
logger.info(
"Put failed since the value was either too large or the "
"store was full of pinned objects. If you are putting "
"and holding references to a lot of object ids, consider "
"ray.put(value, weakref=True) to allow object data to "
"be evicted early.")
"store was full of pinned objects.")
raise
return object_id