mirror of
https://github.com/wassname/ray.git
synced 2026-07-02 16:14:06 +08:00
First pass at ray memory command for memory debugging (#7589)
This commit is contained in:
+43
-1
@@ -11,6 +11,7 @@ import numpy
|
||||
import gc
|
||||
import inspect
|
||||
import threading
|
||||
import traceback
|
||||
import time
|
||||
import logging
|
||||
import os
|
||||
@@ -558,6 +559,46 @@ cdef void gc_collect() nogil:
|
||||
num_freed, end - start))
|
||||
|
||||
|
||||
# This function introduces ~2-7us of overhead per call (i.e., it can be called
|
||||
# up to hundreds of thousands of times per second).
|
||||
cdef void get_py_stack(c_string* stack_out) nogil:
|
||||
"""Get the Python call site.
|
||||
|
||||
This can be called from within C++ code to retrieve the file name and line
|
||||
number of the Python code that is calling into the core worker.
|
||||
"""
|
||||
|
||||
with gil:
|
||||
frame = inspect.currentframe()
|
||||
msg = ""
|
||||
while frame:
|
||||
filename = frame.f_code.co_filename
|
||||
# Decode Ray internal frames to add annotations.
|
||||
if filename.endswith("ray/worker.py"):
|
||||
if frame.f_code.co_name == "put":
|
||||
msg = "(put object) "
|
||||
elif filename.endswith("ray/workers/default_worker.py"):
|
||||
pass
|
||||
elif filename.endswith("ray/remote_function.py"):
|
||||
# TODO(ekl) distinguish between task return objects and
|
||||
# arguments. This can only be done in the core worker.
|
||||
msg = "(task call) "
|
||||
elif filename.endswith("ray/actor.py"):
|
||||
# TODO(ekl) distinguish between actor return objects and
|
||||
# arguments. This can only be done in the core worker.
|
||||
msg = "(actor call) "
|
||||
elif filename.endswith("ray/serialization.py"):
|
||||
if frame.f_code.co_name == "id_deserializer":
|
||||
msg = "(deserialize task arg) "
|
||||
else:
|
||||
msg += "{}:{}:{}".format(
|
||||
frame.f_code.co_filename, frame.f_code.co_name,
|
||||
frame.f_lineno)
|
||||
break
|
||||
frame = frame.f_back
|
||||
stack_out[0] = msg.encode("ascii")
|
||||
|
||||
|
||||
cdef shared_ptr[CBuffer] string_to_buffer(c_string& c_str):
|
||||
cdef shared_ptr[CBuffer] empty_metadata
|
||||
if c_str.size() == 0:
|
||||
@@ -603,7 +644,8 @@ cdef class CoreWorker:
|
||||
raylet_socket.encode("ascii"), job_id.native(),
|
||||
gcs_options.native()[0], log_dir.encode("utf-8"),
|
||||
node_ip_address.encode("utf-8"), node_manager_port,
|
||||
task_execution_handler, check_signals, gc_collect, True))
|
||||
task_execution_handler, check_signals, gc_collect,
|
||||
get_py_stack, True))
|
||||
|
||||
def run_task_loop(self):
|
||||
with nogil:
|
||||
|
||||
@@ -98,6 +98,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
const CWorkerID &worker_id) nogil,
|
||||
CRayStatus() nogil,
|
||||
void() nogil,
|
||||
void(c_string *stack_out) nogil,
|
||||
c_bool ref_counting_enabled)
|
||||
CWorkerType &GetWorkerType()
|
||||
CLanguage &GetLanguage()
|
||||
@@ -188,7 +189,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
|
||||
unordered_map[CObjectID, pair[size_t, size_t]] GetAllReferenceCounts()
|
||||
|
||||
void GetAsync(const CObjectID &object_id,
|
||||
ray_callback_function successs_callback,
|
||||
ray_callback_function success_callback,
|
||||
ray_callback_function fallback_callback,
|
||||
void* python_future)
|
||||
|
||||
|
||||
@@ -11,6 +11,24 @@ def global_gc():
|
||||
worker.core_worker.global_gc()
|
||||
|
||||
|
||||
def memory_summary():
|
||||
"""Returns a formatted string describing memory usage in the cluster."""
|
||||
|
||||
import grpc
|
||||
from ray.core.generated import node_manager_pb2
|
||||
from ray.core.generated import node_manager_pb2_grpc
|
||||
|
||||
# We can ask any Raylet for the global memory info.
|
||||
raylet = ray.nodes()[0]
|
||||
raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
|
||||
ray.nodes()[0]["NodeManagerPort"])
|
||||
channel = grpc.insecure_channel(raylet_address)
|
||||
stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
|
||||
reply = stub.FormatGlobalMemoryInfo(
|
||||
node_manager_pb2.FormatGlobalMemoryInfoRequest(), timeout=30.0)
|
||||
return reply.memory_summary
|
||||
|
||||
|
||||
def free(object_ids, local_only=False, delete_creating_tasks=False):
|
||||
"""Free a list of IDs from object stores.
|
||||
|
||||
|
||||
@@ -53,11 +53,9 @@ class RayOutOfMemoryError(Exception):
|
||||
round(get_shared(psutil.virtual_memory()) / (1024**3), 2))
|
||||
+ "currently being used by the Ray object store. You can set "
|
||||
"the object store size with the `object_store_memory` "
|
||||
"parameter when starting Ray, and the max Redis size with "
|
||||
"`redis_max_memory`. Note that Ray assumes all system "
|
||||
"memory is available for use by workers. If your system "
|
||||
"has other applications running, you should manually set "
|
||||
"these memory limits to a lower value.")
|
||||
"parameter when starting Ray.\n---\n"
|
||||
"--- Tip: Use the `ray memory` command to list active "
|
||||
"objects in the cluster.\n---\n")
|
||||
|
||||
|
||||
class MemoryMonitor:
|
||||
|
||||
@@ -945,10 +945,40 @@ def stat(address):
|
||||
channel = grpc.insecure_channel(raylet_address)
|
||||
stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
|
||||
reply = stub.GetNodeStats(
|
||||
node_manager_pb2.GetNodeStatsRequest(), timeout=2.0)
|
||||
node_manager_pb2.GetNodeStatsRequest(include_memory_info=False),
|
||||
timeout=2.0)
|
||||
print(reply)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.option(
|
||||
"--address",
|
||||
required=False,
|
||||
type=str,
|
||||
help="Override the address to connect to.")
|
||||
def memory(address):
|
||||
if not address:
|
||||
address = services.find_redis_address_or_die()
|
||||
logger.info("Connecting to Ray instance at {}.".format(address))
|
||||
ray.init(address=address)
|
||||
print(ray.internal.internal_api.memory_summary())
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.option(
|
||||
"--address",
|
||||
required=False,
|
||||
type=str,
|
||||
help="Override the address to connect to.")
|
||||
def globalgc(address):
|
||||
if not address:
|
||||
address = services.find_redis_address_or_die()
|
||||
logger.info("Connecting to Ray instance at {}.".format(address))
|
||||
ray.init(address=address)
|
||||
ray.internal.internal_api.global_gc()
|
||||
print("Triggered gc.collect() on all workers.")
|
||||
|
||||
|
||||
cli.add_command(dashboard)
|
||||
cli.add_command(start)
|
||||
cli.add_command(stop)
|
||||
@@ -966,6 +996,8 @@ cli.add_command(get_worker_ips)
|
||||
cli.add_command(microbenchmark)
|
||||
cli.add_command(stack)
|
||||
cli.add_command(stat)
|
||||
cli.add_command(memory)
|
||||
cli.add_command(globalgc)
|
||||
cli.add_command(timeline)
|
||||
cli.add_command(project_cli)
|
||||
cli.add_command(session_cli)
|
||||
|
||||
@@ -22,6 +22,14 @@ py_test(
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_memstat",
|
||||
size = "small",
|
||||
srcs = ["test_memstat.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_iter",
|
||||
size = "medium",
|
||||
|
||||
@@ -0,0 +1,195 @@
|
||||
import ray
|
||||
import numpy as np
|
||||
import time
|
||||
from ray.internal.internal_api import memory_summary
|
||||
|
||||
# Unique strings.
|
||||
DRIVER_PID = "driver pid"
|
||||
WORKER_PID = "worker pid"
|
||||
UNKNOWN_SIZE = " ? "
|
||||
|
||||
# Reference status.
|
||||
PINNED_IN_MEMORY = "PINNED_IN_MEMORY"
|
||||
LOCAL_REF = "LOCAL_REFERENCE"
|
||||
USED_BY_PENDING_TASK = "USED_BY_PENDING_TASK"
|
||||
CAPTURED_IN_OBJECT = "CAPTURED_IN_OBJECT"
|
||||
|
||||
# Call sites.
|
||||
PUT_OBJ = "(put object)"
|
||||
TASK_CALL_OBJ = "(task call)"
|
||||
ACTOR_TASK_CALL_OBJ = "(actor call)"
|
||||
DESER_TASK_ARG = "(deserialize task arg)"
|
||||
DESER_ACTOR_TASK_ARG = "(deserialize actor task arg)"
|
||||
|
||||
|
||||
def data_lines(memory_str):
|
||||
for line in memory_str.split("\n"):
|
||||
if (not line or "---" in line or "===" in line or "Object ID" in line
|
||||
or "pid=" in line):
|
||||
continue
|
||||
yield line
|
||||
|
||||
|
||||
def num_objects(memory_str):
|
||||
n = 0
|
||||
for line in data_lines(memory_str):
|
||||
n += 1
|
||||
return n
|
||||
|
||||
|
||||
def count(memory_str, substr):
|
||||
n = 0
|
||||
for line in memory_str.split("\n"):
|
||||
if substr in line:
|
||||
n += 1
|
||||
return n
|
||||
|
||||
|
||||
def test_driver_put_ref(ray_start_regular):
|
||||
info = memory_summary()
|
||||
assert num_objects(info) == 0, info
|
||||
x_id = ray.put("HI")
|
||||
info = memory_summary()
|
||||
print(info)
|
||||
assert num_objects(info) == 1, info
|
||||
assert count(info, DRIVER_PID) == 1, info
|
||||
assert count(info, WORKER_PID) == 0, info
|
||||
del x_id
|
||||
info = memory_summary()
|
||||
assert num_objects(info) == 0, info
|
||||
|
||||
|
||||
def test_worker_task_refs(ray_start_regular):
|
||||
@ray.remote
|
||||
def f(y):
|
||||
x_id = ray.put("HI")
|
||||
info = memory_summary()
|
||||
del x_id
|
||||
return info
|
||||
|
||||
x_id = f.remote(np.zeros(100000))
|
||||
info = ray.get(x_id)
|
||||
print(info)
|
||||
assert num_objects(info) == 4, info
|
||||
# Task argument plus task return ids.
|
||||
assert count(info, TASK_CALL_OBJ) == 2, info
|
||||
assert count(info, DRIVER_PID) == 1, info
|
||||
assert count(info, WORKER_PID) == 1, info
|
||||
assert count(info, LOCAL_REF) == 2, info
|
||||
assert count(info, PINNED_IN_MEMORY) == 1, info
|
||||
assert count(info, PUT_OBJ) == 1, info
|
||||
assert count(info, DESER_TASK_ARG) == 1, info
|
||||
assert count(info, UNKNOWN_SIZE) == 1, info
|
||||
assert count(info, "test_memstat.py:f") == 1, info
|
||||
assert count(info, "test_memstat.py:test_worker_task_refs") == 2, info
|
||||
|
||||
info = memory_summary()
|
||||
print(info)
|
||||
assert num_objects(info) == 1, info
|
||||
assert count(info, DRIVER_PID) == 1, info
|
||||
assert count(info, TASK_CALL_OBJ) == 1, info
|
||||
assert count(info, UNKNOWN_SIZE) == 0, info
|
||||
assert count(info, x_id.hex()) == 1, info
|
||||
|
||||
del x_id
|
||||
info = memory_summary()
|
||||
assert num_objects(info) == 0, info
|
||||
|
||||
|
||||
def test_actor_task_refs(ray_start_regular):
|
||||
@ray.remote
|
||||
class Actor:
|
||||
def __init__(self):
|
||||
self.refs = []
|
||||
|
||||
def f(self, x):
|
||||
self.refs.append(x)
|
||||
return memory_summary()
|
||||
|
||||
def make_actor():
|
||||
return Actor.remote()
|
||||
|
||||
actor = make_actor()
|
||||
x_id = actor.f.remote(np.zeros(100000))
|
||||
info = ray.get(x_id)
|
||||
print(info)
|
||||
assert num_objects(info) == 4, info
|
||||
# Actor handle, task argument id, task return id.
|
||||
assert count(info, ACTOR_TASK_CALL_OBJ) == 3, info
|
||||
assert count(info, DRIVER_PID) == 1, info
|
||||
assert count(info, WORKER_PID) == 1, info
|
||||
assert count(info, LOCAL_REF) == 1, info
|
||||
assert count(info, PINNED_IN_MEMORY) == 1, info
|
||||
assert count(info, USED_BY_PENDING_TASK) == 2, info
|
||||
assert count(info, DESER_ACTOR_TASK_ARG) == 1, info
|
||||
assert count(info, "test_memstat.py:test_actor_task_refs") == 2, info
|
||||
assert count(info, "test_memstat.py:make_actor") == 1, info
|
||||
del x_id
|
||||
|
||||
# These should accumulate in the actor.
|
||||
for _ in range(5):
|
||||
ray.get(actor.f.remote([ray.put(np.zeros(100000))]))
|
||||
info = memory_summary()
|
||||
print(info)
|
||||
assert count(info, DESER_ACTOR_TASK_ARG) == 5, info
|
||||
assert count(info, ACTOR_TASK_CALL_OBJ) == 1, info
|
||||
|
||||
# Cleanup.
|
||||
del actor
|
||||
time.sleep(1)
|
||||
info = memory_summary()
|
||||
assert num_objects(info) == 0, info
|
||||
|
||||
|
||||
def test_nested_object_refs(ray_start_regular):
|
||||
x_id = ray.put(np.zeros(100000))
|
||||
y_id = ray.put([x_id])
|
||||
z_id = ray.put([y_id])
|
||||
del x_id, y_id
|
||||
info = memory_summary()
|
||||
print(info)
|
||||
assert num_objects(info) == 3, info
|
||||
assert count(info, LOCAL_REF) == 1, info
|
||||
assert count(info, CAPTURED_IN_OBJECT) == 2, info
|
||||
del z_id
|
||||
|
||||
|
||||
def test_pinned_object_call_site(ray_start_regular):
|
||||
# Local ref only.
|
||||
x_id = ray.put(np.zeros(100000))
|
||||
info = memory_summary()
|
||||
print(info)
|
||||
assert num_objects(info) == 1, info
|
||||
assert count(info, LOCAL_REF) == 1, info
|
||||
assert count(info, PINNED_IN_MEMORY) == 0, info
|
||||
assert count(info, "test_memstat.py") == 1, info
|
||||
|
||||
# Local ref + pinned buffer.
|
||||
buf = ray.get(x_id)
|
||||
info = memory_summary()
|
||||
print(info)
|
||||
assert num_objects(info) == 1, info
|
||||
assert count(info, LOCAL_REF) == 0, info
|
||||
assert count(info, PINNED_IN_MEMORY) == 1, info
|
||||
assert count(info, "test_memstat.py") == 1, info
|
||||
|
||||
# Just pinned buffer.
|
||||
del x_id
|
||||
info = memory_summary()
|
||||
print(info)
|
||||
assert num_objects(info) == 1, info
|
||||
assert count(info, LOCAL_REF) == 0, info
|
||||
assert count(info, PINNED_IN_MEMORY) == 1, info
|
||||
assert count(info, "test_memstat.py") == 1, info
|
||||
|
||||
# Nothing.
|
||||
del buf
|
||||
info = memory_summary()
|
||||
print(info)
|
||||
assert num_objects(info) == 0, info
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
@@ -1549,10 +1549,7 @@ def put(value, weakref=False):
|
||||
except ObjectStoreFullError:
|
||||
logger.info(
|
||||
"Put failed since the value was either too large or the "
|
||||
"store was full of pinned objects. If you are putting "
|
||||
"and holding references to a lot of object ids, consider "
|
||||
"ray.put(value, weakref=True) to allow object data to "
|
||||
"be evicted early.")
|
||||
"store was full of pinned objects.")
|
||||
raise
|
||||
return object_id
|
||||
|
||||
|
||||
Reference in New Issue
Block a user