mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 01:31:08 +08:00
[Object Spilling] Add consumed bytes to detect thrashing. (#13853)
This commit is contained in:
@@ -13,7 +13,9 @@ def global_gc():
|
||||
worker.core_worker.global_gc()
|
||||
|
||||
|
||||
def memory_summary(node_manager_address=None, node_manager_port=None):
|
||||
def memory_summary(node_manager_address=None,
|
||||
node_manager_port=None,
|
||||
stats_only=False):
|
||||
"""Returns a formatted string describing memory usage in the cluster."""
|
||||
|
||||
import grpc
|
||||
@@ -63,6 +65,11 @@ def memory_summary(node_manager_address=None, node_manager_port=None):
|
||||
reply.store_stats.restored_objects_total,
|
||||
int(reply.store_stats.restored_bytes_total / (1024 * 1024) /
|
||||
reply.store_stats.restore_time_total_s)))
|
||||
if reply.store_stats.consumed_bytes > 0:
|
||||
store_summary += ("Objects consumed by Ray tasks: {} MiB.".format(
|
||||
int(reply.store_stats.consumed_bytes / (1024 * 1024))))
|
||||
if stats_only:
|
||||
return store_summary
|
||||
return reply.memory_summary + "\n" + store_summary
|
||||
|
||||
|
||||
|
||||
@@ -1372,7 +1372,13 @@ def timeline(address):
|
||||
type=str,
|
||||
default=ray_constants.REDIS_DEFAULT_PASSWORD,
|
||||
help="Connect to ray with redis_password.")
|
||||
def memory(address, redis_password):
|
||||
@click.option(
|
||||
"--stats-only",
|
||||
is_flag=True,
|
||||
type=bool,
|
||||
default=False,
|
||||
help="Connect to ray with redis_password.")
|
||||
def memory(address, redis_password, stats_only):
|
||||
"""Print object references held in a Ray cluster."""
|
||||
if not address:
|
||||
address = services.get_ray_address_to_use_or_die()
|
||||
@@ -1381,7 +1387,8 @@ def memory(address, redis_password):
|
||||
raylet = state.node_table()[0]
|
||||
print(
|
||||
ray.internal.internal_api.memory_summary(raylet["NodeManagerAddress"],
|
||||
raylet["NodeManagerPort"]))
|
||||
raylet["NodeManagerPort"],
|
||||
stats_only))
|
||||
|
||||
|
||||
@cli.command()
|
||||
|
||||
@@ -27,7 +27,8 @@ DESER_ACTOR_TASK_ARG = "(deserialize actor task arg)"
|
||||
def data_lines(memory_str):
    """Yield only the data rows of a memory summary string.

    The summary is split on newlines. Empty lines are dropped, as is any
    line containing one of the markers below: separators, the column
    header, per-process headers and the aggregate object store statistics.

    Args:
        memory_str: The full memory summary text.

    Yields:
        Each remaining line, unchanged and in its original order.
    """
    # "Objects consumed" filters the consumed-bytes statistic line so it is
    # not mistaken for an object reference row.
    skip_markers = (
        "---",
        "===",
        "Object ID",
        "pid=",
        "Plasma memory",
        "Objects consumed",
    )
    for line in memory_str.split("\n"):
        if not line or any(marker in line for marker in skip_markers):
            continue
        yield line
|
||||
|
||||
|
||||
@@ -88,6 +88,27 @@ def is_dir_empty(temp_folder,
|
||||
return num_files == 0
|
||||
|
||||
|
||||
def assert_no_thrashing(address):
    """Assert that tasks consumed at least as much data as was restored.

    Connects a fresh GlobalState to ``address``, asks the first node in the
    node table for its stats-only memory summary, and parses the "Restored"
    and "consumed" figures out of that text. Either figure stays at 0 when
    its line is absent from the summary.

    Args:
        address: Address handed to ``GlobalState._initialize_global_state``.

    Raises:
        AssertionError: If the restored figure exceeds the consumed figure.
    """
    gcs_state = ray.state.GlobalState()
    gcs_state._initialize_global_state(
        address, ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    first_node = gcs_state.node_table()[0]
    stats_text = ray.internal.internal_api.memory_summary(
        first_node["NodeManagerAddress"],
        first_node["NodeManagerPort"],
        stats_only=True)

    # The summary prints whole MiB values, so the comparison is made at MiB
    # granularity.
    restored_mib = 0
    consumed_mib = 0
    for stat_line in stats_text.split("\n"):
        fields = stat_line.split(" ")
        if "Restored" in stat_line:
            # "Restored <N> MiB, ..." -> the number is the second field.
            restored_mib = int(fields[1])
        if "consumed" in stat_line:
            # "... tasks: <N> MiB." -> the number is the second-to-last field.
            consumed_mib = int(fields[-2])

    assert consumed_mib >= restored_mib, (
        f"consumed: {consumed_mib}, restored: {restored_mib}")
|
||||
|
||||
|
||||
def test_invalid_config_raises_exception(shutdown_only):
|
||||
# Make sure ray.init raises an exception before
|
||||
# it starts processes when invalid object spilling
|
||||
@@ -187,7 +208,7 @@ def test_spilling_not_done_for_pinned_object(object_spilling_config,
|
||||
shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
object_spilling_config, temp_folder = object_spilling_config
|
||||
ray.init(
|
||||
address = ray.init(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_system_config={
|
||||
"max_io_workers": 4,
|
||||
@@ -203,6 +224,7 @@ def test_spilling_not_done_for_pinned_object(object_spilling_config,
|
||||
ref2 = ray.put(arr) # noqa
|
||||
|
||||
wait_for_condition(lambda: is_dir_empty(temp_folder))
|
||||
assert_no_thrashing(address["redis_address"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -249,6 +271,7 @@ def test_spill_remote_object(ray_start_cluster,
|
||||
|
||||
# Test passing the spilled object as an arg to another task.
|
||||
ray.get(depends.remote(ref))
|
||||
assert_no_thrashing(cluster.address)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -256,7 +279,7 @@ def test_spill_remote_object(ray_start_cluster,
|
||||
def test_spill_objects_automatically(object_spilling_config, shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
object_spilling_config, _ = object_spilling_config
|
||||
ray.init(
|
||||
address = ray.init(
|
||||
num_cpus=1,
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_system_config={
|
||||
@@ -287,14 +310,15 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only):
|
||||
solution = solution_buffer[index]
|
||||
sample = ray.get(ref, timeout=0)
|
||||
assert np.array_equal(sample, solution)
|
||||
assert_no_thrashing(address["redis_address"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
platform.system() in ["Darwin", "Windows"], reason="Failing on Windows.")
|
||||
platform.system() in ["Windows", "Darwin"], reason="Failing on Windows.")
|
||||
def test_spill_stats(object_spilling_config, shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
object_spilling_config, _ = object_spilling_config
|
||||
ray.init(
|
||||
address = ray.init(
|
||||
num_cpus=1,
|
||||
object_store_memory=100 * 1024 * 1024,
|
||||
_system_config={
|
||||
@@ -319,17 +343,31 @@ def test_spill_stats(object_spilling_config, shutdown_only):
|
||||
|
||||
x_id = f.remote() # noqa
|
||||
ray.get(x_id)
|
||||
s = memory_summary()
|
||||
s = memory_summary(stats_only=True)
|
||||
assert "Plasma memory usage 50 MiB, 1 objects, 50.0% full" in s, s
|
||||
assert "Spilled 200 MiB, 4 objects" in s, s
|
||||
assert "Restored 150 MiB, 3 objects" in s, s
|
||||
|
||||
# Test if consumed bytes are correctly calculated.
|
||||
obj = ray.put(np.zeros(30 * 1024 * 1024, dtype=np.uint8))
|
||||
|
||||
@ray.remote
|
||||
def func_with_ref(obj):
|
||||
return True
|
||||
|
||||
ray.get(func_with_ref.remote(obj))
|
||||
|
||||
s = memory_summary(stats_only=True)
|
||||
# 50MB * 5 references + 30MB used for task execution.
|
||||
assert "Objects consumed by Ray tasks: 280 MiB." in s, s
|
||||
assert_no_thrashing(address["redis_address"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
platform.system() == "Windows", reason="Failing on Windows.")
|
||||
def test_spill_during_get(object_spilling_config, shutdown_only):
|
||||
object_spilling_config, _ = object_spilling_config
|
||||
ray.init(
|
||||
address = ray.init(
|
||||
num_cpus=4,
|
||||
object_store_memory=100 * 1024 * 1024,
|
||||
_system_config={
|
||||
@@ -355,6 +393,7 @@ def test_spill_during_get(object_spilling_config, shutdown_only):
|
||||
# objects are being created.
|
||||
for x in ids:
|
||||
print(ray.get(x).shape)
|
||||
assert_no_thrashing(address["redis_address"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -362,7 +401,7 @@ def test_spill_during_get(object_spilling_config, shutdown_only):
|
||||
def test_spill_deadlock(object_spilling_config, shutdown_only):
|
||||
object_spilling_config, _ = object_spilling_config
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
ray.init(
|
||||
address = ray.init(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_system_config={
|
||||
"max_io_workers": 1,
|
||||
@@ -386,6 +425,7 @@ def test_spill_deadlock(object_spilling_config, shutdown_only):
|
||||
ref = random.choice(replay_buffer)
|
||||
sample = ray.get(ref, timeout=0)
|
||||
assert np.array_equal(sample, arr)
|
||||
assert_no_thrashing(address["redis_address"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -394,7 +434,7 @@ def test_delete_objects(object_spilling_config, shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
object_spilling_config, temp_folder = object_spilling_config
|
||||
|
||||
ray.init(
|
||||
address = ray.init(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_system_config={
|
||||
"max_io_workers": 1,
|
||||
@@ -417,6 +457,7 @@ def test_delete_objects(object_spilling_config, shutdown_only):
|
||||
del replay_buffer
|
||||
del ref
|
||||
wait_for_condition(lambda: is_dir_empty(temp_folder))
|
||||
assert_no_thrashing(address["redis_address"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -426,7 +467,7 @@ def test_delete_objects_delete_while_creating(object_spilling_config,
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
object_spilling_config, temp_folder = object_spilling_config
|
||||
|
||||
ray.init(
|
||||
address = ray.init(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_system_config={
|
||||
"max_io_workers": 4,
|
||||
@@ -457,6 +498,7 @@ def test_delete_objects_delete_while_creating(object_spilling_config,
|
||||
del replay_buffer
|
||||
del ref
|
||||
wait_for_condition(lambda: is_dir_empty(temp_folder))
|
||||
assert_no_thrashing(address["redis_address"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -466,7 +508,7 @@ def test_delete_objects_on_worker_failure(object_spilling_config,
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
object_spilling_config, temp_folder = object_spilling_config
|
||||
|
||||
ray.init(
|
||||
address = ray.init(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_system_config={
|
||||
"max_io_workers": 4,
|
||||
@@ -518,6 +560,7 @@ def test_delete_objects_on_worker_failure(object_spilling_config,
|
||||
|
||||
# After all, make sure all objects are deleted upon worker failures.
|
||||
wait_for_condition(lambda: is_dir_empty(temp_folder))
|
||||
assert_no_thrashing(address["redis_address"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -539,10 +582,11 @@ def test_delete_objects_multi_node(multi_node_object_spilling_config,
|
||||
"object_store_full_delay_ms": 100,
|
||||
"object_spilling_config": object_spilling_config,
|
||||
})
|
||||
ray.init(address=cluster.address)
|
||||
# Add 2 worker nodes.
|
||||
for _ in range(2):
|
||||
cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)
|
||||
ray.init(address=cluster.address)
|
||||
cluster.wait_for_nodes()
|
||||
|
||||
arr = np.random.rand(1024 * 1024) # 8 MB data
|
||||
|
||||
@@ -565,9 +609,9 @@ def test_delete_objects_multi_node(multi_node_object_spilling_config,
|
||||
self.replay_buffer.pop()
|
||||
|
||||
# Do random sampling.
|
||||
for _ in range(200):
|
||||
for _ in range(50):
|
||||
ref = random.choice(self.replay_buffer)
|
||||
sample = ray.get(ref, timeout=0)
|
||||
sample = ray.get(ref, timeout=10)
|
||||
assert np.array_equal(sample, arr)
|
||||
|
||||
actors = [Actor.remote() for _ in range(3)]
|
||||
@@ -586,6 +630,7 @@ def test_delete_objects_multi_node(multi_node_object_spilling_config,
|
||||
wait_for_condition(lambda: wait_until_actor_dead(actor))
|
||||
# The multi node deletion should work.
|
||||
wait_for_condition(lambda: is_dir_empty(temp_folder))
|
||||
assert_no_thrashing(cluster.address)
|
||||
|
||||
|
||||
@pytest.mark.skipif(platform.system() == "Windows", reason="Flaky on Windows.")
|
||||
@@ -593,7 +638,7 @@ def test_fusion_objects(object_spilling_config, shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
object_spilling_config, temp_folder = object_spilling_config
|
||||
min_spilling_size = 10 * 1024 * 1024
|
||||
ray.init(
|
||||
address = ray.init(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_system_config={
|
||||
"max_io_workers": 3,
|
||||
@@ -637,12 +682,13 @@ def test_fusion_objects(object_spilling_config, shutdown_only):
|
||||
if file_size >= min_spilling_size:
|
||||
is_test_passing = True
|
||||
assert is_test_passing
|
||||
assert_no_thrashing(address["redis_address"])
|
||||
|
||||
|
||||
# https://github.com/ray-project/ray/issues/12912
|
||||
def do_test_release_resource(object_spilling_config, expect_released):
|
||||
object_spilling_config, temp_folder = object_spilling_config
|
||||
ray.init(
|
||||
address = ray.init(
|
||||
num_cpus=1,
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_system_config={
|
||||
@@ -674,6 +720,7 @@ def do_test_release_resource(object_spilling_config, expect_released):
|
||||
assert ready
|
||||
else:
|
||||
assert not ready
|
||||
assert_no_thrashing(address["redis_address"])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -745,6 +792,7 @@ def test_spill_objects_on_object_transfer(object_spilling_config,
|
||||
# spilling.
|
||||
tasks = [foo.remote(*task_args) for task_args in args]
|
||||
ray.get(tasks)
|
||||
assert_no_thrashing(cluster.address)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -801,14 +849,6 @@ os.kill(os.getpid(), sig)
|
||||
driver.format(temp_dir=str(temp_folder), signum=2)))
|
||||
wait_for_condition(lambda: is_dir_empty(temp_folder, append_path=""))
|
||||
|
||||
# Q: Looks like Sigterm doesn't work with Ray?
|
||||
# print("Sending sigterm...")
|
||||
# # Run a driver with sigterm.
|
||||
# with pytest.raises(subprocess.CalledProcessError):
|
||||
# print(run_string_as_driver(
|
||||
# driver.format(temp_dir=str(temp_folder), signum=15)))
|
||||
# wait_for_condition(is_dir_empty, timeout=1000)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(pytest.main(["-sv", __file__]))
|
||||
|
||||
Reference in New Issue
Block a user