[Object Spilling] Add consumed bytes to detect thrashing. (#13853)

This commit is contained in:
SangBin Cho
2021-02-03 14:16:26 -08:00
committed by GitHub
parent 77ee2c569f
commit cb9fa90203
20 changed files with 172 additions and 67 deletions
+8 -1
View File
@@ -13,7 +13,9 @@ def global_gc():
worker.core_worker.global_gc()
def memory_summary(node_manager_address=None, node_manager_port=None):
def memory_summary(node_manager_address=None,
node_manager_port=None,
stats_only=False):
"""Returns a formatted string describing memory usage in the cluster."""
import grpc
@@ -63,6 +65,11 @@ def memory_summary(node_manager_address=None, node_manager_port=None):
reply.store_stats.restored_objects_total,
int(reply.store_stats.restored_bytes_total / (1024 * 1024) /
reply.store_stats.restore_time_total_s)))
if reply.store_stats.consumed_bytes > 0:
store_summary += ("Objects consumed by Ray tasks: {} MiB.".format(
int(reply.store_stats.consumed_bytes / (1024 * 1024))))
if stats_only:
return store_summary
return reply.memory_summary + "\n" + store_summary
+9 -2
View File
@@ -1372,7 +1372,13 @@ def timeline(address):
type=str,
default=ray_constants.REDIS_DEFAULT_PASSWORD,
help="Connect to ray with redis_password.")
def memory(address, redis_password):
@click.option(
"--stats-only",
is_flag=True,
type=bool,
default=False,
help="Connect to ray with redis_password.")
def memory(address, redis_password, stats_only):
"""Print object references held in a Ray cluster."""
if not address:
address = services.get_ray_address_to_use_or_die()
@@ -1381,7 +1387,8 @@ def memory(address, redis_password):
raylet = state.node_table()[0]
print(
ray.internal.internal_api.memory_summary(raylet["NodeManagerAddress"],
raylet["NodeManagerPort"]))
raylet["NodeManagerPort"],
stats_only))
@cli.command()
+2 -1
View File
@@ -27,7 +27,8 @@ DESER_ACTOR_TASK_ARG = "(deserialize actor task arg)"
def data_lines(memory_str):
    """Yield the lines of a memory summary that are actual data rows.

    Empty lines and any line containing one of the known non-data markers
    (separator rules, the table header, per-process headers, and the
    object store stats lines such as "Plasma memory ..." and
    "Objects consumed ...") are dropped.

    Args:
        memory_str: The memory summary text, with rows separated by "\\n".

    Yields:
        Each remaining line, unchanged and in its original order.
    """
    # Substrings that mark a line as decoration or stats rather than a row.
    skip_markers = (
        "---",
        "===",
        "Object ID",
        "pid=",
        "Plasma memory",
        "Objects consumed",
    )
    for line in memory_str.split("\n"):
        if not line or any(marker in line for marker in skip_markers):
            continue
        yield line
+63 -23
View File
@@ -88,6 +88,27 @@ def is_dir_empty(temp_folder,
return num_files == 0
def assert_no_thrashing(address):
    """Assert that tasks consumed at least as much data as was restored.

    Connects to the cluster's global state at ``address``, asks the first
    node in the node table for its stats-only memory summary, and parses
    the "Restored ..." and "... consumed ..." lines out of that text.
    Restoring more than was ever consumed is treated as thrashing.

    Args:
        address: Address passed to ``GlobalState._initialize_global_state``
            (callers pass the redis address of the cluster).

    Raises:
        AssertionError: If the consumed amount is smaller than the
            restored amount.
    """
    global_state = ray.state.GlobalState()
    global_state._initialize_global_state(
        address, ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    first_node = global_state.node_table()[0]
    stats_text = ray.internal.internal_api.memory_summary(
        first_node["NodeManagerAddress"],
        first_node["NodeManagerPort"],
        stats_only=True)

    # Both figures are the whole-MiB numbers printed in the summary; a
    # missing line leaves the corresponding value at zero.
    restored_mib = 0
    consumed_mib = 0
    for stats_line in stats_text.split("\n"):
        tokens = stats_line.split(" ")
        if "Restored" in stats_line:
            # "Restored <N> MiB, ..." -> the number is the second token.
            restored_mib = int(tokens[1])
        if "consumed" in stats_line:
            # "... tasks: <N> MiB." -> the number is second from the end.
            consumed_mib = int(tokens[-2])

    assert consumed_mib >= restored_mib, (
        f"consumed: {consumed_mib}, restored: {restored_mib}")
def test_invalid_config_raises_exception(shutdown_only):
# Make sure ray.init raises an exception before
# it starts processes when invalid object spilling
@@ -187,7 +208,7 @@ def test_spilling_not_done_for_pinned_object(object_spilling_config,
shutdown_only):
# Limit our object store to 75 MiB of memory.
object_spilling_config, temp_folder = object_spilling_config
ray.init(
address = ray.init(
object_store_memory=75 * 1024 * 1024,
_system_config={
"max_io_workers": 4,
@@ -203,6 +224,7 @@ def test_spilling_not_done_for_pinned_object(object_spilling_config,
ref2 = ray.put(arr) # noqa
wait_for_condition(lambda: is_dir_empty(temp_folder))
assert_no_thrashing(address["redis_address"])
@pytest.mark.skipif(
@@ -249,6 +271,7 @@ def test_spill_remote_object(ray_start_cluster,
# Test passing the spilled object as an arg to another task.
ray.get(depends.remote(ref))
assert_no_thrashing(cluster.address)
@pytest.mark.skipif(
@@ -256,7 +279,7 @@ def test_spill_remote_object(ray_start_cluster,
def test_spill_objects_automatically(object_spilling_config, shutdown_only):
# Limit our object store to 75 MiB of memory.
object_spilling_config, _ = object_spilling_config
ray.init(
address = ray.init(
num_cpus=1,
object_store_memory=75 * 1024 * 1024,
_system_config={
@@ -287,14 +310,15 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only):
solution = solution_buffer[index]
sample = ray.get(ref, timeout=0)
assert np.array_equal(sample, solution)
assert_no_thrashing(address["redis_address"])
@pytest.mark.skipif(
platform.system() in ["Darwin", "Windows"], reason="Failing on Windows.")
platform.system() in ["Windows", "Darwin"], reason="Failing on Windows.")
def test_spill_stats(object_spilling_config, shutdown_only):
# Limit our object store to 75 MiB of memory.
object_spilling_config, _ = object_spilling_config
ray.init(
address = ray.init(
num_cpus=1,
object_store_memory=100 * 1024 * 1024,
_system_config={
@@ -319,17 +343,31 @@ def test_spill_stats(object_spilling_config, shutdown_only):
x_id = f.remote() # noqa
ray.get(x_id)
s = memory_summary()
s = memory_summary(stats_only=True)
assert "Plasma memory usage 50 MiB, 1 objects, 50.0% full" in s, s
assert "Spilled 200 MiB, 4 objects" in s, s
assert "Restored 150 MiB, 3 objects" in s, s
# Test if consumed bytes are correctly calculated.
obj = ray.put(np.zeros(30 * 1024 * 1024, dtype=np.uint8))
@ray.remote
def func_with_ref(obj):
return True
ray.get(func_with_ref.remote(obj))
s = memory_summary(stats_only=True)
# 50MB * 5 references + 30MB used for task execution.
assert "Objects consumed by Ray tasks: 280 MiB." in s, s
assert_no_thrashing(address["redis_address"])
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
def test_spill_during_get(object_spilling_config, shutdown_only):
object_spilling_config, _ = object_spilling_config
ray.init(
address = ray.init(
num_cpus=4,
object_store_memory=100 * 1024 * 1024,
_system_config={
@@ -355,6 +393,7 @@ def test_spill_during_get(object_spilling_config, shutdown_only):
# objects are being created.
for x in ids:
print(ray.get(x).shape)
assert_no_thrashing(address["redis_address"])
@pytest.mark.skipif(
@@ -362,7 +401,7 @@ def test_spill_during_get(object_spilling_config, shutdown_only):
def test_spill_deadlock(object_spilling_config, shutdown_only):
object_spilling_config, _ = object_spilling_config
# Limit our object store to 75 MiB of memory.
ray.init(
address = ray.init(
object_store_memory=75 * 1024 * 1024,
_system_config={
"max_io_workers": 1,
@@ -386,6 +425,7 @@ def test_spill_deadlock(object_spilling_config, shutdown_only):
ref = random.choice(replay_buffer)
sample = ray.get(ref, timeout=0)
assert np.array_equal(sample, arr)
assert_no_thrashing(address["redis_address"])
@pytest.mark.skipif(
@@ -394,7 +434,7 @@ def test_delete_objects(object_spilling_config, shutdown_only):
# Limit our object store to 75 MiB of memory.
object_spilling_config, temp_folder = object_spilling_config
ray.init(
address = ray.init(
object_store_memory=75 * 1024 * 1024,
_system_config={
"max_io_workers": 1,
@@ -417,6 +457,7 @@ def test_delete_objects(object_spilling_config, shutdown_only):
del replay_buffer
del ref
wait_for_condition(lambda: is_dir_empty(temp_folder))
assert_no_thrashing(address["redis_address"])
@pytest.mark.skipif(
@@ -426,7 +467,7 @@ def test_delete_objects_delete_while_creating(object_spilling_config,
# Limit our object store to 75 MiB of memory.
object_spilling_config, temp_folder = object_spilling_config
ray.init(
address = ray.init(
object_store_memory=75 * 1024 * 1024,
_system_config={
"max_io_workers": 4,
@@ -457,6 +498,7 @@ def test_delete_objects_delete_while_creating(object_spilling_config,
del replay_buffer
del ref
wait_for_condition(lambda: is_dir_empty(temp_folder))
assert_no_thrashing(address["redis_address"])
@pytest.mark.skipif(
@@ -466,7 +508,7 @@ def test_delete_objects_on_worker_failure(object_spilling_config,
# Limit our object store to 75 MiB of memory.
object_spilling_config, temp_folder = object_spilling_config
ray.init(
address = ray.init(
object_store_memory=75 * 1024 * 1024,
_system_config={
"max_io_workers": 4,
@@ -518,6 +560,7 @@ def test_delete_objects_on_worker_failure(object_spilling_config,
# After all, make sure all objects are deleted upon worker failures.
wait_for_condition(lambda: is_dir_empty(temp_folder))
assert_no_thrashing(address["redis_address"])
@pytest.mark.skipif(
@@ -539,10 +582,11 @@ def test_delete_objects_multi_node(multi_node_object_spilling_config,
"object_store_full_delay_ms": 100,
"object_spilling_config": object_spilling_config,
})
ray.init(address=cluster.address)
# Add 2 worker nodes.
for _ in range(2):
cluster.add_node(num_cpus=1, object_store_memory=75 * 1024 * 1024)
ray.init(address=cluster.address)
cluster.wait_for_nodes()
arr = np.random.rand(1024 * 1024) # 8 MB data
@@ -565,9 +609,9 @@ def test_delete_objects_multi_node(multi_node_object_spilling_config,
self.replay_buffer.pop()
# Do random sampling.
for _ in range(200):
for _ in range(50):
ref = random.choice(self.replay_buffer)
sample = ray.get(ref, timeout=0)
sample = ray.get(ref, timeout=10)
assert np.array_equal(sample, arr)
actors = [Actor.remote() for _ in range(3)]
@@ -586,6 +630,7 @@ def test_delete_objects_multi_node(multi_node_object_spilling_config,
wait_for_condition(lambda: wait_until_actor_dead(actor))
# The multi node deletion should work.
wait_for_condition(lambda: is_dir_empty(temp_folder))
assert_no_thrashing(cluster.address)
@pytest.mark.skipif(platform.system() == "Windows", reason="Flaky on Windows.")
@@ -593,7 +638,7 @@ def test_fusion_objects(object_spilling_config, shutdown_only):
# Limit our object store to 75 MiB of memory.
object_spilling_config, temp_folder = object_spilling_config
min_spilling_size = 10 * 1024 * 1024
ray.init(
address = ray.init(
object_store_memory=75 * 1024 * 1024,
_system_config={
"max_io_workers": 3,
@@ -637,12 +682,13 @@ def test_fusion_objects(object_spilling_config, shutdown_only):
if file_size >= min_spilling_size:
is_test_passing = True
assert is_test_passing
assert_no_thrashing(address["redis_address"])
# https://github.com/ray-project/ray/issues/12912
def do_test_release_resource(object_spilling_config, expect_released):
object_spilling_config, temp_folder = object_spilling_config
ray.init(
address = ray.init(
num_cpus=1,
object_store_memory=75 * 1024 * 1024,
_system_config={
@@ -674,6 +720,7 @@ def do_test_release_resource(object_spilling_config, expect_released):
assert ready
else:
assert not ready
assert_no_thrashing(address["redis_address"])
@pytest.mark.skipif(
@@ -745,6 +792,7 @@ def test_spill_objects_on_object_transfer(object_spilling_config,
# spilling.
tasks = [foo.remote(*task_args) for task_args in args]
ray.get(tasks)
assert_no_thrashing(cluster.address)
@pytest.mark.skipif(
@@ -801,14 +849,6 @@ os.kill(os.getpid(), sig)
driver.format(temp_dir=str(temp_folder), signum=2)))
wait_for_condition(lambda: is_dir_empty(temp_folder, append_path=""))
# Q: Looks like Sigterm doesn't work with Ray?
# print("Sending sigterm...")
# # Run a driver with sigterm.
# with pytest.raises(subprocess.CalledProcessError):
# print(run_string_as_driver(
# driver.format(temp_dir=str(temp_folder), signum=15)))
# wait_for_condition(is_dir_empty, timeout=1000)
if __name__ == "__main__":
sys.exit(pytest.main(["-sv", __file__]))