mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 19:48:31 +08:00
[Object Spilling] Multi node file spilling V2. (#13542)
* done. * done. * Fix a mistake. * Ready. * Fix issues. * fix. * Finished the first round of code review. * formatting. * In progress. * Formatting. * Addressed code review. * Formatting * Fix tests. * fix bugs. * Skip flaky tests for now.
This commit is contained in:
@@ -345,6 +345,10 @@ def setup_external_storage(config):
|
||||
elif storage_type == "smart_open":
|
||||
_external_storage = ExternalStorageSmartOpenImpl(
|
||||
**config["params"])
|
||||
elif storage_type == "mock_distributed_fs":
|
||||
# This storage is used to unit test distributed external storages.
|
||||
# TODO(sang): Delete it after introducing the mock S3 test.
|
||||
_external_storage = FileSystemStorage(**config["params"])
|
||||
else:
|
||||
raise ValueError(f"Unknown external storage type: {storage_type}")
|
||||
else:
|
||||
|
||||
@@ -330,3 +330,6 @@ class RayParams:
|
||||
# Validate external storage usage.
|
||||
external_storage.setup_external_storage(object_spilling_config)
|
||||
external_storage.reset_external_storage()
|
||||
# Configure the proper system config.
|
||||
self._system_config["is_external_storage_type_fs"] = (
|
||||
object_spilling_config["type"] == "filesystem")
|
||||
|
||||
@@ -53,7 +53,6 @@ py_test_module_list(
|
||||
"test_multinode_failures_2.py",
|
||||
"test_multiprocessing.py",
|
||||
"test_object_manager.py",
|
||||
"test_object_spilling.py",
|
||||
"test_output.py",
|
||||
"test_reconstruction.py",
|
||||
"test_reference_counting.py",
|
||||
@@ -134,6 +133,7 @@ py_test_module_list(
|
||||
py_test_module_list(
|
||||
files = [
|
||||
"test_placement_group.py",
|
||||
"test_object_spilling.py",
|
||||
],
|
||||
size = "large",
|
||||
extra_srcs = SRCS,
|
||||
|
||||
@@ -21,6 +21,15 @@ file_system_object_spilling_config = {
|
||||
"directory_path": spill_local_path
|
||||
}
|
||||
}
|
||||
# Since we have different protocols for a local external storage (e.g., fs)
|
||||
# and distributed external storage (e.g., S3), we need to test both cases.
|
||||
# This mocks the distributed fs with cluster utils.
|
||||
mock_distributed_fs_object_spilling_config = {
|
||||
"type": "mock_distributed_fs",
|
||||
"params": {
|
||||
"directory_path": spill_local_path
|
||||
}
|
||||
}
|
||||
smart_open_object_spilling_config = {
|
||||
"type": "smart_open",
|
||||
"params": {
|
||||
@@ -29,6 +38,15 @@ smart_open_object_spilling_config = {
|
||||
}
|
||||
|
||||
|
||||
def create_object_spilling_config(request, tmp_path):
|
||||
if (request.param["type"] == "filesystem"
|
||||
or request.param["type"] == "mock_distributed_fs"):
|
||||
temp_folder = tmp_path / "spill"
|
||||
temp_folder.mkdir()
|
||||
request.param["params"]["directory_path"] = str(temp_folder)
|
||||
return json.dumps(request.param), temp_folder
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
scope="function",
|
||||
params=[
|
||||
@@ -36,10 +54,18 @@ smart_open_object_spilling_config = {
|
||||
# TODO(sang): Add a mock dependency to test S3.
|
||||
# smart_open_object_spilling_config,
|
||||
])
|
||||
def object_spilling_config(request, tmpdir):
|
||||
if request.param["type"] == "filesystem":
|
||||
request.param["params"]["directory_path"] = str(tmpdir)
|
||||
yield json.dumps(request.param)
|
||||
def object_spilling_config(request, tmp_path):
|
||||
yield create_object_spilling_config(request, tmp_path)
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
scope="function",
|
||||
params=[
|
||||
file_system_object_spilling_config,
|
||||
mock_distributed_fs_object_spilling_config
|
||||
])
|
||||
def multi_node_object_spilling_config(request, tmp_path):
|
||||
yield create_object_spilling_config(request, tmp_path)
|
||||
|
||||
|
||||
def test_invalid_config_raises_exception(shutdown_only):
|
||||
@@ -75,22 +101,17 @@ def test_url_generation_and_parse():
|
||||
|
||||
@pytest.mark.skipif(
|
||||
platform.system() == "Windows", reason="Failing on Windows.")
|
||||
def test_spilling_not_done_for_pinned_object(tmp_path, shutdown_only):
|
||||
def test_spilling_not_done_for_pinned_object(object_spilling_config,
|
||||
shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
temp_folder = tmp_path / "spill"
|
||||
temp_folder.mkdir()
|
||||
object_spilling_config, temp_folder = object_spilling_config
|
||||
ray.init(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_system_config={
|
||||
"max_io_workers": 4,
|
||||
"automatic_object_spilling_enabled": True,
|
||||
"object_store_full_delay_ms": 100,
|
||||
"object_spilling_config": json.dumps({
|
||||
"type": "filesystem",
|
||||
"params": {
|
||||
"directory_path": str(temp_folder)
|
||||
}
|
||||
}),
|
||||
"object_spilling_config": object_spilling_config,
|
||||
"min_spilling_size": 0,
|
||||
})
|
||||
arr = np.random.rand(5 * 1024 * 1024) # 40 MB
|
||||
@@ -110,27 +131,23 @@ def test_spilling_not_done_for_pinned_object(tmp_path, shutdown_only):
|
||||
|
||||
@pytest.mark.skipif(
|
||||
platform.system() == "Windows", reason="Failing on Windows.")
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [{
|
||||
"num_cpus": 0,
|
||||
"object_store_memory": 75 * 1024 * 1024,
|
||||
"_system_config": {
|
||||
def test_spill_remote_object(ray_start_cluster,
|
||||
multi_node_object_spilling_config):
|
||||
cluster = ray_start_cluster
|
||||
object_spilling_config, _ = multi_node_object_spilling_config
|
||||
cluster.add_node(
|
||||
num_cpus=0,
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_system_config={
|
||||
"automatic_object_spilling_enabled": True,
|
||||
"object_store_full_delay_ms": 100,
|
||||
"max_io_workers": 4,
|
||||
"object_spilling_config": json.dumps({
|
||||
"type": "filesystem",
|
||||
"params": {
|
||||
"directory_path": "/tmp"
|
||||
}
|
||||
}),
|
||||
"object_spilling_config": object_spilling_config,
|
||||
"min_spilling_size": 0,
|
||||
},
|
||||
}],
|
||||
indirect=True)
|
||||
def test_spill_remote_object(ray_start_cluster_head):
|
||||
cluster = ray_start_cluster_head
|
||||
})
|
||||
ray.init(address=cluster.address)
|
||||
cluster.add_node(object_store_memory=75 * 1024 * 1024)
|
||||
cluster.wait_for_nodes()
|
||||
|
||||
@ray.remote
|
||||
def put():
|
||||
@@ -162,6 +179,7 @@ def test_spill_remote_object(ray_start_cluster_head):
|
||||
platform.system() == "Windows", reason="Failing on Windows.")
|
||||
def test_spill_objects_automatically(object_spilling_config, shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
object_spilling_config, _ = object_spilling_config
|
||||
ray.init(
|
||||
num_cpus=1,
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
@@ -197,10 +215,9 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only):
|
||||
|
||||
@pytest.mark.skipif(
|
||||
platform.system() == "Windows", reason="Failing on Windows.")
|
||||
def test_spill_stats(tmp_path, shutdown_only):
|
||||
def test_spill_stats(object_spilling_config, shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
temp_folder = tmp_path / "spill"
|
||||
temp_folder.mkdir()
|
||||
object_spilling_config, _ = object_spilling_config
|
||||
ray.init(
|
||||
num_cpus=1,
|
||||
object_store_memory=100 * 1024 * 1024,
|
||||
@@ -208,14 +225,7 @@ def test_spill_stats(tmp_path, shutdown_only):
|
||||
"automatic_object_spilling_enabled": True,
|
||||
"max_io_workers": 100,
|
||||
"min_spilling_size": 1,
|
||||
"object_spilling_config": json.dumps(
|
||||
{
|
||||
"type": "filesystem",
|
||||
"params": {
|
||||
"directory_path": str(temp_folder)
|
||||
}
|
||||
},
|
||||
separators=(",", ":"))
|
||||
"object_spilling_config": object_spilling_config
|
||||
},
|
||||
)
|
||||
|
||||
@@ -242,6 +252,7 @@ def test_spill_stats(tmp_path, shutdown_only):
|
||||
@pytest.mark.skipif(
|
||||
platform.system() == "Windows", reason="Failing on Windows.")
|
||||
def test_spill_during_get(object_spilling_config, shutdown_only):
|
||||
object_spilling_config, _ = object_spilling_config
|
||||
ray.init(
|
||||
num_cpus=4,
|
||||
object_store_memory=100 * 1024 * 1024,
|
||||
@@ -273,6 +284,7 @@ def test_spill_during_get(object_spilling_config, shutdown_only):
|
||||
@pytest.mark.skipif(
|
||||
platform.system() == "Windows", reason="Failing on Windows.")
|
||||
def test_spill_deadlock(object_spilling_config, shutdown_only):
|
||||
object_spilling_config, _ = object_spilling_config
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
ray.init(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
@@ -302,10 +314,9 @@ def test_spill_deadlock(object_spilling_config, shutdown_only):
|
||||
|
||||
@pytest.mark.skipif(
|
||||
platform.system() == "Windows", reason="Failing on Windows.")
|
||||
def test_delete_objects(tmp_path, shutdown_only):
|
||||
def test_delete_objects(object_spilling_config, shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
temp_folder = tmp_path / "spill"
|
||||
temp_folder.mkdir()
|
||||
object_spilling_config, temp_folder = object_spilling_config
|
||||
ray.init(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_system_config={
|
||||
@@ -313,12 +324,7 @@ def test_delete_objects(tmp_path, shutdown_only):
|
||||
"min_spilling_size": 0,
|
||||
"automatic_object_spilling_enabled": True,
|
||||
"object_store_full_delay_ms": 100,
|
||||
"object_spilling_config": json.dumps({
|
||||
"type": "filesystem",
|
||||
"params": {
|
||||
"directory_path": str(temp_folder)
|
||||
}
|
||||
}),
|
||||
"object_spilling_config": object_spilling_config,
|
||||
})
|
||||
arr = np.random.rand(1024 * 1024) # 8 MB data
|
||||
replay_buffer = []
|
||||
@@ -343,13 +349,11 @@ def test_delete_objects(tmp_path, shutdown_only):
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
platform.system() in ["Windows", "Darwin"],
|
||||
reason="Failing on "
|
||||
"Windows and Mac.")
|
||||
def test_delete_objects_delete_while_creating(tmp_path, shutdown_only):
|
||||
platform.system() in ["Windows", "Darwin"], reason="Failing on Windows.")
|
||||
def test_delete_objects_delete_while_creating(object_spilling_config,
|
||||
shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
temp_folder = tmp_path / "spill"
|
||||
temp_folder.mkdir()
|
||||
object_spilling_config, temp_folder = object_spilling_config
|
||||
ray.init(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_system_config={
|
||||
@@ -357,12 +361,7 @@ def test_delete_objects_delete_while_creating(tmp_path, shutdown_only):
|
||||
"min_spilling_size": 0,
|
||||
"automatic_object_spilling_enabled": True,
|
||||
"object_store_full_delay_ms": 100,
|
||||
"object_spilling_config": json.dumps({
|
||||
"type": "filesystem",
|
||||
"params": {
|
||||
"directory_path": str(temp_folder)
|
||||
}
|
||||
}),
|
||||
"object_spilling_config": object_spilling_config,
|
||||
})
|
||||
arr = np.random.rand(1024 * 1024) # 8 MB data
|
||||
replay_buffer = []
|
||||
@@ -395,25 +394,18 @@ def test_delete_objects_delete_while_creating(tmp_path, shutdown_only):
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
platform.system() in ["Windows", "Darwin"],
|
||||
reason="Failing on Windows "
|
||||
"and Mac.")
|
||||
def test_delete_objects_on_worker_failure(tmp_path, shutdown_only):
|
||||
platform.system() in ["Windows", "Darwin"], reason="Failing on Windows.")
|
||||
def test_delete_objects_on_worker_failure(object_spilling_config,
|
||||
shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
temp_folder = tmp_path / "spill"
|
||||
temp_folder.mkdir()
|
||||
object_spilling_config, temp_folder = object_spilling_config
|
||||
ray.init(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
_system_config={
|
||||
"max_io_workers": 4,
|
||||
"automatic_object_spilling_enabled": True,
|
||||
"object_store_full_delay_ms": 100,
|
||||
"object_spilling_config": json.dumps({
|
||||
"type": "filesystem",
|
||||
"params": {
|
||||
"directory_path": str(temp_folder)
|
||||
}
|
||||
}),
|
||||
"object_spilling_config": object_spilling_config,
|
||||
"min_spilling_size": 0,
|
||||
})
|
||||
|
||||
@@ -469,10 +461,10 @@ def test_delete_objects_on_worker_failure(tmp_path, shutdown_only):
|
||||
|
||||
@pytest.mark.skipif(
|
||||
platform.system() == "Windows", reason="Failing on Windows.")
|
||||
def test_delete_objects_multi_node(tmp_path, ray_start_cluster):
|
||||
def test_delete_objects_multi_node(multi_node_object_spilling_config,
|
||||
ray_start_cluster):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
temp_folder = tmp_path / "spill"
|
||||
temp_folder.mkdir()
|
||||
object_spilling_config, temp_folder = multi_node_object_spilling_config
|
||||
cluster = ray_start_cluster
|
||||
# Head node.
|
||||
cluster.add_node(
|
||||
@@ -483,12 +475,7 @@ def test_delete_objects_multi_node(tmp_path, ray_start_cluster):
|
||||
"min_spilling_size": 20 * 1024 * 1024,
|
||||
"automatic_object_spilling_enabled": True,
|
||||
"object_store_full_delay_ms": 100,
|
||||
"object_spilling_config": json.dumps({
|
||||
"type": "filesystem",
|
||||
"params": {
|
||||
"directory_path": str(temp_folder)
|
||||
}
|
||||
}),
|
||||
"object_spilling_config": object_spilling_config,
|
||||
})
|
||||
# Add 2 worker nodes.
|
||||
for _ in range(2):
|
||||
@@ -546,10 +533,9 @@ def test_delete_objects_multi_node(tmp_path, ray_start_cluster):
|
||||
|
||||
|
||||
@pytest.mark.skipif(platform.system() == "Windows", reason="Flaky on Windows.")
|
||||
def test_fusion_objects(tmp_path, shutdown_only):
|
||||
def test_fusion_objects(object_spilling_config, shutdown_only):
|
||||
# Limit our object store to 75 MiB of memory.
|
||||
temp_folder = tmp_path / "spill"
|
||||
temp_folder.mkdir()
|
||||
object_spilling_config, temp_folder = object_spilling_config
|
||||
min_spilling_size = 10 * 1024 * 1024
|
||||
ray.init(
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
@@ -557,12 +543,7 @@ def test_fusion_objects(tmp_path, shutdown_only):
|
||||
"max_io_workers": 3,
|
||||
"automatic_object_spilling_enabled": True,
|
||||
"object_store_full_delay_ms": 100,
|
||||
"object_spilling_config": json.dumps({
|
||||
"type": "filesystem",
|
||||
"params": {
|
||||
"directory_path": str(temp_folder)
|
||||
}
|
||||
}),
|
||||
"object_spilling_config": object_spilling_config,
|
||||
"min_spilling_size": min_spilling_size,
|
||||
})
|
||||
replay_buffer = []
|
||||
@@ -600,8 +581,8 @@ def test_fusion_objects(tmp_path, shutdown_only):
|
||||
|
||||
|
||||
# https://github.com/ray-project/ray/issues/12912
|
||||
def do_test_release_resource(tmp_path, expect_released):
|
||||
temp_folder = tmp_path / "spill"
|
||||
def do_test_release_resource(object_spilling_config, expect_released):
|
||||
object_spilling_config, temp_folder = object_spilling_config
|
||||
ray.init(
|
||||
num_cpus=1,
|
||||
object_store_memory=75 * 1024 * 1024,
|
||||
@@ -609,12 +590,7 @@ def do_test_release_resource(tmp_path, expect_released):
|
||||
"max_io_workers": 1,
|
||||
"release_resources_during_plasma_fetch": expect_released,
|
||||
"automatic_object_spilling_enabled": True,
|
||||
"object_spilling_config": json.dumps({
|
||||
"type": "filesystem",
|
||||
"params": {
|
||||
"directory_path": str(temp_folder)
|
||||
}
|
||||
}),
|
||||
"object_spilling_config": object_spilling_config,
|
||||
})
|
||||
plasma_obj = ray.put(np.ones(50 * 1024 * 1024, dtype=np.uint8))
|
||||
for _ in range(5):
|
||||
@@ -643,14 +619,14 @@ def do_test_release_resource(tmp_path, expect_released):
|
||||
|
||||
@pytest.mark.skipif(
|
||||
platform.system() == "Windows", reason="Failing on Windows.")
|
||||
def test_no_release_during_plasma_fetch(tmp_path, shutdown_only):
|
||||
do_test_release_resource(tmp_path, expect_released=False)
|
||||
def test_no_release_during_plasma_fetch(object_spilling_config, shutdown_only):
|
||||
do_test_release_resource(object_spilling_config, expect_released=False)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
platform.system() == "Windows", reason="Failing on Windows.")
|
||||
def test_release_during_plasma_fetch(tmp_path, shutdown_only):
|
||||
do_test_release_resource(tmp_path, expect_released=True)
|
||||
def test_release_during_plasma_fetch(object_spilling_config, shutdown_only):
|
||||
do_test_release_resource(object_spilling_config, expect_released=True)
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
@@ -661,6 +637,7 @@ def test_release_during_plasma_fetch(tmp_path, shutdown_only):
|
||||
@pytest.mark.timeout(30)
|
||||
def test_spill_objects_on_object_transfer(object_spilling_config,
|
||||
ray_start_cluster):
|
||||
object_spilling_config, _ = object_spilling_config
|
||||
# This test checks that objects get spilled to make room for transferred
|
||||
# objects.
|
||||
cluster = ray_start_cluster
|
||||
|
||||
Reference in New Issue
Block a user