[Object Spilling] Multi node file spilling V2. (#13542)

* done.

* done.

* Fix a mistake.

* Ready.

* Fix issues.

* fix.

* Finished the first round of code review.

* formatting.

* In progress.

* Formatting.

* Addressed code review.

* Formatting

* Fix tests.

* fix bugs.

* Skip flaky tests for now.
This commit is contained in:
SangBin Cho
2021-01-23 23:15:32 -08:00
committed by GitHub
parent e675e5b75a
commit edbb2937d3
36 changed files with 573 additions and 249 deletions
+4
View File
@@ -345,6 +345,10 @@ def setup_external_storage(config):
elif storage_type == "smart_open":
_external_storage = ExternalStorageSmartOpenImpl(
**config["params"])
elif storage_type == "mock_distributed_fs":
# This storage is used to unit test distributed external storages.
# TODO(sang): Delete it after introducing the mock S3 test.
_external_storage = FileSystemStorage(**config["params"])
else:
raise ValueError(f"Unknown external storage type: {storage_type}")
else:
+3
View File
@@ -330,3 +330,6 @@ class RayParams:
# Validate external storage usage.
external_storage.setup_external_storage(object_spilling_config)
external_storage.reset_external_storage()
# Configure the proper system config.
self._system_config["is_external_storage_type_fs"] = (
object_spilling_config["type"] == "filesystem")
+1 -1
View File
@@ -53,7 +53,6 @@ py_test_module_list(
"test_multinode_failures_2.py",
"test_multiprocessing.py",
"test_object_manager.py",
"test_object_spilling.py",
"test_output.py",
"test_reconstruction.py",
"test_reference_counting.py",
@@ -134,6 +133,7 @@ py_test_module_list(
py_test_module_list(
files = [
"test_placement_group.py",
"test_object_spilling.py",
],
size = "large",
extra_srcs = SRCS,
+80 -103
View File
@@ -21,6 +21,15 @@ file_system_object_spilling_config = {
"directory_path": spill_local_path
}
}
# Since we have different protocols for a local external storage (e.g., fs)
# and distributed external storage (e.g., S3), we need to test both cases.
# This mocks the distributed fs with cluster utils.
mock_distributed_fs_object_spilling_config = {
"type": "mock_distributed_fs",
"params": {
"directory_path": spill_local_path
}
}
smart_open_object_spilling_config = {
"type": "smart_open",
"params": {
@@ -29,6 +38,15 @@ smart_open_object_spilling_config = {
}
def create_object_spilling_config(request, tmp_path):
    """Build the object spilling config for a parametrized test.

    For the "filesystem" and "mock_distributed_fs" storage types a fresh
    "spill" directory is created under ``tmp_path`` and used as the
    storage's ``directory_path``. Other storage types are serialized as-is.

    Args:
        request: pytest fixture request; ``request.param`` is the spilling
            config dict with a "type" key and, for the file-based types,
            a "params" dict.
        tmp_path: pathlib-style directory under which "spill" is created.

    Returns:
        A ``(config_json, temp_folder)`` tuple. ``temp_folder`` is the
        directory that was created, or None when the storage type does not
        use a local directory.
    """
    config = request.param
    # Default to None so that storage types without a local directory do not
    # hit an UnboundLocalError on the return statement below.
    temp_folder = None
    if config["type"] in ("filesystem", "mock_distributed_fs"):
        temp_folder = tmp_path / "spill"
        temp_folder.mkdir()
        # Build a copy rather than writing into request.param: the param is
        # a module-level dict shared by every test using this fixture, and
        # mutating it would leak this test's tmp path into later users.
        config = dict(
            config,
            params=dict(config["params"], directory_path=str(temp_folder)))
    return json.dumps(config), temp_folder
@pytest.fixture(
scope="function",
params=[
@@ -36,10 +54,18 @@ smart_open_object_spilling_config = {
# TODO(sang): Add a mock dependency to test S3.
# smart_open_object_spilling_config,
])
def object_spilling_config(request, tmpdir):
if request.param["type"] == "filesystem":
request.param["params"]["directory_path"] = str(tmpdir)
yield json.dumps(request.param)
def object_spilling_config(request, tmp_path):
yield create_object_spilling_config(request, tmp_path)
# Parametrized fixture for the multi-node spilling tests. Each test runs once
# with the local filesystem storage config and once with the mock distributed
# filesystem config, and receives whatever create_object_spilling_config
# returns for that config (a JSON config string paired with the spill folder).
@pytest.fixture(
scope="function",
params=[
file_system_object_spilling_config,
mock_distributed_fs_object_spilling_config
])
def multi_node_object_spilling_config(request, tmp_path):
yield create_object_spilling_config(request, tmp_path)
def test_invalid_config_raises_exception(shutdown_only):
@@ -75,22 +101,17 @@ def test_url_generation_and_parse():
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
def test_spilling_not_done_for_pinned_object(tmp_path, shutdown_only):
def test_spilling_not_done_for_pinned_object(object_spilling_config,
shutdown_only):
# Limit our object store to 75 MiB of memory.
temp_folder = tmp_path / "spill"
temp_folder.mkdir()
object_spilling_config, temp_folder = object_spilling_config
ray.init(
object_store_memory=75 * 1024 * 1024,
_system_config={
"max_io_workers": 4,
"automatic_object_spilling_enabled": True,
"object_store_full_delay_ms": 100,
"object_spilling_config": json.dumps({
"type": "filesystem",
"params": {
"directory_path": str(temp_folder)
}
}),
"object_spilling_config": object_spilling_config,
"min_spilling_size": 0,
})
arr = np.random.rand(5 * 1024 * 1024) # 40 MB
@@ -110,27 +131,23 @@ def test_spilling_not_done_for_pinned_object(tmp_path, shutdown_only):
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
@pytest.mark.parametrize(
"ray_start_cluster_head", [{
"num_cpus": 0,
"object_store_memory": 75 * 1024 * 1024,
"_system_config": {
def test_spill_remote_object(ray_start_cluster,
multi_node_object_spilling_config):
cluster = ray_start_cluster
object_spilling_config, _ = multi_node_object_spilling_config
cluster.add_node(
num_cpus=0,
object_store_memory=75 * 1024 * 1024,
_system_config={
"automatic_object_spilling_enabled": True,
"object_store_full_delay_ms": 100,
"max_io_workers": 4,
"object_spilling_config": json.dumps({
"type": "filesystem",
"params": {
"directory_path": "/tmp"
}
}),
"object_spilling_config": object_spilling_config,
"min_spilling_size": 0,
},
}],
indirect=True)
def test_spill_remote_object(ray_start_cluster_head):
cluster = ray_start_cluster_head
})
ray.init(address=cluster.address)
cluster.add_node(object_store_memory=75 * 1024 * 1024)
cluster.wait_for_nodes()
@ray.remote
def put():
@@ -162,6 +179,7 @@ def test_spill_remote_object(ray_start_cluster_head):
platform.system() == "Windows", reason="Failing on Windows.")
def test_spill_objects_automatically(object_spilling_config, shutdown_only):
# Limit our object store to 75 MiB of memory.
object_spilling_config, _ = object_spilling_config
ray.init(
num_cpus=1,
object_store_memory=75 * 1024 * 1024,
@@ -197,10 +215,9 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only):
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
def test_spill_stats(tmp_path, shutdown_only):
def test_spill_stats(object_spilling_config, shutdown_only):
# Limit our object store to 75 MiB of memory.
temp_folder = tmp_path / "spill"
temp_folder.mkdir()
object_spilling_config, _ = object_spilling_config
ray.init(
num_cpus=1,
object_store_memory=100 * 1024 * 1024,
@@ -208,14 +225,7 @@ def test_spill_stats(tmp_path, shutdown_only):
"automatic_object_spilling_enabled": True,
"max_io_workers": 100,
"min_spilling_size": 1,
"object_spilling_config": json.dumps(
{
"type": "filesystem",
"params": {
"directory_path": str(temp_folder)
}
},
separators=(",", ":"))
"object_spilling_config": object_spilling_config
},
)
@@ -242,6 +252,7 @@ def test_spill_stats(tmp_path, shutdown_only):
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
def test_spill_during_get(object_spilling_config, shutdown_only):
object_spilling_config, _ = object_spilling_config
ray.init(
num_cpus=4,
object_store_memory=100 * 1024 * 1024,
@@ -273,6 +284,7 @@ def test_spill_during_get(object_spilling_config, shutdown_only):
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
def test_spill_deadlock(object_spilling_config, shutdown_only):
object_spilling_config, _ = object_spilling_config
# Limit our object store to 75 MiB of memory.
ray.init(
object_store_memory=75 * 1024 * 1024,
@@ -302,10 +314,9 @@ def test_spill_deadlock(object_spilling_config, shutdown_only):
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
def test_delete_objects(tmp_path, shutdown_only):
def test_delete_objects(object_spilling_config, shutdown_only):
# Limit our object store to 75 MiB of memory.
temp_folder = tmp_path / "spill"
temp_folder.mkdir()
object_spilling_config, temp_folder = object_spilling_config
ray.init(
object_store_memory=75 * 1024 * 1024,
_system_config={
@@ -313,12 +324,7 @@ def test_delete_objects(tmp_path, shutdown_only):
"min_spilling_size": 0,
"automatic_object_spilling_enabled": True,
"object_store_full_delay_ms": 100,
"object_spilling_config": json.dumps({
"type": "filesystem",
"params": {
"directory_path": str(temp_folder)
}
}),
"object_spilling_config": object_spilling_config,
})
arr = np.random.rand(1024 * 1024) # 8 MB data
replay_buffer = []
@@ -343,13 +349,11 @@ def test_delete_objects(tmp_path, shutdown_only):
@pytest.mark.skipif(
platform.system() in ["Windows", "Darwin"],
reason="Failing on "
"Windows and Mac.")
def test_delete_objects_delete_while_creating(tmp_path, shutdown_only):
platform.system() in ["Windows", "Darwin"], reason="Failing on Windows.")
def test_delete_objects_delete_while_creating(object_spilling_config,
shutdown_only):
# Limit our object store to 75 MiB of memory.
temp_folder = tmp_path / "spill"
temp_folder.mkdir()
object_spilling_config, temp_folder = object_spilling_config
ray.init(
object_store_memory=75 * 1024 * 1024,
_system_config={
@@ -357,12 +361,7 @@ def test_delete_objects_delete_while_creating(tmp_path, shutdown_only):
"min_spilling_size": 0,
"automatic_object_spilling_enabled": True,
"object_store_full_delay_ms": 100,
"object_spilling_config": json.dumps({
"type": "filesystem",
"params": {
"directory_path": str(temp_folder)
}
}),
"object_spilling_config": object_spilling_config,
})
arr = np.random.rand(1024 * 1024) # 8 MB data
replay_buffer = []
@@ -395,25 +394,18 @@ def test_delete_objects_delete_while_creating(tmp_path, shutdown_only):
@pytest.mark.skipif(
platform.system() in ["Windows", "Darwin"],
reason="Failing on Windows "
"and Mac.")
def test_delete_objects_on_worker_failure(tmp_path, shutdown_only):
platform.system() in ["Windows", "Darwin"], reason="Failing on Windows.")
def test_delete_objects_on_worker_failure(object_spilling_config,
shutdown_only):
# Limit our object store to 75 MiB of memory.
temp_folder = tmp_path / "spill"
temp_folder.mkdir()
object_spilling_config, temp_folder = object_spilling_config
ray.init(
object_store_memory=75 * 1024 * 1024,
_system_config={
"max_io_workers": 4,
"automatic_object_spilling_enabled": True,
"object_store_full_delay_ms": 100,
"object_spilling_config": json.dumps({
"type": "filesystem",
"params": {
"directory_path": str(temp_folder)
}
}),
"object_spilling_config": object_spilling_config,
"min_spilling_size": 0,
})
@@ -469,10 +461,10 @@ def test_delete_objects_on_worker_failure(tmp_path, shutdown_only):
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
def test_delete_objects_multi_node(tmp_path, ray_start_cluster):
def test_delete_objects_multi_node(multi_node_object_spilling_config,
ray_start_cluster):
# Limit our object store to 75 MiB of memory.
temp_folder = tmp_path / "spill"
temp_folder.mkdir()
object_spilling_config, temp_folder = multi_node_object_spilling_config
cluster = ray_start_cluster
# Head node.
cluster.add_node(
@@ -483,12 +475,7 @@ def test_delete_objects_multi_node(tmp_path, ray_start_cluster):
"min_spilling_size": 20 * 1024 * 1024,
"automatic_object_spilling_enabled": True,
"object_store_full_delay_ms": 100,
"object_spilling_config": json.dumps({
"type": "filesystem",
"params": {
"directory_path": str(temp_folder)
}
}),
"object_spilling_config": object_spilling_config,
})
# Add 2 worker nodes.
for _ in range(2):
@@ -546,10 +533,9 @@ def test_delete_objects_multi_node(tmp_path, ray_start_cluster):
@pytest.mark.skipif(platform.system() == "Windows", reason="Flaky on Windows.")
def test_fusion_objects(tmp_path, shutdown_only):
def test_fusion_objects(object_spilling_config, shutdown_only):
# Limit our object store to 75 MiB of memory.
temp_folder = tmp_path / "spill"
temp_folder.mkdir()
object_spilling_config, temp_folder = object_spilling_config
min_spilling_size = 10 * 1024 * 1024
ray.init(
object_store_memory=75 * 1024 * 1024,
@@ -557,12 +543,7 @@ def test_fusion_objects(tmp_path, shutdown_only):
"max_io_workers": 3,
"automatic_object_spilling_enabled": True,
"object_store_full_delay_ms": 100,
"object_spilling_config": json.dumps({
"type": "filesystem",
"params": {
"directory_path": str(temp_folder)
}
}),
"object_spilling_config": object_spilling_config,
"min_spilling_size": min_spilling_size,
})
replay_buffer = []
@@ -600,8 +581,8 @@ def test_fusion_objects(tmp_path, shutdown_only):
# https://github.com/ray-project/ray/issues/12912
def do_test_release_resource(tmp_path, expect_released):
temp_folder = tmp_path / "spill"
def do_test_release_resource(object_spilling_config, expect_released):
object_spilling_config, temp_folder = object_spilling_config
ray.init(
num_cpus=1,
object_store_memory=75 * 1024 * 1024,
@@ -609,12 +590,7 @@ def do_test_release_resource(tmp_path, expect_released):
"max_io_workers": 1,
"release_resources_during_plasma_fetch": expect_released,
"automatic_object_spilling_enabled": True,
"object_spilling_config": json.dumps({
"type": "filesystem",
"params": {
"directory_path": str(temp_folder)
}
}),
"object_spilling_config": object_spilling_config,
})
plasma_obj = ray.put(np.ones(50 * 1024 * 1024, dtype=np.uint8))
for _ in range(5):
@@ -643,14 +619,14 @@ def do_test_release_resource(tmp_path, expect_released):
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
def test_no_release_during_plasma_fetch(tmp_path, shutdown_only):
do_test_release_resource(tmp_path, expect_released=False)
def test_no_release_during_plasma_fetch(object_spilling_config, shutdown_only):
do_test_release_resource(object_spilling_config, expect_released=False)
@pytest.mark.skipif(
platform.system() == "Windows", reason="Failing on Windows.")
def test_release_during_plasma_fetch(tmp_path, shutdown_only):
do_test_release_resource(tmp_path, expect_released=True)
def test_release_during_plasma_fetch(object_spilling_config, shutdown_only):
do_test_release_resource(object_spilling_config, expect_released=True)
@pytest.mark.skip(
@@ -661,6 +637,7 @@ def test_release_during_plasma_fetch(tmp_path, shutdown_only):
@pytest.mark.timeout(30)
def test_spill_objects_on_object_transfer(object_spilling_config,
ray_start_cluster):
object_spilling_config, _ = object_spilling_config
# This test checks that objects get spilled to make room for transferred
# objects.
cluster = ray_start_cluster