[Object spilling] Update object directory and reload spilled objects automatically (#11021)

* Fix pytest... * Release objects that have been spilled * GCS object table interface refactor * Add spilled URL to object location info * refactor to include spilled URL in notifications * improve tests * Add spilled URL to object directory results * Remove force restore call * Merge spilled URL and location * fix * CI * build * osx * Fix multitenancy issues * Skip windows tests
2026-07-04 06:44:03 +08:00 · 2020-10-02 15:52:42 -07:00
parent c17169dc11
commit ada58abcd9
43 changed files with 499 additions and 430 deletions
@@ -1,5 +1,7 @@
 import json
 import random
+import platform
+import sys
 import time

 import numpy as np
@@ -8,6 +10,8 @@ import psutil
 import ray


+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="Failing on Windows.")
 def test_spill_objects_manually(shutdown_only):
    # Limit our object store to 75 MiB of memory.
    ray.init(
@@ -25,7 +29,6 @@ def test_spill_objects_manually(shutdown_only):
    arr = np.random.rand(1024 * 1024)  # 8 MB data
    replay_buffer = []
    pinned_objects = set()
-    spilled_objects = set()

    # Create objects of more than 200 MiB.
    for _ in range(25):
@@ -38,7 +41,6 @@ def test_spill_objects_manually(shutdown_only):
            except ray.exceptions.ObjectStoreFullError:
                ref_to_spill = pinned_objects.pop()
                ray.experimental.force_spill_objects([ref_to_spill])
-                spilled_objects.add(ref_to_spill)

    def is_worker(cmdline):
        return cmdline and cmdline[0].startswith("ray::")
@@ -54,17 +56,16 @@ def test_spill_objects_manually(shutdown_only):
    # restoring objects back.
    refs_to_spill = (pinned_objects.pop(), pinned_objects.pop())
    ray.experimental.force_spill_objects(refs_to_spill)
-    spilled_objects.update(refs_to_spill)

    # randomly sample objects
    for _ in range(100):
        ref = random.choice(replay_buffer)
-        if ref in spilled_objects:
-            ray.experimental.force_restore_spilled_objects([ref])
        sample = ray.get(ref)
        assert np.array_equal(sample, arr)


+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="Failing on Windows.")
 def test_spill_objects_manually_from_workers(shutdown_only):
    # Limit our object store to 100 MiB of memory.
    ray.init(
@@ -82,15 +83,22 @@ def test_spill_objects_manually_from_workers(shutdown_only):

    @ray.remote
    def _worker():
-        arr = np.random.rand(100 * 1024)
+        arr = np.random.rand(1024 * 1024)  # 8 MB data
        ref = ray.put(arr)
        ray.experimental.force_spill_objects([ref])
-        ray.experimental.force_restore_spilled_objects([ref])
-        assert np.array_equal(ray.get(ref), arr)
+        return ref

-    ray.get([_worker.remote() for _ in range(50)])
+    # Create objects of more than 200 MiB.
+    replay_buffer = [ray.get(_worker.remote()) for _ in range(25)]
+    values = {ref: np.copy(ray.get(ref)) for ref in replay_buffer}
+    # Randomly sample objects.
+    for _ in range(100):
+        ref = random.choice(replay_buffer)
+        sample = ray.get(ref)
+        assert np.array_equal(sample, values[ref])


+@pytest.mark.skip(reason="Not implemented yet.")
 def test_spill_objects_manually_with_workers(shutdown_only):
    # Limit our object store to 75 MiB of memory.
    ray.init(
@@ -118,27 +126,29 @@ def test_spill_objects_manually_with_workers(shutdown_only):
        assert np.array_equal(restored, arr)


+@pytest.mark.skipif(
+    platform.system() == "Windows", reason="Failing on Windows.")
@pytest.mark.parametrize(
    "ray_start_cluster_head", [{
        "num_cpus": 0,
        "object_store_memory": 75 * 1024 * 1024,
-        "_object_spilling_config": {
+        "object_spilling_config": {
            "type": "filesystem",
            "params": {
                "directory_path": "/tmp"
            }
        },
-        "_system_config": json.dumps({
+        "_system_config": {
            "object_store_full_max_retries": 0,
            "max_io_workers": 4,
-        }),
+        },
    }],
    indirect=True)
 def test_spill_remote_object(ray_start_cluster_head):
    cluster = ray_start_cluster_head
    cluster.add_node(
        object_store_memory=75 * 1024 * 1024,
-        _object_spilling_config={
+        object_spilling_config={
            "type": "filesystem",
            "params": {
                "directory_path": "/tmp"
@@ -149,23 +159,33 @@ def test_spill_remote_object(ray_start_cluster_head):
    def put():
        return np.random.rand(5 * 1024 * 1024)  # 40 MB data

-    # Create 2 objects. Only 1 should fit.
+    @ray.remote
+    def depends(arg):
+        return
+
    ref = put.remote()
-    ray.get(ref)
+    copy = np.copy(ray.get(ref))
+    # Evict local copy.
+    ray.put(np.random.rand(5 * 1024 * 1024))  # 40 MB data
+    # Remote copy should not fit.
    with pytest.raises(ray.exceptions.RayTaskError):
        ray.get(put.remote())
-    time.sleep(1)
    # Spill 1 object. The second should now fit.
    ray.experimental.force_spill_objects([ref])
    ray.get(put.remote())

-    # TODO(swang): Restoring from the object directory is not yet supported.
-    # ray.experimental.force_restore_spilled_objects([ref])
-    # sample = ray.get(ref)
-    # assert np.array_equal(sample, copy)
+    sample = ray.get(ref)
+    assert np.array_equal(sample, copy)
+    # Evict the spilled object.
+    del sample
+    ray.get(put.remote())
+    ray.put(np.random.rand(5 * 1024 * 1024))  # 40 MB data
+
+    # Test passing the spilled object as an arg to another task.
+    ray.get(depends.remote(ref))


-@pytest.mark.skip(reason="have not been fully implemented")
+@pytest.mark.skip(reason="Not implemented yet.")
 def test_spill_objects_automatically(shutdown_only):
    # Limit our object store to 75 MiB of memory.
    ray.init(
@@ -196,3 +216,7 @@ def test_spill_objects_automatically(shutdown_only):
        ref = random.choice(replay_buffer)
        sample = ray.get(ref, timeout=0)
        assert np.array_equal(sample, arr)
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main(["-sv", __file__]))