[core] Move out-of-memory handling into the plasma store and support async object creation (#12186)

* Refactor to extract creation request queue
* timer on oom
* move timer out
* Move evict_if_full and on_store_full into plasma store
* Remove client-side code
* revert
* Distinguish between transient and permanent OOM delays
* update
* Move out create request queue, unit test
* unit test
* Fix max retries
* test
* Do not pin restored objects
* First pass to add polling requests, unit test passes
* worker plasma client retries plasma requests
* cleanup
* Clean up after disconnected clients, check memory leaks
* Support immediate requests in request queue
* Option to try creating immediately
* lint
* Fix build, address comments
* doc
* fixes
* debug travis
* debug
* debug
* debug
* debug
* Revert "debug"
  This reverts commit 6bf2f6ee5640e71630c4aecdb7ebf54911ea32db.
  Revert "debug"
  This reverts commit 73017099c9b06cdaae1217bf0e0f4d23ed68a9e5.
  Revert "debug"
  This reverts commit 5a155529e28cee9461a598b0cdf7b6a3cc194c93.
  Revert "debug"
  This reverts commit b50c2101afd45d4cf663daae857bfe1b40387703.
  Revert "debug travis"
  This reverts commit 012b8721dedf9bca46294ae75eee2815b160368b.
* Skip if new scheduler enabled
* error message
* merge
2026-07-03 20:22:39 +08:00 · 2020-12-02 13:25:54 -05:00
parent 786f839ff3
commit 443339ab19
32 changed files with 1010 additions and 278 deletions
@@ -348,7 +348,8 @@ def test_system_config_when_connecting(ray_start_cluster):
    obj_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))

    for _ in range(5):
-        ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
+        put_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
+    del put_ref

    # This would not raise an exception if object pinning was enabled.
    with pytest.raises(ray.exceptions.ObjectLostError):
@@ -7,6 +7,7 @@ import warnings

 import ray
 from ray.cluster_utils import Cluster
+from ray.exceptions import GetTimeoutError

 if (multiprocessing.cpu_count() < 40
        or ray.utils.get_system_memory() < 50 * 10**9):
@@ -33,6 +34,29 @@ def ray_start_cluster_with_resource():
    cluster.shutdown()


+@pytest.mark.parametrize(
+    "ray_start_cluster_head", [{
+        "num_cpus": 0,
+        "object_store_memory": 75 * 1024 * 1024,
+    }],
+    indirect=True)
+def test_object_transfer_during_oom(ray_start_cluster_head):
+    cluster = ray_start_cluster_head
+    cluster.add_node(object_store_memory=75 * 1024 * 1024)
+
+    @ray.remote
+    def put():
+        return np.random.rand(5 * 1024 * 1024)  # 40 MB data
+
+    local_ref = ray.put(np.random.rand(5 * 1024 * 1024))
+    remote_ref = put.remote()
+
+    with pytest.raises(GetTimeoutError):
+        ray.get(remote_ref, timeout=1)
+    del local_ref
+    ray.get(remote_ref)
+
+
 # This test is here to make sure that when we broadcast an object to a bunch of
 # machines, we don't have too many excess object transfers.
@pytest.mark.skip(reason="TODO(ekl)")
@@ -12,7 +12,7 @@ import psutil
 import ray
 from ray.external_storage import (create_url_with_offset,
                                  parse_url_with_offset)
-from ray.test_utils import wait_for_condition
+from ray.test_utils import new_scheduler_enabled, wait_for_condition

 bucket_name = "object-spilling-test"
 spill_local_path = "/tmp/spill"
@@ -338,16 +338,19 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only):

@pytest.mark.skipif(
    platform.system() == "Windows", reason="Failing on Windows.")
-@pytest.mark.skip(
-    "Temporarily disabled until OutOfMemory retries can be moved "
-    "into the plasma store")
+@pytest.mark.skipif(new_scheduler_enabled(), reason="hangs")
 def test_spill_during_get(object_spilling_config, shutdown_only):
    ray.init(
        num_cpus=4,
        object_store_memory=100 * 1024 * 1024,
        _system_config={
            "automatic_object_spilling_enabled": True,
-            "max_io_workers": 2,
+            "object_store_full_initial_delay_ms": 100,
+            # NOTE(swang): Use infinite retries because the OOM timer can still
+            # get accidentally triggered when objects are released too slowly
+            # (see github.com/ray-project/ray/issues/12040).
+            "object_store_full_max_retries": -1,
+            "max_io_workers": 1,
            "object_spilling_config": object_spilling_config,
            "min_spilling_size": 0,
        },
@@ -10,8 +10,8 @@ import pytest

 import ray
 import ray.cluster_utils
-from ray.test_utils import SignalActor, put_object, wait_for_condition, \
-    new_scheduler_enabled
+from ray.test_utils import (SignalActor, put_object, wait_for_condition,
+                            new_scheduler_enabled)

 logger = logging.getLogger(__name__)

@@ -167,7 +167,7 @@ def test_dependency_refcounts(ray_start_regular):
    check_refcounts({})


-@pytest.mark.skipif(new_scheduler_enabled(), reason="dynres notimpl")
+@pytest.mark.skipif(new_scheduler_enabled(), reason="hangs")
 def test_actor_creation_task(ray_start_regular):
    @ray.remote
    def large_object():
@@ -269,7 +269,16 @@ def test_feature_flag(shutdown_only):

    # With only LRU eviction, the object that was ray.put by the actor
    # should be evicted once the object store fills up.
-    _fill_object_store_and_get(actor.get_large_object.remote(), succeed=False)
+    ref = actor.get_large_object.remote()
+    ray.get(ref)
+
+    # Bind each ref to a name so that it is not GCed immediately after the
+    # put; rebinding `put_ref` releases the previous ref on the next iteration.
+    for _ in range(5):
+        put_ref = ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))
+    del put_ref
+
+    wait_for_condition(
+        lambda: not ray.worker.global_worker.core_worker.object_exists(ref))


 def test_out_of_band_serialized_object_ref(one_worker_100MiB):