Move plasma retry logic into plasma store provider (#7328)

2026-06-28 18:29:08 +08:00 · 2020-02-26 16:57:02 -08:00
parent aec03656d5
commit 2ad9bc5684
6 changed files with 67 additions and 63 deletions
@@ -93,10 +93,6 @@ from ray.exceptions import (
 )
 from ray.experimental.no_return import NoReturn
 from ray.utils import decode
-from ray.ray_constants import (
-    DEFAULT_PUT_OBJECT_DELAY,
-    DEFAULT_PUT_OBJECT_RETRIES,
-)

 cimport cpython

@@ -673,32 +669,17 @@ cdef class CoreWorker:
                            size_t data_size, ObjectID object_id,
                            c_vector[CObjectID] contained_ids,
                            CObjectID *c_object_id, shared_ptr[CBuffer] *data):
-        delay = ray_constants.DEFAULT_PUT_OBJECT_DELAY
-        for attempt in reversed(
-                range(ray_constants.DEFAULT_PUT_OBJECT_RETRIES)):
-            try:
-                if object_id is None:
-                    with nogil:
-                        check_status(self.core_worker.get().Create(
-                                     metadata, data_size, contained_ids,
-                                     c_object_id, data))
-                else:
-                    c_object_id[0] = object_id.native()
-                    with nogil:
-                        check_status(self.core_worker.get().Create(
-                                    metadata, data_size,
-                                    c_object_id[0], data))
-                break
-            except ObjectStoreFullError as e:
-                if attempt:
-                    logger.warning("Waiting {} seconds for space to free up "
-                                   "in the object store.".format(delay))
-                    gc.collect()
-                    time.sleep(delay)
-                    delay *= 2
-                else:
-                    self.dump_object_store_memory_usage()
-                    raise e
+        if object_id is None:
+            with nogil:
+                check_status(self.core_worker.get().Create(
+                             metadata, data_size, contained_ids,
+                             c_object_id, data))
+        else:
+            c_object_id[0] = object_id.native()
+            with nogil:
+                check_status(self.core_worker.get().Create(
+                            metadata, data_size,
+                            c_object_id[0], data))

        # If data is nullptr, that means the ObjectID already existed,
        # which we ignore.
@@ -22,12 +22,6 @@ ID_SIZE = 20
 # The default maximum number of bytes to allocate to the object store unless
 # overridden by the user.
 DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES = 20 * 10**9
-# The default number of retries to call `put` when the object store is full.
-DEFAULT_PUT_OBJECT_RETRIES = 5
-# The default seconds for delay between calls to retry `put` when
-# the object store is full. This delay is exponentially doubled up to
-# DEFAULT_PUT_OBJECT_RETRIES times.
-DEFAULT_PUT_OBJECT_DELAY = 1
 # The smallest cap on the memory used by the object store that we allow.
 # This must be greater than MEMORY_RESOURCE_UNIT_BYTES * 0.7
 OBJECT_STORE_MINIMUM_MEMORY_BYTES = 75 * 1024 * 1024
@@ -892,30 +892,33 @@ def test_connect_with_disconnected_node(shutdown_only):
@pytest.mark.parametrize(
    "ray_start_cluster_head", [{
        "num_cpus": 5,
-        "object_store_memory": 10**8
+        "object_store_memory": 10**8,
+        "_internal_config": json.dumps({
+            "object_store_full_max_retries": 0
+        })
    }],
    indirect=True)
-@pytest.mark.parametrize("num_actors", [1, 2, 5])
-def test_parallel_actor_fill_plasma_retry(ray_start_cluster_head, num_actors):
+def test_parallel_actor_fill_plasma_retry(ray_start_cluster_head):
    @ray.remote
    class LargeMemoryActor:
        def some_expensive_task(self):
            return np.zeros(10**8 // 2, dtype=np.uint8)

-    actors = [LargeMemoryActor.remote() for _ in range(num_actors)]
+    actors = [LargeMemoryActor.remote() for _ in range(5)]
    for _ in range(10):
        pending = [a.some_expensive_task.remote() for a in actors]
        while pending:
            [done], pending = ray.wait(pending, num_returns=1)


-@pytest.mark.parametrize(
-    "ray_start_cluster_head", [{
-        "num_cpus": 2,
-        "object_store_memory": 10**8
-    }],
-    indirect=True)
-def test_fill_object_store_exception(ray_start_cluster_head):
+def test_fill_object_store_exception(shutdown_only):
+    ray.init(
+        num_cpus=2,
+        object_store_memory=10**8,
+        _internal_config=json.dumps({
+            "object_store_full_max_retries": 0
+        }))
+
    @ray.remote
    def expensive_task():
        return np.zeros((10**8) // 10, dtype=np.uint8)
@@ -25,6 +25,7 @@ logger = logging.getLogger(__name__)
 def one_worker_100MiB(request):
    config = json.dumps({
        "distributed_ref_counting_enabled": 1,
+        "object_store_full_max_retries": 1,
    })
    yield ray.init(
        num_cpus=1,