[Object Spilling] Remove retries and use a timer instead. (#13175)

2026-07-04 23:20:54 +08:00 · 2021-01-19 11:01:45 -08:00
parent 86d5000047
commit 99375c4cfc
10 changed files with 88 additions and 75 deletions
@@ -210,7 +210,6 @@ class RayParams:
                raise Exception(
                    "Object pinning cannot be enabled if using LRU eviction.")
            self._system_config["object_pinning_enabled"] = False
-            self._system_config["object_store_full_max_retries"] = -1
            self._system_config["free_objects_period_milliseconds"] = 1000

        # Set the internal config options for object reconstruction.
@@ -22,7 +22,6 @@ def get_default_fixure_system_config():
    system_config = {
        "object_timeout_milliseconds": 200,
        "num_heartbeats_timeout": 10,
-        "object_store_full_max_retries": 3,
        "object_store_full_delay_ms": 100,
    }
    return system_config
@@ -1023,9 +1023,6 @@ def test_connect_with_disconnected_node(shutdown_only):
    "ray_start_cluster_head", [{
        "num_cpus": 5,
        "object_store_memory": 10**8,
-        "_system_config": {
-            "object_store_full_max_retries": 0
-        }
    }],
    indirect=True)
 def test_parallel_actor_fill_plasma_retry(ray_start_cluster_head):
@@ -1042,10 +1039,7 @@ def test_parallel_actor_fill_plasma_retry(ray_start_cluster_head):


 def test_fill_object_store_exception(shutdown_only):
-    ray.init(
-        num_cpus=2,
-        object_store_memory=10**8,
-        _system_config={"object_store_full_max_retries": 0})
+    ray.init(num_cpus=2, object_store_memory=10**8)

    @ray.remote
    def expensive_task():
@@ -84,7 +84,6 @@ def test_spilling_not_done_for_pinned_object(tmp_path, shutdown_only):
        _system_config={
            "max_io_workers": 4,
            "automatic_object_spilling_enabled": True,
-            "object_store_full_max_retries": 4,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": json.dumps({
                "type": "filesystem",
@@ -117,7 +116,6 @@ def test_spilling_not_done_for_pinned_object(tmp_path, shutdown_only):
        "object_store_memory": 75 * 1024 * 1024,
        "_system_config": {
            "automatic_object_spilling_enabled": True,
-            "object_store_full_max_retries": 4,
            "object_store_full_delay_ms": 100,
            "max_io_workers": 4,
            "object_spilling_config": json.dumps({
@@ -170,7 +168,6 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only):
        _system_config={
            "max_io_workers": 4,
            "automatic_object_spilling_enabled": True,
-            "object_store_full_max_retries": 4,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": object_spilling_config,
            "min_spilling_size": 0
@@ -251,10 +248,6 @@ def test_spill_during_get(object_spilling_config, shutdown_only):
        _system_config={
            "automatic_object_spilling_enabled": True,
            "object_store_full_delay_ms": 100,
-            # NOTE(swang): Use infinite retries because the OOM timer can still
-            # get accidentally triggered when objects are released too slowly
-            # (see github.com/ray-project/ray/issues/12040).
-            "object_store_full_max_retries": -1,
            "max_io_workers": 1,
            "object_spilling_config": object_spilling_config,
            "min_spilling_size": 0,
@@ -286,7 +279,6 @@ def test_spill_deadlock(object_spilling_config, shutdown_only):
        _system_config={
            "max_io_workers": 1,
            "automatic_object_spilling_enabled": True,
-            "object_store_full_max_retries": 4,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": object_spilling_config,
            "min_spilling_size": 0,
@@ -320,7 +312,6 @@ def test_delete_objects(tmp_path, shutdown_only):
            "max_io_workers": 1,
            "min_spilling_size": 0,
            "automatic_object_spilling_enabled": True,
-            "object_store_full_max_retries": 4,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": json.dumps({
                "type": "filesystem",
@@ -363,7 +354,6 @@ def test_delete_objects_delete_while_creating(tmp_path, shutdown_only):
            "max_io_workers": 4,
            "min_spilling_size": 0,
            "automatic_object_spilling_enabled": True,
-            "object_store_full_max_retries": 4,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": json.dumps({
                "type": "filesystem",
@@ -413,7 +403,6 @@ def test_delete_objects_on_worker_failure(tmp_path, shutdown_only):
        _system_config={
            "max_io_workers": 4,
            "automatic_object_spilling_enabled": True,
-            "object_store_full_max_retries": 4,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": json.dumps({
                "type": "filesystem",
@@ -489,7 +478,6 @@ def test_delete_objects_multi_node(tmp_path, ray_start_cluster):
            "max_io_workers": 2,
            "min_spilling_size": 20 * 1024 * 1024,
            "automatic_object_spilling_enabled": True,
-            "object_store_full_max_retries": 4,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": json.dumps({
                "type": "filesystem",
@@ -563,7 +551,6 @@ def test_fusion_objects(tmp_path, shutdown_only):
        _system_config={
            "max_io_workers": 3,
            "automatic_object_spilling_enabled": True,
-            "object_store_full_max_retries": 4,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": json.dumps({
                "type": "filesystem",
@@ -20,7 +20,6 @@ logger = logging.getLogger(__name__)
@pytest.fixture
 def one_worker_100MiB(request):
    config = {
-        "object_store_full_max_retries": 2,
        "task_retry_delay_ms": 0,
        "object_timeout_milliseconds": 1000,
    }
@@ -238,9 +238,6 @@ RAY_CONFIG(uint32_t, maximum_gcs_dead_node_cached_count, 1000)
 /// The interval at which the gcs server will print debug info.
 RAY_CONFIG(int64_t, gcs_dump_debug_log_interval_minutes, 1)

-/// Maximum number of times to retry putting an object when the plasma store is full.
-/// Can be set to -1 to enable unlimited retries.
-RAY_CONFIG(int32_t, object_store_full_max_retries, 1000)
 /// Duration to sleep after failing to put an object in plasma because it is full.
 RAY_CONFIG(uint32_t, object_store_full_delay_ms, 10)

@@ -361,6 +358,9 @@ RAY_CONFIG(int64_t, min_spilling_size, 100 * 1024 * 1024)
 /// TODO(sang): Fix it.
 RAY_CONFIG(bool, automatic_object_deletion_enabled, true)

+/// Grace period until we throw the OOM error to the application in seconds.
+RAY_CONFIG(int64_t, oom_grace_period_s, 10)
+
 /* Configuration parameters for locality-aware scheduling. */
 /// Whether to enable locality-aware leasing. If enabled, then Ray will consider task
 /// dependency locality when choosing a worker for leasing.
@@ -81,18 +81,16 @@ std::pair<PlasmaObject, PlasmaError> CreateRequestQueue::TryRequestImmediately(
 }

 bool CreateRequestQueue::ProcessRequest(std::unique_ptr<CreateRequest> &request) {
-  // Return an OOM error to the client if we have hit the maximum number of
-  // retries.
-  // TODO(sang): Delete this logic?
+  // TODO(sang): Delete this logic when lru evict is removed.
  bool evict_if_full = evict_if_full_;
-  if (max_retries_ == 0) {
-    // If we cannot retry, then always evict on the first attempt.
-    evict_if_full = true;
-  } else if (num_retries_ > 0) {
-    // Always try to evict after the first attempt.
+  if (oom_start_time_ns_ != -1) {
+    // If the first attempt fails, we set the evict_if_full true.
+    // We need this logic because if lru_evict flag is on, this is false because we
+    // shouldn't evict objects in the first attempt.
    evict_if_full = true;
  }
-  request->error = request->create_callback(evict_if_full, &request->result);
+  request->error =
+      request->create_callback(/*evict_if_full=*/evict_if_full, &request->result);
  return request->error != PlasmaError::OutOfMemory;
 }

@@ -100,21 +98,25 @@ Status CreateRequestQueue::ProcessRequests() {
  while (!queue_.empty()) {
    auto request_it = queue_.begin();
    auto create_ok = ProcessRequest(*request_it);
+    auto now = get_time_();
    if (create_ok) {
      FinishRequest(request_it);
+      // Reset the oom start time since the creation succeeds.
+      oom_start_time_ns_ = -1;
    } else {
      if (trigger_global_gc_) {
        trigger_global_gc_();
      }

+      if (oom_start_time_ns_ == -1) {
+        oom_start_time_ns_ = now;
+      }
      if (spill_objects_callback_()) {
        return Status::TransientObjectStoreFull("Waiting for spilling.");
-      } else if (num_retries_ < max_retries_ || max_retries_ == -1) {
+      } else if (now - oom_start_time_ns_ < oom_grace_period_ns_) {
        // We need a grace period since (1) global GC takes a bit of time to
        // kick in, and (2) there is a race between spilling finishing and space
        // actually freeing up in the object store.
-        // If max_retries == -1, we retry infinitely.
-        num_retries_ += 1;
        return Status::ObjectStoreFull("Waiting for grace period.");
      } else {
        // Raise OOM. In this case, the request will be marked as OOM.
@@ -135,9 +137,6 @@ void CreateRequestQueue::FinishRequest(
  RAY_CHECK(it->second == nullptr);
  it->second = std::move(request);
  queue_.erase(request_it);
-
-  // Reset the number of retries since we are no longer trying this request.
-  num_retries_ = 0;
 }

 void CreateRequestQueue::RemoveDisconnectedClientRequests(
@@ -34,15 +34,18 @@ class CreateRequestQueue {
  using CreateObjectCallback =
      std::function<PlasmaError(bool evict_if_full, PlasmaObject *result)>;

-  CreateRequestQueue(int32_t max_retries, bool evict_if_full,
+  CreateRequestQueue(bool evict_if_full, int64_t oom_grace_period_s,
                     ray::SpillObjectsCallback spill_objects_callback,
-                     std::function<void()> trigger_global_gc)
-      : max_retries_(max_retries),
-        evict_if_full_(evict_if_full),
+                     std::function<void()> trigger_global_gc,
+                     std::function<int64_t()> get_time)
+      : evict_if_full_(evict_if_full),
+        oom_grace_period_ns_(oom_grace_period_s * 1e9),
        spill_objects_callback_(spill_objects_callback),
-        trigger_global_gc_(trigger_global_gc) {
-    RAY_LOG(DEBUG) << "Starting plasma::CreateRequestQueue with " << max_retries_
-                   << " retries on OOM, evict if full? " << (evict_if_full_ ? 1 : 0);
+        trigger_global_gc_(trigger_global_gc),
+        get_time_(get_time) {
+    RAY_LOG(DEBUG) << "Starting plasma::CreateRequestQueue with OOM grace period "
+                   << oom_grace_period_ns_ << ", evict if full? "
+                   << (evict_if_full_ ? 1 : 0);
  }

  /// Add a request to the queue. The caller should use the returned request ID
@@ -148,17 +151,15 @@ class CreateRequestQueue {
  /// a request by retrying. Start at 1 because 0 means "do not retry".
  uint64_t next_req_id_ = 1;

-  /// The maximum number of times to retry each request upon OOM.
-  const int32_t max_retries_;
-
-  /// The number of times the request at the head of the queue has been tried.
-  int32_t num_retries_ = 0;
-
  /// On the first attempt to create an object, whether to evict from the
  /// object store to make space. If the first attempt fails, then we will
  /// always try to evict.
  const bool evict_if_full_;

+  /// Grace period until we throw the OOM error to the application.
+  /// -1 means grace period is infinite.
+  const int64_t oom_grace_period_ns_;
+
  /// A callback to trigger object spilling. It tries to spill objects upto max
  /// throughput. It returns true if space is made by object spilling, and false if
  /// there's no more space to be made.
@@ -168,6 +169,9 @@ class CreateRequestQueue {
  /// full.
  const std::function<void()> trigger_global_gc_;

+  /// A callback to return the current time.
+  const std::function<int64_t()> get_time_;
+
  /// Queue of object creation requests to respond to. Requests will be placed
  /// on this queue if the object store does not have enough room at the time
  /// that the client made the creation request, but space may be made through
@@ -189,6 +193,9 @@ class CreateRequestQueue {
  /// Last time global gc was invoked in ms.
  uint64_t last_global_gc_ms_;

+  /// The time OOM timer first starts. It becomes -1 upon every creation success.
+  int64_t oom_start_time_ns_ = -1;
+
  friend class CreateRequestQueueTest;
 };

@@ -125,9 +125,11 @@ PlasmaStore::PlasmaStore(boost::asio::io_service &main_service, std::string dire
      usage_log_interval_ns_(RayConfig::instance().object_store_usage_log_interval_s() *
                             1e9),
      create_request_queue_(
-          RayConfig::instance().object_store_full_max_retries(),
          /*evict_if_full=*/RayConfig::instance().object_pinning_enabled(),
-          spill_objects_callback, object_store_full_callback) {
+          /*oom_grace_period_s=*/RayConfig::instance().oom_grace_period_s(),
+          spill_objects_callback, object_store_full_callback,
+          /*get_time=*/
+          []() { return absl::GetCurrentTimeNanos(); }) {
  store_info_.directory = directory;
  store_info_.hugepages_enabled = hugepages_enabled;
 }
@@ -46,17 +46,24 @@ class MockClient : public ClientInterface {
 class CreateRequestQueueTest : public ::testing::Test {
 public:
  CreateRequestQueueTest()
-      : queue_(
-            /*max_retries=*/2,
+      : oom_grace_period_s_(1),
+        current_time_ns_(0),
+        queue_(
            /*evict_if_full=*/true,
+            /*oom_grace_period_s=*/oom_grace_period_s_,
            /*spill_object_callback=*/[&]() { return false; },
-            /*on_global_gc=*/[&]() { num_global_gc_++; }) {}
+            /*on_global_gc=*/[&]() { num_global_gc_++; },
+            /*get_time=*/[&]() { return current_time_ns_; }) {}

  void AssertNoLeaks() {
    ASSERT_TRUE(queue_.queue_.empty());
    ASSERT_TRUE(queue_.fulfilled_requests_.empty());
  }

+  void TearDown() { current_time_ns_ = 0; }
+
+  int64_t oom_grace_period_s_;
+  int64_t current_time_ns_;
  CreateRequestQueue queue_;
  int num_global_gc_ = 0;
 };
@@ -66,6 +73,8 @@ TEST_F(CreateRequestQueueTest, TestSimple) {
    result->data_size = 1234;
    return PlasmaError::OK;
  };
+  // Advance the clock without processing objects. This shouldn't have an impact.
+  current_time_ns_ += 10e9;
  auto client = std::make_shared<MockClient>();
  auto req_id = queue_.AddRequest(ObjectID::Nil(), client, request);
  ASSERT_REQUEST_UNFINISHED(queue_, req_id);
@@ -115,8 +124,9 @@ TEST_F(CreateRequestQueueTest, TestOom) {
  ASSERT_REQUEST_UNFINISHED(queue_, req_id2);
  ASSERT_EQ(num_global_gc_, 2);

-  // Retries used up. The first request should reply with OOM and the second
+  // Grace period is done. The first request should reply with OOM and the second
  // request should also be served.
+  current_time_ns_ += oom_grace_period_s_ * 2e9;
  ASSERT_TRUE(queue_.ProcessRequests().ok());
  ASSERT_EQ(num_global_gc_, 3);

@@ -129,12 +139,14 @@ TEST_F(CreateRequestQueueTest, TestOom) {

 TEST(CreateRequestQueueParameterTest, TestOomInfiniteRetry) {
  int num_global_gc_ = 0;
+  int64_t current_time_ns;
  CreateRequestQueue queue(
-      /*max_retries=*/-1,
      /*evict_if_full=*/true,
+      /*oom_grace_period_s=*/100,
      // Spilling is failing.
      /*spill_object_callback=*/[&]() { return false; },
-      /*on_global_gc=*/[&]() { num_global_gc_++; });
+      /*on_global_gc=*/[&]() { num_global_gc_++; },
+      /*get_time=*/[&]() { return current_time_ns; });

  auto oom_request = [&](bool evict_if_full, PlasmaObject *result) {
    return PlasmaError::OutOfMemory;
@@ -148,7 +160,9 @@ TEST(CreateRequestQueueParameterTest, TestOomInfiniteRetry) {
  auto req_id1 = queue.AddRequest(ObjectID::Nil(), client, oom_request);
  auto req_id2 = queue.AddRequest(ObjectID::Nil(), client, blocked_request);

-  for (int i = 0; i < 3; i++) {
+  for (int i = 0; i < 10; i++) {
+    // Advance 1 second.
+    current_time_ns += 1e9;
    ASSERT_TRUE(queue.ProcessRequests().IsObjectStoreFull());
    ASSERT_EQ(num_global_gc_, i + 1);
  }
@@ -160,10 +174,11 @@ TEST(CreateRequestQueueParameterTest, TestOomInfiniteRetry) {

 TEST_F(CreateRequestQueueTest, TestTransientOom) {
  CreateRequestQueue queue(
-      /*max_retries=*/2,
-      /*evict_if_full=*/false,
+      /*evict_if_full=*/true,
+      /*oom_grace_period_s=*/oom_grace_period_s_,
      /*spill_object_callback=*/[&]() { return true; },
-      /*on_global_gc=*/[&]() { num_global_gc_++; });
+      /*on_global_gc=*/[&]() { num_global_gc_++; },
+      /*get_time=*/[&]() { return current_time_ns_; });

  auto return_status = PlasmaError::OutOfMemory;
  auto oom_request = [&](bool evict_if_full, PlasmaObject *result) {
@@ -181,14 +196,18 @@ TEST_F(CreateRequestQueueTest, TestTransientOom) {
  auto req_id1 = queue.AddRequest(ObjectID::Nil(), client, oom_request);
  auto req_id2 = queue.AddRequest(ObjectID::Nil(), client, blocked_request);

-  // Transient OOM should not use up any retries.
-  for (int i = 0; i < 3; i++) {
+  // Transient OOM should happen until the grace period.
+  for (int i = 0; i < 9; i++) {
+    // Advance 0.1 seconds. OOM grace period is 1 second, so it should return transient
+    // error.
+    current_time_ns_ += 1e8;
    ASSERT_TRUE(queue.ProcessRequests().IsTransientObjectStoreFull());
    ASSERT_REQUEST_UNFINISHED(queue, req_id1);
    ASSERT_REQUEST_UNFINISHED(queue, req_id2);
    ASSERT_EQ(num_global_gc_, i + 1);
  }

+  current_time_ns_ += oom_grace_period_s_ * 2e9;
  // Return OK for the first request. The second request should also be served.
  return_status = PlasmaError::OK;
  ASSERT_TRUE(queue.ProcessRequests().ok());
@@ -201,10 +220,11 @@ TEST_F(CreateRequestQueueTest, TestTransientOom) {
 TEST_F(CreateRequestQueueTest, TestTransientOomThenOom) {
  bool is_spilling_possible = true;
  CreateRequestQueue queue(
-      /*max_retries=*/2,
-      /*evict_if_full=*/false,
+      /*evict_if_full=*/true,
+      /*oom_grace_period_s=*/oom_grace_period_s_,
      /*spill_object_callback=*/[&]() { return is_spilling_possible; },
-      /*on_global_gc=*/[&]() { num_global_gc_++; });
+      /*on_global_gc=*/[&]() { num_global_gc_++; },
+      /*get_time=*/[&]() { return current_time_ns_; });

  auto return_status = PlasmaError::OutOfMemory;
  auto oom_request = [&](bool evict_if_full, PlasmaObject *result) {
@@ -222,8 +242,10 @@ TEST_F(CreateRequestQueueTest, TestTransientOomThenOom) {
  auto req_id1 = queue.AddRequest(ObjectID::Nil(), client, oom_request);
  auto req_id2 = queue.AddRequest(ObjectID::Nil(), client, blocked_request);

-  // Transient OOM should not use up any retries.
+  // Transient OOM should not use up any until grace period is done.
  for (int i = 0; i < 3; i++) {
+    // Advance 0.1 seconds. OOM grace period is 1 second.
+    current_time_ns_ += 1e8;
    ASSERT_TRUE(queue.ProcessRequests().IsTransientObjectStoreFull());
    ASSERT_REQUEST_UNFINISHED(queue, req_id1);
    ASSERT_REQUEST_UNFINISHED(queue, req_id2);
@@ -238,8 +260,9 @@ TEST_F(CreateRequestQueueTest, TestTransientOomThenOom) {
  ASSERT_REQUEST_UNFINISHED(queue, req_id2);
  ASSERT_EQ(num_global_gc_, 5);

-  // Retries used up. The first request should reply with OOM and the second
+  // Grace period is done. The first request should reply with OOM and the second
  // request should also be served.
+  current_time_ns_ += oom_grace_period_s_ * 2e9;
  ASSERT_TRUE(queue.ProcessRequests().ok());
  ASSERT_REQUEST_FINISHED(queue, req_id1, PlasmaError::OutOfMemory);
  ASSERT_REQUEST_FINISHED(queue, req_id2, PlasmaError::OK);
@@ -261,13 +284,16 @@ TEST_F(CreateRequestQueueTest, TestEvictIfFull) {
 }

 TEST(CreateRequestQueueParameterTest, TestNoEvictIfFull) {
+  int64_t current_time_ns = 0;
  CreateRequestQueue queue(
-      /*max_retries=*/2,
      /*evict_if_full=*/false,
+      /*oom_grace_period_s=*/1,
      /*spill_object_callback=*/[&]() { return false; },
-      /*on_global_gc=*/[&]() {});
+      /*on_global_gc=*/[&]() {},
+      /*get_time=*/[&]() { return current_time_ns; });

  bool first_try = true;
+
  auto oom_request = [&](bool evict_if_full, PlasmaObject *result) {
    if (first_try) {
      RAY_CHECK(!evict_if_full);
@@ -281,6 +307,7 @@ TEST(CreateRequestQueueParameterTest, TestNoEvictIfFull) {
  auto client = std::make_shared<MockClient>();
  static_cast<void>(queue.AddRequest(ObjectID::Nil(), client, oom_request));
  ASSERT_TRUE(queue.ProcessRequests().IsObjectStoreFull());
+  current_time_ns += 1e8;
  ASSERT_TRUE(queue.ProcessRequests().IsObjectStoreFull());
 }