Warn the user when a nondeterministic task is detected. (#339)

* WARN instead of FATAL for object hash mismatches, push error to driver
* Document the callback signature for object_table_add/remove
* Error table
* Wait for all errors in Python test
* Fix doc
* Fix state test
2026-06-27 22:38:16 +08:00 · 2017-03-07 00:32:15 -08:00
parent 0b8d279ef2
commit da06b4db82
15 changed files with 303 additions and 78 deletions
@@ -65,14 +65,22 @@ class TestGlobalStateStore(unittest.TestCase):
    # Check that Redis returns an error when RAY.OBJECT_TABLE_ADD adds an object
    # ID that is already present with a different hash.
    self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash1", "manager_id1")
+    response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP", "object_id1")
+    self.assertEqual(set(response), {b"manager_id1"})
    with self.assertRaises(redis.ResponseError):
-      self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id1")
+      self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id2")
+    # Check that the second manager was added, even though the hash was
+    # mismatched.
+    response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP", "object_id1")
+    self.assertEqual(set(response), {b"manager_id1", b"manager_id2"})
    # Check that it is fine if we add the same object ID multiple times with the
-    # same hash.
-    self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash1", "manager_id1")
-    self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash1", "manager_id1")
-    self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash1", "manager_id2")
-    self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 2, "hash1", "manager_id2")
+    # most recent hash.
+    self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id1")
+    self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id1")
+    self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id2")
+    self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 2, "hash2", "manager_id2")
+    response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP", "object_id1")
+    self.assertEqual(set(response), {b"manager_id1", b"manager_id2"})

  def testObjectTableAddAndLookup(self):
    # Try calling RAY.OBJECT_TABLE_LOOKUP with an object ID that has not been
@@ -721,50 +721,6 @@ class TestPlasmaManager(unittest.TestCase):
      assert_get_object_equal(self, self.client1, self.client2, object_id2,
                              memory_buffer=memory_buffer2, metadata=metadata2)

-  def test_illegal_put(self):
-    """
-    Test doing a put at the same object ID, but with different object data. The
-    first put should succeed. The second put should cause the plasma manager to
-    exit with a fatal error.
-    """
-    if USE_VALGRIND:
-      # Don't run this test when we are using valgrind because when processes
-      # die without freeing up their state, valgrind complains.
-      return
-    # Create and seal the first object.
-    length = 1000
-    object_id = random_object_id()
-    memory_buffer1 = self.client1.create(object_id, length)
-    for i in range(length):
-      memory_buffer1[i] = chr(i % 256)
-    self.client1.seal(object_id)
-    # Create and seal the second object. It has all the same data as the first
-    # object, with one bit flipped.
-    memory_buffer2 = self.client2.create(object_id, length)
-    for i in range(length):
-      j = i
-      if j == 0:
-        j += 1
-      memory_buffer2[i] = chr(j % 256)
-    self.client2.seal(object_id)
-    # Make sure that one of the plasma managers exited (the second one to call
-    # RAY.OBJECT_TABLE_ADD should have exited). In the vast majority of cases,
-    # this should be p5. However, on Travis, it is frequently p4.
-    time_left = 100
-    while time_left > 0:
-      if self.p5.poll() != None:
-        self.processes_to_kill.remove(self.p5)
-        break
-      if self.p4.poll() != None:
-        self.processes_to_kill.remove(self.p4)
-        break
-      time_left -= 0.1
-      time.sleep(0.1)
-
-    print("Time waiting for plasma manager to fail = {:.2}".format(100 - time_left))
-    # Check that exactly one of the plasma managers has died.
-    self.assertEqual([self.p5.poll(), self.p4.poll()].count(None), 1)
-
  def test_illegal_functionality(self):
    # Create an object id string.
    object_id = random_object_id()
@@ -49,6 +49,10 @@ NIL_ACTOR_ID = 20 * b"\xff"
 # fetch the object again.
 GET_TIMEOUT_MILLISECONDS = 1000

+# This must be kept in sync with the `error_types` array in
+# common/state/error_table.h.
+OBJECT_HASH_MISMATCH_ERROR_TYPE = b"object_hash_mismatch"
+
 def random_string():
  return np.random.bytes(20)

@@ -677,6 +681,12 @@ def error_info(worker=global_worker):
  for error_key in error_keys:
    if error_applies_to_driver(error_key, worker=worker):
      error_contents = worker.redis_client.hgetall(error_key)
+      # If the error is an object hash mismatch, look up the function name for
+      # the nondeterministic task.
+      if error_contents[b"type"] == OBJECT_HASH_MISMATCH_ERROR_TYPE:
+        function_id = error_contents[b"data"]
+        function_name = worker.redis_client.hget("RemoteFunction:{}".format(function_id), "name")
+        error_contents[b"data"] = function_name
      errors.append(error_contents)

  return errors