mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 22:38:16 +08:00
Warn the user when a nondeterministic task is detected. (#339)
* WARN instead of FATAL for object hash mismatches, push error to driver
* Document the callback signature for object_table_add/remove
* Error table
* Wait for all errors in python test
* Fix doc
* Fix state test
This commit is contained in:
committed by
Robert Nishihara
parent
0b8d279ef2
commit
da06b4db82
@@ -65,14 +65,22 @@ class TestGlobalStateStore(unittest.TestCase):
|
||||
# Check that Redis returns an error when RAY.OBJECT_TABLE_ADD adds an object
|
||||
# ID that is already present with a different hash.
|
||||
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash1", "manager_id1")
|
||||
response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP", "object_id1")
|
||||
self.assertEqual(set(response), {b"manager_id1"})
|
||||
with self.assertRaises(redis.ResponseError):
|
||||
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id1")
|
||||
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id2")
|
||||
# Check that the second manager was added, even though the hash was
|
||||
# mismatched.
|
||||
response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP", "object_id1")
|
||||
self.assertEqual(set(response), {b"manager_id1", b"manager_id2"})
|
||||
# Check that it is fine if we add the same object ID multiple times with the
|
||||
# same hash.
|
||||
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash1", "manager_id1")
|
||||
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash1", "manager_id1")
|
||||
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash1", "manager_id2")
|
||||
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 2, "hash1", "manager_id2")
|
||||
# Check that it is fine if we add the same object ID multiple times with the most recent hash.
|
||||
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id1")
|
||||
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id1")
|
||||
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id2")
|
||||
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 2, "hash2", "manager_id2")
|
||||
response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP", "object_id1")
|
||||
self.assertEqual(set(response), {b"manager_id1", b"manager_id2"})
|
||||
|
||||
def testObjectTableAddAndLookup(self):
|
||||
# Try calling RAY.OBJECT_TABLE_LOOKUP with an object ID that has not been
|
||||
|
||||
@@ -721,50 +721,6 @@ class TestPlasmaManager(unittest.TestCase):
|
||||
assert_get_object_equal(self, self.client1, self.client2, object_id2,
|
||||
memory_buffer=memory_buffer2, metadata=metadata2)
|
||||
|
||||
def test_illegal_put(self):
    """
    Test doing a put at the same object ID, but with different object data.

    The first put should succeed. The second put should cause the plasma
    manager to exit with a fatal error. The test is skipped under valgrind,
    which complains when processes die without freeing their state.
    """
    if USE_VALGRIND:
        # Don't run this test when we are using valgrind because when
        # processes die without freeing up their state, valgrind complains.
        return

    # Create and seal the first object.
    length = 1000
    object_id = random_object_id()
    memory_buffer1 = self.client1.create(object_id, length)
    for i in range(length):
        memory_buffer1[i] = chr(i % 256)
    self.client1.seal(object_id)

    # Create and seal the second object. It has all the same data as the
    # first object, except that the first byte differs (1 instead of 0).
    memory_buffer2 = self.client2.create(object_id, length)
    for i in range(length):
        j = i
        if j == 0:
            j += 1
        memory_buffer2[i] = chr(j % 256)
    self.client2.seal(object_id)

    # Make sure that one of the plasma managers exited (the second one to
    # call RAY.OBJECT_TABLE_ADD should have exited). In the vast majority of
    # cases, this should be p5. However, on Travis, it is frequently p4.
    # NOTE(review): p4/p5 are presumably subprocess.Popen handles, so poll()
    # returns None while the process is still running -- confirm in setUp.
    timeout = 100
    time_left = timeout
    while time_left > 0:
        if self.p5.poll() is not None:
            self.processes_to_kill.remove(self.p5)
            break
        if self.p4.poll() is not None:
            self.processes_to_kill.remove(self.p4)
            break
        time_left -= 0.1
        time.sleep(0.1)

    # Use fixed-point formatting: a bare ".2" precision means two
    # significant digits, not two decimal places.
    print("Time waiting for plasma manager to fail = {:.2f}".format(
        timeout - time_left))
    # Check that exactly one of the plasma managers has died.
    self.assertEqual([self.p5.poll(), self.p4.poll()].count(None), 1)
|
||||
|
||||
def test_illegal_functionality(self):
|
||||
# Create an object id string.
|
||||
object_id = random_object_id()
|
||||
|
||||
@@ -49,6 +49,10 @@ NIL_ACTOR_ID = 20 * b"\xff"
|
||||
# fetch the object again.
|
||||
GET_TIMEOUT_MILLISECONDS = 1000
|
||||
|
||||
# This must be kept in sync with the `error_types` array in
|
||||
# common/state/error_table.h.
|
||||
OBJECT_HASH_MISMATCH_ERROR_TYPE = b"object_hash_mismatch"
|
||||
|
||||
def random_string():
    """Return a random 20-byte string produced by ``np.random.bytes``."""
    num_bytes = 20
    return np.random.bytes(num_bytes)
|
||||
|
||||
@@ -677,6 +681,12 @@ def error_info(worker=global_worker):
|
||||
for error_key in error_keys:
|
||||
if error_applies_to_driver(error_key, worker=worker):
|
||||
error_contents = worker.redis_client.hgetall(error_key)
|
||||
# If the error is an object hash mismatch, look up the function name for
|
||||
# the nondeterministic task.
|
||||
if error_contents[b"type"] == OBJECT_HASH_MISMATCH_ERROR_TYPE:
|
||||
function_id = error_contents[b"data"]
|
||||
function_name = worker.redis_client.hget("RemoteFunction:{}".format(function_id), "name")
|
||||
error_contents[b"data"] = function_name
|
||||
errors.append(error_contents)
|
||||
|
||||
return errors
|
||||
|
||||
Reference in New Issue
Block a user