Warn the user when a nondeterministic task is detected. (#339)

* WARN instead of FATAL for object hash mismatches, push error to driver

* Document the callback signature for object_table_add/remove

* Error table

* Wait for all errors in Python test

* Fix doc

* Fix state test
This commit is contained in:
Stephanie Wang
2017-03-07 00:32:15 -08:00
committed by Robert Nishihara
parent 0b8d279ef2
commit da06b4db82
15 changed files with 303 additions and 78 deletions
+14 -6
View File
@@ -65,14 +65,22 @@ class TestGlobalStateStore(unittest.TestCase):
# Check that Redis returns an error when RAY.OBJECT_TABLE_ADD adds an object
# ID that is already present with a different hash.
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash1", "manager_id1")
response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP", "object_id1")
self.assertEqual(set(response), {b"manager_id1"})
with self.assertRaises(redis.ResponseError):
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id1")
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id2")
# Check that the second manager was added, even though the hash was
# mismatched.
response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP", "object_id1")
self.assertEqual(set(response), {b"manager_id1", b"manager_id2"})
# Check that it is fine if we add the same object ID multiple times with the
# same hash.
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash1", "manager_id1")
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash1", "manager_id1")
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash1", "manager_id2")
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 2, "hash1", "manager_id2")
# most recent hash.
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id1")
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id1")
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 1, "hash2", "manager_id2")
self.redis.execute_command("RAY.OBJECT_TABLE_ADD", "object_id1", 2, "hash2", "manager_id2")
response = self.redis.execute_command("RAY.OBJECT_TABLE_LOOKUP", "object_id1")
self.assertEqual(set(response), {b"manager_id1", b"manager_id2"})
def testObjectTableAddAndLookup(self):
# Try calling RAY.OBJECT_TABLE_LOOKUP with an object ID that has not been
-44
View File
@@ -721,50 +721,6 @@ class TestPlasmaManager(unittest.TestCase):
assert_get_object_equal(self, self.client1, self.client2, object_id2,
memory_buffer=memory_buffer2, metadata=metadata2)
def test_illegal_put(self):
    """Put different data at the same object ID and expect one manager to die.

    The first put (through ``self.client1``) should succeed. The second put
    (through ``self.client2``) reuses the object ID with different contents,
    which should cause one of the plasma managers to exit with a fatal
    error. The test polls ``self.p5`` and ``self.p4`` for up to roughly 100
    seconds and then asserts that exactly one of them has exited.
    """
    if USE_VALGRIND:
        # Don't run this test when we are using valgrind because when
        # processes die without freeing up their state, valgrind complains.
        return
    # Create and seal the first object.
    length = 1000
    object_id = random_object_id()
    memory_buffer1 = self.client1.create(object_id, length)
    for i in range(length):
        memory_buffer1[i] = chr(i % 256)
    self.client1.seal(object_id)
    # Create and seal the second object. It has all the same data as the
    # first object, except that the first byte is 1 instead of 0 (a single
    # flipped bit).
    memory_buffer2 = self.client2.create(object_id, length)
    for i in range(length):
        j = 1 if i == 0 else i
        memory_buffer2[i] = chr(j % 256)
    self.client2.seal(object_id)
    # Make sure that one of the plasma managers exited (the second one to
    # call RAY.OBJECT_TABLE_ADD should have exited). In the vast majority of
    # cases, this should be p5. However, on Travis, it is frequently p4.
    time_left = 100
    while time_left > 0:
        # poll() yields None while the process is still running (assumes p4
        # and p5 are subprocess.Popen-like handles -- TODO confirm), so
        # compare by identity rather than equality.
        if self.p5.poll() is not None:
            # The process is already dead, so it no longer needs killing.
            self.processes_to_kill.remove(self.p5)
            break
        if self.p4.poll() is not None:
            self.processes_to_kill.remove(self.p4)
            break
        time_left -= 0.1
        time.sleep(0.1)
    # Use a fixed-point spec: "{:.2}" would print two significant digits
    # (e.g. "1.2e+01") instead of two decimal places.
    print("Time waiting for plasma manager to fail = {:.2f}"
          .format(100 - time_left))
    # Check that exactly one of the plasma managers has died, i.e. exactly
    # one of them is still running.
    self.assertEqual([self.p5.poll(), self.p4.poll()].count(None), 1)
def test_illegal_functionality(self):
# Create an object id string.
object_id = random_object_id()
+10
View File
@@ -49,6 +49,10 @@ NIL_ACTOR_ID = 20 * b"\xff"
# fetch the object again.
GET_TIMEOUT_MILLISECONDS = 1000
# This must be kept in sync with the `error_types` array in
# common/state/error_table.h.
OBJECT_HASH_MISMATCH_ERROR_TYPE = b"object_hash_mismatch"
def random_string():
    """Return 20 random bytes drawn from NumPy's global random generator."""
    num_bytes = 20
    return np.random.bytes(num_bytes)
@@ -677,6 +681,12 @@ def error_info(worker=global_worker):
for error_key in error_keys:
if error_applies_to_driver(error_key, worker=worker):
error_contents = worker.redis_client.hgetall(error_key)
# If the error is an object hash mismatch, look up the function name for
# the nondeterministic task.
if error_contents[b"type"] == OBJECT_HASH_MISMATCH_ERROR_TYPE:
function_id = error_contents[b"data"]
function_name = worker.redis_client.hget("RemoteFunction:{}".format(function_id), "name")
error_contents[b"data"] = function_name
errors.append(error_contents)
return errors