mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 06:15:23 +08:00
87bb7a8f67
* Revert "Make tests more informative (#372)" This reverts commit fd353250c8. * fix bugs, in particular deactivate worker service on driver and remove condition variables * changes to minimize the changes in this PR * switch from faulty mutex synchronization to using atomics * Increase the default size of the message queues, to accommodate exporting large numbers of remote functions. This is a temporary fix, but not a long term solution. * Reorganize the scheduler export code to queue up exports. This does not solve the underlying problem yet, but sets up a solution. * Start a separate thread on driver to print error messages by constantly querying the scheduler. This is a temporary solution because the solution based on starting a worker service for the driver which the scheduler can push error messages to is buggy. * Fix segfault in taskcapsule destructor. * Move tests for catching errors into a separate test file. * Revert "roll back grpc (#368)" This reverts commit c01ef95d04.
146 lines
5.2 KiB
Python
146 lines
5.2 KiB
Python
import unittest
|
|
import ray
|
|
import time
|
|
|
|
import test_functions
|
|
|
|
class FailureTest(unittest.TestCase):
    """Tests that tasks violating their declared return signature fail.

    Each test starts Ray locally with ``driver_mode=ray.SILENT_MODE``,
    submits remote functions from ``test_functions`` and then checks the
    failures recorded in ``ray.task_info()``.
    """

    def _poll_task_info(self, key, minimum, attempts=100, delay=0.1):
        """Poll ``ray.task_info()`` until ``key`` has ``minimum`` entries.

        Args:
            key: Name of the list inside the task info dictionary to watch.
            minimum: Number of entries to wait for.
            attempts: Maximum number of polls before giving up.
            delay: Seconds to sleep between two polls.

        Returns:
            The most recently fetched task info dictionary. The wait may
            have timed out, so callers must still assert on its contents.
        """
        for _ in range(attempts):
            info = ray.task_info()
            if len(info[key]) >= minimum:
                return info
            time.sleep(delay)
        return ray.task_info()

    def testNoArgs(self):
        """A function declared with 0 return values that returns one fails."""
        reload(test_functions)
        ray.init(start_ray_local=True, num_workers=1,
                 driver_mode=ray.SILENT_MODE)
        # Registered right after init so that ray.worker.cleanup() still
        # runs when one of the assertions below fails.
        self.addCleanup(ray.worker.cleanup)

        test_functions.no_op_fail.remote()
        # Poll instead of sleeping for a fixed interval, so a slow worker
        # does not make the test fail spuriously.
        task_info = self._poll_task_info("failed_tasks", 1)
        self.assertEqual(len(task_info["failed_tasks"]), 1)
        self.assertEqual(len(task_info["running_tasks"]), 0)
        self.assertIn(
            "The @remote decorator for function test_functions.no_op_fail "
            "has 0 return values, but test_functions.no_op_fail returned "
            "more than 0 values.",
            task_info["failed_tasks"][0].get("error_message"))

    def testTypeChecking(self):
        """Functions whose return values do not type check fail."""
        reload(test_functions)
        ray.init(start_ray_local=True, num_workers=1,
                 driver_mode=ray.SILENT_MODE)
        # Registered right after init so that ray.worker.cleanup() still
        # runs when one of the assertions below fails.
        self.addCleanup(ray.worker.cleanup)

        # Make sure that these functions throw exceptions because their
        # return values do not type check.
        test_functions.test_return1.remote()
        test_functions.test_return2.remote()
        task_info = self._poll_task_info("failed_tasks", 2)
        self.assertEqual(len(task_info["failed_tasks"]), 2)
        self.assertEqual(len(task_info["running_tasks"]), 0)
|
|
|
|
class TaskStatusTest(unittest.TestCase):
    """Tests that failures are reported through ``ray.task_info()``.

    Covers failed tasks, a remote function that raises while being
    unpickled, and reusable variables whose initializer or reinitializer
    raises.
    """

    def _poll_task_info(self, key, minimum, attempts=100, delay=0.1):
        """Poll ``ray.task_info()`` until ``key`` has ``minimum`` entries.

        Args:
            key: Name of the list inside the task info dictionary to watch.
            minimum: Number of entries to wait for.
            attempts: Maximum number of polls before giving up.
            delay: Seconds to sleep between two polls.

        Returns:
            The most recently fetched task info dictionary. The wait may
            have timed out, so callers must still assert on its contents.
        """
        for _ in range(attempts):
            info = ray.task_info()
            if len(info[key]) >= minimum:
                return info
            time.sleep(delay)
        return ray.task_info()

    def testFailedTask(self):
        """Failed tasks are listed with metadata and re-raised by ray.get."""
        reload(test_functions)
        ray.init(start_ray_local=True, num_workers=3,
                 driver_mode=ray.SILENT_MODE)
        # Registered right after init so that ray.worker.cleanup() still
        # runs when one of the assertions below fails.
        self.addCleanup(ray.worker.cleanup)

        test_functions.test_alias_f.remote()
        test_functions.throw_exception_fct1.remote()
        test_functions.throw_exception_fct1.remote()
        failed_tasks = self._poll_task_info("failed_tasks", 2)["failed_tasks"]
        self.assertEqual(len(failed_tasks), 2)
        seen_operation_ids = set()
        for task in failed_tasks:
            self.assertIn("worker_address", task)
            self.assertIn("operationid", task)
            self.assertIn("Test function 1 intentionally failed.",
                          task.get("error_message"))
            # Every failed task must carry a distinct operation id.
            self.assertNotIn(task["operationid"], seen_operation_ids)
            seen_operation_ids.add(task["operationid"])

        # ray.get on the result of a failed task should throw an exception.
        x = test_functions.throw_exception_fct2.remote()
        with self.assertRaises(Exception) as raised:
            ray.get(x)
        self.assertIn("Test function 2 intentionally failed.",
                      str(raised.exception))

        # The same holds for every object returned by a failed task.
        x, y, z = test_functions.throw_exception_fct3.remote(1.0)
        for ref in [x, y, z]:
            with self.assertRaises(Exception) as raised:
                ray.get(ref)
            self.assertIn("Test function 3 intentionally failed.",
                          str(raised.exception))

    def testFailImportingRemoteFunction(self):
        """A remote function that fails to unpickle is reported."""
        ray.init(start_ray_local=True, num_workers=2,
                 driver_mode=ray.SILENT_MODE)
        # Registered right after init so that ray.worker.cleanup() still
        # runs when one of the assertions below fails.
        self.addCleanup(ray.worker.cleanup)

        # This example is somewhat contrived. It should be successfully
        # pickled, and then it should throw an exception when it is
        # unpickled. This may depend a bit on the specifics of our pickler.
        def reducer(*args):
            raise Exception("There is a problem here.")

        class Foo(object):
            def __init__(self):
                self.__name__ = "Foo_object"
                self.func_doc = ""
                self.__globals__ = {}

            def __reduce__(self):
                return reducer, ()

            def __call__(self):
                return

        ray.remote([], [])(Foo())
        key = "failed_remote_function_imports"
        failures = self._poll_task_info(key, 1)[key]
        # Fail with a readable assertion rather than an IndexError if the
        # wait timed out.
        self.assertTrue(failures, "no entry in %s" % key)
        self.assertIn("There is a problem here.",
                      failures[0]["error_message"])

    def testFailImportingReusableVariable(self):
        """A reusable variable whose initializer raises is reported."""
        ray.init(start_ray_local=True, num_workers=2,
                 driver_mode=ray.SILENT_MODE)
        # Registered right after init so that ray.worker.cleanup() still
        # runs when one of the assertions below fails.
        self.addCleanup(ray.worker.cleanup)

        # This will throw an exception when the reusable variable is
        # imported on the workers.
        def initializer():
            if ray.worker.global_worker.mode == ray.WORKER_MODE:
                raise Exception("The initializer failed.")
            return 0

        ray.reusables.foo = ray.Reusable(initializer)
        key = "failed_reusable_variable_imports"
        failures = self._poll_task_info(key, 1)[key]
        # Fail with a readable assertion rather than an IndexError if the
        # wait timed out.
        self.assertTrue(failures, "no entry in %s" % key)
        # Check that the error message is in the task info.
        self.assertIn("The initializer failed.",
                      failures[0]["error_message"])

    def testFailReinitializingVariable(self):
        """A reusable variable whose reinitializer raises is reported."""
        ray.init(start_ray_local=True, num_workers=2,
                 driver_mode=ray.SILENT_MODE)
        # Registered right after init so that ray.worker.cleanup() still
        # runs when one of the assertions below fails.
        self.addCleanup(ray.worker.cleanup)

        def initializer():
            return 0

        def reinitializer(foo):
            raise Exception("The reinitializer failed.")

        ray.reusables.foo = ray.Reusable(initializer, reinitializer)

        @ray.remote([], [])
        def use_foo():
            # Only touches the reusable variable; the test expects running
            # this task to lead to the failing reinitializer being invoked.
            ray.reusables.foo

        use_foo.remote()
        key = "failed_reinitialize_reusable_variables"
        failures = self._poll_task_info(key, 1)[key]
        # Fail with a readable assertion rather than an IndexError if the
        # wait timed out.
        self.assertTrue(failures, "no entry in %s" % key)
        # Check that the error message is in the task info.
        self.assertIn("The reinitializer failed.",
                      failures[0]["error_message"])
|
|
|
|
if __name__ == "__main__":
    # Script entry point: hand control to the unittest runner with
    # verbosity level 2.
    unittest.main(verbosity=2)
|