From eda6ebb87ddef8f34e7ea6ef7fbf2ab8dc0c9ed2 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 31 Aug 2018 11:24:15 -0700 Subject: [PATCH] Convert some unittests to pytest. (#2779) * Convert multi_node_test.py to pytest. * Convert array_test.py to pytest. * Convert failure_test.py to pytest. * Convert microbenchmarks to pytest. * Convert component_failures_test.py to pytest and some minor quotes changes. * Convert tensorflow_test.py to pytest. * Convert actor_test.py to pytest. * Fix. * Fix --- python/ray/global_scheduler/test/test.py | 6 +- test/actor_test.py | 3631 +++++++++++----------- test/array_test.py | 435 ++- test/component_failures_test.py | 600 ++-- test/credis_test.py | 2 +- test/failure_test.py | 767 +++-- test/microbenchmarks.py | 216 +- test/multi_node_test.py | 390 +-- test/runtest.py | 4 +- test/tensorflow_test.py | 339 +- 10 files changed, 3185 insertions(+), 3205 deletions(-) diff --git a/python/ray/global_scheduler/test/test.py b/python/ray/global_scheduler/test/test.py index d8ef98264..37aad62ee 100644 --- a/python/ray/global_scheduler/test/test.py +++ b/python/ray/global_scheduler/test/test.py @@ -189,7 +189,7 @@ class TestGlobalScheduler(unittest.TestCase): assert (db_client_id is not None) @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), + os.environ.get("RAY_USE_NEW_GCS", False), "New GCS API doesn't have a Python API yet.") def test_integration_single_task(self): # There should be three db clients, the global scheduler, the local @@ -307,13 +307,13 @@ class TestGlobalScheduler(unittest.TestCase): self.assertEqual(num_tasks_done + num_tasks_waiting, num_tasks) @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), + os.environ.get("RAY_USE_NEW_GCS", False), "New GCS API doesn't have a Python API yet.") def test_integration_many_tasks_handler_sync(self): self.integration_many_tasks_helper(timesync=True) @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), + os.environ.get("RAY_USE_NEW_GCS", False), "New GCS API doesn't have a Python API yet.") def test_integration_many_tasks(self): # More realistic case: should handle out of order object and task diff --git a/test/actor_test.py b/test/actor_test.py index 49b271e0f..70ea474f4 100644 --- a/test/actor_test.py +++ b/test/actor_test.py @@ -9,1988 +9,1989 @@ import os import pytest import sys import time -import unittest import ray import ray.ray_constants as ray_constants import ray.test.test_utils -class ActorAPI(unittest.TestCase): - def tearDown(self): - ray.shutdown() +@pytest.fixture +def ray_start_regular(): + # Start the Ray processes. + ray.init(num_cpus=1) + yield None + # The code after the yield will run as teardown code. + ray.shutdown() - def testKeywordArgs(self): - ray.init(num_workers=0) - @ray.remote - class Actor(object): - def __init__(self, arg0, arg1=1, arg2="a"): - self.arg0 = arg0 - self.arg1 = arg1 - self.arg2 = arg2 +@pytest.fixture +def shutdown_only(): + yield None + # The code after the yield will run as teardown code. + ray.shutdown() - def get_values(self, arg0, arg1=2, arg2="b"): - return self.arg0 + arg0, self.arg1 + arg1, self.arg2 + arg2 - actor = Actor.remote(0) - assert ray.get(actor.get_values.remote(1)) == (1, 3, "ab") +def test_keyword_args(ray_start_regular): + @ray.remote + class Actor(object): + def __init__(self, arg0, arg1=1, arg2="a"): + self.arg0 = arg0 + self.arg1 = arg1 + self.arg2 = arg2 - actor = Actor.remote(1, 2) - assert ray.get(actor.get_values.remote(2, 3)) == (3, 5, "ab") + def get_values(self, arg0, arg1=2, arg2="b"): + return self.arg0 + arg0, self.arg1 + arg1, self.arg2 + arg2 - actor = Actor.remote(1, 2, "c") - assert ray.get(actor.get_values.remote(2, 3, "d")) == (3, 5, "cd") + actor = Actor.remote(0) + assert ray.get(actor.get_values.remote(1)) == (1, 3, "ab") - actor = Actor.remote(1, arg2="c") - assert ray.get(actor.get_values.remote(0, arg2="d")) == (1, 3, "cd") - assert ray.get(actor.get_values.remote(0, arg2="d", arg1=0)) == (1, 1, - "cd") + actor = Actor.remote(1, 2) + assert ray.get(actor.get_values.remote(2, 3)) == (3, 5, "ab") - actor = Actor.remote(1, arg2="c", arg1=2) - assert ray.get(actor.get_values.remote(0, arg2="d")) == (1, 4, "cd") - assert ray.get(actor.get_values.remote(0, arg2="d", arg1=0)) == (1, 2, - "cd") - assert ray.get(actor.get_values.remote(arg2="d", arg1=0, - arg0=2)) == (3, 2, "cd") + actor = Actor.remote(1, 2, "c") + assert ray.get(actor.get_values.remote(2, 3, "d")) == (3, 5, "cd") - # Make sure we get an exception if the constructor is called - # incorrectly. - with pytest.raises(Exception): - actor = Actor.remote() + actor = Actor.remote(1, arg2="c") + assert ray.get(actor.get_values.remote(0, arg2="d")) == (1, 3, "cd") + assert ray.get(actor.get_values.remote(0, arg2="d", arg1=0)) == (1, 1, + "cd") - with pytest.raises(Exception): - actor = Actor.remote(0, 1, 2, arg3=3) - - with pytest.raises(Exception): - actor = Actor.remote(0, arg0=1) - - # Make sure we get an exception if the method is called incorrectly. - actor = Actor.remote(1) - with pytest.raises(Exception): - ray.get(actor.get_values.remote()) - - def testVariableNumberOfArgs(self): - ray.init(num_workers=0) - - @ray.remote - class Actor(object): - def __init__(self, arg0, arg1=1, *args): - self.arg0 = arg0 - self.arg1 = arg1 - self.args = args - - def get_values(self, arg0, arg1=2, *args): - return self.arg0 + arg0, self.arg1 + arg1, self.args, args - - actor = Actor.remote(0) - assert ray.get(actor.get_values.remote(1)) == (1, 3, (), ()) - - actor = Actor.remote(1, 2) - assert ray.get(actor.get_values.remote(2, 3)) == (3, 5, (), ()) - - actor = Actor.remote(1, 2, "c") - assert ray.get(actor.get_values.remote(2, 3, "d")) == (3, 5, ("c", ), - ("d", )) - - actor = Actor.remote(1, 2, "a", "b", "c", "d") - assert ray.get(actor.get_values.remote( - 2, 3, 1, 2, 3, 4)) == (3, 5, ("a", "b", "c", "d"), (1, 2, 3, 4)) - - @ray.remote - class Actor(object): - def __init__(self, *args): - self.args = args - - def get_values(self, *args): - return self.args, args - - a = Actor.remote() - assert ray.get(a.get_values.remote()) == ((), ()) - a = Actor.remote(1) - assert ray.get(a.get_values.remote(2)) == ((1, ), (2, )) - a = Actor.remote(1, 2) - assert ray.get(a.get_values.remote(3, 4)) == ((1, 2), (3, 4)) - - def testNoArgs(self): - ray.init(num_workers=0) - - @ray.remote - class Actor(object): - def __init__(self): - pass - - def get_values(self): - pass + actor = Actor.remote(1, arg2="c", arg1=2) + assert ray.get(actor.get_values.remote(0, arg2="d")) == (1, 4, "cd") + assert ray.get(actor.get_values.remote(0, arg2="d", arg1=0)) == (1, 2, + "cd") + assert ray.get(actor.get_values.remote(arg2="d", arg1=0, arg0=2)) == (3, 2, + "cd") + # Make sure we get an exception if the constructor is called + # incorrectly. + with pytest.raises(Exception): actor = Actor.remote() - assert ray.get(actor.get_values.remote()) is None - def testNoConstructor(self): - # If no __init__ method is provided, that should not be a problem. - ray.init(num_workers=0) + with pytest.raises(Exception): + actor = Actor.remote(0, 1, 2, arg3=3) - @ray.remote - class Actor(object): - def get_values(self): - pass + with pytest.raises(Exception): + actor = Actor.remote(0, arg0=1) - actor = Actor.remote() - assert ray.get(actor.get_values.remote()) is None + # Make sure we get an exception if the method is called incorrectly. + actor = Actor.remote(1) + with pytest.raises(Exception): + ray.get(actor.get_values.remote()) - def testCustomClasses(self): - ray.init(num_workers=0) - class Foo(object): - def __init__(self, x): - self.x = x +def test_variable_number_of_args(ray_start_regular): + @ray.remote + class Actor(object): + def __init__(self, arg0, arg1=1, *args): + self.arg0 = arg0 + self.arg1 = arg1 + self.args = args - @ray.remote - class Actor(object): - def __init__(self, f2): - self.f1 = Foo(1) - self.f2 = f2 + def get_values(self, arg0, arg1=2, *args): + return self.arg0 + arg0, self.arg1 + arg1, self.args, args - def get_values1(self): - return self.f1, self.f2 + actor = Actor.remote(0) + assert ray.get(actor.get_values.remote(1)) == (1, 3, (), ()) - def get_values2(self, f3): - return self.f1, self.f2, f3 + actor = Actor.remote(1, 2) + assert ray.get(actor.get_values.remote(2, 3)) == (3, 5, (), ()) - actor = Actor.remote(Foo(2)) - results1 = ray.get(actor.get_values1.remote()) - assert results1[0].x == 1 - assert results1[1].x == 2 - results2 = ray.get(actor.get_values2.remote(Foo(3))) - assert results2[0].x == 1 - assert results2[1].x == 2 - assert results2[2].x == 3 + actor = Actor.remote(1, 2, "c") + assert ray.get(actor.get_values.remote(2, 3, "d")) == (3, 5, ("c", ), + ("d", )) - def testCachingActors(self): - # Test defining actors before ray.init() has been called. + actor = Actor.remote(1, 2, "a", "b", "c", "d") + assert ray.get(actor.get_values.remote( + 2, 3, 1, 2, 3, 4)) == (3, 5, ("a", "b", "c", "d"), (1, 2, 3, 4)) - @ray.remote - class Foo(object): - def __init__(self): - pass + @ray.remote + class Actor(object): + def __init__(self, *args): + self.args = args - def get_val(self): - return 3 + def get_values(self, *args): + return self.args, args - # Check that we can't actually create actors before ray.init() has been - # called. - with pytest.raises(Exception): - f = Foo.remote() + a = Actor.remote() + assert ray.get(a.get_values.remote()) == ((), ()) + a = Actor.remote(1) + assert ray.get(a.get_values.remote(2)) == ((1, ), (2, )) + a = Actor.remote(1, 2) + assert ray.get(a.get_values.remote(3, 4)) == ((1, 2), (3, 4)) - ray.init(num_workers=0) +def test_no_args(ray_start_regular): + @ray.remote + class Actor(object): + def __init__(self): + pass + + def get_values(self): + pass + + actor = Actor.remote() + assert ray.get(actor.get_values.remote()) is None + + +def test_no_constructor(ray_start_regular): + # If no __init__ method is provided, that should not be a problem. + @ray.remote + class Actor(object): + def get_values(self): + pass + + actor = Actor.remote() + assert ray.get(actor.get_values.remote()) is None + + +def test_custom_classes(ray_start_regular): + class Foo(object): + def __init__(self, x): + self.x = x + + @ray.remote + class Actor(object): + def __init__(self, f2): + self.f1 = Foo(1) + self.f2 = f2 + + def get_values1(self): + return self.f1, self.f2 + + def get_values2(self, f3): + return self.f1, self.f2, f3 + + actor = Actor.remote(Foo(2)) + results1 = ray.get(actor.get_values1.remote()) + assert results1[0].x == 1 + assert results1[1].x == 2 + results2 = ray.get(actor.get_values2.remote(Foo(3))) + assert results2[0].x == 1 + assert results2[1].x == 2 + assert results2[2].x == 3 + + +def test_caching_actors(shutdown_only): + # Test defining actors before ray.init() has been called. + + @ray.remote + class Foo(object): + def __init__(self): + pass + + def get_val(self): + return 3 + + # Check that we can't actually create actors before ray.init() has been + # called. + with pytest.raises(Exception): f = Foo.remote() - assert ray.get(f.get_val.remote()) == 3 + ray.init(num_cpus=1) - def testDecoratorArgs(self): - ray.init(num_workers=0) + f = Foo.remote() - # This is an invalid way of using the actor decorator. - with pytest.raises(Exception): + assert ray.get(f.get_val.remote()) == 3 - @ray.remote() - class Actor(object): - def __init__(self): - pass - # This is an invalid way of using the actor decorator. - with pytest.raises(Exception): +def test_decorator_args(ray_start_regular): + # This is an invalid way of using the actor decorator. + with pytest.raises(Exception): - @ray.remote(invalid_kwarg=0) # noqa: F811 - class Actor(object): - def __init__(self): - pass - - # This is an invalid way of using the actor decorator. - with pytest.raises(Exception): - - @ray.remote(num_cpus=0, invalid_kwarg=0) # noqa: F811 - class Actor(object): - def __init__(self): - pass - - # This is a valid way of using the decorator. - @ray.remote(num_cpus=1) # noqa: F811 + @ray.remote() class Actor(object): def __init__(self): pass - # This is a valid way of using the decorator. - @ray.remote(num_gpus=1) # noqa: F811 + # This is an invalid way of using the actor decorator. + with pytest.raises(Exception): + + @ray.remote(invalid_kwarg=0) # noqa: F811 class Actor(object): def __init__(self): pass - # This is a valid way of using the decorator. - @ray.remote(num_cpus=1, num_gpus=1) # noqa: F811 + # This is an invalid way of using the actor decorator. + with pytest.raises(Exception): + + @ray.remote(num_cpus=0, invalid_kwarg=0) # noqa: F811 class Actor(object): def __init__(self): pass - def testRandomIDGeneration(self): - ray.init(num_workers=0) + # This is a valid way of using the decorator. + @ray.remote(num_cpus=1) # noqa: F811 + class Actor(object): + def __init__(self): + pass - @ray.remote - class Foo(object): - def __init__(self): - pass + # This is a valid way of using the decorator. + @ray.remote(num_gpus=1) # noqa: F811 + class Actor(object): + def __init__(self): + pass - # Make sure that seeding numpy does not interfere with the generation - # of actor IDs. - np.random.seed(1234) - random.seed(1234) - f1 = Foo.remote() - np.random.seed(1234) - random.seed(1234) - f2 = Foo.remote() - - assert f1._ray_actor_id.id() != f2._ray_actor_id.id() - - def testActorClassName(self): - ray.init(num_workers=0) - - @ray.remote - class Foo(object): - def __init__(self): - pass - - Foo.remote() - - r = ray.worker.global_worker.redis_client - actor_keys = r.keys("ActorClass*") - assert len(actor_keys) == 1 - actor_class_info = r.hgetall(actor_keys[0]) - assert actor_class_info[b"class_name"] == b"Foo" - assert actor_class_info[b"module"] == b"actor_test" - - def testMultipleReturnValues(self): - ray.init(num_workers=0) - - @ray.remote - class Foo(object): - def method0(self): - return 1 - - @ray.method(num_return_vals=1) - def method1(self): - return 1 - - @ray.method(num_return_vals=2) - def method2(self): - return 1, 2 - - @ray.method(num_return_vals=3) - def method3(self): - return 1, 2, 3 - - f = Foo.remote() - - id0 = f.method0.remote() - assert ray.get(id0) == 1 - - id1 = f.method1.remote() - assert ray.get(id1) == 1 - - id2a, id2b = f.method2.remote() - assert ray.get([id2a, id2b]) == [1, 2] - - id3a, id3b, id3c = f.method3.remote() - assert ray.get([id3a, id3b, id3c]) == [1, 2, 3] + # This is a valid way of using the decorator. + @ray.remote(num_cpus=1, num_gpus=1) # noqa: F811 + class Actor(object): + def __init__(self): + pass -class ActorMethods(unittest.TestCase): - def tearDown(self): - ray.shutdown() +def test_random_id_generation(ray_start_regular): + @ray.remote + class Foo(object): + def __init__(self): + pass - def testDefineActor(self): - ray.init() + # Make sure that seeding numpy does not interfere with the generation + # of actor IDs. + np.random.seed(1234) + random.seed(1234) + f1 = Foo.remote() + np.random.seed(1234) + random.seed(1234) + f2 = Foo.remote() - @ray.remote - class Test(object): - def __init__(self, x): - self.x = x + assert f1._ray_actor_id.id() != f2._ray_actor_id.id() - def f(self, y): - return self.x + y - t = Test.remote(2) - assert ray.get(t.f.remote(1)) == 3 +def test_actor_class_name(ray_start_regular): + @ray.remote + class Foo(object): + def __init__(self): + pass - # Make sure that calling an actor method directly raises an exception. - with pytest.raises(Exception): - t.f(1) + Foo.remote() - def testActorDeletion(self): - ray.init(num_workers=0) + r = ray.worker.global_worker.redis_client + actor_keys = r.keys("ActorClass*") + assert len(actor_keys) == 1 + actor_class_info = r.hgetall(actor_keys[0]) + assert actor_class_info[b"class_name"] == b"Foo" + assert actor_class_info[b"module"] == b"actor_test" - # Make sure that when an actor handles goes out of scope, the actor - # destructor is called. - @ray.remote - class Actor(object): - def getpid(self): - return os.getpid() +def test_multiple_return_values(ray_start_regular): + @ray.remote + class Foo(object): + def method0(self): + return 1 + @ray.method(num_return_vals=1) + def method1(self): + return 1 + + @ray.method(num_return_vals=2) + def method2(self): + return 1, 2 + + @ray.method(num_return_vals=3) + def method3(self): + return 1, 2, 3 + + f = Foo.remote() + + id0 = f.method0.remote() + assert ray.get(id0) == 1 + + id1 = f.method1.remote() + assert ray.get(id1) == 1 + + id2a, id2b = f.method2.remote() + assert ray.get([id2a, id2b]) == [1, 2] + + id3a, id3b, id3c = f.method3.remote() + assert ray.get([id3a, id3b, id3c]) == [1, 2, 3] + + +def test_define_actor(ray_start_regular): + @ray.remote + class Test(object): + def __init__(self, x): + self.x = x + + def f(self, y): + return self.x + y + + t = Test.remote(2) + assert ray.get(t.f.remote(1)) == 3 + + # Make sure that calling an actor method directly raises an exception. + with pytest.raises(Exception): + t.f(1) + + +def test_actor_deletion(ray_start_regular): + # Make sure that when an actor handles goes out of scope, the actor + # destructor is called. + + @ray.remote + class Actor(object): + def getpid(self): + return os.getpid() + + a = Actor.remote() + pid = ray.get(a.getpid.remote()) + a = None + ray.test.test_utils.wait_for_pid_to_exit(pid) + + actors = [Actor.remote() for _ in range(10)] + pids = ray.get([a.getpid.remote() for a in actors]) + a = None + actors = None + [ray.test.test_utils.wait_for_pid_to_exit(pid) for pid in pids] + + @ray.remote + class Actor(object): + def method(self): + return 1 + + # Make sure that if we create an actor and call a method on it + # immediately, the actor doesn't get killed before the method is + # called. + assert ray.get(Actor.remote().method.remote()) == 1 + + +def test_actor_deletion_with_gpus(shutdown_only): + ray.init(num_cpus=1, num_gpus=1) + + # When an actor that uses a GPU exits, make sure that the GPU resources + # are released. + + @ray.remote(num_gpus=1) + class Actor(object): + def getpid(self): + return os.getpid() + + for _ in range(5): + # If we can successfully create an actor, that means that enough + # GPU resources are available. a = Actor.remote() - pid = ray.get(a.getpid.remote()) - a = None - ray.test.test_utils.wait_for_pid_to_exit(pid) - - actors = [Actor.remote() for _ in range(10)] - pids = ray.get([a.getpid.remote() for a in actors]) - a = None - actors = None - [ray.test.test_utils.wait_for_pid_to_exit(pid) for pid in pids] - - @ray.remote - class Actor(object): - def method(self): - return 1 - - # Make sure that if we create an actor and call a method on it - # immediately, the actor doesn't get killed before the method is - # called. - assert ray.get(Actor.remote().method.remote()) == 1 - - def testActorDeletionWithGPUs(self): - ray.init(num_workers=0, num_gpus=1) - - # When an actor that uses a GPU exits, make sure that the GPU resources - # are released. - - @ray.remote(num_gpus=1) - class Actor(object): - def getpid(self): - return os.getpid() - - for _ in range(5): - # If we can successfully create an actor, that means that enough - # GPU resources are available. - a = Actor.remote() - ray.get(a.getpid.remote()) - - def testActorState(self): - ray.init() - - @ray.remote - class Counter(object): - def __init__(self): - self.value = 0 - - def increase(self): - self.value += 1 - - def value(self): - return self.value - - c1 = Counter.remote() - c1.increase.remote() - assert ray.get(c1.value.remote()) == 1 - - c2 = Counter.remote() - c2.increase.remote() - c2.increase.remote() - assert ray.get(c2.value.remote()) == 2 - - def testActorClassMethods(self): - ray.init() - - class Foo(object): - x = 2 - - @classmethod - def as_remote(cls): - return ray.remote(cls) - - @classmethod - def f(cls): - return cls.x - - @classmethod - def g(cls, y): - return cls.x + y - - def echo(self, value): - return value - - a = Foo.as_remote().remote() - assert ray.get(a.echo.remote(2)) == 2 - assert ray.get(a.f.remote()) == 2 - assert ray.get(a.g.remote(2)) == 4 - - def testMultipleActors(self): - # Create a bunch of actors and call a bunch of methods on all of them. - ray.init(num_workers=0) - - @ray.remote - class Counter(object): - def __init__(self, value): - self.value = value - - def increase(self): - self.value += 1 - return self.value - - def reset(self): - self.value = 0 - - num_actors = 20 - num_increases = 50 - # Create multiple actors. - actors = [Counter.remote(i) for i in range(num_actors)] - results = [] - # Call each actor's method a bunch of times. - for i in range(num_actors): - results += [ - actors[i].increase.remote() for _ in range(num_increases) - ] - result_values = ray.get(results) - for i in range(num_actors): - v = result_values[(num_increases * i):(num_increases * (i + 1))] - assert v == list(range(i + 1, num_increases + i + 1)) - - # Reset the actor values. - [actor.reset.remote() for actor in actors] - - # Interweave the method calls on the different actors. - results = [] - for j in range(num_increases): - results += [actor.increase.remote() for actor in actors] - result_values = ray.get(results) - for j in range(num_increases): - v = result_values[(num_actors * j):(num_actors * (j + 1))] - assert v == num_actors * [j + 1] + ray.get(a.getpid.remote()) -class ActorNesting(unittest.TestCase): - def tearDown(self): - ray.shutdown() +def test_actor_state(ray_start_regular): + @ray.remote + class Counter(object): + def __init__(self): + self.value = 0 - def testRemoteFunctionWithinActor(self): - # Make sure we can use remote funtions within actors. - ray.init(num_cpus=10) + def increase(self): + self.value += 1 - # Create some values to close over. - val1 = 1 - val2 = 2 + def value(self): + return self.value - @ray.remote - def f(x): - return val1 + x + c1 = Counter.remote() + c1.increase.remote() + assert ray.get(c1.value.remote()) == 1 - @ray.remote - def g(x): - return ray.get(f.remote(x)) + c2 = Counter.remote() + c2.increase.remote() + c2.increase.remote() + assert ray.get(c2.value.remote()) == 2 - @ray.remote - class Actor(object): - def __init__(self, x): - self.x = x - self.y = val2 - self.object_ids = [f.remote(i) for i in range(5)] - self.values2 = ray.get([f.remote(i) for i in range(5)]) - def get_values(self): - return self.x, self.y, self.object_ids, self.values2 +def test_actor_class_methods(ray_start_regular): + class Foo(object): + x = 2 - def f(self): - return [f.remote(i) for i in range(5)] + @classmethod + def as_remote(cls): + return ray.remote(cls) - def g(self): - return ray.get([g.remote(i) for i in range(5)]) + @classmethod + def f(cls): + return cls.x - def h(self, object_ids): - return ray.get(object_ids) + @classmethod + def g(cls, y): + return cls.x + y - actor = Actor.remote(1) - values = ray.get(actor.get_values.remote()) - assert values[0] == 1 - assert values[1] == val2 - assert ray.get(values[2]) == list(range(1, 6)) - assert values[3] == list(range(1, 6)) + def echo(self, value): + return value - assert ray.get(ray.get(actor.f.remote())) == list(range(1, 6)) - assert ray.get(actor.g.remote()) == list(range(1, 6)) - assert ray.get(actor.h.remote( - [f.remote(i) for i in range(5)])) == list(range(1, 6)) + a = Foo.as_remote().remote() + assert ray.get(a.echo.remote(2)) == 2 + assert ray.get(a.f.remote()) == 2 + assert ray.get(a.g.remote(2)) == 4 - def testDefineActorWithinActor(self): - # Make sure we can use remote funtions within actors. - ray.init(num_cpus=10) - @ray.remote - class Actor1(object): - def __init__(self, x): - self.x = x +def test_multiple_actors(ray_start_regular): + @ray.remote + class Counter(object): + def __init__(self, value): + self.value = value - def new_actor(self, z): - @ray.remote - class Actor2(object): - def __init__(self, x): - self.x = x + def increase(self): + self.value += 1 + return self.value - def get_value(self): - return self.x + def reset(self): + self.value = 0 - self.actor2 = Actor2.remote(z) + num_actors = 20 + num_increases = 50 + # Create multiple actors. + actors = [Counter.remote(i) for i in range(num_actors)] + results = [] + # Call each actor's method a bunch of times. + for i in range(num_actors): + results += [actors[i].increase.remote() for _ in range(num_increases)] + result_values = ray.get(results) + for i in range(num_actors): + v = result_values[(num_increases * i):(num_increases * (i + 1))] + assert v == list(range(i + 1, num_increases + i + 1)) - def get_values(self, z): - self.new_actor(z) - return self.x, ray.get(self.actor2.get_value.remote()) + # Reset the actor values. + [actor.reset.remote() for actor in actors] - actor1 = Actor1.remote(3) - assert ray.get(actor1.get_values.remote(5)) == (3, 5) + # Interweave the method calls on the different actors. + results = [] + for j in range(num_increases): + results += [actor.increase.remote() for actor in actors] + result_values = ray.get(results) + for j in range(num_increases): + v = result_values[(num_actors * j):(num_actors * (j + 1))] + assert v == num_actors * [j + 1] - def testUseActorWithinActor(self): - # Make sure we can use actors within actors. - ray.init(num_cpus=10) - @ray.remote - class Actor1(object): - def __init__(self, x): - self.x = x +@pytest.fixture +def ray_start_bigger(): + # Start the Ray processes. + ray.init(num_cpus=10) + yield None + # The code after the yield will run as teardown code. + ray.shutdown() - def get_val(self): - return self.x - @ray.remote - class Actor2(object): - def __init__(self, x, y): - self.x = x - self.actor1 = Actor1.remote(y) +def test_remote_function_within_actor(ray_start_bigger): + # Make sure we can use remote funtions within actors. - def get_values(self, z): - return self.x, ray.get(self.actor1.get_val.remote()) + # Create some values to close over. + val1 = 1 + val2 = 2 - actor2 = Actor2.remote(3, 4) - assert ray.get(actor2.get_values.remote(5)) == (3, 4) + @ray.remote + def f(x): + return val1 + x - def testDefineActorWithinRemoteFunction(self): - # Make sure we can define and actors within remote funtions. - ray.init(num_cpus=10) + @ray.remote + def g(x): + return ray.get(f.remote(x)) - @ray.remote - def f(x, n): + @ray.remote + class Actor(object): + def __init__(self, x): + self.x = x + self.y = val2 + self.object_ids = [f.remote(i) for i in range(5)] + self.values2 = ray.get([f.remote(i) for i in range(5)]) + + def get_values(self): + return self.x, self.y, self.object_ids, self.values2 + + def f(self): + return [f.remote(i) for i in range(5)] + + def g(self): + return ray.get([g.remote(i) for i in range(5)]) + + def h(self, object_ids): + return ray.get(object_ids) + + actor = Actor.remote(1) + values = ray.get(actor.get_values.remote()) + assert values[0] == 1 + assert values[1] == val2 + assert ray.get(values[2]) == list(range(1, 6)) + assert values[3] == list(range(1, 6)) + + assert ray.get(ray.get(actor.f.remote())) == list(range(1, 6)) + assert ray.get(actor.g.remote()) == list(range(1, 6)) + assert ray.get(actor.h.remote([f.remote(i) for i in range(5)])) == list( + range(1, 6)) + + +def test_define_actor_within_actor(ray_start_bigger): + # Make sure we can use remote funtions within actors. + + @ray.remote + class Actor1(object): + def __init__(self, x): + self.x = x + + def new_actor(self, z): @ray.remote - class Actor1(object): + class Actor2(object): def __init__(self, x): self.x = x def get_value(self): return self.x - actor = Actor1.remote(x) - return ray.get([actor.get_value.remote() for _ in range(n)]) + self.actor2 = Actor2.remote(z) - assert ray.get(f.remote(3, 1)) == [3] - assert ray.get([f.remote(i, 20) - for i in range(10)]) == [20 * [i] for i in range(10)] + def get_values(self, z): + self.new_actor(z) + return self.x, ray.get(self.actor2.get_value.remote()) - def testUseActorWithinRemoteFunction(self): - # Make sure we can create and use actors within remote funtions. - ray.init(num_cpus=10) + actor1 = Actor1.remote(3) + assert ray.get(actor1.get_values.remote(5)) == (3, 5) + +def test_use_actor_within_actor(ray_start_bigger): + # Make sure we can use actors within actors. + + @ray.remote + class Actor1(object): + def __init__(self, x): + self.x = x + + def get_val(self): + return self.x + + @ray.remote + class Actor2(object): + def __init__(self, x, y): + self.x = x + self.actor1 = Actor1.remote(y) + + def get_values(self, z): + return self.x, ray.get(self.actor1.get_val.remote()) + + actor2 = Actor2.remote(3, 4) + assert ray.get(actor2.get_values.remote(5)) == (3, 4) + + +def test_define_actor_within_remote_function(ray_start_bigger): + # Make sure we can define and actors within remote funtions. + + @ray.remote + def f(x, n): @ray.remote class Actor1(object): def __init__(self, x): self.x = x - def get_values(self): - return self.x - - @ray.remote - def f(x): - actor = Actor1.remote(x) - return ray.get(actor.get_values.remote()) - - assert ray.get(f.remote(3)) == 3 - - def testActorImportCounter(self): - # This is mostly a test of the export counters to make sure that when - # an actor is imported, all of the necessary remote functions have been - # imported. - ray.init(num_cpus=10) - - # Export a bunch of remote functions. - num_remote_functions = 50 - for i in range(num_remote_functions): - - @ray.remote - def f(): - return i - - @ray.remote - def g(): - @ray.remote - class Actor(object): - def __init__(self): - # This should use the last version of f. - self.x = ray.get(f.remote()) - - def get_val(self): - return self.x - - actor = Actor.remote() - return ray.get(actor.get_val.remote()) - - assert ray.get(g.remote()) == num_remote_functions - 1 - - -class ActorInheritance(unittest.TestCase): - def tearDown(self): - ray.shutdown() - - def testInheritActorFromClass(self): - # Make sure we can define an actor by inheriting from a regular class. - # Note that actors cannot inherit from other actors. - ray.init() - - class Foo(object): - def __init__(self, x): - self.x = x - - def f(self): - return self.x - - def g(self, y): - return self.x + y - - @ray.remote - class Actor(Foo): - def __init__(self, x): - Foo.__init__(self, x) - def get_value(self): - return self.f() + return self.x - actor = Actor.remote(1) - assert ray.get(actor.get_value.remote()) == 1 - assert ray.get(actor.g.remote(5)) == 6 + actor = Actor1.remote(x) + return ray.get([actor.get_value.remote() for _ in range(n)]) + + assert ray.get(f.remote(3, 1)) == [3] + assert ray.get( + [f.remote(i, 20) for i in range(10)]) == [20 * [i] for i in range(10)] -class ActorSchedulingProperties(unittest.TestCase): - def tearDown(self): - ray.shutdown() +def test_use_actor_within_remote_function(ray_start_bigger): + # Make sure we can create and use actors within remote funtions. - def testRemoteFunctionsNotScheduledOnActors(self): - # Make sure that regular remote functions are not scheduled on actors. - ray.init(num_workers=0) + @ray.remote + class Actor1(object): + def __init__(self, x): + self.x = x - @ray.remote - class Actor(object): - def __init__(self): - pass + def get_values(self): + return self.x - def get_id(self): - return ray.worker.global_worker.worker_id + @ray.remote + def f(x): + actor = Actor1.remote(x) + return ray.get(actor.get_values.remote()) - a = Actor.remote() - actor_id = ray.get(a.get_id.remote()) + assert ray.get(f.remote(3)) == 3 + + +def test_actor_import_counter(ray_start_bigger): + # This is mostly a test of the export counters to make sure that when + # an actor is imported, all of the necessary remote functions have been + # imported. + + # Export a bunch of remote functions. + num_remote_functions = 50 + for i in range(num_remote_functions): @ray.remote def f(): + return i + + @ray.remote + def g(): + @ray.remote + class Actor(object): + def __init__(self): + # This should use the last version of f. + self.x = ray.get(f.remote()) + + def get_val(self): + return self.x + + actor = Actor.remote() + return ray.get(actor.get_val.remote()) + + assert ray.get(g.remote()) == num_remote_functions - 1 + + +def test_inherit_actor_from_class(ray_start_regular): + # Make sure we can define an actor by inheriting from a regular class. + # Note that actors cannot inherit from other actors. + + class Foo(object): + def __init__(self, x): + self.x = x + + def f(self): + return self.x + + def g(self, y): + return self.x + y + + @ray.remote + class Actor(Foo): + def __init__(self, x): + Foo.__init__(self, x) + + def get_value(self): + return self.f() + + actor = Actor.remote(1) + assert ray.get(actor.get_value.remote()) == 1 + assert ray.get(actor.g.remote(5)) == 6 + + +def test_remote_functions_not_scheduled_on_actors(ray_start_regular): + # Make sure that regular remote functions are not scheduled on actors. + + @ray.remote + class Actor(object): + def __init__(self): + pass + + def get_id(self): return ray.worker.global_worker.worker_id - resulting_ids = ray.get([f.remote() for _ in range(100)]) - assert actor_id not in resulting_ids + a = Actor.remote() + actor_id = ray.get(a.get_id.remote()) + + @ray.remote + def f(): + return ray.worker.global_worker.worker_id + + resulting_ids = ray.get([f.remote() for _ in range(100)]) + assert actor_id not in resulting_ids -class ActorsOnMultipleNodes(unittest.TestCase): - def tearDown(self): - ray.shutdown() +def test_actors_on_nodes_with_no_cpus(ray_start_regular): + @ray.remote + class Foo(object): + def method(self): + pass - def testActorsOnNodesWithNoCPUs(self): - ray.init(num_cpus=0) - - @ray.remote - class Foo(object): - def method(self): - pass - - f = Foo.remote() - ready_ids, _ = ray.wait([f.method.remote()], timeout=100) - assert ready_ids == [] - - def testActorLoadBalancing(self): - num_local_schedulers = 3 - ray.worker._init( - start_ray_local=True, - num_workers=0, - num_local_schedulers=num_local_schedulers) - - @ray.remote - class Actor1(object): - def __init__(self): - pass - - def get_location(self): - return ray.worker.global_worker.plasma_client.store_socket_name - - # Create a bunch of actors. - num_actors = 30 - num_attempts = 20 - minimum_count = 5 - - # Make sure that actors are spread between the local schedulers. - attempts = 0 - while attempts < num_attempts: - actors = [Actor1.remote() for _ in range(num_actors)] - locations = ray.get( - [actor.get_location.remote() for actor in actors]) - names = set(locations) - counts = [locations.count(name) for name in names] - print("Counts are {}.".format(counts)) - if (len(names) == num_local_schedulers - and all(count >= minimum_count for count in counts)): - break - attempts += 1 - assert attempts < num_attempts - - # Make sure we can get the results of a bunch of tasks. - results = [] - for _ in range(1000): - index = np.random.randint(num_actors) - results.append(actors[index].get_location.remote()) - ray.get(results) + f = Foo.remote() + ready_ids, _ = ray.wait([f.method.remote()], timeout=100) + assert ready_ids == [] -class ActorsWithGPUs(unittest.TestCase): - def tearDown(self): - ray.shutdown() +def test_actor_load_balancing(shutdown_only): + num_local_schedulers = 3 + ray.worker._init( + start_ray_local=True, + num_workers=0, + num_local_schedulers=num_local_schedulers) - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), "Crashing with new GCS API.") - def testActorGPUs(self): - num_local_schedulers = 3 - num_gpus_per_scheduler = 4 - ray.worker._init( - start_ray_local=True, - num_workers=0, - num_local_schedulers=num_local_schedulers, - num_cpus=(num_local_schedulers * [10 * num_gpus_per_scheduler]), - num_gpus=(num_local_schedulers * [num_gpus_per_scheduler])) + @ray.remote + class Actor1(object): + def __init__(self): + pass - @ray.remote(num_gpus=1) - class Actor1(object): - def __init__(self): - self.gpu_ids = ray.get_gpu_ids() + def get_location(self): + return ray.worker.global_worker.plasma_client.store_socket_name - def get_location_and_ids(self): - assert ray.get_gpu_ids() == self.gpu_ids - return ( - ray.worker.global_worker.plasma_client.store_socket_name, + # Create a bunch of actors. + num_actors = 30 + num_attempts = 20 + minimum_count = 5 + + # Make sure that actors are spread between the local schedulers. + attempts = 0 + while attempts < num_attempts: + actors = [Actor1.remote() for _ in range(num_actors)] + locations = ray.get([actor.get_location.remote() for actor in actors]) + names = set(locations) + counts = [locations.count(name) for name in names] + print("Counts are {}.".format(counts)) + if (len(names) == num_local_schedulers + and all(count >= minimum_count for count in counts)): + break + attempts += 1 + assert attempts < num_attempts + + # Make sure we can get the results of a bunch of tasks. + results = [] + for _ in range(1000): + index = np.random.randint(num_actors) + results.append(actors[index].get_location.remote()) + ray.get(results) + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Failing with new GCS API on Linux.") +def test_actor_gpus(shutdown_only): + num_local_schedulers = 3 + num_gpus_per_scheduler = 4 + ray.worker._init( + start_ray_local=True, + num_workers=0, + num_local_schedulers=num_local_schedulers, + num_cpus=(num_local_schedulers * [10 * num_gpus_per_scheduler]), + num_gpus=(num_local_schedulers * [num_gpus_per_scheduler])) + + @ray.remote(num_gpus=1) + class Actor1(object): + def __init__(self): + self.gpu_ids = ray.get_gpu_ids() + + def get_location_and_ids(self): + assert ray.get_gpu_ids() == self.gpu_ids + return (ray.worker.global_worker.plasma_client.store_socket_name, tuple(self.gpu_ids)) - # Create one actor per GPU. - actors = [ - Actor1.remote() - for _ in range(num_local_schedulers * num_gpus_per_scheduler) + # Create one actor per GPU. + actors = [ + Actor1.remote() + for _ in range(num_local_schedulers * num_gpus_per_scheduler) + ] + # Make sure that no two actors are assigned to the same GPU. + locations_and_ids = ray.get( + [actor.get_location_and_ids.remote() for actor in actors]) + node_names = {location for location, gpu_id in locations_and_ids} + assert len(node_names) == num_local_schedulers + location_actor_combinations = [] + for node_name in node_names: + for gpu_id in range(num_gpus_per_scheduler): + location_actor_combinations.append((node_name, (gpu_id, ))) + assert set(locations_and_ids) == set(location_actor_combinations) + + # Creating a new actor should fail because all of the GPUs are being + # used. + a = Actor1.remote() + ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10) + assert ready_ids == [] + + +def test_actor_multiple_gpus(shutdown_only): + num_local_schedulers = 3 + num_gpus_per_scheduler = 5 + ray.worker._init( + start_ray_local=True, + num_workers=0, + num_local_schedulers=num_local_schedulers, + num_cpus=(num_local_schedulers * [10 * num_gpus_per_scheduler]), + num_gpus=(num_local_schedulers * [num_gpus_per_scheduler])) + + @ray.remote(num_gpus=2) + class Actor1(object): + def __init__(self): + self.gpu_ids = ray.get_gpu_ids() + + def get_location_and_ids(self): + assert ray.get_gpu_ids() == self.gpu_ids + return (ray.worker.global_worker.plasma_client.store_socket_name, + tuple(self.gpu_ids)) + + # Create some actors. + actors1 = [Actor1.remote() for _ in range(num_local_schedulers * 2)] + # Make sure that no two actors are assigned to the same GPU. + locations_and_ids = ray.get( + [actor.get_location_and_ids.remote() for actor in actors1]) + node_names = {location for location, gpu_id in locations_and_ids} + assert len(node_names) == num_local_schedulers + + # Keep track of which GPU IDs are being used for each location. + gpus_in_use = {node_name: [] for node_name in node_names} + for location, gpu_ids in locations_and_ids: + gpus_in_use[location].extend(gpu_ids) + for node_name in node_names: + assert len(set(gpus_in_use[node_name])) == 4 + + # Creating a new actor should fail because all of the GPUs are being + # used. + a = Actor1.remote() + ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10) + assert ready_ids == [] + + # We should be able to create more actors that use only a single GPU. + @ray.remote(num_gpus=1) + class Actor2(object): + def __init__(self): + self.gpu_ids = ray.get_gpu_ids() + + def get_location_and_ids(self): + return (ray.worker.global_worker.plasma_client.store_socket_name, + tuple(self.gpu_ids)) + + # Create some actors. + actors2 = [Actor2.remote() for _ in range(num_local_schedulers)] + # Make sure that no two actors are assigned to the same GPU. + locations_and_ids = ray.get( + [actor.get_location_and_ids.remote() for actor in actors2]) + names = {location for location, gpu_id in locations_and_ids} + assert node_names == names + for location, gpu_ids in locations_and_ids: + gpus_in_use[location].extend(gpu_ids) + for node_name in node_names: + assert len(gpus_in_use[node_name]) == 5 + assert set(gpus_in_use[node_name]) == set(range(5)) + + # Creating a new actor should fail because all of the GPUs are being + # used. + a = Actor2.remote() + ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10) + assert ready_ids == [] + + +def test_actor_different_numbers_of_gpus(shutdown_only): + # Test that we can create actors on two nodes that have different + # numbers of GPUs. + ray.worker._init( + start_ray_local=True, + num_workers=0, + num_local_schedulers=3, + num_cpus=[10, 10, 10], + num_gpus=[0, 5, 10]) + + @ray.remote(num_gpus=1) + class Actor1(object): + def __init__(self): + self.gpu_ids = ray.get_gpu_ids() + + def get_location_and_ids(self): + return (ray.worker.global_worker.plasma_client.store_socket_name, + tuple(self.gpu_ids)) + + # Create some actors. + actors = [Actor1.remote() for _ in range(0 + 5 + 10)] + # Make sure that no two actors are assigned to the same GPU. + locations_and_ids = ray.get( + [actor.get_location_and_ids.remote() for actor in actors]) + node_names = {location for location, gpu_id in locations_and_ids} + assert len(node_names) == 2 + for node_name in node_names: + node_gpu_ids = [ + gpu_id for location, gpu_id in locations_and_ids + if location == node_name ] - # Make sure that no two actors are assigned to the same GPU. - locations_and_ids = ray.get( - [actor.get_location_and_ids.remote() for actor in actors]) - node_names = {location for location, gpu_id in locations_and_ids} - assert len(node_names) == num_local_schedulers - location_actor_combinations = [] - for node_name in node_names: - for gpu_id in range(num_gpus_per_scheduler): - location_actor_combinations.append((node_name, (gpu_id, ))) - assert set(locations_and_ids) == set(location_actor_combinations) + assert len(node_gpu_ids) in [5, 10] + assert set(node_gpu_ids) == {(i, ) for i in range(len(node_gpu_ids))} - # Creating a new actor should fail because all of the GPUs are being - # used. - a = Actor1.remote() - ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10) - assert ready_ids == [] + # Creating a new actor should fail because all of the GPUs are being + # used. + a = Actor1.remote() + ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10) + assert ready_ids == [] - def testActorMultipleGPUs(self): - num_local_schedulers = 3 - num_gpus_per_scheduler = 5 - ray.worker._init( - start_ray_local=True, - num_workers=0, - num_local_schedulers=num_local_schedulers, - num_cpus=(num_local_schedulers * [10 * num_gpus_per_scheduler]), - num_gpus=(num_local_schedulers * [num_gpus_per_scheduler])) - @ray.remote(num_gpus=2) - class Actor1(object): - def __init__(self): - self.gpu_ids = ray.get_gpu_ids() - - def get_location_and_ids(self): - assert ray.get_gpu_ids() == self.gpu_ids - return ( - ray.worker.global_worker.plasma_client.store_socket_name, - tuple(self.gpu_ids)) - - # Create some actors. - actors1 = [Actor1.remote() for _ in range(num_local_schedulers * 2)] - # Make sure that no two actors are assigned to the same GPU. - locations_and_ids = ray.get( - [actor.get_location_and_ids.remote() for actor in actors1]) - node_names = {location for location, gpu_id in locations_and_ids} - assert len(node_names) == num_local_schedulers - - # Keep track of which GPU IDs are being used for each location. - gpus_in_use = {node_name: [] for node_name in node_names} - for location, gpu_ids in locations_and_ids: - gpus_in_use[location].extend(gpu_ids) - for node_name in node_names: - assert len(set(gpus_in_use[node_name])) == 4 - - # Creating a new actor should fail because all of the GPUs are being - # used. - a = Actor1.remote() - ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10) - assert ready_ids == [] - - # We should be able to create more actors that use only a single GPU. - @ray.remote(num_gpus=1) - class Actor2(object): - def __init__(self): - self.gpu_ids = ray.get_gpu_ids() - - def get_location_and_ids(self): - return ( - ray.worker.global_worker.plasma_client.store_socket_name, - tuple(self.gpu_ids)) - - # Create some actors. - actors2 = [Actor2.remote() for _ in range(num_local_schedulers)] - # Make sure that no two actors are assigned to the same GPU. - locations_and_ids = ray.get( - [actor.get_location_and_ids.remote() for actor in actors2]) - names = {location for location, gpu_id in locations_and_ids} - assert node_names == names - for location, gpu_ids in locations_and_ids: - gpus_in_use[location].extend(gpu_ids) - for node_name in node_names: - assert len(gpus_in_use[node_name]) == 5 - assert set(gpus_in_use[node_name]) == set(range(5)) - - # Creating a new actor should fail because all of the GPUs are being - # used. - a = Actor2.remote() - ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10) - assert ready_ids == [] - - def testActorDifferentNumbersOfGPUs(self): - # Test that we can create actors on two nodes that have different - # numbers of GPUs. - ray.worker._init( - start_ray_local=True, - num_workers=0, - num_local_schedulers=3, - num_cpus=[10, 10, 10], - num_gpus=[0, 5, 10]) - - @ray.remote(num_gpus=1) - class Actor1(object): - def __init__(self): - self.gpu_ids = ray.get_gpu_ids() - - def get_location_and_ids(self): - return ( - ray.worker.global_worker.plasma_client.store_socket_name, - tuple(self.gpu_ids)) - - # Create some actors. - actors = [Actor1.remote() for _ in range(0 + 5 + 10)] - # Make sure that no two actors are assigned to the same GPU. - locations_and_ids = ray.get( - [actor.get_location_and_ids.remote() for actor in actors]) - node_names = {location for location, gpu_id in locations_and_ids} - assert len(node_names) == 2 - for node_name in node_names: - node_gpu_ids = [ - gpu_id for location, gpu_id in locations_and_ids - if location == node_name - ] - assert len(node_gpu_ids) in [5, 10] - assert set(node_gpu_ids) == {(i, ) - for i in range(len(node_gpu_ids))} - - # Creating a new actor should fail because all of the GPUs are being - # used. - a = Actor1.remote() - ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10) - assert ready_ids == [] - - def testActorMultipleGPUsFromMultipleTasks(self): - num_local_schedulers = 10 - num_gpus_per_scheduler = 10 - ray.worker._init( - start_ray_local=True, - num_workers=0, - num_local_schedulers=num_local_schedulers, - redirect_output=True, - num_cpus=(num_local_schedulers * [10 * num_gpus_per_scheduler]), - num_gpus=(num_local_schedulers * [num_gpus_per_scheduler])) - - @ray.remote - def create_actors(n): - @ray.remote(num_gpus=1) - class Actor(object): - def __init__(self): - self.gpu_ids = ray.get_gpu_ids() - - def get_location_and_ids(self): - return ((ray.worker.global_worker.plasma_client. - store_socket_name), tuple(self.gpu_ids)) - - # Create n actors. - for _ in range(n): - Actor.remote() - - ray.get([ - create_actors.remote(num_gpus_per_scheduler) - for _ in range(num_local_schedulers) - ]) +def test_actor_multiple_gpus_from_multiple_tasks(shutdown_only): + num_local_schedulers = 10 + num_gpus_per_scheduler = 10 + ray.worker._init( + start_ray_local=True, + num_workers=0, + num_local_schedulers=num_local_schedulers, + redirect_output=True, + num_cpus=(num_local_schedulers * [10 * num_gpus_per_scheduler]), + num_gpus=(num_local_schedulers * [num_gpus_per_scheduler])) + @ray.remote + def create_actors(n): @ray.remote(num_gpus=1) class Actor(object): def __init__(self): self.gpu_ids = ray.get_gpu_ids() def get_location_and_ids(self): - return ( - ray.worker.global_worker.plasma_client.store_socket_name, + return (( + ray.worker.global_worker.plasma_client.store_socket_name), + tuple(self.gpu_ids)) + + # Create n actors. + for _ in range(n): + Actor.remote() + + ray.get([ + create_actors.remote(num_gpus_per_scheduler) + for _ in range(num_local_schedulers) + ]) + + @ray.remote(num_gpus=1) + class Actor(object): + def __init__(self): + self.gpu_ids = ray.get_gpu_ids() + + def get_location_and_ids(self): + return (ray.worker.global_worker.plasma_client.store_socket_name, tuple(self.gpu_ids)) - # All the GPUs should be used up now. + # All the GPUs should be used up now. + a = Actor.remote() + ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10) + assert ready_ids == [] + + +@pytest.mark.skipif( + sys.version_info < (3, 0), reason="This test requires Python 3.") +def test_actors_and_tasks_with_gpus(shutdown_only): + num_local_schedulers = 3 + num_gpus_per_scheduler = 6 + ray.worker._init( + start_ray_local=True, + num_workers=0, + num_local_schedulers=num_local_schedulers, + num_cpus=num_gpus_per_scheduler, + num_gpus=(num_local_schedulers * [num_gpus_per_scheduler])) + + def check_intervals_non_overlapping(list_of_intervals): + for i in range(len(list_of_intervals)): + for j in range(i): + first_interval = list_of_intervals[i] + second_interval = list_of_intervals[j] + # Check that list_of_intervals[i] and list_of_intervals[j] + # don't overlap. + assert first_interval[0] < first_interval[1] + assert second_interval[0] < second_interval[1] + intervals_nonoverlapping = ( + first_interval[1] <= second_interval[0] + or second_interval[1] <= first_interval[0]) + assert intervals_nonoverlapping, ( + "Intervals {} and {} are overlapping.".format( + first_interval, second_interval)) + + @ray.remote(num_gpus=1) + def f1(): + t1 = time.monotonic() + time.sleep(0.1) + t2 = time.monotonic() + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 1 + assert gpu_ids[0] in range(num_gpus_per_scheduler) + return (ray.worker.global_worker.plasma_client.store_socket_name, + tuple(gpu_ids), [t1, t2]) + + @ray.remote(num_gpus=2) + def f2(): + t1 = time.monotonic() + time.sleep(0.1) + t2 = time.monotonic() + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 2 + assert gpu_ids[0] in range(num_gpus_per_scheduler) + assert gpu_ids[1] in range(num_gpus_per_scheduler) + return (ray.worker.global_worker.plasma_client.store_socket_name, + tuple(gpu_ids), [t1, t2]) + + @ray.remote(num_gpus=1) + class Actor1(object): + def __init__(self): + self.gpu_ids = ray.get_gpu_ids() + assert len(self.gpu_ids) == 1 + assert self.gpu_ids[0] in range(num_gpus_per_scheduler) + + def get_location_and_ids(self): + assert ray.get_gpu_ids() == self.gpu_ids + return (ray.worker.global_worker.plasma_client.store_socket_name, + tuple(self.gpu_ids)) + + def locations_to_intervals_for_many_tasks(): + # Launch a bunch of GPU tasks. + locations_ids_and_intervals = ray.get([ + f1.remote() + for _ in range(5 * num_local_schedulers * num_gpus_per_scheduler) + ] + [ + f2.remote() + for _ in range(5 * num_local_schedulers * num_gpus_per_scheduler) + ] + [ + f1.remote() + for _ in range(5 * num_local_schedulers * num_gpus_per_scheduler) + ]) + + locations_to_intervals = collections.defaultdict(lambda: []) + for location, gpu_ids, interval in locations_ids_and_intervals: + for gpu_id in gpu_ids: + locations_to_intervals[(location, gpu_id)].append(interval) + return locations_to_intervals + + # Run a bunch of GPU tasks. + locations_to_intervals = locations_to_intervals_for_many_tasks() + # Make sure that all GPUs were used. + assert (len(locations_to_intervals) == num_local_schedulers * + num_gpus_per_scheduler) + # For each GPU, verify that the set of tasks that used this specific + # GPU did not overlap in time. + for locations in locations_to_intervals: + check_intervals_non_overlapping(locations_to_intervals[locations]) + + # Create an actor that uses a GPU. + a = Actor1.remote() + actor_location = ray.get(a.get_location_and_ids.remote()) + actor_location = (actor_location[0], actor_location[1][0]) + # This check makes sure that actor_location is formatted the same way + # that the keys of locations_to_intervals are formatted. + assert actor_location in locations_to_intervals + + # Run a bunch of GPU tasks. + locations_to_intervals = locations_to_intervals_for_many_tasks() + # Make sure that all but one of the GPUs were used. + assert (len(locations_to_intervals) == + num_local_schedulers * num_gpus_per_scheduler - 1) + # For each GPU, verify that the set of tasks that used this specific + # GPU did not overlap in time. + for locations in locations_to_intervals: + check_intervals_non_overlapping(locations_to_intervals[locations]) + # Make sure that the actor's GPU was not used. + assert actor_location not in locations_to_intervals + + # Create several more actors that use GPUs. + actors = [Actor1.remote() for _ in range(3)] + actor_locations = ray.get( + [actor.get_location_and_ids.remote() for actor in actors]) + + # Run a bunch of GPU tasks. + locations_to_intervals = locations_to_intervals_for_many_tasks() + # Make sure that all but 11 of the GPUs were used. + assert (len(locations_to_intervals) == + num_local_schedulers * num_gpus_per_scheduler - 1 - 3) + # For each GPU, verify that the set of tasks that used this specific + # GPU did not overlap in time. + for locations in locations_to_intervals: + check_intervals_non_overlapping(locations_to_intervals[locations]) + # Make sure that the GPUs were not used. + assert actor_location not in locations_to_intervals + for location in actor_locations: + assert location not in locations_to_intervals + + # Create more actors to fill up all the GPUs. + more_actors = [ + Actor1.remote() + for _ in range(num_local_schedulers * num_gpus_per_scheduler - 1 - 3) + ] + # Wait for the actors to finish being created. + ray.get([actor.get_location_and_ids.remote() for actor in more_actors]) + + # Now if we run some GPU tasks, they should not be scheduled. + results = [f1.remote() for _ in range(30)] + ready_ids, remaining_ids = ray.wait(results, timeout=1000) + assert len(ready_ids) == 0 + + +def test_actors_and_tasks_with_gpus_version_two(shutdown_only): + # Create tasks and actors that both use GPUs and make sure that they + # are given different GPUs + ray.init(num_cpus=10, num_gpus=10) + + @ray.remote(num_gpus=1) + def f(): + time.sleep(4) + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 1 + return gpu_ids[0] + + @ray.remote(num_gpus=1) + class Actor(object): + def __init__(self): + self.gpu_ids = ray.get_gpu_ids() + assert len(self.gpu_ids) == 1 + + def get_gpu_id(self): + assert ray.get_gpu_ids() == self.gpu_ids + return self.gpu_ids[0] + + results = [] + actors = [] + for _ in range(5): + results.append(f.remote()) a = Actor.remote() - ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10) - assert ready_ids == [] + results.append(a.get_gpu_id.remote()) + # Prevent the actor handle from going out of scope so that its GPU + # resources don't get released. + actors.append(a) - @unittest.skipIf(sys.version_info < (3, 0), "This test requires Python 3.") - def testActorsAndTasksWithGPUs(self): - num_local_schedulers = 3 - num_gpus_per_scheduler = 6 - ray.worker._init( - start_ray_local=True, - num_workers=0, - num_local_schedulers=num_local_schedulers, - num_cpus=num_gpus_per_scheduler, - num_gpus=(num_local_schedulers * [num_gpus_per_scheduler])) + gpu_ids = ray.get(results) + assert set(gpu_ids) == set(range(10)) - def check_intervals_non_overlapping(list_of_intervals): - for i in range(len(list_of_intervals)): - for j in range(i): - first_interval = list_of_intervals[i] - second_interval = list_of_intervals[j] - # Check that list_of_intervals[i] and list_of_intervals[j] - # don't overlap. - assert first_interval[0] < first_interval[1] - assert second_interval[0] < second_interval[1] - intervals_nonoverlapping = ( - first_interval[1] <= second_interval[0] - or second_interval[1] <= first_interval[0]) - assert intervals_nonoverlapping, ( - "Intervals {} and {} are overlapping.".format( - first_interval, second_interval)) - @ray.remote(num_gpus=1) - def f1(): - t1 = time.monotonic() +@pytest.mark.skipif( + sys.version_info < (3, 0), reason="This test requires Python 3.") +def test_actors_and_task_resource_bookkeeping(ray_start_regular): + @ray.remote + class Foo(object): + def __init__(self): + start = time.monotonic() time.sleep(0.1) - t2 = time.monotonic() - gpu_ids = ray.get_gpu_ids() - assert len(gpu_ids) == 1 - assert gpu_ids[0] in range(num_gpus_per_scheduler) - return (ray.worker.global_worker.plasma_client.store_socket_name, - tuple(gpu_ids), [t1, t2]) - - @ray.remote(num_gpus=2) - def f2(): - t1 = time.monotonic() - time.sleep(0.1) - t2 = time.monotonic() - gpu_ids = ray.get_gpu_ids() - assert len(gpu_ids) == 2 - assert gpu_ids[0] in range(num_gpus_per_scheduler) - assert gpu_ids[1] in range(num_gpus_per_scheduler) - return (ray.worker.global_worker.plasma_client.store_socket_name, - tuple(gpu_ids), [t1, t2]) - - @ray.remote(num_gpus=1) - class Actor1(object): - def __init__(self): - self.gpu_ids = ray.get_gpu_ids() - assert len(self.gpu_ids) == 1 - assert self.gpu_ids[0] in range(num_gpus_per_scheduler) - - def get_location_and_ids(self): - assert ray.get_gpu_ids() == self.gpu_ids - return ( - ray.worker.global_worker.plasma_client.store_socket_name, - tuple(self.gpu_ids)) - - def locations_to_intervals_for_many_tasks(): - # Launch a bunch of GPU tasks. - locations_ids_and_intervals = ray.get([ - f1.remote() for _ in range(5 * num_local_schedulers * - num_gpus_per_scheduler) - ] + [ - f2.remote() for _ in range(5 * num_local_schedulers * - num_gpus_per_scheduler) - ] + [ - f1.remote() for _ in range(5 * num_local_schedulers * - num_gpus_per_scheduler) - ]) - - locations_to_intervals = collections.defaultdict(lambda: []) - for location, gpu_ids, interval in locations_ids_and_intervals: - for gpu_id in gpu_ids: - locations_to_intervals[(location, gpu_id)].append(interval) - return locations_to_intervals - - # Run a bunch of GPU tasks. - locations_to_intervals = locations_to_intervals_for_many_tasks() - # Make sure that all GPUs were used. - assert (len(locations_to_intervals) == num_local_schedulers * - num_gpus_per_scheduler) - # For each GPU, verify that the set of tasks that used this specific - # GPU did not overlap in time. - for locations in locations_to_intervals: - check_intervals_non_overlapping(locations_to_intervals[locations]) - - # Create an actor that uses a GPU. - a = Actor1.remote() - actor_location = ray.get(a.get_location_and_ids.remote()) - actor_location = (actor_location[0], actor_location[1][0]) - # This check makes sure that actor_location is formatted the same way - # that the keys of locations_to_intervals are formatted. - assert actor_location in locations_to_intervals - - # Run a bunch of GPU tasks. - locations_to_intervals = locations_to_intervals_for_many_tasks() - # Make sure that all but one of the GPUs were used. - assert (len(locations_to_intervals) == - num_local_schedulers * num_gpus_per_scheduler - 1) - # For each GPU, verify that the set of tasks that used this specific - # GPU did not overlap in time. - for locations in locations_to_intervals: - check_intervals_non_overlapping(locations_to_intervals[locations]) - # Make sure that the actor's GPU was not used. - assert actor_location not in locations_to_intervals - - # Create several more actors that use GPUs. - actors = [Actor1.remote() for _ in range(3)] - actor_locations = ray.get( - [actor.get_location_and_ids.remote() for actor in actors]) - - # Run a bunch of GPU tasks. - locations_to_intervals = locations_to_intervals_for_many_tasks() - # Make sure that all but 11 of the GPUs were used. - assert (len(locations_to_intervals) == - num_local_schedulers * num_gpus_per_scheduler - 1 - 3) - # For each GPU, verify that the set of tasks that used this specific - # GPU did not overlap in time. - for locations in locations_to_intervals: - check_intervals_non_overlapping(locations_to_intervals[locations]) - # Make sure that the GPUs were not used. - assert actor_location not in locations_to_intervals - for location in actor_locations: - assert location not in locations_to_intervals - - # Create more actors to fill up all the GPUs. - more_actors = [ - Actor1.remote() - for _ in range(num_local_schedulers * num_gpus_per_scheduler - 1 - - 3) - ] - # Wait for the actors to finish being created. - ray.get([actor.get_location_and_ids.remote() for actor in more_actors]) - - # Now if we run some GPU tasks, they should not be scheduled. - results = [f1.remote() for _ in range(30)] - ready_ids, remaining_ids = ray.wait(results, timeout=1000) - assert len(ready_ids) == 0 - - def testActorsAndTasksWithGPUsVersionTwo(self): - # Create tasks and actors that both use GPUs and make sure that they - # are given different GPUs - ray.init(num_cpus=10, num_gpus=10) - - @ray.remote(num_gpus=1) - def f(): - time.sleep(4) - gpu_ids = ray.get_gpu_ids() - assert len(gpu_ids) == 1 - return gpu_ids[0] - - @ray.remote(num_gpus=1) - class Actor(object): - def __init__(self): - self.gpu_ids = ray.get_gpu_ids() - assert len(self.gpu_ids) == 1 - - def get_gpu_id(self): - assert ray.get_gpu_ids() == self.gpu_ids - return self.gpu_ids[0] - - results = [] - actors = [] - for _ in range(5): - results.append(f.remote()) - a = Actor.remote() - results.append(a.get_gpu_id.remote()) - # Prevent the actor handle from going out of scope so that its GPU - # resources don't get released. - actors.append(a) - - gpu_ids = ray.get(results) - assert set(gpu_ids) == set(range(10)) - - @unittest.skipIf(sys.version_info < (3, 0), "This test requires Python 3.") - def testActorsAndTaskResourceBookkeeping(self): - ray.init(num_cpus=1) - - @ray.remote - class Foo(object): - def __init__(self): - start = time.monotonic() - time.sleep(0.1) - end = time.monotonic() - self.interval = (start, end) - - def get_interval(self): - return self.interval - - def sleep(self): - start = time.monotonic() - time.sleep(0.01) - end = time.monotonic() - return start, end - - # First make sure that we do not have more actor methods running at a - # time than we have CPUs. - actors = [Foo.remote() for _ in range(4)] - interval_ids = [] - interval_ids += [actor.get_interval.remote() for actor in actors] - for _ in range(4): - interval_ids += [actor.sleep.remote() for actor in actors] - - # Make sure that the intervals don't overlap. - intervals = ray.get(interval_ids) - intervals.sort(key=lambda x: x[0]) - for interval1, interval2 in zip(intervals[:-1], intervals[1:]): - assert interval1[0] < interval1[1] - assert interval1[1] < interval2[0] - assert interval2[0] < interval2[1] - - def testBlockingActorTask(self): - ray.init(num_cpus=1, num_gpus=1) - - @ray.remote(num_gpus=1) - def f(): - return 1 - - @ray.remote - class Foo(object): - def __init__(self): - pass - - def blocking_method(self): - ray.get(f.remote()) - - # Make sure we can execute a blocking actor method even if there is - # only one CPU. - actor = Foo.remote() - ray.get(actor.blocking_method.remote()) - - @ray.remote(num_cpus=1) - class CPUFoo(object): - def __init__(self): - pass - - def blocking_method(self): - ray.get(f.remote()) - - # Make sure that lifetime CPU resources are not released when actors - # block. - actor = CPUFoo.remote() - x_id = actor.blocking_method.remote() - ready_ids, remaining_ids = ray.wait([x_id], timeout=1000) - assert ready_ids == [] - assert remaining_ids == [x_id] - - @ray.remote(num_gpus=1) - class GPUFoo(object): - def __init__(self): - pass - - def blocking_method(self): - ray.get(f.remote()) - - # Make sure that GPU resources are not released when actors block. - actor = GPUFoo.remote() - x_id = actor.blocking_method.remote() - ready_ids, remaining_ids = ray.wait([x_id], timeout=1000) - assert ready_ids == [] - assert remaining_ids == [x_id] - - -@unittest.skipIf( - os.environ.get("RAY_USE_XRAY") != "1", "This test only works with xray.") -class ActorExceptionFailures(unittest.TestCase): - def tearDown(self): - ray.shutdown() - - def testExceptionRaisedWhenActorNodeDies(self): - ray.worker._init( - start_ray_local=True, num_local_schedulers=2, num_cpus=1) - - @ray.remote - class Counter(object): - def __init__(self): - self.x = 0 - - def local_plasma(self): - return ray.worker.global_worker.plasma_client.store_socket_name - - def inc(self): - self.x += 1 - return self.x - - local_plasma = ray.worker.global_worker.plasma_client.store_socket_name - - # Create an actor that is not on the local scheduler. - actor = Counter.remote() - while ray.get(actor.local_plasma.remote()) == local_plasma: - actor = Counter.remote() - - # Kill the second plasma store to get rid of the cached objects and - # trigger the corresponding local scheduler to exit. - process = ray.services.all_processes[ - ray.services.PROCESS_TYPE_PLASMA_STORE][1] - process.kill() - - # Submit some new actor tasks. - x_ids = [actor.inc.remote() for _ in range(100)] - - # Make sure that getting the result raises an exception. - for _ in range(10): - for x_id in x_ids: - with pytest.raises(ray.worker.RayGetError): - # There is some small chance that ray.get will actually - # succeed (if the object is transferred before the raylet - # dies). - ray.get(x_id) - - # Make sure the process has exited. - process.wait() - - -@unittest.skipIf( - os.environ.get("RAY_USE_XRAY") == "1", - "This test does not work with xray yet.") -class ActorReconstruction(unittest.TestCase): - def tearDown(self): - ray.shutdown() - - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.") - def testLocalSchedulerDying(self): - ray.worker._init( - start_ray_local=True, - num_local_schedulers=2, - num_workers=0, - redirect_output=True) - - @ray.remote - class Counter(object): - def __init__(self): - self.x = 0 - - def local_plasma(self): - return ray.worker.global_worker.plasma_client.store_socket_name - - def inc(self): - self.x += 1 - return self.x - - local_plasma = ray.worker.global_worker.plasma_client.store_socket_name - - # Create an actor that is not on the local scheduler. - actor = Counter.remote() - while ray.get(actor.local_plasma.remote()) == local_plasma: - actor = Counter.remote() - - ids = [actor.inc.remote() for _ in range(100)] - - # Wait for the last task to finish running. - ray.get(ids[-1]) - - # Kill the second plasma store to get rid of the cached objects and - # trigger the corresponding local scheduler to exit. - process = ray.services.all_processes[ - ray.services.PROCESS_TYPE_PLASMA_STORE][1] - process.kill() - process.wait() - - # Get all of the results - results = ray.get(ids) - - assert results == list(range(1, 1 + len(results))) - - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.") - def testManyLocalSchedulersDying(self): - # This test can be made more stressful by increasing the numbers below. - # The total number of actors created will be - # num_actors_at_a_time * num_local_schedulers. - num_local_schedulers = 5 - num_actors_at_a_time = 3 - num_function_calls_at_a_time = 10 - - ray.worker._init( - start_ray_local=True, - num_local_schedulers=num_local_schedulers, - num_workers=0, - redirect_output=True) - - @ray.remote - class SlowCounter(object): - def __init__(self): - self.x = 0 - - def inc(self, duration): - time.sleep(duration) - self.x += 1 - return self.x - - # Create some initial actors. - actors = [SlowCounter.remote() for _ in range(num_actors_at_a_time)] - - # Wait for the actors to start up. - time.sleep(1) - - # This is a mapping from actor handles to object IDs returned by - # methods on that actor. - result_ids = collections.defaultdict(lambda: []) - - # In a loop we are going to create some actors, run some methods, kill - # a local scheduler, and run some more methods. - for i in range(num_local_schedulers - 1): - # Create some actors. - actors.extend( - [SlowCounter.remote() for _ in range(num_actors_at_a_time)]) - # Run some methods. - for j in range(len(actors)): - actor = actors[j] - for _ in range(num_function_calls_at_a_time): - result_ids[actor].append(actor.inc.remote(j**2 * 0.000001)) - # Kill a plasma store to get rid of the cached objects and trigger - # exit of the corresponding local scheduler. Don't kill the first - # local scheduler since that is the one that the driver is - # connected to. - process = ray.services.all_processes[ - ray.services.PROCESS_TYPE_PLASMA_STORE][i + 1] - process.kill() - process.wait() - - # Run some more methods. - for j in range(len(actors)): - actor = actors[j] - for _ in range(num_function_calls_at_a_time): - result_ids[actor].append(actor.inc.remote(j**2 * 0.000001)) - - # Get the results and check that they have the correct values. - for _, result_id_list in result_ids.items(): - results = list(range(1, len(result_id_list) + 1)) - assert ray.get(result_id_list) == results - - def setup_counter_actor(self, - test_checkpoint=False, - save_exception=False, - resume_exception=False): - ray.worker._init( - start_ray_local=True, - num_local_schedulers=2, - num_workers=0, - redirect_output=True) - - # Only set the checkpoint interval if we're testing with checkpointing. - checkpoint_interval = -1 - if test_checkpoint: - checkpoint_interval = 5 - - @ray.remote(checkpoint_interval=checkpoint_interval) - class Counter(object): - _resume_exception = resume_exception - - def __init__(self, save_exception): - self.x = 0 - self.num_inc_calls = 0 - self.save_exception = save_exception - self.restored = False - - def local_plasma(self): - return ray.worker.global_worker.plasma_client.store_socket_name - - def inc(self, *xs): - self.x += 1 - self.num_inc_calls += 1 - return self.x - - def get_num_inc_calls(self): - return self.num_inc_calls - - def test_restore(self): - # This method will only return True if __ray_restore__ has been - # called. - return self.restored - - def __ray_save__(self): - if self.save_exception: - raise Exception("Exception raised in checkpoint save") - return self.x, self.save_exception - - def __ray_restore__(self, checkpoint): - if self._resume_exception: - raise Exception("Exception raised in checkpoint resume") - self.x, self.save_exception = checkpoint - self.num_inc_calls = 0 - self.restored = True - - local_plasma = ray.worker.global_worker.plasma_client.store_socket_name - - # Create an actor that is not on the local scheduler. - actor = Counter.remote(save_exception) - while ray.get(actor.local_plasma.remote()) == local_plasma: - actor = Counter.remote(save_exception) - - args = [ray.put(0) for _ in range(100)] - ids = [actor.inc.remote(*args[i:]) for i in range(100)] - - return actor, ids - - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.") - def testCheckpointing(self): - actor, ids = self.setup_counter_actor(test_checkpoint=True) - # Wait for the last task to finish running. - ray.get(ids[-1]) - - # Kill the corresponding plasma store to get rid of the cached objects. - process = ray.services.all_processes[ - ray.services.PROCESS_TYPE_PLASMA_STORE][1] - process.kill() - process.wait() - - # Check that the actor restored from a checkpoint. - assert ray.get(actor.test_restore.remote()) - # Check that we can submit another call on the actor and get the - # correct counter result. - x = ray.get(actor.inc.remote()) - assert x == 101 - # Check that the number of inc calls since actor initialization is less - # than the counter value, since the actor initialized from a - # checkpoint. - num_inc_calls = ray.get(actor.get_num_inc_calls.remote()) - assert num_inc_calls < x - - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.") - def testRemoteCheckpoint(self): - actor, ids = self.setup_counter_actor(test_checkpoint=True) - - # Do a remote checkpoint call and wait for it to finish. - ray.get(actor.__ray_checkpoint__.remote()) - - # Kill the corresponding plasma store to get rid of the cached objects. - process = ray.services.all_processes[ - ray.services.PROCESS_TYPE_PLASMA_STORE][1] - process.kill() - process.wait() - - # Check that the actor restored from a checkpoint. - assert ray.get(actor.test_restore.remote()) - # Check that the number of inc calls since actor initialization is - # exactly zero, since there could not have been another inc call since - # the remote checkpoint. - num_inc_calls = ray.get(actor.get_num_inc_calls.remote()) - assert num_inc_calls == 0 - # Check that we can submit another call on the actor and get the - # correct counter result. - x = ray.get(actor.inc.remote()) - assert x == 101 - - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.") - def testLostCheckpoint(self): - actor, ids = self.setup_counter_actor(test_checkpoint=True) - # Wait for the first fraction of tasks to finish running. - ray.get(ids[len(ids) // 10]) - - # Kill the corresponding plasma store to get rid of the cached objects. - process = ray.services.all_processes[ - ray.services.PROCESS_TYPE_PLASMA_STORE][1] - process.kill() - process.wait() - - # Check that the actor restored from a checkpoint. - assert ray.get(actor.test_restore.remote()) - # Check that we can submit another call on the actor and get the - # correct counter result. - x = ray.get(actor.inc.remote()) - assert x == 101 - # Check that the number of inc calls since actor initialization is less - # than the counter value, since the actor initialized from a - # checkpoint. - num_inc_calls = ray.get(actor.get_num_inc_calls.remote()) - assert num_inc_calls < x - assert 5 < num_inc_calls - - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.") - def testCheckpointException(self): - actor, ids = self.setup_counter_actor( - test_checkpoint=True, save_exception=True) - # Wait for the last task to finish running. - ray.get(ids[-1]) - - # Kill the corresponding plasma store to get rid of the cached objects. - process = ray.services.all_processes[ - ray.services.PROCESS_TYPE_PLASMA_STORE][1] - process.kill() - process.wait() - - # Check that we can submit another call on the actor and get the - # correct counter result. - x = ray.get(actor.inc.remote()) - assert x == 101 - # Check that the number of inc calls since actor initialization is - # equal to the counter value, since the actor did not initialize from a - # checkpoint. - num_inc_calls = ray.get(actor.get_num_inc_calls.remote()) - assert num_inc_calls == x - # Check that errors were raised when trying to save the checkpoint. - errors = ray.error_info() - assert 0 < len(errors) - for error in errors: - assert error["type"] == ray_constants.CHECKPOINT_PUSH_ERROR - - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.") - def testCheckpointResumeException(self): - actor, ids = self.setup_counter_actor( - test_checkpoint=True, resume_exception=True) - # Wait for the last task to finish running. - ray.get(ids[-1]) - - # Kill the corresponding plasma store to get rid of the cached objects. - process = ray.services.all_processes[ - ray.services.PROCESS_TYPE_PLASMA_STORE][1] - process.kill() - process.wait() - - # Check that we can submit another call on the actor and get the - # correct counter result. - x = ray.get(actor.inc.remote()) - assert x == 101 - # Check that the number of inc calls since actor initialization is - # equal to the counter value, since the actor did not initialize from a - # checkpoint. - num_inc_calls = ray.get(actor.get_num_inc_calls.remote()) - assert num_inc_calls == x - # Check that an error was raised when trying to resume from the - # checkpoint. - errors = ray.error_info() - assert len(errors) == 1 - for error in errors: - assert error["type"] == ray_constants.CHECKPOINT_PUSH_ERROR - - @unittest.skip("Fork/join consistency not yet implemented.") - def testDistributedHandle(self): - counter, ids = self.setup_counter_actor(test_checkpoint=False) - - @ray.remote - def fork_many_incs(counter, num_incs): - x = None - for _ in range(num_incs): - x = counter.inc.remote() - # Only call ray.get() on the last task submitted. - return ray.get(x) - - # Fork num_iters times. - count = ray.get(ids[-1]) - num_incs = 100 - num_iters = 10 - forks = [ - fork_many_incs.remote(counter, num_incs) for _ in range(num_iters) - ] - ray.wait(forks, num_returns=len(forks)) - count += num_incs * num_iters - - # Kill the second plasma store to get rid of the cached objects and - # trigger the corresponding local scheduler to exit. - process = ray.services.all_processes[ - ray.services.PROCESS_TYPE_PLASMA_STORE][1] - process.kill() - process.wait() - - # Check that the actor did not restore from a checkpoint. - assert not ray.get(counter.test_restore.remote()) - # Check that we can submit another call on the actor and get the - # correct counter result. - x = ray.get(counter.inc.remote()) - assert x == count + 1 - - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.") - def testRemoteCheckpointDistributedHandle(self): - counter, ids = self.setup_counter_actor(test_checkpoint=True) - - @ray.remote - def fork_many_incs(counter, num_incs): - x = None - for _ in range(num_incs): - x = counter.inc.remote() - # Only call ray.get() on the last task submitted. - return ray.get(x) - - # Fork num_iters times. - count = ray.get(ids[-1]) - num_incs = 100 - num_iters = 10 - forks = [ - fork_many_incs.remote(counter, num_incs) for _ in range(num_iters) - ] - ray.wait(forks, num_returns=len(forks)) - ray.wait([counter.__ray_checkpoint__.remote()]) - count += num_incs * num_iters - - # Kill the second plasma store to get rid of the cached objects and - # trigger the corresponding local scheduler to exit. - process = ray.services.all_processes[ - ray.services.PROCESS_TYPE_PLASMA_STORE][1] - process.kill() - process.wait() - - # Check that the actor restored from a checkpoint. - assert ray.get(counter.test_restore.remote()) - # Check that the number of inc calls since actor initialization is - # exactly zero, since there could not have been another inc call since - # the remote checkpoint. - num_inc_calls = ray.get(counter.get_num_inc_calls.remote()) - assert num_inc_calls == 0 - # Check that we can submit another call on the actor and get the - # correct counter result. - x = ray.get(counter.inc.remote()) - assert x == count + 1 - - @unittest.skip("Fork/join consistency not yet implemented.") - def testCheckpointDistributedHandle(self): - counter, ids = self.setup_counter_actor(test_checkpoint=True) - - @ray.remote - def fork_many_incs(counter, num_incs): - x = None - for _ in range(num_incs): - x = counter.inc.remote() - # Only call ray.get() on the last task submitted. - return ray.get(x) - - # Fork num_iters times. - count = ray.get(ids[-1]) - num_incs = 100 - num_iters = 10 - forks = [ - fork_many_incs.remote(counter, num_incs) for _ in range(num_iters) - ] - ray.wait(forks, num_returns=len(forks)) - count += num_incs * num_iters - - # Kill the second plasma store to get rid of the cached objects and - # trigger the corresponding local scheduler to exit. - process = ray.services.all_processes[ - ray.services.PROCESS_TYPE_PLASMA_STORE][1] - process.kill() - process.wait() - - # Check that the actor restored from a checkpoint. - assert ray.get(counter.test_restore.remote()) - # Check that we can submit another call on the actor and get the - # correct counter result. - x = ray.get(counter.inc.remote()) - assert x == count + 1 - - def _testNondeterministicReconstruction( - self, num_forks, num_items_per_fork, num_forks_to_wait): - ray.worker._init( - start_ray_local=True, - num_local_schedulers=2, - num_workers=0, - redirect_output=True) - - # Make a shared queue. - @ray.remote - class Queue(object): - def __init__(self): - self.queue = [] - - def local_plasma(self): - return ray.worker.global_worker.plasma_client.store_socket_name - - def push(self, item): - self.queue.append(item) - - def read(self): - return self.queue - - # Schedule the shared queue onto the remote local scheduler. - local_plasma = ray.worker.global_worker.plasma_client.store_socket_name - actor = Queue.remote() - while ray.get(actor.local_plasma.remote()) == local_plasma: - actor = Queue.remote() - - # A task that takes in the shared queue and a list of items to enqueue, - # one by one. - @ray.remote - def enqueue(queue, items): - done = None - for item in items: - done = queue.push.remote(item) - # TODO(swang): Return the object ID returned by the last method - # called on the shared queue, so that the caller of enqueue can - # wait for all of the queue methods to complete. This can be - # removed once join consistency is implemented. - return [done] - - # Call the enqueue task num_forks times, each with num_items_per_fork - # unique objects to push onto the shared queue. - enqueue_tasks = [] - for fork in range(num_forks): - enqueue_tasks.append( - enqueue.remote(actor, - [(fork, i) for i in range(num_items_per_fork)])) - # Wait for the forks to complete their tasks. - enqueue_tasks = ray.get(enqueue_tasks) - enqueue_tasks = [fork_ids[0] for fork_ids in enqueue_tasks] - ray.wait(enqueue_tasks, num_returns=num_forks_to_wait) - - # Read the queue to get the initial order of execution. - queue = ray.get(actor.read.remote()) - - # Kill the second plasma store to get rid of the cached objects and - # trigger the corresponding local scheduler to exit. - process = ray.services.all_processes[ - ray.services.PROCESS_TYPE_PLASMA_STORE][1] - process.kill() - process.wait() - - # Read the queue again and check for deterministic reconstruction. - ray.get(enqueue_tasks) - reconstructed_queue = ray.get(actor.read.remote()) - # Make sure the final queue has all items from all forks. - assert len(reconstructed_queue) == num_forks * num_items_per_fork - # Make sure that the prefix of the final queue matches the queue from - # the initial execution. - assert queue == reconstructed_queue[:len(queue)] - - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), - "Currently doesn't work with the new GCS.") - def testNondeterministicReconstruction(self): - self._testNondeterministicReconstruction(10, 100, 10) - - @unittest.skip("Nondeterministic reconstruction currently not supported " - "when there are concurrent forks that didn't finish " - "initial execution.") - def testNondeterministicReconstructionConcurrentForks(self): - self._testNondeterministicReconstruction(10, 100, 1) - - -class DistributedActorHandles(unittest.TestCase): - def tearDown(self): - ray.shutdown() - - def setup_queue_actor(self): - ray.init() - - @ray.remote - class Queue(object): - def __init__(self): - self.queue = [] - - def enqueue(self, key, item): - self.queue.append((key, item)) - - def read(self): - return self.queue - - return Queue.remote() - - @unittest.skipIf( - os.environ.get("RAY_USE_XRAY") == "1", - "This test does not work with xray yet.") - def testFork(self): - queue = self.setup_queue_actor() - - @ray.remote - def fork(queue, key, item): - return ray.get(queue.enqueue.remote(key, item)) - - # Fork num_iters times. - num_iters = 100 - ray.get([fork.remote(queue, i, 0) for i in range(num_iters)]) - items = ray.get(queue.read.remote()) - for i in range(num_iters): - filtered_items = [item[1] for item in items if item[0] == i] - assert filtered_items == list(range(1)) - - @unittest.skipIf( - os.environ.get("RAY_USE_XRAY") == "1", - "This test does not work with xray yet.") - def testForkConsistency(self): - queue = self.setup_queue_actor() - - @ray.remote - def fork(queue, key, num_items): - x = None - for item in range(num_items): - x = queue.enqueue.remote(key, item) - return ray.get(x) - - # Fork num_iters times. - num_forks = 10 - num_items_per_fork = 100 - ray.get([ - fork.remote(queue, i, num_items_per_fork) for i in range(num_forks) - ]) - items = ray.get(queue.read.remote()) - for i in range(num_forks): - filtered_items = [item[1] for item in items if item[0] == i] - assert filtered_items == list(range(num_items_per_fork)) - - @unittest.skip("Garbage collection for distributed actor handles not " - "implemented.") - def testGarbageCollection(self): - queue = self.setup_queue_actor() - - @ray.remote - def fork(queue): - for i in range(10): - x = queue.enqueue.remote(0, i) - time.sleep(0.1) - return ray.get(x) - - x = fork.remote(queue) - ray.get(queue.read.remote()) - del queue - - print(ray.get(x)) - - def testCallingPutOnActorHandle(self): - ray.worker.init(num_workers=1) - - @ray.remote - class Counter(object): - def __init__(self): - self.x = 0 - - def inc(self): - self.x += 1 - return self.x - - @ray.remote - def f(): - return Counter.remote() - - @ray.remote - def g(): - return [Counter.remote()] - - # Currently, calling ray.put on an actor handle is allowed, but is - # there a good use case? - counter = Counter.remote() - counter_id = ray.put(counter) - new_counter = ray.get(counter_id) - assert ray.get(new_counter.inc.remote()) == 1 - assert ray.get(counter.inc.remote()) == 2 - assert ray.get(new_counter.inc.remote()) == 3 - - with pytest.raises(Exception): + end = time.monotonic() + self.interval = (start, end) + + def get_interval(self): + return self.interval + + def sleep(self): + start = time.monotonic() + time.sleep(0.01) + end = time.monotonic() + return start, end + + # First make sure that we do not have more actor methods running at a + # time than we have CPUs. + actors = [Foo.remote() for _ in range(4)] + interval_ids = [] + interval_ids += [actor.get_interval.remote() for actor in actors] + for _ in range(4): + interval_ids += [actor.sleep.remote() for actor in actors] + + # Make sure that the intervals don't overlap. + intervals = ray.get(interval_ids) + intervals.sort(key=lambda x: x[0]) + for interval1, interval2 in zip(intervals[:-1], intervals[1:]): + assert interval1[0] < interval1[1] + assert interval1[1] < interval2[0] + assert interval2[0] < interval2[1] + + +def test_blocking_actor_task(shutdown_only): + ray.init(num_cpus=1, num_gpus=1) + + @ray.remote(num_gpus=1) + def f(): + return 1 + + @ray.remote + class Foo(object): + def __init__(self): + pass + + def blocking_method(self): ray.get(f.remote()) - # The below test works, but do we want to disallow this usage? - ray.get(g.remote()) + # Make sure we can execute a blocking actor method even if there is + # only one CPU. + actor = Foo.remote() + ray.get(actor.blocking_method.remote()) - def testPicklingActorHandle(self): - ray.worker.init(num_workers=1) + @ray.remote(num_cpus=1) + class CPUFoo(object): + def __init__(self): + pass - @ray.remote - class Foo(object): - def method(self): - pass + def blocking_method(self): + ray.get(f.remote()) - f = Foo.remote() - new_f = ray.worker.pickle.loads(ray.worker.pickle.dumps(f)) - # Verify that we can call a method on the unpickled handle. TODO(rkn): - # we should also test this from a different driver. - ray.get(new_f.method.remote()) + # Make sure that lifetime CPU resources are not released when actors + # block. + actor = CPUFoo.remote() + x_id = actor.blocking_method.remote() + ready_ids, remaining_ids = ray.wait([x_id], timeout=1000) + assert ready_ids == [] + assert remaining_ids == [x_id] - def testRegisterAndGetNamedActors(self): - # TODO(heyucongtom): We should test this from another driver. - ray.worker.init(num_workers=1) + @ray.remote(num_gpus=1) + class GPUFoo(object): + def __init__(self): + pass - @ray.remote - class Foo(object): - def __init__(self): - self.x = 0 + def blocking_method(self): + ray.get(f.remote()) - def method(self): - self.x += 1 - return self.x + # Make sure that GPU resources are not released when actors block. + actor = GPUFoo.remote() + x_id = actor.blocking_method.remote() + ready_ids, remaining_ids = ray.wait([x_id], timeout=1000) + assert ready_ids == [] + assert remaining_ids == [x_id] - f1 = Foo.remote() - # Test saving f. - ray.experimental.register_actor("f1", f1) - # Test getting f. - f2 = ray.experimental.get_actor("f1") - assert f1._actor_id == f2._actor_id - # Test same name register shall raise error. - with pytest.raises(ValueError): - ray.experimental.register_actor("f1", f2) +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") != "1", + reason="This test only works with xray.") +def test_exception_raised_when_actor_node_dies(shutdown_only): + ray.worker._init(start_ray_local=True, num_local_schedulers=2, num_cpus=1) - # Test register with wrong object type. - with pytest.raises(TypeError): - ray.experimental.register_actor("f3", 1) + @ray.remote + class Counter(object): + def __init__(self): + self.x = 0 - # Test getting a nonexistent actor. - with pytest.raises(ValueError): - ray.experimental.get_actor("nonexistent") + def local_plasma(self): + return ray.worker.global_worker.plasma_client.store_socket_name - # Test method - assert ray.get(f1.method.remote()) == 1 - assert ray.get(f2.method.remote()) == 2 - assert ray.get(f1.method.remote()) == 3 - assert ray.get(f2.method.remote()) == 4 + def inc(self): + self.x += 1 + return self.x + + local_plasma = ray.worker.global_worker.plasma_client.store_socket_name + + # Create an actor that is not on the local scheduler. + actor = Counter.remote() + while ray.get(actor.local_plasma.remote()) == local_plasma: + actor = Counter.remote() + + # Kill the second plasma store to get rid of the cached objects and + # trigger the corresponding local scheduler to exit. + process = ray.services.all_processes[ + ray.services.PROCESS_TYPE_PLASMA_STORE][1] + process.kill() + + # Submit some new actor tasks. + x_ids = [actor.inc.remote() for _ in range(100)] + + # Make sure that getting the result raises an exception. + for _ in range(10): + for x_id in x_ids: + with pytest.raises(ray.worker.RayGetError): + # There is some small chance that ray.get will actually + # succeed (if the object is transferred before the raylet + # dies). + ray.get(x_id) + + # Make sure the process has exited. + process.wait() + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") == "1", + reason="This test does not work with xray yet.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_local_scheduler_dying(shutdown_only): + ray.worker._init( + start_ray_local=True, + num_local_schedulers=2, + num_workers=0, + redirect_output=True) + + @ray.remote + class Counter(object): + def __init__(self): + self.x = 0 + + def local_plasma(self): + return ray.worker.global_worker.plasma_client.store_socket_name + + def inc(self): + self.x += 1 + return self.x + + local_plasma = ray.worker.global_worker.plasma_client.store_socket_name + + # Create an actor that is not on the local scheduler. + actor = Counter.remote() + while ray.get(actor.local_plasma.remote()) == local_plasma: + actor = Counter.remote() + + ids = [actor.inc.remote() for _ in range(100)] + + # Wait for the last task to finish running. + ray.get(ids[-1]) + + # Kill the second plasma store to get rid of the cached objects and + # trigger the corresponding local scheduler to exit. + process = ray.services.all_processes[ + ray.services.PROCESS_TYPE_PLASMA_STORE][1] + process.kill() + process.wait() + + # Get all of the results + results = ray.get(ids) + + assert results == list(range(1, 1 + len(results))) + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") == "1", + reason="This test does not work with xray yet.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_many_local_schedulers_dying(shutdown_only): + # This test can be made more stressful by increasing the numbers below. + # The total number of actors created will be + # num_actors_at_a_time * num_local_schedulers. + num_local_schedulers = 5 + num_actors_at_a_time = 3 + num_function_calls_at_a_time = 10 + + ray.worker._init( + start_ray_local=True, + num_local_schedulers=num_local_schedulers, + num_cpus=3, + redirect_output=True) + + @ray.remote + class SlowCounter(object): + def __init__(self): + self.x = 0 + + def inc(self, duration): + time.sleep(duration) + self.x += 1 + return self.x + + # Create some initial actors. + actors = [SlowCounter.remote() for _ in range(num_actors_at_a_time)] + + # Wait for the actors to start up. + time.sleep(1) + + # This is a mapping from actor handles to object IDs returned by + # methods on that actor. + result_ids = collections.defaultdict(lambda: []) + + # In a loop we are going to create some actors, run some methods, kill + # a local scheduler, and run some more methods. + for i in range(num_local_schedulers - 1): + # Create some actors. + actors.extend( + [SlowCounter.remote() for _ in range(num_actors_at_a_time)]) + # Run some methods. + for j in range(len(actors)): + actor = actors[j] + for _ in range(num_function_calls_at_a_time): + result_ids[actor].append(actor.inc.remote(j**2 * 0.000001)) + # Kill a plasma store to get rid of the cached objects and trigger + # exit of the corresponding local scheduler. Don't kill the first + # local scheduler since that is the one that the driver is + # connected to. + process = ray.services.all_processes[ + ray.services.PROCESS_TYPE_PLASMA_STORE][i + 1] + process.kill() + process.wait() + + # Run some more methods. + for j in range(len(actors)): + actor = actors[j] + for _ in range(num_function_calls_at_a_time): + result_ids[actor].append(actor.inc.remote(j**2 * 0.000001)) + + # Get the results and check that they have the correct values. + for _, result_id_list in result_ids.items(): + results = list(range(1, len(result_id_list) + 1)) + assert ray.get(result_id_list) == results + + +def setup_counter_actor(test_checkpoint=False, + save_exception=False, + resume_exception=False): + ray.worker._init( + start_ray_local=True, + num_local_schedulers=2, + num_workers=0, + redirect_output=True) + + # Only set the checkpoint interval if we're testing with checkpointing. + checkpoint_interval = -1 + if test_checkpoint: + checkpoint_interval = 5 + + @ray.remote(checkpoint_interval=checkpoint_interval) + class Counter(object): + _resume_exception = resume_exception + + def __init__(self, save_exception): + self.x = 0 + self.num_inc_calls = 0 + self.save_exception = save_exception + self.restored = False + + def local_plasma(self): + return ray.worker.global_worker.plasma_client.store_socket_name + + def inc(self, *xs): + self.x += 1 + self.num_inc_calls += 1 + return self.x + + def get_num_inc_calls(self): + return self.num_inc_calls + + def test_restore(self): + # This method will only return True if __ray_restore__ has been + # called. + return self.restored + + def __ray_save__(self): + if self.save_exception: + raise Exception("Exception raised in checkpoint save") + return self.x, self.save_exception + + def __ray_restore__(self, checkpoint): + if self._resume_exception: + raise Exception("Exception raised in checkpoint resume") + self.x, self.save_exception = checkpoint + self.num_inc_calls = 0 + self.restored = True + + local_plasma = ray.worker.global_worker.plasma_client.store_socket_name + + # Create an actor that is not on the local scheduler. + actor = Counter.remote(save_exception) + while ray.get(actor.local_plasma.remote()) == local_plasma: + actor = Counter.remote(save_exception) + + args = [ray.put(0) for _ in range(100)] + ids = [actor.inc.remote(*args[i:]) for i in range(100)] + + return actor, ids + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") == "1", + reason="This test does not work with xray yet.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_checkpointing(shutdown_only): + actor, ids = setup_counter_actor(test_checkpoint=True) + # Wait for the last task to finish running. + ray.get(ids[-1]) + + # Kill the corresponding plasma store to get rid of the cached objects. + process = ray.services.all_processes[ + ray.services.PROCESS_TYPE_PLASMA_STORE][1] + process.kill() + process.wait() + + # Check that the actor restored from a checkpoint. + assert ray.get(actor.test_restore.remote()) + # Check that we can submit another call on the actor and get the + # correct counter result. + x = ray.get(actor.inc.remote()) + assert x == 101 + # Check that the number of inc calls since actor initialization is less + # than the counter value, since the actor initialized from a + # checkpoint. + num_inc_calls = ray.get(actor.get_num_inc_calls.remote()) + assert num_inc_calls < x + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") == "1", + reason="This test does not work with xray yet.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_remote_checkpoint(shutdown_only): + actor, ids = setup_counter_actor(test_checkpoint=True) + + # Do a remote checkpoint call and wait for it to finish. + ray.get(actor.__ray_checkpoint__.remote()) + + # Kill the corresponding plasma store to get rid of the cached objects. + process = ray.services.all_processes[ + ray.services.PROCESS_TYPE_PLASMA_STORE][1] + process.kill() + process.wait() + + # Check that the actor restored from a checkpoint. + assert ray.get(actor.test_restore.remote()) + # Check that the number of inc calls since actor initialization is + # exactly zero, since there could not have been another inc call since + # the remote checkpoint. + num_inc_calls = ray.get(actor.get_num_inc_calls.remote()) + assert num_inc_calls == 0 + # Check that we can submit another call on the actor and get the + # correct counter result. + x = ray.get(actor.inc.remote()) + assert x == 101 + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") == "1", + reason="This test does not work with xray yet.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_lost_checkpoint(shutdown_only): + actor, ids = setup_counter_actor(test_checkpoint=True) + # Wait for the first fraction of tasks to finish running. + ray.get(ids[len(ids) // 10]) + + # Kill the corresponding plasma store to get rid of the cached objects. + process = ray.services.all_processes[ + ray.services.PROCESS_TYPE_PLASMA_STORE][1] + process.kill() + process.wait() + + # Check that the actor restored from a checkpoint. + assert ray.get(actor.test_restore.remote()) + # Check that we can submit another call on the actor and get the + # correct counter result. + x = ray.get(actor.inc.remote()) + assert x == 101 + # Check that the number of inc calls since actor initialization is less + # than the counter value, since the actor initialized from a + # checkpoint. + num_inc_calls = ray.get(actor.get_num_inc_calls.remote()) + assert num_inc_calls < x + assert 5 < num_inc_calls + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") == "1", + reason="This test does not work with xray yet.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_checkpoint_exception(shutdown_only): + actor, ids = setup_counter_actor(test_checkpoint=True, save_exception=True) + # Wait for the last task to finish running. + ray.get(ids[-1]) + + # Kill the corresponding plasma store to get rid of the cached objects. + process = ray.services.all_processes[ + ray.services.PROCESS_TYPE_PLASMA_STORE][1] + process.kill() + process.wait() + + # Check that we can submit another call on the actor and get the + # correct counter result. + x = ray.get(actor.inc.remote()) + assert x == 101 + # Check that the number of inc calls since actor initialization is + # equal to the counter value, since the actor did not initialize from a + # checkpoint. + num_inc_calls = ray.get(actor.get_num_inc_calls.remote()) + assert num_inc_calls == x + # Check that errors were raised when trying to save the checkpoint. + errors = ray.error_info() + assert 0 < len(errors) + for error in errors: + assert error["type"] == ray_constants.CHECKPOINT_PUSH_ERROR + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") == "1", + reason="This test does not work with xray yet.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_checkpoint_resume_exception(shutdown_only): + actor, ids = setup_counter_actor( + test_checkpoint=True, resume_exception=True) + # Wait for the last task to finish running. + ray.get(ids[-1]) + + # Kill the corresponding plasma store to get rid of the cached objects. + process = ray.services.all_processes[ + ray.services.PROCESS_TYPE_PLASMA_STORE][1] + process.kill() + process.wait() + + # Check that we can submit another call on the actor and get the + # correct counter result. + x = ray.get(actor.inc.remote()) + assert x == 101 + # Check that the number of inc calls since actor initialization is + # equal to the counter value, since the actor did not initialize from a + # checkpoint. + num_inc_calls = ray.get(actor.get_num_inc_calls.remote()) + assert num_inc_calls == x + # Check that an error was raised when trying to resume from the + # checkpoint. + errors = ray.error_info() + assert len(errors) == 1 + for error in errors: + assert error["type"] == ray_constants.CHECKPOINT_PUSH_ERROR + + +@pytest.mark.skip("Fork/join consistency not yet implemented.") +def test_distributed_handle(self): + counter, ids = setup_counter_actor(test_checkpoint=False) + + @ray.remote + def fork_many_incs(counter, num_incs): + x = None + for _ in range(num_incs): + x = counter.inc.remote() + # Only call ray.get() on the last task submitted. + return ray.get(x) + + # Fork num_iters times. + count = ray.get(ids[-1]) + num_incs = 100 + num_iters = 10 + forks = [ + fork_many_incs.remote(counter, num_incs) for _ in range(num_iters) + ] + ray.wait(forks, num_returns=len(forks)) + count += num_incs * num_iters + + # Kill the second plasma store to get rid of the cached objects and + # trigger the corresponding local scheduler to exit. + process = ray.services.all_processes[ + ray.services.PROCESS_TYPE_PLASMA_STORE][1] + process.kill() + process.wait() + + # Check that the actor did not restore from a checkpoint. + assert not ray.get(counter.test_restore.remote()) + # Check that we can submit another call on the actor and get the + # correct counter result. + x = ray.get(counter.inc.remote()) + assert x == count + 1 + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") == "1", + reason="This test does not work with xray yet.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_remote_checkpoint_distributed_handle(shutdown_only): + counter, ids = setup_counter_actor(test_checkpoint=True) + + @ray.remote + def fork_many_incs(counter, num_incs): + x = None + for _ in range(num_incs): + x = counter.inc.remote() + # Only call ray.get() on the last task submitted. + return ray.get(x) + + # Fork num_iters times. + count = ray.get(ids[-1]) + num_incs = 100 + num_iters = 10 + forks = [ + fork_many_incs.remote(counter, num_incs) for _ in range(num_iters) + ] + ray.wait(forks, num_returns=len(forks)) + ray.wait([counter.__ray_checkpoint__.remote()]) + count += num_incs * num_iters + + # Kill the second plasma store to get rid of the cached objects and + # trigger the corresponding local scheduler to exit. + process = ray.services.all_processes[ + ray.services.PROCESS_TYPE_PLASMA_STORE][1] + process.kill() + process.wait() + + # Check that the actor restored from a checkpoint. + assert ray.get(counter.test_restore.remote()) + # Check that the number of inc calls since actor initialization is + # exactly zero, since there could not have been another inc call since + # the remote checkpoint. + num_inc_calls = ray.get(counter.get_num_inc_calls.remote()) + assert num_inc_calls == 0 + # Check that we can submit another call on the actor and get the + # correct counter result. + x = ray.get(counter.inc.remote()) + assert x == count + 1 + + +@pytest.mark.skip("Fork/join consistency not yet implemented.") +def test_checkpoint_distributed_handle(shutdown_only): + counter, ids = setup_counter_actor(test_checkpoint=True) + + @ray.remote + def fork_many_incs(counter, num_incs): + x = None + for _ in range(num_incs): + x = counter.inc.remote() + # Only call ray.get() on the last task submitted. + return ray.get(x) + + # Fork num_iters times. + count = ray.get(ids[-1]) + num_incs = 100 + num_iters = 10 + forks = [ + fork_many_incs.remote(counter, num_incs) for _ in range(num_iters) + ] + ray.wait(forks, num_returns=len(forks)) + count += num_incs * num_iters + + # Kill the second plasma store to get rid of the cached objects and + # trigger the corresponding local scheduler to exit. + process = ray.services.all_processes[ + ray.services.PROCESS_TYPE_PLASMA_STORE][1] + process.kill() + process.wait() + + # Check that the actor restored from a checkpoint. + assert ray.get(counter.test_restore.remote()) + # Check that we can submit another call on the actor and get the + # correct counter result. + x = ray.get(counter.inc.remote()) + assert x == count + 1 + + +def _test_nondeterministic_reconstruction(num_forks, num_items_per_fork, + num_forks_to_wait): + ray.worker._init( + start_ray_local=True, + num_local_schedulers=2, + num_workers=0, + redirect_output=True) + + # Make a shared queue. + @ray.remote + class Queue(object): + def __init__(self): + self.queue = [] + + def local_plasma(self): + return ray.worker.global_worker.plasma_client.store_socket_name + + def push(self, item): + self.queue.append(item) + + def read(self): + return self.queue + + # Schedule the shared queue onto the remote local scheduler. + local_plasma = ray.worker.global_worker.plasma_client.store_socket_name + actor = Queue.remote() + while ray.get(actor.local_plasma.remote()) == local_plasma: + actor = Queue.remote() + + # A task that takes in the shared queue and a list of items to enqueue, + # one by one. + @ray.remote + def enqueue(queue, items): + done = None + for item in items: + done = queue.push.remote(item) + # TODO(swang): Return the object ID returned by the last method + # called on the shared queue, so that the caller of enqueue can + # wait for all of the queue methods to complete. This can be + # removed once join consistency is implemented. + return [done] + + # Call the enqueue task num_forks times, each with num_items_per_fork + # unique objects to push onto the shared queue. + enqueue_tasks = [] + for fork in range(num_forks): + enqueue_tasks.append( + enqueue.remote(actor, + [(fork, i) for i in range(num_items_per_fork)])) + # Wait for the forks to complete their tasks. + enqueue_tasks = ray.get(enqueue_tasks) + enqueue_tasks = [fork_ids[0] for fork_ids in enqueue_tasks] + ray.wait(enqueue_tasks, num_returns=num_forks_to_wait) + + # Read the queue to get the initial order of execution. + queue = ray.get(actor.read.remote()) + + # Kill the second plasma store to get rid of the cached objects and + # trigger the corresponding local scheduler to exit. + process = ray.services.all_processes[ + ray.services.PROCESS_TYPE_PLASMA_STORE][1] + process.kill() + process.wait() + + # Read the queue again and check for deterministic reconstruction. + ray.get(enqueue_tasks) + reconstructed_queue = ray.get(actor.read.remote()) + # Make sure the final queue has all items from all forks. + assert len(reconstructed_queue) == num_forks * num_items_per_fork + # Make sure that the prefix of the final queue matches the queue from + # the initial execution. + assert queue == reconstructed_queue[:len(queue)] + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") == "1", + reason="This test does not work with xray yet.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Currently doesn't work with the new GCS.") +def test_nondeterministic_reconstruction(shutdown_only): + _test_nondeterministic_reconstruction(10, 100, 10) + + +@pytest.mark.skip("Nondeterministic reconstruction currently not supported " + "when there are concurrent forks that didn't finish " + "initial execution.") +def test_nondeterministic_reconstruction_concurrent_forks(shutdown_only): + _test_nondeterministic_reconstruction(10, 100, 1) @pytest.fixture -def ray_stop(): - # The initialization code depends on the test that is run. - yield None +def setup_queue_actor(): + ray.init(num_cpus=1) + + @ray.remote + class Queue(object): + def __init__(self): + self.queue = [] + + def enqueue(self, key, item): + self.queue.append((key, item)) + + def read(self): + return self.queue + + yield Queue.remote() + # The code after the yield will run as teardown code. ray.shutdown() -@unittest.skipIf(sys.version_info < (3, 0), - "This test is currently failing on Python 2.7.") -def testLifetimeAndTransientResources(ray_stop): - ray.init(num_cpus=1) +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") == "1", + reason="This test does not work with xray yet.") +def test_fork(setup_queue_actor): + queue = setup_queue_actor + @ray.remote + def fork(queue, key, item): + return ray.get(queue.enqueue.remote(key, item)) + + # Fork num_iters times. + num_iters = 100 + ray.get([fork.remote(queue, i, 0) for i in range(num_iters)]) + items = ray.get(queue.read.remote()) + for i in range(num_iters): + filtered_items = [item[1] for item in items if item[0] == i] + assert filtered_items == list(range(1)) + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") == "1", + reason="This test does not work with xray yet.") +def test_fork_consistency(setup_queue_actor): + queue = setup_queue_actor + + @ray.remote + def fork(queue, key, num_items): + x = None + for item in range(num_items): + x = queue.enqueue.remote(key, item) + return ray.get(x) + + # Fork num_iters times. + num_forks = 10 + num_items_per_fork = 100 + ray.get( + [fork.remote(queue, i, num_items_per_fork) for i in range(num_forks)]) + items = ray.get(queue.read.remote()) + for i in range(num_forks): + filtered_items = [item[1] for item in items if item[0] == i] + assert filtered_items == list(range(num_items_per_fork)) + + +@pytest.mark.skip("Garbage collection for distributed actor handles not " + "implemented.") +def test_garbage_collection(setup_queue_actor): + queue = setup_queue_actor + + @ray.remote + def fork(queue): + for i in range(10): + x = queue.enqueue.remote(0, i) + time.sleep(0.1) + return ray.get(x) + + x = fork.remote(queue) + ray.get(queue.read.remote()) + del queue + + print(ray.get(x)) + + +def test_calling_put_on_actor_handle(ray_start_regular): + @ray.remote + class Counter(object): + def __init__(self): + self.x = 0 + + def inc(self): + self.x += 1 + return self.x + + @ray.remote + def f(): + return Counter.remote() + + @ray.remote + def g(): + return [Counter.remote()] + + # Currently, calling ray.put on an actor handle is allowed, but is + # there a good use case? + counter = Counter.remote() + counter_id = ray.put(counter) + new_counter = ray.get(counter_id) + assert ray.get(new_counter.inc.remote()) == 1 + assert ray.get(counter.inc.remote()) == 2 + assert ray.get(new_counter.inc.remote()) == 3 + + with pytest.raises(Exception): + ray.get(f.remote()) + + # The below test works, but do we want to disallow this usage? + ray.get(g.remote()) + + +def test_pickling_actor_handle(ray_start_regular): + @ray.remote + class Foo(object): + def method(self): + pass + + f = Foo.remote() + new_f = ray.worker.pickle.loads(ray.worker.pickle.dumps(f)) + # Verify that we can call a method on the unpickled handle. TODO(rkn): + # we should also test this from a different driver. + ray.get(new_f.method.remote()) + + +def test_register_and_get_named_actors(ray_start_regular): + # TODO(heyucongtom): We should test this from another driver. + + @ray.remote + class Foo(object): + def __init__(self): + self.x = 0 + + def method(self): + self.x += 1 + return self.x + + f1 = Foo.remote() + # Test saving f. + ray.experimental.register_actor("f1", f1) + # Test getting f. + f2 = ray.experimental.get_actor("f1") + assert f1._actor_id == f2._actor_id + + # Test same name register shall raise error. + with pytest.raises(ValueError): + ray.experimental.register_actor("f1", f2) + + # Test register with wrong object type. + with pytest.raises(TypeError): + ray.experimental.register_actor("f3", 1) + + # Test getting a nonexistent actor. + with pytest.raises(ValueError): + ray.experimental.get_actor("nonexistent") + + # Test method + assert ray.get(f1.method.remote()) == 1 + assert ray.get(f2.method.remote()) == 2 + assert ray.get(f1.method.remote()) == 3 + assert ray.get(f2.method.remote()) == 4 + + +@pytest.mark.skipif( + sys.version_info < (3, 0), + reason="This test is currently failing on Python 2.7.") +def test_lifetime_and_transient_resources(ray_start_regular): # This actor acquires resources only when running methods. @ray.remote class Actor1(object): @@ -2013,7 +2014,7 @@ def testLifetimeAndTransientResources(ray_stop): assert len(ready_ids) == 1 -def testCustomLabelPlacement(ray_stop): +def test_custom_label_placement(shutdown_only): ray.worker._init( start_ray_local=True, num_local_schedulers=2, @@ -2047,7 +2048,7 @@ def testCustomLabelPlacement(ray_stop): assert location != local_plasma -def testCreatingMoreActorsThanResources(ray_stop): +def test_creating_more_actors_than_resources(shutdown_only): ray.init( num_workers=0, num_cpus=10, @@ -2097,7 +2098,3 @@ def testCreatingMoreActorsThanResources(ray_stop): ray.wait([object_id]) ray.get(results) - - -if __name__ == "__main__": - unittest.main(verbosity=2) diff --git a/test/array_test.py b/test/array_test.py index 5b71cea9e..e5838f89d 100644 --- a/test/array_test.py +++ b/test/array_test.py @@ -4,8 +4,8 @@ from __future__ import print_function import numpy as np from numpy.testing import assert_equal, assert_almost_equal +import pytest import sys -import unittest import ray import ray.experimental.array.remote as ra @@ -15,229 +15,224 @@ if sys.version_info >= (3, 0): from importlib import reload -class RemoteArrayTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() - - def testMethods(self): - for module in [ - ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg - ]: - reload(module) - ray.init() - - # test eye - object_id = ra.eye.remote(3) - val = ray.get(object_id) - assert_almost_equal(val, np.eye(3)) - - # test zeros - object_id = ra.zeros.remote([3, 4, 5]) - val = ray.get(object_id) - assert_equal(val, np.zeros([3, 4, 5])) - - # test qr - pass by value - a_val = np.random.normal(size=[10, 11]) - q_id, r_id = ra.linalg.qr.remote(a_val) - q_val = ray.get(q_id) - r_val = ray.get(r_id) - assert_almost_equal(np.dot(q_val, r_val), a_val) - - # test qr - pass by objectid - a = ra.random.normal.remote([10, 13]) - q_id, r_id = ra.linalg.qr.remote(a) - a_val = ray.get(a) - q_val = ray.get(q_id) - r_val = ray.get(r_id) - assert_almost_equal(np.dot(q_val, r_val), a_val) +@pytest.fixture +def ray_start_regular(): + for module in [ + ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg + ]: + reload(module) + # Start the Ray processes. + ray.init(num_cpus=2) + yield None + # The code after the yield will run as teardown code. + ray.shutdown() -class DistributedArrayTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() +def test_remote_array_methods(ray_start_regular): + # test eye + object_id = ra.eye.remote(3) + val = ray.get(object_id) + assert_almost_equal(val, np.eye(3)) - def testAssemble(self): - for module in [ - ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg - ]: - reload(module) - ray.init() + # test zeros + object_id = ra.zeros.remote([3, 4, 5]) + val = ray.get(object_id) + assert_equal(val, np.zeros([3, 4, 5])) - a = ra.ones.remote([da.BLOCK_SIZE, da.BLOCK_SIZE]) - b = ra.zeros.remote([da.BLOCK_SIZE, da.BLOCK_SIZE]) - x = da.DistArray([2 * da.BLOCK_SIZE, da.BLOCK_SIZE], - np.array([[a], [b]])) - assert_equal( - x.assemble(), - np.vstack([ - np.ones([da.BLOCK_SIZE, da.BLOCK_SIZE]), - np.zeros([da.BLOCK_SIZE, da.BLOCK_SIZE]) - ])) + # test qr - pass by value + a_val = np.random.normal(size=[10, 11]) + q_id, r_id = ra.linalg.qr.remote(a_val) + q_val = ray.get(q_id) + r_val = ray.get(r_id) + assert_almost_equal(np.dot(q_val, r_val), a_val) - def testMethods(self): - for module in [ - ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg - ]: - reload(module) - ray.worker._init( - start_ray_local=True, num_local_schedulers=2, num_cpus=[10, 10]) - - x = da.zeros.remote([9, 25, 51], "float") - assert_equal(ray.get(da.assemble.remote(x)), np.zeros([9, 25, 51])) - - x = da.ones.remote([11, 25, 49], dtype_name="float") - assert_equal(ray.get(da.assemble.remote(x)), np.ones([11, 25, 49])) - - x = da.random.normal.remote([11, 25, 49]) - y = da.copy.remote(x) - assert_equal( - ray.get(da.assemble.remote(x)), ray.get(da.assemble.remote(y))) - - x = da.eye.remote(25, dtype_name="float") - assert_equal(ray.get(da.assemble.remote(x)), np.eye(25)) - - x = da.random.normal.remote([25, 49]) - y = da.triu.remote(x) - assert_equal( - ray.get(da.assemble.remote(y)), - np.triu(ray.get(da.assemble.remote(x)))) - - x = da.random.normal.remote([25, 49]) - y = da.tril.remote(x) - assert_equal( - ray.get(da.assemble.remote(y)), - np.tril(ray.get(da.assemble.remote(x)))) - - x = da.random.normal.remote([25, 49]) - y = da.random.normal.remote([49, 18]) - z = da.dot.remote(x, y) - w = da.assemble.remote(z) - u = da.assemble.remote(x) - v = da.assemble.remote(y) - assert_almost_equal(ray.get(w), np.dot(ray.get(u), ray.get(v))) - assert_almost_equal(ray.get(w), np.dot(ray.get(u), ray.get(v))) - - # test add - x = da.random.normal.remote([23, 42]) - y = da.random.normal.remote([23, 42]) - z = da.add.remote(x, y) - assert_almost_equal( - ray.get(da.assemble.remote(z)), - ray.get(da.assemble.remote(x)) + ray.get(da.assemble.remote(y))) - - # test subtract - x = da.random.normal.remote([33, 40]) - y = da.random.normal.remote([33, 40]) - z = da.subtract.remote(x, y) - assert_almost_equal( - ray.get(da.assemble.remote(z)), - ray.get(da.assemble.remote(x)) - ray.get(da.assemble.remote(y))) - - # test transpose - x = da.random.normal.remote([234, 432]) - y = da.transpose.remote(x) - assert_equal( - ray.get(da.assemble.remote(x)).T, ray.get(da.assemble.remote(y))) - - # test numpy_to_dist - x = da.random.normal.remote([23, 45]) - y = da.assemble.remote(x) - z = da.numpy_to_dist.remote(y) - w = da.assemble.remote(z) - assert_equal( - ray.get(da.assemble.remote(x)), ray.get(da.assemble.remote(z))) - assert_equal(ray.get(y), ray.get(w)) - - # test da.tsqr - for shape in [[123, da.BLOCK_SIZE], [7, da.BLOCK_SIZE], - [da.BLOCK_SIZE, da.BLOCK_SIZE], [da.BLOCK_SIZE, 7], - [10 * da.BLOCK_SIZE, da.BLOCK_SIZE]]: - x = da.random.normal.remote(shape) - K = min(shape) - q, r = da.linalg.tsqr.remote(x) - x_val = ray.get(da.assemble.remote(x)) - q_val = ray.get(da.assemble.remote(q)) - r_val = ray.get(r) - assert r_val.shape == (K, shape[1]) - assert_equal(r_val, np.triu(r_val)) - assert_almost_equal(x_val, np.dot(q_val, r_val)) - assert_almost_equal(np.dot(q_val.T, q_val), np.eye(K)) - - # test da.linalg.modified_lu - def test_modified_lu(d1, d2): - print("testing dist_modified_lu with d1 = " + str(d1) + ", d2 = " + - str(d2)) - assert d1 >= d2 - m = ra.random.normal.remote([d1, d2]) - q, r = ra.linalg.qr.remote(m) - l, u, s = da.linalg.modified_lu.remote(da.numpy_to_dist.remote(q)) - q_val = ray.get(q) - ray.get(r) - l_val = ray.get(da.assemble.remote(l)) - u_val = ray.get(u) - s_val = ray.get(s) - s_mat = np.zeros((d1, d2)) - for i in range(len(s_val)): - s_mat[i, i] = s_val[i] - # Check that q - s = l * u. - assert_almost_equal(q_val - s_mat, np.dot(l_val, u_val)) - # Check that u is upper triangular. - assert_equal(np.triu(u_val), u_val) - # Check that l is lower triangular. - assert_equal(np.tril(l_val), l_val) - - for d1, d2 in [(100, 100), (99, 98), (7, 5), (7, 7), (20, 7), (20, - 10)]: - test_modified_lu(d1, d2) - - # test dist_tsqr_hr - def test_dist_tsqr_hr(d1, d2): - print("testing dist_tsqr_hr with d1 = " + str(d1) + ", d2 = " + - str(d2)) - a = da.random.normal.remote([d1, d2]) - y, t, y_top, r = da.linalg.tsqr_hr.remote(a) - a_val = ray.get(da.assemble.remote(a)) - y_val = ray.get(da.assemble.remote(y)) - t_val = ray.get(t) - y_top_val = ray.get(y_top) - r_val = ray.get(r) - tall_eye = np.zeros((d1, min(d1, d2))) - np.fill_diagonal(tall_eye, 1) - q = tall_eye - np.dot(y_val, np.dot(t_val, y_top_val.T)) - # Check that q.T * q = I. - assert_almost_equal(np.dot(q.T, q), np.eye(min(d1, d2))) - # Check that a = (I - y * t * y_top.T) * r. - assert_almost_equal(np.dot(q, r_val), a_val) - - for d1, d2 in [(123, da.BLOCK_SIZE), (7, da.BLOCK_SIZE), - (da.BLOCK_SIZE, da.BLOCK_SIZE), (da.BLOCK_SIZE, 7), - (10 * da.BLOCK_SIZE, da.BLOCK_SIZE)]: - test_dist_tsqr_hr(d1, d2) - - def test_dist_qr(d1, d2): - print("testing qr with d1 = {}, and d2 = {}.".format(d1, d2)) - a = da.random.normal.remote([d1, d2]) - K = min(d1, d2) - q, r = da.linalg.qr.remote(a) - a_val = ray.get(da.assemble.remote(a)) - q_val = ray.get(da.assemble.remote(q)) - r_val = ray.get(da.assemble.remote(r)) - assert q_val.shape == (d1, K) - assert r_val.shape == (K, d2) - assert_almost_equal(np.dot(q_val.T, q_val), np.eye(K)) - assert_equal(r_val, np.triu(r_val)) - assert_almost_equal(a_val, np.dot(q_val, r_val)) - - for d1, d2 in [(123, da.BLOCK_SIZE), (7, da.BLOCK_SIZE), - (da.BLOCK_SIZE, da.BLOCK_SIZE), (da.BLOCK_SIZE, 7), - (13, 21), (34, 35), (8, 7)]: - test_dist_qr(d1, d2) - test_dist_qr(d2, d1) - for _ in range(20): - d1 = np.random.randint(1, 35) - d2 = np.random.randint(1, 35) - test_dist_qr(d1, d2) + # test qr - pass by objectid + a = ra.random.normal.remote([10, 13]) + q_id, r_id = ra.linalg.qr.remote(a) + a_val = ray.get(a) + q_val = ray.get(q_id) + r_val = ray.get(r_id) + assert_almost_equal(np.dot(q_val, r_val), a_val) -if __name__ == "__main__": - unittest.main(verbosity=2) +def test_distributed_array_assemble(ray_start_regular): + a = ra.ones.remote([da.BLOCK_SIZE, da.BLOCK_SIZE]) + b = ra.zeros.remote([da.BLOCK_SIZE, da.BLOCK_SIZE]) + x = da.DistArray([2 * da.BLOCK_SIZE, da.BLOCK_SIZE], np.array([[a], [b]])) + assert_equal( + x.assemble(), + np.vstack([ + np.ones([da.BLOCK_SIZE, da.BLOCK_SIZE]), + np.zeros([da.BLOCK_SIZE, da.BLOCK_SIZE]) + ])) + + +@pytest.fixture +def ray_start_two_nodes(): + for module in [ + ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg + ]: + reload(module) + # Start the Ray processes. + ray.worker._init( + start_ray_local=True, num_local_schedulers=2, num_cpus=[10, 10]) + yield None + # The code after the yield will run as teardown code. + ray.shutdown() + + +def test_distributed_array_methods(ray_start_two_nodes): + x = da.zeros.remote([9, 25, 51], "float") + assert_equal(ray.get(da.assemble.remote(x)), np.zeros([9, 25, 51])) + + x = da.ones.remote([11, 25, 49], dtype_name="float") + assert_equal(ray.get(da.assemble.remote(x)), np.ones([11, 25, 49])) + + x = da.random.normal.remote([11, 25, 49]) + y = da.copy.remote(x) + assert_equal( + ray.get(da.assemble.remote(x)), ray.get(da.assemble.remote(y))) + + x = da.eye.remote(25, dtype_name="float") + assert_equal(ray.get(da.assemble.remote(x)), np.eye(25)) + + x = da.random.normal.remote([25, 49]) + y = da.triu.remote(x) + assert_equal( + ray.get(da.assemble.remote(y)), np.triu( + ray.get(da.assemble.remote(x)))) + + x = da.random.normal.remote([25, 49]) + y = da.tril.remote(x) + assert_equal( + ray.get(da.assemble.remote(y)), np.tril( + ray.get(da.assemble.remote(x)))) + + x = da.random.normal.remote([25, 49]) + y = da.random.normal.remote([49, 18]) + z = da.dot.remote(x, y) + w = da.assemble.remote(z) + u = da.assemble.remote(x) + v = da.assemble.remote(y) + assert_almost_equal(ray.get(w), np.dot(ray.get(u), ray.get(v))) + assert_almost_equal(ray.get(w), np.dot(ray.get(u), ray.get(v))) + + # test add + x = da.random.normal.remote([23, 42]) + y = da.random.normal.remote([23, 42]) + z = da.add.remote(x, y) + assert_almost_equal( + ray.get(da.assemble.remote(z)), + ray.get(da.assemble.remote(x)) + ray.get(da.assemble.remote(y))) + + # test subtract + x = da.random.normal.remote([33, 40]) + y = da.random.normal.remote([33, 40]) + z = da.subtract.remote(x, y) + assert_almost_equal( + ray.get(da.assemble.remote(z)), + ray.get(da.assemble.remote(x)) - ray.get(da.assemble.remote(y))) + + # test transpose + x = da.random.normal.remote([234, 432]) + y = da.transpose.remote(x) + assert_equal( + ray.get(da.assemble.remote(x)).T, ray.get(da.assemble.remote(y))) + + # test numpy_to_dist + x = da.random.normal.remote([23, 45]) + y = da.assemble.remote(x) + z = da.numpy_to_dist.remote(y) + w = da.assemble.remote(z) + assert_equal( + ray.get(da.assemble.remote(x)), ray.get(da.assemble.remote(z))) + assert_equal(ray.get(y), ray.get(w)) + + # test da.tsqr + for shape in [[123, da.BLOCK_SIZE], [7, da.BLOCK_SIZE], + [da.BLOCK_SIZE, da.BLOCK_SIZE], [da.BLOCK_SIZE, 7], + [10 * da.BLOCK_SIZE, da.BLOCK_SIZE]]: + x = da.random.normal.remote(shape) + K = min(shape) + q, r = da.linalg.tsqr.remote(x) + x_val = ray.get(da.assemble.remote(x)) + q_val = ray.get(da.assemble.remote(q)) + r_val = ray.get(r) + assert r_val.shape == (K, shape[1]) + assert_equal(r_val, np.triu(r_val)) + assert_almost_equal(x_val, np.dot(q_val, r_val)) + assert_almost_equal(np.dot(q_val.T, q_val), np.eye(K)) + + # test da.linalg.modified_lu + def test_modified_lu(d1, d2): + print("testing dist_modified_lu with d1 = " + str(d1) + ", d2 = " + + str(d2)) + assert d1 >= d2 + m = ra.random.normal.remote([d1, d2]) + q, r = ra.linalg.qr.remote(m) + l, u, s = da.linalg.modified_lu.remote(da.numpy_to_dist.remote(q)) + q_val = ray.get(q) + ray.get(r) + l_val = ray.get(da.assemble.remote(l)) + u_val = ray.get(u) + s_val = ray.get(s) + s_mat = np.zeros((d1, d2)) + for i in range(len(s_val)): + s_mat[i, i] = s_val[i] + # Check that q - s = l * u. + assert_almost_equal(q_val - s_mat, np.dot(l_val, u_val)) + # Check that u is upper triangular. + assert_equal(np.triu(u_val), u_val) + # Check that l is lower triangular. + assert_equal(np.tril(l_val), l_val) + + for d1, d2 in [(100, 100), (99, 98), (7, 5), (7, 7), (20, 7), (20, 10)]: + test_modified_lu(d1, d2) + + # test dist_tsqr_hr + def test_dist_tsqr_hr(d1, d2): + print("testing dist_tsqr_hr with d1 = " + str(d1) + ", d2 = " + + str(d2)) + a = da.random.normal.remote([d1, d2]) + y, t, y_top, r = da.linalg.tsqr_hr.remote(a) + a_val = ray.get(da.assemble.remote(a)) + y_val = ray.get(da.assemble.remote(y)) + t_val = ray.get(t) + y_top_val = ray.get(y_top) + r_val = ray.get(r) + tall_eye = np.zeros((d1, min(d1, d2))) + np.fill_diagonal(tall_eye, 1) + q = tall_eye - np.dot(y_val, np.dot(t_val, y_top_val.T)) + # Check that q.T * q = I. + assert_almost_equal(np.dot(q.T, q), np.eye(min(d1, d2))) + # Check that a = (I - y * t * y_top.T) * r. + assert_almost_equal(np.dot(q, r_val), a_val) + + for d1, d2 in [(123, da.BLOCK_SIZE), (7, da.BLOCK_SIZE), (da.BLOCK_SIZE, + da.BLOCK_SIZE), + (da.BLOCK_SIZE, 7), (10 * da.BLOCK_SIZE, da.BLOCK_SIZE)]: + test_dist_tsqr_hr(d1, d2) + + def test_dist_qr(d1, d2): + print("testing qr with d1 = {}, and d2 = {}.".format(d1, d2)) + a = da.random.normal.remote([d1, d2]) + K = min(d1, d2) + q, r = da.linalg.qr.remote(a) + a_val = ray.get(da.assemble.remote(a)) + q_val = ray.get(da.assemble.remote(q)) + r_val = ray.get(da.assemble.remote(r)) + assert q_val.shape == (d1, K) + assert r_val.shape == (K, d2) + assert_almost_equal(np.dot(q_val.T, q_val), np.eye(K)) + assert_equal(r_val, np.triu(r_val)) + assert_almost_equal(a_val, np.dot(q_val, r_val)) + + for d1, d2 in [(123, da.BLOCK_SIZE), (7, da.BLOCK_SIZE), (da.BLOCK_SIZE, + da.BLOCK_SIZE), + (da.BLOCK_SIZE, 7), (13, 21), (34, 35), (8, 7)]: + test_dist_qr(d1, d2) + test_dist_qr(d2, d1) + for _ in range(20): + d1 = np.random.randint(1, 35) + d2 = np.random.randint(1, 35) + test_dist_qr(d1, d2) diff --git a/test/component_failures_test.py b/test/component_failures_test.py index 04eb6ae68..60553dc94 100644 --- a/test/component_failures_test.py +++ b/test/component_failures_test.py @@ -2,351 +2,363 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import pytest import os import ray import time -import unittest import pyarrow as pa -class ComponentFailureTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() +@pytest.fixture +def ray_start_workers_separate(): + # Start the Ray processes. + ray.worker._init( + num_cpus=1, + start_workers_from_local_scheduler=False, + start_ray_local=True, + redirect_output=True) + yield None + # The code after the yield will run as teardown code. + ray.shutdown() - # This test checks that when a worker dies in the middle of a get, the - # plasma store and manager will not die. - @unittest.skipIf( - os.environ.get('RAY_USE_XRAY', False), - "Workers are all started by Raylet, so cannot be killed from Python.") - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), - "Not working with new GCS API.") - def testDyingWorkerGet(self): - obj_id = 20 * b"a" - @ray.remote - def f(): - ray.worker.global_worker.plasma_client.get(obj_id) +# This test checks that when a worker dies in the middle of a get, the +# plasma store and manager will not die. +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY", False), + reason="This test does not work with xray yet.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Not working with new GCS API.") +def test_dying_worker_get(ray_start_workers_separate): + obj_id = 20 * b"a" - ray.worker._init( - num_workers=1, - start_workers_from_local_scheduler=False, - start_ray_local=True, - redirect_output=True) + @ray.remote + def f(): + ray.worker.global_worker.plasma_client.get(ray.ObjectID(obj_id)) - # Have the worker wait in a get call. - f.remote() + # Have the worker wait in a get call. + f.remote() - # Kill the worker. - time.sleep(1) - (ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER][0] - .terminate()) + # Kill the worker. + time.sleep(1) + (ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER][0] + .terminate()) + time.sleep(0.1) + + # Seal the object so the store attempts to notify the worker that the + # get has been fulfilled. + ray.worker.global_worker.plasma_client.create( + pa.plasma.ObjectID(obj_id), 100) + ray.worker.global_worker.plasma_client.seal(pa.plasma.ObjectID(obj_id)) + time.sleep(0.1) + + # Make sure that nothing has died. + assert ray.services.all_processes_alive( + exclude=[ray.services.PROCESS_TYPE_WORKER]) + + +# This test checks that when a worker dies in the middle of a wait, the +# plasma store and manager will not die. +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY", False), + reason="This test does not work with xray yet.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Not working with new GCS API.") +def test_dying_worker_wait(ray_start_workers_separate): + obj_id = 20 * b"a" + + @ray.remote + def f(): + ray.worker.global_worker.plasma_client.wait([ray.ObjectID(obj_id)]) + + # Have the worker wait in a get call. + f.remote() + + # Kill the worker. + time.sleep(1) + (ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER][0] + .terminate()) + time.sleep(0.1) + + # Seal the object so the store attempts to notify the worker that the + # get has been fulfilled. + ray.worker.global_worker.plasma_client.create( + pa.plasma.ObjectID(obj_id), 100) + ray.worker.global_worker.plasma_client.seal(pa.plasma.ObjectID(obj_id)) + time.sleep(0.1) + + # Make sure that nothing has died. + assert ray.services.all_processes_alive( + exclude=[ray.services.PROCESS_TYPE_WORKER]) + + +@pytest.fixture(params=[(1, 4), (4, 4)]) +def ray_start_workers_separate_multinode(request): + num_local_schedulers = request.param[0] + num_initial_workers = request.param[1] + # Start the Ray processes. + ray.worker._init( + num_workers=(num_initial_workers * num_local_schedulers), + num_local_schedulers=num_local_schedulers, + start_workers_from_local_scheduler=False, + start_ray_local=True, + num_cpus=[num_initial_workers] * num_local_schedulers, + redirect_output=True) + yield num_local_schedulers, num_initial_workers + # The code after the yield will run as teardown code. + ray.shutdown() + + +def test_worker_failed(ray_start_workers_separate_multinode): + num_local_schedulers, num_initial_workers = ( + ray_start_workers_separate_multinode) + + @ray.remote + def f(x): + time.sleep(0.5) + return x + + # Submit more tasks than there are workers so that all workers and + # cores are utilized. + object_ids = [ + f.remote(i) for i in range(num_initial_workers * num_local_schedulers) + ] + object_ids += [f.remote(object_id) for object_id in object_ids] + # Allow the tasks some time to begin executing. + time.sleep(0.1) + # Kill the workers as the tasks execute. + for worker in ( + ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER]): + worker.terminate() time.sleep(0.1) + # Make sure that we can still get the objects after the executing tasks + # died. + ray.get(object_ids) - # Seal the object so the store attempts to notify the worker that the - # get has been fulfilled. - ray.worker.global_worker.plasma_client.create( - pa.plasma.ObjectID(obj_id), 100) - ray.worker.global_worker.plasma_client.seal(pa.plasma.ObjectID(obj_id)) - time.sleep(0.1) - # Make sure that nothing has died. - assert ray.services.all_processes_alive( - exclude=[ray.services.PROCESS_TYPE_WORKER]) +def _test_component_failed(component_type): + """Kill a component on all worker nodes and check workload succeeds.""" + # Raylet is able to pass a harder failure test than legacy ray. + use_raylet = os.environ.get("RAY_USE_XRAY") == "1" - # This test checks that when a worker dies in the middle of a wait, the - # plasma store and manager will not die. - @unittest.skipIf( - os.environ.get('RAY_USE_XRAY', False), - "Workers are all started by Raylet, so cannot be killed from Python.") - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), - "Not working with new GCS API.") - def testDyingWorkerWait(self): - obj_id = 20 * b"a" + # Start with 4 workers and 4 cores. + num_local_schedulers = 4 + num_workers_per_scheduler = 8 + ray.worker._init( + num_workers=num_workers_per_scheduler, + num_local_schedulers=num_local_schedulers, + start_ray_local=True, + num_cpus=[num_workers_per_scheduler] * num_local_schedulers, + redirect_output=True) - @ray.remote - def f(): - ray.worker.global_worker.plasma_client.wait([obj_id]) - - ray.worker._init( - num_workers=1, - start_workers_from_local_scheduler=False, - start_ray_local=True, - redirect_output=True) - - # Have the worker wait in a get call. - f.remote() - - # Kill the worker. - time.sleep(1) - (ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER][0] - .terminate()) - time.sleep(0.1) - - # Seal the object so the store attempts to notify the worker that the - # get has been fulfilled. - ray.worker.global_worker.plasma_client.create( - pa.plasma.ObjectID(obj_id), 100) - ray.worker.global_worker.plasma_client.seal(pa.plasma.ObjectID(obj_id)) - time.sleep(0.1) - - # Make sure that nothing has died. - assert ray.services.all_processes_alive( - exclude=[ray.services.PROCESS_TYPE_WORKER]) - - def _testWorkerFailed(self, num_local_schedulers): + if use_raylet: + # Submit many tasks with many dependencies. @ray.remote def f(x): - time.sleep(0.5) return x - num_initial_workers = 4 - ray.worker._init( - num_workers=(num_initial_workers * num_local_schedulers), - num_local_schedulers=num_local_schedulers, - start_workers_from_local_scheduler=False, - start_ray_local=True, - num_cpus=[num_initial_workers] * num_local_schedulers, - redirect_output=True) + @ray.remote + def g(*xs): + return 1 + + # Kill the component on all nodes except the head node as the tasks + # execute. Do this in a loop while submitting tasks between each + # component failure. + # NOTE(swang): Legacy ray hangs on this test if the plasma manager + # is killed. + time.sleep(0.1) + components = ray.services.all_processes[component_type] + for process in components[1:]: + # Submit a round of tasks with many dependencies. + x = 1 + for _ in range(1000): + x = f.remote(x) + + xs = [g.remote(1)] + for _ in range(100): + xs.append(g.remote(*xs)) + xs.append(g.remote(1)) + + # Kill a component on one of the nodes. + process.terminate() + time.sleep(1) + process.kill() + process.wait() + assert not process.poll() is None + + # Make sure that we can still get the objects after the + # executing tasks died. + ray.get(x) + ray.get(xs) + else: + + @ray.remote + def f(x, j): + time.sleep(0.2) + return x + # Submit more tasks than there are workers so that all workers and # cores are utilized. object_ids = [ - f.remote(i) - for i in range(num_initial_workers * num_local_schedulers) + f.remote(i, 0) + for i in range(num_workers_per_scheduler * num_local_schedulers) ] - object_ids += [f.remote(object_id) for object_id in object_ids] - # Allow the tasks some time to begin executing. + object_ids += [f.remote(object_id, 1) for object_id in object_ids] + object_ids += [f.remote(object_id, 2) for object_id in object_ids] + + # Kill the component on all nodes except the head node as the tasks + # execute. time.sleep(0.1) - # Kill the workers as the tasks execute. - for worker in ( - ray.services.all_processes[ray.services.PROCESS_TYPE_WORKER]): - worker.terminate() - time.sleep(0.1) - # Make sure that we can still get the objects after the executing tasks - # died. - ray.get(object_ids) + components = ray.services.all_processes[component_type] + for process in components[1:]: + process.terminate() + time.sleep(1) - def testWorkerFailed(self): - self._testWorkerFailed(1) + for process in components[1:]: + process.kill() + process.wait() + assert not process.poll() is None - def testWorkerFailedMultinode(self): - self._testWorkerFailed(4) + # Make sure that we can still get the objects after the executing + # tasks died. + results = ray.get(object_ids) + expected_results = 4 * list( + range(num_workers_per_scheduler * num_local_schedulers)) + assert results == expected_results - def _testComponentFailed(self, component_type): - """Kill a component on all worker nodes and check workload succeeds.""" - # Raylet is able to pass a harder failure test than legacy ray. - use_raylet = os.environ.get("RAY_USE_XRAY") == "1" - # Start with 4 workers and 4 cores. - num_local_schedulers = 4 - num_workers_per_scheduler = 8 - ray.worker._init( - num_workers=num_workers_per_scheduler, - num_local_schedulers=num_local_schedulers, - start_ray_local=True, - num_cpus=[num_workers_per_scheduler] * num_local_schedulers, - redirect_output=True) - - if use_raylet: - # Submit many tasks with many dependencies. - @ray.remote - def f(x): - return x - - @ray.remote - def g(*xs): - return 1 - - # Kill the component on all nodes except the head node as the tasks - # execute. Do this in a loop while submitting tasks between each - # component failure. - # NOTE(swang): Legacy ray hangs on this test if the plasma manager - # is killed. - time.sleep(0.1) - components = ray.services.all_processes[component_type] - for process in components[1:]: - # Submit a round of tasks with many dependencies. - x = 1 - for _ in range(1000): - x = f.remote(x) - - xs = [g.remote(1)] - for _ in range(100): - xs.append(g.remote(*xs)) - xs.append(g.remote(1)) - - # Kill a component on one of the nodes. - process.terminate() - time.sleep(1) - process.kill() - process.wait() - assert not process.poll() is None - - # Make sure that we can still get the objects after the - # executing tasks died. - ray.get(x) - ray.get(xs) +def check_components_alive(component_type, check_component_alive): + """Check that a given component type is alive on all worker nodes. + """ + components = ray.services.all_processes[component_type][1:] + for component in components: + if check_component_alive: + assert component.poll() is None else: + print("waiting for " + component_type + " with PID " + + str(component.pid) + "to terminate") + component.wait() + print("done waiting for " + component_type + " with PID " + + str(component.pid) + "to terminate") + assert not component.poll() is None - @ray.remote - def f(x, j): - time.sleep(0.2) - return x - # Submit more tasks than there are workers so that all workers and - # cores are utilized. - object_ids = [ - f.remote(i, 0) for i in range(num_workers_per_scheduler * - num_local_schedulers) - ] - object_ids += [f.remote(object_id, 1) for object_id in object_ids] - object_ids += [f.remote(object_id, 2) for object_id in object_ids] +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") != "1", + reason="This test only makes sense with xray.") +def test_raylet_failed(): + # Kill all local schedulers on worker nodes. + _test_component_failed(ray.services.PROCESS_TYPE_RAYLET) - # Kill the component on all nodes except the head node as the tasks - # execute. - time.sleep(0.1) - components = ray.services.all_processes[component_type] - for process in components[1:]: - process.terminate() - time.sleep(1) + # The plasma stores and plasma managers should still be alive on the + # worker nodes. + check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, True) - for process in components[1:]: - process.kill() - process.wait() - assert not process.poll() is None + ray.shutdown() - # Make sure that we can still get the objects after the executing - # tasks died. - results = ray.get(object_ids) - expected_results = 4 * list( - range(num_workers_per_scheduler * num_local_schedulers)) - assert results == expected_results - def check_components_alive(self, component_type, check_component_alive): - """Check that a given component type is alive on all worker nodes. - """ - components = ray.services.all_processes[component_type][1:] - for component in components: - if check_component_alive: - assert component.poll() is None - else: - print("waiting for " + component_type + " with PID " + - str(component.pid) + "to terminate") - component.wait() - print("done waiting for " + component_type + " with PID " + - str(component.pid) + "to terminate") - assert not component.poll() is None +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") == "1", + reason="This test does not make sense with xray.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_local_scheduler_failed(): + # Kill all local schedulers on worker nodes. + _test_component_failed(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER) - @unittest.skipIf(not os.environ.get('RAY_USE_XRAY', False), - "Only tests Raylet failure.") - def testRayletFailed(self): - # Kill all local schedulers on worker nodes. - self._testComponentFailed(ray.services.PROCESS_TYPE_RAYLET) + # The plasma stores and plasma managers should still be alive on the + # worker nodes. + check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, True) + check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER, True) + check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER, False) - # The plasma stores and plasma managers should still be alive on the - # worker nodes. - self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, - True) + ray.shutdown() - @unittest.skipIf( - os.environ.get('RAY_USE_XRAY', False), - "Raylet codepath does not have this component") - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.") - def testLocalSchedulerFailed(self): - # Kill all local schedulers on worker nodes. - self._testComponentFailed(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER) - # The plasma stores and plasma managers should still be alive on the - # worker nodes. - self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, - True) - self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER, - True) - self.check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER, - False) +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") == "1", + reason="This test does not make sense with xray.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_plasma_manager_failed(): + # Kill all plasma managers on worker nodes. + _test_component_failed(ray.services.PROCESS_TYPE_PLASMA_MANAGER) - @unittest.skipIf( - os.environ.get('RAY_USE_XRAY', False), - "Raylet codepath does not have this component") - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.") - def testPlasmaManagerFailed(self): - # Kill all plasma managers on worker nodes. - self._testComponentFailed(ray.services.PROCESS_TYPE_PLASMA_MANAGER) + # The plasma stores should still be alive (but unreachable) on the + # worker nodes. + check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, True) + check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER, False) + check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER, False) - # The plasma stores should still be alive (but unreachable) on the - # worker nodes. - self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, - True) - self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER, - False) - self.check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER, - False) + ray.shutdown() - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), "Hanging with new GCS API.") - def testPlasmaStoreFailed(self): - # Kill all plasma stores on worker nodes. - self._testComponentFailed(ray.services.PROCESS_TYPE_PLASMA_STORE) - # No processes should be left alive on the worker nodes. - self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, - False) - self.check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER, - False) - self.check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER, - False) - self.check_components_alive(ray.services.PROCESS_TYPE_RAYLET, False) +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_plasma_store_failed(): + # Kill all plasma stores on worker nodes. + _test_component_failed(ray.services.PROCESS_TYPE_PLASMA_STORE) - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), - "Not working with new GCS API.") - def testDriverLivesSequential(self): - ray.worker.init(redirect_output=True) - all_processes = ray.services.all_processes - processes = (all_processes[ray.services.PROCESS_TYPE_PLASMA_STORE] + - all_processes[ray.services.PROCESS_TYPE_PLASMA_MANAGER] + - all_processes[ray.services.PROCESS_TYPE_LOCAL_SCHEDULER] + - all_processes[ray.services.PROCESS_TYPE_GLOBAL_SCHEDULER] - + all_processes[ray.services.PROCESS_TYPE_RAYLET]) + # No processes should be left alive on the worker nodes. + check_components_alive(ray.services.PROCESS_TYPE_PLASMA_STORE, False) + check_components_alive(ray.services.PROCESS_TYPE_PLASMA_MANAGER, False) + check_components_alive(ray.services.PROCESS_TYPE_LOCAL_SCHEDULER, False) + check_components_alive(ray.services.PROCESS_TYPE_RAYLET, False) - # Kill all the components sequentially. - for process in processes: - process.terminate() - time.sleep(0.1) - process.kill() - process.wait() + ray.shutdown() - # If the driver can reach the tearDown method, then it is still alive. - @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), - "Not working with new GCS API.") - def testDriverLivesParallel(self): - ray.worker.init(redirect_output=True) - all_processes = ray.services.all_processes - processes = (all_processes[ray.services.PROCESS_TYPE_PLASMA_STORE] + - all_processes[ray.services.PROCESS_TYPE_PLASMA_MANAGER] + - all_processes[ray.services.PROCESS_TYPE_LOCAL_SCHEDULER] + - all_processes[ray.services.PROCESS_TYPE_GLOBAL_SCHEDULER] - + all_processes[ray.services.PROCESS_TYPE_RAYLET]) - - # Kill all the components in parallel. - for process in processes: - process.terminate() +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_driver_lives_sequential(): + ray.worker.init() + all_processes = ray.services.all_processes + processes = (all_processes[ray.services.PROCESS_TYPE_PLASMA_STORE] + + all_processes[ray.services.PROCESS_TYPE_PLASMA_MANAGER] + + all_processes[ray.services.PROCESS_TYPE_LOCAL_SCHEDULER] + + all_processes[ray.services.PROCESS_TYPE_GLOBAL_SCHEDULER] + + all_processes[ray.services.PROCESS_TYPE_RAYLET]) + # Kill all the components sequentially. + for process in processes: + process.terminate() time.sleep(0.1) - for process in processes: - process.kill() + process.kill() + process.wait() - for process in processes: - process.wait() - - # If the driver can reach the tearDown method, then it is still alive. + ray.shutdown() + # If the driver can reach the tearDown method, then it is still alive. -if __name__ == "__main__": - unittest.main(verbosity=2) +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_driver_lives_parallel(): + ray.worker.init() + all_processes = ray.services.all_processes + processes = (all_processes[ray.services.PROCESS_TYPE_PLASMA_STORE] + + all_processes[ray.services.PROCESS_TYPE_PLASMA_MANAGER] + + all_processes[ray.services.PROCESS_TYPE_LOCAL_SCHEDULER] + + all_processes[ray.services.PROCESS_TYPE_GLOBAL_SCHEDULER] + + all_processes[ray.services.PROCESS_TYPE_RAYLET]) + + # Kill all the components in parallel. + for process in processes: + process.terminate() + + time.sleep(0.1) + for process in processes: + process.kill() + + for process in processes: + process.wait() + + # If the driver can reach the tearDown method, then it is still alive. + ray.shutdown() diff --git a/test/credis_test.py b/test/credis_test.py index 9796f0d56..316da135e 100644 --- a/test/credis_test.py +++ b/test/credis_test.py @@ -13,7 +13,7 @@ def parse_client(addr_port_str): return redis.StrictRedis(host=redis_address, port=redis_port) -@unittest.skipIf(not os.environ.get('RAY_USE_NEW_GCS', False), +@unittest.skipIf(not os.environ.get("RAY_USE_NEW_GCS", False), "Tests functionality of the new GCS.") class CredisTest(unittest.TestCase): def setUp(self): diff --git a/test/failure_test.py b/test/failure_test.py index f8604016f..e438771d6 100644 --- a/test/failure_test.py +++ b/test/failure_test.py @@ -8,7 +8,6 @@ import ray import sys import tempfile import time -import unittest import ray.ray_constants as ray_constants import pytest @@ -27,481 +26,457 @@ def wait_for_errors(error_type, num_errors, timeout=10): raise Exception("Timing out of wait.") -class TaskStatusTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() +@pytest.fixture +def ray_start_regular(): + # Start the Ray processes. + ray.init(num_cpus=2) + yield None + # The code after the yield will run as teardown code. + ray.shutdown() - def testFailedTask(self): - @ray.remote - def throw_exception_fct1(): - raise Exception("Test function 1 intentionally failed.") - @ray.remote - def throw_exception_fct2(): - raise Exception("Test function 2 intentionally failed.") +def test_failed_task(ray_start_regular): + @ray.remote + def throw_exception_fct1(): + raise Exception("Test function 1 intentionally failed.") - @ray.remote(num_return_vals=3) - def throw_exception_fct3(x): - raise Exception("Test function 3 intentionally failed.") + @ray.remote + def throw_exception_fct2(): + raise Exception("Test function 2 intentionally failed.") - ray.init(num_workers=3) + @ray.remote(num_return_vals=3) + def throw_exception_fct3(x): + raise Exception("Test function 3 intentionally failed.") - throw_exception_fct1.remote() - throw_exception_fct1.remote() - wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2) - assert len(relevant_errors(ray_constants.TASK_PUSH_ERROR)) == 2 - for task in relevant_errors(ray_constants.TASK_PUSH_ERROR): - msg = task.get("message") - assert "Test function 1 intentionally failed." in msg + throw_exception_fct1.remote() + throw_exception_fct1.remote() + wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2) + assert len(relevant_errors(ray_constants.TASK_PUSH_ERROR)) == 2 + for task in relevant_errors(ray_constants.TASK_PUSH_ERROR): + msg = task.get("message") + assert "Test function 1 intentionally failed." in msg - x = throw_exception_fct2.remote() + x = throw_exception_fct2.remote() + try: + ray.get(x) + except Exception as e: + assert "Test function 2 intentionally failed." in str(e) + else: + # ray.get should throw an exception. + assert False + + x, y, z = throw_exception_fct3.remote(1.0) + for ref in [x, y, z]: try: - ray.get(x) + ray.get(ref) except Exception as e: - assert "Test function 2 intentionally failed." in str(e) + assert "Test function 3 intentionally failed." in str(e) else: # ray.get should throw an exception. assert False - x, y, z = throw_exception_fct3.remote(1.0) - for ref in [x, y, z]: - try: - ray.get(ref) - except Exception as e: - assert "Test function 3 intentionally failed." in str(e) - else: - # ray.get should throw an exception. - assert False + @ray.remote + def f(): + raise Exception("This function failed.") - @ray.remote - def f(): - raise Exception("This function failed.") + try: + ray.get(f.remote()) + except Exception as e: + assert "This function failed." in str(e) + else: + # ray.get should throw an exception. + assert False - try: - ray.get(f.remote()) - except Exception as e: - assert "This function failed." in str(e) - else: - # ray.get should throw an exception. - assert False - def testFailImportingRemoteFunction(self): - ray.init(num_workers=2) - - # Create the contents of a temporary Python file. - temporary_python_file = """ +def test_fail_importing_remote_function(ray_start_regular): + # Create the contents of a temporary Python file. + temporary_python_file = """ def temporary_helper_function(): return 1 """ - f = tempfile.NamedTemporaryFile(suffix=".py") - f.write(temporary_python_file.encode("ascii")) - f.flush() - directory = os.path.dirname(f.name) - # Get the module name and strip ".py" from the end. - module_name = os.path.basename(f.name)[:-3] - sys.path.append(directory) - module = __import__(module_name) + f = tempfile.NamedTemporaryFile(suffix=".py") + f.write(temporary_python_file.encode("ascii")) + f.flush() + directory = os.path.dirname(f.name) + # Get the module name and strip ".py" from the end. + module_name = os.path.basename(f.name)[:-3] + sys.path.append(directory) + module = __import__(module_name) - # Define a function that closes over this temporary module. This should - # fail when it is unpickled. - @ray.remote - def g(): - return module.temporary_python_file() + # Define a function that closes over this temporary module. This should + # fail when it is unpickled. + @ray.remote + def g(): + return module.temporary_python_file() - wait_for_errors(ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR, 2) - assert "No module named" in ray.error_info()[0]["message"] - assert "No module named" in ray.error_info()[1]["message"] + wait_for_errors(ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR, 2) + assert "No module named" in ray.error_info()[0]["message"] + assert "No module named" in ray.error_info()[1]["message"] - # Check that if we try to call the function it throws an exception and - # does not hang. - for _ in range(10): - with pytest.raises(Exception): - ray.get(g.remote()) + # Check that if we try to call the function it throws an exception and + # does not hang. + for _ in range(10): + with pytest.raises(Exception): + ray.get(g.remote()) - f.close() + f.close() - # Clean up the junk we added to sys.path. - sys.path.pop(-1) + # Clean up the junk we added to sys.path. + sys.path.pop(-1) - def testFailedFunctionToRun(self): - ray.init(num_workers=2) - def f(worker): - if ray.worker.global_worker.mode == ray.WORKER_MODE: - raise Exception("Function to run failed.") +def test_failed_function_to_run(ray_start_regular): + def f(worker): + if ray.worker.global_worker.mode == ray.WORKER_MODE: + raise Exception("Function to run failed.") - ray.worker.global_worker.run_function_on_all_workers(f) - wait_for_errors(ray_constants.FUNCTION_TO_RUN_PUSH_ERROR, 2) - # Check that the error message is in the task info. - error_info = ray.error_info() - assert len(error_info) == 2 - assert "Function to run failed." in error_info[0]["message"] - assert "Function to run failed." in error_info[1]["message"] + ray.worker.global_worker.run_function_on_all_workers(f) + wait_for_errors(ray_constants.FUNCTION_TO_RUN_PUSH_ERROR, 2) + # Check that the error message is in the task info. + error_info = ray.error_info() + assert len(error_info) == 2 + assert "Function to run failed." in error_info[0]["message"] + assert "Function to run failed." in error_info[1]["message"] - def testFailImportingActor(self): - ray.init(num_workers=2) - # Create the contents of a temporary Python file. - temporary_python_file = """ +def test_fail_importing_actor(ray_start_regular): + # Create the contents of a temporary Python file. + temporary_python_file = """ def temporary_helper_function(): return 1 """ - f = tempfile.NamedTemporaryFile(suffix=".py") - f.write(temporary_python_file.encode("ascii")) - f.flush() - directory = os.path.dirname(f.name) - # Get the module name and strip ".py" from the end. - module_name = os.path.basename(f.name)[:-3] - sys.path.append(directory) - module = __import__(module_name) + f = tempfile.NamedTemporaryFile(suffix=".py") + f.write(temporary_python_file.encode("ascii")) + f.flush() + directory = os.path.dirname(f.name) + # Get the module name and strip ".py" from the end. + module_name = os.path.basename(f.name)[:-3] + sys.path.append(directory) + module = __import__(module_name) - # Define an actor that closes over this temporary module. This should - # fail when it is unpickled. - @ray.remote - class Foo(object): - def __init__(self): - self.x = module.temporary_python_file() + # Define an actor that closes over this temporary module. This should + # fail when it is unpickled. + @ray.remote + class Foo(object): + def __init__(self): + self.x = module.temporary_python_file() - def get_val(self): - return 1 + def get_val(self): + return 1 - # There should be no errors yet. - assert len(ray.error_info()) == 0 + # There should be no errors yet. + assert len(ray.error_info()) == 0 - # Create an actor. - foo = Foo.remote() + # Create an actor. + foo = Foo.remote() - # Wait for the error to arrive. - wait_for_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR, 1) - assert "No module named" in ray.error_info()[0]["message"] + # Wait for the error to arrive. + wait_for_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR, 1) + assert "No module named" in ray.error_info()[0]["message"] - # Wait for the error from when the __init__ tries to run. - wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1) - assert ("failed to be imported, and so cannot execute this method" in - ray.error_info()[1]["message"]) + # Wait for the error from when the __init__ tries to run. + wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1) + assert ("failed to be imported, and so cannot execute this method" in + ray.error_info()[1]["message"]) - # Check that if we try to get the function it throws an exception and - # does not hang. - with pytest.raises(Exception): - ray.get(foo.get_val.remote()) + # Check that if we try to get the function it throws an exception and + # does not hang. + with pytest.raises(Exception): + ray.get(foo.get_val.remote()) - # Wait for the error from when the call to get_val. - wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2) - assert ("failed to be imported, and so cannot execute this method" in - ray.error_info()[2]["message"]) + # Wait for the error from when the call to get_val. + wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2) + assert ("failed to be imported, and so cannot execute this method" in + ray.error_info()[2]["message"]) - f.close() + f.close() - # Clean up the junk we added to sys.path. - sys.path.pop(-1) + # Clean up the junk we added to sys.path. + sys.path.pop(-1) -class ActorTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() +def test_failed_actor_init(ray_start_regular): + error_message1 = "actor constructor failed" + error_message2 = "actor method failed" - def testFailedActorInit(self): - ray.init(num_workers=0) + @ray.remote + class FailedActor(object): + def __init__(self): + raise Exception(error_message1) - error_message1 = "actor constructor failed" - error_message2 = "actor method failed" + def get_val(self): + return 1 - @ray.remote - class FailedActor(object): - def __init__(self): - raise Exception(error_message1) + def fail_method(self): + raise Exception(error_message2) - def get_val(self): - return 1 + a = FailedActor.remote() - def fail_method(self): - raise Exception(error_message2) + # Make sure that we get errors from a failed constructor. + wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1) + assert len(ray.error_info()) == 1 + assert error_message1 in ray.error_info()[0]["message"] - a = FailedActor.remote() - - # Make sure that we get errors from a failed constructor. - wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1) - assert len(ray.error_info()) == 1 - assert error_message1 in ray.error_info()[0]["message"] - - # Make sure that we get errors from a failed method. - a.fail_method.remote() - wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2) - assert len(ray.error_info()) == 2 - assert error_message2 in ray.error_info()[1]["message"] - - def testIncorrectMethodCalls(self): - ray.init(num_workers=0) - - @ray.remote - class Actor(object): - def __init__(self, missing_variable_name): - pass - - def get_val(self, x): - pass - - # Make sure that we get errors if we call the constructor incorrectly. - - # Create an actor with too few arguments. - with pytest.raises(Exception): - a = Actor.remote() - - # Create an actor with too many arguments. - with pytest.raises(Exception): - a = Actor.remote(1, 2) - - # Create an actor the correct number of arguments. - a = Actor.remote(1) - - # Call a method with too few arguments. - with pytest.raises(Exception): - a.get_val.remote() - - # Call a method with too many arguments. - with pytest.raises(Exception): - a.get_val.remote(1, 2) - # Call a method that doesn't exist. - with pytest.raises(AttributeError): - a.nonexistent_method() - with pytest.raises(AttributeError): - a.nonexistent_method.remote() + # Make sure that we get errors from a failed method. + a.fail_method.remote() + wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2) + assert len(ray.error_info()) == 2 + assert error_message2 in ray.error_info()[1]["message"] -class WorkerDeath(unittest.TestCase): - def tearDown(self): - ray.shutdown() - - def testWorkerRaisingException(self): - ray.init(num_workers=1) - - @ray.remote - def f(): - ray.worker.global_worker._get_next_task_from_local_scheduler = None - - # Running this task should cause the worker to raise an exception after - # the task has successfully completed. - f.remote() - - wait_for_errors(ray_constants.WORKER_CRASH_PUSH_ERROR, 1) - wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1) - assert len(ray.error_info()) == 2 - - def testWorkerDying(self): - ray.init(num_workers=0) - - # Define a remote function that will kill the worker that runs it. - @ray.remote - def f(): - eval("exit()") - - f.remote() - - wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1) - - error_info = ray.error_info() - assert len(error_info) == 1 - assert "died or was killed while executing" in error_info[0]["message"] - - def testActorWorkerDying(self): - ray.init(num_workers=0) - - @ray.remote - class Actor(object): - def kill(self): - eval("exit()") - - @ray.remote - def consume(x): +def test_incorrect_method_calls(ray_start_regular): + @ray.remote + class Actor(object): + def __init__(self, missing_variable_name): pass + def get_val(self, x): + pass + + # Make sure that we get errors if we call the constructor incorrectly. + + # Create an actor with too few arguments. + with pytest.raises(Exception): a = Actor.remote() - [obj], _ = ray.wait([a.kill.remote()], timeout=5000) + + # Create an actor with too many arguments. + with pytest.raises(Exception): + a = Actor.remote(1, 2) + + # Create an actor the correct number of arguments. + a = Actor.remote(1) + + # Call a method with too few arguments. + with pytest.raises(Exception): + a.get_val.remote() + + # Call a method with too many arguments. + with pytest.raises(Exception): + a.get_val.remote(1, 2) + # Call a method that doesn't exist. + with pytest.raises(AttributeError): + a.nonexistent_method() + with pytest.raises(AttributeError): + a.nonexistent_method.remote() + + +def test_worker_raising_exception(ray_start_regular): + @ray.remote + def f(): + ray.worker.global_worker._get_next_task_from_local_scheduler = None + + # Running this task should cause the worker to raise an exception after + # the task has successfully completed. + f.remote() + + wait_for_errors(ray_constants.WORKER_CRASH_PUSH_ERROR, 1) + wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1) + assert len(ray.error_info()) == 2 + + +def test_worker_dying(ray_start_regular): + # Define a remote function that will kill the worker that runs it. + @ray.remote + def f(): + eval("exit()") + + f.remote() + + wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1) + + error_info = ray.error_info() + assert len(error_info) == 1 + assert "died or was killed while executing" in error_info[0]["message"] + + +def test_actor_worker_dying(ray_start_regular): + @ray.remote + class Actor(object): + def kill(self): + eval("exit()") + + @ray.remote + def consume(x): + pass + + a = Actor.remote() + [obj], _ = ray.wait([a.kill.remote()], timeout=5000) + with pytest.raises(Exception): + ray.get(obj) + with pytest.raises(Exception): + ray.get(consume.remote(obj)) + wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1) + + +def test_actor_worker_dying_future_tasks(ray_start_regular): + @ray.remote + class Actor(object): + def getpid(self): + return os.getpid() + + def sleep(self): + time.sleep(1) + + a = Actor.remote() + pid = ray.get(a.getpid.remote()) + tasks1 = [a.sleep.remote() for _ in range(10)] + os.kill(pid, 9) + time.sleep(0.1) + tasks2 = [a.sleep.remote() for _ in range(10)] + for obj in tasks1 + tasks2: with pytest.raises(Exception): ray.get(obj) - with pytest.raises(Exception): - ray.get(consume.remote(obj)) - wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1) - def testActorWorkerDyingFutureTasks(self): - ray.init(num_workers=0) - - @ray.remote - class Actor(object): - def getpid(self): - return os.getpid() - - def sleep(self): - time.sleep(1) - - a = Actor.remote() - pid = ray.get(a.getpid.remote()) - tasks1 = [a.sleep.remote() for _ in range(10)] - os.kill(pid, 9) - time.sleep(0.1) - tasks2 = [a.sleep.remote() for _ in range(10)] - for obj in tasks1 + tasks2: - with pytest.raises(Exception): - ray.get(obj) - - wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1) - - def testActorWorkerDyingNothingInProgress(self): - ray.init(num_workers=0) - - @ray.remote - class Actor(object): - def getpid(self): - return os.getpid() - - a = Actor.remote() - pid = ray.get(a.getpid.remote()) - os.kill(pid, 9) - time.sleep(0.1) - task2 = a.getpid.remote() - with pytest.raises(Exception): - ray.get(task2) + wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1) -@unittest.skipIf( +def test_actor_worker_dying_nothing_in_progress(ray_start_regular): + @ray.remote + class Actor(object): + def getpid(self): + return os.getpid() + + a = Actor.remote() + pid = ray.get(a.getpid.remote()) + os.kill(pid, 9) + time.sleep(0.1) + task2 = a.getpid.remote() + with pytest.raises(Exception): + ray.get(task2) + + +@pytest.fixture +def ray_start_object_store_memory(): + # Start the Ray processes. + store_size = 10**6 + ray.init(num_cpus=1, object_store_memory=store_size) + yield None + # The code after the yield will run as teardown code. + ray.shutdown() + + +@pytest.mark.skipif( os.environ.get("RAY_USE_XRAY") == "1", - "This test does not work with xray yet.") -class PutErrorTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() + reason="This test does not work with xray yet.") +def test_put_error1(ray_start_object_store_memory): + num_objects = 3 + object_size = 4 * 10**5 - def testPutError1(self): - store_size = 10**6 - ray.worker._init(start_ray_local=True, object_store_memory=store_size) + # Define a task with a single dependency, a numpy array, that returns + # another array. + @ray.remote + def single_dependency(i, arg): + arg = np.copy(arg) + arg[0] = i + return arg - num_objects = 3 - object_size = 4 * 10**5 + @ray.remote + def put_arg_task(): + # Launch num_objects instances of the remote task, each dependent + # on the one before it. The result of the first task should get + # evicted. + args = [] + arg = single_dependency.remote(0, np.zeros( + object_size, dtype=np.uint8)) + for i in range(num_objects): + arg = single_dependency.remote(i, arg) + args.append(arg) - # Define a task with a single dependency, a numpy array, that returns - # another array. - @ray.remote - def single_dependency(i, arg): - arg = np.copy(arg) - arg[0] = i - return arg + # Get the last value to force all tasks to finish. + value = ray.get(args[-1]) + assert value[0] == i - @ray.remote - def put_arg_task(): - # Launch num_objects instances of the remote task, each dependent - # on the one before it. The result of the first task should get - # evicted. - args = [] - arg = single_dependency.remote( - 0, np.zeros(object_size, dtype=np.uint8)) - for i in range(num_objects): - arg = single_dependency.remote(i, arg) - args.append(arg) + # Get the first value (which should have been evicted) to force + # reconstruction. Currently, since we're not able to reconstruct + # `ray.put` objects that were evicted and whose originating tasks + # are still running, this for-loop should hang and push an error to + # the driver. + ray.get(args[0]) - # Get the last value to force all tasks to finish. - value = ray.get(args[-1]) - assert value[0] == i + put_arg_task.remote() - # Get the first value (which should have been evicted) to force - # reconstruction. Currently, since we're not able to reconstruct - # `ray.put` objects that were evicted and whose originating tasks - # are still running, this for-loop should hang and push an error to - # the driver. - ray.get(args[0]) - - put_arg_task.remote() - - # Make sure we receive the correct error message. - wait_for_errors(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1) - - def testPutError2(self): - # This is the same as the previous test, but it calls ray.put directly. - store_size = 10**6 - ray.worker._init(start_ray_local=True, object_store_memory=store_size) - - num_objects = 3 - object_size = 4 * 10**5 - - # Define a task with a single dependency, a numpy array, that returns - # another array. - @ray.remote - def single_dependency(i, arg): - arg = np.copy(arg) - arg[0] = i - return arg - - @ray.remote - def put_task(): - # Launch num_objects instances of the remote task, each dependent - # on the one before it. The result of the first task should get - # evicted. - args = [] - arg = ray.put(np.zeros(object_size, dtype=np.uint8)) - for i in range(num_objects): - arg = single_dependency.remote(i, arg) - args.append(arg) - - # Get the last value to force all tasks to finish. - value = ray.get(args[-1]) - assert value[0] == i - - # Get the first value (which should have been evicted) to force - # reconstruction. Currently, since we're not able to reconstruct - # `ray.put` objects that were evicted and whose originating tasks - # are still running, this for-loop should hang and push an error to - # the driver. - ray.get(args[0]) - - put_task.remote() - - # Make sure we receive the correct error message. - wait_for_errors(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1) + # Make sure we receive the correct error message. + wait_for_errors(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1) -class ConfigurationTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() +@pytest.mark.skipif( + os.environ.get("RAY_USE_XRAY") == "1", + reason="This test does not work with xray yet.") +def test_put_error2(ray_start_object_store_memory): + # This is the same as the previous test, but it calls ray.put directly. + num_objects = 3 + object_size = 4 * 10**5 - def testVersionMismatch(self): - ray_version = ray.__version__ - ray.__version__ = "fake ray version" + # Define a task with a single dependency, a numpy array, that returns + # another array. + @ray.remote + def single_dependency(i, arg): + arg = np.copy(arg) + arg[0] = i + return arg - ray.init(num_workers=1) + @ray.remote + def put_task(): + # Launch num_objects instances of the remote task, each dependent + # on the one before it. The result of the first task should get + # evicted. + args = [] + arg = ray.put(np.zeros(object_size, dtype=np.uint8)) + for i in range(num_objects): + arg = single_dependency.remote(i, arg) + args.append(arg) - wait_for_errors(ray_constants.VERSION_MISMATCH_PUSH_ERROR, 1) + # Get the last value to force all tasks to finish. + value = ray.get(args[-1]) + assert value[0] == i - ray.__version__ = ray_version + # Get the first value (which should have been evicted) to force + # reconstruction. Currently, since we're not able to reconstruct + # `ray.put` objects that were evicted and whose originating tasks + # are still running, this for-loop should hang and push an error to + # the driver. + ray.get(args[0]) + + put_task.remote() + + # Make sure we receive the correct error message. + wait_for_errors(ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR, 1) -class WarningTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() +def test_version_mismatch(): + ray_version = ray.__version__ + ray.__version__ = "fake ray version" - def testExportLargeObjects(self): - import ray.ray_constants as ray_constants + ray.init(num_cpus=1) - ray.init(num_workers=1) + wait_for_errors(ray_constants.VERSION_MISMATCH_PUSH_ERROR, 1) - large_object = np.zeros(2 * ray_constants.PICKLE_OBJECT_WARNING_SIZE) + # Reset the version. + ray.__version__ = ray_version - @ray.remote - def f(): + ray.shutdown() + + +def test_export_large_objects(ray_start_regular): + import ray.ray_constants as ray_constants + + large_object = np.zeros(2 * ray_constants.PICKLE_OBJECT_WARNING_SIZE) + + @ray.remote + def f(): + large_object + + # Make sure that a warning is generated. + wait_for_errors(ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, 1) + + @ray.remote + class Foo(object): + def __init__(self): large_object - # Make sure that a warning is generated. - wait_for_errors(ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, 1) + Foo.remote() - @ray.remote - class Foo(object): - def __init__(self): - large_object - - Foo.remote() - - # Make sure that a warning is generated. - wait_for_errors(ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, 2) - - -if __name__ == "__main__": - unittest.main(verbosity=2) + # Make sure that a warning is generated. + wait_for_errors(ray_constants.PICKLING_LARGE_OBJECT_PUSH_ERROR, 2) diff --git a/test/microbenchmarks.py b/test/microbenchmarks.py index 5cfc5ce40..6bbbc48bb 100644 --- a/test/microbenchmarks.py +++ b/test/microbenchmarks.py @@ -2,120 +2,118 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import pytest import os -import unittest import ray import time import numpy as np -class MicroBenchmarkTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() - - def testTiming(self): - @ray.remote - def empty_function(): - pass - - @ray.remote - def trivial_function(): - return 1 - - ray.init(num_workers=3) - - # Measure the time required to submit a remote task to the scheduler. - elapsed_times = [] - for _ in range(1000): - start_time = time.time() - empty_function.remote() - end_time = time.time() - elapsed_times.append(end_time - start_time) - elapsed_times = np.sort(elapsed_times) - average_elapsed_time = sum(elapsed_times) / 1000 - print("Time required to submit an empty function call:") - print(" Average: {}".format(average_elapsed_time)) - print(" 90th percentile: {}".format(elapsed_times[900])) - print(" 99th percentile: {}".format(elapsed_times[990])) - print(" worst: {}".format(elapsed_times[999])) - # average_elapsed_time should be about 0.00038. - - # Measure the time required to submit a remote task to the scheduler - # (where the remote task returns one value). - elapsed_times = [] - for _ in range(1000): - start_time = time.time() - trivial_function.remote() - end_time = time.time() - elapsed_times.append(end_time - start_time) - elapsed_times = np.sort(elapsed_times) - average_elapsed_time = sum(elapsed_times) / 1000 - print("Time required to submit a trivial function call:") - print(" Average: {}".format(average_elapsed_time)) - print(" 90th percentile: {}".format(elapsed_times[900])) - print(" 99th percentile: {}".format(elapsed_times[990])) - print(" worst: {}".format(elapsed_times[999])) - # average_elapsed_time should be about 0.001. - - # Measure the time required to submit a remote task to the scheduler - # and get the result. - elapsed_times = [] - for _ in range(1000): - start_time = time.time() - x = trivial_function.remote() - ray.get(x) - end_time = time.time() - elapsed_times.append(end_time - start_time) - elapsed_times = np.sort(elapsed_times) - average_elapsed_time = sum(elapsed_times) / 1000 - print("Time required to submit a trivial function call and get the " - "result:") - print(" Average: {}".format(average_elapsed_time)) - print(" 90th percentile: {}".format(elapsed_times[900])) - print(" 99th percentile: {}".format(elapsed_times[990])) - print(" worst: {}".format(elapsed_times[999])) - # average_elapsed_time should be about 0.0013. - - # Measure the time required to do do a put. - elapsed_times = [] - for _ in range(1000): - start_time = time.time() - ray.put(1) - end_time = time.time() - elapsed_times.append(end_time - start_time) - elapsed_times = np.sort(elapsed_times) - average_elapsed_time = sum(elapsed_times) / 1000 - print("Time required to put an int:") - print(" Average: {}".format(average_elapsed_time)) - print(" 90th percentile: {}".format(elapsed_times[900])) - print(" 99th percentile: {}".format(elapsed_times[990])) - print(" worst: {}".format(elapsed_times[999])) - # average_elapsed_time should be about 0.00087. - - def testCache(self): - ray.init(num_workers=1) - - A = np.random.rand(1, 1000000) - v = np.random.rand(1000000) - A_id = ray.put(A) - v_id = ray.put(v) - a = time.time() - for i in range(100): - A.dot(v) - b = time.time() - a - c = time.time() - for i in range(100): - ray.get(A_id).dot(ray.get(v_id)) - d = time.time() - c - - if d > 1.5 * b: - if os.getenv("TRAVIS") is None: - raise Exception("The caching test was too slow. " - "d = {}, b = {}".format(d, b)) - else: - print("WARNING: The caching test was too slow. " - "d = {}, b = {}".format(d, b)) +@pytest.fixture +def ray_start_regular(): + # Start the Ray processes. + ray.init(num_cpus=3) + yield None + # The code after the yield will run as teardown code. + ray.shutdown() -if __name__ == "__main__": - unittest.main(verbosity=2) +def test_timing(ray_start_regular): + @ray.remote + def empty_function(): + pass + + @ray.remote + def trivial_function(): + return 1 + + # Measure the time required to submit a remote task to the scheduler. + elapsed_times = [] + for _ in range(1000): + start_time = time.time() + empty_function.remote() + end_time = time.time() + elapsed_times.append(end_time - start_time) + elapsed_times = np.sort(elapsed_times) + average_elapsed_time = sum(elapsed_times) / 1000 + print("Time required to submit an empty function call:") + print(" Average: {}".format(average_elapsed_time)) + print(" 90th percentile: {}".format(elapsed_times[900])) + print(" 99th percentile: {}".format(elapsed_times[990])) + print(" worst: {}".format(elapsed_times[999])) + # average_elapsed_time should be about 0.00038. + + # Measure the time required to submit a remote task to the scheduler + # (where the remote task returns one value). + elapsed_times = [] + for _ in range(1000): + start_time = time.time() + trivial_function.remote() + end_time = time.time() + elapsed_times.append(end_time - start_time) + elapsed_times = np.sort(elapsed_times) + average_elapsed_time = sum(elapsed_times) / 1000 + print("Time required to submit a trivial function call:") + print(" Average: {}".format(average_elapsed_time)) + print(" 90th percentile: {}".format(elapsed_times[900])) + print(" 99th percentile: {}".format(elapsed_times[990])) + print(" worst: {}".format(elapsed_times[999])) + # average_elapsed_time should be about 0.001. + + # Measure the time required to submit a remote task to the scheduler + # and get the result. + elapsed_times = [] + for _ in range(1000): + start_time = time.time() + x = trivial_function.remote() + ray.get(x) + end_time = time.time() + elapsed_times.append(end_time - start_time) + elapsed_times = np.sort(elapsed_times) + average_elapsed_time = sum(elapsed_times) / 1000 + print("Time required to submit a trivial function call and get the " + "result:") + print(" Average: {}".format(average_elapsed_time)) + print(" 90th percentile: {}".format(elapsed_times[900])) + print(" 99th percentile: {}".format(elapsed_times[990])) + print(" worst: {}".format(elapsed_times[999])) + # average_elapsed_time should be about 0.0013. + + # Measure the time required to do do a put. + elapsed_times = [] + for _ in range(1000): + start_time = time.time() + ray.put(1) + end_time = time.time() + elapsed_times.append(end_time - start_time) + elapsed_times = np.sort(elapsed_times) + average_elapsed_time = sum(elapsed_times) / 1000 + print("Time required to put an int:") + print(" Average: {}".format(average_elapsed_time)) + print(" 90th percentile: {}".format(elapsed_times[900])) + print(" 99th percentile: {}".format(elapsed_times[990])) + print(" worst: {}".format(elapsed_times[999])) + # average_elapsed_time should be about 0.00087. + + +def test_cache(ray_start_regular): + A = np.random.rand(1, 1000000) + v = np.random.rand(1000000) + A_id = ray.put(A) + v_id = ray.put(v) + a = time.time() + for i in range(100): + A.dot(v) + b = time.time() - a + c = time.time() + for i in range(100): + ray.get(A_id).dot(ray.get(v_id)) + d = time.time() - c + + if d > 1.5 * b: + if os.getenv("TRAVIS") is None: + raise Exception("The caching test was too slow. " + "d = {}, b = {}".format(d, b)) + else: + print("WARNING: The caching test was too slow. " + "d = {}, b = {}".format(d, b)) diff --git a/test/multi_node_test.py b/test/multi_node_test.py index abd42bfeb..a30bd2919 100644 --- a/test/multi_node_test.py +++ b/test/multi_node_test.py @@ -1,15 +1,16 @@ -from __future__ import absolute_import, division, print_function +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function import os +import pytest import subprocess import sys import tempfile import time -import unittest import ray from ray.test.test_utils import run_and_get_output -import pytest def run_string_as_driver(driver_script): @@ -30,52 +31,56 @@ def run_string_as_driver(driver_script): return out -class MultiNodeTest(unittest.TestCase): - def setUp(self): - out = run_and_get_output(["ray", "start", "--head"]) - # Get the redis address from the output. - redis_substring_prefix = "redis_address=\"" - redis_address_location = ( - out.find(redis_substring_prefix) + len(redis_substring_prefix)) - redis_address = out[redis_address_location:] - self.redis_address = redis_address.split("\"")[0] +@pytest.fixture +def ray_start_head(): + out = run_and_get_output(["ray", "start", "--head"]) + # Get the redis address from the output. + redis_substring_prefix = "redis_address=\"" + redis_address_location = ( + out.find(redis_substring_prefix) + len(redis_substring_prefix)) + redis_address = out[redis_address_location:] + redis_address = redis_address.split("\"")[0] - def tearDown(self): - ray.shutdown() - # Kill the Ray cluster. - subprocess.Popen(["ray", "stop"]).wait() + yield redis_address - def testErrorIsolation(self): - # Connect a driver to the Ray cluster. - ray.init(redis_address=self.redis_address) + # Disconnect from the Ray cluster. + ray.shutdown() + # Kill the Ray cluster. + subprocess.Popen(["ray", "stop"]).wait() - # There shouldn't be any errors yet. - assert len(ray.error_info()) == 0 - error_string1 = "error_string1" - error_string2 = "error_string2" +def test_error_isolation(ray_start_head): + redis_address = ray_start_head + # Connect a driver to the Ray cluster. + ray.init(redis_address=redis_address) - @ray.remote - def f(): - raise Exception(error_string1) + # There shouldn't be any errors yet. + assert len(ray.error_info()) == 0 - # Run a remote function that throws an error. - with pytest.raises(Exception): - ray.get(f.remote()) + error_string1 = "error_string1" + error_string2 = "error_string2" - # Wait for the error to appear in Redis. - while len(ray.error_info()) != 1: - time.sleep(0.1) - print("Waiting for error to appear.") + @ray.remote + def f(): + raise Exception(error_string1) - # Make sure we got the error. - assert len(ray.error_info()) == 1 - assert error_string1 in ray.error_info()[0]["message"] + # Run a remote function that throws an error. + with pytest.raises(Exception): + ray.get(f.remote()) - # Start another driver and make sure that it does not receive this - # error. Make the other driver throw an error, and make sure it - # receives that error. - driver_script = """ + # Wait for the error to appear in Redis. + while len(ray.error_info()) != 1: + time.sleep(0.1) + print("Waiting for error to appear.") + + # Make sure we got the error. + assert len(ray.error_info()) == 1 + assert error_string1 in ray.error_info()[0]["message"] + + # Start another driver and make sure that it does not receive this + # error. Make the other driver throw an error, and make sure it + # receives that error. + driver_script = """ import ray import time @@ -101,25 +106,28 @@ assert len(ray.error_info()) == 1 assert "{}" in ray.error_info()[0]["message"] print("success") -""".format(self.redis_address, error_string2, error_string2) +""".format(redis_address, error_string2, error_string2) - out = run_string_as_driver(driver_script) - # Make sure the other driver succeeded. - assert "success" in out + out = run_string_as_driver(driver_script) + # Make sure the other driver succeeded. + assert "success" in out - # Make sure that the other error message doesn't show up for this - # driver. - assert len(ray.error_info()) == 1 - assert error_string1 in ray.error_info()[0]["message"] + # Make sure that the other error message doesn't show up for this + # driver. + assert len(ray.error_info()) == 1 + assert error_string1 in ray.error_info()[0]["message"] - def testRemoteFunctionIsolation(self): - # This test will run multiple remote functions with the same names in - # two different drivers. Connect a driver to the Ray cluster. - ray.init(redis_address=self.redis_address) - # Start another driver and make sure that it can define and call its - # own commands with the same names. - driver_script = """ +def test_remote_function_isolation(ray_start_head): + # This test will run multiple remote functions with the same names in + # two different drivers. Connect a driver to the Ray cluster. + redis_address = ray_start_head + + ray.init(redis_address=redis_address) + + # Start another driver and make sure that it can define and call its + # own commands with the same names. + driver_script = """ import ray import time ray.init(redis_address="{}") @@ -133,32 +141,35 @@ for _ in range(10000): result = ray.get([f.remote(), g.remote(0, 0)]) assert result == [3, 4] print("success") -""".format(self.redis_address) +""".format(redis_address) - out = run_string_as_driver(driver_script) + out = run_string_as_driver(driver_script) - @ray.remote - def f(): - return 1 + @ray.remote + def f(): + return 1 - @ray.remote - def g(x): - return 2 + @ray.remote + def g(x): + return 2 - for _ in range(10000): - result = ray.get([f.remote(), g.remote(0)]) - assert result == [1, 2] + for _ in range(10000): + result = ray.get([f.remote(), g.remote(0)]) + assert result == [1, 2] - # Make sure the other driver succeeded. - assert "success" in out + # Make sure the other driver succeeded. + assert "success" in out - def testDriverExitingQuickly(self): - # This test will create some drivers that submit some tasks and then - # exit without waiting for the tasks to complete. - ray.init(redis_address=self.redis_address) - # Define a driver that creates an actor and exits. - driver_script1 = """ +def test_driver_exiting_quickly(ray_start_head): + # This test will create some drivers that submit some tasks and then + # exit without waiting for the tasks to complete. + redis_address = ray_start_head + + ray.init(redis_address=redis_address) + + # Define a driver that creates an actor and exits. + driver_script1 = """ import ray ray.init(redis_address="{}") @ray.remote @@ -167,10 +178,10 @@ class Foo(object): pass Foo.remote() print("success") -""".format(self.redis_address) +""".format(redis_address) - # Define a driver that creates some tasks and exits. - driver_script2 = """ + # Define a driver that creates some tasks and exits. + driver_script2 = """ import ray ray.init(redis_address="{}") @ray.remote @@ -178,137 +189,142 @@ def f(): return 1 f.remote() print("success") -""".format(self.redis_address) +""".format(redis_address) - # Create some drivers and let them exit and make sure everything is - # still alive. - for _ in range(3): - out = run_string_as_driver(driver_script1) - # Make sure the first driver ran to completion. - assert "success" in out - out = run_string_as_driver(driver_script2) - # Make sure the first driver ran to completion. - assert "success" in out - assert ray.services.all_processes_alive() + # Create some drivers and let them exit and make sure everything is + # still alive. + for _ in range(3): + out = run_string_as_driver(driver_script1) + # Make sure the first driver ran to completion. + assert "success" in out + out = run_string_as_driver(driver_script2) + # Make sure the first driver ran to completion. + assert "success" in out + assert ray.services.all_processes_alive() -class StartRayScriptTest(unittest.TestCase): - def testCallingStartRayHead(self): - # Test that we can call start-ray.sh with various command line - # parameters. TODO(rkn): This test only tests the --head code path. We - # should also test the non-head node code path. +def test_calling_start_ray_head(): + # Test that we can call start-ray.sh with various command line + # parameters. TODO(rkn): This test only tests the --head code path. We + # should also test the non-head node code path. - # Test starting Ray with no arguments. - run_and_get_output(["ray", "start", "--head"]) - subprocess.Popen(["ray", "stop"]).wait() + # Test starting Ray with no arguments. + run_and_get_output(["ray", "start", "--head"]) + subprocess.Popen(["ray", "stop"]).wait() - # Test starting Ray with a number of workers specified. - run_and_get_output(["ray", "start", "--head", "--num-workers", "20"]) - subprocess.Popen(["ray", "stop"]).wait() + # Test starting Ray with a number of workers specified. + run_and_get_output(["ray", "start", "--head", "--num-workers", "20"]) + subprocess.Popen(["ray", "stop"]).wait() - # Test starting Ray with a redis port specified. - run_and_get_output(["ray", "start", "--head", "--redis-port", "6379"]) - subprocess.Popen(["ray", "stop"]).wait() + # Test starting Ray with a redis port specified. + run_and_get_output(["ray", "start", "--head", "--redis-port", "6379"]) + subprocess.Popen(["ray", "stop"]).wait() - # Test starting Ray with a node IP address specified. - run_and_get_output( - ["ray", "start", "--head", "--node-ip-address", "127.0.0.1"]) - subprocess.Popen(["ray", "stop"]).wait() + # Test starting Ray with a node IP address specified. + run_and_get_output( + ["ray", "start", "--head", "--node-ip-address", "127.0.0.1"]) + subprocess.Popen(["ray", "stop"]).wait() - # Test starting Ray with an object manager port specified. - run_and_get_output( - ["ray", "start", "--head", "--object-manager-port", "12345"]) - subprocess.Popen(["ray", "stop"]).wait() + # Test starting Ray with an object manager port specified. + run_and_get_output( + ["ray", "start", "--head", "--object-manager-port", "12345"]) + subprocess.Popen(["ray", "stop"]).wait() - # Test starting Ray with the number of CPUs specified. - run_and_get_output(["ray", "start", "--head", "--num-cpus", "2"]) - subprocess.Popen(["ray", "stop"]).wait() + # Test starting Ray with the number of CPUs specified. + run_and_get_output(["ray", "start", "--head", "--num-cpus", "2"]) + subprocess.Popen(["ray", "stop"]).wait() - # Test starting Ray with the number of GPUs specified. - run_and_get_output(["ray", "start", "--head", "--num-gpus", "100"]) - subprocess.Popen(["ray", "stop"]).wait() + # Test starting Ray with the number of GPUs specified. + run_and_get_output(["ray", "start", "--head", "--num-gpus", "100"]) + subprocess.Popen(["ray", "stop"]).wait() - # Test starting Ray with the max redis clients specified. - run_and_get_output( - ["ray", "start", "--head", "--redis-max-clients", "100"]) - subprocess.Popen(["ray", "stop"]).wait() + # Test starting Ray with the max redis clients specified. + run_and_get_output( + ["ray", "start", "--head", "--redis-max-clients", "100"]) + subprocess.Popen(["ray", "stop"]).wait() - if "RAY_USE_NEW_GCS" not in os.environ: - # Test starting Ray with redis shard ports specified. - run_and_get_output([ - "ray", "start", "--head", "--redis-shard-ports", - "6380,6381,6382" - ]) - subprocess.Popen(["ray", "stop"]).wait() - - # Test starting Ray with all arguments specified. - run_and_get_output([ - "ray", "start", "--head", "--num-workers", "2", "--redis-port", - "6379", "--redis-shard-ports", "6380,6381,6382", - "--object-manager-port", "12345", "--num-cpus", "2", - "--num-gpus", "0", "--redis-max-clients", "100", "--resources", - "{\"Custom\": 1}" - ]) - subprocess.Popen(["ray", "stop"]).wait() - - # Test starting Ray with invalid arguments. - with pytest.raises(Exception): - run_and_get_output([ - "ray", "start", "--head", "--redis-address", "127.0.0.1:6379" - ]) - subprocess.Popen(["ray", "stop"]).wait() - - def testUsingHostnames(self): - # Start the Ray processes on this machine. + if "RAY_USE_NEW_GCS" not in os.environ: + # Test starting Ray with redis shard ports specified. run_and_get_output([ - "ray", "start", "--head", "--node-ip-address=localhost", - "--redis-port=6379" + "ray", "start", "--head", "--redis-shard-ports", "6380,6381,6382" ]) - - ray.init(node_ip_address="localhost", redis_address="localhost:6379") - - @ray.remote - def f(): - return 1 - - assert ray.get(f.remote()) == 1 - - # Kill the Ray cluster. subprocess.Popen(["ray", "stop"]).wait() + # Test starting Ray with all arguments specified. + run_and_get_output([ + "ray", "start", "--head", "--num-workers", "2", "--redis-port", + "6379", "--redis-shard-ports", "6380,6381,6382", + "--object-manager-port", "12345", "--num-cpus", "2", "--num-gpus", + "0", "--redis-max-clients", "100", "--resources", "{\"Custom\": 1}" + ]) + subprocess.Popen(["ray", "stop"]).wait() -class MiscellaneousTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() + # Test starting Ray with invalid arguments. + with pytest.raises(Exception): + run_and_get_output( + ["ray", "start", "--head", "--redis-address", "127.0.0.1:6379"]) + subprocess.Popen(["ray", "stop"]).wait() - def testConnectingInLocalCase(self): - address_info = ray.init(num_cpus=0) - # Define a driver that just connects to Redis. - driver_script = """ +@pytest.fixture +def ray_start_head_local(): + # Start the Ray processes on this machine. + run_and_get_output([ + "ray", "start", "--head", "--node-ip-address=localhost", + "--redis-port=6379" + ]) + + yield None + + # Disconnect from the Ray cluster. + ray.shutdown() + # Kill the Ray cluster. + subprocess.Popen(["ray", "stop"]).wait() + + +def test_using_hostnames(ray_start_head_local): + ray.init(node_ip_address="localhost", redis_address="localhost:6379") + + @ray.remote + def f(): + return 1 + + assert ray.get(f.remote()) == 1 + + +@pytest.fixture +def ray_start_regular(): + # Start the Ray processes. + address_info = ray.init(num_cpus=1) + yield address_info + # The code after the yield will run as teardown code. + ray.shutdown() + + +def test_connecting_in_local_case(ray_start_regular): + address_info = ray_start_regular + + # Define a driver that just connects to Redis. + driver_script = """ import ray ray.init(redis_address="{}") print("success") """.format(address_info["redis_address"]) - out = run_string_as_driver(driver_script) - # Make sure the other driver succeeded. - assert "success" in out + out = run_string_as_driver(driver_script) + # Make sure the other driver succeeded. + assert "success" in out -class RunDriverForMultipleTimesTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() - - def testRunDriverForTwice(self): - # We used to have issue 2165 and 2288: - # https://github.com/ray-project/ray/issues/2165 - # https://github.com/ray-project/ray/issues/2288 - # both complain that driver will hang when run for the second time. - # This test is used to verify the fix for above issue, it will run the - # same driver for twice and verify whether both of them succeed. - address_info = ray.init() - driver_script = """ +def test_run_driver_twice(ray_start_regular): + # We used to have issue 2165 and 2288: + # https://github.com/ray-project/ray/issues/2165 + # https://github.com/ray-project/ray/issues/2288 + # both complain that driver will hang when run for the second time. + # This test is used to verify the fix for above issue, it will run the + # same driver for twice and verify whether both of them succeed. + address_info = ray_start_regular + driver_script = """ import ray import ray.tune as tune import os @@ -338,10 +354,6 @@ tune.run_experiments({{ print("success") """.format(address_info["redis_address"]) - for i in range(2): - out = run_string_as_driver(driver_script) - assert "success" in out - - -if __name__ == "__main__": - unittest.main(verbosity=2) + for i in range(2): + out = run_string_as_driver(driver_script) + assert "success" in out diff --git a/test/runtest.py b/test/runtest.py index e2f299d06..b814bd3a8 100644 --- a/test/runtest.py +++ b/test/runtest.py @@ -1287,7 +1287,7 @@ class APITest(unittest.TestCase): @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), + os.environ.get("RAY_USE_NEW_GCS", False), "For now, RAY_USE_NEW_GCS supports 1 shard, and credis " "supports 1-node chain for that shard only.") class APITestSharded(APITest): @@ -2187,7 +2187,7 @@ def wait_for_num_objects(num_objects, timeout=10): @unittest.skipIf( - os.environ.get('RAY_USE_NEW_GCS', False), + os.environ.get("RAY_USE_NEW_GCS", False), "New GCS API doesn't have a Python API yet.") class GlobalStateAPI(unittest.TestCase): def tearDown(self): diff --git a/test/tensorflow_test.py b/test/tensorflow_test.py index 7fc215d43..f37d81b79 100644 --- a/test/tensorflow_test.py +++ b/test/tensorflow_test.py @@ -3,8 +3,8 @@ from __future__ import division from __future__ import print_function from numpy.testing import assert_almost_equal +import pytest import tensorflow as tf -import unittest import ray @@ -93,178 +93,169 @@ class TrainActor(object): return self.values[1].get_weights() -class TensorFlowTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() - - def testTensorFlowVariables(self): - ray.init(num_workers=2) - - sess = tf.Session() - loss, init, _, _ = make_linear_network() - sess.run(init) - - variables = ray.experimental.TensorFlowVariables(loss, sess) - weights = variables.get_weights() - - for (name, val) in weights.items(): - weights[name] += 1.0 - - variables.set_weights(weights) - assert weights == variables.get_weights() - - loss2, init2, _, _ = make_linear_network("w", "b") - sess.run(init2) - - variables2 = ray.experimental.TensorFlowVariables(loss2, sess) - weights2 = variables2.get_weights() - - for (name, val) in weights2.items(): - weights2[name] += 2.0 - - variables2.set_weights(weights2) - assert weights2 == variables2.get_weights() - flat_weights = variables2.get_flat() + 2.0 - variables2.set_flat(flat_weights) - assert_almost_equal(flat_weights, variables2.get_flat()) - - variables3 = ray.experimental.TensorFlowVariables([loss2]) - assert variables3.sess is None - sess = tf.Session() - variables3.set_session(sess) - assert variables3.sess == sess - - # Test that the variable names for the two different nets are not - # modified by TensorFlow to be unique (i.e., they should already - # be unique because of the variable prefix). - def testVariableNameCollision(self): - ray.init(num_workers=2) - - net1 = NetActor() - net2 = NetActor() - - # This is checking that the variable names of the two nets are the - # same, i.e., that the names in the weight dictionaries are the same. - net1.values[0].set_weights(net2.values[0].get_weights()) - - # Test that TensorFlowVariables can take in addition variables through - # input_variables arg and with no loss. - def testAdditionalVariablesNoLoss(self): - ray.init(num_workers=1) - - net = LossActor(use_loss=False) - assert len(net.values[0].variables.items()) == 1 - assert len(net.values[0].placeholders.items()) == 1 - - net.values[0].set_weights(net.values[0].get_weights()) - - # Test that TensorFlowVariables can take in addition variables through - # input_variables arg and with a loss. - def testAdditionalVariablesWithLoss(self): - ray.init(num_workers=1) - - net = LossActor() - assert len(net.values[0].variables.items()) == 3 - assert len(net.values[0].placeholders.items()) == 3 - - net.values[0].set_weights(net.values[0].get_weights()) - - # Test that different networks on the same worker are independent and - # we can get/set their weights without any interaction. - def testNetworksIndependent(self): - # Note we use only one worker to ensure that all of the remote - # functions run on the same worker. - ray.init(num_workers=1) - net1 = NetActor() - net2 = NetActor() - - # Make sure the two networks have different weights. TODO(rkn): Note - # that equality comparisons of numpy arrays normally does not work. - # This only works because at the moment they have size 1. - weights1 = net1.get_weights() - weights2 = net2.get_weights() - assert weights1 != weights2 - - # Set the weights and get the weights, and make sure they are - # unchanged. - new_weights1 = net1.set_and_get_weights(weights1) - new_weights2 = net2.set_and_get_weights(weights2) - assert weights1 == new_weights1 - assert weights2 == new_weights2 - - # Swap the weights. - new_weights1 = net2.set_and_get_weights(weights1) - new_weights2 = net1.set_and_get_weights(weights2) - assert weights1 == new_weights1 - assert weights2 == new_weights2 - - # This test creates an additional network on the driver so that the - # tensorflow variables on the driver and the worker differ. - def testNetworkDriverWorkerIndependent(self): - ray.init(num_workers=1) - - # Create a network on the driver locally. - sess1 = tf.Session() - loss1, init1, _, _ = make_linear_network() - ray.experimental.TensorFlowVariables(loss1, sess1) - sess1.run(init1) - - net2 = ray.remote(NetActor).remote() - weights2 = ray.get(net2.get_weights.remote()) - - new_weights2 = ray.get( - net2.set_and_get_weights.remote(net2.get_weights.remote())) - assert weights2 == new_weights2 - - def testVariablesControlDependencies(self): - ray.init(num_workers=1) - - # Creates a network and appends a momentum optimizer. - sess = tf.Session() - loss, init, _, _ = make_linear_network() - minimizer = tf.train.MomentumOptimizer(0.9, 0.9).minimize(loss) - net_vars = ray.experimental.TensorFlowVariables(minimizer, sess) - sess.run(init) - - # Tests if all variables are properly retrieved, 2 variables and 2 - # momentum variables. - assert len(net_vars.variables.items()) == 4 - - def testRemoteTrainingStep(self): - ray.init(num_workers=1) - - net = ray.remote(TrainActor).remote() - ray.get(net.training_step.remote(net.get_weights.remote())) - - def testRemoteTrainingLoss(self): - ray.init(num_workers=2) - - net = ray.remote(TrainActor).remote() - net_values = TrainActor().values - loss, variables, _, sess, grads, train, placeholders = net_values - - before_acc = sess.run( - loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100]))) - - for _ in range(3): - gradients_list = ray.get([ - net.training_step.remote(variables.get_weights()) - for _ in range(2) - ]) - mean_grads = [ - sum(gradients[i] - for gradients in gradients_list) / len(gradients_list) - for i in range(len(gradients_list[0])) - ] - feed_dict = { - grad[0]: mean_grad - for (grad, mean_grad) in zip(grads, mean_grads) - } - sess.run(train, feed_dict=feed_dict) - after_acc = sess.run( - loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100]))) - assert before_acc < after_acc +@pytest.fixture +def ray_start_regular(): + # Start the Ray processes. + ray.init(num_cpus=2) + yield None + # The code after the yield will run as teardown code. + ray.shutdown() -if __name__ == "__main__": - unittest.main(verbosity=2) +def test_tensorflow_variables(ray_start_regular): + sess = tf.Session() + loss, init, _, _ = make_linear_network() + sess.run(init) + + variables = ray.experimental.TensorFlowVariables(loss, sess) + weights = variables.get_weights() + + for (name, val) in weights.items(): + weights[name] += 1.0 + + variables.set_weights(weights) + assert weights == variables.get_weights() + + loss2, init2, _, _ = make_linear_network("w", "b") + sess.run(init2) + + variables2 = ray.experimental.TensorFlowVariables(loss2, sess) + weights2 = variables2.get_weights() + + for (name, val) in weights2.items(): + weights2[name] += 2.0 + + variables2.set_weights(weights2) + assert weights2 == variables2.get_weights() + flat_weights = variables2.get_flat() + 2.0 + variables2.set_flat(flat_weights) + assert_almost_equal(flat_weights, variables2.get_flat()) + + variables3 = ray.experimental.TensorFlowVariables([loss2]) + assert variables3.sess is None + sess = tf.Session() + variables3.set_session(sess) + assert variables3.sess == sess + + +# Test that the variable names for the two different nets are not +# modified by TensorFlow to be unique (i.e., they should already +# be unique because of the variable prefix). +def test_variable_name_collision(ray_start_regular): + net1 = NetActor() + net2 = NetActor() + + # This is checking that the variable names of the two nets are the + # same, i.e., that the names in the weight dictionaries are the same. + net1.values[0].set_weights(net2.values[0].get_weights()) + + +# Test that TensorFlowVariables can take in addition variables through +# input_variables arg and with no loss. +def test_additional_variables_no_loss(ray_start_regular): + net = LossActor(use_loss=False) + assert len(net.values[0].variables.items()) == 1 + assert len(net.values[0].placeholders.items()) == 1 + + net.values[0].set_weights(net.values[0].get_weights()) + + +# Test that TensorFlowVariables can take in addition variables through +# input_variables arg and with a loss. +def test_additional_variables_with_loss(ray_start_regular): + net = LossActor() + assert len(net.values[0].variables.items()) == 3 + assert len(net.values[0].placeholders.items()) == 3 + + net.values[0].set_weights(net.values[0].get_weights()) + + +# Test that different networks on the same worker are independent and +# we can get/set their weights without any interaction. +def test_networks_independent(ray_start_regular): + # Note we use only one worker to ensure that all of the remote + # functions run on the same worker. + net1 = NetActor() + net2 = NetActor() + + # Make sure the two networks have different weights. TODO(rkn): Note + # that equality comparisons of numpy arrays normally does not work. + # This only works because at the moment they have size 1. + weights1 = net1.get_weights() + weights2 = net2.get_weights() + assert weights1 != weights2 + + # Set the weights and get the weights, and make sure they are + # unchanged. + new_weights1 = net1.set_and_get_weights(weights1) + new_weights2 = net2.set_and_get_weights(weights2) + assert weights1 == new_weights1 + assert weights2 == new_weights2 + + # Swap the weights. + new_weights1 = net2.set_and_get_weights(weights1) + new_weights2 = net1.set_and_get_weights(weights2) + assert weights1 == new_weights1 + assert weights2 == new_weights2 + + +# This test creates an additional network on the driver so that the +# tensorflow variables on the driver and the worker differ. +def test_network_driver_worker_independent(ray_start_regular): + # Create a network on the driver locally. + sess1 = tf.Session() + loss1, init1, _, _ = make_linear_network() + ray.experimental.TensorFlowVariables(loss1, sess1) + sess1.run(init1) + + net2 = ray.remote(NetActor).remote() + weights2 = ray.get(net2.get_weights.remote()) + + new_weights2 = ray.get( + net2.set_and_get_weights.remote(net2.get_weights.remote())) + assert weights2 == new_weights2 + + +def test_variables_control_dependencies(ray_start_regular): + # Creates a network and appends a momentum optimizer. + sess = tf.Session() + loss, init, _, _ = make_linear_network() + minimizer = tf.train.MomentumOptimizer(0.9, 0.9).minimize(loss) + net_vars = ray.experimental.TensorFlowVariables(minimizer, sess) + sess.run(init) + + # Tests if all variables are properly retrieved, 2 variables and 2 + # momentum variables. + assert len(net_vars.variables.items()) == 4 + + +def test_remote_training_step(ray_start_regular): + net = ray.remote(TrainActor).remote() + ray.get(net.training_step.remote(net.get_weights.remote())) + + +def test_remote_training_loss(ray_start_regular): + net = ray.remote(TrainActor).remote() + net_values = TrainActor().values + loss, variables, _, sess, grads, train, placeholders = net_values + + before_acc = sess.run( + loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100]))) + + for _ in range(3): + gradients_list = ray.get([ + net.training_step.remote(variables.get_weights()) for _ in range(2) + ]) + mean_grads = [ + sum(gradients[i] + for gradients in gradients_list) / len(gradients_list) + for i in range(len(gradients_list[0])) + ] + feed_dict = { + grad[0]: mean_grad + for (grad, mean_grad) in zip(grads, mean_grads) + } + sess.run(train, feed_dict=feed_dict) + after_acc = sess.run( + loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100]))) + assert before_acc < after_acc