Move more unit tests to bazel (#6250)

* move more unit tests to bazel * move to avoid conflict * fix lint * fix deps * seprate * fix failing tests * show tests * ignore mismatch * try combining bazel runs * build lint * remove tests from install * fix test utils * better config * split up * exclusive * fix verbosity * fix tests class * cleanup * remove flaky * fix metrics test * Update .travis.yml * no retry flaky * split up actor * split basic test * split up trial runner test * split stress * fix basic test * fix tests * switch to pytest runner for main * make microbench not fail * move load code to py3 * test is no longer package * bazel to end
2026-06-28 15:06:28 +08:00 · 2019-11-24 11:43:34 -08:00
parent aa8d5d2f6c
commit 53641f1f74
70 changed files with 6599 additions and 5766 deletions
@@ -4,7 +4,7 @@ import time
 import numpy as np
 import ray

-from ray.tests.cluster_utils import Cluster
+from ray.cluster_utils import Cluster


 def main():
@@ -2,6 +2,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import json
 import fnmatch
 import os
 import subprocess
@@ -170,3 +171,11 @@ def recursive_fnmatch(dirpath, pattern):
        for filename in fnmatch.filter(filenames, pattern):
            matches.append(os.path.join(root, filename))
    return matches
+
+
+def generate_internal_config_map(**kwargs):
+    internal_config = json.dumps(kwargs)
+    ray_kwargs = {
+        "_internal_config": internal_config,
+    }
+    return ray_kwargs
@@ -0,0 +1,297 @@
+py_test(
+    name = "test_actor",
+    size = "large",
+    srcs = ["test_actor.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_actor_resources",
+    size = "large",
+    srcs = ["test_actor_resources.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_actor_failures",
+    size = "large",
+    srcs = ["test_actor_failures.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_basic",
+    size = "large",
+    srcs = ["test_basic.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_advanced",
+    size = "large",
+    srcs = ["test_advanced.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_component_failures",
+    size = "large",
+    srcs = ["test_component_failures.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_multinode_failures",
+    size = "large",
+    srcs = ["test_multinode_failures.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_stress",
+    size = "large",
+    srcs = ["test_stress.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_stress_sharded",
+    size = "large",
+    srcs = ["test_stress_sharded.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_stress_failure",
+    size = "large",
+    srcs = ["test_stress_failure.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_array",
+    size = "medium",
+    srcs = ["test_array.py"],
+    deps = ["//:ray_lib"],
+    flaky = 1,
+)
+
+py_test(
+    name = "test_autoscaler",
+    size = "small",
+    srcs = ["test_autoscaler.py"],
+    deps = ["//:ray_lib"],
+    flaky = 1,
+)
+
+py_test(
+    name = "test_autoscaler_yaml",
+    size = "small",
+    srcs = ["test_autoscaler_yaml.py"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_cython",
+    size = "small",
+    srcs = ["test_cython.py"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_debug_tools",
+    size = "small",
+    srcs = ["test_debug_tools.py"],
+    deps = ["//:ray_lib"],
+    flaky = 1,
+)
+
+py_test(
+    name = "test_dynres",
+    size = "medium",
+    srcs = ["test_dynres.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_failure",
+    size = "medium",
+    srcs = ["test_failure.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_garbage_collection",
+    size = "medium",
+    srcs = ["test_garbage_collection.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_global_state",
+    size = "medium",
+    srcs = ["test_global_state.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_logical_graph",
+    size = "medium",
+    srcs = ["test_logical_graph.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_memory_limits",
+    size = "medium",
+    srcs = ["test_memory_limits.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_memory_scheduling",
+    size = "medium",
+    srcs = ["test_memory_scheduling.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_metrics",
+    size = "small",
+    srcs = ["test_metrics.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_microbenchmarks",
+    size = "medium",
+    srcs = ["test_microbenchmarks.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_mini",
+    size = "small",
+    srcs = ["test_mini.py"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_monitors",
+    size = "medium",
+    srcs = ["test_monitors.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_multi_node_2",
+    size = "medium",
+    srcs = ["test_multi_node_2.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_multi_node",
+    size = "medium",
+    srcs = ["test_multi_node.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_node_manager",
+    size = "small",
+    srcs = ["test_node_manager.py"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_object_manager",
+    size = "medium",
+    srcs = ["test_object_manager.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_projects",
+    size = "small",
+    srcs = ["test_projects.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_queue",
+    size = "small",
+    srcs = ["test_queue.py"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_ray_init",
+    size = "medium",
+    srcs = ["test_ray_init.py"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_signal",
+    size = "medium",
+    srcs = ["test_signal.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_tempfile",
+    size = "small",
+    srcs = ["test_tempfile.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_tensorflow",
+    size = "medium",
+    srcs = ["test_tensorflow.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_unreconstructable_errors",
+    size = "medium",
+    srcs = ["test_unreconstructable_errors.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
+
+py_test(
+    name = "test_webui",
+    size = "medium",
+    srcs = ["test_webui.py"],
+    tags = ["exclusive"],
+    deps = ["//:ray_lib"],
+)
@@ -8,7 +8,7 @@ import pytest
 import subprocess

 import ray
-from ray.tests.cluster_utils import Cluster
+from ray.cluster_utils import Cluster


@pytest.fixture
@@ -18,14 +18,6 @@ def shutdown_only():
    ray.shutdown()


-def generate_internal_config_map(**kwargs):
-    internal_config = json.dumps(kwargs)
-    ray_kwargs = {
-        "_internal_config": internal_config,
-    }
-    return ray_kwargs
-
-
 def get_default_fixure_internal_config():
    internal_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 200,
@@ -177,7 +169,7 @@ def two_node_cluster():
        "initial_reconstruction_timeout_milliseconds": 200,
        "num_heartbeats_timeout": 10,
    })
-    cluster = ray.tests.cluster_utils.Cluster(
+    cluster = ray.cluster_utils.Cluster(
        head_node_args={"_internal_config": internal_config})
    for _ in range(2):
        remote_node = cluster.add_node(
@@ -8,8 +8,8 @@ import threading
 import pytest

 import ray
-import ray.tests.cluster_utils
-import ray.tests.utils
+import ray.cluster_utils
+import ray.test_utils


@pytest.mark.parametrize(
@@ -0,0 +1,732 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import json
+import numpy as np
+import os
+import pytest
+try:
+    import pytest_timeout
+except ImportError:
+    pytest_timeout = None
+import signal
+import sys
+import time
+
+import ray
+import ray.ray_constants as ray_constants
+import ray.test_utils
+import ray.cluster_utils
+from ray.test_utils import (relevant_errors, wait_for_condition,
+                            wait_for_errors, wait_for_pid_to_exit,
+                            generate_internal_config_map)
+
+
+@pytest.fixture
+def ray_checkpointable_actor_cls(request):
+    checkpoint_dir = "/tmp/ray_temp_checkpoint_dir/"
+    if not os.path.isdir(checkpoint_dir):
+        os.mkdir(checkpoint_dir)
+
+    class CheckpointableActor(ray.actor.Checkpointable):
+        def __init__(self):
+            self.value = 0
+            self.resumed_from_checkpoint = False
+            self.checkpoint_dir = checkpoint_dir
+
+        def node_id(self):
+            return ray.worker.global_worker.node.unique_id
+
+        def increase(self):
+            self.value += 1
+            return self.value
+
+        def get(self):
+            return self.value
+
+        def was_resumed_from_checkpoint(self):
+            return self.resumed_from_checkpoint
+
+        def get_pid(self):
+            return os.getpid()
+
+        def should_checkpoint(self, checkpoint_context):
+            # Checkpoint the actor when value is increased to 3.
+            should_checkpoint = self.value == 3
+            return should_checkpoint
+
+        def save_checkpoint(self, actor_id, checkpoint_id):
+            actor_id, checkpoint_id = actor_id.hex(), checkpoint_id.hex()
+            # Save checkpoint into a file.
+            with open(self.checkpoint_dir + actor_id, "a+") as f:
+                print(checkpoint_id, self.value, file=f)
+
+        def load_checkpoint(self, actor_id, available_checkpoints):
+            actor_id = actor_id.hex()
+            filename = self.checkpoint_dir + actor_id
+            # Load checkpoint from the file.
+            if not os.path.isfile(filename):
+                return None
+
+            available_checkpoint_ids = [
+                c.checkpoint_id for c in available_checkpoints
+            ]
+            with open(filename, "r") as f:
+                for line in f:
+                    checkpoint_id, value = line.strip().split(" ")
+                    checkpoint_id = ray.ActorCheckpointID(
+                        ray.utils.hex_to_binary(checkpoint_id))
+                    if checkpoint_id in available_checkpoint_ids:
+                        self.value = int(value)
+                        self.resumed_from_checkpoint = True
+                        return checkpoint_id
+                return None
+
+        def checkpoint_expired(self, actor_id, checkpoint_id):
+            pass
+
+    return CheckpointableActor
+
+
+@pytest.mark.parametrize(
+    "ray_start_object_store_memory", [150 * 1024 * 1024], indirect=True)
+def test_actor_eviction(ray_start_object_store_memory):
+    object_store_memory = ray_start_object_store_memory
+
+    @ray.remote
+    class Actor(object):
+        def __init__(self):
+            pass
+
+        def create_object(self, size):
+            return np.random.rand(size)
+
+    a = Actor.remote()
+    # Submit enough methods on the actor so that they exceed the size of the
+    # object store.
+    objects = []
+    num_objects = 20
+    for _ in range(num_objects):
+        obj = a.create_object.remote(object_store_memory // num_objects)
+        objects.append(obj)
+        # Get each object once to make sure each object gets created.
+        ray.get(obj)
+
+    # Get each object again. At this point, the earlier objects should have
+    # been evicted.
+    num_evicted, num_success = 0, 0
+    for obj in objects:
+        try:
+            val = ray.get(obj)
+            assert isinstance(val, np.ndarray), val
+            num_success += 1
+        except ray.exceptions.UnreconstructableError:
+            num_evicted += 1
+    # Some objects should have been evicted, and some should still be in the
+    # object store.
+    assert num_evicted > 0
+    assert num_success > 0
+
+
+def test_actor_reconstruction(ray_start_regular):
+    """Test actor reconstruction when actor process is killed."""
+
+    @ray.remote(max_reconstructions=1)
+    class ReconstructableActor(object):
+        """An actor that will be reconstructed at most once."""
+
+        def __init__(self):
+            self.value = 0
+
+        def increase(self, delay=0):
+            time.sleep(delay)
+            self.value += 1
+            return self.value
+
+        def get_pid(self):
+            return os.getpid()
+
+    actor = ReconstructableActor.remote()
+    pid = ray.get(actor.get_pid.remote())
+    # Call increase 3 times
+    for _ in range(3):
+        ray.get(actor.increase.remote())
+    # Call increase again with some delay.
+    result = actor.increase.remote(delay=0.5)
+    # Sleep some time to wait for the above task to start execution.
+    time.sleep(0.2)
+    # Kill actor process, while the above task is still being executed.
+    os.kill(pid, signal.SIGKILL)
+    # Check that the above task didn't fail and the actor is reconstructed.
+    assert ray.get(result) == 4
+    # Check that we can still call the actor.
+    assert ray.get(actor.increase.remote()) == 5
+    # kill actor process one more time.
+    pid = ray.get(actor.get_pid.remote())
+    os.kill(pid, signal.SIGKILL)
+    # The actor has exceeded max reconstructions, and this task should fail.
+    with pytest.raises(ray.exceptions.RayActorError):
+        ray.get(actor.increase.remote())
+
+    # Create another actor.
+    actor = ReconstructableActor.remote()
+    # Intentionlly exit the actor
+    actor.__ray_terminate__.remote()
+    # Check that the actor won't be reconstructed.
+    with pytest.raises(ray.exceptions.RayActorError):
+        ray.get(actor.increase.remote())
+
+
+def test_actor_reconstruction_without_task(ray_start_regular):
+    """Test a dead actor can be reconstructed without sending task to it."""
+
+    @ray.remote(max_reconstructions=1)
+    class ReconstructableActor(object):
+        def __init__(self, obj_ids):
+            for obj_id in obj_ids:
+                # Every time the actor gets constructed,
+                # put a new object in plasma store.
+                global_worker = ray.worker.global_worker
+                if not global_worker.core_worker.object_exists(obj_id):
+                    global_worker.put_object(1, obj_id)
+                    break
+
+        def get_pid(self):
+            return os.getpid()
+
+    obj_ids = [ray.ObjectID.from_random() for _ in range(2)]
+    actor = ReconstructableActor.remote(obj_ids)
+    # Kill the actor.
+    pid = ray.get(actor.get_pid.remote())
+    os.kill(pid, signal.SIGKILL)
+    # Wait until the actor is reconstructed.
+    assert wait_for_condition(
+        lambda: ray.worker.global_worker.core_worker.object_exists(obj_ids[1]),
+        timeout_ms=5000)
+
+
+def test_actor_reconstruction_on_node_failure(ray_start_cluster_head):
+    """Test actor reconstruction when node dies unexpectedly."""
+    cluster = ray_start_cluster_head
+    max_reconstructions = 3
+    # Add a few nodes to the cluster.
+    # Use custom resource to make sure the actor is only created on worker
+    # nodes, not on the head node.
+    for _ in range(max_reconstructions + 2):
+        cluster.add_node(
+            resources={"a": 1},
+            _internal_config=json.dumps({
+                "initial_reconstruction_timeout_milliseconds": 200,
+                "num_heartbeats_timeout": 10,
+            }),
+        )
+
+    def kill_node(node_id):
+        node_to_remove = None
+        for node in cluster.worker_nodes:
+            if node_id == node.unique_id:
+                node_to_remove = node
+        cluster.remove_node(node_to_remove)
+
+    @ray.remote(max_reconstructions=max_reconstructions, resources={"a": 1})
+    class MyActor(object):
+        def __init__(self):
+            self.value = 0
+
+        def increase(self):
+            self.value += 1
+            return self.value
+
+        def get_object_store_socket(self):
+            return ray.worker.global_worker.node.unique_id
+
+    actor = MyActor.remote()
+    # Call increase 3 times.
+    for _ in range(3):
+        ray.get(actor.increase.remote())
+
+    for i in range(max_reconstructions):
+        object_store_socket = ray.get(actor.get_object_store_socket.remote())
+        # Kill actor's node and the actor should be reconstructed
+        # on a different node.
+        kill_node(object_store_socket)
+        # Call increase again.
+        # Check that the actor is reconstructed and value is correct.
+        assert ray.get(actor.increase.remote()) == 4 + i
+        # Check that the actor is now on a different node.
+        assert object_store_socket != ray.get(
+            actor.get_object_store_socket.remote())
+
+    # kill the node again.
+    object_store_socket = ray.get(actor.get_object_store_socket.remote())
+    kill_node(object_store_socket)
+    # The actor has exceeded max reconstructions, and this task should fail.
+    with pytest.raises(ray.exceptions.RayActorError):
+        ray.get(actor.increase.remote())
+
+
+# NOTE(hchen): we set initial_reconstruction_timeout_milliseconds to 1s for
+# this test. Because if this value is too small, suprious task reconstruction
+# may happen and cause the test fauilure. If the value is too large, this test
+# could be very slow. We can remove this once we support dynamic timeout.
+@pytest.mark.parametrize(
+    "ray_start_cluster_head", [
+        generate_internal_config_map(
+            initial_reconstruction_timeout_milliseconds=1000)
+    ],
+    indirect=True)
+def test_multiple_actor_reconstruction(ray_start_cluster_head):
+    cluster = ray_start_cluster_head
+    # This test can be made more stressful by increasing the numbers below.
+    # The total number of actors created will be
+    # num_actors_at_a_time * num_nodes.
+    num_nodes = 5
+    num_actors_at_a_time = 3
+    num_function_calls_at_a_time = 10
+
+    worker_nodes = [
+        cluster.add_node(
+            num_cpus=3,
+            _internal_config=json.dumps({
+                "initial_reconstruction_timeout_milliseconds": 200,
+                "num_heartbeats_timeout": 10,
+            })) for _ in range(num_nodes)
+    ]
+
+    @ray.remote(max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION)
+    class SlowCounter(object):
+        def __init__(self):
+            self.x = 0
+
+        def inc(self, duration):
+            time.sleep(duration)
+            self.x += 1
+            return self.x
+
+    # Create some initial actors.
+    actors = [SlowCounter.remote() for _ in range(num_actors_at_a_time)]
+
+    # Wait for the actors to start up.
+    time.sleep(1)
+
+    # This is a mapping from actor handles to object IDs returned by
+    # methods on that actor.
+    result_ids = collections.defaultdict(lambda: [])
+
+    # In a loop we are going to create some actors, run some methods, kill
+    # a raylet, and run some more methods.
+    for node in worker_nodes:
+        # Create some actors.
+        actors.extend(
+            [SlowCounter.remote() for _ in range(num_actors_at_a_time)])
+        # Run some methods.
+        for j in range(len(actors)):
+            actor = actors[j]
+            for _ in range(num_function_calls_at_a_time):
+                result_ids[actor].append(actor.inc.remote(j**2 * 0.000001))
+        # Kill a node.
+        cluster.remove_node(node)
+
+        # Run some more methods.
+        for j in range(len(actors)):
+            actor = actors[j]
+            for _ in range(num_function_calls_at_a_time):
+                result_ids[actor].append(actor.inc.remote(j**2 * 0.000001))
+
+    # Get the results and check that they have the correct values.
+    for _, result_id_list in result_ids.items():
+        results = list(range(1, len(result_id_list) + 1))
+        assert ray.get(result_id_list) == results
+
+
+def kill_actor(actor):
+    """A helper function that kills an actor process."""
+    pid = ray.get(actor.get_pid.remote())
+    os.kill(pid, signal.SIGKILL)
+    wait_for_pid_to_exit(pid)
+
+
+def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
+    """Test actor checkpointing and restoring from a checkpoint."""
+    actor = ray.remote(
+        max_reconstructions=2)(ray_checkpointable_actor_cls).remote()
+    # Call increase 3 times, triggering a checkpoint.
+    expected = 0
+    for _ in range(3):
+        ray.get(actor.increase.remote())
+        expected += 1
+    # Assert that the actor wasn't resumed from a checkpoint.
+    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
+    # Kill actor process.
+    kill_actor(actor)
+    # Assert that the actor was resumed from a checkpoint and its value is
+    # still correct.
+    assert ray.get(actor.get.remote()) == expected
+    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
+
+    # Submit some more tasks. These should get replayed since they happen after
+    # the checkpoint.
+    for _ in range(3):
+        ray.get(actor.increase.remote())
+        expected += 1
+    # Kill actor again and check that reconstruction still works after the
+    # actor resuming from a checkpoint.
+    kill_actor(actor)
+    assert ray.get(actor.get.remote()) == expected
+    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
+
+
+def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
+    """Test checkpointing of a remote actor through method invocation."""
+
+    # Define a class that exposes a method to save checkpoints.
+    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
+        def __init__(self):
+            super(RemoteCheckpointableActor, self).__init__()
+            self._should_checkpoint = False
+
+        def checkpoint(self):
+            self._should_checkpoint = True
+
+        def should_checkpoint(self, checkpoint_context):
+            should_checkpoint = self._should_checkpoint
+            self._should_checkpoint = False
+            return should_checkpoint
+
+    cls = ray.remote(max_reconstructions=2)(RemoteCheckpointableActor)
+    actor = cls.remote()
+    # Call increase 3 times.
+    expected = 0
+    for _ in range(3):
+        ray.get(actor.increase.remote())
+        expected += 1
+    # Call a checkpoint task.
+    actor.checkpoint.remote()
+    # Assert that the actor wasn't resumed from a checkpoint.
+    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
+    # Kill actor process.
+    kill_actor(actor)
+    # Assert that the actor was resumed from a checkpoint and its value is
+    # still correct.
+    assert ray.get(actor.get.remote()) == expected
+    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
+
+    # Submit some more tasks. These should get replayed since they happen after
+    # the checkpoint.
+    for _ in range(3):
+        ray.get(actor.increase.remote())
+        expected += 1
+    # Kill actor again and check that reconstruction still works after the
+    # actor resuming from a checkpoint.
+    kill_actor(actor)
+    assert ray.get(actor.get.remote()) == expected
+    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
+
+
+def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
+                                       ray_checkpointable_actor_cls):
+    """Test actor checkpointing on a remote node."""
+    # Place the actor on the remote node.
+    cluster = ray_start_cluster_2_nodes
+    remote_node = list(cluster.worker_nodes)
+    actor_cls = ray.remote(max_reconstructions=1)(ray_checkpointable_actor_cls)
+    actor = actor_cls.remote()
+    while (ray.get(actor.node_id.remote()) != remote_node[0].unique_id):
+        actor = actor_cls.remote()
+
+    # Call increase several times.
+    expected = 0
+    for _ in range(6):
+        ray.get(actor.increase.remote())
+        expected += 1
+    # Assert that the actor wasn't resumed from a checkpoint.
+    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
+    # Kill actor process.
+    cluster.remove_node(remote_node[0])
+    # Assert that the actor was resumed from a checkpoint and its value is
+    # still correct.
+    assert ray.get(actor.get.remote()) == expected
+    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
+
+
+def test_checkpointing_save_exception(ray_start_regular,
+                                      ray_checkpointable_actor_cls):
+    """Test actor can still be recovered if checkpoints fail to complete."""
+
+    @ray.remote(max_reconstructions=2)
+    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
+        def save_checkpoint(self, actor_id, checkpoint_context):
+            raise Exception("Intentional error saving checkpoint.")
+
+    actor = RemoteCheckpointableActor.remote()
+    # Call increase 3 times, triggering a checkpoint that will fail.
+    expected = 0
+    for _ in range(3):
+        ray.get(actor.increase.remote())
+        expected += 1
+    # Assert that the actor wasn't resumed from a checkpoint.
+    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
+    # Kill actor process.
+    kill_actor(actor)
+    # Assert that the actor still wasn't resumed from a checkpoint and its
+    # value is still correct.
+    assert ray.get(actor.get.remote()) == expected
+    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
+
+    # Submit some more tasks. These should get replayed since they happen after
+    # the checkpoint.
+    for _ in range(3):
+        ray.get(actor.increase.remote())
+        expected += 1
+    # Kill actor again, and check that reconstruction still works and the actor
+    # wasn't resumed from a checkpoint.
+    kill_actor(actor)
+    assert ray.get(actor.get.remote()) == expected
+    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
+
+    # Check that the checkpoint error was pushed to the driver.
+    wait_for_errors(ray_constants.CHECKPOINT_PUSH_ERROR, 1)
+
+
+def test_checkpointing_load_exception(ray_start_regular,
+                                      ray_checkpointable_actor_cls):
+    """Test actor can still be recovered if checkpoints fail to load."""
+
+    @ray.remote(max_reconstructions=2)
+    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
+        def load_checkpoint(self, actor_id, checkpoints):
+            raise Exception("Intentional error loading checkpoint.")
+
+    actor = RemoteCheckpointableActor.remote()
+    # Call increase 3 times, triggering a checkpoint that will succeed.
+    expected = 0
+    for _ in range(3):
+        ray.get(actor.increase.remote())
+        expected += 1
+    # Assert that the actor wasn't resumed from a checkpoint because loading
+    # it failed.
+    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
+    # Kill actor process.
+    kill_actor(actor)
+    # Assert that the actor still wasn't resumed from a checkpoint and its
+    # value is still correct.
+    assert ray.get(actor.get.remote()) == expected
+    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
+
+    # Submit some more tasks. These should get replayed since they happen after
+    # the checkpoint.
+    for _ in range(3):
+        ray.get(actor.increase.remote())
+        expected += 1
+    # Kill actor again, and check that reconstruction still works and the actor
+    # wasn't resumed from a checkpoint.
+    kill_actor(actor)
+    assert ray.get(actor.get.remote()) == expected
+    assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
+
+    # Check that the checkpoint error was pushed to the driver.
+    wait_for_errors(ray_constants.CHECKPOINT_PUSH_ERROR, 1)
+
+
+@pytest.mark.parametrize(
+    "ray_start_regular",
+    # This overwrite currently isn't effective,
+    # see https://github.com/ray-project/ray/issues/3926.
+    [generate_internal_config_map(num_actor_checkpoints_to_keep=20)],
+    indirect=True,
+)
+def test_deleting_actor_checkpoint(ray_start_regular):
+    """Test deleting old actor checkpoints."""
+
+    @ray.remote
+    class CheckpointableActor(ray.actor.Checkpointable):
+        def __init__(self):
+            self.checkpoint_ids = []
+
+        def get_checkpoint_ids(self):
+            return self.checkpoint_ids
+
+        def should_checkpoint(self, checkpoint_context):
+            # Save checkpoints after every task
+            return True
+
+        def save_checkpoint(self, actor_id, checkpoint_id):
+            self.checkpoint_ids.append(checkpoint_id)
+            pass
+
+        def load_checkpoint(self, actor_id, available_checkpoints):
+            pass
+
+        def checkpoint_expired(self, actor_id, checkpoint_id):
+            assert checkpoint_id == self.checkpoint_ids[0]
+            del self.checkpoint_ids[0]
+
+    actor = CheckpointableActor.remote()
+    for i in range(19):
+        assert len(ray.get(actor.get_checkpoint_ids.remote())) == i + 1
+    for _ in range(20):
+        assert len(ray.get(actor.get_checkpoint_ids.remote())) == 20
+
+
+def test_bad_checkpointable_actor_class():
+    """Test error raised if an actor class doesn't implement all abstract
+    methods in the Checkpointable interface."""
+
+    with pytest.raises(TypeError):
+
+        @ray.remote
+        class BadCheckpointableActor(ray.actor.Checkpointable):
+            def should_checkpoint(self, checkpoint_context):
+                return True
+
+
+def test_init_exception_in_checkpointable_actor(ray_start_regular,
+                                                ray_checkpointable_actor_cls):
+    # This test is similar to test_failure.py::test_failed_actor_init.
+    # This test is used to guarantee that checkpointable actor does not
+    # break the same logic.
+    error_message1 = "actor constructor failed"
+    error_message2 = "actor method failed"
+
+    @ray.remote
+    class CheckpointableFailedActor(ray_checkpointable_actor_cls):
+        def __init__(self):
+            raise Exception(error_message1)
+
+        def fail_method(self):
+            raise Exception(error_message2)
+
+        def should_checkpoint(self, checkpoint_context):
+            return True
+
+    a = CheckpointableFailedActor.remote()
+
+    # Make sure that we get errors from a failed constructor.
+    wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
+    errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
+    assert len(errors) == 1
+    assert error_message1 in errors[0]["message"]
+
+    # Make sure that we get errors from a failed method.
+    a.fail_method.remote()
+    wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
+    errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
+    assert len(errors) == 2
+    assert error_message1 in errors[1]["message"]
+
+
+def test_decorated_method(ray_start_regular):
+    def method_invocation_decorator(f):
+        def new_f_invocation(args, kwargs):
+            # Split one argument into two. Return th kwargs without passing
+            # them into the actor.
+            return f([args[0], args[0]], {}), kwargs
+
+        return new_f_invocation
+
+    def method_execution_decorator(f):
+        def new_f_execution(self, b, c):
+            # Turn two arguments into one.
+            return f(self, b + c)
+
+        new_f_execution.__ray_invocation_decorator__ = (
+            method_invocation_decorator)
+        return new_f_execution
+
+    @ray.remote
+    class Actor(object):
+        @method_execution_decorator
+        def decorated_method(self, x):
+            return x + 1
+
+    a = Actor.remote()
+
+    object_id, extra = a.decorated_method.remote(3, kwarg=3)
+    assert isinstance(object_id, ray.ObjectID)
+    assert extra == {"kwarg": 3}
+    assert ray.get(object_id) == 7  # 2 * 3 + 1
+
+
+@pytest.mark.skipif(
+    pytest_timeout is None,
+    reason="Timeout package not installed; skipping test that may hang.")
+@pytest.mark.timeout(20)
+@pytest.mark.parametrize(
+    "ray_start_cluster", [{
+        "num_cpus": 1,
+        "num_nodes": 2,
+    }], indirect=True)
+def test_ray_wait_dead_actor(ray_start_cluster):
+    """Tests that methods completed by dead actors are returned as ready"""
+    cluster = ray_start_cluster
+
+    @ray.remote(num_cpus=1)
+    class Actor(object):
+        def __init__(self):
+            pass
+
+        def node_id(self):
+            return ray.worker.global_worker.node.unique_id
+
+        def ping(self):
+            time.sleep(1)
+
+    # Create some actors and wait for them to initialize.
+    num_nodes = len(cluster.list_all_nodes())
+    actors = [Actor.remote() for _ in range(num_nodes)]
+    ray.get([actor.ping.remote() for actor in actors])
+
+    # Ping the actors and make sure the tasks complete.
+    ping_ids = [actor.ping.remote() for actor in actors]
+    ray.get(ping_ids)
+    # Evict the result from the node that we're about to kill.
+    remote_node = cluster.list_all_nodes()[-1]
+    remote_ping_id = None
+    for i, actor in enumerate(actors):
+        if ray.get(actor.node_id.remote()) == remote_node.unique_id:
+            remote_ping_id = ping_ids[i]
+    ray.internal.free([remote_ping_id], local_only=True)
+    cluster.remove_node(remote_node)
+
+    # Repeatedly call ray.wait until the exception for the dead actor is
+    # received.
+    unready = ping_ids[:]
+    while unready:
+        _, unready = ray.wait(unready, timeout=0)
+        time.sleep(1)
+
+    with pytest.raises(ray.exceptions.RayActorError):
+        ray.get(ping_ids)
+
+    # Evict the result from the dead node.
+    ray.internal.free([remote_ping_id], local_only=True)
+    # Create an actor on the local node that will call ray.wait in a loop.
+    head_node_resource = "HEAD_NODE"
+    ray.experimental.set_resource(head_node_resource, 1)
+
+    @ray.remote(num_cpus=0, resources={head_node_resource: 1})
+    class ParentActor(object):
+        def __init__(self, ping_ids):
+            self.unready = ping_ids
+
+        def wait(self):
+            _, self.unready = ray.wait(self.unready, timeout=0)
+            return len(self.unready) == 0
+
+        def ping(self):
+            return
+
+    # Repeatedly call ray.wait through the local actor until the exception for
+    # the dead actor is received.
+    parent_actor = ParentActor.remote(ping_ids)
+    ray.get(parent_actor.ping.remote())
+    failure_detected = False
+    while not failure_detected:
+        failure_detected = ray.get(parent_actor.wait.remote())
+
+
+if __name__ == "__main__":
+    import pytest
+    sys.exit(pytest.main(["-v", __file__]))
@@ -0,0 +1,743 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import json
+import os
+import pytest
+try:
+    import pytest_timeout
+except ImportError:
+    pytest_timeout = None
+import sys
+import time
+
+import ray
+import ray.test_utils
+import ray.cluster_utils
+
+
+def test_actor_deletion_with_gpus(shutdown_only):
+    ray.init(
+        num_cpus=1, num_gpus=1, object_store_memory=int(150 * 1024 * 1024))
+
+    # When an actor that uses a GPU exits, make sure that the GPU resources
+    # are released.
+
+    @ray.remote(num_gpus=1)
+    class Actor(object):
+        def getpid(self):
+            return os.getpid()
+
+    for _ in range(5):
+        # If we can successfully create an actor, that means that enough
+        # GPU resources are available.
+        a = Actor.remote()
+        ray.get(a.getpid.remote())
+
+
+def test_actor_state(ray_start_regular):
+    @ray.remote
+    class Counter(object):
+        def __init__(self):
+            self.value = 0
+
+        def increase(self):
+            self.value += 1
+
+        def value(self):
+            return self.value
+
+    c1 = Counter.remote()
+    c1.increase.remote()
+    assert ray.get(c1.value.remote()) == 1
+
+    c2 = Counter.remote()
+    c2.increase.remote()
+    c2.increase.remote()
+    assert ray.get(c2.value.remote()) == 2
+
+
+def test_actor_class_methods(ray_start_regular):
+    class Foo(object):
+        x = 2
+
+        @classmethod
+        def as_remote(cls):
+            return ray.remote(cls)
+
+        @classmethod
+        def f(cls):
+            return cls.x
+
+        @classmethod
+        def g(cls, y):
+            return cls.x + y
+
+        def echo(self, value):
+            return value
+
+    a = Foo.as_remote().remote()
+    assert ray.get(a.echo.remote(2)) == 2
+    assert ray.get(a.f.remote()) == 2
+    assert ray.get(a.g.remote(2)) == 4
+
+
+def test_resource_assignment(shutdown_only):
+    """Test to make sure that we assign resource to actors at instantiation."""
+    # This test will create 16 actors. Declaring this many CPUs initially will
+    # speed up the test because the workers will be started ahead of time.
+    ray.init(
+        num_cpus=16,
+        num_gpus=1,
+        resources={"Custom": 1},
+        object_store_memory=int(150 * 1024 * 1024))
+
+    class Actor(object):
+        def __init__(self):
+            self.resources = ray.get_resource_ids()
+
+        def get_actor_resources(self):
+            return self.resources
+
+        def get_actor_method_resources(self):
+            return ray.get_resource_ids()
+
+    decorator_resource_args = [{}, {
+        "num_cpus": 0.1
+    }, {
+        "num_gpus": 0.1
+    }, {
+        "resources": {
+            "Custom": 0.1
+        }
+    }]
+    instantiation_resource_args = [{}, {
+        "num_cpus": 0.2
+    }, {
+        "num_gpus": 0.2
+    }, {
+        "resources": {
+            "Custom": 0.2
+        }
+    }]
+    for decorator_args in decorator_resource_args:
+        for instantiation_args in instantiation_resource_args:
+            if len(decorator_args) == 0:
+                actor_class = ray.remote(Actor)
+            else:
+                actor_class = ray.remote(**decorator_args)(Actor)
+            actor = actor_class._remote(**instantiation_args)
+            actor_resources = ray.get(actor.get_actor_resources.remote())
+            actor_method_resources = ray.get(
+                actor.get_actor_method_resources.remote())
+            if len(decorator_args) == 0 and len(instantiation_args) == 0:
+                assert len(actor_resources) == 0, (
+                    "Actor should not be assigned resources.")
+                assert list(actor_method_resources.keys()) == [
+                    "CPU"
+                ], ("Actor method should only have CPUs")
+                assert actor_method_resources["CPU"][0][1] == 1, (
+                    "Actor method should default to one cpu.")
+            else:
+                if ("num_cpus" not in decorator_args
+                        and "num_cpus" not in instantiation_args):
+                    assert actor_resources["CPU"][0][1] == 1, (
+                        "Actor should default to one cpu.")
+                correct_resources = {}
+                defined_resources = decorator_args.copy()
+                defined_resources.update(instantiation_args)
+                for resource, value in defined_resources.items():
+                    if resource == "num_cpus":
+                        correct_resources["CPU"] = value
+                    elif resource == "num_gpus":
+                        correct_resources["GPU"] = value
+                    elif resource == "resources":
+                        for custom_resource, amount in value.items():
+                            correct_resources[custom_resource] = amount
+                for resource, amount in correct_resources.items():
+                    assert (actor_resources[resource][0][0] ==
+                            actor_method_resources[resource][0][0]), (
+                                "Should have assigned same {} for both actor ",
+                                "and actor method.".format(resource))
+                    assert (actor_resources[resource][0][
+                        1] == actor_method_resources[resource][0][1]), (
+                            "Should have assigned same amount of {} for both ",
+                            "actor and actor method.".format(resource))
+                    assert actor_resources[resource][0][1] == amount, (
+                        "Actor should have {amount} {resource} but has ",
+                        "{amount} {resource}".format(
+                            amount=amount, resource=resource))
+
+
+@pytest.mark.skipif(
+    os.environ.get("RAY_USE_NEW_GCS") == "on",
+    reason="Failing with new GCS API on Linux.")
+def test_actor_gpus(ray_start_cluster):
+    cluster = ray_start_cluster
+    num_nodes = 3
+    num_gpus_per_raylet = 4
+    for i in range(num_nodes):
+        cluster.add_node(
+            num_cpus=10 * num_gpus_per_raylet, num_gpus=num_gpus_per_raylet)
+    ray.init(address=cluster.address)
+
+    @ray.remote(num_gpus=1)
+    class Actor1(object):
+        def __init__(self):
+            self.gpu_ids = ray.get_gpu_ids()
+
+        def get_location_and_ids(self):
+            assert ray.get_gpu_ids() == self.gpu_ids
+            return (ray.worker.global_worker.node.unique_id,
+                    tuple(self.gpu_ids))
+
+    # Create one actor per GPU.
+    actors = [Actor1.remote() for _ in range(num_nodes * num_gpus_per_raylet)]
+    # Make sure that no two actors are assigned to the same GPU.
+    locations_and_ids = ray.get(
+        [actor.get_location_and_ids.remote() for actor in actors])
+    node_names = {location for location, gpu_id in locations_and_ids}
+    assert len(node_names) == num_nodes
+    location_actor_combinations = []
+    for node_name in node_names:
+        for gpu_id in range(num_gpus_per_raylet):
+            location_actor_combinations.append((node_name, (gpu_id, )))
+    assert set(locations_and_ids) == set(location_actor_combinations)
+
+    # Creating a new actor should fail because all of the GPUs are being
+    # used.
+    a = Actor1.remote()
+    ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
+    assert ready_ids == []
+
+
+def test_actor_multiple_gpus(ray_start_cluster):
+    cluster = ray_start_cluster
+    num_nodes = 3
+    num_gpus_per_raylet = 5
+    for i in range(num_nodes):
+        cluster.add_node(
+            num_cpus=10 * num_gpus_per_raylet, num_gpus=num_gpus_per_raylet)
+    ray.init(address=cluster.address)
+
+    @ray.remote(num_gpus=2)
+    class Actor1(object):
+        def __init__(self):
+            self.gpu_ids = ray.get_gpu_ids()
+
+        def get_location_and_ids(self):
+            assert ray.get_gpu_ids() == self.gpu_ids
+            return (ray.worker.global_worker.node.unique_id,
+                    tuple(self.gpu_ids))
+
+    # Create some actors.
+    actors1 = [Actor1.remote() for _ in range(num_nodes * 2)]
+    # Make sure that no two actors are assigned to the same GPU.
+    locations_and_ids = ray.get(
+        [actor.get_location_and_ids.remote() for actor in actors1])
+    node_names = {location for location, gpu_id in locations_and_ids}
+    assert len(node_names) == num_nodes
+
+    # Keep track of which GPU IDs are being used for each location.
+    gpus_in_use = {node_name: [] for node_name in node_names}
+    for location, gpu_ids in locations_and_ids:
+        gpus_in_use[location].extend(gpu_ids)
+    for node_name in node_names:
+        assert len(set(gpus_in_use[node_name])) == 4
+
+    # Creating a new actor should fail because all of the GPUs are being
+    # used.
+    a = Actor1.remote()
+    ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
+    assert ready_ids == []
+
+    # We should be able to create more actors that use only a single GPU.
+    @ray.remote(num_gpus=1)
+    class Actor2(object):
+        def __init__(self):
+            self.gpu_ids = ray.get_gpu_ids()
+
+        def get_location_and_ids(self):
+            return (ray.worker.global_worker.node.unique_id,
+                    tuple(self.gpu_ids))
+
+    # Create some actors.
+    actors2 = [Actor2.remote() for _ in range(num_nodes)]
+    # Make sure that no two actors are assigned to the same GPU.
+    locations_and_ids = ray.get(
+        [actor.get_location_and_ids.remote() for actor in actors2])
+    names = {location for location, gpu_id in locations_and_ids}
+    assert node_names == names
+    for location, gpu_ids in locations_and_ids:
+        gpus_in_use[location].extend(gpu_ids)
+    for node_name in node_names:
+        assert len(gpus_in_use[node_name]) == 5
+        assert set(gpus_in_use[node_name]) == set(range(5))
+
+    # Creating a new actor should fail because all of the GPUs are being
+    # used.
+    a = Actor2.remote()
+    ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
+    assert ready_ids == []
+
+
+def test_actor_different_numbers_of_gpus(ray_start_cluster):
+    # Test that we can create actors on two nodes that have different
+    # numbers of GPUs.
+    cluster = ray_start_cluster
+    cluster.add_node(num_cpus=10, num_gpus=0)
+    cluster.add_node(num_cpus=10, num_gpus=5)
+    cluster.add_node(num_cpus=10, num_gpus=10)
+    ray.init(address=cluster.address)
+
+    @ray.remote(num_gpus=1)
+    class Actor1(object):
+        def __init__(self):
+            self.gpu_ids = ray.get_gpu_ids()
+
+        def get_location_and_ids(self):
+            return (ray.worker.global_worker.node.unique_id,
+                    tuple(self.gpu_ids))
+
+    # Create some actors.
+    actors = [Actor1.remote() for _ in range(0 + 5 + 10)]
+    # Make sure that no two actors are assigned to the same GPU.
+    locations_and_ids = ray.get(
+        [actor.get_location_and_ids.remote() for actor in actors])
+    node_names = {location for location, gpu_id in locations_and_ids}
+    assert len(node_names) == 2
+    for node_name in node_names:
+        node_gpu_ids = [
+            gpu_id for location, gpu_id in locations_and_ids
+            if location == node_name
+        ]
+        assert len(node_gpu_ids) in [5, 10]
+        assert set(node_gpu_ids) == {(i, ) for i in range(len(node_gpu_ids))}
+
+    # Creating a new actor should fail because all of the GPUs are being
+    # used.
+    a = Actor1.remote()
+    ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
+    assert ready_ids == []
+
+
+def test_actor_multiple_gpus_from_multiple_tasks(ray_start_cluster):
+    cluster = ray_start_cluster
+    num_nodes = 5
+    num_gpus_per_raylet = 5
+    for i in range(num_nodes):
+        cluster.add_node(
+            num_cpus=10 * num_gpus_per_raylet,
+            num_gpus=num_gpus_per_raylet,
+            _internal_config=json.dumps({
+                "num_heartbeats_timeout": 1000
+            }))
+    ray.init(address=cluster.address)
+
+    @ray.remote
+    def create_actors(i, n):
+        @ray.remote(num_gpus=1)
+        class Actor(object):
+            def __init__(self, i, j):
+                self.gpu_ids = ray.get_gpu_ids()
+
+            def get_location_and_ids(self):
+                return ((ray.worker.global_worker.node.unique_id),
+                        tuple(self.gpu_ids))
+
+            def sleep(self):
+                time.sleep(100)
+
+        # Create n actors.
+        actors = []
+        for j in range(n):
+            actors.append(Actor.remote(i, j))
+
+        locations = ray.get(
+            [actor.get_location_and_ids.remote() for actor in actors])
+
+        # Put each actor to sleep for a long time to prevent them from getting
+        # terminated.
+        for actor in actors:
+            actor.sleep.remote()
+
+        return locations
+
+    all_locations = ray.get([
+        create_actors.remote(i, num_gpus_per_raylet) for i in range(num_nodes)
+    ])
+
+    # Make sure that no two actors are assigned to the same GPU.
+    node_names = {
+        location
+        for locations in all_locations for location, gpu_id in locations
+    }
+    assert len(node_names) == num_nodes
+
+    # Keep track of which GPU IDs are being used for each location.
+    gpus_in_use = {node_name: [] for node_name in node_names}
+    for locations in all_locations:
+        for location, gpu_ids in locations:
+            gpus_in_use[location].extend(gpu_ids)
+    for node_name in node_names:
+        assert len(set(gpus_in_use[node_name])) == num_gpus_per_raylet
+
+    @ray.remote(num_gpus=1)
+    class Actor(object):
+        def __init__(self):
+            self.gpu_ids = ray.get_gpu_ids()
+
+        def get_location_and_ids(self):
+            return (ray.worker.global_worker.node.unique_id,
+                    tuple(self.gpu_ids))
+
+    # All the GPUs should be used up now.
+    a = Actor.remote()
+    ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
+    assert ready_ids == []
+
+
+@pytest.mark.skipif(
+    sys.version_info < (3, 0), reason="This test requires Python 3.")
+def test_actors_and_tasks_with_gpus(ray_start_cluster):
+    cluster = ray_start_cluster
+    num_nodes = 3
+    num_gpus_per_raylet = 2
+    for i in range(num_nodes):
+        cluster.add_node(
+            num_cpus=num_gpus_per_raylet, num_gpus=num_gpus_per_raylet)
+    ray.init(address=cluster.address)
+
+    def check_intervals_non_overlapping(list_of_intervals):
+        for i in range(len(list_of_intervals)):
+            for j in range(i):
+                first_interval = list_of_intervals[i]
+                second_interval = list_of_intervals[j]
+                # Check that list_of_intervals[i] and list_of_intervals[j]
+                # don't overlap.
+                assert first_interval[0] < first_interval[1]
+                assert second_interval[0] < second_interval[1]
+                intervals_nonoverlapping = (
+                    first_interval[1] <= second_interval[0]
+                    or second_interval[1] <= first_interval[0])
+                assert intervals_nonoverlapping, (
+                    "Intervals {} and {} are overlapping.".format(
+                        first_interval, second_interval))
+
+    @ray.remote(num_gpus=1)
+    def f1():
+        t1 = time.monotonic()
+        time.sleep(0.1)
+        t2 = time.monotonic()
+        gpu_ids = ray.get_gpu_ids()
+        assert len(gpu_ids) == 1
+        assert gpu_ids[0] in range(num_gpus_per_raylet)
+        return (ray.worker.global_worker.node.unique_id, tuple(gpu_ids),
+                [t1, t2])
+
+    @ray.remote(num_gpus=2)
+    def f2():
+        t1 = time.monotonic()
+        time.sleep(0.1)
+        t2 = time.monotonic()
+        gpu_ids = ray.get_gpu_ids()
+        assert len(gpu_ids) == 2
+        assert gpu_ids[0] in range(num_gpus_per_raylet)
+        assert gpu_ids[1] in range(num_gpus_per_raylet)
+        return (ray.worker.global_worker.node.unique_id, tuple(gpu_ids),
+                [t1, t2])
+
+    @ray.remote(num_gpus=1)
+    class Actor1(object):
+        def __init__(self):
+            self.gpu_ids = ray.get_gpu_ids()
+            assert len(self.gpu_ids) == 1
+            assert self.gpu_ids[0] in range(num_gpus_per_raylet)
+
+        def get_location_and_ids(self):
+            assert ray.get_gpu_ids() == self.gpu_ids
+            return (ray.worker.global_worker.node.unique_id,
+                    tuple(self.gpu_ids))
+
+    def locations_to_intervals_for_many_tasks():
+        # Launch a bunch of GPU tasks.
+        locations_ids_and_intervals = ray.get(
+            [f1.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)] +
+            [f2.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)] +
+            [f1.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)])
+
+        locations_to_intervals = collections.defaultdict(lambda: [])
+        for location, gpu_ids, interval in locations_ids_and_intervals:
+            for gpu_id in gpu_ids:
+                locations_to_intervals[(location, gpu_id)].append(interval)
+        return locations_to_intervals
+
+    # Run a bunch of GPU tasks.
+    locations_to_intervals = locations_to_intervals_for_many_tasks()
+    # For each GPU, verify that the set of tasks that used this specific
+    # GPU did not overlap in time.
+    for locations in locations_to_intervals:
+        check_intervals_non_overlapping(locations_to_intervals[locations])
+
+    # Create an actor that uses a GPU.
+    a = Actor1.remote()
+    actor_location = ray.get(a.get_location_and_ids.remote())
+    actor_location = (actor_location[0], actor_location[1][0])
+    # This check makes sure that actor_location is formatted the same way
+    # that the keys of locations_to_intervals are formatted.
+    assert actor_location in locations_to_intervals
+
+    # Run a bunch of GPU tasks.
+    locations_to_intervals = locations_to_intervals_for_many_tasks()
+    # For each GPU, verify that the set of tasks that used this specific
+    # GPU did not overlap in time.
+    for locations in locations_to_intervals:
+        check_intervals_non_overlapping(locations_to_intervals[locations])
+    # Make sure that the actor's GPU was not used.
+    assert actor_location not in locations_to_intervals
+
+    # Create more actors to fill up all the GPUs.
+    more_actors = [
+        Actor1.remote() for _ in range(num_nodes * num_gpus_per_raylet - 1)
+    ]
+    # Wait for the actors to finish being created.
+    ray.get([actor.get_location_and_ids.remote() for actor in more_actors])
+
+    # Now if we run some GPU tasks, they should not be scheduled.
+    results = [f1.remote() for _ in range(30)]
+    ready_ids, remaining_ids = ray.wait(results, timeout=1.0)
+    assert len(ready_ids) == 0
+
+
+def test_actors_and_tasks_with_gpus_version_two(shutdown_only):
+    # Create tasks and actors that both use GPUs and make sure that they
+    # are given different GPUs
+    num_gpus = 4
+
+    ray.init(
+        num_cpus=(num_gpus + 1),
+        num_gpus=num_gpus,
+        object_store_memory=int(150 * 1024 * 1024))
+
+    # The point of this actor is to record which GPU IDs have been seen. We
+    # can't just return them from the tasks, because the tasks don't return
+    # for a long time in order to make sure the GPU is not released
+    # prematurely.
+    @ray.remote
+    class RecordGPUs(object):
+        def __init__(self):
+            self.gpu_ids_seen = []
+            self.num_calls = 0
+
+        def add_ids(self, gpu_ids):
+            self.gpu_ids_seen += gpu_ids
+            self.num_calls += 1
+
+        def get_gpu_ids_and_calls(self):
+            return self.gpu_ids_seen, self.num_calls
+
+    @ray.remote(num_gpus=1)
+    def f(record_gpu_actor):
+        gpu_ids = ray.get_gpu_ids()
+        assert len(gpu_ids) == 1
+        record_gpu_actor.add_ids.remote(gpu_ids)
+        # Sleep for a long time so that the GPU never gets released. This task
+        # will be killed by ray.shutdown() before it actually finishes.
+        time.sleep(1000)
+
+    @ray.remote(num_gpus=1)
+    class Actor(object):
+        def __init__(self, record_gpu_actor):
+            self.gpu_ids = ray.get_gpu_ids()
+            assert len(self.gpu_ids) == 1
+            record_gpu_actor.add_ids.remote(self.gpu_ids)
+
+        def check_gpu_ids(self):
+            assert ray.get_gpu_ids() == self.gpu_ids
+
+    record_gpu_actor = RecordGPUs.remote()
+
+    actors = []
+    actor_results = []
+    for _ in range(num_gpus // 2):
+        f.remote(record_gpu_actor)
+        a = Actor.remote(record_gpu_actor)
+        actor_results.append(a.check_gpu_ids.remote())
+        # Prevent the actor handle from going out of scope so that its GPU
+        # resources don't get released.
+        actors.append(a)
+
+    # Make sure that the actor method calls succeeded.
+    ray.get(actor_results)
+
+    start_time = time.time()
+    while time.time() - start_time < 30:
+        seen_gpu_ids, num_calls = ray.get(
+            record_gpu_actor.get_gpu_ids_and_calls.remote())
+        if num_calls == num_gpus:
+            break
+    assert set(seen_gpu_ids) == set(range(num_gpus))
+
+
+def test_blocking_actor_task(shutdown_only):
+    ray.init(
+        num_cpus=1, num_gpus=1, object_store_memory=int(150 * 1024 * 1024))
+
+    @ray.remote(num_gpus=1)
+    def f():
+        return 1
+
+    @ray.remote
+    class Foo(object):
+        def __init__(self):
+            pass
+
+        def blocking_method(self):
+            ray.get(f.remote())
+
+    # Make sure we can execute a blocking actor method even if there is
+    # only one CPU.
+    actor = Foo.remote()
+    ray.get(actor.blocking_method.remote())
+
+    @ray.remote(num_cpus=1)
+    class CPUFoo(object):
+        def __init__(self):
+            pass
+
+        def blocking_method(self):
+            ray.get(f.remote())
+
+    # Make sure that lifetime CPU resources are not released when actors
+    # block.
+    actor = CPUFoo.remote()
+    x_id = actor.blocking_method.remote()
+    ready_ids, remaining_ids = ray.wait([x_id], timeout=1.0)
+    assert ready_ids == []
+    assert remaining_ids == [x_id]
+
+    @ray.remote(num_gpus=1)
+    class GPUFoo(object):
+        def __init__(self):
+            pass
+
+        def blocking_method(self):
+            ray.get(f.remote())
+
+    # Make sure that GPU resources are not released when actors block.
+    actor = GPUFoo.remote()
+    x_id = actor.blocking_method.remote()
+    ready_ids, remaining_ids = ray.wait([x_id], timeout=1.0)
+    assert ready_ids == []
+    assert remaining_ids == [x_id]
+
+
+@pytest.mark.skipif(
+    sys.version_info < (3, 0),
+    reason="This test is currently failing on Python 2.7.")
+def test_lifetime_and_transient_resources(ray_start_regular):
+    # This actor acquires resources only when running methods.
+    @ray.remote
+    class Actor1(object):
+        def method(self):
+            pass
+
+    # This actor acquires resources for its lifetime.
+    @ray.remote(num_cpus=1)
+    class Actor2(object):
+        def method(self):
+            pass
+
+    actor1s = [Actor1.remote() for _ in range(10)]
+    ray.get([a.method.remote() for a in actor1s])
+
+    actor2s = [Actor2.remote() for _ in range(2)]
+    results = [a.method.remote() for a in actor2s]
+    ready_ids, remaining_ids = ray.wait(
+        results, num_returns=len(results), timeout=5.0)
+    assert len(ready_ids) == 1
+
+
+def test_custom_label_placement(ray_start_cluster):
+    cluster = ray_start_cluster
+    cluster.add_node(num_cpus=2, resources={"CustomResource1": 2})
+    cluster.add_node(num_cpus=2, resources={"CustomResource2": 2})
+    ray.init(address=cluster.address)
+
+    @ray.remote(resources={"CustomResource1": 1})
+    class ResourceActor1(object):
+        def get_location(self):
+            return ray.worker.global_worker.node.unique_id
+
+    @ray.remote(resources={"CustomResource2": 1})
+    class ResourceActor2(object):
+        def get_location(self):
+            return ray.worker.global_worker.node.unique_id
+
+    node_id = ray.worker.global_worker.node.unique_id
+
+    # Create some actors.
+    actors1 = [ResourceActor1.remote() for _ in range(2)]
+    actors2 = [ResourceActor2.remote() for _ in range(2)]
+    locations1 = ray.get([a.get_location.remote() for a in actors1])
+    locations2 = ray.get([a.get_location.remote() for a in actors2])
+    for location in locations1:
+        assert location == node_id
+    for location in locations2:
+        assert location != node_id
+
+
+def test_creating_more_actors_than_resources(shutdown_only):
+    ray.init(num_cpus=10, num_gpus=2, resources={"CustomResource1": 1})
+
+    @ray.remote(num_gpus=1)
+    class ResourceActor1(object):
+        def method(self):
+            return ray.get_gpu_ids()[0]
+
+    @ray.remote(resources={"CustomResource1": 1})
+    class ResourceActor2(object):
+        def method(self):
+            pass
+
+    # Make sure the first two actors get created and the third one does
+    # not.
+    actor1 = ResourceActor1.remote()
+    result1 = actor1.method.remote()
+    ray.wait([result1])
+    actor2 = ResourceActor1.remote()
+    result2 = actor2.method.remote()
+    ray.wait([result2])
+    actor3 = ResourceActor1.remote()
+    result3 = actor3.method.remote()
+    ready_ids, _ = ray.wait([result3], timeout=0.2)
+    assert len(ready_ids) == 0
+
+    # By deleting actor1, we free up resources to create actor3.
+    del actor1
+
+    results = ray.get([result1, result2, result3])
+    assert results[0] == results[2]
+    assert set(results) == {0, 1}
+
+    # Make sure that when one actor goes out of scope a new actor is
+    # created because some resources have been freed up.
+    results = []
+    for _ in range(3):
+        actor = ResourceActor2.remote()
+        object_id = actor.method.remote()
+        results.append(object_id)
+        # Wait for the task to execute. We do this because otherwise it may
+        # be possible for the __ray_terminate__ task to execute before the
+        # method.
+        ray.wait([object_id])
+
+    ray.get(results)
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -10,7 +10,7 @@ import sys
 import ray
 import ray.experimental.array.remote as ra
 import ray.experimental.array.distributed as da
-import ray.tests.cluster_utils
+import ray.cluster_utils

 if sys.version_info >= (3, 0):
    from importlib import reload
@@ -216,3 +216,9 @@ def test_distributed_array_methods(ray_start_cluster_2_nodes, reload_modules):
        d1 = np.random.randint(1, 35)
        d2 = np.random.randint(1, 35)
        test_dist_qr(d1, d2)
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -17,7 +17,7 @@ from ray.autoscaler.autoscaler import StandardAutoscaler, LoadMetrics, \
 from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_NODE_STATUS, \
    STATUS_UP_TO_DATE, STATUS_UPDATE_FAILED
 from ray.autoscaler.node_provider import NODE_PROVIDERS, NodeProvider
-from ray.tests.utils import RayTestTimeoutException
+from ray.test_utils import RayTestTimeoutException
 import pytest


@@ -1084,4 +1084,5 @@ class AutoscalingTest(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -7,7 +7,7 @@ import unittest
 import yaml

 from ray.autoscaler.autoscaler import fillout_defaults, validate_config
-from ray.tests.utils import recursive_fnmatch
+from ray.test_utils import recursive_fnmatch

 RAY_PATH = os.path.abspath(os.path.join(__file__, "../../"))
 CONFIG_PATHS = recursive_fnmatch(
@@ -31,4 +31,6 @@ class AutoscalingConfigTest(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -1,56 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-
-import ray
-from ray.experimental.streaming.batched_queue import BatchedQueue
-
-
-@ray.remote
-class Reader(object):
-    def __init__(self, queue):
-        self.queue = queue
-        self.num_reads = 0
-        self.start = time.time()
-
-    def read(self, read_slowly):
-        expected_value = 0
-        for _ in range(1000):
-            x = self.queue.read_next()
-            assert x == expected_value, (x, expected_value)
-            expected_value += 1
-            self.num_reads += 1
-            if read_slowly:
-                time.sleep(0.001)
-
-
-def test_batched_queue(ray_start_regular):
-    # Batched queue parameters
-    max_queue_size = 10000  # Max number of batches in queue
-    max_batch_size = 1000  # Max number of elements per batch
-    batch_timeout = 0.001  # 1ms flush timeout
-    prefetch_depth = 10  # Number of batches to prefetch from plasma
-    background_flush = False  # Don't use daemon thread for flushing
-    # Two tests: one with a big queue and slow reader, and
-    # a second one with a small queue and a faster reader
-    for read_slowly in [True, False]:
-        # Construct the batched queue
-        queue = BatchedQueue(
-            max_size=max_queue_size,
-            max_batch_size=max_batch_size,
-            max_batch_time=batch_timeout,
-            prefetch_depth=prefetch_depth,
-            background_flush=background_flush)
-        # Create and start the reader
-        reader = Reader.remote(queue)
-        object_id = reader.read.remote(read_slowly=read_slowly)
-        value = 0
-        for _ in range(1000):
-            queue.put_next(value)
-            value += 1
-        queue._flush_writes()
-        ray.get(object_id)
-        # Test once more with a very small queue size and a faster reader
-        max_queue_size = 10
@@ -13,9 +13,9 @@ import pytest

 import ray
 import ray.ray_constants as ray_constants
-from ray.tests.cluster_utils import Cluster
-from ray.tests.utils import (run_string_as_driver_nonblocking,
-                             RayTestTimeoutException)
+from ray.cluster_utils import Cluster
+from ray.test_utils import (run_string_as_driver_nonblocking,
+                            RayTestTimeoutException)


 # This test checks that when a worker dies in the middle of a get, the plasma
@@ -441,3 +441,8 @@ def test_driver_lives_parallel(ray_start_regular):
        process_info.process.wait()

    # If the driver can reach the tearDown method, then it is still alive.
+
+
+if __name__ == "__main__":
+    import pytest
+    sys.exit(pytest.main(["-v", __file__]))
@@ -1,42 +0,0 @@
-from __future__ import absolute_import, division, print_function
-
-import os
-import unittest
-
-import redis
-
-import ray
-
-
-def parse_client(addr_port_str):
-    address, redis_port = addr_port_str.split(":")
-    return redis.StrictRedis(host=address, port=redis_port)
-
-
-@unittest.skipIf(not os.environ.get("RAY_USE_NEW_GCS", False),
-                 "Tests functionality of the new GCS.")
-class CredisTest(unittest.TestCase):
-    def setUp(self):
-        self.config = ray.init(num_cpus=0)
-
-    def tearDown(self):
-        ray.shutdown()
-
-    def test_credis_started(self):
-        assert "redis_address" in self.config
-        primary = parse_client(self.config["redis_address"])
-        assert primary.ping() is True
-        member = primary.lrange("RedisShards", 0, -1)[0]
-        shard = parse_client(member.decode())
-
-        # Check that primary has loaded credis's master module.
-        chain = primary.execute_command("MASTER.GET_CHAIN")
-        assert len(chain) == 1
-
-        # Check that the shard has loaded credis' member module.
-        assert chain[0] == member
-        assert shard.execute_command("MEMBER.SN") == -1
-
-
-if __name__ == "__main__":
-    unittest.main(verbosity=2)
@@ -54,4 +54,6 @@ class CythonTest(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -47,3 +47,12 @@ def test_raylet_gdb(ray_gdb_start):
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        assert pgrep_command.communicate()[0]
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    # Make subprocess happy in bazel.
+    os.environ["LC_ALL"] = "en_US.UTF-8"
+    os.environ["LANG"] = "en_US.UTF-8"
+    sys.exit(pytest.main(["-v", __file__]))
@@ -6,8 +6,8 @@ import logging
 import time

 import ray
-import ray.tests.cluster_utils
-import ray.tests.utils
+import ray.cluster_utils
+import ray.test_utils

 logger = logging.getLogger(__name__)

@@ -605,3 +605,9 @@ def test_release_cpus_when_actor_creation_task_blocking(shutdown_only):

    result = wait_until(assert_available_resources, 1000)
    assert result is True
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -15,8 +15,8 @@ import redis

 import ray
 import ray.ray_constants as ray_constants
-from ray.tests.cluster_utils import Cluster
-from ray.tests.utils import (
+from ray.cluster_utils import Cluster
+from ray.test_utils import (
    relevant_errors,
    wait_for_errors,
    RayTestTimeoutException,
@@ -903,3 +903,8 @@ def test_direct_call_serialized_id(ray_start_cluster):

    obj = small_object.remote()
    ray.get(get.remote([obj]))
+
+
+if __name__ == "__main__":
+    import pytest
+    sys.exit(pytest.main(["-v", __file__]))
@@ -10,8 +10,8 @@ import logging
 import pytest

 import ray
-import ray.tests.cluster_utils
-import ray.tests.utils
+import ray.cluster_utils
+import ray.test_utils

 logger = logging.getLogger(__name__)

@@ -78,3 +78,9 @@ def test_pending_task_dependency(shutdown_only):
        ray.put(np_array)

    ray.get(oid)
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -78,3 +78,9 @@ def test_add_remove_cluster_resources(ray_start_cluster_head):
        nodes += [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 6
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -202,3 +202,9 @@ def test_channel_generation():
 def test_wordcount():
    """Tests a simple streaming wordcount."""
    pass
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -86,4 +86,6 @@ class TestMemoryLimits(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -152,4 +152,6 @@ class TestMemoryScheduling(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -10,7 +10,7 @@ import time
 import ray
 from ray.core.generated import node_manager_pb2
 from ray.core.generated import node_manager_pb2_grpc
-from ray.tests.utils import RayTestTimeoutException
+from ray.test_utils import RayTestTimeoutException


 def test_worker_stats(ray_start_regular):
@@ -53,5 +53,13 @@ def test_worker_stats(ray_start_regular):
            if p.info["pid"] in pids
        ]
        for process in processes:
-            assert "python" in process or "ray" in process
+            # TODO(ekl) why does travis/mi end up in the process list
+            assert ("python" in process or "ray" in process
+                    or "travis" in process)
        break
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -2,7 +2,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import os
 import pytest
 import time
 import numpy as np
@@ -104,9 +103,11 @@ def test_cache(ray_start_regular):
    d = time.time() - c

    if d > 1.5 * b:
-        if os.getenv("TRAVIS") is None:
-            raise Exception("The caching test was too slow. "
-                            "d = {}, b = {}".format(d, b))
-        else:
-            print("WARNING: The caching test was too slow. "
-                  "d = {}, b = {}".format(d, b))
+        print("WARNING: The caching test was too slow. "
+              "d = {}, b = {}".format(d, b))
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -107,3 +107,12 @@ def test_cleanup_on_driver_exit_single_redis_shard():
 def test_cleanup_on_driver_exit_many_redis_shards():
    _test_cleanup_on_driver_exit(num_redis_shards=5)
    _test_cleanup_on_driver_exit(num_redis_shards=31)
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    # Make subprocess happy in bazel.
+    os.environ["LC_ALL"] = "en_US.UTF-8"
+    os.environ["LANG"] = "en_US.UTF-8"
+    sys.exit(pytest.main(["-v", __file__]))
@@ -8,7 +8,7 @@ import subprocess
 import time

 import ray
-from ray.tests.utils import (
+from ray.test_utils import (
    RayTestTimeoutException,
    run_string_as_driver,
    run_string_as_driver_nonblocking,
@@ -615,3 +615,12 @@ def test_use_pickle(call_ray_start):
        return (3, "world")

    assert ray.get(f.remote(x)) == (3, "world")
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    # Make subprocess happy in bazel.
+    os.environ["LC_ALL"] = "en_US.UTF-8"
+    os.environ["LANG"] = "en_US.UTF-8"
+    sys.exit(pytest.main(["-v", __file__]))
@@ -9,8 +9,8 @@ import time
 import ray
 import ray.ray_constants as ray_constants
 from ray.monitor import Monitor
-from ray.tests.cluster_utils import Cluster
-from ray.tests.conftest import generate_internal_config_map
+from ray.cluster_utils import Cluster
+from ray.test_utils import generate_internal_config_map

 logger = logging.getLogger(__name__)

@@ -221,3 +221,9 @@ def test_worker_plasma_store_failure(ray_start_cluster_head):
    worker.kill_plasma_store()
    worker.all_processes[ray_constants.PROCESS_TYPE_RAYLET][0].process.wait()
    assert not worker.any_processes_alive(), worker.live_processes()
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -0,0 +1,273 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+import signal
+import sys
+import time
+
+import numpy as np
+import pytest
+
+import ray
+import ray.ray_constants as ray_constants
+from ray.cluster_utils import Cluster
+from ray.test_utils import RayTestTimeoutException
+
+
+@pytest.fixture(params=[(1, 4), (4, 4)])
+def ray_start_workers_separate_multinode(request):
+    num_nodes = request.param[0]
+    num_initial_workers = request.param[1]
+    # Start the Ray processes.
+    cluster = Cluster()
+    for _ in range(num_nodes):
+        cluster.add_node(num_cpus=num_initial_workers)
+    ray.init(address=cluster.address)
+
+    yield num_nodes, num_initial_workers
+    # The code after the yield will run as teardown code.
+    ray.shutdown()
+    cluster.shutdown()
+
+
+def test_worker_failed(ray_start_workers_separate_multinode):
+    num_nodes, num_initial_workers = (ray_start_workers_separate_multinode)
+
+    @ray.remote
+    def get_pids():
+        time.sleep(0.25)
+        return os.getpid()
+
+    start_time = time.time()
+    pids = set()
+    while len(pids) < num_nodes * num_initial_workers:
+        new_pids = ray.get([
+            get_pids.remote()
+            for _ in range(2 * num_nodes * num_initial_workers)
+        ])
+        for pid in new_pids:
+            pids.add(pid)
+        if time.time() - start_time > 60:
+            raise RayTestTimeoutException(
+                "Timed out while waiting to get worker PIDs.")
+
+    @ray.remote
+    def f(x):
+        time.sleep(0.5)
+        return x
+
+    # Submit more tasks than there are workers so that all workers and
+    # cores are utilized.
+    object_ids = [f.remote(i) for i in range(num_initial_workers * num_nodes)]
+    object_ids += [f.remote(object_id) for object_id in object_ids]
+    # Allow the tasks some time to begin executing.
+    time.sleep(0.1)
+    # Kill the workers as the tasks execute.
+    for pid in pids:
+        os.kill(pid, signal.SIGKILL)
+        time.sleep(0.1)
+    # Make sure that we either get the object or we get an appropriate
+    # exception.
+    for object_id in object_ids:
+        try:
+            ray.get(object_id)
+        except (ray.exceptions.RayTaskError, ray.exceptions.RayWorkerError):
+            pass
+
+
+def _test_component_failed(cluster, component_type):
+    """Kill a component on all worker nodes and check workload succeeds."""
+    # Submit many tasks with many dependencies.
+    @ray.remote
+    def f(x):
+        return x
+
+    @ray.remote
+    def g(*xs):
+        return 1
+
+    # Kill the component on all nodes except the head node as the tasks
+    # execute. Do this in a loop while submitting tasks between each
+    # component failure.
+    time.sleep(0.1)
+    worker_nodes = cluster.list_all_nodes()[1:]
+    assert len(worker_nodes) > 0
+    for node in worker_nodes:
+        process = node.all_processes[component_type][0].process
+        # Submit a round of tasks with many dependencies.
+        x = 1
+        for _ in range(1000):
+            x = f.remote(x)
+
+        xs = [g.remote(1)]
+        for _ in range(100):
+            xs.append(g.remote(*xs))
+            xs.append(g.remote(1))
+
+        # Kill a component on one of the nodes.
+        process.terminate()
+        time.sleep(1)
+        process.kill()
+        process.wait()
+        assert not process.poll() is None
+
+        # Make sure that we can still get the objects after the
+        # executing tasks died.
+        ray.get(x)
+        ray.get(xs)
+
+
+def check_components_alive(cluster, component_type, check_component_alive):
+    """Check that a given component type is alive on all worker nodes."""
+    worker_nodes = cluster.list_all_nodes()[1:]
+    assert len(worker_nodes) > 0
+    for node in worker_nodes:
+        process = node.all_processes[component_type][0].process
+        if check_component_alive:
+            assert process.poll() is None
+        else:
+            print("waiting for " + component_type + " with PID " +
+                  str(process.pid) + "to terminate")
+            process.wait()
+            print("done waiting for " + component_type + " with PID " +
+                  str(process.pid) + "to terminate")
+            assert not process.poll() is None
+
+
+@pytest.mark.parametrize(
+    "ray_start_cluster", [{
+        "num_cpus": 8,
+        "num_nodes": 4,
+        "_internal_config": json.dumps({
+            "num_heartbeats_timeout": 100
+        }),
+    }],
+    indirect=True)
+def test_raylet_failed(ray_start_cluster):
+    cluster = ray_start_cluster
+    # Kill all raylets on worker nodes.
+    _test_component_failed(cluster, ray_constants.PROCESS_TYPE_RAYLET)
+
+    # The plasma stores should still be alive on the worker nodes.
+    check_components_alive(cluster, ray_constants.PROCESS_TYPE_PLASMA_STORE,
+                           True)
+
+
+@pytest.mark.skipif(
+    os.environ.get("RAY_USE_NEW_GCS") == "on",
+    reason="Hanging with new GCS API.")
+@pytest.mark.parametrize(
+    "ray_start_cluster", [{
+        "num_cpus": 8,
+        "num_nodes": 2,
+        "_internal_config": json.dumps({
+            "num_heartbeats_timeout": 100
+        }),
+    }],
+    indirect=True)
+def test_plasma_store_failed(ray_start_cluster):
+    cluster = ray_start_cluster
+    # Kill all plasma stores on worker nodes.
+    _test_component_failed(cluster, ray_constants.PROCESS_TYPE_PLASMA_STORE)
+
+    # No processes should be left alive on the worker nodes.
+    check_components_alive(cluster, ray_constants.PROCESS_TYPE_PLASMA_STORE,
+                           False)
+    check_components_alive(cluster, ray_constants.PROCESS_TYPE_RAYLET, False)
+
+
+@pytest.mark.parametrize(
+    "ray_start_cluster", [{
+        "num_cpus": 4,
+        "num_nodes": 3,
+        "do_init": True
+    }],
+    indirect=True)
+def test_actor_creation_node_failure(ray_start_cluster):
+    # TODO(swang): Refactor test_raylet_failed, etc to reuse the below code.
+    cluster = ray_start_cluster
+
+    @ray.remote
+    class Child(object):
+        def __init__(self, death_probability):
+            self.death_probability = death_probability
+
+        def ping(self):
+            # Exit process with some probability.
+            exit_chance = np.random.rand()
+            if exit_chance < self.death_probability:
+                sys.exit(-1)
+
+    num_children = 50
+    # Children actors will die about half the time.
+    death_probability = 0.5
+
+    children = [Child.remote(death_probability) for _ in range(num_children)]
+    while len(cluster.list_all_nodes()) > 1:
+        for j in range(2):
+            # Submit some tasks on the actors. About half of the actors will
+            # fail.
+            children_out = [child.ping.remote() for child in children]
+            # Wait a while for all the tasks to complete. This should trigger
+            # reconstruction for any actor creation tasks that were forwarded
+            # to nodes that then failed.
+            ready, _ = ray.wait(
+                children_out, num_returns=len(children_out), timeout=5 * 60.0)
+            assert len(ready) == len(children_out)
+
+            # Replace any actors that died.
+            for i, out in enumerate(children_out):
+                try:
+                    ray.get(out)
+                except ray.exceptions.RayActorError:
+                    children[i] = Child.remote(death_probability)
+        # Remove a node. Any actor creation tasks that were forwarded to this
+        # node must be reconstructed.
+        cluster.remove_node(cluster.list_all_nodes()[-1])
+
+
+@pytest.mark.skipif(
+    os.environ.get("RAY_USE_NEW_GCS") == "on",
+    reason="Hanging with new GCS API.")
+def test_driver_lives_sequential(ray_start_regular):
+    ray.worker._global_node.kill_raylet()
+    ray.worker._global_node.kill_plasma_store()
+    ray.worker._global_node.kill_log_monitor()
+    ray.worker._global_node.kill_monitor()
+    ray.worker._global_node.kill_raylet_monitor()
+
+    # If the driver can reach the tearDown method, then it is still alive.
+
+
+@pytest.mark.skipif(
+    os.environ.get("RAY_USE_NEW_GCS") == "on",
+    reason="Hanging with new GCS API.")
+def test_driver_lives_parallel(ray_start_regular):
+    all_processes = ray.worker._global_node.all_processes
+    process_infos = (all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] +
+                     all_processes[ray_constants.PROCESS_TYPE_RAYLET] +
+                     all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] +
+                     all_processes[ray_constants.PROCESS_TYPE_MONITOR] +
+                     all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR])
+    assert len(process_infos) == 5
+
+    # Kill all the components in parallel.
+    for process_info in process_infos:
+        process_info.process.terminate()
+
+    time.sleep(0.1)
+    for process_info in process_infos:
+        process_info.process.kill()
+
+    for process_info in process_infos:
+        process_info.process.wait()
+
+    # If the driver can reach the tearDown method, then it is still alive.
+
+
+if __name__ == "__main__":
+    import pytest
+    sys.exit(pytest.main(["-v", __file__]))
@@ -3,7 +3,7 @@ from __future__ import division
 from __future__ import print_function

 import ray
-from ray.tests.utils import run_string_as_driver
+from ray.test_utils import run_string_as_driver


 # This tests the queue transitions for infeasible tasks. This has been an issue
@@ -48,3 +48,9 @@ f.remote()
    ray.get([
        f._submit(args=[], kwargs={}, resources={str(i): 1}) for i in range(3)
    ])
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -11,7 +11,7 @@ import time
 import warnings

 import ray
-from ray.tests.cluster_utils import Cluster
+from ray.cluster_utils import Cluster

 # TODO(yuhguo): This test file requires a lot of CPU/memory, and
 # better be put in Jenkins. However, it fails frequently in Jenkins, but
@@ -325,3 +325,9 @@ def test_many_small_transfers(ray_start_cluster_with_resource):
    do_transfers()
    do_transfers()
    do_transfers()
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -237,3 +237,11 @@ def test_session_create_multiple():
        "session-tests/commands-test", session_start,
        ["first", "--a", "*", "--b", "*"])
    assert result.exit_code == 1
+
+
+if __name__ == "__main__":
+    import sys
+    # Make subprocess happy in bazel.
+    os.environ["LC_ALL"] = "en_US.UTF-8"
+    os.environ["LANG"] = "en_US.UTF-8"
+    sys.exit(pytest.main(["-v", __file__]))
@@ -119,3 +119,8 @@ def test_queue(ray_start_regular):
        assert q.get() == item
        size -= 1
        assert q.qsize() == size
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -7,7 +7,7 @@ import pytest
 import redis

 import ray
-from ray.tests.cluster_utils import Cluster
+from ray.cluster_utils import Cluster


@pytest.fixture
@@ -61,3 +61,9 @@ class TestRedisPassword(object):

        object_id = f.remote()
        ray.get(object_id)
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -387,3 +387,9 @@ def test_small_receive_timeout(ray_start_regular):
    result_list = ray.experimental.signal.receive([a], timeout=small_timeout)

    assert len(result_list) == 1
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -2,39 +2,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import json
 import numpy as np
-import os
 import pytest
-import sys
 import time

 import ray
-from ray.tests.cluster_utils import Cluster
-from ray.tests.utils import flat_errors
-import ray.ray_constants as ray_constants
-
-
-@pytest.fixture(params=[1, 4])
-def ray_start_sharded(request):
-    num_redis_shards = request.param
-
-    if os.environ.get("RAY_USE_NEW_GCS") == "on":
-        num_redis_shards = 1
-        # For now, RAY_USE_NEW_GCS supports 1 shard, and credis supports
-        # 1-node chain for that shard only.
-
-    # Start the Ray processes.
-    ray.init(
-        object_store_memory=int(0.5 * 10**9),
-        num_cpus=10,
-        num_redis_shards=num_redis_shards,
-        redis_max_memory=10**7)
-
-    yield None
-
-    # The code after the yield will run as teardown code.
-    ray.shutdown()
+from ray.cluster_utils import Cluster


@pytest.fixture(params=[(1, 4), (4, 4)])
@@ -105,71 +78,6 @@ def test_dependencies(ray_start_combination):
    assert cluster.remaining_processes_alive()


-def test_submitting_many_tasks(ray_start_sharded):
-    @ray.remote
-    def f(x):
-        return 1
-
-    def g(n):
-        x = 1
-        for i in range(n):
-            x = f.remote(x)
-        return x
-
-    ray.get([g(1000) for _ in range(100)])
-    assert ray.services.remaining_processes_alive()
-
-
-def test_submitting_many_actors_to_one(ray_start_sharded):
-    @ray.remote
-    class Actor(object):
-        def __init__(self):
-            pass
-
-        def ping(self):
-            return
-
-    @ray.remote
-    class Worker(object):
-        def __init__(self, actor):
-            self.actor = actor
-
-        def ping(self):
-            return ray.get(self.actor.ping.remote())
-
-    a = Actor.remote()
-    workers = [Worker.remote(a) for _ in range(10)]
-    for _ in range(10):
-        out = ray.get([w.ping.remote() for w in workers])
-        assert out == [None for _ in workers]
-
-
-def test_getting_and_putting(ray_start_sharded):
-    for n in range(8):
-        x = np.zeros(10**n)
-
-        for _ in range(100):
-            ray.put(x)
-
-        x_id = ray.put(x)
-        for _ in range(1000):
-            ray.get(x_id)
-
-    assert ray.services.remaining_processes_alive()
-
-
-def test_getting_many_objects(ray_start_sharded):
-    @ray.remote
-    def f():
-        return 1
-
-    n = 10**4  # TODO(pcm): replace by 10 ** 5 once this is faster.
-    lst = ray.get([f.remote() for _ in range(n)])
-    assert lst == n * [1]
-
-    assert ray.services.remaining_processes_alive()
-
-
 def test_wait(ray_start_combination):
    num_nodes, num_workers_per_scheduler, cluster = ray_start_combination
    num_workers = num_nodes * num_workers_per_scheduler
@@ -197,360 +105,7 @@ def test_wait(ray_start_combination):
    assert cluster.remaining_processes_alive()


-@pytest.fixture(params=[1, 4])
-def ray_start_reconstruction(request):
-    num_nodes = request.param
-
-    plasma_store_memory = int(0.5 * 10**9)
-
-    cluster = Cluster(
-        initialize_head=True,
-        head_node_args={
-            "num_cpus": 1,
-            "object_store_memory": plasma_store_memory // num_nodes,
-            "redis_max_memory": 10**7,
-            "_internal_config": json.dumps({
-                "initial_reconstruction_timeout_milliseconds": 200
-            })
-        })
-    for i in range(num_nodes - 1):
-        cluster.add_node(
-            num_cpus=1,
-            object_store_memory=plasma_store_memory // num_nodes,
-            _internal_config=json.dumps({
-                "initial_reconstruction_timeout_milliseconds": 200
-            }))
-    ray.init(address=cluster.address)
-
-    yield plasma_store_memory, num_nodes, cluster
-
-    # Clean up the Ray cluster.
-    ray.shutdown()
-    cluster.shutdown()
-
-
-@pytest.mark.skipif(
-    os.environ.get("RAY_USE_NEW_GCS") == "on",
-    reason="Failing with new GCS API on Linux.")
-def test_simple(ray_start_reconstruction):
-    plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
-    # Define the size of one task's return argument so that the combined
-    # sum of all objects' sizes is at least twice the plasma stores'
-    # combined allotted memory.
-    num_objects = 100
-    size = int(plasma_store_memory * 1.5 / (num_objects * 8))
-
-    # Define a remote task with no dependencies, which returns a numpy
-    # array of the given size.
-    @ray.remote
-    def foo(i, size):
-        array = np.zeros(size)
-        array[0] = i
-        return array
-
-    # Launch num_objects instances of the remote task.
-    args = []
-    for i in range(num_objects):
-        args.append(foo.remote(i, size))
-
-    # Get each value to force each task to finish. After some number of
-    # gets, old values should be evicted.
-    for i in range(num_objects):
-        value = ray.get(args[i])
-        assert value[0] == i
-    # Get each value again to force reconstruction.
-    for i in range(num_objects):
-        value = ray.get(args[i])
-        assert value[0] == i
-    # Get values sequentially, in chunks.
-    num_chunks = 4 * num_nodes
-    chunk = num_objects // num_chunks
-    for i in range(num_chunks):
-        values = ray.get(args[i * chunk:(i + 1) * chunk])
-        del values
-
-    assert cluster.remaining_processes_alive()
-
-
-def sorted_random_indexes(total, output_num):
-    random_indexes = [np.random.randint(total) for _ in range(output_num)]
-    random_indexes.sort()
-    return random_indexes
-
-
-@pytest.mark.skipif(
-    os.environ.get("RAY_USE_NEW_GCS") == "on",
-    reason="Failing with new GCS API on Linux.")
-def test_recursive(ray_start_reconstruction):
-    plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
-    # Define the size of one task's return argument so that the combined
-    # sum of all objects' sizes is at least twice the plasma stores'
-    # combined allotted memory.
-    num_objects = 100
-    size = int(plasma_store_memory * 1.5 / (num_objects * 8))
-
-    # Define a root task with no dependencies, which returns a numpy array
-    # of the given size.
-    @ray.remote
-    def no_dependency_task(size):
-        array = np.zeros(size)
-        return array
-
-    # Define a task with a single dependency, which returns its one
-    # argument.
-    @ray.remote
-    def single_dependency(i, arg):
-        arg = np.copy(arg)
-        arg[0] = i
-        return arg
-
-    # Launch num_objects instances of the remote task, each dependent on
-    # the one before it.
-    arg = no_dependency_task.remote(size)
-    args = []
-    for i in range(num_objects):
-        arg = single_dependency.remote(i, arg)
-        args.append(arg)
-
-    # Get each value to force each task to finish. After some number of
-    # gets, old values should be evicted.
-    for i in range(num_objects):
-        value = ray.get(args[i])
-        assert value[0] == i
-    # Get each value again to force reconstruction.
-    for i in range(num_objects):
-        value = ray.get(args[i])
-        assert value[0] == i
-    # Get 10 values randomly.
-    random_indexes = sorted_random_indexes(num_objects, 10)
-    for i in random_indexes:
-        value = ray.get(args[i])
-        assert value[0] == i
-    # Get values sequentially, in chunks.
-    num_chunks = 4 * num_nodes
-    chunk = num_objects // num_chunks
-    for i in range(num_chunks):
-        values = ray.get(args[i * chunk:(i + 1) * chunk])
-        del values
-
-    assert cluster.remaining_processes_alive()
-
-
-@pytest.mark.skip(reason="This test often hangs or fails in CI.")
-@pytest.mark.skipif(
-    os.environ.get("RAY_USE_NEW_GCS") == "on",
-    reason="Failing with new GCS API on Linux.")
-def test_multiple_recursive(ray_start_reconstruction):
-    plasma_store_memory, _, cluster = ray_start_reconstruction
-    # Define the size of one task's return argument so that the combined
-    # sum of all objects' sizes is at least twice the plasma stores'
-    # combined allotted memory.
-    num_objects = 100
-    size = plasma_store_memory * 2 // (num_objects * 8)
-
-    # Define a root task with no dependencies, which returns a numpy array
-    # of the given size.
-    @ray.remote
-    def no_dependency_task(size):
-        array = np.zeros(size)
-        return array
-
-    # Define a task with multiple dependencies, which returns its first
-    # argument.
-    @ray.remote
-    def multiple_dependency(i, arg1, arg2, arg3):
-        arg1 = np.copy(arg1)
-        arg1[0] = i
-        return arg1
-
-    # Launch num_args instances of the root task. Then launch num_objects
-    # instances of the multi-dependency remote task, each dependent on the
-    # num_args tasks before it.
-    num_args = 3
-    args = []
-    for i in range(num_args):
-        arg = no_dependency_task.remote(size)
-        args.append(arg)
-    for i in range(num_objects):
-        args.append(multiple_dependency.remote(i, *args[i:i + num_args]))
-
-    # Get each value to force each task to finish. After some number of
-    # gets, old values should be evicted.
-    args = args[num_args:]
-    for i in range(num_objects):
-        value = ray.get(args[i])
-        assert value[0] == i
-    # Get each value again to force reconstruction.
-    for i in range(num_objects):
-        value = ray.get(args[i])
-        assert value[0] == i
-    # Get 10 values randomly.
-    random_indexes = sorted_random_indexes(num_objects, 10)
-    for i in random_indexes:
-        value = ray.get(args[i])
-        assert value[0] == i
-
-    assert cluster.remaining_processes_alive()
-
-
-def wait_for_errors(error_check):
-    # Wait for errors from all the nondeterministic tasks.
-    errors = []
-    time_left = 100
-    while time_left > 0:
-        errors = flat_errors()
-        if error_check(errors):
-            break
-        time_left -= 1
-        time.sleep(1)
-
-    # Make sure that enough errors came through.
-    assert error_check(errors)
-    return errors
-
-
-@pytest.mark.skip("This test does not work yet.")
-@pytest.mark.skipif(
-    os.environ.get("RAY_USE_NEW_GCS") == "on",
-    reason="Failing with new GCS API on Linux.")
-def test_nondeterministic_task(ray_start_reconstruction):
-    plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
-    # Define the size of one task's return argument so that the combined
-    # sum of all objects' sizes is at least twice the plasma stores'
-    # combined allotted memory.
-    num_objects = 1000
-    size = plasma_store_memory * 2 // (num_objects * 8)
-
-    # Define a nondeterministic remote task with no dependencies, which
-    # returns a random numpy array of the given size. This task should
-    # produce an error on the driver if it is ever reexecuted.
-    @ray.remote
-    def foo(i, size):
-        array = np.random.rand(size)
-        array[0] = i
-        return array
-
-    # Define a deterministic remote task with no dependencies, which
-    # returns a numpy array of zeros of the given size.
-    @ray.remote
-    def bar(i, size):
-        array = np.zeros(size)
-        array[0] = i
-        return array
-
-    # Launch num_objects instances, half deterministic and half
-    # nondeterministic.
-    args = []
-    for i in range(num_objects):
-        if i % 2 == 0:
-            args.append(foo.remote(i, size))
-        else:
-            args.append(bar.remote(i, size))
-
-    # Get each value to force each task to finish. After some number of
-    # gets, old values should be evicted.
-    for i in range(num_objects):
-        value = ray.get(args[i])
-        assert value[0] == i
-    # Get each value again to force reconstruction.
-    for i in range(num_objects):
-        value = ray.get(args[i])
-        assert value[0] == i
-
-    def error_check(errors):
-        if num_nodes == 1:
-            # In a single-node setting, each object is evicted and
-            # reconstructed exactly once, so exactly half the objects will
-            # produce an error during reconstruction.
-            min_errors = num_objects // 2
-        else:
-            # In a multinode setting, each object is evicted zero or one
-            # times, so some of the nondeterministic tasks may not be
-            # reexecuted.
-            min_errors = 1
-        return len(errors) >= min_errors
-
-    errors = wait_for_errors(error_check)
-    # Make sure all the errors have the correct type.
-    assert all(error["type"] == ray_constants.HASH_MISMATCH_PUSH_ERROR
-               for error in errors)
-
-    assert cluster.remaining_processes_alive()
-
-
-@pytest.mark.skipif(
-    os.environ.get("RAY_USE_NEW_GCS") == "on",
-    reason="Failing with new GCS API on Linux.")
-@pytest.mark.skipif(
-    sys.version_info < (3, 0), reason="This test requires Python 3.")
-@pytest.mark.parametrize(
-    "ray_start_object_store_memory", [10**9], indirect=True)
-def test_driver_put_errors(ray_start_object_store_memory):
-    plasma_store_memory = ray_start_object_store_memory
-    # Define the size of one task's return argument so that the combined
-    # sum of all objects' sizes is at least twice the plasma stores'
-    # combined allotted memory.
-    num_objects = 100
-    size = plasma_store_memory * 2 // (num_objects * 8)
-
-    # Define a task with a single dependency, a numpy array, that returns
-    # another array.
-    @ray.remote
-    def single_dependency(i, arg):
-        arg = np.copy(arg)
-        arg[0] = i
-        return arg
-
-    # Launch num_objects instances of the remote task, each dependent on
-    # the one before it. The first instance of the task takes a numpy array
-    # as an argument, which is put into the object store.
-    args = []
-    arg = single_dependency.remote(0, np.zeros(size))
-    for i in range(num_objects):
-        arg = single_dependency.remote(i, arg)
-        args.append(arg)
-    # Get each value to force each task to finish. After some number of
-    # gets, old values should be evicted.
-    for i in range(num_objects):
-        value = ray.get(args[i])
-        assert value[0] == i
-
-    # Get each value starting from the beginning to force reconstruction.
-    # Currently, since we're not able to reconstruct `ray.put` objects that
-    # were evicted and whose originating tasks are still running, this
-    # for-loop should hang on its first iteration and push an error to the
-    # driver.
-    ray.worker.global_worker.raylet_client.fetch_or_reconstruct([args[0]],
-                                                                False)
-
-    def error_check(errors):
-        return len(errors) > 1
-
-    errors = wait_for_errors(error_check)
-    assert all(error["type"] == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR
-               or "ray.exceptions.UnreconstructableError" in error["message"]
-               for error in errors)
-
-
-# NOTE(swang): This test tries to launch 1000 workers and breaks.
-# TODO(rkn): This test needs to be updated to use pytest.
-# class WorkerPoolTests(unittest.TestCase):
-#
-#   def tearDown(self):
-#     ray.shutdown()
-#
-#   def testBlockingTasks(self):
-#     @ray.remote
-#     def f(i, j):
-#       return (i, j)
-#
-#     @ray.remote
-#     def g(i):
-#       # Each instance of g submits and blocks on the result of another remote
-#       # task.
-#       object_ids = [f.remote(i, j) for j in range(10)]
-#       return ray.get(object_ids)
-#
-#     ray.init(num_workers=1)
-#     ray.get([g.remote(i) for i in range(1000)])
-#     ray.shutdown()
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -0,0 +1,379 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import numpy as np
+import os
+import pytest
+import sys
+import time
+
+import ray
+from ray.cluster_utils import Cluster
+from ray.test_utils import flat_errors
+import ray.ray_constants as ray_constants
+
+
+@pytest.fixture(params=[1, 4])
+def ray_start_reconstruction(request):
+    num_nodes = request.param
+
+    plasma_store_memory = int(0.5 * 10**9)
+
+    cluster = Cluster(
+        initialize_head=True,
+        head_node_args={
+            "num_cpus": 1,
+            "object_store_memory": plasma_store_memory // num_nodes,
+            "redis_max_memory": 10**7,
+            "_internal_config": json.dumps({
+                "initial_reconstruction_timeout_milliseconds": 200
+            })
+        })
+    for i in range(num_nodes - 1):
+        cluster.add_node(
+            num_cpus=1,
+            object_store_memory=plasma_store_memory // num_nodes,
+            _internal_config=json.dumps({
+                "initial_reconstruction_timeout_milliseconds": 200
+            }))
+    ray.init(address=cluster.address)
+
+    yield plasma_store_memory, num_nodes, cluster
+
+    # Clean up the Ray cluster.
+    ray.shutdown()
+    cluster.shutdown()
+
+
+@pytest.mark.skipif(
+    os.environ.get("RAY_USE_NEW_GCS") == "on",
+    reason="Failing with new GCS API on Linux.")
+def test_simple(ray_start_reconstruction):
+    plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
+    # Define the size of one task's return argument so that the combined
+    # sum of all objects' sizes is at least twice the plasma stores'
+    # combined allotted memory.
+    num_objects = 100
+    size = int(plasma_store_memory * 1.5 / (num_objects * 8))
+
+    # Define a remote task with no dependencies, which returns a numpy
+    # array of the given size.
+    @ray.remote
+    def foo(i, size):
+        array = np.zeros(size)
+        array[0] = i
+        return array
+
+    # Launch num_objects instances of the remote task.
+    args = []
+    for i in range(num_objects):
+        args.append(foo.remote(i, size))
+
+    # Get each value to force each task to finish. After some number of
+    # gets, old values should be evicted.
+    for i in range(num_objects):
+        value = ray.get(args[i])
+        assert value[0] == i
+    # Get each value again to force reconstruction.
+    for i in range(num_objects):
+        value = ray.get(args[i])
+        assert value[0] == i
+    # Get values sequentially, in chunks.
+    num_chunks = 4 * num_nodes
+    chunk = num_objects // num_chunks
+    for i in range(num_chunks):
+        values = ray.get(args[i * chunk:(i + 1) * chunk])
+        del values
+
+    assert cluster.remaining_processes_alive()
+
+
+def sorted_random_indexes(total, output_num):
+    random_indexes = [np.random.randint(total) for _ in range(output_num)]
+    random_indexes.sort()
+    return random_indexes
+
+
+@pytest.mark.skipif(
+    os.environ.get("RAY_USE_NEW_GCS") == "on",
+    reason="Failing with new GCS API on Linux.")
+def test_recursive(ray_start_reconstruction):
+    plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
+    # Define the size of one task's return argument so that the combined
+    # sum of all objects' sizes is at least twice the plasma stores'
+    # combined allotted memory.
+    num_objects = 100
+    size = int(plasma_store_memory * 1.5 / (num_objects * 8))
+
+    # Define a root task with no dependencies, which returns a numpy array
+    # of the given size.
+    @ray.remote
+    def no_dependency_task(size):
+        array = np.zeros(size)
+        return array
+
+    # Define a task with a single dependency, which returns its one
+    # argument.
+    @ray.remote
+    def single_dependency(i, arg):
+        arg = np.copy(arg)
+        arg[0] = i
+        return arg
+
+    # Launch num_objects instances of the remote task, each dependent on
+    # the one before it.
+    arg = no_dependency_task.remote(size)
+    args = []
+    for i in range(num_objects):
+        arg = single_dependency.remote(i, arg)
+        args.append(arg)
+
+    # Get each value to force each task to finish. After some number of
+    # gets, old values should be evicted.
+    for i in range(num_objects):
+        value = ray.get(args[i])
+        assert value[0] == i
+    # Get each value again to force reconstruction.
+    for i in range(num_objects):
+        value = ray.get(args[i])
+        assert value[0] == i
+    # Get 10 values randomly.
+    random_indexes = sorted_random_indexes(num_objects, 10)
+    for i in random_indexes:
+        value = ray.get(args[i])
+        assert value[0] == i
+    # Get values sequentially, in chunks.
+    num_chunks = 4 * num_nodes
+    chunk = num_objects // num_chunks
+    for i in range(num_chunks):
+        values = ray.get(args[i * chunk:(i + 1) * chunk])
+        del values
+
+    assert cluster.remaining_processes_alive()
+
+
+@pytest.mark.skip(reason="This test often hangs or fails in CI.")
+@pytest.mark.skipif(
+    os.environ.get("RAY_USE_NEW_GCS") == "on",
+    reason="Failing with new GCS API on Linux.")
+def test_multiple_recursive(ray_start_reconstruction):
+    plasma_store_memory, _, cluster = ray_start_reconstruction
+    # Define the size of one task's return argument so that the combined
+    # sum of all objects' sizes is at least twice the plasma stores'
+    # combined allotted memory.
+    num_objects = 100
+    size = plasma_store_memory * 2 // (num_objects * 8)
+
+    # Define a root task with no dependencies, which returns a numpy array
+    # of the given size.
+    @ray.remote
+    def no_dependency_task(size):
+        array = np.zeros(size)
+        return array
+
+    # Define a task with multiple dependencies, which returns its first
+    # argument.
+    @ray.remote
+    def multiple_dependency(i, arg1, arg2, arg3):
+        arg1 = np.copy(arg1)
+        arg1[0] = i
+        return arg1
+
+    # Launch num_args instances of the root task. Then launch num_objects
+    # instances of the multi-dependency remote task, each dependent on the
+    # num_args tasks before it.
+    num_args = 3
+    args = []
+    for i in range(num_args):
+        arg = no_dependency_task.remote(size)
+        args.append(arg)
+    for i in range(num_objects):
+        args.append(multiple_dependency.remote(i, *args[i:i + num_args]))
+
+    # Get each value to force each task to finish. After some number of
+    # gets, old values should be evicted.
+    args = args[num_args:]
+    for i in range(num_objects):
+        value = ray.get(args[i])
+        assert value[0] == i
+    # Get each value again to force reconstruction.
+    for i in range(num_objects):
+        value = ray.get(args[i])
+        assert value[0] == i
+    # Get 10 values randomly.
+    random_indexes = sorted_random_indexes(num_objects, 10)
+    for i in random_indexes:
+        value = ray.get(args[i])
+        assert value[0] == i
+
+    assert cluster.remaining_processes_alive()
+
+
+def wait_for_errors(error_check):
+    # Wait for errors from all the nondeterministic tasks.
+    errors = []
+    time_left = 100
+    while time_left > 0:
+        errors = flat_errors()
+        if error_check(errors):
+            break
+        time_left -= 1
+        time.sleep(1)
+
+    # Make sure that enough errors came through.
+    assert error_check(errors)
+    return errors
+
+
+@pytest.mark.skip("This test does not work yet.")
+@pytest.mark.skipif(
+    os.environ.get("RAY_USE_NEW_GCS") == "on",
+    reason="Failing with new GCS API on Linux.")
+def test_nondeterministic_task(ray_start_reconstruction):
+    plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
+    # Define the size of one task's return argument so that the combined
+    # sum of all objects' sizes is at least twice the plasma stores'
+    # combined allotted memory.
+    num_objects = 1000
+    size = plasma_store_memory * 2 // (num_objects * 8)
+
+    # Define a nondeterministic remote task with no dependencies, which
+    # returns a random numpy array of the given size. This task should
+    # produce an error on the driver if it is ever reexecuted.
+    @ray.remote
+    def foo(i, size):
+        array = np.random.rand(size)
+        array[0] = i
+        return array
+
+    # Define a deterministic remote task with no dependencies, which
+    # returns a numpy array of zeros of the given size.
+    @ray.remote
+    def bar(i, size):
+        array = np.zeros(size)
+        array[0] = i
+        return array
+
+    # Launch num_objects instances, half deterministic and half
+    # nondeterministic.
+    args = []
+    for i in range(num_objects):
+        if i % 2 == 0:
+            args.append(foo.remote(i, size))
+        else:
+            args.append(bar.remote(i, size))
+
+    # Get each value to force each task to finish. After some number of
+    # gets, old values should be evicted.
+    for i in range(num_objects):
+        value = ray.get(args[i])
+        assert value[0] == i
+    # Get each value again to force reconstruction.
+    for i in range(num_objects):
+        value = ray.get(args[i])
+        assert value[0] == i
+
+    def error_check(errors):
+        if num_nodes == 1:
+            # In a single-node setting, each object is evicted and
+            # reconstructed exactly once, so exactly half the objects will
+            # produce an error during reconstruction.
+            min_errors = num_objects // 2
+        else:
+            # In a multinode setting, each object is evicted zero or one
+            # times, so some of the nondeterministic tasks may not be
+            # reexecuted.
+            min_errors = 1
+        return len(errors) >= min_errors
+
+    errors = wait_for_errors(error_check)
+    # Make sure all the errors have the correct type.
+    assert all(error["type"] == ray_constants.HASH_MISMATCH_PUSH_ERROR
+               for error in errors)
+
+    assert cluster.remaining_processes_alive()
+
+
+@pytest.mark.skipif(
+    os.environ.get("RAY_USE_NEW_GCS") == "on",
+    reason="Failing with new GCS API on Linux.")
+@pytest.mark.skipif(
+    sys.version_info < (3, 0), reason="This test requires Python 3.")
+@pytest.mark.parametrize(
+    "ray_start_object_store_memory", [10**9], indirect=True)
+def test_driver_put_errors(ray_start_object_store_memory):
+    plasma_store_memory = ray_start_object_store_memory
+    # Define the size of one task's return argument so that the combined
+    # sum of all objects' sizes is at least twice the plasma stores'
+    # combined allotted memory.
+    num_objects = 100
+    size = plasma_store_memory * 2 // (num_objects * 8)
+
+    # Define a task with a single dependency, a numpy array, that returns
+    # another array.
+    @ray.remote
+    def single_dependency(i, arg):
+        arg = np.copy(arg)
+        arg[0] = i
+        return arg
+
+    # Launch num_objects instances of the remote task, each dependent on
+    # the one before it. The first instance of the task takes a numpy array
+    # as an argument, which is put into the object store.
+    args = []
+    arg = single_dependency.remote(0, np.zeros(size))
+    for i in range(num_objects):
+        arg = single_dependency.remote(i, arg)
+        args.append(arg)
+    # Get each value to force each task to finish. After some number of
+    # gets, old values should be evicted.
+    for i in range(num_objects):
+        value = ray.get(args[i])
+        assert value[0] == i
+
+    # Get each value starting from the beginning to force reconstruction.
+    # Currently, since we're not able to reconstruct `ray.put` objects that
+    # were evicted and whose originating tasks are still running, this
+    # for-loop should hang on its first iteration and push an error to the
+    # driver.
+    ray.worker.global_worker.raylet_client.fetch_or_reconstruct([args[0]],
+                                                                False)
+
+    def error_check(errors):
+        return len(errors) > 1
+
+    errors = wait_for_errors(error_check)
+    assert all(error["type"] == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR
+               or "ray.exceptions.UnreconstructableError" in error["message"]
+               for error in errors)
+
+
+# NOTE(swang): This test tries to launch 1000 workers and breaks.
+# TODO(rkn): This test needs to be updated to use pytest.
+# class WorkerPoolTests(unittest.TestCase):
+#
+#   def tearDown(self):
+#     ray.shutdown()
+#
+#   def testBlockingTasks(self):
+#     @ray.remote
+#     def f(i, j):
+#       return (i, j)
+#
+#     @ray.remote
+#     def g(i):
+#       # Each instance of g submits and blocks on the result of another remote
+#       # task.
+#       object_ids = [f.remote(i, j) for j in range(10)]
+#       return ray.get(object_ids)
+#
+#     ray.init(num_workers=1)
+#     ray.get([g.remote(i) for i in range(1000)])
+#     ray.shutdown()
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -0,0 +1,102 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import os
+import pytest
+
+import ray
+
+
+@pytest.fixture(params=[1, 4])
+def ray_start_sharded(request):
+    num_redis_shards = request.param
+
+    if os.environ.get("RAY_USE_NEW_GCS") == "on":
+        num_redis_shards = 1
+        # For now, RAY_USE_NEW_GCS supports 1 shard, and credis supports
+        # 1-node chain for that shard only.
+
+    # Start the Ray processes.
+    ray.init(
+        object_store_memory=int(0.5 * 10**9),
+        num_cpus=10,
+        num_redis_shards=num_redis_shards,
+        redis_max_memory=10**7)
+
+    yield None
+
+    # The code after the yield will run as teardown code.
+    ray.shutdown()
+
+
+def test_submitting_many_tasks(ray_start_sharded):
+    @ray.remote
+    def f(x):
+        return 1
+
+    def g(n):
+        x = 1
+        for i in range(n):
+            x = f.remote(x)
+        return x
+
+    ray.get([g(1000) for _ in range(100)])
+    assert ray.services.remaining_processes_alive()
+
+
+def test_submitting_many_actors_to_one(ray_start_sharded):
+    @ray.remote
+    class Actor(object):
+        def __init__(self):
+            pass
+
+        def ping(self):
+            return
+
+    @ray.remote
+    class Worker(object):
+        def __init__(self, actor):
+            self.actor = actor
+
+        def ping(self):
+            return ray.get(self.actor.ping.remote())
+
+    a = Actor.remote()
+    workers = [Worker.remote(a) for _ in range(10)]
+    for _ in range(10):
+        out = ray.get([w.ping.remote() for w in workers])
+        assert out == [None for _ in workers]
+
+
+def test_getting_and_putting(ray_start_sharded):
+    for n in range(8):
+        x = np.zeros(10**n)
+
+        for _ in range(100):
+            ray.put(x)
+
+        x_id = ray.put(x)
+        for _ in range(1000):
+            ray.get(x_id)
+
+    assert ray.services.remaining_processes_alive()
+
+
+def test_getting_many_objects(ray_start_sharded):
+    @ray.remote
+    def f():
+        return 1
+
+    n = 10**4  # TODO(pcm): replace by 10 ** 5 once this is faster.
+    lst = ray.get([f.remote() for _ in range(n)])
+    assert lst == n * [1]
+
+    assert ray.services.remaining_processes_alive()
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -7,7 +7,7 @@ import shutil
 import time
 import pytest
 import ray
-from ray.tests.cluster_utils import Cluster
+from ray.cluster_utils import Cluster


 def test_conn_cluster():
@@ -150,3 +150,11 @@ def test_session_dir_uniqueness():
        session_dirs.add(ray.worker._global_node.get_session_dir_path)
        ray.shutdown()
    assert len(session_dirs) == 3
+
+
+if __name__ == "__main__":
+    import sys
+    # Make subprocess happy in bazel.
+    os.environ["LC_ALL"] = "en_US.UTF-8"
+    os.environ["LANG"] = "en_US.UTF-8"
+    sys.exit(pytest.main(["-v", __file__]))
@@ -252,3 +252,9 @@ def test_remote_training_loss(ray_start_2_cpus):
    after_acc = sess.run(
        loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100])))
    assert before_acc < after_acc
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -44,4 +44,6 @@ class TestUnreconstructableErrors(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -33,3 +33,9 @@ def test_get_webui(shutdown_only):
    assert node_info["error"] is None
    assert node_info["result"] is not None
    assert isinstance(node_info["timestamp"], float)
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -3,12 +3,14 @@ py_test(
    size = "medium",
    srcs = ["tests/test_actor_reuse.py"],
    tags = ["jenkins_only"],
+    deps = [":tune_lib"],
 )

 py_test(
    name = "test_automl_searcher",
    size = "small",
    srcs = ["tests/test_automl_searcher.py"],
+    deps = [":tune_lib"],
 )

 py_test(
@@ -77,19 +79,52 @@ py_test(
    deps = [":tune_lib"],
 )

+py_test(
+    name = "test_run_experiment",
+    size = "medium",
+    srcs = ["tests/test_run_experiment.py"],
+    deps = [":tune_lib"],
+    tags = ["exclusive"],
+)
+
 py_test(
    name = "test_trial_runner",
-    size = "large",
+    size = "medium",
    srcs = ["tests/test_trial_runner.py"],
    deps = [":tune_lib"],
    tags = ["exclusive"],
 )

+py_test(
+    name = "test_var",
+    size = "medium",
+    srcs = ["tests/test_var.py"],
+    deps = [":tune_lib"],
+    tags = ["exclusive"],
+)
+
+py_test(
+    name = "test_api",
+    size = "medium",
+    srcs = ["tests/test_api.py"],
+    deps = [":tune_lib"],
+    tags = ["exclusive"],
+)
+
+py_test(
+    name = "test_sync",
+    size = "medium",
+    srcs = ["tests/test_sync.py"],
+    deps = [":tune_lib"],
+    tags = ["exclusive"],
+)
+ 
 py_test(
    name = "test_trial_scheduler",
    size = "medium",
    srcs = ["tests/test_trial_scheduler.py"],
    deps = [":tune_lib"],
+    tags = ["exclusive"],
 )

 py_test(
@@ -113,11 +148,12 @@ py_test(
    size = "medium",
    srcs = ["tests/test_tune_server.py"],
    deps = [":tune_lib"],
+    tags = ["exclusive"],
 )

 # This is a dummy test dependency that causes the above tests to be
 # re-run if any of these files changes.
 py_library(
-    name="tune_lib",
+    name = "tune_lib",
    srcs = glob(["**/*.py"], exclude=["tests/*.py"]),
 )
@@ -96,4 +96,6 @@ class ActorReuseTest(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -0,0 +1,797 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import os
+import time
+import unittest
+
+import ray
+from ray.rllib import _register_all
+
+from ray import tune
+from ray.tune import Trainable, TuneError
+from ray.tune import register_env, register_trainable, run_experiments
+from ray.tune.schedulers import TrialScheduler, FIFOScheduler
+from ray.tune.trial import Trial
+from ray.tune.result import (TIMESTEPS_TOTAL, DONE, HOSTNAME, NODE_IP, PID,
+                             EPISODES_TOTAL, TRAINING_ITERATION,
+                             TIMESTEPS_THIS_ITER, TIME_THIS_ITER_S,
+                             TIME_TOTAL_S, TRIAL_ID, EXPERIMENT_TAG)
+from ray.tune.logger import Logger
+from ray.tune.util import pin_in_object_store, get_pinned_object, flatten_dict
+from ray.tune.experiment import Experiment
+from ray.tune.resources import Resources
+from ray.tune.suggest import grid_search
+from ray.tune.suggest.suggestion import _MockSuggestionAlgorithm
+
+
+class TrainableFunctionApiTest(unittest.TestCase):
+    def setUp(self):
+        ray.init(num_cpus=4, num_gpus=0, object_store_memory=150 * 1024 * 1024)
+
+    def tearDown(self):
+        ray.shutdown()
+        _register_all()  # re-register the evicted objects
+
+    def checkAndReturnConsistentLogs(self, results, sleep_per_iter=None):
+        """Checks logging is the same between APIs.
+
+        Ignore "DONE" for logging but checks that the
+        scheduler is notified properly with the last result.
+        """
+        class_results = copy.deepcopy(results)
+        function_results = copy.deepcopy(results)
+
+        class_output = []
+        function_output = []
+        scheduler_notif = []
+
+        class MockScheduler(FIFOScheduler):
+            def on_trial_complete(self, runner, trial, result):
+                scheduler_notif.append(result)
+
+        class ClassAPILogger(Logger):
+            def on_result(self, result):
+                class_output.append(result)
+
+        class FunctionAPILogger(Logger):
+            def on_result(self, result):
+                function_output.append(result)
+
+        class _WrappedTrainable(Trainable):
+            def _setup(self, config):
+                del config
+                self._result_iter = copy.deepcopy(class_results)
+
+            def _train(self):
+                if sleep_per_iter:
+                    time.sleep(sleep_per_iter)
+                res = self._result_iter.pop(0)  # This should not fail
+                if not self._result_iter:  # Mark "Done" for last result
+                    res[DONE] = True
+                return res
+
+        def _function_trainable(config, reporter):
+            for result in function_results:
+                if sleep_per_iter:
+                    time.sleep(sleep_per_iter)
+                reporter(**result)
+
+        class_trainable_name = "class_trainable"
+        register_trainable(class_trainable_name, _WrappedTrainable)
+
+        trials = run_experiments(
+            {
+                "function_api": {
+                    "run": _function_trainable,
+                    "loggers": [FunctionAPILogger],
+                },
+                "class_api": {
+                    "run": class_trainable_name,
+                    "loggers": [ClassAPILogger],
+                },
+            },
+            raise_on_failed_trial=False,
+            scheduler=MockScheduler())
+
+        # Ignore these fields
+        NO_COMPARE_FIELDS = {
+            HOSTNAME,
+            NODE_IP,
+            TRIAL_ID,
+            EXPERIMENT_TAG,
+            PID,
+            TIME_THIS_ITER_S,
+            TIME_TOTAL_S,
+            DONE,  # This is ignored because FunctionAPI has different handling
+            "timestamp",
+            "time_since_restore",
+            "experiment_id",
+            "date",
+        }
+
+        self.assertEqual(len(class_output), len(results))
+        self.assertEqual(len(function_output), len(results))
+
+        def as_comparable_result(result):
+            return {
+                k: v
+                for k, v in result.items() if k not in NO_COMPARE_FIELDS
+            }
+
+        function_comparable = [
+            as_comparable_result(result) for result in function_output
+        ]
+        class_comparable = [
+            as_comparable_result(result) for result in class_output
+        ]
+
+        self.assertEqual(function_comparable, class_comparable)
+
+        self.assertEqual(sum(t.get(DONE) for t in scheduler_notif), 2)
+        self.assertEqual(
+            as_comparable_result(scheduler_notif[0]),
+            as_comparable_result(scheduler_notif[1]))
+
+        # Make sure the last result is the same.
+        self.assertEqual(
+            as_comparable_result(trials[0].last_result),
+            as_comparable_result(trials[1].last_result))
+
+        return function_output, trials
+
+    def testPinObject(self):
+        X = pin_in_object_store("hello")
+
+        @ray.remote
+        def f():
+            return get_pinned_object(X)
+
+        self.assertEqual(ray.get(f.remote()), "hello")
+
+    def testFetchPinned(self):
+        X = pin_in_object_store("hello")
+
+        def train(config, reporter):
+            get_pinned_object(X)
+            reporter(timesteps_total=100, done=True)
+
+        register_trainable("f1", train)
+        [trial] = run_experiments({
+            "foo": {
+                "run": "f1",
+            }
+        })
+        self.assertEqual(trial.status, Trial.TERMINATED)
+        self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 100)
+
+    def testRegisterEnv(self):
+        register_env("foo", lambda: None)
+        self.assertRaises(TypeError, lambda: register_env("foo", 2))
+
+    def testRegisterEnvOverwrite(self):
+        def train(config, reporter):
+            reporter(timesteps_total=100, done=True)
+
+        def train2(config, reporter):
+            reporter(timesteps_total=200, done=True)
+
+        register_trainable("f1", train)
+        register_trainable("f1", train2)
+        [trial] = run_experiments({
+            "foo": {
+                "run": "f1",
+            }
+        })
+        self.assertEqual(trial.status, Trial.TERMINATED)
+        self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 200)
+
+    def testRegisterTrainable(self):
+        def train(config, reporter):
+            pass
+
+        class A(object):
+            pass
+
+        class B(Trainable):
+            pass
+
+        register_trainable("foo", train)
+        Experiment("test", train)
+        register_trainable("foo", B)
+        Experiment("test", B)
+        self.assertRaises(TypeError, lambda: register_trainable("foo", B()))
+        self.assertRaises(TuneError, lambda: Experiment("foo", B()))
+        self.assertRaises(TypeError, lambda: register_trainable("foo", A))
+        self.assertRaises(TypeError, lambda: Experiment("foo", A))
+
+    def testTrainableCallable(self):
+        def dummy_fn(config, reporter, steps):
+            reporter(timesteps_total=steps, done=True)
+
+        from functools import partial
+        steps = 500
+        register_trainable("test", partial(dummy_fn, steps=steps))
+        [trial] = run_experiments({
+            "foo": {
+                "run": "test",
+            }
+        })
+        self.assertEqual(trial.status, Trial.TERMINATED)
+        self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], steps)
+        [trial] = tune.run(partial(dummy_fn, steps=steps)).trials
+        self.assertEqual(trial.status, Trial.TERMINATED)
+        self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], steps)
+
+    def testBuiltInTrainableResources(self):
+        class B(Trainable):
+            @classmethod
+            def default_resource_request(cls, config):
+                return Resources(cpu=config["cpu"], gpu=config["gpu"])
+
+            def _train(self):
+                return {"timesteps_this_iter": 1, "done": True}
+
+        register_trainable("B", B)
+
+        def f(cpus, gpus, queue_trials):
+            return run_experiments(
+                {
+                    "foo": {
+                        "run": "B",
+                        "config": {
+                            "cpu": cpus,
+                            "gpu": gpus,
+                        },
+                    }
+                },
+                queue_trials=queue_trials)[0]
+
+        # Should all succeed
+        self.assertEqual(f(0, 0, False).status, Trial.TERMINATED)
+        self.assertEqual(f(1, 0, True).status, Trial.TERMINATED)
+        self.assertEqual(f(1, 0, True).status, Trial.TERMINATED)
+
+        # Too large resource request
+        self.assertRaises(TuneError, lambda: f(100, 100, False))
+        self.assertRaises(TuneError, lambda: f(0, 100, False))
+        self.assertRaises(TuneError, lambda: f(100, 0, False))
+
+        # TODO(ekl) how can we test this is queued (hangs)?
+        # f(100, 0, True)
+
+    def testRewriteEnv(self):
+        def train(config, reporter):
+            reporter(timesteps_total=1)
+
+        register_trainable("f1", train)
+
+        [trial] = run_experiments({
+            "foo": {
+                "run": "f1",
+                "env": "CartPole-v0",
+            }
+        })
+        self.assertEqual(trial.config["env"], "CartPole-v0")
+
+    def testConfigPurity(self):
+        def train(config, reporter):
+            assert config == {"a": "b"}, config
+            reporter(timesteps_total=1)
+
+        register_trainable("f1", train)
+        run_experiments({
+            "foo": {
+                "run": "f1",
+                "config": {
+                    "a": "b"
+                },
+            }
+        })
+
+    def testLogdir(self):
+        def train(config, reporter):
+            assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd()
+            reporter(timesteps_total=1)
+
+        register_trainable("f1", train)
+        run_experiments({
+            "foo": {
+                "run": "f1",
+                "local_dir": "/tmp/logdir",
+                "config": {
+                    "a": "b"
+                },
+            }
+        })
+
+    def testLogdirStartingWithTilde(self):
+        local_dir = "~/ray_results/local_dir"
+
+        def train(config, reporter):
+            cwd = os.getcwd()
+            assert cwd.startswith(os.path.expanduser(local_dir)), cwd
+            assert not cwd.startswith("~"), cwd
+            reporter(timesteps_total=1)
+
+        register_trainable("f1", train)
+        run_experiments({
+            "foo": {
+                "run": "f1",
+                "local_dir": local_dir,
+                "config": {
+                    "a": "b"
+                },
+            }
+        })
+
+    def testLongFilename(self):
+        def train(config, reporter):
+            assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd()
+            reporter(timesteps_total=1)
+
+        register_trainable("f1", train)
+        run_experiments({
+            "foo": {
+                "run": "f1",
+                "local_dir": "/tmp/logdir",
+                "config": {
+                    "a" * 50: tune.sample_from(lambda spec: 5.0 / 7),
+                    "b" * 50: tune.sample_from(lambda spec: "long" * 40),
+                },
+            }
+        })
+
+    def testBadParams(self):
+        def f():
+            run_experiments({"foo": {}})
+
+        self.assertRaises(TuneError, f)
+
+    def testBadParams2(self):
+        def f():
+            run_experiments({
+                "foo": {
+                    "run": "asdf",
+                    "bah": "this param is not allowed",
+                }
+            })
+
+        self.assertRaises(TuneError, f)
+
+    def testBadParams3(self):
+        def f():
+            run_experiments({
+                "foo": {
+                    "run": grid_search("invalid grid search"),
+                }
+            })
+
+        self.assertRaises(TuneError, f)
+
+    def testBadParams4(self):
+        def f():
+            run_experiments({
+                "foo": {
+                    "run": "asdf",
+                }
+            })
+
+        self.assertRaises(TuneError, f)
+
+    def testBadParams5(self):
+        def f():
+            run_experiments({"foo": {"run": "PPO", "stop": {"asdf": 1}}})
+
+        self.assertRaises(TuneError, f)
+
+    def testBadParams6(self):
+        def f():
+            run_experiments({
+                "foo": {
+                    "run": "PPO",
+                    "resources_per_trial": {
+                        "asdf": 1
+                    }
+                }
+            })
+
+        self.assertRaises(TuneError, f)
+
+    def testBadStoppingReturn(self):
+        def train(config, reporter):
+            reporter()
+
+        register_trainable("f1", train)
+
+        def f():
+            run_experiments({
+                "foo": {
+                    "run": "f1",
+                    "stop": {
+                        "time": 10
+                    },
+                }
+            })
+
+        self.assertRaises(TuneError, f)
+
+    def testNestedStoppingReturn(self):
+        def train(config, reporter):
+            for i in range(10):
+                reporter(test={"test1": {"test2": i}})
+
+        with self.assertRaises(TuneError):
+            [trial] = tune.run(
+                train, stop={
+                    "test": {
+                        "test1": {
+                            "test2": 6
+                        }
+                    }
+                }).trials
+        [trial] = tune.run(train, stop={"test/test1/test2": 6}).trials
+        self.assertEqual(trial.last_result["training_iteration"], 7)
+
+    def testStoppingFunction(self):
+        def train(config, reporter):
+            for i in range(10):
+                reporter(test=i)
+
+        def stop(trial_id, result):
+            return result["test"] > 6
+
+        [trial] = tune.run(train, stop=stop).trials
+        self.assertEqual(trial.last_result["training_iteration"], 8)
+
+    def testStoppingMemberFunction(self):
+        def train(config, reporter):
+            for i in range(10):
+                reporter(test=i)
+
+        class Stopper:
+            def stop(self, trial_id, result):
+                return result["test"] > 6
+
+        [trial] = tune.run(train, stop=Stopper().stop).trials
+        self.assertEqual(trial.last_result["training_iteration"], 8)
+
+    def testBadStoppingFunction(self):
+        def train(config, reporter):
+            for i in range(10):
+                reporter(test=i)
+
+        class Stopper:
+            def stop(self, result):
+                return result["test"] > 6
+
+        def stop(result):
+            return result["test"] > 6
+
+        with self.assertRaises(ValueError):
+            tune.run(train, stop=Stopper().stop)
+        with self.assertRaises(ValueError):
+            tune.run(train, stop=stop)
+
+    def testEarlyReturn(self):
+        def train(config, reporter):
+            reporter(timesteps_total=100, done=True)
+            time.sleep(99999)
+
+        register_trainable("f1", train)
+        [trial] = run_experiments({
+            "foo": {
+                "run": "f1",
+            }
+        })
+        self.assertEqual(trial.status, Trial.TERMINATED)
+        self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 100)
+
+    def testReporterNoUsage(self):
+        def run_task(config, reporter):
+            print("hello")
+
+        experiment = Experiment(run=run_task, name="ray_crash_repro")
+        [trial] = ray.tune.run(experiment).trials
+        print(trial.last_result)
+        self.assertEqual(trial.last_result[DONE], True)
+
+    def testErrorReturn(self):
+        def train(config, reporter):
+            raise Exception("uh oh")
+
+        register_trainable("f1", train)
+
+        def f():
+            run_experiments({
+                "foo": {
+                    "run": "f1",
+                }
+            })
+
+        self.assertRaises(TuneError, f)
+
+    def testSuccess(self):
+        def train(config, reporter):
+            for i in range(100):
+                reporter(timesteps_total=i)
+
+        register_trainable("f1", train)
+        [trial] = run_experiments({
+            "foo": {
+                "run": "f1",
+            }
+        })
+        self.assertEqual(trial.status, Trial.TERMINATED)
+        self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99)
+
+    def testNoRaiseFlag(self):
+        def train(config, reporter):
+            raise Exception()
+
+        register_trainable("f1", train)
+
+        [trial] = run_experiments(
+            {
+                "foo": {
+                    "run": "f1",
+                }
+            }, raise_on_failed_trial=False)
+        self.assertEqual(trial.status, Trial.ERROR)
+
+    def testReportInfinity(self):
+        def train(config, reporter):
+            for i in range(100):
+                reporter(mean_accuracy=float("inf"))
+
+        register_trainable("f1", train)
+        [trial] = run_experiments({
+            "foo": {
+                "run": "f1",
+            }
+        })
+        self.assertEqual(trial.status, Trial.TERMINATED)
+        self.assertEqual(trial.last_result["mean_accuracy"], float("inf"))
+
+    def testNestedResults(self):
+        def create_result(i):
+            return {"test": {"1": {"2": {"3": i, "4": False}}}}
+
+        flattened_keys = list(flatten_dict(create_result(0)))
+
+        class _MockScheduler(FIFOScheduler):
+            results = []
+
+            def on_trial_result(self, trial_runner, trial, result):
+                self.results += [result]
+                return TrialScheduler.CONTINUE
+
+            def on_trial_complete(self, trial_runner, trial, result):
+                self.complete_result = result
+
+        def train(config, reporter):
+            for i in range(100):
+                reporter(**create_result(i))
+
+        algo = _MockSuggestionAlgorithm()
+        scheduler = _MockScheduler()
+        [trial] = tune.run(
+            train,
+            scheduler=scheduler,
+            search_alg=algo,
+            stop={
+                "test/1/2/3": 20
+            }).trials
+        self.assertEqual(trial.status, Trial.TERMINATED)
+        self.assertEqual(trial.last_result["test"]["1"]["2"]["3"], 20)
+        self.assertEqual(trial.last_result["test"]["1"]["2"]["4"], False)
+        self.assertEqual(trial.last_result[TRAINING_ITERATION], 21)
+        self.assertEqual(len(scheduler.results), 20)
+        self.assertTrue(
+            all(
+                set(result) >= set(flattened_keys)
+                for result in scheduler.results))
+        self.assertTrue(set(scheduler.complete_result) >= set(flattened_keys))
+        self.assertEqual(len(algo.results), 20)
+        self.assertTrue(
+            all(set(result) >= set(flattened_keys) for result in algo.results))
+        with self.assertRaises(TuneError):
+            [trial] = tune.run(train, stop={"1/2/3": 20})
+        with self.assertRaises(TuneError):
+            [trial] = tune.run(train, stop={"test": 1}).trials
+
+    def testReportTimeStep(self):
+        # Test that no timestep count are logged if never the Trainable never
+        # returns any.
+        results1 = [dict(mean_accuracy=5, done=i == 99) for i in range(100)]
+        logs1, _ = self.checkAndReturnConsistentLogs(results1)
+
+        self.assertTrue(all(log[TIMESTEPS_TOTAL] is None for log in logs1))
+
+        # Test that no timesteps_this_iter are logged if only timesteps_total
+        # are returned.
+        results2 = [dict(timesteps_total=5, done=i == 9) for i in range(10)]
+        logs2, _ = self.checkAndReturnConsistentLogs(results2)
+
+        # Re-run the same trials but with added delay. This is to catch some
+        # inconsistent timestep counting that was present in the multi-threaded
+        # FunctionRunner. This part of the test can be removed once the
+        # multi-threaded FunctionRunner is removed from ray/tune.
+        # TODO: remove once the multi-threaded function runner is gone.
+        logs2, _ = self.checkAndReturnConsistentLogs(results2, 0.5)
+
+        # check all timesteps_total report the same value
+        self.assertTrue(all(log[TIMESTEPS_TOTAL] == 5 for log in logs2))
+        # check that none of the logs report timesteps_this_iter
+        self.assertFalse(
+            any(hasattr(log, TIMESTEPS_THIS_ITER) for log in logs2))
+
+        # Test that timesteps_total and episodes_total are reported when
+        # timesteps_this_iter and episodes_this_iter despite only return zeros.
+        results3 = [
+            dict(timesteps_this_iter=0, episodes_this_iter=0)
+            for i in range(10)
+        ]
+        logs3, _ = self.checkAndReturnConsistentLogs(results3)
+
+        self.assertTrue(all(log[TIMESTEPS_TOTAL] == 0 for log in logs3))
+        self.assertTrue(all(log[EPISODES_TOTAL] == 0 for log in logs3))
+
+        # Test that timesteps_total and episodes_total are properly counted
+        # when timesteps_this_iter and episodes_this_iter report non-zero
+        # values.
+        results4 = [
+            dict(timesteps_this_iter=3, episodes_this_iter=i)
+            for i in range(10)
+        ]
+        logs4, _ = self.checkAndReturnConsistentLogs(results4)
+
+        # The last reported result should not be double-logged.
+        self.assertEqual(logs4[-1][TIMESTEPS_TOTAL], 30)
+        self.assertNotEqual(logs4[-2][TIMESTEPS_TOTAL],
+                            logs4[-1][TIMESTEPS_TOTAL])
+        self.assertEqual(logs4[-1][EPISODES_TOTAL], 45)
+        self.assertNotEqual(logs4[-2][EPISODES_TOTAL],
+                            logs4[-1][EPISODES_TOTAL])
+
+    def testAllValuesReceived(self):
+        results1 = [
+            dict(timesteps_total=(i + 1), my_score=i**2, done=i == 4)
+            for i in range(5)
+        ]
+
+        logs1, _ = self.checkAndReturnConsistentLogs(results1)
+
+        # check if the correct number of results were reported
+        self.assertEqual(len(logs1), len(results1))
+
+        def check_no_missing(reported_result, result):
+            common_results = [reported_result[k] == result[k] for k in result]
+            return all(common_results)
+
+        # check that no result was dropped or modified
+        complete_results = [
+            check_no_missing(log, result)
+            for log, result in zip(logs1, results1)
+        ]
+        self.assertTrue(all(complete_results))
+
+        # check if done was logged exactly once
+        self.assertEqual(len([r for r in logs1 if r.get("done")]), 1)
+
+    def testNoDoneReceived(self):
+        # repeat same test but without explicitly reporting done=True
+        results1 = [
+            dict(timesteps_total=(i + 1), my_score=i**2) for i in range(5)
+        ]
+
+        logs1, trials = self.checkAndReturnConsistentLogs(results1)
+
+        # check if the correct number of results were reported.
+        self.assertEqual(len(logs1), len(results1))
+
+        def check_no_missing(reported_result, result):
+            common_results = [reported_result[k] == result[k] for k in result]
+            return all(common_results)
+
+        # check that no result was dropped or modified
+        complete_results1 = [
+            check_no_missing(log, result)
+            for log, result in zip(logs1, results1)
+        ]
+        self.assertTrue(all(complete_results1))
+
+    def testCheckpointDict(self):
+        class TestTrain(Trainable):
+            def _setup(self, config):
+                self.state = {"hi": 1}
+
+            def _train(self):
+                return {"timesteps_this_iter": 1, "done": True}
+
+            def _save(self, path):
+                return self.state
+
+            def _restore(self, state):
+                self.state = state
+
+        test_trainable = TestTrain()
+        result = test_trainable.save()
+        test_trainable.state["hi"] = 2
+        test_trainable.restore(result)
+        self.assertEqual(test_trainable.state["hi"], 1)
+
+        trials = run_experiments({
+            "foo": {
+                "run": TestTrain,
+                "checkpoint_at_end": True
+            }
+        })
+        for trial in trials:
+            self.assertEqual(trial.status, Trial.TERMINATED)
+            self.assertTrue(trial.has_checkpoint())
+
+    def testMultipleCheckpoints(self):
+        class TestTrain(Trainable):
+            def _setup(self, config):
+                self.state = {"hi": 1, "iter": 0}
+
+            def _train(self):
+                self.state["iter"] += 1
+                return {"timesteps_this_iter": 1, "done": True}
+
+            def _save(self, path):
+                return self.state
+
+            def _restore(self, state):
+                self.state = state
+
+        test_trainable = TestTrain()
+        checkpoint_1 = test_trainable.save()
+        test_trainable.train()
+        checkpoint_2 = test_trainable.save()
+        self.assertNotEqual(checkpoint_1, checkpoint_2)
+        test_trainable.restore(checkpoint_2)
+        self.assertEqual(test_trainable.state["iter"], 1)
+        test_trainable.restore(checkpoint_1)
+        self.assertEqual(test_trainable.state["iter"], 0)
+
+        trials = run_experiments({
+            "foo": {
+                "run": TestTrain,
+                "checkpoint_at_end": True
+            }
+        })
+        for trial in trials:
+            self.assertEqual(trial.status, Trial.TERMINATED)
+            self.assertTrue(trial.has_checkpoint())
+
+    def testIterationCounter(self):
+        def train(config, reporter):
+            for i in range(100):
+                reporter(itr=i, timesteps_this_iter=1)
+
+        register_trainable("exp", train)
+        config = {
+            "my_exp": {
+                "run": "exp",
+                "config": {
+                    "iterations": 100,
+                },
+                "stop": {
+                    "timesteps_total": 100
+                },
+            }
+        }
+        [trial] = run_experiments(config)
+        self.assertEqual(trial.status, Trial.TERMINATED)
+        self.assertEqual(trial.last_result[TRAINING_ITERATION], 100)
+        self.assertEqual(trial.last_result["itr"], 99)
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -70,4 +70,6 @@ class AutoMLSearcherTest(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -109,4 +109,4 @@ class CheckpointManagerTest(unittest.TestCase):
 if __name__ == "__main__":
    import pytest
    import sys
-    sys.exit(pytest.main(["-v", "-s", __file__]))
+    sys.exit(pytest.main(["-v", __file__]))
@@ -13,8 +13,8 @@ import sys
 import ray
 from ray import tune
 from ray.rllib import _register_all
-from ray.tests.cluster_utils import Cluster
-from ray.tests.utils import run_string_as_driver_nonblocking
+from ray.cluster_utils import Cluster
+from ray.test_utils import run_string_as_driver_nonblocking
 from ray.tune.error import TuneError
 from ray.tune.ray_trial_executor import RayTrialExecutor
 from ray.tune.experiment import Experiment
@@ -598,4 +598,4 @@ tune.run(
 if __name__ == "__main__":
    import pytest
    import sys
-    sys.exit(pytest.main(["-v", "-s", __file__]))
+    sys.exit(pytest.main(["-v", __file__]))
@@ -62,4 +62,6 @@ class ExperimentTest(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -204,4 +204,6 @@ class AnalysisSuite(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -64,4 +64,6 @@ class LoggerSuite(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -14,7 +14,7 @@ from ray.tune.registry import _global_registry, TRAINABLE_CLASS
 from ray.tune.suggest import BasicVariantGenerator
 from ray.tune.trial import Trial, Checkpoint
 from ray.tune.resources import Resources
-from ray.tests.cluster_utils import Cluster
+from ray.cluster_utils import Cluster


 class RayTrialExecutorTest(unittest.TestCase):
@@ -190,4 +190,6 @@ class LocalModeExecutorTest(RayTrialExecutorTest):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -0,0 +1,241 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+
+import ray
+from ray.rllib import _register_all
+
+from ray.tune.result import TIMESTEPS_TOTAL
+from ray.tune import Trainable, TuneError
+from ray.tune import register_trainable, run_experiments
+from ray.tune.logger import Logger
+from ray.tune.experiment import Experiment
+from ray.tune.trial import Trial, ExportFormat
+
+
+class RunExperimentTest(unittest.TestCase):
+    def tearDown(self):
+        ray.shutdown()
+        _register_all()  # re-register the evicted objects
+
+    def testDict(self):
+        def train(config, reporter):
+            for i in range(100):
+                reporter(timesteps_total=i)
+
+        register_trainable("f1", train)
+        trials = run_experiments({
+            "foo": {
+                "run": "f1",
+            },
+            "bar": {
+                "run": "f1",
+            }
+        })
+        for trial in trials:
+            self.assertEqual(trial.status, Trial.TERMINATED)
+            self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99)
+
+    def testExperiment(self):
+        def train(config, reporter):
+            for i in range(100):
+                reporter(timesteps_total=i)
+
+        register_trainable("f1", train)
+        exp1 = Experiment(**{
+            "name": "foo",
+            "run": "f1",
+        })
+        [trial] = run_experiments(exp1)
+        self.assertEqual(trial.status, Trial.TERMINATED)
+        self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99)
+
+    def testExperimentList(self):
+        def train(config, reporter):
+            for i in range(100):
+                reporter(timesteps_total=i)
+
+        register_trainable("f1", train)
+        exp1 = Experiment(**{
+            "name": "foo",
+            "run": "f1",
+        })
+        exp2 = Experiment(**{
+            "name": "bar",
+            "run": "f1",
+        })
+        trials = run_experiments([exp1, exp2])
+        for trial in trials:
+            self.assertEqual(trial.status, Trial.TERMINATED)
+            self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99)
+
+    def testAutoregisterTrainable(self):
+        def train(config, reporter):
+            for i in range(100):
+                reporter(timesteps_total=i)
+
+        class B(Trainable):
+            def _train(self):
+                return {"timesteps_this_iter": 1, "done": True}
+
+        register_trainable("f1", train)
+        trials = run_experiments({
+            "foo": {
+                "run": train,
+            },
+            "bar": {
+                "run": B
+            }
+        })
+        for trial in trials:
+            self.assertEqual(trial.status, Trial.TERMINATED)
+
+    def testCheckpointAtEnd(self):
+        class train(Trainable):
+            def _train(self):
+                return {"timesteps_this_iter": 1, "done": True}
+
+            def _save(self, path):
+                checkpoint = path + "/checkpoint"
+                with open(checkpoint, "w") as f:
+                    f.write("OK")
+                return checkpoint
+
+        trials = run_experiments({
+            "foo": {
+                "run": train,
+                "checkpoint_at_end": True
+            }
+        })
+        for trial in trials:
+            self.assertEqual(trial.status, Trial.TERMINATED)
+            self.assertTrue(trial.has_checkpoint())
+
+    def testExportFormats(self):
+        class train(Trainable):
+            def _train(self):
+                return {"timesteps_this_iter": 1, "done": True}
+
+            def _export_model(self, export_formats, export_dir):
+                path = export_dir + "/exported"
+                with open(path, "w") as f:
+                    f.write("OK")
+                return {export_formats[0]: path}
+
+        trials = run_experiments({
+            "foo": {
+                "run": train,
+                "export_formats": ["format"]
+            }
+        })
+        for trial in trials:
+            self.assertEqual(trial.status, Trial.TERMINATED)
+            self.assertTrue(
+                os.path.exists(os.path.join(trial.logdir, "exported")))
+
+    def testInvalidExportFormats(self):
+        class train(Trainable):
+            def _train(self):
+                return {"timesteps_this_iter": 1, "done": True}
+
+            def _export_model(self, export_formats, export_dir):
+                ExportFormat.validate(export_formats)
+                return {}
+
+        def fail_trial():
+            run_experiments({
+                "foo": {
+                    "run": train,
+                    "export_formats": ["format"]
+                }
+            })
+
+        self.assertRaises(TuneError, fail_trial)
+
+    def testCustomResources(self):
+        ray.shutdown()
+        ray.init(resources={"hi": 3})
+
+        class train(Trainable):
+            def _train(self):
+                return {"timesteps_this_iter": 1, "done": True}
+
+        trials = run_experiments({
+            "foo": {
+                "run": train,
+                "resources_per_trial": {
+                    "cpu": 1,
+                    "custom_resources": {
+                        "hi": 2
+                    }
+                }
+            }
+        })
+        for trial in trials:
+            self.assertEqual(trial.status, Trial.TERMINATED)
+
+    def testCustomLogger(self):
+        class CustomLogger(Logger):
+            def on_result(self, result):
+                with open(os.path.join(self.logdir, "test.log"), "w") as f:
+                    f.write("hi")
+
+        [trial] = run_experiments({
+            "foo": {
+                "run": "__fake",
+                "stop": {
+                    "training_iteration": 1
+                },
+                "loggers": [CustomLogger]
+            }
+        })
+        self.assertTrue(os.path.exists(os.path.join(trial.logdir, "test.log")))
+        self.assertFalse(
+            os.path.exists(os.path.join(trial.logdir, "params.json")))
+
+        [trial] = run_experiments({
+            "foo": {
+                "run": "__fake",
+                "stop": {
+                    "training_iteration": 1
+                }
+            }
+        })
+        self.assertTrue(
+            os.path.exists(os.path.join(trial.logdir, "params.json")))
+
+        [trial] = run_experiments({
+            "foo": {
+                "run": "__fake",
+                "stop": {
+                    "training_iteration": 1
+                },
+                "loggers": []
+            }
+        })
+        self.assertFalse(
+            os.path.exists(os.path.join(trial.logdir, "params.json")))
+
+    def testCustomTrialString(self):
+        [trial] = run_experiments({
+            "foo": {
+                "run": "__fake",
+                "stop": {
+                    "training_iteration": 1
+                },
+                "trial_name_creator":
+                    lambda t: "{}_{}_321".format(t.trainable_name, t.trial_id)
+            }
+        })
+        self.assertEquals(
+            str(trial), "{}_{}_321".format(trial.trainable_name,
+                                           trial.trial_id))
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -0,0 +1,218 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import glob
+import os
+import shutil
+import sys
+import tempfile
+import unittest
+
+import ray
+from ray.rllib import _register_all
+
+from ray import tune
+from ray.tune import TuneError
+from ray.tune.syncer import CommandBasedClient
+
+if sys.version_info >= (3, 3):
+    from unittest.mock import patch
+else:
+    from mock import patch
+
+
+class TestSyncFunctionality(unittest.TestCase):
+    def setUp(self):
+        ray.init()
+
+    def tearDown(self):
+        ray.shutdown()
+        _register_all()  # re-register the evicted objects
+
+    @patch("ray.tune.syncer.S3_PREFIX", "test")
+    def testNoUploadDir(self):
+        """No Upload Dir is given."""
+        with self.assertRaises(AssertionError):
+            [trial] = tune.run(
+                "__fake",
+                name="foo",
+                max_failures=0,
+                **{
+                    "stop": {
+                        "training_iteration": 1
+                    },
+                    "sync_to_cloud": "echo {source} {target}"
+                }).trials
+
+    @patch("ray.tune.syncer.S3_PREFIX", "test")
+    def testCloudProperString(self):
+        with self.assertRaises(ValueError):
+            [trial] = tune.run(
+                "__fake",
+                name="foo",
+                max_failures=0,
+                **{
+                    "stop": {
+                        "training_iteration": 1
+                    },
+                    "upload_dir": "test",
+                    "sync_to_cloud": "ls {target}"
+                }).trials
+
+        with self.assertRaises(ValueError):
+            [trial] = tune.run(
+                "__fake",
+                name="foo",
+                max_failures=0,
+                **{
+                    "stop": {
+                        "training_iteration": 1
+                    },
+                    "upload_dir": "test",
+                    "sync_to_cloud": "ls {source}"
+                }).trials
+
+        tmpdir = tempfile.mkdtemp()
+        logfile = os.path.join(tmpdir, "test.log")
+
+        [trial] = tune.run(
+            "__fake",
+            name="foo",
+            max_failures=0,
+            **{
+                "stop": {
+                    "training_iteration": 1
+                },
+                "upload_dir": "test",
+                "sync_to_cloud": "echo {source} {target} > " + logfile
+            }).trials
+        with open(logfile) as f:
+            lines = f.read()
+            self.assertTrue("test" in lines)
+        shutil.rmtree(tmpdir)
+
+    def testClusterProperString(self):
+        """Tests that invalid commands throw.."""
+        with self.assertRaises(TuneError):
+            # This raises TuneError because logger is init in safe zone.
+            [trial] = tune.run(
+                "__fake",
+                name="foo",
+                max_failures=0,
+                **{
+                    "stop": {
+                        "training_iteration": 1
+                    },
+                    "sync_to_driver": "ls {target}"
+                }).trials
+
+        with self.assertRaises(TuneError):
+            # This raises TuneError because logger is init in safe zone.
+            [trial] = tune.run(
+                "__fake",
+                name="foo",
+                max_failures=0,
+                **{
+                    "stop": {
+                        "training_iteration": 1
+                    },
+                    "sync_to_driver": "ls {source}"
+                }).trials
+
+        with patch.object(CommandBasedClient, "execute") as mock_fn:
+            with patch("ray.services.get_node_ip_address") as mock_sync:
+                mock_sync.return_value = "0.0.0.0"
+                [trial] = tune.run(
+                    "__fake",
+                    name="foo",
+                    max_failures=0,
+                    **{
+                        "stop": {
+                            "training_iteration": 1
+                        },
+                        "sync_to_driver": "echo {source} {target}"
+                    }).trials
+                self.assertGreater(mock_fn.call_count, 0)
+
+    def testCloudFunctions(self):
+        tmpdir = tempfile.mkdtemp()
+        tmpdir2 = tempfile.mkdtemp()
+        os.mkdir(os.path.join(tmpdir2, "foo"))
+
+        def sync_func(local, remote):
+            for filename in glob.glob(os.path.join(local, "*.json")):
+                shutil.copy(filename, remote)
+
+        [trial] = tune.run(
+            "__fake",
+            name="foo",
+            max_failures=0,
+            local_dir=tmpdir,
+            stop={
+                "training_iteration": 1
+            },
+            upload_dir=tmpdir2,
+            sync_to_cloud=sync_func).trials
+        test_file_path = glob.glob(os.path.join(tmpdir2, "foo", "*.json"))
+        self.assertTrue(test_file_path)
+        shutil.rmtree(tmpdir)
+        shutil.rmtree(tmpdir2)
+
+    def testClusterSyncFunction(self):
+        def sync_func_driver(source, target):
+            assert ":" in source, "Source {} not a remote path.".format(source)
+            assert ":" not in target, "Target is supposed to be local."
+            with open(os.path.join(target, "test.log2"), "w") as f:
+                print("writing to", f.name)
+                f.write(source)
+
+        [trial] = tune.run(
+            "__fake",
+            name="foo",
+            max_failures=0,
+            stop={
+                "training_iteration": 1
+            },
+            sync_to_driver=sync_func_driver).trials
+        test_file_path = os.path.join(trial.logdir, "test.log2")
+        self.assertFalse(os.path.exists(test_file_path))
+
+        with patch("ray.services.get_node_ip_address") as mock_sync:
+            mock_sync.return_value = "0.0.0.0"
+            [trial] = tune.run(
+                "__fake",
+                name="foo",
+                max_failures=0,
+                stop={
+                    "training_iteration": 1
+                },
+                sync_to_driver=sync_func_driver).trials
+        test_file_path = os.path.join(trial.logdir, "test.log2")
+        self.assertTrue(os.path.exists(test_file_path))
+        os.remove(test_file_path)
+
+    def testNoSync(self):
+        """Sync should not run on a single node."""
+
+        def sync_func(source, target):
+            pass
+
+        with patch.object(CommandBasedClient, "execute") as mock_sync:
+            [trial] = tune.run(
+                "__fake",
+                name="foo",
+                max_failures=0,
+                **{
+                    "stop": {
+                        "training_iteration": 1
+                    },
+                    "sync_to_driver": sync_func
+                }).trials
+            self.assertEqual(mock_sync.call_count, 0)
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -85,4 +85,6 @@ class TrackApiTest(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -1194,4 +1194,6 @@ class AsyncHyperBandSuite(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -13,7 +13,7 @@ import numpy as np

 import ray
 from ray import tune
-from ray.tests.utils import recursive_fnmatch
+from ray.test_utils import recursive_fnmatch
 from ray.tune.util import validate_save_restore
 from ray.rllib import _register_all
 from ray.tune.suggest.hyperopt import HyperOptSearch
@@ -277,4 +277,6 @@ class SigOptWarmStartTest(AbstractWarmStartTest, unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -151,4 +151,6 @@ class SerialTuneRelativeLocalDirTest(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -144,4 +144,6 @@ class TuneServerSuite(unittest.TestCase):


 if __name__ == "__main__":
-    unittest.main(verbosity=2)
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
@@ -0,0 +1,319 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import numpy as np
+import unittest
+
+import ray
+from ray.rllib import _register_all
+
+from ray import tune
+from ray.tune.result import DEFAULT_RESULTS_DIR
+from ray.tune.experiment import Experiment
+from ray.tune.suggest import grid_search, BasicVariantGenerator
+from ray.tune.suggest.suggestion import _MockSuggestionAlgorithm
+from ray.tune.suggest.variant_generator import (RecursiveDependencyError,
+                                                resolve_nested_dict)
+
+
+class VariantGeneratorTest(unittest.TestCase):
+    def setUp(self):
+        ray.init()
+
+    def tearDown(self):
+        ray.shutdown()
+        _register_all()  # re-register the evicted objects
+
+    def generate_trials(self, spec, name):
+        suggester = BasicVariantGenerator()
+        suggester.add_configurations({name: spec})
+        return suggester.next_trials()
+
+    def testParseToTrials(self):
+        trials = self.generate_trials({
+            "run": "PPO",
+            "num_samples": 2,
+            "max_failures": 5,
+            "config": {
+                "env": "Pong-v0",
+                "foo": "bar"
+            },
+        }, "tune-pong")
+        trials = list(trials)
+        self.assertEqual(len(trials), 2)
+        self.assertTrue("PPO_Pong-v0" in str(trials[0]))
+        self.assertEqual(trials[0].config, {"foo": "bar", "env": "Pong-v0"})
+        self.assertEqual(trials[0].trainable_name, "PPO")
+        self.assertEqual(trials[0].experiment_tag, "0")
+        self.assertEqual(trials[0].max_failures, 5)
+        self.assertEqual(trials[0].evaluated_params, {})
+        self.assertEqual(trials[0].local_dir,
+                         os.path.join(DEFAULT_RESULTS_DIR, "tune-pong"))
+        self.assertEqual(trials[1].experiment_tag, "1")
+
+    def testEval(self):
+        trials = self.generate_trials({
+            "run": "PPO",
+            "config": {
+                "foo": {
+                    "eval": "2 + 2"
+                },
+            },
+        }, "eval")
+        trials = list(trials)
+        self.assertEqual(len(trials), 1)
+        self.assertEqual(trials[0].config, {"foo": 4})
+        self.assertEqual(trials[0].evaluated_params, {"foo": 4})
+        self.assertEqual(trials[0].experiment_tag, "0_foo=4")
+
+    def testGridSearch(self):
+        trials = self.generate_trials({
+            "run": "PPO",
+            "config": {
+                "bar": {
+                    "grid_search": [True, False]
+                },
+                "foo": {
+                    "grid_search": [1, 2, 3]
+                },
+                "baz": "asd",
+            },
+        }, "grid_search")
+        trials = list(trials)
+        self.assertEqual(len(trials), 6)
+        self.assertEqual(trials[0].config, {
+            "bar": True,
+            "foo": 1,
+            "baz": "asd",
+        })
+        self.assertEqual(trials[0].evaluated_params, {
+            "bar": True,
+            "foo": 1,
+        })
+        self.assertEqual(trials[0].experiment_tag, "0_bar=True,foo=1")
+
+        self.assertEqual(trials[1].config, {
+            "bar": False,
+            "foo": 1,
+            "baz": "asd",
+        })
+        self.assertEqual(trials[1].evaluated_params, {
+            "bar": False,
+            "foo": 1,
+        })
+        self.assertEqual(trials[1].experiment_tag, "1_bar=False,foo=1")
+
+        self.assertEqual(trials[2].config, {
+            "bar": True,
+            "foo": 2,
+            "baz": "asd",
+        })
+        self.assertEqual(trials[2].evaluated_params, {
+            "bar": True,
+            "foo": 2,
+        })
+
+        self.assertEqual(trials[3].config, {
+            "bar": False,
+            "foo": 2,
+            "baz": "asd",
+        })
+        self.assertEqual(trials[3].evaluated_params, {
+            "bar": False,
+            "foo": 2,
+        })
+
+        self.assertEqual(trials[4].config, {
+            "bar": True,
+            "foo": 3,
+            "baz": "asd",
+        })
+        self.assertEqual(trials[4].evaluated_params, {
+            "bar": True,
+            "foo": 3,
+        })
+
+        self.assertEqual(trials[5].config, {
+            "bar": False,
+            "foo": 3,
+            "baz": "asd",
+        })
+        self.assertEqual(trials[5].evaluated_params, {
+            "bar": False,
+            "foo": 3,
+        })
+
+    def testGridSearchAndEval(self):
+        trials = self.generate_trials({
+            "run": "PPO",
+            "config": {
+                "qux": tune.sample_from(lambda spec: 2 + 2),
+                "bar": grid_search([True, False]),
+                "foo": grid_search([1, 2, 3]),
+                "baz": "asd",
+            },
+        }, "grid_eval")
+        trials = list(trials)
+        self.assertEqual(len(trials), 6)
+        self.assertEqual(trials[0].config, {
+            "bar": True,
+            "foo": 1,
+            "qux": 4,
+            "baz": "asd",
+        })
+        self.assertEqual(trials[0].evaluated_params, {
+            "bar": True,
+            "foo": 1,
+            "qux": 4,
+        })
+        self.assertEqual(trials[0].experiment_tag, "0_bar=True,foo=1,qux=4")
+
+    def testConditionResolution(self):
+        trials = self.generate_trials({
+            "run": "PPO",
+            "config": {
+                "x": 1,
+                "y": tune.sample_from(lambda spec: spec.config.x + 1),
+                "z": tune.sample_from(lambda spec: spec.config.y + 1),
+            },
+        }, "condition_resolution")
+        trials = list(trials)
+        self.assertEqual(len(trials), 1)
+        self.assertEqual(trials[0].config, {"x": 1, "y": 2, "z": 3})
+        self.assertEqual(trials[0].evaluated_params, {"y": 2, "z": 3})
+        self.assertEqual(trials[0].experiment_tag, "0_y=2,z=3")
+
+    def testDependentLambda(self):
+        trials = self.generate_trials({
+            "run": "PPO",
+            "config": {
+                "x": grid_search([1, 2]),
+                "y": tune.sample_from(lambda spec: spec.config.x * 100),
+            },
+        }, "dependent_lambda")
+        trials = list(trials)
+        self.assertEqual(len(trials), 2)
+        self.assertEqual(trials[0].config, {"x": 1, "y": 100})
+        self.assertEqual(trials[1].config, {"x": 2, "y": 200})
+
+    def testDependentGridSearch(self):
+        trials = self.generate_trials({
+            "run": "PPO",
+            "config": {
+                "x": grid_search([
+                    tune.sample_from(lambda spec: spec.config.y * 100),
+                    tune.sample_from(lambda spec: spec.config.y * 200)
+                ]),
+                "y": tune.sample_from(lambda spec: 1),
+            },
+        }, "dependent_grid_search")
+        trials = list(trials)
+        self.assertEqual(len(trials), 2)
+        self.assertEqual(trials[0].config, {"x": 100, "y": 1})
+        self.assertEqual(trials[1].config, {"x": 200, "y": 1})
+
+    def testNestedValues(self):
+        trials = self.generate_trials({
+            "run": "PPO",
+            "config": {
+                "x": {
+                    "y": {
+                        "z": tune.sample_from(lambda spec: 1)
+                    }
+                },
+                "y": tune.sample_from(lambda spec: 12),
+                "z": tune.sample_from(lambda spec: spec.config.x.y.z * 100),
+            },
+        }, "nested_values")
+        trials = list(trials)
+        self.assertEqual(len(trials), 1)
+        self.assertEqual(trials[0].config, {
+            "x": {
+                "y": {
+                    "z": 1
+                }
+            },
+            "y": 12,
+            "z": 100
+        })
+        self.assertEqual(trials[0].evaluated_params, {
+            "x/y/z": 1,
+            "y": 12,
+            "z": 100
+        })
+
+    def testLogUniform(self):
+        sampler = tune.loguniform(1e-10, 1e-1).func
+        results = [sampler(None) for i in range(1000)]
+        assert abs(np.log(min(results)) / np.log(10) - -10) < 0.1
+        assert abs(np.log(max(results)) / np.log(10) - -1) < 0.1
+
+        sampler_e = tune.loguniform(np.e**-4, np.e, base=np.e).func
+        results_e = [sampler_e(None) for i in range(1000)]
+        assert abs(np.log(min(results_e)) - -4) < 0.1
+        assert abs(np.log(max(results_e)) - 1) < 0.1
+
+    def test_resolve_dict(self):
+        config = {
+            "a": {
+                "b": 1,
+                "c": 2,
+            },
+            "b": {
+                "a": 3
+            }
+        }
+        resolved = resolve_nested_dict(config)
+        for k, v in [(("a", "b"), 1), (("a", "c"), 2), (("b", "a"), 3)]:
+            self.assertEqual(resolved.get(k), v)
+
+    def testRecursiveDep(self):
+        try:
+            list(
+                self.generate_trials({
+                    "run": "PPO",
+                    "config": {
+                        "foo": tune.sample_from(lambda spec: spec.config.foo),
+                    },
+                }, "recursive_dep"))
+        except RecursiveDependencyError as e:
+            assert "`foo` recursively depends on" in str(e), e
+        else:
+            assert False
+
+    def testMaxConcurrentSuggestions(self):
+        """Checks that next_trials() supports throttling."""
+        experiment_spec = {
+            "run": "PPO",
+            "num_samples": 6,
+        }
+        experiments = [Experiment.from_json("test", experiment_spec)]
+
+        searcher = _MockSuggestionAlgorithm(max_concurrent=4)
+        searcher.add_configurations(experiments)
+        trials = searcher.next_trials()
+        self.assertEqual(len(trials), 4)
+        self.assertEqual(searcher.next_trials(), [])
+
+        finished_trial = trials.pop()
+        searcher.on_trial_complete(finished_trial.trial_id)
+        self.assertEqual(len(searcher.next_trials()), 1)
+
+        finished_trial = trials.pop()
+        searcher.on_trial_complete(finished_trial.trial_id)
+
+        finished_trial = trials.pop()
+        searcher.on_trial_complete(finished_trial.trial_id)
+
+        finished_trial = trials.pop()
+        searcher.on_trial_complete(finished_trial.trial_id)
+        self.assertEqual(len(searcher.next_trials()), 1)
+        self.assertEqual(len(searcher.next_trials()), 0)
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))