Move more unit tests to bazel (#6250)

* move more unit tests to bazel

* move to avoid conflict

* fix lint

* fix deps

* seprate

* fix failing tests

* show tests

* ignore mismatch

* try combining bazel runs

* build lint

* remove tests from install

* fix test utils

* better config

* split up

* exclusive

* fix verbosity

* fix tests class

* cleanup

* remove flaky

* fix metrics test

* Update .travis.yml

* no retry flaky

* split up actor

* split basic test

* split up trial runner test

* split stress

* fix basic test

* fix tests

* switch to pytest runner for main

* make microbench not fail

* move load code to py3

* test is no longer package

* bazel to end
This commit is contained in:
Eric Liang
2019-11-24 11:43:34 -08:00
committed by GitHub
parent aa8d5d2f6c
commit 53641f1f74
70 changed files with 6599 additions and 5766 deletions
+1 -1
View File
@@ -4,7 +4,7 @@ import time
import numpy as np
import ray
from ray.tests.cluster_utils import Cluster
from ray.cluster_utils import Cluster
def main():
@@ -2,6 +2,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import fnmatch
import os
import subprocess
@@ -170,3 +171,11 @@ def recursive_fnmatch(dirpath, pattern):
for filename in fnmatch.filter(filenames, pattern):
matches.append(os.path.join(root, filename))
return matches
def generate_internal_config_map(**kwargs):
internal_config = json.dumps(kwargs)
ray_kwargs = {
"_internal_config": internal_config,
}
return ray_kwargs
+297
View File
@@ -0,0 +1,297 @@
py_test(
name = "test_actor",
size = "large",
srcs = ["test_actor.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_actor_resources",
size = "large",
srcs = ["test_actor_resources.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_actor_failures",
size = "large",
srcs = ["test_actor_failures.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_basic",
size = "large",
srcs = ["test_basic.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_advanced",
size = "large",
srcs = ["test_advanced.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_component_failures",
size = "large",
srcs = ["test_component_failures.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_multinode_failures",
size = "large",
srcs = ["test_multinode_failures.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_stress",
size = "large",
srcs = ["test_stress.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_stress_sharded",
size = "large",
srcs = ["test_stress_sharded.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_stress_failure",
size = "large",
srcs = ["test_stress_failure.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_array",
size = "medium",
srcs = ["test_array.py"],
deps = ["//:ray_lib"],
flaky = 1,
)
py_test(
name = "test_autoscaler",
size = "small",
srcs = ["test_autoscaler.py"],
deps = ["//:ray_lib"],
flaky = 1,
)
py_test(
name = "test_autoscaler_yaml",
size = "small",
srcs = ["test_autoscaler_yaml.py"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_cython",
size = "small",
srcs = ["test_cython.py"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_debug_tools",
size = "small",
srcs = ["test_debug_tools.py"],
deps = ["//:ray_lib"],
flaky = 1,
)
py_test(
name = "test_dynres",
size = "medium",
srcs = ["test_dynres.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_failure",
size = "medium",
srcs = ["test_failure.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_garbage_collection",
size = "medium",
srcs = ["test_garbage_collection.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_global_state",
size = "medium",
srcs = ["test_global_state.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_logical_graph",
size = "medium",
srcs = ["test_logical_graph.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_memory_limits",
size = "medium",
srcs = ["test_memory_limits.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_memory_scheduling",
size = "medium",
srcs = ["test_memory_scheduling.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_metrics",
size = "small",
srcs = ["test_metrics.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_microbenchmarks",
size = "medium",
srcs = ["test_microbenchmarks.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_mini",
size = "small",
srcs = ["test_mini.py"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_monitors",
size = "medium",
srcs = ["test_monitors.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_multi_node_2",
size = "medium",
srcs = ["test_multi_node_2.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_multi_node",
size = "medium",
srcs = ["test_multi_node.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_node_manager",
size = "small",
srcs = ["test_node_manager.py"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_object_manager",
size = "medium",
srcs = ["test_object_manager.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_projects",
size = "small",
srcs = ["test_projects.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_queue",
size = "small",
srcs = ["test_queue.py"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_ray_init",
size = "medium",
srcs = ["test_ray_init.py"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_signal",
size = "medium",
srcs = ["test_signal.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_tempfile",
size = "small",
srcs = ["test_tempfile.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_tensorflow",
size = "medium",
srcs = ["test_tensorflow.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_unreconstructable_errors",
size = "medium",
srcs = ["test_unreconstructable_errors.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
py_test(
name = "test_webui",
size = "medium",
srcs = ["test_webui.py"],
tags = ["exclusive"],
deps = ["//:ray_lib"],
)
View File
+2 -10
View File
@@ -8,7 +8,7 @@ import pytest
import subprocess
import ray
from ray.tests.cluster_utils import Cluster
from ray.cluster_utils import Cluster
@pytest.fixture
@@ -18,14 +18,6 @@ def shutdown_only():
ray.shutdown()
def generate_internal_config_map(**kwargs):
internal_config = json.dumps(kwargs)
ray_kwargs = {
"_internal_config": internal_config,
}
return ray_kwargs
def get_default_fixure_internal_config():
internal_config = json.dumps({
"initial_reconstruction_timeout_milliseconds": 200,
@@ -177,7 +169,7 @@ def two_node_cluster():
"initial_reconstruction_timeout_milliseconds": 200,
"num_heartbeats_timeout": 10,
})
cluster = ray.tests.cluster_utils.Cluster(
cluster = ray.cluster_utils.Cluster(
head_node_args={"_internal_config": internal_config})
for _ in range(2):
remote_node = cluster.add_node(
+2 -2
View File
@@ -8,8 +8,8 @@ import threading
import pytest
import ray
import ray.tests.cluster_utils
import ray.tests.utils
import ray.cluster_utils
import ray.test_utils
@pytest.mark.parametrize(
File diff suppressed because it is too large Load Diff
+732
View File
@@ -0,0 +1,732 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import json
import numpy as np
import os
import pytest
try:
import pytest_timeout
except ImportError:
pytest_timeout = None
import signal
import sys
import time
import ray
import ray.ray_constants as ray_constants
import ray.test_utils
import ray.cluster_utils
from ray.test_utils import (relevant_errors, wait_for_condition,
wait_for_errors, wait_for_pid_to_exit,
generate_internal_config_map)
@pytest.fixture
def ray_checkpointable_actor_cls(request):
checkpoint_dir = "/tmp/ray_temp_checkpoint_dir/"
if not os.path.isdir(checkpoint_dir):
os.mkdir(checkpoint_dir)
class CheckpointableActor(ray.actor.Checkpointable):
def __init__(self):
self.value = 0
self.resumed_from_checkpoint = False
self.checkpoint_dir = checkpoint_dir
def node_id(self):
return ray.worker.global_worker.node.unique_id
def increase(self):
self.value += 1
return self.value
def get(self):
return self.value
def was_resumed_from_checkpoint(self):
return self.resumed_from_checkpoint
def get_pid(self):
return os.getpid()
def should_checkpoint(self, checkpoint_context):
# Checkpoint the actor when value is increased to 3.
should_checkpoint = self.value == 3
return should_checkpoint
def save_checkpoint(self, actor_id, checkpoint_id):
actor_id, checkpoint_id = actor_id.hex(), checkpoint_id.hex()
# Save checkpoint into a file.
with open(self.checkpoint_dir + actor_id, "a+") as f:
print(checkpoint_id, self.value, file=f)
def load_checkpoint(self, actor_id, available_checkpoints):
actor_id = actor_id.hex()
filename = self.checkpoint_dir + actor_id
# Load checkpoint from the file.
if not os.path.isfile(filename):
return None
available_checkpoint_ids = [
c.checkpoint_id for c in available_checkpoints
]
with open(filename, "r") as f:
for line in f:
checkpoint_id, value = line.strip().split(" ")
checkpoint_id = ray.ActorCheckpointID(
ray.utils.hex_to_binary(checkpoint_id))
if checkpoint_id in available_checkpoint_ids:
self.value = int(value)
self.resumed_from_checkpoint = True
return checkpoint_id
return None
def checkpoint_expired(self, actor_id, checkpoint_id):
pass
return CheckpointableActor
@pytest.mark.parametrize(
"ray_start_object_store_memory", [150 * 1024 * 1024], indirect=True)
def test_actor_eviction(ray_start_object_store_memory):
object_store_memory = ray_start_object_store_memory
@ray.remote
class Actor(object):
def __init__(self):
pass
def create_object(self, size):
return np.random.rand(size)
a = Actor.remote()
# Submit enough methods on the actor so that they exceed the size of the
# object store.
objects = []
num_objects = 20
for _ in range(num_objects):
obj = a.create_object.remote(object_store_memory // num_objects)
objects.append(obj)
# Get each object once to make sure each object gets created.
ray.get(obj)
# Get each object again. At this point, the earlier objects should have
# been evicted.
num_evicted, num_success = 0, 0
for obj in objects:
try:
val = ray.get(obj)
assert isinstance(val, np.ndarray), val
num_success += 1
except ray.exceptions.UnreconstructableError:
num_evicted += 1
# Some objects should have been evicted, and some should still be in the
# object store.
assert num_evicted > 0
assert num_success > 0
def test_actor_reconstruction(ray_start_regular):
"""Test actor reconstruction when actor process is killed."""
@ray.remote(max_reconstructions=1)
class ReconstructableActor(object):
"""An actor that will be reconstructed at most once."""
def __init__(self):
self.value = 0
def increase(self, delay=0):
time.sleep(delay)
self.value += 1
return self.value
def get_pid(self):
return os.getpid()
actor = ReconstructableActor.remote()
pid = ray.get(actor.get_pid.remote())
# Call increase 3 times
for _ in range(3):
ray.get(actor.increase.remote())
# Call increase again with some delay.
result = actor.increase.remote(delay=0.5)
# Sleep some time to wait for the above task to start execution.
time.sleep(0.2)
# Kill actor process, while the above task is still being executed.
os.kill(pid, signal.SIGKILL)
# Check that the above task didn't fail and the actor is reconstructed.
assert ray.get(result) == 4
# Check that we can still call the actor.
assert ray.get(actor.increase.remote()) == 5
# kill actor process one more time.
pid = ray.get(actor.get_pid.remote())
os.kill(pid, signal.SIGKILL)
# The actor has exceeded max reconstructions, and this task should fail.
with pytest.raises(ray.exceptions.RayActorError):
ray.get(actor.increase.remote())
# Create another actor.
actor = ReconstructableActor.remote()
# Intentionlly exit the actor
actor.__ray_terminate__.remote()
# Check that the actor won't be reconstructed.
with pytest.raises(ray.exceptions.RayActorError):
ray.get(actor.increase.remote())
def test_actor_reconstruction_without_task(ray_start_regular):
"""Test a dead actor can be reconstructed without sending task to it."""
@ray.remote(max_reconstructions=1)
class ReconstructableActor(object):
def __init__(self, obj_ids):
for obj_id in obj_ids:
# Every time the actor gets constructed,
# put a new object in plasma store.
global_worker = ray.worker.global_worker
if not global_worker.core_worker.object_exists(obj_id):
global_worker.put_object(1, obj_id)
break
def get_pid(self):
return os.getpid()
obj_ids = [ray.ObjectID.from_random() for _ in range(2)]
actor = ReconstructableActor.remote(obj_ids)
# Kill the actor.
pid = ray.get(actor.get_pid.remote())
os.kill(pid, signal.SIGKILL)
# Wait until the actor is reconstructed.
assert wait_for_condition(
lambda: ray.worker.global_worker.core_worker.object_exists(obj_ids[1]),
timeout_ms=5000)
def test_actor_reconstruction_on_node_failure(ray_start_cluster_head):
"""Test actor reconstruction when node dies unexpectedly."""
cluster = ray_start_cluster_head
max_reconstructions = 3
# Add a few nodes to the cluster.
# Use custom resource to make sure the actor is only created on worker
# nodes, not on the head node.
for _ in range(max_reconstructions + 2):
cluster.add_node(
resources={"a": 1},
_internal_config=json.dumps({
"initial_reconstruction_timeout_milliseconds": 200,
"num_heartbeats_timeout": 10,
}),
)
def kill_node(node_id):
node_to_remove = None
for node in cluster.worker_nodes:
if node_id == node.unique_id:
node_to_remove = node
cluster.remove_node(node_to_remove)
@ray.remote(max_reconstructions=max_reconstructions, resources={"a": 1})
class MyActor(object):
def __init__(self):
self.value = 0
def increase(self):
self.value += 1
return self.value
def get_object_store_socket(self):
return ray.worker.global_worker.node.unique_id
actor = MyActor.remote()
# Call increase 3 times.
for _ in range(3):
ray.get(actor.increase.remote())
for i in range(max_reconstructions):
object_store_socket = ray.get(actor.get_object_store_socket.remote())
# Kill actor's node and the actor should be reconstructed
# on a different node.
kill_node(object_store_socket)
# Call increase again.
# Check that the actor is reconstructed and value is correct.
assert ray.get(actor.increase.remote()) == 4 + i
# Check that the actor is now on a different node.
assert object_store_socket != ray.get(
actor.get_object_store_socket.remote())
# kill the node again.
object_store_socket = ray.get(actor.get_object_store_socket.remote())
kill_node(object_store_socket)
# The actor has exceeded max reconstructions, and this task should fail.
with pytest.raises(ray.exceptions.RayActorError):
ray.get(actor.increase.remote())
# NOTE(hchen): we set initial_reconstruction_timeout_milliseconds to 1s for
# this test. Because if this value is too small, suprious task reconstruction
# may happen and cause the test fauilure. If the value is too large, this test
# could be very slow. We can remove this once we support dynamic timeout.
@pytest.mark.parametrize(
"ray_start_cluster_head", [
generate_internal_config_map(
initial_reconstruction_timeout_milliseconds=1000)
],
indirect=True)
def test_multiple_actor_reconstruction(ray_start_cluster_head):
cluster = ray_start_cluster_head
# This test can be made more stressful by increasing the numbers below.
# The total number of actors created will be
# num_actors_at_a_time * num_nodes.
num_nodes = 5
num_actors_at_a_time = 3
num_function_calls_at_a_time = 10
worker_nodes = [
cluster.add_node(
num_cpus=3,
_internal_config=json.dumps({
"initial_reconstruction_timeout_milliseconds": 200,
"num_heartbeats_timeout": 10,
})) for _ in range(num_nodes)
]
@ray.remote(max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION)
class SlowCounter(object):
def __init__(self):
self.x = 0
def inc(self, duration):
time.sleep(duration)
self.x += 1
return self.x
# Create some initial actors.
actors = [SlowCounter.remote() for _ in range(num_actors_at_a_time)]
# Wait for the actors to start up.
time.sleep(1)
# This is a mapping from actor handles to object IDs returned by
# methods on that actor.
result_ids = collections.defaultdict(lambda: [])
# In a loop we are going to create some actors, run some methods, kill
# a raylet, and run some more methods.
for node in worker_nodes:
# Create some actors.
actors.extend(
[SlowCounter.remote() for _ in range(num_actors_at_a_time)])
# Run some methods.
for j in range(len(actors)):
actor = actors[j]
for _ in range(num_function_calls_at_a_time):
result_ids[actor].append(actor.inc.remote(j**2 * 0.000001))
# Kill a node.
cluster.remove_node(node)
# Run some more methods.
for j in range(len(actors)):
actor = actors[j]
for _ in range(num_function_calls_at_a_time):
result_ids[actor].append(actor.inc.remote(j**2 * 0.000001))
# Get the results and check that they have the correct values.
for _, result_id_list in result_ids.items():
results = list(range(1, len(result_id_list) + 1))
assert ray.get(result_id_list) == results
def kill_actor(actor):
"""A helper function that kills an actor process."""
pid = ray.get(actor.get_pid.remote())
os.kill(pid, signal.SIGKILL)
wait_for_pid_to_exit(pid)
def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
"""Test actor checkpointing and restoring from a checkpoint."""
actor = ray.remote(
max_reconstructions=2)(ray_checkpointable_actor_cls).remote()
# Call increase 3 times, triggering a checkpoint.
expected = 0
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Assert that the actor wasn't resumed from a checkpoint.
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Kill actor process.
kill_actor(actor)
# Assert that the actor was resumed from a checkpoint and its value is
# still correct.
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
# Submit some more tasks. These should get replayed since they happen after
# the checkpoint.
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Kill actor again and check that reconstruction still works after the
# actor resuming from a checkpoint.
kill_actor(actor)
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
"""Test checkpointing of a remote actor through method invocation."""
# Define a class that exposes a method to save checkpoints.
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
def __init__(self):
super(RemoteCheckpointableActor, self).__init__()
self._should_checkpoint = False
def checkpoint(self):
self._should_checkpoint = True
def should_checkpoint(self, checkpoint_context):
should_checkpoint = self._should_checkpoint
self._should_checkpoint = False
return should_checkpoint
cls = ray.remote(max_reconstructions=2)(RemoteCheckpointableActor)
actor = cls.remote()
# Call increase 3 times.
expected = 0
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Call a checkpoint task.
actor.checkpoint.remote()
# Assert that the actor wasn't resumed from a checkpoint.
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Kill actor process.
kill_actor(actor)
# Assert that the actor was resumed from a checkpoint and its value is
# still correct.
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
# Submit some more tasks. These should get replayed since they happen after
# the checkpoint.
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Kill actor again and check that reconstruction still works after the
# actor resuming from a checkpoint.
kill_actor(actor)
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
ray_checkpointable_actor_cls):
"""Test actor checkpointing on a remote node."""
# Place the actor on the remote node.
cluster = ray_start_cluster_2_nodes
remote_node = list(cluster.worker_nodes)
actor_cls = ray.remote(max_reconstructions=1)(ray_checkpointable_actor_cls)
actor = actor_cls.remote()
while (ray.get(actor.node_id.remote()) != remote_node[0].unique_id):
actor = actor_cls.remote()
# Call increase several times.
expected = 0
for _ in range(6):
ray.get(actor.increase.remote())
expected += 1
# Assert that the actor wasn't resumed from a checkpoint.
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Kill actor process.
cluster.remove_node(remote_node[0])
# Assert that the actor was resumed from a checkpoint and its value is
# still correct.
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
def test_checkpointing_save_exception(ray_start_regular,
ray_checkpointable_actor_cls):
"""Test actor can still be recovered if checkpoints fail to complete."""
@ray.remote(max_reconstructions=2)
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
def save_checkpoint(self, actor_id, checkpoint_context):
raise Exception("Intentional error saving checkpoint.")
actor = RemoteCheckpointableActor.remote()
# Call increase 3 times, triggering a checkpoint that will fail.
expected = 0
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Assert that the actor wasn't resumed from a checkpoint.
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Kill actor process.
kill_actor(actor)
# Assert that the actor still wasn't resumed from a checkpoint and its
# value is still correct.
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Submit some more tasks. These should get replayed since they happen after
# the checkpoint.
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Kill actor again, and check that reconstruction still works and the actor
# wasn't resumed from a checkpoint.
kill_actor(actor)
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Check that the checkpoint error was pushed to the driver.
wait_for_errors(ray_constants.CHECKPOINT_PUSH_ERROR, 1)
def test_checkpointing_load_exception(ray_start_regular,
ray_checkpointable_actor_cls):
"""Test actor can still be recovered if checkpoints fail to load."""
@ray.remote(max_reconstructions=2)
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
def load_checkpoint(self, actor_id, checkpoints):
raise Exception("Intentional error loading checkpoint.")
actor = RemoteCheckpointableActor.remote()
# Call increase 3 times, triggering a checkpoint that will succeed.
expected = 0
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Assert that the actor wasn't resumed from a checkpoint because loading
# it failed.
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Kill actor process.
kill_actor(actor)
# Assert that the actor still wasn't resumed from a checkpoint and its
# value is still correct.
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Submit some more tasks. These should get replayed since they happen after
# the checkpoint.
for _ in range(3):
ray.get(actor.increase.remote())
expected += 1
# Kill actor again, and check that reconstruction still works and the actor
# wasn't resumed from a checkpoint.
kill_actor(actor)
assert ray.get(actor.get.remote()) == expected
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
# Check that the checkpoint error was pushed to the driver.
wait_for_errors(ray_constants.CHECKPOINT_PUSH_ERROR, 1)
@pytest.mark.parametrize(
"ray_start_regular",
# This overwrite currently isn't effective,
# see https://github.com/ray-project/ray/issues/3926.
[generate_internal_config_map(num_actor_checkpoints_to_keep=20)],
indirect=True,
)
def test_deleting_actor_checkpoint(ray_start_regular):
"""Test deleting old actor checkpoints."""
@ray.remote
class CheckpointableActor(ray.actor.Checkpointable):
def __init__(self):
self.checkpoint_ids = []
def get_checkpoint_ids(self):
return self.checkpoint_ids
def should_checkpoint(self, checkpoint_context):
# Save checkpoints after every task
return True
def save_checkpoint(self, actor_id, checkpoint_id):
self.checkpoint_ids.append(checkpoint_id)
pass
def load_checkpoint(self, actor_id, available_checkpoints):
pass
def checkpoint_expired(self, actor_id, checkpoint_id):
assert checkpoint_id == self.checkpoint_ids[0]
del self.checkpoint_ids[0]
actor = CheckpointableActor.remote()
for i in range(19):
assert len(ray.get(actor.get_checkpoint_ids.remote())) == i + 1
for _ in range(20):
assert len(ray.get(actor.get_checkpoint_ids.remote())) == 20
def test_bad_checkpointable_actor_class():
"""Test error raised if an actor class doesn't implement all abstract
methods in the Checkpointable interface."""
with pytest.raises(TypeError):
@ray.remote
class BadCheckpointableActor(ray.actor.Checkpointable):
def should_checkpoint(self, checkpoint_context):
return True
def test_init_exception_in_checkpointable_actor(ray_start_regular,
ray_checkpointable_actor_cls):
# This test is similar to test_failure.py::test_failed_actor_init.
# This test is used to guarantee that checkpointable actor does not
# break the same logic.
error_message1 = "actor constructor failed"
error_message2 = "actor method failed"
@ray.remote
class CheckpointableFailedActor(ray_checkpointable_actor_cls):
def __init__(self):
raise Exception(error_message1)
def fail_method(self):
raise Exception(error_message2)
def should_checkpoint(self, checkpoint_context):
return True
a = CheckpointableFailedActor.remote()
# Make sure that we get errors from a failed constructor.
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
assert len(errors) == 1
assert error_message1 in errors[0]["message"]
# Make sure that we get errors from a failed method.
a.fail_method.remote()
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
assert len(errors) == 2
assert error_message1 in errors[1]["message"]
def test_decorated_method(ray_start_regular):
def method_invocation_decorator(f):
def new_f_invocation(args, kwargs):
# Split one argument into two. Return th kwargs without passing
# them into the actor.
return f([args[0], args[0]], {}), kwargs
return new_f_invocation
def method_execution_decorator(f):
def new_f_execution(self, b, c):
# Turn two arguments into one.
return f(self, b + c)
new_f_execution.__ray_invocation_decorator__ = (
method_invocation_decorator)
return new_f_execution
@ray.remote
class Actor(object):
@method_execution_decorator
def decorated_method(self, x):
return x + 1
a = Actor.remote()
object_id, extra = a.decorated_method.remote(3, kwarg=3)
assert isinstance(object_id, ray.ObjectID)
assert extra == {"kwarg": 3}
assert ray.get(object_id) == 7 # 2 * 3 + 1
@pytest.mark.skipif(
pytest_timeout is None,
reason="Timeout package not installed; skipping test that may hang.")
@pytest.mark.timeout(20)
@pytest.mark.parametrize(
"ray_start_cluster", [{
"num_cpus": 1,
"num_nodes": 2,
}], indirect=True)
def test_ray_wait_dead_actor(ray_start_cluster):
"""Tests that methods completed by dead actors are returned as ready"""
cluster = ray_start_cluster
@ray.remote(num_cpus=1)
class Actor(object):
def __init__(self):
pass
def node_id(self):
return ray.worker.global_worker.node.unique_id
def ping(self):
time.sleep(1)
# Create some actors and wait for them to initialize.
num_nodes = len(cluster.list_all_nodes())
actors = [Actor.remote() for _ in range(num_nodes)]
ray.get([actor.ping.remote() for actor in actors])
# Ping the actors and make sure the tasks complete.
ping_ids = [actor.ping.remote() for actor in actors]
ray.get(ping_ids)
# Evict the result from the node that we're about to kill.
remote_node = cluster.list_all_nodes()[-1]
remote_ping_id = None
for i, actor in enumerate(actors):
if ray.get(actor.node_id.remote()) == remote_node.unique_id:
remote_ping_id = ping_ids[i]
ray.internal.free([remote_ping_id], local_only=True)
cluster.remove_node(remote_node)
# Repeatedly call ray.wait until the exception for the dead actor is
# received.
unready = ping_ids[:]
while unready:
_, unready = ray.wait(unready, timeout=0)
time.sleep(1)
with pytest.raises(ray.exceptions.RayActorError):
ray.get(ping_ids)
# Evict the result from the dead node.
ray.internal.free([remote_ping_id], local_only=True)
# Create an actor on the local node that will call ray.wait in a loop.
head_node_resource = "HEAD_NODE"
ray.experimental.set_resource(head_node_resource, 1)
@ray.remote(num_cpus=0, resources={head_node_resource: 1})
class ParentActor(object):
def __init__(self, ping_ids):
self.unready = ping_ids
def wait(self):
_, self.unready = ray.wait(self.unready, timeout=0)
return len(self.unready) == 0
def ping(self):
return
# Repeatedly call ray.wait through the local actor until the exception for
# the dead actor is received.
parent_actor = ParentActor.remote(ping_ids)
ray.get(parent_actor.ping.remote())
failure_detected = False
while not failure_detected:
failure_detected = ray.get(parent_actor.wait.remote())
if __name__ == "__main__":
import pytest
sys.exit(pytest.main(["-v", __file__]))
+743
View File
@@ -0,0 +1,743 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import json
import os
import pytest
try:
import pytest_timeout
except ImportError:
pytest_timeout = None
import sys
import time
import ray
import ray.test_utils
import ray.cluster_utils
def test_actor_deletion_with_gpus(shutdown_only):
ray.init(
num_cpus=1, num_gpus=1, object_store_memory=int(150 * 1024 * 1024))
# When an actor that uses a GPU exits, make sure that the GPU resources
# are released.
@ray.remote(num_gpus=1)
class Actor(object):
def getpid(self):
return os.getpid()
for _ in range(5):
# If we can successfully create an actor, that means that enough
# GPU resources are available.
a = Actor.remote()
ray.get(a.getpid.remote())
def test_actor_state(ray_start_regular):
@ray.remote
class Counter(object):
def __init__(self):
self.value = 0
def increase(self):
self.value += 1
def value(self):
return self.value
c1 = Counter.remote()
c1.increase.remote()
assert ray.get(c1.value.remote()) == 1
c2 = Counter.remote()
c2.increase.remote()
c2.increase.remote()
assert ray.get(c2.value.remote()) == 2
def test_actor_class_methods(ray_start_regular):
class Foo(object):
x = 2
@classmethod
def as_remote(cls):
return ray.remote(cls)
@classmethod
def f(cls):
return cls.x
@classmethod
def g(cls, y):
return cls.x + y
def echo(self, value):
return value
a = Foo.as_remote().remote()
assert ray.get(a.echo.remote(2)) == 2
assert ray.get(a.f.remote()) == 2
assert ray.get(a.g.remote(2)) == 4
def test_resource_assignment(shutdown_only):
"""Test to make sure that we assign resource to actors at instantiation."""
# This test will create 16 actors. Declaring this many CPUs initially will
# speed up the test because the workers will be started ahead of time.
ray.init(
num_cpus=16,
num_gpus=1,
resources={"Custom": 1},
object_store_memory=int(150 * 1024 * 1024))
class Actor(object):
def __init__(self):
self.resources = ray.get_resource_ids()
def get_actor_resources(self):
return self.resources
def get_actor_method_resources(self):
return ray.get_resource_ids()
decorator_resource_args = [{}, {
"num_cpus": 0.1
}, {
"num_gpus": 0.1
}, {
"resources": {
"Custom": 0.1
}
}]
instantiation_resource_args = [{}, {
"num_cpus": 0.2
}, {
"num_gpus": 0.2
}, {
"resources": {
"Custom": 0.2
}
}]
for decorator_args in decorator_resource_args:
for instantiation_args in instantiation_resource_args:
if len(decorator_args) == 0:
actor_class = ray.remote(Actor)
else:
actor_class = ray.remote(**decorator_args)(Actor)
actor = actor_class._remote(**instantiation_args)
actor_resources = ray.get(actor.get_actor_resources.remote())
actor_method_resources = ray.get(
actor.get_actor_method_resources.remote())
if len(decorator_args) == 0 and len(instantiation_args) == 0:
assert len(actor_resources) == 0, (
"Actor should not be assigned resources.")
assert list(actor_method_resources.keys()) == [
"CPU"
], ("Actor method should only have CPUs")
assert actor_method_resources["CPU"][0][1] == 1, (
"Actor method should default to one cpu.")
else:
if ("num_cpus" not in decorator_args
and "num_cpus" not in instantiation_args):
assert actor_resources["CPU"][0][1] == 1, (
"Actor should default to one cpu.")
correct_resources = {}
defined_resources = decorator_args.copy()
defined_resources.update(instantiation_args)
for resource, value in defined_resources.items():
if resource == "num_cpus":
correct_resources["CPU"] = value
elif resource == "num_gpus":
correct_resources["GPU"] = value
elif resource == "resources":
for custom_resource, amount in value.items():
correct_resources[custom_resource] = amount
for resource, amount in correct_resources.items():
assert (actor_resources[resource][0][0] ==
actor_method_resources[resource][0][0]), (
"Should have assigned same {} for both actor ",
"and actor method.".format(resource))
assert (actor_resources[resource][0][
1] == actor_method_resources[resource][0][1]), (
"Should have assigned same amount of {} for both ",
"actor and actor method.".format(resource))
assert actor_resources[resource][0][1] == amount, (
"Actor should have {amount} {resource} but has ",
"{amount} {resource}".format(
amount=amount, resource=resource))
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Failing with new GCS API on Linux.")
def test_actor_gpus(ray_start_cluster):
cluster = ray_start_cluster
num_nodes = 3
num_gpus_per_raylet = 4
for i in range(num_nodes):
cluster.add_node(
num_cpus=10 * num_gpus_per_raylet, num_gpus=num_gpus_per_raylet)
ray.init(address=cluster.address)
@ray.remote(num_gpus=1)
class Actor1(object):
def __init__(self):
self.gpu_ids = ray.get_gpu_ids()
def get_location_and_ids(self):
assert ray.get_gpu_ids() == self.gpu_ids
return (ray.worker.global_worker.node.unique_id,
tuple(self.gpu_ids))
# Create one actor per GPU.
actors = [Actor1.remote() for _ in range(num_nodes * num_gpus_per_raylet)]
# Make sure that no two actors are assigned to the same GPU.
locations_and_ids = ray.get(
[actor.get_location_and_ids.remote() for actor in actors])
node_names = {location for location, gpu_id in locations_and_ids}
assert len(node_names) == num_nodes
location_actor_combinations = []
for node_name in node_names:
for gpu_id in range(num_gpus_per_raylet):
location_actor_combinations.append((node_name, (gpu_id, )))
assert set(locations_and_ids) == set(location_actor_combinations)
# Creating a new actor should fail because all of the GPUs are being
# used.
a = Actor1.remote()
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
assert ready_ids == []
def test_actor_multiple_gpus(ray_start_cluster):
cluster = ray_start_cluster
num_nodes = 3
num_gpus_per_raylet = 5
for i in range(num_nodes):
cluster.add_node(
num_cpus=10 * num_gpus_per_raylet, num_gpus=num_gpus_per_raylet)
ray.init(address=cluster.address)
@ray.remote(num_gpus=2)
class Actor1(object):
def __init__(self):
self.gpu_ids = ray.get_gpu_ids()
def get_location_and_ids(self):
assert ray.get_gpu_ids() == self.gpu_ids
return (ray.worker.global_worker.node.unique_id,
tuple(self.gpu_ids))
# Create some actors.
actors1 = [Actor1.remote() for _ in range(num_nodes * 2)]
# Make sure that no two actors are assigned to the same GPU.
locations_and_ids = ray.get(
[actor.get_location_and_ids.remote() for actor in actors1])
node_names = {location for location, gpu_id in locations_and_ids}
assert len(node_names) == num_nodes
# Keep track of which GPU IDs are being used for each location.
gpus_in_use = {node_name: [] for node_name in node_names}
for location, gpu_ids in locations_and_ids:
gpus_in_use[location].extend(gpu_ids)
for node_name in node_names:
assert len(set(gpus_in_use[node_name])) == 4
# Creating a new actor should fail because all of the GPUs are being
# used.
a = Actor1.remote()
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
assert ready_ids == []
# We should be able to create more actors that use only a single GPU.
@ray.remote(num_gpus=1)
class Actor2(object):
def __init__(self):
self.gpu_ids = ray.get_gpu_ids()
def get_location_and_ids(self):
return (ray.worker.global_worker.node.unique_id,
tuple(self.gpu_ids))
# Create some actors.
actors2 = [Actor2.remote() for _ in range(num_nodes)]
# Make sure that no two actors are assigned to the same GPU.
locations_and_ids = ray.get(
[actor.get_location_and_ids.remote() for actor in actors2])
names = {location for location, gpu_id in locations_and_ids}
assert node_names == names
for location, gpu_ids in locations_and_ids:
gpus_in_use[location].extend(gpu_ids)
for node_name in node_names:
assert len(gpus_in_use[node_name]) == 5
assert set(gpus_in_use[node_name]) == set(range(5))
# Creating a new actor should fail because all of the GPUs are being
# used.
a = Actor2.remote()
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
assert ready_ids == []
def test_actor_different_numbers_of_gpus(ray_start_cluster):
# Test that we can create actors on two nodes that have different
# numbers of GPUs.
cluster = ray_start_cluster
cluster.add_node(num_cpus=10, num_gpus=0)
cluster.add_node(num_cpus=10, num_gpus=5)
cluster.add_node(num_cpus=10, num_gpus=10)
ray.init(address=cluster.address)
@ray.remote(num_gpus=1)
class Actor1(object):
def __init__(self):
self.gpu_ids = ray.get_gpu_ids()
def get_location_and_ids(self):
return (ray.worker.global_worker.node.unique_id,
tuple(self.gpu_ids))
# Create some actors.
actors = [Actor1.remote() for _ in range(0 + 5 + 10)]
# Make sure that no two actors are assigned to the same GPU.
locations_and_ids = ray.get(
[actor.get_location_and_ids.remote() for actor in actors])
node_names = {location for location, gpu_id in locations_and_ids}
assert len(node_names) == 2
for node_name in node_names:
node_gpu_ids = [
gpu_id for location, gpu_id in locations_and_ids
if location == node_name
]
assert len(node_gpu_ids) in [5, 10]
assert set(node_gpu_ids) == {(i, ) for i in range(len(node_gpu_ids))}
# Creating a new actor should fail because all of the GPUs are being
# used.
a = Actor1.remote()
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
assert ready_ids == []
def test_actor_multiple_gpus_from_multiple_tasks(ray_start_cluster):
cluster = ray_start_cluster
num_nodes = 5
num_gpus_per_raylet = 5
for i in range(num_nodes):
cluster.add_node(
num_cpus=10 * num_gpus_per_raylet,
num_gpus=num_gpus_per_raylet,
_internal_config=json.dumps({
"num_heartbeats_timeout": 1000
}))
ray.init(address=cluster.address)
@ray.remote
def create_actors(i, n):
@ray.remote(num_gpus=1)
class Actor(object):
def __init__(self, i, j):
self.gpu_ids = ray.get_gpu_ids()
def get_location_and_ids(self):
return ((ray.worker.global_worker.node.unique_id),
tuple(self.gpu_ids))
def sleep(self):
time.sleep(100)
# Create n actors.
actors = []
for j in range(n):
actors.append(Actor.remote(i, j))
locations = ray.get(
[actor.get_location_and_ids.remote() for actor in actors])
# Put each actor to sleep for a long time to prevent them from getting
# terminated.
for actor in actors:
actor.sleep.remote()
return locations
all_locations = ray.get([
create_actors.remote(i, num_gpus_per_raylet) for i in range(num_nodes)
])
# Make sure that no two actors are assigned to the same GPU.
node_names = {
location
for locations in all_locations for location, gpu_id in locations
}
assert len(node_names) == num_nodes
# Keep track of which GPU IDs are being used for each location.
gpus_in_use = {node_name: [] for node_name in node_names}
for locations in all_locations:
for location, gpu_ids in locations:
gpus_in_use[location].extend(gpu_ids)
for node_name in node_names:
assert len(set(gpus_in_use[node_name])) == num_gpus_per_raylet
@ray.remote(num_gpus=1)
class Actor(object):
def __init__(self):
self.gpu_ids = ray.get_gpu_ids()
def get_location_and_ids(self):
return (ray.worker.global_worker.node.unique_id,
tuple(self.gpu_ids))
# All the GPUs should be used up now.
a = Actor.remote()
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
assert ready_ids == []
@pytest.mark.skipif(
sys.version_info < (3, 0), reason="This test requires Python 3.")
def test_actors_and_tasks_with_gpus(ray_start_cluster):
cluster = ray_start_cluster
num_nodes = 3
num_gpus_per_raylet = 2
for i in range(num_nodes):
cluster.add_node(
num_cpus=num_gpus_per_raylet, num_gpus=num_gpus_per_raylet)
ray.init(address=cluster.address)
def check_intervals_non_overlapping(list_of_intervals):
for i in range(len(list_of_intervals)):
for j in range(i):
first_interval = list_of_intervals[i]
second_interval = list_of_intervals[j]
# Check that list_of_intervals[i] and list_of_intervals[j]
# don't overlap.
assert first_interval[0] < first_interval[1]
assert second_interval[0] < second_interval[1]
intervals_nonoverlapping = (
first_interval[1] <= second_interval[0]
or second_interval[1] <= first_interval[0])
assert intervals_nonoverlapping, (
"Intervals {} and {} are overlapping.".format(
first_interval, second_interval))
@ray.remote(num_gpus=1)
def f1():
t1 = time.monotonic()
time.sleep(0.1)
t2 = time.monotonic()
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 1
assert gpu_ids[0] in range(num_gpus_per_raylet)
return (ray.worker.global_worker.node.unique_id, tuple(gpu_ids),
[t1, t2])
@ray.remote(num_gpus=2)
def f2():
t1 = time.monotonic()
time.sleep(0.1)
t2 = time.monotonic()
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 2
assert gpu_ids[0] in range(num_gpus_per_raylet)
assert gpu_ids[1] in range(num_gpus_per_raylet)
return (ray.worker.global_worker.node.unique_id, tuple(gpu_ids),
[t1, t2])
@ray.remote(num_gpus=1)
class Actor1(object):
def __init__(self):
self.gpu_ids = ray.get_gpu_ids()
assert len(self.gpu_ids) == 1
assert self.gpu_ids[0] in range(num_gpus_per_raylet)
def get_location_and_ids(self):
assert ray.get_gpu_ids() == self.gpu_ids
return (ray.worker.global_worker.node.unique_id,
tuple(self.gpu_ids))
def locations_to_intervals_for_many_tasks():
# Launch a bunch of GPU tasks.
locations_ids_and_intervals = ray.get(
[f1.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)] +
[f2.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)] +
[f1.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)])
locations_to_intervals = collections.defaultdict(lambda: [])
for location, gpu_ids, interval in locations_ids_and_intervals:
for gpu_id in gpu_ids:
locations_to_intervals[(location, gpu_id)].append(interval)
return locations_to_intervals
# Run a bunch of GPU tasks.
locations_to_intervals = locations_to_intervals_for_many_tasks()
# For each GPU, verify that the set of tasks that used this specific
# GPU did not overlap in time.
for locations in locations_to_intervals:
check_intervals_non_overlapping(locations_to_intervals[locations])
# Create an actor that uses a GPU.
a = Actor1.remote()
actor_location = ray.get(a.get_location_and_ids.remote())
actor_location = (actor_location[0], actor_location[1][0])
# This check makes sure that actor_location is formatted the same way
# that the keys of locations_to_intervals are formatted.
assert actor_location in locations_to_intervals
# Run a bunch of GPU tasks.
locations_to_intervals = locations_to_intervals_for_many_tasks()
# For each GPU, verify that the set of tasks that used this specific
# GPU did not overlap in time.
for locations in locations_to_intervals:
check_intervals_non_overlapping(locations_to_intervals[locations])
# Make sure that the actor's GPU was not used.
assert actor_location not in locations_to_intervals
# Create more actors to fill up all the GPUs.
more_actors = [
Actor1.remote() for _ in range(num_nodes * num_gpus_per_raylet - 1)
]
# Wait for the actors to finish being created.
ray.get([actor.get_location_and_ids.remote() for actor in more_actors])
# Now if we run some GPU tasks, they should not be scheduled.
results = [f1.remote() for _ in range(30)]
ready_ids, remaining_ids = ray.wait(results, timeout=1.0)
assert len(ready_ids) == 0
def test_actors_and_tasks_with_gpus_version_two(shutdown_only):
# Create tasks and actors that both use GPUs and make sure that they
# are given different GPUs
num_gpus = 4
ray.init(
num_cpus=(num_gpus + 1),
num_gpus=num_gpus,
object_store_memory=int(150 * 1024 * 1024))
# The point of this actor is to record which GPU IDs have been seen. We
# can't just return them from the tasks, because the tasks don't return
# for a long time in order to make sure the GPU is not released
# prematurely.
@ray.remote
class RecordGPUs(object):
def __init__(self):
self.gpu_ids_seen = []
self.num_calls = 0
def add_ids(self, gpu_ids):
self.gpu_ids_seen += gpu_ids
self.num_calls += 1
def get_gpu_ids_and_calls(self):
return self.gpu_ids_seen, self.num_calls
@ray.remote(num_gpus=1)
def f(record_gpu_actor):
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 1
record_gpu_actor.add_ids.remote(gpu_ids)
# Sleep for a long time so that the GPU never gets released. This task
# will be killed by ray.shutdown() before it actually finishes.
time.sleep(1000)
@ray.remote(num_gpus=1)
class Actor(object):
def __init__(self, record_gpu_actor):
self.gpu_ids = ray.get_gpu_ids()
assert len(self.gpu_ids) == 1
record_gpu_actor.add_ids.remote(self.gpu_ids)
def check_gpu_ids(self):
assert ray.get_gpu_ids() == self.gpu_ids
record_gpu_actor = RecordGPUs.remote()
actors = []
actor_results = []
for _ in range(num_gpus // 2):
f.remote(record_gpu_actor)
a = Actor.remote(record_gpu_actor)
actor_results.append(a.check_gpu_ids.remote())
# Prevent the actor handle from going out of scope so that its GPU
# resources don't get released.
actors.append(a)
# Make sure that the actor method calls succeeded.
ray.get(actor_results)
start_time = time.time()
while time.time() - start_time < 30:
seen_gpu_ids, num_calls = ray.get(
record_gpu_actor.get_gpu_ids_and_calls.remote())
if num_calls == num_gpus:
break
assert set(seen_gpu_ids) == set(range(num_gpus))
def test_blocking_actor_task(shutdown_only):
ray.init(
num_cpus=1, num_gpus=1, object_store_memory=int(150 * 1024 * 1024))
@ray.remote(num_gpus=1)
def f():
return 1
@ray.remote
class Foo(object):
def __init__(self):
pass
def blocking_method(self):
ray.get(f.remote())
# Make sure we can execute a blocking actor method even if there is
# only one CPU.
actor = Foo.remote()
ray.get(actor.blocking_method.remote())
@ray.remote(num_cpus=1)
class CPUFoo(object):
def __init__(self):
pass
def blocking_method(self):
ray.get(f.remote())
# Make sure that lifetime CPU resources are not released when actors
# block.
actor = CPUFoo.remote()
x_id = actor.blocking_method.remote()
ready_ids, remaining_ids = ray.wait([x_id], timeout=1.0)
assert ready_ids == []
assert remaining_ids == [x_id]
@ray.remote(num_gpus=1)
class GPUFoo(object):
def __init__(self):
pass
def blocking_method(self):
ray.get(f.remote())
# Make sure that GPU resources are not released when actors block.
actor = GPUFoo.remote()
x_id = actor.blocking_method.remote()
ready_ids, remaining_ids = ray.wait([x_id], timeout=1.0)
assert ready_ids == []
assert remaining_ids == [x_id]
@pytest.mark.skipif(
sys.version_info < (3, 0),
reason="This test is currently failing on Python 2.7.")
def test_lifetime_and_transient_resources(ray_start_regular):
# This actor acquires resources only when running methods.
@ray.remote
class Actor1(object):
def method(self):
pass
# This actor acquires resources for its lifetime.
@ray.remote(num_cpus=1)
class Actor2(object):
def method(self):
pass
actor1s = [Actor1.remote() for _ in range(10)]
ray.get([a.method.remote() for a in actor1s])
actor2s = [Actor2.remote() for _ in range(2)]
results = [a.method.remote() for a in actor2s]
ready_ids, remaining_ids = ray.wait(
results, num_returns=len(results), timeout=5.0)
assert len(ready_ids) == 1
def test_custom_label_placement(ray_start_cluster):
cluster = ray_start_cluster
cluster.add_node(num_cpus=2, resources={"CustomResource1": 2})
cluster.add_node(num_cpus=2, resources={"CustomResource2": 2})
ray.init(address=cluster.address)
@ray.remote(resources={"CustomResource1": 1})
class ResourceActor1(object):
def get_location(self):
return ray.worker.global_worker.node.unique_id
@ray.remote(resources={"CustomResource2": 1})
class ResourceActor2(object):
def get_location(self):
return ray.worker.global_worker.node.unique_id
node_id = ray.worker.global_worker.node.unique_id
# Create some actors.
actors1 = [ResourceActor1.remote() for _ in range(2)]
actors2 = [ResourceActor2.remote() for _ in range(2)]
locations1 = ray.get([a.get_location.remote() for a in actors1])
locations2 = ray.get([a.get_location.remote() for a in actors2])
for location in locations1:
assert location == node_id
for location in locations2:
assert location != node_id
def test_creating_more_actors_than_resources(shutdown_only):
ray.init(num_cpus=10, num_gpus=2, resources={"CustomResource1": 1})
@ray.remote(num_gpus=1)
class ResourceActor1(object):
def method(self):
return ray.get_gpu_ids()[0]
@ray.remote(resources={"CustomResource1": 1})
class ResourceActor2(object):
def method(self):
pass
# Make sure the first two actors get created and the third one does
# not.
actor1 = ResourceActor1.remote()
result1 = actor1.method.remote()
ray.wait([result1])
actor2 = ResourceActor1.remote()
result2 = actor2.method.remote()
ray.wait([result2])
actor3 = ResourceActor1.remote()
result3 = actor3.method.remote()
ready_ids, _ = ray.wait([result3], timeout=0.2)
assert len(ready_ids) == 0
# By deleting actor1, we free up resources to create actor3.
del actor1
results = ray.get([result1, result2, result3])
assert results[0] == results[2]
assert set(results) == {0, 1}
# Make sure that when one actor goes out of scope a new actor is
# created because some resources have been freed up.
results = []
for _ in range(3):
actor = ResourceActor2.remote()
object_id = actor.method.remote()
results.append(object_id)
# Wait for the task to execute. We do this because otherwise it may
# be possible for the __ray_terminate__ task to execute before the
# method.
ray.wait([object_id])
ray.get(results)
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
File diff suppressed because it is too large Load Diff
+7 -1
View File
@@ -10,7 +10,7 @@ import sys
import ray
import ray.experimental.array.remote as ra
import ray.experimental.array.distributed as da
import ray.tests.cluster_utils
import ray.cluster_utils
if sys.version_info >= (3, 0):
from importlib import reload
@@ -216,3 +216,9 @@ def test_distributed_array_methods(ray_start_cluster_2_nodes, reload_modules):
d1 = np.random.randint(1, 35)
d2 = np.random.randint(1, 35)
test_dist_qr(d1, d2)
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+3 -2
View File
@@ -17,7 +17,7 @@ from ray.autoscaler.autoscaler import StandardAutoscaler, LoadMetrics, \
from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_NODE_STATUS, \
STATUS_UP_TO_DATE, STATUS_UPDATE_FAILED
from ray.autoscaler.node_provider import NODE_PROVIDERS, NodeProvider
from ray.tests.utils import RayTestTimeoutException
from ray.test_utils import RayTestTimeoutException
import pytest
@@ -1084,4 +1084,5 @@ class AutoscalingTest(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import sys
sys.exit(pytest.main(["-v", __file__]))
+4 -2
View File
@@ -7,7 +7,7 @@ import unittest
import yaml
from ray.autoscaler.autoscaler import fillout_defaults, validate_config
from ray.tests.utils import recursive_fnmatch
from ray.test_utils import recursive_fnmatch
RAY_PATH = os.path.abspath(os.path.join(__file__, "../../"))
CONFIG_PATHS = recursive_fnmatch(
@@ -31,4 +31,6 @@ class AutoscalingConfigTest(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
File diff suppressed because it is too large Load Diff
-56
View File
@@ -1,56 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import ray
from ray.experimental.streaming.batched_queue import BatchedQueue
@ray.remote
class Reader(object):
def __init__(self, queue):
self.queue = queue
self.num_reads = 0
self.start = time.time()
def read(self, read_slowly):
expected_value = 0
for _ in range(1000):
x = self.queue.read_next()
assert x == expected_value, (x, expected_value)
expected_value += 1
self.num_reads += 1
if read_slowly:
time.sleep(0.001)
def test_batched_queue(ray_start_regular):
# Batched queue parameters
max_queue_size = 10000 # Max number of batches in queue
max_batch_size = 1000 # Max number of elements per batch
batch_timeout = 0.001 # 1ms flush timeout
prefetch_depth = 10 # Number of batches to prefetch from plasma
background_flush = False # Don't use daemon thread for flushing
# Two tests: one with a big queue and slow reader, and
# a second one with a small queue and a faster reader
for read_slowly in [True, False]:
# Construct the batched queue
queue = BatchedQueue(
max_size=max_queue_size,
max_batch_size=max_batch_size,
max_batch_time=batch_timeout,
prefetch_depth=prefetch_depth,
background_flush=background_flush)
# Create and start the reader
reader = Reader.remote(queue)
object_id = reader.read.remote(read_slowly=read_slowly)
value = 0
for _ in range(1000):
queue.put_next(value)
value += 1
queue._flush_writes()
ray.get(object_id)
# Test once more with a very small queue size and a faster reader
max_queue_size = 10
+8 -3
View File
@@ -13,9 +13,9 @@ import pytest
import ray
import ray.ray_constants as ray_constants
from ray.tests.cluster_utils import Cluster
from ray.tests.utils import (run_string_as_driver_nonblocking,
RayTestTimeoutException)
from ray.cluster_utils import Cluster
from ray.test_utils import (run_string_as_driver_nonblocking,
RayTestTimeoutException)
# This test checks that when a worker dies in the middle of a get, the plasma
@@ -441,3 +441,8 @@ def test_driver_lives_parallel(ray_start_regular):
process_info.process.wait()
# If the driver can reach the tearDown method, then it is still alive.
if __name__ == "__main__":
import pytest
sys.exit(pytest.main(["-v", __file__]))
-42
View File
@@ -1,42 +0,0 @@
from __future__ import absolute_import, division, print_function
import os
import unittest
import redis
import ray
def parse_client(addr_port_str):
address, redis_port = addr_port_str.split(":")
return redis.StrictRedis(host=address, port=redis_port)
@unittest.skipIf(not os.environ.get("RAY_USE_NEW_GCS", False),
"Tests functionality of the new GCS.")
class CredisTest(unittest.TestCase):
def setUp(self):
self.config = ray.init(num_cpus=0)
def tearDown(self):
ray.shutdown()
def test_credis_started(self):
assert "redis_address" in self.config
primary = parse_client(self.config["redis_address"])
assert primary.ping() is True
member = primary.lrange("RedisShards", 0, -1)[0]
shard = parse_client(member.decode())
# Check that primary has loaded credis's master module.
chain = primary.execute_command("MASTER.GET_CHAIN")
assert len(chain) == 1
# Check that the shard has loaded credis' member module.
assert chain[0] == member
assert shard.execute_command("MEMBER.SN") == -1
if __name__ == "__main__":
unittest.main(verbosity=2)
+3 -1
View File
@@ -54,4 +54,6 @@ class CythonTest(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+9
View File
@@ -47,3 +47,12 @@ def test_raylet_gdb(ray_gdb_start):
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
assert pgrep_command.communicate()[0]
if __name__ == "__main__":
import pytest
import sys
# Make subprocess happy in bazel.
os.environ["LC_ALL"] = "en_US.UTF-8"
os.environ["LANG"] = "en_US.UTF-8"
sys.exit(pytest.main(["-v", __file__]))
+8 -2
View File
@@ -6,8 +6,8 @@ import logging
import time
import ray
import ray.tests.cluster_utils
import ray.tests.utils
import ray.cluster_utils
import ray.test_utils
logger = logging.getLogger(__name__)
@@ -605,3 +605,9 @@ def test_release_cpus_when_actor_creation_task_blocking(shutdown_only):
result = wait_until(assert_available_resources, 1000)
assert result is True
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+7 -2
View File
@@ -15,8 +15,8 @@ import redis
import ray
import ray.ray_constants as ray_constants
from ray.tests.cluster_utils import Cluster
from ray.tests.utils import (
from ray.cluster_utils import Cluster
from ray.test_utils import (
relevant_errors,
wait_for_errors,
RayTestTimeoutException,
@@ -903,3 +903,8 @@ def test_direct_call_serialized_id(ray_start_cluster):
obj = small_object.remote()
ray.get(get.remote([obj]))
if __name__ == "__main__":
import pytest
sys.exit(pytest.main(["-v", __file__]))
+8 -2
View File
@@ -10,8 +10,8 @@ import logging
import pytest
import ray
import ray.tests.cluster_utils
import ray.tests.utils
import ray.cluster_utils
import ray.test_utils
logger = logging.getLogger(__name__)
@@ -78,3 +78,9 @@ def test_pending_task_dependency(shutdown_only):
ray.put(np_array)
ray.get(oid)
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+6
View File
@@ -78,3 +78,9 @@ def test_add_remove_cluster_resources(ray_start_cluster_head):
nodes += [cluster.add_node(num_cpus=1)]
cluster.wait_for_nodes()
assert ray.cluster_resources()["CPU"] == 6
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+6
View File
@@ -202,3 +202,9 @@ def test_channel_generation():
def test_wordcount():
"""Tests a simple streaming wordcount."""
pass
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+3 -1
View File
@@ -86,4 +86,6 @@ class TestMemoryLimits(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+3 -1
View File
@@ -152,4 +152,6 @@ class TestMemoryScheduling(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+10 -2
View File
@@ -10,7 +10,7 @@ import time
import ray
from ray.core.generated import node_manager_pb2
from ray.core.generated import node_manager_pb2_grpc
from ray.tests.utils import RayTestTimeoutException
from ray.test_utils import RayTestTimeoutException
def test_worker_stats(ray_start_regular):
@@ -53,5 +53,13 @@ def test_worker_stats(ray_start_regular):
if p.info["pid"] in pids
]
for process in processes:
assert "python" in process or "ray" in process
# TODO(ekl) why does travis/mi end up in the process list
assert ("python" in process or "ray" in process
or "travis" in process)
break
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+8 -7
View File
@@ -2,7 +2,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pytest
import time
import numpy as np
@@ -104,9 +103,11 @@ def test_cache(ray_start_regular):
d = time.time() - c
if d > 1.5 * b:
if os.getenv("TRAVIS") is None:
raise Exception("The caching test was too slow. "
"d = {}, b = {}".format(d, b))
else:
print("WARNING: The caching test was too slow. "
"d = {}, b = {}".format(d, b))
print("WARNING: The caching test was too slow. "
"d = {}, b = {}".format(d, b))
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+9
View File
@@ -107,3 +107,12 @@ def test_cleanup_on_driver_exit_single_redis_shard():
def test_cleanup_on_driver_exit_many_redis_shards():
_test_cleanup_on_driver_exit(num_redis_shards=5)
_test_cleanup_on_driver_exit(num_redis_shards=31)
if __name__ == "__main__":
import pytest
import sys
# Make subprocess happy in bazel.
os.environ["LC_ALL"] = "en_US.UTF-8"
os.environ["LANG"] = "en_US.UTF-8"
sys.exit(pytest.main(["-v", __file__]))
+10 -1
View File
@@ -8,7 +8,7 @@ import subprocess
import time
import ray
from ray.tests.utils import (
from ray.test_utils import (
RayTestTimeoutException,
run_string_as_driver,
run_string_as_driver_nonblocking,
@@ -615,3 +615,12 @@ def test_use_pickle(call_ray_start):
return (3, "world")
assert ray.get(f.remote(x)) == (3, "world")
if __name__ == "__main__":
import pytest
import sys
# Make subprocess happy in bazel.
os.environ["LC_ALL"] = "en_US.UTF-8"
os.environ["LANG"] = "en_US.UTF-8"
sys.exit(pytest.main(["-v", __file__]))
+8 -2
View File
@@ -9,8 +9,8 @@ import time
import ray
import ray.ray_constants as ray_constants
from ray.monitor import Monitor
from ray.tests.cluster_utils import Cluster
from ray.tests.conftest import generate_internal_config_map
from ray.cluster_utils import Cluster
from ray.test_utils import generate_internal_config_map
logger = logging.getLogger(__name__)
@@ -221,3 +221,9 @@ def test_worker_plasma_store_failure(ray_start_cluster_head):
worker.kill_plasma_store()
worker.all_processes[ray_constants.PROCESS_TYPE_RAYLET][0].process.wait()
assert not worker.any_processes_alive(), worker.live_processes()
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+273
View File
@@ -0,0 +1,273 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import os
import signal
import sys
import time
import numpy as np
import pytest
import ray
import ray.ray_constants as ray_constants
from ray.cluster_utils import Cluster
from ray.test_utils import RayTestTimeoutException
@pytest.fixture(params=[(1, 4), (4, 4)])
def ray_start_workers_separate_multinode(request):
num_nodes = request.param[0]
num_initial_workers = request.param[1]
# Start the Ray processes.
cluster = Cluster()
for _ in range(num_nodes):
cluster.add_node(num_cpus=num_initial_workers)
ray.init(address=cluster.address)
yield num_nodes, num_initial_workers
# The code after the yield will run as teardown code.
ray.shutdown()
cluster.shutdown()
def test_worker_failed(ray_start_workers_separate_multinode):
num_nodes, num_initial_workers = (ray_start_workers_separate_multinode)
@ray.remote
def get_pids():
time.sleep(0.25)
return os.getpid()
start_time = time.time()
pids = set()
while len(pids) < num_nodes * num_initial_workers:
new_pids = ray.get([
get_pids.remote()
for _ in range(2 * num_nodes * num_initial_workers)
])
for pid in new_pids:
pids.add(pid)
if time.time() - start_time > 60:
raise RayTestTimeoutException(
"Timed out while waiting to get worker PIDs.")
@ray.remote
def f(x):
time.sleep(0.5)
return x
# Submit more tasks than there are workers so that all workers and
# cores are utilized.
object_ids = [f.remote(i) for i in range(num_initial_workers * num_nodes)]
object_ids += [f.remote(object_id) for object_id in object_ids]
# Allow the tasks some time to begin executing.
time.sleep(0.1)
# Kill the workers as the tasks execute.
for pid in pids:
os.kill(pid, signal.SIGKILL)
time.sleep(0.1)
# Make sure that we either get the object or we get an appropriate
# exception.
for object_id in object_ids:
try:
ray.get(object_id)
except (ray.exceptions.RayTaskError, ray.exceptions.RayWorkerError):
pass
def _test_component_failed(cluster, component_type):
"""Kill a component on all worker nodes and check workload succeeds."""
# Submit many tasks with many dependencies.
@ray.remote
def f(x):
return x
@ray.remote
def g(*xs):
return 1
# Kill the component on all nodes except the head node as the tasks
# execute. Do this in a loop while submitting tasks between each
# component failure.
time.sleep(0.1)
worker_nodes = cluster.list_all_nodes()[1:]
assert len(worker_nodes) > 0
for node in worker_nodes:
process = node.all_processes[component_type][0].process
# Submit a round of tasks with many dependencies.
x = 1
for _ in range(1000):
x = f.remote(x)
xs = [g.remote(1)]
for _ in range(100):
xs.append(g.remote(*xs))
xs.append(g.remote(1))
# Kill a component on one of the nodes.
process.terminate()
time.sleep(1)
process.kill()
process.wait()
assert not process.poll() is None
# Make sure that we can still get the objects after the
# executing tasks died.
ray.get(x)
ray.get(xs)
def check_components_alive(cluster, component_type, check_component_alive):
"""Check that a given component type is alive on all worker nodes."""
worker_nodes = cluster.list_all_nodes()[1:]
assert len(worker_nodes) > 0
for node in worker_nodes:
process = node.all_processes[component_type][0].process
if check_component_alive:
assert process.poll() is None
else:
print("waiting for " + component_type + " with PID " +
str(process.pid) + "to terminate")
process.wait()
print("done waiting for " + component_type + " with PID " +
str(process.pid) + "to terminate")
assert not process.poll() is None
@pytest.mark.parametrize(
"ray_start_cluster", [{
"num_cpus": 8,
"num_nodes": 4,
"_internal_config": json.dumps({
"num_heartbeats_timeout": 100
}),
}],
indirect=True)
def test_raylet_failed(ray_start_cluster):
cluster = ray_start_cluster
# Kill all raylets on worker nodes.
_test_component_failed(cluster, ray_constants.PROCESS_TYPE_RAYLET)
# The plasma stores should still be alive on the worker nodes.
check_components_alive(cluster, ray_constants.PROCESS_TYPE_PLASMA_STORE,
True)
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
@pytest.mark.parametrize(
"ray_start_cluster", [{
"num_cpus": 8,
"num_nodes": 2,
"_internal_config": json.dumps({
"num_heartbeats_timeout": 100
}),
}],
indirect=True)
def test_plasma_store_failed(ray_start_cluster):
cluster = ray_start_cluster
# Kill all plasma stores on worker nodes.
_test_component_failed(cluster, ray_constants.PROCESS_TYPE_PLASMA_STORE)
# No processes should be left alive on the worker nodes.
check_components_alive(cluster, ray_constants.PROCESS_TYPE_PLASMA_STORE,
False)
check_components_alive(cluster, ray_constants.PROCESS_TYPE_RAYLET, False)
@pytest.mark.parametrize(
"ray_start_cluster", [{
"num_cpus": 4,
"num_nodes": 3,
"do_init": True
}],
indirect=True)
def test_actor_creation_node_failure(ray_start_cluster):
# TODO(swang): Refactor test_raylet_failed, etc to reuse the below code.
cluster = ray_start_cluster
@ray.remote
class Child(object):
def __init__(self, death_probability):
self.death_probability = death_probability
def ping(self):
# Exit process with some probability.
exit_chance = np.random.rand()
if exit_chance < self.death_probability:
sys.exit(-1)
num_children = 50
# Children actors will die about half the time.
death_probability = 0.5
children = [Child.remote(death_probability) for _ in range(num_children)]
while len(cluster.list_all_nodes()) > 1:
for j in range(2):
# Submit some tasks on the actors. About half of the actors will
# fail.
children_out = [child.ping.remote() for child in children]
# Wait a while for all the tasks to complete. This should trigger
# reconstruction for any actor creation tasks that were forwarded
# to nodes that then failed.
ready, _ = ray.wait(
children_out, num_returns=len(children_out), timeout=5 * 60.0)
assert len(ready) == len(children_out)
# Replace any actors that died.
for i, out in enumerate(children_out):
try:
ray.get(out)
except ray.exceptions.RayActorError:
children[i] = Child.remote(death_probability)
# Remove a node. Any actor creation tasks that were forwarded to this
# node must be reconstructed.
cluster.remove_node(cluster.list_all_nodes()[-1])
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
def test_driver_lives_sequential(ray_start_regular):
ray.worker._global_node.kill_raylet()
ray.worker._global_node.kill_plasma_store()
ray.worker._global_node.kill_log_monitor()
ray.worker._global_node.kill_monitor()
ray.worker._global_node.kill_raylet_monitor()
# If the driver can reach the tearDown method, then it is still alive.
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
def test_driver_lives_parallel(ray_start_regular):
all_processes = ray.worker._global_node.all_processes
process_infos = (all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] +
all_processes[ray_constants.PROCESS_TYPE_RAYLET] +
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] +
all_processes[ray_constants.PROCESS_TYPE_MONITOR] +
all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR])
assert len(process_infos) == 5
# Kill all the components in parallel.
for process_info in process_infos:
process_info.process.terminate()
time.sleep(0.1)
for process_info in process_infos:
process_info.process.kill()
for process_info in process_infos:
process_info.process.wait()
# If the driver can reach the tearDown method, then it is still alive.
if __name__ == "__main__":
import pytest
sys.exit(pytest.main(["-v", __file__]))
+7 -1
View File
@@ -3,7 +3,7 @@ from __future__ import division
from __future__ import print_function
import ray
from ray.tests.utils import run_string_as_driver
from ray.test_utils import run_string_as_driver
# This tests the queue transitions for infeasible tasks. This has been an issue
@@ -48,3 +48,9 @@ f.remote()
ray.get([
f._submit(args=[], kwargs={}, resources={str(i): 1}) for i in range(3)
])
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+7 -1
View File
@@ -11,7 +11,7 @@ import time
import warnings
import ray
from ray.tests.cluster_utils import Cluster
from ray.cluster_utils import Cluster
# TODO(yuhguo): This test file requires a lot of CPU/memory, and
# better be put in Jenkins. However, it fails frequently in Jenkins, but
@@ -325,3 +325,9 @@ def test_many_small_transfers(ray_start_cluster_with_resource):
do_transfers()
do_transfers()
do_transfers()
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+8
View File
@@ -237,3 +237,11 @@ def test_session_create_multiple():
"session-tests/commands-test", session_start,
["first", "--a", "*", "--b", "*"])
assert result.exit_code == 1
if __name__ == "__main__":
import sys
# Make subprocess happy in bazel.
os.environ["LC_ALL"] = "en_US.UTF-8"
os.environ["LANG"] = "en_US.UTF-8"
sys.exit(pytest.main(["-v", __file__]))
+5
View File
@@ -119,3 +119,8 @@ def test_queue(ray_start_regular):
assert q.get() == item
size -= 1
assert q.qsize() == size
if __name__ == "__main__":
import sys
sys.exit(pytest.main(["-v", __file__]))
+7 -1
View File
@@ -7,7 +7,7 @@ import pytest
import redis
import ray
from ray.tests.cluster_utils import Cluster
from ray.cluster_utils import Cluster
@pytest.fixture
@@ -61,3 +61,9 @@ class TestRedisPassword(object):
object_id = f.remote()
ray.get(object_id)
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+6
View File
@@ -387,3 +387,9 @@ def test_small_receive_timeout(ray_start_regular):
result_list = ray.experimental.signal.receive([a], timeout=small_timeout)
assert len(result_list) == 1
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+5 -450
View File
@@ -2,39 +2,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import numpy as np
import os
import pytest
import sys
import time
import ray
from ray.tests.cluster_utils import Cluster
from ray.tests.utils import flat_errors
import ray.ray_constants as ray_constants
@pytest.fixture(params=[1, 4])
def ray_start_sharded(request):
num_redis_shards = request.param
if os.environ.get("RAY_USE_NEW_GCS") == "on":
num_redis_shards = 1
# For now, RAY_USE_NEW_GCS supports 1 shard, and credis supports
# 1-node chain for that shard only.
# Start the Ray processes.
ray.init(
object_store_memory=int(0.5 * 10**9),
num_cpus=10,
num_redis_shards=num_redis_shards,
redis_max_memory=10**7)
yield None
# The code after the yield will run as teardown code.
ray.shutdown()
from ray.cluster_utils import Cluster
@pytest.fixture(params=[(1, 4), (4, 4)])
@@ -105,71 +78,6 @@ def test_dependencies(ray_start_combination):
assert cluster.remaining_processes_alive()
def test_submitting_many_tasks(ray_start_sharded):
@ray.remote
def f(x):
return 1
def g(n):
x = 1
for i in range(n):
x = f.remote(x)
return x
ray.get([g(1000) for _ in range(100)])
assert ray.services.remaining_processes_alive()
def test_submitting_many_actors_to_one(ray_start_sharded):
@ray.remote
class Actor(object):
def __init__(self):
pass
def ping(self):
return
@ray.remote
class Worker(object):
def __init__(self, actor):
self.actor = actor
def ping(self):
return ray.get(self.actor.ping.remote())
a = Actor.remote()
workers = [Worker.remote(a) for _ in range(10)]
for _ in range(10):
out = ray.get([w.ping.remote() for w in workers])
assert out == [None for _ in workers]
def test_getting_and_putting(ray_start_sharded):
for n in range(8):
x = np.zeros(10**n)
for _ in range(100):
ray.put(x)
x_id = ray.put(x)
for _ in range(1000):
ray.get(x_id)
assert ray.services.remaining_processes_alive()
def test_getting_many_objects(ray_start_sharded):
@ray.remote
def f():
return 1
n = 10**4 # TODO(pcm): replace by 10 ** 5 once this is faster.
lst = ray.get([f.remote() for _ in range(n)])
assert lst == n * [1]
assert ray.services.remaining_processes_alive()
def test_wait(ray_start_combination):
num_nodes, num_workers_per_scheduler, cluster = ray_start_combination
num_workers = num_nodes * num_workers_per_scheduler
@@ -197,360 +105,7 @@ def test_wait(ray_start_combination):
assert cluster.remaining_processes_alive()
@pytest.fixture(params=[1, 4])
def ray_start_reconstruction(request):
num_nodes = request.param
plasma_store_memory = int(0.5 * 10**9)
cluster = Cluster(
initialize_head=True,
head_node_args={
"num_cpus": 1,
"object_store_memory": plasma_store_memory // num_nodes,
"redis_max_memory": 10**7,
"_internal_config": json.dumps({
"initial_reconstruction_timeout_milliseconds": 200
})
})
for i in range(num_nodes - 1):
cluster.add_node(
num_cpus=1,
object_store_memory=plasma_store_memory // num_nodes,
_internal_config=json.dumps({
"initial_reconstruction_timeout_milliseconds": 200
}))
ray.init(address=cluster.address)
yield plasma_store_memory, num_nodes, cluster
# Clean up the Ray cluster.
ray.shutdown()
cluster.shutdown()
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Failing with new GCS API on Linux.")
def test_simple(ray_start_reconstruction):
plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
# Define the size of one task's return argument so that the combined
# sum of all objects' sizes is at least twice the plasma stores'
# combined allotted memory.
num_objects = 100
size = int(plasma_store_memory * 1.5 / (num_objects * 8))
# Define a remote task with no dependencies, which returns a numpy
# array of the given size.
@ray.remote
def foo(i, size):
array = np.zeros(size)
array[0] = i
return array
# Launch num_objects instances of the remote task.
args = []
for i in range(num_objects):
args.append(foo.remote(i, size))
# Get each value to force each task to finish. After some number of
# gets, old values should be evicted.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get each value again to force reconstruction.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get values sequentially, in chunks.
num_chunks = 4 * num_nodes
chunk = num_objects // num_chunks
for i in range(num_chunks):
values = ray.get(args[i * chunk:(i + 1) * chunk])
del values
assert cluster.remaining_processes_alive()
def sorted_random_indexes(total, output_num):
random_indexes = [np.random.randint(total) for _ in range(output_num)]
random_indexes.sort()
return random_indexes
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Failing with new GCS API on Linux.")
def test_recursive(ray_start_reconstruction):
plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
# Define the size of one task's return argument so that the combined
# sum of all objects' sizes is at least twice the plasma stores'
# combined allotted memory.
num_objects = 100
size = int(plasma_store_memory * 1.5 / (num_objects * 8))
# Define a root task with no dependencies, which returns a numpy array
# of the given size.
@ray.remote
def no_dependency_task(size):
array = np.zeros(size)
return array
# Define a task with a single dependency, which returns its one
# argument.
@ray.remote
def single_dependency(i, arg):
arg = np.copy(arg)
arg[0] = i
return arg
# Launch num_objects instances of the remote task, each dependent on
# the one before it.
arg = no_dependency_task.remote(size)
args = []
for i in range(num_objects):
arg = single_dependency.remote(i, arg)
args.append(arg)
# Get each value to force each task to finish. After some number of
# gets, old values should be evicted.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get each value again to force reconstruction.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get 10 values randomly.
random_indexes = sorted_random_indexes(num_objects, 10)
for i in random_indexes:
value = ray.get(args[i])
assert value[0] == i
# Get values sequentially, in chunks.
num_chunks = 4 * num_nodes
chunk = num_objects // num_chunks
for i in range(num_chunks):
values = ray.get(args[i * chunk:(i + 1) * chunk])
del values
assert cluster.remaining_processes_alive()
@pytest.mark.skip(reason="This test often hangs or fails in CI.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Failing with new GCS API on Linux.")
def test_multiple_recursive(ray_start_reconstruction):
plasma_store_memory, _, cluster = ray_start_reconstruction
# Define the size of one task's return argument so that the combined
# sum of all objects' sizes is at least twice the plasma stores'
# combined allotted memory.
num_objects = 100
size = plasma_store_memory * 2 // (num_objects * 8)
# Define a root task with no dependencies, which returns a numpy array
# of the given size.
@ray.remote
def no_dependency_task(size):
array = np.zeros(size)
return array
# Define a task with multiple dependencies, which returns its first
# argument.
@ray.remote
def multiple_dependency(i, arg1, arg2, arg3):
arg1 = np.copy(arg1)
arg1[0] = i
return arg1
# Launch num_args instances of the root task. Then launch num_objects
# instances of the multi-dependency remote task, each dependent on the
# num_args tasks before it.
num_args = 3
args = []
for i in range(num_args):
arg = no_dependency_task.remote(size)
args.append(arg)
for i in range(num_objects):
args.append(multiple_dependency.remote(i, *args[i:i + num_args]))
# Get each value to force each task to finish. After some number of
# gets, old values should be evicted.
args = args[num_args:]
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get each value again to force reconstruction.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get 10 values randomly.
random_indexes = sorted_random_indexes(num_objects, 10)
for i in random_indexes:
value = ray.get(args[i])
assert value[0] == i
assert cluster.remaining_processes_alive()
def wait_for_errors(error_check):
# Wait for errors from all the nondeterministic tasks.
errors = []
time_left = 100
while time_left > 0:
errors = flat_errors()
if error_check(errors):
break
time_left -= 1
time.sleep(1)
# Make sure that enough errors came through.
assert error_check(errors)
return errors
@pytest.mark.skip("This test does not work yet.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Failing with new GCS API on Linux.")
def test_nondeterministic_task(ray_start_reconstruction):
plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
# Define the size of one task's return argument so that the combined
# sum of all objects' sizes is at least twice the plasma stores'
# combined allotted memory.
num_objects = 1000
size = plasma_store_memory * 2 // (num_objects * 8)
# Define a nondeterministic remote task with no dependencies, which
# returns a random numpy array of the given size. This task should
# produce an error on the driver if it is ever reexecuted.
@ray.remote
def foo(i, size):
array = np.random.rand(size)
array[0] = i
return array
# Define a deterministic remote task with no dependencies, which
# returns a numpy array of zeros of the given size.
@ray.remote
def bar(i, size):
array = np.zeros(size)
array[0] = i
return array
# Launch num_objects instances, half deterministic and half
# nondeterministic.
args = []
for i in range(num_objects):
if i % 2 == 0:
args.append(foo.remote(i, size))
else:
args.append(bar.remote(i, size))
# Get each value to force each task to finish. After some number of
# gets, old values should be evicted.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get each value again to force reconstruction.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
def error_check(errors):
if num_nodes == 1:
# In a single-node setting, each object is evicted and
# reconstructed exactly once, so exactly half the objects will
# produce an error during reconstruction.
min_errors = num_objects // 2
else:
# In a multinode setting, each object is evicted zero or one
# times, so some of the nondeterministic tasks may not be
# reexecuted.
min_errors = 1
return len(errors) >= min_errors
errors = wait_for_errors(error_check)
# Make sure all the errors have the correct type.
assert all(error["type"] == ray_constants.HASH_MISMATCH_PUSH_ERROR
for error in errors)
assert cluster.remaining_processes_alive()
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Failing with new GCS API on Linux.")
@pytest.mark.skipif(
sys.version_info < (3, 0), reason="This test requires Python 3.")
@pytest.mark.parametrize(
"ray_start_object_store_memory", [10**9], indirect=True)
def test_driver_put_errors(ray_start_object_store_memory):
plasma_store_memory = ray_start_object_store_memory
# Define the size of one task's return argument so that the combined
# sum of all objects' sizes is at least twice the plasma stores'
# combined allotted memory.
num_objects = 100
size = plasma_store_memory * 2 // (num_objects * 8)
# Define a task with a single dependency, a numpy array, that returns
# another array.
@ray.remote
def single_dependency(i, arg):
arg = np.copy(arg)
arg[0] = i
return arg
# Launch num_objects instances of the remote task, each dependent on
# the one before it. The first instance of the task takes a numpy array
# as an argument, which is put into the object store.
args = []
arg = single_dependency.remote(0, np.zeros(size))
for i in range(num_objects):
arg = single_dependency.remote(i, arg)
args.append(arg)
# Get each value to force each task to finish. After some number of
# gets, old values should be evicted.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get each value starting from the beginning to force reconstruction.
# Currently, since we're not able to reconstruct `ray.put` objects that
# were evicted and whose originating tasks are still running, this
# for-loop should hang on its first iteration and push an error to the
# driver.
ray.worker.global_worker.raylet_client.fetch_or_reconstruct([args[0]],
False)
def error_check(errors):
return len(errors) > 1
errors = wait_for_errors(error_check)
assert all(error["type"] == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR
or "ray.exceptions.UnreconstructableError" in error["message"]
for error in errors)
# NOTE(swang): This test tries to launch 1000 workers and breaks.
# TODO(rkn): This test needs to be updated to use pytest.
# class WorkerPoolTests(unittest.TestCase):
#
# def tearDown(self):
# ray.shutdown()
#
# def testBlockingTasks(self):
# @ray.remote
# def f(i, j):
# return (i, j)
#
# @ray.remote
# def g(i):
# # Each instance of g submits and blocks on the result of another remote
# # task.
# object_ids = [f.remote(i, j) for j in range(10)]
# return ray.get(object_ids)
#
# ray.init(num_workers=1)
# ray.get([g.remote(i) for i in range(1000)])
# ray.shutdown()
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+379
View File
@@ -0,0 +1,379 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import numpy as np
import os
import pytest
import sys
import time
import ray
from ray.cluster_utils import Cluster
from ray.test_utils import flat_errors
import ray.ray_constants as ray_constants
@pytest.fixture(params=[1, 4])
def ray_start_reconstruction(request):
num_nodes = request.param
plasma_store_memory = int(0.5 * 10**9)
cluster = Cluster(
initialize_head=True,
head_node_args={
"num_cpus": 1,
"object_store_memory": plasma_store_memory // num_nodes,
"redis_max_memory": 10**7,
"_internal_config": json.dumps({
"initial_reconstruction_timeout_milliseconds": 200
})
})
for i in range(num_nodes - 1):
cluster.add_node(
num_cpus=1,
object_store_memory=plasma_store_memory // num_nodes,
_internal_config=json.dumps({
"initial_reconstruction_timeout_milliseconds": 200
}))
ray.init(address=cluster.address)
yield plasma_store_memory, num_nodes, cluster
# Clean up the Ray cluster.
ray.shutdown()
cluster.shutdown()
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Failing with new GCS API on Linux.")
def test_simple(ray_start_reconstruction):
plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
# Define the size of one task's return argument so that the combined
# sum of all objects' sizes is at least twice the plasma stores'
# combined allotted memory.
num_objects = 100
size = int(plasma_store_memory * 1.5 / (num_objects * 8))
# Define a remote task with no dependencies, which returns a numpy
# array of the given size.
@ray.remote
def foo(i, size):
array = np.zeros(size)
array[0] = i
return array
# Launch num_objects instances of the remote task.
args = []
for i in range(num_objects):
args.append(foo.remote(i, size))
# Get each value to force each task to finish. After some number of
# gets, old values should be evicted.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get each value again to force reconstruction.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get values sequentially, in chunks.
num_chunks = 4 * num_nodes
chunk = num_objects // num_chunks
for i in range(num_chunks):
values = ray.get(args[i * chunk:(i + 1) * chunk])
del values
assert cluster.remaining_processes_alive()
def sorted_random_indexes(total, output_num):
random_indexes = [np.random.randint(total) for _ in range(output_num)]
random_indexes.sort()
return random_indexes
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Failing with new GCS API on Linux.")
def test_recursive(ray_start_reconstruction):
plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
# Define the size of one task's return argument so that the combined
# sum of all objects' sizes is at least twice the plasma stores'
# combined allotted memory.
num_objects = 100
size = int(plasma_store_memory * 1.5 / (num_objects * 8))
# Define a root task with no dependencies, which returns a numpy array
# of the given size.
@ray.remote
def no_dependency_task(size):
array = np.zeros(size)
return array
# Define a task with a single dependency, which returns its one
# argument.
@ray.remote
def single_dependency(i, arg):
arg = np.copy(arg)
arg[0] = i
return arg
# Launch num_objects instances of the remote task, each dependent on
# the one before it.
arg = no_dependency_task.remote(size)
args = []
for i in range(num_objects):
arg = single_dependency.remote(i, arg)
args.append(arg)
# Get each value to force each task to finish. After some number of
# gets, old values should be evicted.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get each value again to force reconstruction.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get 10 values randomly.
random_indexes = sorted_random_indexes(num_objects, 10)
for i in random_indexes:
value = ray.get(args[i])
assert value[0] == i
# Get values sequentially, in chunks.
num_chunks = 4 * num_nodes
chunk = num_objects // num_chunks
for i in range(num_chunks):
values = ray.get(args[i * chunk:(i + 1) * chunk])
del values
assert cluster.remaining_processes_alive()
@pytest.mark.skip(reason="This test often hangs or fails in CI.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Failing with new GCS API on Linux.")
def test_multiple_recursive(ray_start_reconstruction):
plasma_store_memory, _, cluster = ray_start_reconstruction
# Define the size of one task's return argument so that the combined
# sum of all objects' sizes is at least twice the plasma stores'
# combined allotted memory.
num_objects = 100
size = plasma_store_memory * 2 // (num_objects * 8)
# Define a root task with no dependencies, which returns a numpy array
# of the given size.
@ray.remote
def no_dependency_task(size):
array = np.zeros(size)
return array
# Define a task with multiple dependencies, which returns its first
# argument.
@ray.remote
def multiple_dependency(i, arg1, arg2, arg3):
arg1 = np.copy(arg1)
arg1[0] = i
return arg1
# Launch num_args instances of the root task. Then launch num_objects
# instances of the multi-dependency remote task, each dependent on the
# num_args tasks before it.
num_args = 3
args = []
for i in range(num_args):
arg = no_dependency_task.remote(size)
args.append(arg)
for i in range(num_objects):
args.append(multiple_dependency.remote(i, *args[i:i + num_args]))
# Get each value to force each task to finish. After some number of
# gets, old values should be evicted.
args = args[num_args:]
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get each value again to force reconstruction.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get 10 values randomly.
random_indexes = sorted_random_indexes(num_objects, 10)
for i in random_indexes:
value = ray.get(args[i])
assert value[0] == i
assert cluster.remaining_processes_alive()
def wait_for_errors(error_check):
# Wait for errors from all the nondeterministic tasks.
errors = []
time_left = 100
while time_left > 0:
errors = flat_errors()
if error_check(errors):
break
time_left -= 1
time.sleep(1)
# Make sure that enough errors came through.
assert error_check(errors)
return errors
@pytest.mark.skip("This test does not work yet.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Failing with new GCS API on Linux.")
def test_nondeterministic_task(ray_start_reconstruction):
plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
# Define the size of one task's return argument so that the combined
# sum of all objects' sizes is at least twice the plasma stores'
# combined allotted memory.
num_objects = 1000
size = plasma_store_memory * 2 // (num_objects * 8)
# Define a nondeterministic remote task with no dependencies, which
# returns a random numpy array of the given size. This task should
# produce an error on the driver if it is ever reexecuted.
@ray.remote
def foo(i, size):
array = np.random.rand(size)
array[0] = i
return array
# Define a deterministic remote task with no dependencies, which
# returns a numpy array of zeros of the given size.
@ray.remote
def bar(i, size):
array = np.zeros(size)
array[0] = i
return array
# Launch num_objects instances, half deterministic and half
# nondeterministic.
args = []
for i in range(num_objects):
if i % 2 == 0:
args.append(foo.remote(i, size))
else:
args.append(bar.remote(i, size))
# Get each value to force each task to finish. After some number of
# gets, old values should be evicted.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get each value again to force reconstruction.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
def error_check(errors):
if num_nodes == 1:
# In a single-node setting, each object is evicted and
# reconstructed exactly once, so exactly half the objects will
# produce an error during reconstruction.
min_errors = num_objects // 2
else:
# In a multinode setting, each object is evicted zero or one
# times, so some of the nondeterministic tasks may not be
# reexecuted.
min_errors = 1
return len(errors) >= min_errors
errors = wait_for_errors(error_check)
# Make sure all the errors have the correct type.
assert all(error["type"] == ray_constants.HASH_MISMATCH_PUSH_ERROR
for error in errors)
assert cluster.remaining_processes_alive()
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Failing with new GCS API on Linux.")
@pytest.mark.skipif(
sys.version_info < (3, 0), reason="This test requires Python 3.")
@pytest.mark.parametrize(
"ray_start_object_store_memory", [10**9], indirect=True)
def test_driver_put_errors(ray_start_object_store_memory):
plasma_store_memory = ray_start_object_store_memory
# Define the size of one task's return argument so that the combined
# sum of all objects' sizes is at least twice the plasma stores'
# combined allotted memory.
num_objects = 100
size = plasma_store_memory * 2 // (num_objects * 8)
# Define a task with a single dependency, a numpy array, that returns
# another array.
@ray.remote
def single_dependency(i, arg):
arg = np.copy(arg)
arg[0] = i
return arg
# Launch num_objects instances of the remote task, each dependent on
# the one before it. The first instance of the task takes a numpy array
# as an argument, which is put into the object store.
args = []
arg = single_dependency.remote(0, np.zeros(size))
for i in range(num_objects):
arg = single_dependency.remote(i, arg)
args.append(arg)
# Get each value to force each task to finish. After some number of
# gets, old values should be evicted.
for i in range(num_objects):
value = ray.get(args[i])
assert value[0] == i
# Get each value starting from the beginning to force reconstruction.
# Currently, since we're not able to reconstruct `ray.put` objects that
# were evicted and whose originating tasks are still running, this
# for-loop should hang on its first iteration and push an error to the
# driver.
ray.worker.global_worker.raylet_client.fetch_or_reconstruct([args[0]],
False)
def error_check(errors):
return len(errors) > 1
errors = wait_for_errors(error_check)
assert all(error["type"] == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR
or "ray.exceptions.UnreconstructableError" in error["message"]
for error in errors)
# NOTE(swang): This test tries to launch 1000 workers and breaks.
# TODO(rkn): This test needs to be updated to use pytest.
# class WorkerPoolTests(unittest.TestCase):
#
# def tearDown(self):
# ray.shutdown()
#
# def testBlockingTasks(self):
# @ray.remote
# def f(i, j):
# return (i, j)
#
# @ray.remote
# def g(i):
# # Each instance of g submits and blocks on the result of another remote
# # task.
# object_ids = [f.remote(i, j) for j in range(10)]
# return ray.get(object_ids)
#
# ray.init(num_workers=1)
# ray.get([g.remote(i) for i in range(1000)])
# ray.shutdown()
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+102
View File
@@ -0,0 +1,102 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import os
import pytest
import ray
@pytest.fixture(params=[1, 4])
def ray_start_sharded(request):
num_redis_shards = request.param
if os.environ.get("RAY_USE_NEW_GCS") == "on":
num_redis_shards = 1
# For now, RAY_USE_NEW_GCS supports 1 shard, and credis supports
# 1-node chain for that shard only.
# Start the Ray processes.
ray.init(
object_store_memory=int(0.5 * 10**9),
num_cpus=10,
num_redis_shards=num_redis_shards,
redis_max_memory=10**7)
yield None
# The code after the yield will run as teardown code.
ray.shutdown()
def test_submitting_many_tasks(ray_start_sharded):
@ray.remote
def f(x):
return 1
def g(n):
x = 1
for i in range(n):
x = f.remote(x)
return x
ray.get([g(1000) for _ in range(100)])
assert ray.services.remaining_processes_alive()
def test_submitting_many_actors_to_one(ray_start_sharded):
@ray.remote
class Actor(object):
def __init__(self):
pass
def ping(self):
return
@ray.remote
class Worker(object):
def __init__(self, actor):
self.actor = actor
def ping(self):
return ray.get(self.actor.ping.remote())
a = Actor.remote()
workers = [Worker.remote(a) for _ in range(10)]
for _ in range(10):
out = ray.get([w.ping.remote() for w in workers])
assert out == [None for _ in workers]
def test_getting_and_putting(ray_start_sharded):
for n in range(8):
x = np.zeros(10**n)
for _ in range(100):
ray.put(x)
x_id = ray.put(x)
for _ in range(1000):
ray.get(x_id)
assert ray.services.remaining_processes_alive()
def test_getting_many_objects(ray_start_sharded):
@ray.remote
def f():
return 1
n = 10**4 # TODO(pcm): replace by 10 ** 5 once this is faster.
lst = ray.get([f.remote() for _ in range(n)])
assert lst == n * [1]
assert ray.services.remaining_processes_alive()
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+9 -1
View File
@@ -7,7 +7,7 @@ import shutil
import time
import pytest
import ray
from ray.tests.cluster_utils import Cluster
from ray.cluster_utils import Cluster
def test_conn_cluster():
@@ -150,3 +150,11 @@ def test_session_dir_uniqueness():
session_dirs.add(ray.worker._global_node.get_session_dir_path)
ray.shutdown()
assert len(session_dirs) == 3
if __name__ == "__main__":
import sys
# Make subprocess happy in bazel.
os.environ["LC_ALL"] = "en_US.UTF-8"
os.environ["LANG"] = "en_US.UTF-8"
sys.exit(pytest.main(["-v", __file__]))
+6
View File
@@ -252,3 +252,9 @@ def test_remote_training_loss(ray_start_2_cpus):
after_acc = sess.run(
loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100])))
assert before_acc < after_acc
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
@@ -44,4 +44,6 @@ class TestUnreconstructableErrors(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+6
View File
@@ -33,3 +33,9 @@ def test_get_webui(shutdown_only):
assert node_info["error"] is None
assert node_info["result"] is not None
assert isinstance(node_info["timestamp"], float)
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+38 -2
View File
@@ -3,12 +3,14 @@ py_test(
size = "medium",
srcs = ["tests/test_actor_reuse.py"],
tags = ["jenkins_only"],
deps = [":tune_lib"],
)
py_test(
name = "test_automl_searcher",
size = "small",
srcs = ["tests/test_automl_searcher.py"],
deps = [":tune_lib"],
)
py_test(
@@ -77,19 +79,52 @@ py_test(
deps = [":tune_lib"],
)
py_test(
name = "test_run_experiment",
size = "medium",
srcs = ["tests/test_run_experiment.py"],
deps = [":tune_lib"],
tags = ["exclusive"],
)
py_test(
name = "test_trial_runner",
size = "large",
size = "medium",
srcs = ["tests/test_trial_runner.py"],
deps = [":tune_lib"],
tags = ["exclusive"],
)
py_test(
name = "test_var",
size = "medium",
srcs = ["tests/test_var.py"],
deps = [":tune_lib"],
tags = ["exclusive"],
)
py_test(
name = "test_api",
size = "medium",
srcs = ["tests/test_api.py"],
deps = [":tune_lib"],
tags = ["exclusive"],
)
py_test(
name = "test_sync",
size = "medium",
srcs = ["tests/test_sync.py"],
deps = [":tune_lib"],
tags = ["exclusive"],
)
py_test(
name = "test_trial_scheduler",
size = "medium",
srcs = ["tests/test_trial_scheduler.py"],
deps = [":tune_lib"],
tags = ["exclusive"],
)
py_test(
@@ -113,11 +148,12 @@ py_test(
size = "medium",
srcs = ["tests/test_tune_server.py"],
deps = [":tune_lib"],
tags = ["exclusive"],
)
# This is a dummy test dependency that causes the above tests to be
# re-run if any of these files changes.
py_library(
name="tune_lib",
name = "tune_lib",
srcs = glob(["**/*.py"], exclude=["tests/*.py"]),
)
+3 -1
View File
@@ -96,4 +96,6 @@ class ActorReuseTest(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+797
View File
@@ -0,0 +1,797 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import os
import time
import unittest
import ray
from ray.rllib import _register_all
from ray import tune
from ray.tune import Trainable, TuneError
from ray.tune import register_env, register_trainable, run_experiments
from ray.tune.schedulers import TrialScheduler, FIFOScheduler
from ray.tune.trial import Trial
from ray.tune.result import (TIMESTEPS_TOTAL, DONE, HOSTNAME, NODE_IP, PID,
EPISODES_TOTAL, TRAINING_ITERATION,
TIMESTEPS_THIS_ITER, TIME_THIS_ITER_S,
TIME_TOTAL_S, TRIAL_ID, EXPERIMENT_TAG)
from ray.tune.logger import Logger
from ray.tune.util import pin_in_object_store, get_pinned_object, flatten_dict
from ray.tune.experiment import Experiment
from ray.tune.resources import Resources
from ray.tune.suggest import grid_search
from ray.tune.suggest.suggestion import _MockSuggestionAlgorithm
class TrainableFunctionApiTest(unittest.TestCase):
def setUp(self):
ray.init(num_cpus=4, num_gpus=0, object_store_memory=150 * 1024 * 1024)
def tearDown(self):
ray.shutdown()
_register_all() # re-register the evicted objects
def checkAndReturnConsistentLogs(self, results, sleep_per_iter=None):
"""Checks logging is the same between APIs.
Ignore "DONE" for logging but checks that the
scheduler is notified properly with the last result.
"""
class_results = copy.deepcopy(results)
function_results = copy.deepcopy(results)
class_output = []
function_output = []
scheduler_notif = []
class MockScheduler(FIFOScheduler):
def on_trial_complete(self, runner, trial, result):
scheduler_notif.append(result)
class ClassAPILogger(Logger):
def on_result(self, result):
class_output.append(result)
class FunctionAPILogger(Logger):
def on_result(self, result):
function_output.append(result)
class _WrappedTrainable(Trainable):
def _setup(self, config):
del config
self._result_iter = copy.deepcopy(class_results)
def _train(self):
if sleep_per_iter:
time.sleep(sleep_per_iter)
res = self._result_iter.pop(0) # This should not fail
if not self._result_iter: # Mark "Done" for last result
res[DONE] = True
return res
def _function_trainable(config, reporter):
for result in function_results:
if sleep_per_iter:
time.sleep(sleep_per_iter)
reporter(**result)
class_trainable_name = "class_trainable"
register_trainable(class_trainable_name, _WrappedTrainable)
trials = run_experiments(
{
"function_api": {
"run": _function_trainable,
"loggers": [FunctionAPILogger],
},
"class_api": {
"run": class_trainable_name,
"loggers": [ClassAPILogger],
},
},
raise_on_failed_trial=False,
scheduler=MockScheduler())
# Ignore these fields
NO_COMPARE_FIELDS = {
HOSTNAME,
NODE_IP,
TRIAL_ID,
EXPERIMENT_TAG,
PID,
TIME_THIS_ITER_S,
TIME_TOTAL_S,
DONE, # This is ignored because FunctionAPI has different handling
"timestamp",
"time_since_restore",
"experiment_id",
"date",
}
self.assertEqual(len(class_output), len(results))
self.assertEqual(len(function_output), len(results))
def as_comparable_result(result):
return {
k: v
for k, v in result.items() if k not in NO_COMPARE_FIELDS
}
function_comparable = [
as_comparable_result(result) for result in function_output
]
class_comparable = [
as_comparable_result(result) for result in class_output
]
self.assertEqual(function_comparable, class_comparable)
self.assertEqual(sum(t.get(DONE) for t in scheduler_notif), 2)
self.assertEqual(
as_comparable_result(scheduler_notif[0]),
as_comparable_result(scheduler_notif[1]))
# Make sure the last result is the same.
self.assertEqual(
as_comparable_result(trials[0].last_result),
as_comparable_result(trials[1].last_result))
return function_output, trials
def testPinObject(self):
X = pin_in_object_store("hello")
@ray.remote
def f():
return get_pinned_object(X)
self.assertEqual(ray.get(f.remote()), "hello")
def testFetchPinned(self):
X = pin_in_object_store("hello")
def train(config, reporter):
get_pinned_object(X)
reporter(timesteps_total=100, done=True)
register_trainable("f1", train)
[trial] = run_experiments({
"foo": {
"run": "f1",
}
})
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 100)
def testRegisterEnv(self):
register_env("foo", lambda: None)
self.assertRaises(TypeError, lambda: register_env("foo", 2))
def testRegisterEnvOverwrite(self):
def train(config, reporter):
reporter(timesteps_total=100, done=True)
def train2(config, reporter):
reporter(timesteps_total=200, done=True)
register_trainable("f1", train)
register_trainable("f1", train2)
[trial] = run_experiments({
"foo": {
"run": "f1",
}
})
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 200)
def testRegisterTrainable(self):
def train(config, reporter):
pass
class A(object):
pass
class B(Trainable):
pass
register_trainable("foo", train)
Experiment("test", train)
register_trainable("foo", B)
Experiment("test", B)
self.assertRaises(TypeError, lambda: register_trainable("foo", B()))
self.assertRaises(TuneError, lambda: Experiment("foo", B()))
self.assertRaises(TypeError, lambda: register_trainable("foo", A))
self.assertRaises(TypeError, lambda: Experiment("foo", A))
def testTrainableCallable(self):
def dummy_fn(config, reporter, steps):
reporter(timesteps_total=steps, done=True)
from functools import partial
steps = 500
register_trainable("test", partial(dummy_fn, steps=steps))
[trial] = run_experiments({
"foo": {
"run": "test",
}
})
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], steps)
[trial] = tune.run(partial(dummy_fn, steps=steps)).trials
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], steps)
def testBuiltInTrainableResources(self):
class B(Trainable):
@classmethod
def default_resource_request(cls, config):
return Resources(cpu=config["cpu"], gpu=config["gpu"])
def _train(self):
return {"timesteps_this_iter": 1, "done": True}
register_trainable("B", B)
def f(cpus, gpus, queue_trials):
return run_experiments(
{
"foo": {
"run": "B",
"config": {
"cpu": cpus,
"gpu": gpus,
},
}
},
queue_trials=queue_trials)[0]
# Should all succeed
self.assertEqual(f(0, 0, False).status, Trial.TERMINATED)
self.assertEqual(f(1, 0, True).status, Trial.TERMINATED)
self.assertEqual(f(1, 0, True).status, Trial.TERMINATED)
# Too large resource request
self.assertRaises(TuneError, lambda: f(100, 100, False))
self.assertRaises(TuneError, lambda: f(0, 100, False))
self.assertRaises(TuneError, lambda: f(100, 0, False))
# TODO(ekl) how can we test this is queued (hangs)?
# f(100, 0, True)
def testRewriteEnv(self):
def train(config, reporter):
reporter(timesteps_total=1)
register_trainable("f1", train)
[trial] = run_experiments({
"foo": {
"run": "f1",
"env": "CartPole-v0",
}
})
self.assertEqual(trial.config["env"], "CartPole-v0")
def testConfigPurity(self):
def train(config, reporter):
assert config == {"a": "b"}, config
reporter(timesteps_total=1)
register_trainable("f1", train)
run_experiments({
"foo": {
"run": "f1",
"config": {
"a": "b"
},
}
})
def testLogdir(self):
def train(config, reporter):
assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd()
reporter(timesteps_total=1)
register_trainable("f1", train)
run_experiments({
"foo": {
"run": "f1",
"local_dir": "/tmp/logdir",
"config": {
"a": "b"
},
}
})
def testLogdirStartingWithTilde(self):
local_dir = "~/ray_results/local_dir"
def train(config, reporter):
cwd = os.getcwd()
assert cwd.startswith(os.path.expanduser(local_dir)), cwd
assert not cwd.startswith("~"), cwd
reporter(timesteps_total=1)
register_trainable("f1", train)
run_experiments({
"foo": {
"run": "f1",
"local_dir": local_dir,
"config": {
"a": "b"
},
}
})
def testLongFilename(self):
def train(config, reporter):
assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd()
reporter(timesteps_total=1)
register_trainable("f1", train)
run_experiments({
"foo": {
"run": "f1",
"local_dir": "/tmp/logdir",
"config": {
"a" * 50: tune.sample_from(lambda spec: 5.0 / 7),
"b" * 50: tune.sample_from(lambda spec: "long" * 40),
},
}
})
def testBadParams(self):
def f():
run_experiments({"foo": {}})
self.assertRaises(TuneError, f)
def testBadParams2(self):
def f():
run_experiments({
"foo": {
"run": "asdf",
"bah": "this param is not allowed",
}
})
self.assertRaises(TuneError, f)
def testBadParams3(self):
def f():
run_experiments({
"foo": {
"run": grid_search("invalid grid search"),
}
})
self.assertRaises(TuneError, f)
def testBadParams4(self):
def f():
run_experiments({
"foo": {
"run": "asdf",
}
})
self.assertRaises(TuneError, f)
def testBadParams5(self):
def f():
run_experiments({"foo": {"run": "PPO", "stop": {"asdf": 1}}})
self.assertRaises(TuneError, f)
def testBadParams6(self):
def f():
run_experiments({
"foo": {
"run": "PPO",
"resources_per_trial": {
"asdf": 1
}
}
})
self.assertRaises(TuneError, f)
def testBadStoppingReturn(self):
def train(config, reporter):
reporter()
register_trainable("f1", train)
def f():
run_experiments({
"foo": {
"run": "f1",
"stop": {
"time": 10
},
}
})
self.assertRaises(TuneError, f)
def testNestedStoppingReturn(self):
def train(config, reporter):
for i in range(10):
reporter(test={"test1": {"test2": i}})
with self.assertRaises(TuneError):
[trial] = tune.run(
train, stop={
"test": {
"test1": {
"test2": 6
}
}
}).trials
[trial] = tune.run(train, stop={"test/test1/test2": 6}).trials
self.assertEqual(trial.last_result["training_iteration"], 7)
def testStoppingFunction(self):
def train(config, reporter):
for i in range(10):
reporter(test=i)
def stop(trial_id, result):
return result["test"] > 6
[trial] = tune.run(train, stop=stop).trials
self.assertEqual(trial.last_result["training_iteration"], 8)
def testStoppingMemberFunction(self):
def train(config, reporter):
for i in range(10):
reporter(test=i)
class Stopper:
def stop(self, trial_id, result):
return result["test"] > 6
[trial] = tune.run(train, stop=Stopper().stop).trials
self.assertEqual(trial.last_result["training_iteration"], 8)
def testBadStoppingFunction(self):
def train(config, reporter):
for i in range(10):
reporter(test=i)
class Stopper:
def stop(self, result):
return result["test"] > 6
def stop(result):
return result["test"] > 6
with self.assertRaises(ValueError):
tune.run(train, stop=Stopper().stop)
with self.assertRaises(ValueError):
tune.run(train, stop=stop)
def testEarlyReturn(self):
def train(config, reporter):
reporter(timesteps_total=100, done=True)
time.sleep(99999)
register_trainable("f1", train)
[trial] = run_experiments({
"foo": {
"run": "f1",
}
})
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 100)
def testReporterNoUsage(self):
def run_task(config, reporter):
print("hello")
experiment = Experiment(run=run_task, name="ray_crash_repro")
[trial] = ray.tune.run(experiment).trials
print(trial.last_result)
self.assertEqual(trial.last_result[DONE], True)
def testErrorReturn(self):
def train(config, reporter):
raise Exception("uh oh")
register_trainable("f1", train)
def f():
run_experiments({
"foo": {
"run": "f1",
}
})
self.assertRaises(TuneError, f)
def testSuccess(self):
def train(config, reporter):
for i in range(100):
reporter(timesteps_total=i)
register_trainable("f1", train)
[trial] = run_experiments({
"foo": {
"run": "f1",
}
})
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99)
def testNoRaiseFlag(self):
def train(config, reporter):
raise Exception()
register_trainable("f1", train)
[trial] = run_experiments(
{
"foo": {
"run": "f1",
}
}, raise_on_failed_trial=False)
self.assertEqual(trial.status, Trial.ERROR)
def testReportInfinity(self):
def train(config, reporter):
for i in range(100):
reporter(mean_accuracy=float("inf"))
register_trainable("f1", train)
[trial] = run_experiments({
"foo": {
"run": "f1",
}
})
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertEqual(trial.last_result["mean_accuracy"], float("inf"))
def testNestedResults(self):
def create_result(i):
return {"test": {"1": {"2": {"3": i, "4": False}}}}
flattened_keys = list(flatten_dict(create_result(0)))
class _MockScheduler(FIFOScheduler):
results = []
def on_trial_result(self, trial_runner, trial, result):
self.results += [result]
return TrialScheduler.CONTINUE
def on_trial_complete(self, trial_runner, trial, result):
self.complete_result = result
def train(config, reporter):
for i in range(100):
reporter(**create_result(i))
algo = _MockSuggestionAlgorithm()
scheduler = _MockScheduler()
[trial] = tune.run(
train,
scheduler=scheduler,
search_alg=algo,
stop={
"test/1/2/3": 20
}).trials
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertEqual(trial.last_result["test"]["1"]["2"]["3"], 20)
self.assertEqual(trial.last_result["test"]["1"]["2"]["4"], False)
self.assertEqual(trial.last_result[TRAINING_ITERATION], 21)
self.assertEqual(len(scheduler.results), 20)
self.assertTrue(
all(
set(result) >= set(flattened_keys)
for result in scheduler.results))
self.assertTrue(set(scheduler.complete_result) >= set(flattened_keys))
self.assertEqual(len(algo.results), 20)
self.assertTrue(
all(set(result) >= set(flattened_keys) for result in algo.results))
with self.assertRaises(TuneError):
[trial] = tune.run(train, stop={"1/2/3": 20})
with self.assertRaises(TuneError):
[trial] = tune.run(train, stop={"test": 1}).trials
def testReportTimeStep(self):
# Test that no timestep count are logged if never the Trainable never
# returns any.
results1 = [dict(mean_accuracy=5, done=i == 99) for i in range(100)]
logs1, _ = self.checkAndReturnConsistentLogs(results1)
self.assertTrue(all(log[TIMESTEPS_TOTAL] is None for log in logs1))
# Test that no timesteps_this_iter are logged if only timesteps_total
# are returned.
results2 = [dict(timesteps_total=5, done=i == 9) for i in range(10)]
logs2, _ = self.checkAndReturnConsistentLogs(results2)
# Re-run the same trials but with added delay. This is to catch some
# inconsistent timestep counting that was present in the multi-threaded
# FunctionRunner. This part of the test can be removed once the
# multi-threaded FunctionRunner is removed from ray/tune.
# TODO: remove once the multi-threaded function runner is gone.
logs2, _ = self.checkAndReturnConsistentLogs(results2, 0.5)
# check all timesteps_total report the same value
self.assertTrue(all(log[TIMESTEPS_TOTAL] == 5 for log in logs2))
# check that none of the logs report timesteps_this_iter
self.assertFalse(
any(hasattr(log, TIMESTEPS_THIS_ITER) for log in logs2))
# Test that timesteps_total and episodes_total are reported when
# timesteps_this_iter and episodes_this_iter despite only return zeros.
results3 = [
dict(timesteps_this_iter=0, episodes_this_iter=0)
for i in range(10)
]
logs3, _ = self.checkAndReturnConsistentLogs(results3)
self.assertTrue(all(log[TIMESTEPS_TOTAL] == 0 for log in logs3))
self.assertTrue(all(log[EPISODES_TOTAL] == 0 for log in logs3))
# Test that timesteps_total and episodes_total are properly counted
# when timesteps_this_iter and episodes_this_iter report non-zero
# values.
results4 = [
dict(timesteps_this_iter=3, episodes_this_iter=i)
for i in range(10)
]
logs4, _ = self.checkAndReturnConsistentLogs(results4)
# The last reported result should not be double-logged.
self.assertEqual(logs4[-1][TIMESTEPS_TOTAL], 30)
self.assertNotEqual(logs4[-2][TIMESTEPS_TOTAL],
logs4[-1][TIMESTEPS_TOTAL])
self.assertEqual(logs4[-1][EPISODES_TOTAL], 45)
self.assertNotEqual(logs4[-2][EPISODES_TOTAL],
logs4[-1][EPISODES_TOTAL])
def testAllValuesReceived(self):
results1 = [
dict(timesteps_total=(i + 1), my_score=i**2, done=i == 4)
for i in range(5)
]
logs1, _ = self.checkAndReturnConsistentLogs(results1)
# check if the correct number of results were reported
self.assertEqual(len(logs1), len(results1))
def check_no_missing(reported_result, result):
common_results = [reported_result[k] == result[k] for k in result]
return all(common_results)
# check that no result was dropped or modified
complete_results = [
check_no_missing(log, result)
for log, result in zip(logs1, results1)
]
self.assertTrue(all(complete_results))
# check if done was logged exactly once
self.assertEqual(len([r for r in logs1 if r.get("done")]), 1)
def testNoDoneReceived(self):
# repeat same test but without explicitly reporting done=True
results1 = [
dict(timesteps_total=(i + 1), my_score=i**2) for i in range(5)
]
logs1, trials = self.checkAndReturnConsistentLogs(results1)
# check if the correct number of results were reported.
self.assertEqual(len(logs1), len(results1))
def check_no_missing(reported_result, result):
common_results = [reported_result[k] == result[k] for k in result]
return all(common_results)
# check that no result was dropped or modified
complete_results1 = [
check_no_missing(log, result)
for log, result in zip(logs1, results1)
]
self.assertTrue(all(complete_results1))
def testCheckpointDict(self):
class TestTrain(Trainable):
def _setup(self, config):
self.state = {"hi": 1}
def _train(self):
return {"timesteps_this_iter": 1, "done": True}
def _save(self, path):
return self.state
def _restore(self, state):
self.state = state
test_trainable = TestTrain()
result = test_trainable.save()
test_trainable.state["hi"] = 2
test_trainable.restore(result)
self.assertEqual(test_trainable.state["hi"], 1)
trials = run_experiments({
"foo": {
"run": TestTrain,
"checkpoint_at_end": True
}
})
for trial in trials:
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertTrue(trial.has_checkpoint())
def testMultipleCheckpoints(self):
class TestTrain(Trainable):
def _setup(self, config):
self.state = {"hi": 1, "iter": 0}
def _train(self):
self.state["iter"] += 1
return {"timesteps_this_iter": 1, "done": True}
def _save(self, path):
return self.state
def _restore(self, state):
self.state = state
test_trainable = TestTrain()
checkpoint_1 = test_trainable.save()
test_trainable.train()
checkpoint_2 = test_trainable.save()
self.assertNotEqual(checkpoint_1, checkpoint_2)
test_trainable.restore(checkpoint_2)
self.assertEqual(test_trainable.state["iter"], 1)
test_trainable.restore(checkpoint_1)
self.assertEqual(test_trainable.state["iter"], 0)
trials = run_experiments({
"foo": {
"run": TestTrain,
"checkpoint_at_end": True
}
})
for trial in trials:
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertTrue(trial.has_checkpoint())
def testIterationCounter(self):
def train(config, reporter):
for i in range(100):
reporter(itr=i, timesteps_this_iter=1)
register_trainable("exp", train)
config = {
"my_exp": {
"run": "exp",
"config": {
"iterations": 100,
},
"stop": {
"timesteps_total": 100
},
}
}
[trial] = run_experiments(config)
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertEqual(trial.last_result[TRAINING_ITERATION], 100)
self.assertEqual(trial.last_result["itr"], 99)
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
@@ -70,4 +70,6 @@ class AutoMLSearcherTest(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
@@ -109,4 +109,4 @@ class CheckpointManagerTest(unittest.TestCase):
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", "-s", __file__]))
sys.exit(pytest.main(["-v", __file__]))
+3 -3
View File
@@ -13,8 +13,8 @@ import sys
import ray
from ray import tune
from ray.rllib import _register_all
from ray.tests.cluster_utils import Cluster
from ray.tests.utils import run_string_as_driver_nonblocking
from ray.cluster_utils import Cluster
from ray.test_utils import run_string_as_driver_nonblocking
from ray.tune.error import TuneError
from ray.tune.ray_trial_executor import RayTrialExecutor
from ray.tune.experiment import Experiment
@@ -598,4 +598,4 @@ tune.run(
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", "-s", __file__]))
sys.exit(pytest.main(["-v", __file__]))
+3 -1
View File
@@ -62,4 +62,6 @@ class ExperimentTest(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
@@ -204,4 +204,6 @@ class AnalysisSuite(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+3 -1
View File
@@ -64,4 +64,6 @@ class LoggerSuite(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
@@ -14,7 +14,7 @@ from ray.tune.registry import _global_registry, TRAINABLE_CLASS
from ray.tune.suggest import BasicVariantGenerator
from ray.tune.trial import Trial, Checkpoint
from ray.tune.resources import Resources
from ray.tests.cluster_utils import Cluster
from ray.cluster_utils import Cluster
class RayTrialExecutorTest(unittest.TestCase):
@@ -190,4 +190,6 @@ class LocalModeExecutorTest(RayTrialExecutorTest):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
@@ -0,0 +1,241 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import unittest
import ray
from ray.rllib import _register_all
from ray.tune.result import TIMESTEPS_TOTAL
from ray.tune import Trainable, TuneError
from ray.tune import register_trainable, run_experiments
from ray.tune.logger import Logger
from ray.tune.experiment import Experiment
from ray.tune.trial import Trial, ExportFormat
class RunExperimentTest(unittest.TestCase):
def tearDown(self):
ray.shutdown()
_register_all() # re-register the evicted objects
def testDict(self):
def train(config, reporter):
for i in range(100):
reporter(timesteps_total=i)
register_trainable("f1", train)
trials = run_experiments({
"foo": {
"run": "f1",
},
"bar": {
"run": "f1",
}
})
for trial in trials:
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99)
def testExperiment(self):
def train(config, reporter):
for i in range(100):
reporter(timesteps_total=i)
register_trainable("f1", train)
exp1 = Experiment(**{
"name": "foo",
"run": "f1",
})
[trial] = run_experiments(exp1)
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99)
def testExperimentList(self):
def train(config, reporter):
for i in range(100):
reporter(timesteps_total=i)
register_trainable("f1", train)
exp1 = Experiment(**{
"name": "foo",
"run": "f1",
})
exp2 = Experiment(**{
"name": "bar",
"run": "f1",
})
trials = run_experiments([exp1, exp2])
for trial in trials:
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99)
def testAutoregisterTrainable(self):
def train(config, reporter):
for i in range(100):
reporter(timesteps_total=i)
class B(Trainable):
def _train(self):
return {"timesteps_this_iter": 1, "done": True}
register_trainable("f1", train)
trials = run_experiments({
"foo": {
"run": train,
},
"bar": {
"run": B
}
})
for trial in trials:
self.assertEqual(trial.status, Trial.TERMINATED)
def testCheckpointAtEnd(self):
class train(Trainable):
def _train(self):
return {"timesteps_this_iter": 1, "done": True}
def _save(self, path):
checkpoint = path + "/checkpoint"
with open(checkpoint, "w") as f:
f.write("OK")
return checkpoint
trials = run_experiments({
"foo": {
"run": train,
"checkpoint_at_end": True
}
})
for trial in trials:
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertTrue(trial.has_checkpoint())
def testExportFormats(self):
class train(Trainable):
def _train(self):
return {"timesteps_this_iter": 1, "done": True}
def _export_model(self, export_formats, export_dir):
path = export_dir + "/exported"
with open(path, "w") as f:
f.write("OK")
return {export_formats[0]: path}
trials = run_experiments({
"foo": {
"run": train,
"export_formats": ["format"]
}
})
for trial in trials:
self.assertEqual(trial.status, Trial.TERMINATED)
self.assertTrue(
os.path.exists(os.path.join(trial.logdir, "exported")))
def testInvalidExportFormats(self):
class train(Trainable):
def _train(self):
return {"timesteps_this_iter": 1, "done": True}
def _export_model(self, export_formats, export_dir):
ExportFormat.validate(export_formats)
return {}
def fail_trial():
run_experiments({
"foo": {
"run": train,
"export_formats": ["format"]
}
})
self.assertRaises(TuneError, fail_trial)
def testCustomResources(self):
ray.shutdown()
ray.init(resources={"hi": 3})
class train(Trainable):
def _train(self):
return {"timesteps_this_iter": 1, "done": True}
trials = run_experiments({
"foo": {
"run": train,
"resources_per_trial": {
"cpu": 1,
"custom_resources": {
"hi": 2
}
}
}
})
for trial in trials:
self.assertEqual(trial.status, Trial.TERMINATED)
def testCustomLogger(self):
class CustomLogger(Logger):
def on_result(self, result):
with open(os.path.join(self.logdir, "test.log"), "w") as f:
f.write("hi")
[trial] = run_experiments({
"foo": {
"run": "__fake",
"stop": {
"training_iteration": 1
},
"loggers": [CustomLogger]
}
})
self.assertTrue(os.path.exists(os.path.join(trial.logdir, "test.log")))
self.assertFalse(
os.path.exists(os.path.join(trial.logdir, "params.json")))
[trial] = run_experiments({
"foo": {
"run": "__fake",
"stop": {
"training_iteration": 1
}
}
})
self.assertTrue(
os.path.exists(os.path.join(trial.logdir, "params.json")))
[trial] = run_experiments({
"foo": {
"run": "__fake",
"stop": {
"training_iteration": 1
},
"loggers": []
}
})
self.assertFalse(
os.path.exists(os.path.join(trial.logdir, "params.json")))
def testCustomTrialString(self):
[trial] = run_experiments({
"foo": {
"run": "__fake",
"stop": {
"training_iteration": 1
},
"trial_name_creator":
lambda t: "{}_{}_321".format(t.trainable_name, t.trial_id)
}
})
self.assertEquals(
str(trial), "{}_{}_321".format(trial.trainable_name,
trial.trial_id))
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+218
View File
@@ -0,0 +1,218 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import glob
import os
import shutil
import sys
import tempfile
import unittest
import ray
from ray.rllib import _register_all
from ray import tune
from ray.tune import TuneError
from ray.tune.syncer import CommandBasedClient
if sys.version_info >= (3, 3):
from unittest.mock import patch
else:
from mock import patch
class TestSyncFunctionality(unittest.TestCase):
def setUp(self):
ray.init()
def tearDown(self):
ray.shutdown()
_register_all() # re-register the evicted objects
@patch("ray.tune.syncer.S3_PREFIX", "test")
def testNoUploadDir(self):
"""No Upload Dir is given."""
with self.assertRaises(AssertionError):
[trial] = tune.run(
"__fake",
name="foo",
max_failures=0,
**{
"stop": {
"training_iteration": 1
},
"sync_to_cloud": "echo {source} {target}"
}).trials
@patch("ray.tune.syncer.S3_PREFIX", "test")
def testCloudProperString(self):
with self.assertRaises(ValueError):
[trial] = tune.run(
"__fake",
name="foo",
max_failures=0,
**{
"stop": {
"training_iteration": 1
},
"upload_dir": "test",
"sync_to_cloud": "ls {target}"
}).trials
with self.assertRaises(ValueError):
[trial] = tune.run(
"__fake",
name="foo",
max_failures=0,
**{
"stop": {
"training_iteration": 1
},
"upload_dir": "test",
"sync_to_cloud": "ls {source}"
}).trials
tmpdir = tempfile.mkdtemp()
logfile = os.path.join(tmpdir, "test.log")
[trial] = tune.run(
"__fake",
name="foo",
max_failures=0,
**{
"stop": {
"training_iteration": 1
},
"upload_dir": "test",
"sync_to_cloud": "echo {source} {target} > " + logfile
}).trials
with open(logfile) as f:
lines = f.read()
self.assertTrue("test" in lines)
shutil.rmtree(tmpdir)
def testClusterProperString(self):
"""Tests that invalid commands throw.."""
with self.assertRaises(TuneError):
# This raises TuneError because logger is init in safe zone.
[trial] = tune.run(
"__fake",
name="foo",
max_failures=0,
**{
"stop": {
"training_iteration": 1
},
"sync_to_driver": "ls {target}"
}).trials
with self.assertRaises(TuneError):
# This raises TuneError because logger is init in safe zone.
[trial] = tune.run(
"__fake",
name="foo",
max_failures=0,
**{
"stop": {
"training_iteration": 1
},
"sync_to_driver": "ls {source}"
}).trials
with patch.object(CommandBasedClient, "execute") as mock_fn:
with patch("ray.services.get_node_ip_address") as mock_sync:
mock_sync.return_value = "0.0.0.0"
[trial] = tune.run(
"__fake",
name="foo",
max_failures=0,
**{
"stop": {
"training_iteration": 1
},
"sync_to_driver": "echo {source} {target}"
}).trials
self.assertGreater(mock_fn.call_count, 0)
def testCloudFunctions(self):
tmpdir = tempfile.mkdtemp()
tmpdir2 = tempfile.mkdtemp()
os.mkdir(os.path.join(tmpdir2, "foo"))
def sync_func(local, remote):
for filename in glob.glob(os.path.join(local, "*.json")):
shutil.copy(filename, remote)
[trial] = tune.run(
"__fake",
name="foo",
max_failures=0,
local_dir=tmpdir,
stop={
"training_iteration": 1
},
upload_dir=tmpdir2,
sync_to_cloud=sync_func).trials
test_file_path = glob.glob(os.path.join(tmpdir2, "foo", "*.json"))
self.assertTrue(test_file_path)
shutil.rmtree(tmpdir)
shutil.rmtree(tmpdir2)
def testClusterSyncFunction(self):
def sync_func_driver(source, target):
assert ":" in source, "Source {} not a remote path.".format(source)
assert ":" not in target, "Target is supposed to be local."
with open(os.path.join(target, "test.log2"), "w") as f:
print("writing to", f.name)
f.write(source)
[trial] = tune.run(
"__fake",
name="foo",
max_failures=0,
stop={
"training_iteration": 1
},
sync_to_driver=sync_func_driver).trials
test_file_path = os.path.join(trial.logdir, "test.log2")
self.assertFalse(os.path.exists(test_file_path))
with patch("ray.services.get_node_ip_address") as mock_sync:
mock_sync.return_value = "0.0.0.0"
[trial] = tune.run(
"__fake",
name="foo",
max_failures=0,
stop={
"training_iteration": 1
},
sync_to_driver=sync_func_driver).trials
test_file_path = os.path.join(trial.logdir, "test.log2")
self.assertTrue(os.path.exists(test_file_path))
os.remove(test_file_path)
def testNoSync(self):
"""Sync should not run on a single node."""
def sync_func(source, target):
pass
with patch.object(CommandBasedClient, "execute") as mock_sync:
[trial] = tune.run(
"__fake",
name="foo",
max_failures=0,
**{
"stop": {
"training_iteration": 1
},
"sync_to_driver": sync_func
}).trials
self.assertEqual(mock_sync.call_count, 0)
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+3 -1
View File
@@ -85,4 +85,6 @@ class TrackApiTest(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
File diff suppressed because it is too large Load Diff
@@ -1194,4 +1194,6 @@ class AsyncHyperBandSuite(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+4 -2
View File
@@ -13,7 +13,7 @@ import numpy as np
import ray
from ray import tune
from ray.tests.utils import recursive_fnmatch
from ray.test_utils import recursive_fnmatch
from ray.tune.util import validate_save_restore
from ray.rllib import _register_all
from ray.tune.suggest.hyperopt import HyperOptSearch
@@ -277,4 +277,6 @@ class SigOptWarmStartTest(AbstractWarmStartTest, unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
@@ -151,4 +151,6 @@ class SerialTuneRelativeLocalDirTest(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+3 -1
View File
@@ -144,4 +144,6 @@ class TuneServerSuite(unittest.TestCase):
if __name__ == "__main__":
unittest.main(verbosity=2)
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
+319
View File
@@ -0,0 +1,319 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import unittest
import ray
from ray.rllib import _register_all
from ray import tune
from ray.tune.result import DEFAULT_RESULTS_DIR
from ray.tune.experiment import Experiment
from ray.tune.suggest import grid_search, BasicVariantGenerator
from ray.tune.suggest.suggestion import _MockSuggestionAlgorithm
from ray.tune.suggest.variant_generator import (RecursiveDependencyError,
resolve_nested_dict)
class VariantGeneratorTest(unittest.TestCase):
def setUp(self):
ray.init()
def tearDown(self):
ray.shutdown()
_register_all() # re-register the evicted objects
def generate_trials(self, spec, name):
suggester = BasicVariantGenerator()
suggester.add_configurations({name: spec})
return suggester.next_trials()
def testParseToTrials(self):
trials = self.generate_trials({
"run": "PPO",
"num_samples": 2,
"max_failures": 5,
"config": {
"env": "Pong-v0",
"foo": "bar"
},
}, "tune-pong")
trials = list(trials)
self.assertEqual(len(trials), 2)
self.assertTrue("PPO_Pong-v0" in str(trials[0]))
self.assertEqual(trials[0].config, {"foo": "bar", "env": "Pong-v0"})
self.assertEqual(trials[0].trainable_name, "PPO")
self.assertEqual(trials[0].experiment_tag, "0")
self.assertEqual(trials[0].max_failures, 5)
self.assertEqual(trials[0].evaluated_params, {})
self.assertEqual(trials[0].local_dir,
os.path.join(DEFAULT_RESULTS_DIR, "tune-pong"))
self.assertEqual(trials[1].experiment_tag, "1")
def testEval(self):
trials = self.generate_trials({
"run": "PPO",
"config": {
"foo": {
"eval": "2 + 2"
},
},
}, "eval")
trials = list(trials)
self.assertEqual(len(trials), 1)
self.assertEqual(trials[0].config, {"foo": 4})
self.assertEqual(trials[0].evaluated_params, {"foo": 4})
self.assertEqual(trials[0].experiment_tag, "0_foo=4")
def testGridSearch(self):
trials = self.generate_trials({
"run": "PPO",
"config": {
"bar": {
"grid_search": [True, False]
},
"foo": {
"grid_search": [1, 2, 3]
},
"baz": "asd",
},
}, "grid_search")
trials = list(trials)
self.assertEqual(len(trials), 6)
self.assertEqual(trials[0].config, {
"bar": True,
"foo": 1,
"baz": "asd",
})
self.assertEqual(trials[0].evaluated_params, {
"bar": True,
"foo": 1,
})
self.assertEqual(trials[0].experiment_tag, "0_bar=True,foo=1")
self.assertEqual(trials[1].config, {
"bar": False,
"foo": 1,
"baz": "asd",
})
self.assertEqual(trials[1].evaluated_params, {
"bar": False,
"foo": 1,
})
self.assertEqual(trials[1].experiment_tag, "1_bar=False,foo=1")
self.assertEqual(trials[2].config, {
"bar": True,
"foo": 2,
"baz": "asd",
})
self.assertEqual(trials[2].evaluated_params, {
"bar": True,
"foo": 2,
})
self.assertEqual(trials[3].config, {
"bar": False,
"foo": 2,
"baz": "asd",
})
self.assertEqual(trials[3].evaluated_params, {
"bar": False,
"foo": 2,
})
self.assertEqual(trials[4].config, {
"bar": True,
"foo": 3,
"baz": "asd",
})
self.assertEqual(trials[4].evaluated_params, {
"bar": True,
"foo": 3,
})
self.assertEqual(trials[5].config, {
"bar": False,
"foo": 3,
"baz": "asd",
})
self.assertEqual(trials[5].evaluated_params, {
"bar": False,
"foo": 3,
})
def testGridSearchAndEval(self):
trials = self.generate_trials({
"run": "PPO",
"config": {
"qux": tune.sample_from(lambda spec: 2 + 2),
"bar": grid_search([True, False]),
"foo": grid_search([1, 2, 3]),
"baz": "asd",
},
}, "grid_eval")
trials = list(trials)
self.assertEqual(len(trials), 6)
self.assertEqual(trials[0].config, {
"bar": True,
"foo": 1,
"qux": 4,
"baz": "asd",
})
self.assertEqual(trials[0].evaluated_params, {
"bar": True,
"foo": 1,
"qux": 4,
})
self.assertEqual(trials[0].experiment_tag, "0_bar=True,foo=1,qux=4")
def testConditionResolution(self):
trials = self.generate_trials({
"run": "PPO",
"config": {
"x": 1,
"y": tune.sample_from(lambda spec: spec.config.x + 1),
"z": tune.sample_from(lambda spec: spec.config.y + 1),
},
}, "condition_resolution")
trials = list(trials)
self.assertEqual(len(trials), 1)
self.assertEqual(trials[0].config, {"x": 1, "y": 2, "z": 3})
self.assertEqual(trials[0].evaluated_params, {"y": 2, "z": 3})
self.assertEqual(trials[0].experiment_tag, "0_y=2,z=3")
def testDependentLambda(self):
trials = self.generate_trials({
"run": "PPO",
"config": {
"x": grid_search([1, 2]),
"y": tune.sample_from(lambda spec: spec.config.x * 100),
},
}, "dependent_lambda")
trials = list(trials)
self.assertEqual(len(trials), 2)
self.assertEqual(trials[0].config, {"x": 1, "y": 100})
self.assertEqual(trials[1].config, {"x": 2, "y": 200})
def testDependentGridSearch(self):
trials = self.generate_trials({
"run": "PPO",
"config": {
"x": grid_search([
tune.sample_from(lambda spec: spec.config.y * 100),
tune.sample_from(lambda spec: spec.config.y * 200)
]),
"y": tune.sample_from(lambda spec: 1),
},
}, "dependent_grid_search")
trials = list(trials)
self.assertEqual(len(trials), 2)
self.assertEqual(trials[0].config, {"x": 100, "y": 1})
self.assertEqual(trials[1].config, {"x": 200, "y": 1})
def testNestedValues(self):
trials = self.generate_trials({
"run": "PPO",
"config": {
"x": {
"y": {
"z": tune.sample_from(lambda spec: 1)
}
},
"y": tune.sample_from(lambda spec: 12),
"z": tune.sample_from(lambda spec: spec.config.x.y.z * 100),
},
}, "nested_values")
trials = list(trials)
self.assertEqual(len(trials), 1)
self.assertEqual(trials[0].config, {
"x": {
"y": {
"z": 1
}
},
"y": 12,
"z": 100
})
self.assertEqual(trials[0].evaluated_params, {
"x/y/z": 1,
"y": 12,
"z": 100
})
def testLogUniform(self):
sampler = tune.loguniform(1e-10, 1e-1).func
results = [sampler(None) for i in range(1000)]
assert abs(np.log(min(results)) / np.log(10) - -10) < 0.1
assert abs(np.log(max(results)) / np.log(10) - -1) < 0.1
sampler_e = tune.loguniform(np.e**-4, np.e, base=np.e).func
results_e = [sampler_e(None) for i in range(1000)]
assert abs(np.log(min(results_e)) - -4) < 0.1
assert abs(np.log(max(results_e)) - 1) < 0.1
def test_resolve_dict(self):
config = {
"a": {
"b": 1,
"c": 2,
},
"b": {
"a": 3
}
}
resolved = resolve_nested_dict(config)
for k, v in [(("a", "b"), 1), (("a", "c"), 2), (("b", "a"), 3)]:
self.assertEqual(resolved.get(k), v)
def testRecursiveDep(self):
try:
list(
self.generate_trials({
"run": "PPO",
"config": {
"foo": tune.sample_from(lambda spec: spec.config.foo),
},
}, "recursive_dep"))
except RecursiveDependencyError as e:
assert "`foo` recursively depends on" in str(e), e
else:
assert False
def testMaxConcurrentSuggestions(self):
"""Checks that next_trials() supports throttling."""
experiment_spec = {
"run": "PPO",
"num_samples": 6,
}
experiments = [Experiment.from_json("test", experiment_spec)]
searcher = _MockSuggestionAlgorithm(max_concurrent=4)
searcher.add_configurations(experiments)
trials = searcher.next_trials()
self.assertEqual(len(trials), 4)
self.assertEqual(searcher.next_trials(), [])
finished_trial = trials.pop()
searcher.on_trial_complete(finished_trial.trial_id)
self.assertEqual(len(searcher.next_trials()), 1)
finished_trial = trials.pop()
searcher.on_trial_complete(finished_trial.trial_id)
finished_trial = trials.pop()
searcher.on_trial_complete(finished_trial.trial_id)
finished_trial = trials.pop()
searcher.on_trial_complete(finished_trial.trial_id)
self.assertEqual(len(searcher.next_trials()), 1)
self.assertEqual(len(searcher.next_trials()), 0)
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))