mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 15:06:28 +08:00
Move more unit tests to bazel (#6250)
* move more unit tests to bazel * move to avoid conflict * fix lint * fix deps * seprate * fix failing tests * show tests * ignore mismatch * try combining bazel runs * build lint * remove tests from install * fix test utils * better config * split up * exclusive * fix verbosity * fix tests class * cleanup * remove flaky * fix metrics test * Update .travis.yml * no retry flaky * split up actor * split basic test * split up trial runner test * split stress * fix basic test * fix tests * switch to pytest runner for main * make microbench not fail * move load code to py3 * test is no longer package * bazel to end
This commit is contained in:
@@ -4,7 +4,7 @@ import time
|
||||
import numpy as np
|
||||
import ray
|
||||
|
||||
from ray.tests.cluster_utils import Cluster
|
||||
from ray.cluster_utils import Cluster
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
@@ -2,6 +2,7 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import json
|
||||
import fnmatch
|
||||
import os
|
||||
import subprocess
|
||||
@@ -170,3 +171,11 @@ def recursive_fnmatch(dirpath, pattern):
|
||||
for filename in fnmatch.filter(filenames, pattern):
|
||||
matches.append(os.path.join(root, filename))
|
||||
return matches
|
||||
|
||||
|
||||
def generate_internal_config_map(**kwargs):
|
||||
internal_config = json.dumps(kwargs)
|
||||
ray_kwargs = {
|
||||
"_internal_config": internal_config,
|
||||
}
|
||||
return ray_kwargs
|
||||
@@ -0,0 +1,297 @@
|
||||
py_test(
|
||||
name = "test_actor",
|
||||
size = "large",
|
||||
srcs = ["test_actor.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_actor_resources",
|
||||
size = "large",
|
||||
srcs = ["test_actor_resources.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_actor_failures",
|
||||
size = "large",
|
||||
srcs = ["test_actor_failures.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_basic",
|
||||
size = "large",
|
||||
srcs = ["test_basic.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_advanced",
|
||||
size = "large",
|
||||
srcs = ["test_advanced.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_component_failures",
|
||||
size = "large",
|
||||
srcs = ["test_component_failures.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_multinode_failures",
|
||||
size = "large",
|
||||
srcs = ["test_multinode_failures.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_stress",
|
||||
size = "large",
|
||||
srcs = ["test_stress.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_stress_sharded",
|
||||
size = "large",
|
||||
srcs = ["test_stress_sharded.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_stress_failure",
|
||||
size = "large",
|
||||
srcs = ["test_stress_failure.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_array",
|
||||
size = "medium",
|
||||
srcs = ["test_array.py"],
|
||||
deps = ["//:ray_lib"],
|
||||
flaky = 1,
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_autoscaler",
|
||||
size = "small",
|
||||
srcs = ["test_autoscaler.py"],
|
||||
deps = ["//:ray_lib"],
|
||||
flaky = 1,
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_autoscaler_yaml",
|
||||
size = "small",
|
||||
srcs = ["test_autoscaler_yaml.py"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_cython",
|
||||
size = "small",
|
||||
srcs = ["test_cython.py"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_debug_tools",
|
||||
size = "small",
|
||||
srcs = ["test_debug_tools.py"],
|
||||
deps = ["//:ray_lib"],
|
||||
flaky = 1,
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_dynres",
|
||||
size = "medium",
|
||||
srcs = ["test_dynres.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_failure",
|
||||
size = "medium",
|
||||
srcs = ["test_failure.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_garbage_collection",
|
||||
size = "medium",
|
||||
srcs = ["test_garbage_collection.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_global_state",
|
||||
size = "medium",
|
||||
srcs = ["test_global_state.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_logical_graph",
|
||||
size = "medium",
|
||||
srcs = ["test_logical_graph.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_memory_limits",
|
||||
size = "medium",
|
||||
srcs = ["test_memory_limits.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_memory_scheduling",
|
||||
size = "medium",
|
||||
srcs = ["test_memory_scheduling.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_metrics",
|
||||
size = "small",
|
||||
srcs = ["test_metrics.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_microbenchmarks",
|
||||
size = "medium",
|
||||
srcs = ["test_microbenchmarks.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_mini",
|
||||
size = "small",
|
||||
srcs = ["test_mini.py"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_monitors",
|
||||
size = "medium",
|
||||
srcs = ["test_monitors.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_multi_node_2",
|
||||
size = "medium",
|
||||
srcs = ["test_multi_node_2.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_multi_node",
|
||||
size = "medium",
|
||||
srcs = ["test_multi_node.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_node_manager",
|
||||
size = "small",
|
||||
srcs = ["test_node_manager.py"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_object_manager",
|
||||
size = "medium",
|
||||
srcs = ["test_object_manager.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_projects",
|
||||
size = "small",
|
||||
srcs = ["test_projects.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_queue",
|
||||
size = "small",
|
||||
srcs = ["test_queue.py"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_ray_init",
|
||||
size = "medium",
|
||||
srcs = ["test_ray_init.py"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_signal",
|
||||
size = "medium",
|
||||
srcs = ["test_signal.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_tempfile",
|
||||
size = "small",
|
||||
srcs = ["test_tempfile.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_tensorflow",
|
||||
size = "medium",
|
||||
srcs = ["test_tensorflow.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_unreconstructable_errors",
|
||||
size = "medium",
|
||||
srcs = ["test_unreconstructable_errors.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_webui",
|
||||
size = "medium",
|
||||
srcs = ["test_webui.py"],
|
||||
tags = ["exclusive"],
|
||||
deps = ["//:ray_lib"],
|
||||
)
|
||||
@@ -8,7 +8,7 @@ import pytest
|
||||
import subprocess
|
||||
|
||||
import ray
|
||||
from ray.tests.cluster_utils import Cluster
|
||||
from ray.cluster_utils import Cluster
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -18,14 +18,6 @@ def shutdown_only():
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
def generate_internal_config_map(**kwargs):
|
||||
internal_config = json.dumps(kwargs)
|
||||
ray_kwargs = {
|
||||
"_internal_config": internal_config,
|
||||
}
|
||||
return ray_kwargs
|
||||
|
||||
|
||||
def get_default_fixure_internal_config():
|
||||
internal_config = json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
@@ -177,7 +169,7 @@ def two_node_cluster():
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"num_heartbeats_timeout": 10,
|
||||
})
|
||||
cluster = ray.tests.cluster_utils.Cluster(
|
||||
cluster = ray.cluster_utils.Cluster(
|
||||
head_node_args={"_internal_config": internal_config})
|
||||
for _ in range(2):
|
||||
remote_node = cluster.add_node(
|
||||
|
||||
@@ -8,8 +8,8 @@ import threading
|
||||
import pytest
|
||||
|
||||
import ray
|
||||
import ray.tests.cluster_utils
|
||||
import ray.tests.utils
|
||||
import ray.cluster_utils
|
||||
import ray.test_utils
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
+12
-1435
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,732 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import pytest
|
||||
try:
|
||||
import pytest_timeout
|
||||
except ImportError:
|
||||
pytest_timeout = None
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
|
||||
import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
import ray.test_utils
|
||||
import ray.cluster_utils
|
||||
from ray.test_utils import (relevant_errors, wait_for_condition,
|
||||
wait_for_errors, wait_for_pid_to_exit,
|
||||
generate_internal_config_map)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_checkpointable_actor_cls(request):
|
||||
checkpoint_dir = "/tmp/ray_temp_checkpoint_dir/"
|
||||
if not os.path.isdir(checkpoint_dir):
|
||||
os.mkdir(checkpoint_dir)
|
||||
|
||||
class CheckpointableActor(ray.actor.Checkpointable):
|
||||
def __init__(self):
|
||||
self.value = 0
|
||||
self.resumed_from_checkpoint = False
|
||||
self.checkpoint_dir = checkpoint_dir
|
||||
|
||||
def node_id(self):
|
||||
return ray.worker.global_worker.node.unique_id
|
||||
|
||||
def increase(self):
|
||||
self.value += 1
|
||||
return self.value
|
||||
|
||||
def get(self):
|
||||
return self.value
|
||||
|
||||
def was_resumed_from_checkpoint(self):
|
||||
return self.resumed_from_checkpoint
|
||||
|
||||
def get_pid(self):
|
||||
return os.getpid()
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
# Checkpoint the actor when value is increased to 3.
|
||||
should_checkpoint = self.value == 3
|
||||
return should_checkpoint
|
||||
|
||||
def save_checkpoint(self, actor_id, checkpoint_id):
|
||||
actor_id, checkpoint_id = actor_id.hex(), checkpoint_id.hex()
|
||||
# Save checkpoint into a file.
|
||||
with open(self.checkpoint_dir + actor_id, "a+") as f:
|
||||
print(checkpoint_id, self.value, file=f)
|
||||
|
||||
def load_checkpoint(self, actor_id, available_checkpoints):
|
||||
actor_id = actor_id.hex()
|
||||
filename = self.checkpoint_dir + actor_id
|
||||
# Load checkpoint from the file.
|
||||
if not os.path.isfile(filename):
|
||||
return None
|
||||
|
||||
available_checkpoint_ids = [
|
||||
c.checkpoint_id for c in available_checkpoints
|
||||
]
|
||||
with open(filename, "r") as f:
|
||||
for line in f:
|
||||
checkpoint_id, value = line.strip().split(" ")
|
||||
checkpoint_id = ray.ActorCheckpointID(
|
||||
ray.utils.hex_to_binary(checkpoint_id))
|
||||
if checkpoint_id in available_checkpoint_ids:
|
||||
self.value = int(value)
|
||||
self.resumed_from_checkpoint = True
|
||||
return checkpoint_id
|
||||
return None
|
||||
|
||||
def checkpoint_expired(self, actor_id, checkpoint_id):
|
||||
pass
|
||||
|
||||
return CheckpointableActor
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_object_store_memory", [150 * 1024 * 1024], indirect=True)
|
||||
def test_actor_eviction(ray_start_object_store_memory):
|
||||
object_store_memory = ray_start_object_store_memory
|
||||
|
||||
@ray.remote
|
||||
class Actor(object):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def create_object(self, size):
|
||||
return np.random.rand(size)
|
||||
|
||||
a = Actor.remote()
|
||||
# Submit enough methods on the actor so that they exceed the size of the
|
||||
# object store.
|
||||
objects = []
|
||||
num_objects = 20
|
||||
for _ in range(num_objects):
|
||||
obj = a.create_object.remote(object_store_memory // num_objects)
|
||||
objects.append(obj)
|
||||
# Get each object once to make sure each object gets created.
|
||||
ray.get(obj)
|
||||
|
||||
# Get each object again. At this point, the earlier objects should have
|
||||
# been evicted.
|
||||
num_evicted, num_success = 0, 0
|
||||
for obj in objects:
|
||||
try:
|
||||
val = ray.get(obj)
|
||||
assert isinstance(val, np.ndarray), val
|
||||
num_success += 1
|
||||
except ray.exceptions.UnreconstructableError:
|
||||
num_evicted += 1
|
||||
# Some objects should have been evicted, and some should still be in the
|
||||
# object store.
|
||||
assert num_evicted > 0
|
||||
assert num_success > 0
|
||||
|
||||
|
||||
def test_actor_reconstruction(ray_start_regular):
|
||||
"""Test actor reconstruction when actor process is killed."""
|
||||
|
||||
@ray.remote(max_reconstructions=1)
|
||||
class ReconstructableActor(object):
|
||||
"""An actor that will be reconstructed at most once."""
|
||||
|
||||
def __init__(self):
|
||||
self.value = 0
|
||||
|
||||
def increase(self, delay=0):
|
||||
time.sleep(delay)
|
||||
self.value += 1
|
||||
return self.value
|
||||
|
||||
def get_pid(self):
|
||||
return os.getpid()
|
||||
|
||||
actor = ReconstructableActor.remote()
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
# Call increase 3 times
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
# Call increase again with some delay.
|
||||
result = actor.increase.remote(delay=0.5)
|
||||
# Sleep some time to wait for the above task to start execution.
|
||||
time.sleep(0.2)
|
||||
# Kill actor process, while the above task is still being executed.
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
# Check that the above task didn't fail and the actor is reconstructed.
|
||||
assert ray.get(result) == 4
|
||||
# Check that we can still call the actor.
|
||||
assert ray.get(actor.increase.remote()) == 5
|
||||
# kill actor process one more time.
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
# The actor has exceeded max reconstructions, and this task should fail.
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(actor.increase.remote())
|
||||
|
||||
# Create another actor.
|
||||
actor = ReconstructableActor.remote()
|
||||
# Intentionlly exit the actor
|
||||
actor.__ray_terminate__.remote()
|
||||
# Check that the actor won't be reconstructed.
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(actor.increase.remote())
|
||||
|
||||
|
||||
def test_actor_reconstruction_without_task(ray_start_regular):
|
||||
"""Test a dead actor can be reconstructed without sending task to it."""
|
||||
|
||||
@ray.remote(max_reconstructions=1)
|
||||
class ReconstructableActor(object):
|
||||
def __init__(self, obj_ids):
|
||||
for obj_id in obj_ids:
|
||||
# Every time the actor gets constructed,
|
||||
# put a new object in plasma store.
|
||||
global_worker = ray.worker.global_worker
|
||||
if not global_worker.core_worker.object_exists(obj_id):
|
||||
global_worker.put_object(1, obj_id)
|
||||
break
|
||||
|
||||
def get_pid(self):
|
||||
return os.getpid()
|
||||
|
||||
obj_ids = [ray.ObjectID.from_random() for _ in range(2)]
|
||||
actor = ReconstructableActor.remote(obj_ids)
|
||||
# Kill the actor.
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
# Wait until the actor is reconstructed.
|
||||
assert wait_for_condition(
|
||||
lambda: ray.worker.global_worker.core_worker.object_exists(obj_ids[1]),
|
||||
timeout_ms=5000)
|
||||
|
||||
|
||||
def test_actor_reconstruction_on_node_failure(ray_start_cluster_head):
|
||||
"""Test actor reconstruction when node dies unexpectedly."""
|
||||
cluster = ray_start_cluster_head
|
||||
max_reconstructions = 3
|
||||
# Add a few nodes to the cluster.
|
||||
# Use custom resource to make sure the actor is only created on worker
|
||||
# nodes, not on the head node.
|
||||
for _ in range(max_reconstructions + 2):
|
||||
cluster.add_node(
|
||||
resources={"a": 1},
|
||||
_internal_config=json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"num_heartbeats_timeout": 10,
|
||||
}),
|
||||
)
|
||||
|
||||
def kill_node(node_id):
|
||||
node_to_remove = None
|
||||
for node in cluster.worker_nodes:
|
||||
if node_id == node.unique_id:
|
||||
node_to_remove = node
|
||||
cluster.remove_node(node_to_remove)
|
||||
|
||||
@ray.remote(max_reconstructions=max_reconstructions, resources={"a": 1})
|
||||
class MyActor(object):
|
||||
def __init__(self):
|
||||
self.value = 0
|
||||
|
||||
def increase(self):
|
||||
self.value += 1
|
||||
return self.value
|
||||
|
||||
def get_object_store_socket(self):
|
||||
return ray.worker.global_worker.node.unique_id
|
||||
|
||||
actor = MyActor.remote()
|
||||
# Call increase 3 times.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
|
||||
for i in range(max_reconstructions):
|
||||
object_store_socket = ray.get(actor.get_object_store_socket.remote())
|
||||
# Kill actor's node and the actor should be reconstructed
|
||||
# on a different node.
|
||||
kill_node(object_store_socket)
|
||||
# Call increase again.
|
||||
# Check that the actor is reconstructed and value is correct.
|
||||
assert ray.get(actor.increase.remote()) == 4 + i
|
||||
# Check that the actor is now on a different node.
|
||||
assert object_store_socket != ray.get(
|
||||
actor.get_object_store_socket.remote())
|
||||
|
||||
# kill the node again.
|
||||
object_store_socket = ray.get(actor.get_object_store_socket.remote())
|
||||
kill_node(object_store_socket)
|
||||
# The actor has exceeded max reconstructions, and this task should fail.
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(actor.increase.remote())
|
||||
|
||||
|
||||
# NOTE(hchen): we set initial_reconstruction_timeout_milliseconds to 1s for
|
||||
# this test. Because if this value is too small, suprious task reconstruction
|
||||
# may happen and cause the test fauilure. If the value is too large, this test
|
||||
# could be very slow. We can remove this once we support dynamic timeout.
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster_head", [
|
||||
generate_internal_config_map(
|
||||
initial_reconstruction_timeout_milliseconds=1000)
|
||||
],
|
||||
indirect=True)
|
||||
def test_multiple_actor_reconstruction(ray_start_cluster_head):
|
||||
cluster = ray_start_cluster_head
|
||||
# This test can be made more stressful by increasing the numbers below.
|
||||
# The total number of actors created will be
|
||||
# num_actors_at_a_time * num_nodes.
|
||||
num_nodes = 5
|
||||
num_actors_at_a_time = 3
|
||||
num_function_calls_at_a_time = 10
|
||||
|
||||
worker_nodes = [
|
||||
cluster.add_node(
|
||||
num_cpus=3,
|
||||
_internal_config=json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200,
|
||||
"num_heartbeats_timeout": 10,
|
||||
})) for _ in range(num_nodes)
|
||||
]
|
||||
|
||||
@ray.remote(max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION)
|
||||
class SlowCounter(object):
|
||||
def __init__(self):
|
||||
self.x = 0
|
||||
|
||||
def inc(self, duration):
|
||||
time.sleep(duration)
|
||||
self.x += 1
|
||||
return self.x
|
||||
|
||||
# Create some initial actors.
|
||||
actors = [SlowCounter.remote() for _ in range(num_actors_at_a_time)]
|
||||
|
||||
# Wait for the actors to start up.
|
||||
time.sleep(1)
|
||||
|
||||
# This is a mapping from actor handles to object IDs returned by
|
||||
# methods on that actor.
|
||||
result_ids = collections.defaultdict(lambda: [])
|
||||
|
||||
# In a loop we are going to create some actors, run some methods, kill
|
||||
# a raylet, and run some more methods.
|
||||
for node in worker_nodes:
|
||||
# Create some actors.
|
||||
actors.extend(
|
||||
[SlowCounter.remote() for _ in range(num_actors_at_a_time)])
|
||||
# Run some methods.
|
||||
for j in range(len(actors)):
|
||||
actor = actors[j]
|
||||
for _ in range(num_function_calls_at_a_time):
|
||||
result_ids[actor].append(actor.inc.remote(j**2 * 0.000001))
|
||||
# Kill a node.
|
||||
cluster.remove_node(node)
|
||||
|
||||
# Run some more methods.
|
||||
for j in range(len(actors)):
|
||||
actor = actors[j]
|
||||
for _ in range(num_function_calls_at_a_time):
|
||||
result_ids[actor].append(actor.inc.remote(j**2 * 0.000001))
|
||||
|
||||
# Get the results and check that they have the correct values.
|
||||
for _, result_id_list in result_ids.items():
|
||||
results = list(range(1, len(result_id_list) + 1))
|
||||
assert ray.get(result_id_list) == results
|
||||
|
||||
|
||||
def kill_actor(actor):
|
||||
"""A helper function that kills an actor process."""
|
||||
pid = ray.get(actor.get_pid.remote())
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
wait_for_pid_to_exit(pid)
|
||||
|
||||
|
||||
def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
|
||||
"""Test actor checkpointing and restoring from a checkpoint."""
|
||||
actor = ray.remote(
|
||||
max_reconstructions=2)(ray_checkpointable_actor_cls).remote()
|
||||
# Call increase 3 times, triggering a checkpoint.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor was resumed from a checkpoint and its value is
|
||||
# still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again and check that reconstruction still works after the
|
||||
# actor resuming from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls):
|
||||
"""Test checkpointing of a remote actor through method invocation."""
|
||||
|
||||
# Define a class that exposes a method to save checkpoints.
|
||||
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
|
||||
def __init__(self):
|
||||
super(RemoteCheckpointableActor, self).__init__()
|
||||
self._should_checkpoint = False
|
||||
|
||||
def checkpoint(self):
|
||||
self._should_checkpoint = True
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
should_checkpoint = self._should_checkpoint
|
||||
self._should_checkpoint = False
|
||||
return should_checkpoint
|
||||
|
||||
cls = ray.remote(max_reconstructions=2)(RemoteCheckpointableActor)
|
||||
actor = cls.remote()
|
||||
# Call increase 3 times.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Call a checkpoint task.
|
||||
actor.checkpoint.remote()
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor was resumed from a checkpoint and its value is
|
||||
# still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again and check that reconstruction still works after the
|
||||
# actor resuming from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor checkpointing on a remote node."""
|
||||
# Place the actor on the remote node.
|
||||
cluster = ray_start_cluster_2_nodes
|
||||
remote_node = list(cluster.worker_nodes)
|
||||
actor_cls = ray.remote(max_reconstructions=1)(ray_checkpointable_actor_cls)
|
||||
actor = actor_cls.remote()
|
||||
while (ray.get(actor.node_id.remote()) != remote_node[0].unique_id):
|
||||
actor = actor_cls.remote()
|
||||
|
||||
# Call increase several times.
|
||||
expected = 0
|
||||
for _ in range(6):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
cluster.remove_node(remote_node[0])
|
||||
# Assert that the actor was resumed from a checkpoint and its value is
|
||||
# still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True
|
||||
|
||||
|
||||
def test_checkpointing_save_exception(ray_start_regular,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor can still be recovered if checkpoints fail to complete."""
|
||||
|
||||
@ray.remote(max_reconstructions=2)
|
||||
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
|
||||
def save_checkpoint(self, actor_id, checkpoint_context):
|
||||
raise Exception("Intentional error saving checkpoint.")
|
||||
|
||||
actor = RemoteCheckpointableActor.remote()
|
||||
# Call increase 3 times, triggering a checkpoint that will fail.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor still wasn't resumed from a checkpoint and its
|
||||
# value is still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again, and check that reconstruction still works and the actor
|
||||
# wasn't resumed from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Check that the checkpoint error was pushed to the driver.
|
||||
wait_for_errors(ray_constants.CHECKPOINT_PUSH_ERROR, 1)
|
||||
|
||||
|
||||
def test_checkpointing_load_exception(ray_start_regular,
|
||||
ray_checkpointable_actor_cls):
|
||||
"""Test actor can still be recovered if checkpoints fail to load."""
|
||||
|
||||
@ray.remote(max_reconstructions=2)
|
||||
class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
|
||||
def load_checkpoint(self, actor_id, checkpoints):
|
||||
raise Exception("Intentional error loading checkpoint.")
|
||||
|
||||
actor = RemoteCheckpointableActor.remote()
|
||||
# Call increase 3 times, triggering a checkpoint that will succeed.
|
||||
expected = 0
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Assert that the actor wasn't resumed from a checkpoint because loading
|
||||
# it failed.
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
# Kill actor process.
|
||||
kill_actor(actor)
|
||||
# Assert that the actor still wasn't resumed from a checkpoint and its
|
||||
# value is still correct.
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Submit some more tasks. These should get replayed since they happen after
|
||||
# the checkpoint.
|
||||
for _ in range(3):
|
||||
ray.get(actor.increase.remote())
|
||||
expected += 1
|
||||
# Kill actor again, and check that reconstruction still works and the actor
|
||||
# wasn't resumed from a checkpoint.
|
||||
kill_actor(actor)
|
||||
assert ray.get(actor.get.remote()) == expected
|
||||
assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False
|
||||
|
||||
# Check that the checkpoint error was pushed to the driver.
|
||||
wait_for_errors(ray_constants.CHECKPOINT_PUSH_ERROR, 1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_regular",
|
||||
# This overwrite currently isn't effective,
|
||||
# see https://github.com/ray-project/ray/issues/3926.
|
||||
[generate_internal_config_map(num_actor_checkpoints_to_keep=20)],
|
||||
indirect=True,
|
||||
)
|
||||
def test_deleting_actor_checkpoint(ray_start_regular):
|
||||
"""Test deleting old actor checkpoints."""
|
||||
|
||||
@ray.remote
|
||||
class CheckpointableActor(ray.actor.Checkpointable):
|
||||
def __init__(self):
|
||||
self.checkpoint_ids = []
|
||||
|
||||
def get_checkpoint_ids(self):
|
||||
return self.checkpoint_ids
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
# Save checkpoints after every task
|
||||
return True
|
||||
|
||||
def save_checkpoint(self, actor_id, checkpoint_id):
|
||||
self.checkpoint_ids.append(checkpoint_id)
|
||||
pass
|
||||
|
||||
def load_checkpoint(self, actor_id, available_checkpoints):
|
||||
pass
|
||||
|
||||
def checkpoint_expired(self, actor_id, checkpoint_id):
|
||||
assert checkpoint_id == self.checkpoint_ids[0]
|
||||
del self.checkpoint_ids[0]
|
||||
|
||||
actor = CheckpointableActor.remote()
|
||||
for i in range(19):
|
||||
assert len(ray.get(actor.get_checkpoint_ids.remote())) == i + 1
|
||||
for _ in range(20):
|
||||
assert len(ray.get(actor.get_checkpoint_ids.remote())) == 20
|
||||
|
||||
|
||||
def test_bad_checkpointable_actor_class():
|
||||
"""Test error raised if an actor class doesn't implement all abstract
|
||||
methods in the Checkpointable interface."""
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
|
||||
@ray.remote
|
||||
class BadCheckpointableActor(ray.actor.Checkpointable):
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
return True
|
||||
|
||||
|
||||
def test_init_exception_in_checkpointable_actor(ray_start_regular,
|
||||
ray_checkpointable_actor_cls):
|
||||
# This test is similar to test_failure.py::test_failed_actor_init.
|
||||
# This test is used to guarantee that checkpointable actor does not
|
||||
# break the same logic.
|
||||
error_message1 = "actor constructor failed"
|
||||
error_message2 = "actor method failed"
|
||||
|
||||
@ray.remote
|
||||
class CheckpointableFailedActor(ray_checkpointable_actor_cls):
|
||||
def __init__(self):
|
||||
raise Exception(error_message1)
|
||||
|
||||
def fail_method(self):
|
||||
raise Exception(error_message2)
|
||||
|
||||
def should_checkpoint(self, checkpoint_context):
|
||||
return True
|
||||
|
||||
a = CheckpointableFailedActor.remote()
|
||||
|
||||
# Make sure that we get errors from a failed constructor.
|
||||
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
|
||||
errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
|
||||
assert len(errors) == 1
|
||||
assert error_message1 in errors[0]["message"]
|
||||
|
||||
# Make sure that we get errors from a failed method.
|
||||
a.fail_method.remote()
|
||||
wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
|
||||
errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
|
||||
assert len(errors) == 2
|
||||
assert error_message1 in errors[1]["message"]
|
||||
|
||||
|
||||
def test_decorated_method(ray_start_regular):
|
||||
def method_invocation_decorator(f):
|
||||
def new_f_invocation(args, kwargs):
|
||||
# Split one argument into two. Return th kwargs without passing
|
||||
# them into the actor.
|
||||
return f([args[0], args[0]], {}), kwargs
|
||||
|
||||
return new_f_invocation
|
||||
|
||||
def method_execution_decorator(f):
|
||||
def new_f_execution(self, b, c):
|
||||
# Turn two arguments into one.
|
||||
return f(self, b + c)
|
||||
|
||||
new_f_execution.__ray_invocation_decorator__ = (
|
||||
method_invocation_decorator)
|
||||
return new_f_execution
|
||||
|
||||
@ray.remote
|
||||
class Actor(object):
|
||||
@method_execution_decorator
|
||||
def decorated_method(self, x):
|
||||
return x + 1
|
||||
|
||||
a = Actor.remote()
|
||||
|
||||
object_id, extra = a.decorated_method.remote(3, kwarg=3)
|
||||
assert isinstance(object_id, ray.ObjectID)
|
||||
assert extra == {"kwarg": 3}
|
||||
assert ray.get(object_id) == 7 # 2 * 3 + 1
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
pytest_timeout is None,
|
||||
reason="Timeout package not installed; skipping test that may hang.")
|
||||
@pytest.mark.timeout(20)
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster", [{
|
||||
"num_cpus": 1,
|
||||
"num_nodes": 2,
|
||||
}], indirect=True)
|
||||
def test_ray_wait_dead_actor(ray_start_cluster):
|
||||
"""Tests that methods completed by dead actors are returned as ready"""
|
||||
cluster = ray_start_cluster
|
||||
|
||||
@ray.remote(num_cpus=1)
|
||||
class Actor(object):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def node_id(self):
|
||||
return ray.worker.global_worker.node.unique_id
|
||||
|
||||
def ping(self):
|
||||
time.sleep(1)
|
||||
|
||||
# Create some actors and wait for them to initialize.
|
||||
num_nodes = len(cluster.list_all_nodes())
|
||||
actors = [Actor.remote() for _ in range(num_nodes)]
|
||||
ray.get([actor.ping.remote() for actor in actors])
|
||||
|
||||
# Ping the actors and make sure the tasks complete.
|
||||
ping_ids = [actor.ping.remote() for actor in actors]
|
||||
ray.get(ping_ids)
|
||||
# Evict the result from the node that we're about to kill.
|
||||
remote_node = cluster.list_all_nodes()[-1]
|
||||
remote_ping_id = None
|
||||
for i, actor in enumerate(actors):
|
||||
if ray.get(actor.node_id.remote()) == remote_node.unique_id:
|
||||
remote_ping_id = ping_ids[i]
|
||||
ray.internal.free([remote_ping_id], local_only=True)
|
||||
cluster.remove_node(remote_node)
|
||||
|
||||
# Repeatedly call ray.wait until the exception for the dead actor is
|
||||
# received.
|
||||
unready = ping_ids[:]
|
||||
while unready:
|
||||
_, unready = ray.wait(unready, timeout=0)
|
||||
time.sleep(1)
|
||||
|
||||
with pytest.raises(ray.exceptions.RayActorError):
|
||||
ray.get(ping_ids)
|
||||
|
||||
# Evict the result from the dead node.
|
||||
ray.internal.free([remote_ping_id], local_only=True)
|
||||
# Create an actor on the local node that will call ray.wait in a loop.
|
||||
head_node_resource = "HEAD_NODE"
|
||||
ray.experimental.set_resource(head_node_resource, 1)
|
||||
|
||||
@ray.remote(num_cpus=0, resources={head_node_resource: 1})
|
||||
class ParentActor(object):
|
||||
def __init__(self, ping_ids):
|
||||
self.unready = ping_ids
|
||||
|
||||
def wait(self):
|
||||
_, self.unready = ray.wait(self.unready, timeout=0)
|
||||
return len(self.unready) == 0
|
||||
|
||||
def ping(self):
|
||||
return
|
||||
|
||||
# Repeatedly call ray.wait through the local actor until the exception for
|
||||
# the dead actor is received.
|
||||
parent_actor = ParentActor.remote(ping_ids)
|
||||
ray.get(parent_actor.ping.remote())
|
||||
failure_detected = False
|
||||
while not failure_detected:
|
||||
failure_detected = ray.get(parent_actor.wait.remote())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
@@ -0,0 +1,743 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
import json
|
||||
import os
|
||||
import pytest
|
||||
try:
|
||||
import pytest_timeout
|
||||
except ImportError:
|
||||
pytest_timeout = None
|
||||
import sys
|
||||
import time
|
||||
|
||||
import ray
|
||||
import ray.test_utils
|
||||
import ray.cluster_utils
|
||||
|
||||
|
||||
def test_actor_deletion_with_gpus(shutdown_only):
|
||||
ray.init(
|
||||
num_cpus=1, num_gpus=1, object_store_memory=int(150 * 1024 * 1024))
|
||||
|
||||
# When an actor that uses a GPU exits, make sure that the GPU resources
|
||||
# are released.
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
class Actor(object):
|
||||
def getpid(self):
|
||||
return os.getpid()
|
||||
|
||||
for _ in range(5):
|
||||
# If we can successfully create an actor, that means that enough
|
||||
# GPU resources are available.
|
||||
a = Actor.remote()
|
||||
ray.get(a.getpid.remote())
|
||||
|
||||
|
||||
def test_actor_state(ray_start_regular):
|
||||
@ray.remote
|
||||
class Counter(object):
|
||||
def __init__(self):
|
||||
self.value = 0
|
||||
|
||||
def increase(self):
|
||||
self.value += 1
|
||||
|
||||
def value(self):
|
||||
return self.value
|
||||
|
||||
c1 = Counter.remote()
|
||||
c1.increase.remote()
|
||||
assert ray.get(c1.value.remote()) == 1
|
||||
|
||||
c2 = Counter.remote()
|
||||
c2.increase.remote()
|
||||
c2.increase.remote()
|
||||
assert ray.get(c2.value.remote()) == 2
|
||||
|
||||
|
||||
def test_actor_class_methods(ray_start_regular):
|
||||
class Foo(object):
|
||||
x = 2
|
||||
|
||||
@classmethod
|
||||
def as_remote(cls):
|
||||
return ray.remote(cls)
|
||||
|
||||
@classmethod
|
||||
def f(cls):
|
||||
return cls.x
|
||||
|
||||
@classmethod
|
||||
def g(cls, y):
|
||||
return cls.x + y
|
||||
|
||||
def echo(self, value):
|
||||
return value
|
||||
|
||||
a = Foo.as_remote().remote()
|
||||
assert ray.get(a.echo.remote(2)) == 2
|
||||
assert ray.get(a.f.remote()) == 2
|
||||
assert ray.get(a.g.remote(2)) == 4
|
||||
|
||||
|
||||
def test_resource_assignment(shutdown_only):
|
||||
"""Test to make sure that we assign resource to actors at instantiation."""
|
||||
# This test will create 16 actors. Declaring this many CPUs initially will
|
||||
# speed up the test because the workers will be started ahead of time.
|
||||
ray.init(
|
||||
num_cpus=16,
|
||||
num_gpus=1,
|
||||
resources={"Custom": 1},
|
||||
object_store_memory=int(150 * 1024 * 1024))
|
||||
|
||||
class Actor(object):
|
||||
def __init__(self):
|
||||
self.resources = ray.get_resource_ids()
|
||||
|
||||
def get_actor_resources(self):
|
||||
return self.resources
|
||||
|
||||
def get_actor_method_resources(self):
|
||||
return ray.get_resource_ids()
|
||||
|
||||
decorator_resource_args = [{}, {
|
||||
"num_cpus": 0.1
|
||||
}, {
|
||||
"num_gpus": 0.1
|
||||
}, {
|
||||
"resources": {
|
||||
"Custom": 0.1
|
||||
}
|
||||
}]
|
||||
instantiation_resource_args = [{}, {
|
||||
"num_cpus": 0.2
|
||||
}, {
|
||||
"num_gpus": 0.2
|
||||
}, {
|
||||
"resources": {
|
||||
"Custom": 0.2
|
||||
}
|
||||
}]
|
||||
for decorator_args in decorator_resource_args:
|
||||
for instantiation_args in instantiation_resource_args:
|
||||
if len(decorator_args) == 0:
|
||||
actor_class = ray.remote(Actor)
|
||||
else:
|
||||
actor_class = ray.remote(**decorator_args)(Actor)
|
||||
actor = actor_class._remote(**instantiation_args)
|
||||
actor_resources = ray.get(actor.get_actor_resources.remote())
|
||||
actor_method_resources = ray.get(
|
||||
actor.get_actor_method_resources.remote())
|
||||
if len(decorator_args) == 0 and len(instantiation_args) == 0:
|
||||
assert len(actor_resources) == 0, (
|
||||
"Actor should not be assigned resources.")
|
||||
assert list(actor_method_resources.keys()) == [
|
||||
"CPU"
|
||||
], ("Actor method should only have CPUs")
|
||||
assert actor_method_resources["CPU"][0][1] == 1, (
|
||||
"Actor method should default to one cpu.")
|
||||
else:
|
||||
if ("num_cpus" not in decorator_args
|
||||
and "num_cpus" not in instantiation_args):
|
||||
assert actor_resources["CPU"][0][1] == 1, (
|
||||
"Actor should default to one cpu.")
|
||||
correct_resources = {}
|
||||
defined_resources = decorator_args.copy()
|
||||
defined_resources.update(instantiation_args)
|
||||
for resource, value in defined_resources.items():
|
||||
if resource == "num_cpus":
|
||||
correct_resources["CPU"] = value
|
||||
elif resource == "num_gpus":
|
||||
correct_resources["GPU"] = value
|
||||
elif resource == "resources":
|
||||
for custom_resource, amount in value.items():
|
||||
correct_resources[custom_resource] = amount
|
||||
for resource, amount in correct_resources.items():
|
||||
assert (actor_resources[resource][0][0] ==
|
||||
actor_method_resources[resource][0][0]), (
|
||||
"Should have assigned same {} for both actor ",
|
||||
"and actor method.".format(resource))
|
||||
assert (actor_resources[resource][0][
|
||||
1] == actor_method_resources[resource][0][1]), (
|
||||
"Should have assigned same amount of {} for both ",
|
||||
"actor and actor method.".format(resource))
|
||||
assert actor_resources[resource][0][1] == amount, (
|
||||
"Actor should have {amount} {resource} but has ",
|
||||
"{amount} {resource}".format(
|
||||
amount=amount, resource=resource))
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Failing with new GCS API on Linux.")
|
||||
def test_actor_gpus(ray_start_cluster):
|
||||
cluster = ray_start_cluster
|
||||
num_nodes = 3
|
||||
num_gpus_per_raylet = 4
|
||||
for i in range(num_nodes):
|
||||
cluster.add_node(
|
||||
num_cpus=10 * num_gpus_per_raylet, num_gpus=num_gpus_per_raylet)
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
class Actor1(object):
|
||||
def __init__(self):
|
||||
self.gpu_ids = ray.get_gpu_ids()
|
||||
|
||||
def get_location_and_ids(self):
|
||||
assert ray.get_gpu_ids() == self.gpu_ids
|
||||
return (ray.worker.global_worker.node.unique_id,
|
||||
tuple(self.gpu_ids))
|
||||
|
||||
# Create one actor per GPU.
|
||||
actors = [Actor1.remote() for _ in range(num_nodes * num_gpus_per_raylet)]
|
||||
# Make sure that no two actors are assigned to the same GPU.
|
||||
locations_and_ids = ray.get(
|
||||
[actor.get_location_and_ids.remote() for actor in actors])
|
||||
node_names = {location for location, gpu_id in locations_and_ids}
|
||||
assert len(node_names) == num_nodes
|
||||
location_actor_combinations = []
|
||||
for node_name in node_names:
|
||||
for gpu_id in range(num_gpus_per_raylet):
|
||||
location_actor_combinations.append((node_name, (gpu_id, )))
|
||||
assert set(locations_and_ids) == set(location_actor_combinations)
|
||||
|
||||
# Creating a new actor should fail because all of the GPUs are being
|
||||
# used.
|
||||
a = Actor1.remote()
|
||||
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
|
||||
assert ready_ids == []
|
||||
|
||||
|
||||
def test_actor_multiple_gpus(ray_start_cluster):
|
||||
cluster = ray_start_cluster
|
||||
num_nodes = 3
|
||||
num_gpus_per_raylet = 5
|
||||
for i in range(num_nodes):
|
||||
cluster.add_node(
|
||||
num_cpus=10 * num_gpus_per_raylet, num_gpus=num_gpus_per_raylet)
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@ray.remote(num_gpus=2)
|
||||
class Actor1(object):
|
||||
def __init__(self):
|
||||
self.gpu_ids = ray.get_gpu_ids()
|
||||
|
||||
def get_location_and_ids(self):
|
||||
assert ray.get_gpu_ids() == self.gpu_ids
|
||||
return (ray.worker.global_worker.node.unique_id,
|
||||
tuple(self.gpu_ids))
|
||||
|
||||
# Create some actors.
|
||||
actors1 = [Actor1.remote() for _ in range(num_nodes * 2)]
|
||||
# Make sure that no two actors are assigned to the same GPU.
|
||||
locations_and_ids = ray.get(
|
||||
[actor.get_location_and_ids.remote() for actor in actors1])
|
||||
node_names = {location for location, gpu_id in locations_and_ids}
|
||||
assert len(node_names) == num_nodes
|
||||
|
||||
# Keep track of which GPU IDs are being used for each location.
|
||||
gpus_in_use = {node_name: [] for node_name in node_names}
|
||||
for location, gpu_ids in locations_and_ids:
|
||||
gpus_in_use[location].extend(gpu_ids)
|
||||
for node_name in node_names:
|
||||
assert len(set(gpus_in_use[node_name])) == 4
|
||||
|
||||
# Creating a new actor should fail because all of the GPUs are being
|
||||
# used.
|
||||
a = Actor1.remote()
|
||||
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
|
||||
assert ready_ids == []
|
||||
|
||||
# We should be able to create more actors that use only a single GPU.
|
||||
@ray.remote(num_gpus=1)
|
||||
class Actor2(object):
|
||||
def __init__(self):
|
||||
self.gpu_ids = ray.get_gpu_ids()
|
||||
|
||||
def get_location_and_ids(self):
|
||||
return (ray.worker.global_worker.node.unique_id,
|
||||
tuple(self.gpu_ids))
|
||||
|
||||
# Create some actors.
|
||||
actors2 = [Actor2.remote() for _ in range(num_nodes)]
|
||||
# Make sure that no two actors are assigned to the same GPU.
|
||||
locations_and_ids = ray.get(
|
||||
[actor.get_location_and_ids.remote() for actor in actors2])
|
||||
names = {location for location, gpu_id in locations_and_ids}
|
||||
assert node_names == names
|
||||
for location, gpu_ids in locations_and_ids:
|
||||
gpus_in_use[location].extend(gpu_ids)
|
||||
for node_name in node_names:
|
||||
assert len(gpus_in_use[node_name]) == 5
|
||||
assert set(gpus_in_use[node_name]) == set(range(5))
|
||||
|
||||
# Creating a new actor should fail because all of the GPUs are being
|
||||
# used.
|
||||
a = Actor2.remote()
|
||||
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
|
||||
assert ready_ids == []
|
||||
|
||||
|
||||
def test_actor_different_numbers_of_gpus(ray_start_cluster):
|
||||
# Test that we can create actors on two nodes that have different
|
||||
# numbers of GPUs.
|
||||
cluster = ray_start_cluster
|
||||
cluster.add_node(num_cpus=10, num_gpus=0)
|
||||
cluster.add_node(num_cpus=10, num_gpus=5)
|
||||
cluster.add_node(num_cpus=10, num_gpus=10)
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
class Actor1(object):
|
||||
def __init__(self):
|
||||
self.gpu_ids = ray.get_gpu_ids()
|
||||
|
||||
def get_location_and_ids(self):
|
||||
return (ray.worker.global_worker.node.unique_id,
|
||||
tuple(self.gpu_ids))
|
||||
|
||||
# Create some actors.
|
||||
actors = [Actor1.remote() for _ in range(0 + 5 + 10)]
|
||||
# Make sure that no two actors are assigned to the same GPU.
|
||||
locations_and_ids = ray.get(
|
||||
[actor.get_location_and_ids.remote() for actor in actors])
|
||||
node_names = {location for location, gpu_id in locations_and_ids}
|
||||
assert len(node_names) == 2
|
||||
for node_name in node_names:
|
||||
node_gpu_ids = [
|
||||
gpu_id for location, gpu_id in locations_and_ids
|
||||
if location == node_name
|
||||
]
|
||||
assert len(node_gpu_ids) in [5, 10]
|
||||
assert set(node_gpu_ids) == {(i, ) for i in range(len(node_gpu_ids))}
|
||||
|
||||
# Creating a new actor should fail because all of the GPUs are being
|
||||
# used.
|
||||
a = Actor1.remote()
|
||||
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
|
||||
assert ready_ids == []
|
||||
|
||||
|
||||
def test_actor_multiple_gpus_from_multiple_tasks(ray_start_cluster):
|
||||
cluster = ray_start_cluster
|
||||
num_nodes = 5
|
||||
num_gpus_per_raylet = 5
|
||||
for i in range(num_nodes):
|
||||
cluster.add_node(
|
||||
num_cpus=10 * num_gpus_per_raylet,
|
||||
num_gpus=num_gpus_per_raylet,
|
||||
_internal_config=json.dumps({
|
||||
"num_heartbeats_timeout": 1000
|
||||
}))
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@ray.remote
|
||||
def create_actors(i, n):
|
||||
@ray.remote(num_gpus=1)
|
||||
class Actor(object):
|
||||
def __init__(self, i, j):
|
||||
self.gpu_ids = ray.get_gpu_ids()
|
||||
|
||||
def get_location_and_ids(self):
|
||||
return ((ray.worker.global_worker.node.unique_id),
|
||||
tuple(self.gpu_ids))
|
||||
|
||||
def sleep(self):
|
||||
time.sleep(100)
|
||||
|
||||
# Create n actors.
|
||||
actors = []
|
||||
for j in range(n):
|
||||
actors.append(Actor.remote(i, j))
|
||||
|
||||
locations = ray.get(
|
||||
[actor.get_location_and_ids.remote() for actor in actors])
|
||||
|
||||
# Put each actor to sleep for a long time to prevent them from getting
|
||||
# terminated.
|
||||
for actor in actors:
|
||||
actor.sleep.remote()
|
||||
|
||||
return locations
|
||||
|
||||
all_locations = ray.get([
|
||||
create_actors.remote(i, num_gpus_per_raylet) for i in range(num_nodes)
|
||||
])
|
||||
|
||||
# Make sure that no two actors are assigned to the same GPU.
|
||||
node_names = {
|
||||
location
|
||||
for locations in all_locations for location, gpu_id in locations
|
||||
}
|
||||
assert len(node_names) == num_nodes
|
||||
|
||||
# Keep track of which GPU IDs are being used for each location.
|
||||
gpus_in_use = {node_name: [] for node_name in node_names}
|
||||
for locations in all_locations:
|
||||
for location, gpu_ids in locations:
|
||||
gpus_in_use[location].extend(gpu_ids)
|
||||
for node_name in node_names:
|
||||
assert len(set(gpus_in_use[node_name])) == num_gpus_per_raylet
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
class Actor(object):
|
||||
def __init__(self):
|
||||
self.gpu_ids = ray.get_gpu_ids()
|
||||
|
||||
def get_location_and_ids(self):
|
||||
return (ray.worker.global_worker.node.unique_id,
|
||||
tuple(self.gpu_ids))
|
||||
|
||||
# All the GPUs should be used up now.
|
||||
a = Actor.remote()
|
||||
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
|
||||
assert ready_ids == []
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
sys.version_info < (3, 0), reason="This test requires Python 3.")
|
||||
def test_actors_and_tasks_with_gpus(ray_start_cluster):
|
||||
cluster = ray_start_cluster
|
||||
num_nodes = 3
|
||||
num_gpus_per_raylet = 2
|
||||
for i in range(num_nodes):
|
||||
cluster.add_node(
|
||||
num_cpus=num_gpus_per_raylet, num_gpus=num_gpus_per_raylet)
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
def check_intervals_non_overlapping(list_of_intervals):
|
||||
for i in range(len(list_of_intervals)):
|
||||
for j in range(i):
|
||||
first_interval = list_of_intervals[i]
|
||||
second_interval = list_of_intervals[j]
|
||||
# Check that list_of_intervals[i] and list_of_intervals[j]
|
||||
# don't overlap.
|
||||
assert first_interval[0] < first_interval[1]
|
||||
assert second_interval[0] < second_interval[1]
|
||||
intervals_nonoverlapping = (
|
||||
first_interval[1] <= second_interval[0]
|
||||
or second_interval[1] <= first_interval[0])
|
||||
assert intervals_nonoverlapping, (
|
||||
"Intervals {} and {} are overlapping.".format(
|
||||
first_interval, second_interval))
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
def f1():
|
||||
t1 = time.monotonic()
|
||||
time.sleep(0.1)
|
||||
t2 = time.monotonic()
|
||||
gpu_ids = ray.get_gpu_ids()
|
||||
assert len(gpu_ids) == 1
|
||||
assert gpu_ids[0] in range(num_gpus_per_raylet)
|
||||
return (ray.worker.global_worker.node.unique_id, tuple(gpu_ids),
|
||||
[t1, t2])
|
||||
|
||||
@ray.remote(num_gpus=2)
|
||||
def f2():
|
||||
t1 = time.monotonic()
|
||||
time.sleep(0.1)
|
||||
t2 = time.monotonic()
|
||||
gpu_ids = ray.get_gpu_ids()
|
||||
assert len(gpu_ids) == 2
|
||||
assert gpu_ids[0] in range(num_gpus_per_raylet)
|
||||
assert gpu_ids[1] in range(num_gpus_per_raylet)
|
||||
return (ray.worker.global_worker.node.unique_id, tuple(gpu_ids),
|
||||
[t1, t2])
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
class Actor1(object):
|
||||
def __init__(self):
|
||||
self.gpu_ids = ray.get_gpu_ids()
|
||||
assert len(self.gpu_ids) == 1
|
||||
assert self.gpu_ids[0] in range(num_gpus_per_raylet)
|
||||
|
||||
def get_location_and_ids(self):
|
||||
assert ray.get_gpu_ids() == self.gpu_ids
|
||||
return (ray.worker.global_worker.node.unique_id,
|
||||
tuple(self.gpu_ids))
|
||||
|
||||
def locations_to_intervals_for_many_tasks():
|
||||
# Launch a bunch of GPU tasks.
|
||||
locations_ids_and_intervals = ray.get(
|
||||
[f1.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)] +
|
||||
[f2.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)] +
|
||||
[f1.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)])
|
||||
|
||||
locations_to_intervals = collections.defaultdict(lambda: [])
|
||||
for location, gpu_ids, interval in locations_ids_and_intervals:
|
||||
for gpu_id in gpu_ids:
|
||||
locations_to_intervals[(location, gpu_id)].append(interval)
|
||||
return locations_to_intervals
|
||||
|
||||
# Run a bunch of GPU tasks.
|
||||
locations_to_intervals = locations_to_intervals_for_many_tasks()
|
||||
# For each GPU, verify that the set of tasks that used this specific
|
||||
# GPU did not overlap in time.
|
||||
for locations in locations_to_intervals:
|
||||
check_intervals_non_overlapping(locations_to_intervals[locations])
|
||||
|
||||
# Create an actor that uses a GPU.
|
||||
a = Actor1.remote()
|
||||
actor_location = ray.get(a.get_location_and_ids.remote())
|
||||
actor_location = (actor_location[0], actor_location[1][0])
|
||||
# This check makes sure that actor_location is formatted the same way
|
||||
# that the keys of locations_to_intervals are formatted.
|
||||
assert actor_location in locations_to_intervals
|
||||
|
||||
# Run a bunch of GPU tasks.
|
||||
locations_to_intervals = locations_to_intervals_for_many_tasks()
|
||||
# For each GPU, verify that the set of tasks that used this specific
|
||||
# GPU did not overlap in time.
|
||||
for locations in locations_to_intervals:
|
||||
check_intervals_non_overlapping(locations_to_intervals[locations])
|
||||
# Make sure that the actor's GPU was not used.
|
||||
assert actor_location not in locations_to_intervals
|
||||
|
||||
# Create more actors to fill up all the GPUs.
|
||||
more_actors = [
|
||||
Actor1.remote() for _ in range(num_nodes * num_gpus_per_raylet - 1)
|
||||
]
|
||||
# Wait for the actors to finish being created.
|
||||
ray.get([actor.get_location_and_ids.remote() for actor in more_actors])
|
||||
|
||||
# Now if we run some GPU tasks, they should not be scheduled.
|
||||
results = [f1.remote() for _ in range(30)]
|
||||
ready_ids, remaining_ids = ray.wait(results, timeout=1.0)
|
||||
assert len(ready_ids) == 0
|
||||
|
||||
|
||||
def test_actors_and_tasks_with_gpus_version_two(shutdown_only):
|
||||
# Create tasks and actors that both use GPUs and make sure that they
|
||||
# are given different GPUs
|
||||
num_gpus = 4
|
||||
|
||||
ray.init(
|
||||
num_cpus=(num_gpus + 1),
|
||||
num_gpus=num_gpus,
|
||||
object_store_memory=int(150 * 1024 * 1024))
|
||||
|
||||
# The point of this actor is to record which GPU IDs have been seen. We
|
||||
# can't just return them from the tasks, because the tasks don't return
|
||||
# for a long time in order to make sure the GPU is not released
|
||||
# prematurely.
|
||||
@ray.remote
|
||||
class RecordGPUs(object):
|
||||
def __init__(self):
|
||||
self.gpu_ids_seen = []
|
||||
self.num_calls = 0
|
||||
|
||||
def add_ids(self, gpu_ids):
|
||||
self.gpu_ids_seen += gpu_ids
|
||||
self.num_calls += 1
|
||||
|
||||
def get_gpu_ids_and_calls(self):
|
||||
return self.gpu_ids_seen, self.num_calls
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
def f(record_gpu_actor):
|
||||
gpu_ids = ray.get_gpu_ids()
|
||||
assert len(gpu_ids) == 1
|
||||
record_gpu_actor.add_ids.remote(gpu_ids)
|
||||
# Sleep for a long time so that the GPU never gets released. This task
|
||||
# will be killed by ray.shutdown() before it actually finishes.
|
||||
time.sleep(1000)
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
class Actor(object):
|
||||
def __init__(self, record_gpu_actor):
|
||||
self.gpu_ids = ray.get_gpu_ids()
|
||||
assert len(self.gpu_ids) == 1
|
||||
record_gpu_actor.add_ids.remote(self.gpu_ids)
|
||||
|
||||
def check_gpu_ids(self):
|
||||
assert ray.get_gpu_ids() == self.gpu_ids
|
||||
|
||||
record_gpu_actor = RecordGPUs.remote()
|
||||
|
||||
actors = []
|
||||
actor_results = []
|
||||
for _ in range(num_gpus // 2):
|
||||
f.remote(record_gpu_actor)
|
||||
a = Actor.remote(record_gpu_actor)
|
||||
actor_results.append(a.check_gpu_ids.remote())
|
||||
# Prevent the actor handle from going out of scope so that its GPU
|
||||
# resources don't get released.
|
||||
actors.append(a)
|
||||
|
||||
# Make sure that the actor method calls succeeded.
|
||||
ray.get(actor_results)
|
||||
|
||||
start_time = time.time()
|
||||
while time.time() - start_time < 30:
|
||||
seen_gpu_ids, num_calls = ray.get(
|
||||
record_gpu_actor.get_gpu_ids_and_calls.remote())
|
||||
if num_calls == num_gpus:
|
||||
break
|
||||
assert set(seen_gpu_ids) == set(range(num_gpus))
|
||||
|
||||
|
||||
def test_blocking_actor_task(shutdown_only):
|
||||
ray.init(
|
||||
num_cpus=1, num_gpus=1, object_store_memory=int(150 * 1024 * 1024))
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
def f():
|
||||
return 1
|
||||
|
||||
@ray.remote
|
||||
class Foo(object):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def blocking_method(self):
|
||||
ray.get(f.remote())
|
||||
|
||||
# Make sure we can execute a blocking actor method even if there is
|
||||
# only one CPU.
|
||||
actor = Foo.remote()
|
||||
ray.get(actor.blocking_method.remote())
|
||||
|
||||
@ray.remote(num_cpus=1)
|
||||
class CPUFoo(object):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def blocking_method(self):
|
||||
ray.get(f.remote())
|
||||
|
||||
# Make sure that lifetime CPU resources are not released when actors
|
||||
# block.
|
||||
actor = CPUFoo.remote()
|
||||
x_id = actor.blocking_method.remote()
|
||||
ready_ids, remaining_ids = ray.wait([x_id], timeout=1.0)
|
||||
assert ready_ids == []
|
||||
assert remaining_ids == [x_id]
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
class GPUFoo(object):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def blocking_method(self):
|
||||
ray.get(f.remote())
|
||||
|
||||
# Make sure that GPU resources are not released when actors block.
|
||||
actor = GPUFoo.remote()
|
||||
x_id = actor.blocking_method.remote()
|
||||
ready_ids, remaining_ids = ray.wait([x_id], timeout=1.0)
|
||||
assert ready_ids == []
|
||||
assert remaining_ids == [x_id]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
sys.version_info < (3, 0),
|
||||
reason="This test is currently failing on Python 2.7.")
|
||||
def test_lifetime_and_transient_resources(ray_start_regular):
|
||||
# This actor acquires resources only when running methods.
|
||||
@ray.remote
|
||||
class Actor1(object):
|
||||
def method(self):
|
||||
pass
|
||||
|
||||
# This actor acquires resources for its lifetime.
|
||||
@ray.remote(num_cpus=1)
|
||||
class Actor2(object):
|
||||
def method(self):
|
||||
pass
|
||||
|
||||
actor1s = [Actor1.remote() for _ in range(10)]
|
||||
ray.get([a.method.remote() for a in actor1s])
|
||||
|
||||
actor2s = [Actor2.remote() for _ in range(2)]
|
||||
results = [a.method.remote() for a in actor2s]
|
||||
ready_ids, remaining_ids = ray.wait(
|
||||
results, num_returns=len(results), timeout=5.0)
|
||||
assert len(ready_ids) == 1
|
||||
|
||||
|
||||
def test_custom_label_placement(ray_start_cluster):
|
||||
cluster = ray_start_cluster
|
||||
cluster.add_node(num_cpus=2, resources={"CustomResource1": 2})
|
||||
cluster.add_node(num_cpus=2, resources={"CustomResource2": 2})
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
@ray.remote(resources={"CustomResource1": 1})
|
||||
class ResourceActor1(object):
|
||||
def get_location(self):
|
||||
return ray.worker.global_worker.node.unique_id
|
||||
|
||||
@ray.remote(resources={"CustomResource2": 1})
|
||||
class ResourceActor2(object):
|
||||
def get_location(self):
|
||||
return ray.worker.global_worker.node.unique_id
|
||||
|
||||
node_id = ray.worker.global_worker.node.unique_id
|
||||
|
||||
# Create some actors.
|
||||
actors1 = [ResourceActor1.remote() for _ in range(2)]
|
||||
actors2 = [ResourceActor2.remote() for _ in range(2)]
|
||||
locations1 = ray.get([a.get_location.remote() for a in actors1])
|
||||
locations2 = ray.get([a.get_location.remote() for a in actors2])
|
||||
for location in locations1:
|
||||
assert location == node_id
|
||||
for location in locations2:
|
||||
assert location != node_id
|
||||
|
||||
|
||||
def test_creating_more_actors_than_resources(shutdown_only):
|
||||
ray.init(num_cpus=10, num_gpus=2, resources={"CustomResource1": 1})
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
class ResourceActor1(object):
|
||||
def method(self):
|
||||
return ray.get_gpu_ids()[0]
|
||||
|
||||
@ray.remote(resources={"CustomResource1": 1})
|
||||
class ResourceActor2(object):
|
||||
def method(self):
|
||||
pass
|
||||
|
||||
# Make sure the first two actors get created and the third one does
|
||||
# not.
|
||||
actor1 = ResourceActor1.remote()
|
||||
result1 = actor1.method.remote()
|
||||
ray.wait([result1])
|
||||
actor2 = ResourceActor1.remote()
|
||||
result2 = actor2.method.remote()
|
||||
ray.wait([result2])
|
||||
actor3 = ResourceActor1.remote()
|
||||
result3 = actor3.method.remote()
|
||||
ready_ids, _ = ray.wait([result3], timeout=0.2)
|
||||
assert len(ready_ids) == 0
|
||||
|
||||
# By deleting actor1, we free up resources to create actor3.
|
||||
del actor1
|
||||
|
||||
results = ray.get([result1, result2, result3])
|
||||
assert results[0] == results[2]
|
||||
assert set(results) == {0, 1}
|
||||
|
||||
# Make sure that when one actor goes out of scope a new actor is
|
||||
# created because some resources have been freed up.
|
||||
results = []
|
||||
for _ in range(3):
|
||||
actor = ResourceActor2.remote()
|
||||
object_id = actor.method.remote()
|
||||
results.append(object_id)
|
||||
# Wait for the task to execute. We do this because otherwise it may
|
||||
# be possible for the __ray_terminate__ task to execute before the
|
||||
# method.
|
||||
ray.wait([object_id])
|
||||
|
||||
ray.get(results)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
File diff suppressed because it is too large
Load Diff
@@ -10,7 +10,7 @@ import sys
|
||||
import ray
|
||||
import ray.experimental.array.remote as ra
|
||||
import ray.experimental.array.distributed as da
|
||||
import ray.tests.cluster_utils
|
||||
import ray.cluster_utils
|
||||
|
||||
if sys.version_info >= (3, 0):
|
||||
from importlib import reload
|
||||
@@ -216,3 +216,9 @@ def test_distributed_array_methods(ray_start_cluster_2_nodes, reload_modules):
|
||||
d1 = np.random.randint(1, 35)
|
||||
d2 = np.random.randint(1, 35)
|
||||
test_dist_qr(d1, d2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -17,7 +17,7 @@ from ray.autoscaler.autoscaler import StandardAutoscaler, LoadMetrics, \
|
||||
from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_NODE_STATUS, \
|
||||
STATUS_UP_TO_DATE, STATUS_UPDATE_FAILED
|
||||
from ray.autoscaler.node_provider import NODE_PROVIDERS, NodeProvider
|
||||
from ray.tests.utils import RayTestTimeoutException
|
||||
from ray.test_utils import RayTestTimeoutException
|
||||
import pytest
|
||||
|
||||
|
||||
@@ -1084,4 +1084,5 @@ class AutoscalingTest(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -7,7 +7,7 @@ import unittest
|
||||
import yaml
|
||||
|
||||
from ray.autoscaler.autoscaler import fillout_defaults, validate_config
|
||||
from ray.tests.utils import recursive_fnmatch
|
||||
from ray.test_utils import recursive_fnmatch
|
||||
|
||||
RAY_PATH = os.path.abspath(os.path.join(__file__, "../../"))
|
||||
CONFIG_PATHS = recursive_fnmatch(
|
||||
@@ -31,4 +31,6 @@ class AutoscalingConfigTest(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
+6
-2208
File diff suppressed because it is too large
Load Diff
@@ -1,56 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.experimental.streaming.batched_queue import BatchedQueue
|
||||
|
||||
|
||||
@ray.remote
|
||||
class Reader(object):
|
||||
def __init__(self, queue):
|
||||
self.queue = queue
|
||||
self.num_reads = 0
|
||||
self.start = time.time()
|
||||
|
||||
def read(self, read_slowly):
|
||||
expected_value = 0
|
||||
for _ in range(1000):
|
||||
x = self.queue.read_next()
|
||||
assert x == expected_value, (x, expected_value)
|
||||
expected_value += 1
|
||||
self.num_reads += 1
|
||||
if read_slowly:
|
||||
time.sleep(0.001)
|
||||
|
||||
|
||||
def test_batched_queue(ray_start_regular):
|
||||
# Batched queue parameters
|
||||
max_queue_size = 10000 # Max number of batches in queue
|
||||
max_batch_size = 1000 # Max number of elements per batch
|
||||
batch_timeout = 0.001 # 1ms flush timeout
|
||||
prefetch_depth = 10 # Number of batches to prefetch from plasma
|
||||
background_flush = False # Don't use daemon thread for flushing
|
||||
# Two tests: one with a big queue and slow reader, and
|
||||
# a second one with a small queue and a faster reader
|
||||
for read_slowly in [True, False]:
|
||||
# Construct the batched queue
|
||||
queue = BatchedQueue(
|
||||
max_size=max_queue_size,
|
||||
max_batch_size=max_batch_size,
|
||||
max_batch_time=batch_timeout,
|
||||
prefetch_depth=prefetch_depth,
|
||||
background_flush=background_flush)
|
||||
# Create and start the reader
|
||||
reader = Reader.remote(queue)
|
||||
object_id = reader.read.remote(read_slowly=read_slowly)
|
||||
value = 0
|
||||
for _ in range(1000):
|
||||
queue.put_next(value)
|
||||
value += 1
|
||||
queue._flush_writes()
|
||||
ray.get(object_id)
|
||||
# Test once more with a very small queue size and a faster reader
|
||||
max_queue_size = 10
|
||||
@@ -13,9 +13,9 @@ import pytest
|
||||
|
||||
import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
from ray.tests.cluster_utils import Cluster
|
||||
from ray.tests.utils import (run_string_as_driver_nonblocking,
|
||||
RayTestTimeoutException)
|
||||
from ray.cluster_utils import Cluster
|
||||
from ray.test_utils import (run_string_as_driver_nonblocking,
|
||||
RayTestTimeoutException)
|
||||
|
||||
|
||||
# This test checks that when a worker dies in the middle of a get, the plasma
|
||||
@@ -441,3 +441,8 @@ def test_driver_lives_parallel(ray_start_regular):
|
||||
process_info.process.wait()
|
||||
|
||||
# If the driver can reach the tearDown method, then it is still alive.
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import os
|
||||
import unittest
|
||||
|
||||
import redis
|
||||
|
||||
import ray
|
||||
|
||||
|
||||
def parse_client(addr_port_str):
|
||||
address, redis_port = addr_port_str.split(":")
|
||||
return redis.StrictRedis(host=address, port=redis_port)
|
||||
|
||||
|
||||
@unittest.skipIf(not os.environ.get("RAY_USE_NEW_GCS", False),
|
||||
"Tests functionality of the new GCS.")
|
||||
class CredisTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.config = ray.init(num_cpus=0)
|
||||
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
|
||||
def test_credis_started(self):
|
||||
assert "redis_address" in self.config
|
||||
primary = parse_client(self.config["redis_address"])
|
||||
assert primary.ping() is True
|
||||
member = primary.lrange("RedisShards", 0, -1)[0]
|
||||
shard = parse_client(member.decode())
|
||||
|
||||
# Check that primary has loaded credis's master module.
|
||||
chain = primary.execute_command("MASTER.GET_CHAIN")
|
||||
assert len(chain) == 1
|
||||
|
||||
# Check that the shard has loaded credis' member module.
|
||||
assert chain[0] == member
|
||||
assert shard.execute_command("MEMBER.SN") == -1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
@@ -54,4 +54,6 @@ class CythonTest(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -47,3 +47,12 @@ def test_raylet_gdb(ray_gdb_start):
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
assert pgrep_command.communicate()[0]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
# Make subprocess happy in bazel.
|
||||
os.environ["LC_ALL"] = "en_US.UTF-8"
|
||||
os.environ["LANG"] = "en_US.UTF-8"
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -6,8 +6,8 @@ import logging
|
||||
import time
|
||||
|
||||
import ray
|
||||
import ray.tests.cluster_utils
|
||||
import ray.tests.utils
|
||||
import ray.cluster_utils
|
||||
import ray.test_utils
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -605,3 +605,9 @@ def test_release_cpus_when_actor_creation_task_blocking(shutdown_only):
|
||||
|
||||
result = wait_until(assert_available_resources, 1000)
|
||||
assert result is True
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -15,8 +15,8 @@ import redis
|
||||
|
||||
import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
from ray.tests.cluster_utils import Cluster
|
||||
from ray.tests.utils import (
|
||||
from ray.cluster_utils import Cluster
|
||||
from ray.test_utils import (
|
||||
relevant_errors,
|
||||
wait_for_errors,
|
||||
RayTestTimeoutException,
|
||||
@@ -903,3 +903,8 @@ def test_direct_call_serialized_id(ray_start_cluster):
|
||||
|
||||
obj = small_object.remote()
|
||||
ray.get(get.remote([obj]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -10,8 +10,8 @@ import logging
|
||||
import pytest
|
||||
|
||||
import ray
|
||||
import ray.tests.cluster_utils
|
||||
import ray.tests.utils
|
||||
import ray.cluster_utils
|
||||
import ray.test_utils
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -78,3 +78,9 @@ def test_pending_task_dependency(shutdown_only):
|
||||
ray.put(np_array)
|
||||
|
||||
ray.get(oid)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -78,3 +78,9 @@ def test_add_remove_cluster_resources(ray_start_cluster_head):
|
||||
nodes += [cluster.add_node(num_cpus=1)]
|
||||
cluster.wait_for_nodes()
|
||||
assert ray.cluster_resources()["CPU"] == 6
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -202,3 +202,9 @@ def test_channel_generation():
|
||||
def test_wordcount():
|
||||
"""Tests a simple streaming wordcount."""
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -86,4 +86,6 @@ class TestMemoryLimits(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -152,4 +152,6 @@ class TestMemoryScheduling(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -10,7 +10,7 @@ import time
|
||||
import ray
|
||||
from ray.core.generated import node_manager_pb2
|
||||
from ray.core.generated import node_manager_pb2_grpc
|
||||
from ray.tests.utils import RayTestTimeoutException
|
||||
from ray.test_utils import RayTestTimeoutException
|
||||
|
||||
|
||||
def test_worker_stats(ray_start_regular):
|
||||
@@ -53,5 +53,13 @@ def test_worker_stats(ray_start_regular):
|
||||
if p.info["pid"] in pids
|
||||
]
|
||||
for process in processes:
|
||||
assert "python" in process or "ray" in process
|
||||
# TODO(ekl) why does travis/mi end up in the process list
|
||||
assert ("python" in process or "ray" in process
|
||||
or "travis" in process)
|
||||
break
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -2,7 +2,6 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import pytest
|
||||
import time
|
||||
import numpy as np
|
||||
@@ -104,9 +103,11 @@ def test_cache(ray_start_regular):
|
||||
d = time.time() - c
|
||||
|
||||
if d > 1.5 * b:
|
||||
if os.getenv("TRAVIS") is None:
|
||||
raise Exception("The caching test was too slow. "
|
||||
"d = {}, b = {}".format(d, b))
|
||||
else:
|
||||
print("WARNING: The caching test was too slow. "
|
||||
"d = {}, b = {}".format(d, b))
|
||||
print("WARNING: The caching test was too slow. "
|
||||
"d = {}, b = {}".format(d, b))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -107,3 +107,12 @@ def test_cleanup_on_driver_exit_single_redis_shard():
|
||||
def test_cleanup_on_driver_exit_many_redis_shards():
|
||||
_test_cleanup_on_driver_exit(num_redis_shards=5)
|
||||
_test_cleanup_on_driver_exit(num_redis_shards=31)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
# Make subprocess happy in bazel.
|
||||
os.environ["LC_ALL"] = "en_US.UTF-8"
|
||||
os.environ["LANG"] = "en_US.UTF-8"
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -8,7 +8,7 @@ import subprocess
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.tests.utils import (
|
||||
from ray.test_utils import (
|
||||
RayTestTimeoutException,
|
||||
run_string_as_driver,
|
||||
run_string_as_driver_nonblocking,
|
||||
@@ -615,3 +615,12 @@ def test_use_pickle(call_ray_start):
|
||||
return (3, "world")
|
||||
|
||||
assert ray.get(f.remote(x)) == (3, "world")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
# Make subprocess happy in bazel.
|
||||
os.environ["LC_ALL"] = "en_US.UTF-8"
|
||||
os.environ["LANG"] = "en_US.UTF-8"
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -9,8 +9,8 @@ import time
|
||||
import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
from ray.monitor import Monitor
|
||||
from ray.tests.cluster_utils import Cluster
|
||||
from ray.tests.conftest import generate_internal_config_map
|
||||
from ray.cluster_utils import Cluster
|
||||
from ray.test_utils import generate_internal_config_map
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -221,3 +221,9 @@ def test_worker_plasma_store_failure(ray_start_cluster_head):
|
||||
worker.kill_plasma_store()
|
||||
worker.all_processes[ray_constants.PROCESS_TYPE_RAYLET][0].process.wait()
|
||||
assert not worker.any_processes_alive(), worker.live_processes()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -0,0 +1,273 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import ray
|
||||
import ray.ray_constants as ray_constants
|
||||
from ray.cluster_utils import Cluster
|
||||
from ray.test_utils import RayTestTimeoutException
|
||||
|
||||
|
||||
@pytest.fixture(params=[(1, 4), (4, 4)])
|
||||
def ray_start_workers_separate_multinode(request):
|
||||
num_nodes = request.param[0]
|
||||
num_initial_workers = request.param[1]
|
||||
# Start the Ray processes.
|
||||
cluster = Cluster()
|
||||
for _ in range(num_nodes):
|
||||
cluster.add_node(num_cpus=num_initial_workers)
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
yield num_nodes, num_initial_workers
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
cluster.shutdown()
|
||||
|
||||
|
||||
def test_worker_failed(ray_start_workers_separate_multinode):
|
||||
num_nodes, num_initial_workers = (ray_start_workers_separate_multinode)
|
||||
|
||||
@ray.remote
|
||||
def get_pids():
|
||||
time.sleep(0.25)
|
||||
return os.getpid()
|
||||
|
||||
start_time = time.time()
|
||||
pids = set()
|
||||
while len(pids) < num_nodes * num_initial_workers:
|
||||
new_pids = ray.get([
|
||||
get_pids.remote()
|
||||
for _ in range(2 * num_nodes * num_initial_workers)
|
||||
])
|
||||
for pid in new_pids:
|
||||
pids.add(pid)
|
||||
if time.time() - start_time > 60:
|
||||
raise RayTestTimeoutException(
|
||||
"Timed out while waiting to get worker PIDs.")
|
||||
|
||||
@ray.remote
|
||||
def f(x):
|
||||
time.sleep(0.5)
|
||||
return x
|
||||
|
||||
# Submit more tasks than there are workers so that all workers and
|
||||
# cores are utilized.
|
||||
object_ids = [f.remote(i) for i in range(num_initial_workers * num_nodes)]
|
||||
object_ids += [f.remote(object_id) for object_id in object_ids]
|
||||
# Allow the tasks some time to begin executing.
|
||||
time.sleep(0.1)
|
||||
# Kill the workers as the tasks execute.
|
||||
for pid in pids:
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
time.sleep(0.1)
|
||||
# Make sure that we either get the object or we get an appropriate
|
||||
# exception.
|
||||
for object_id in object_ids:
|
||||
try:
|
||||
ray.get(object_id)
|
||||
except (ray.exceptions.RayTaskError, ray.exceptions.RayWorkerError):
|
||||
pass
|
||||
|
||||
|
||||
def _test_component_failed(cluster, component_type):
|
||||
"""Kill a component on all worker nodes and check workload succeeds."""
|
||||
# Submit many tasks with many dependencies.
|
||||
@ray.remote
|
||||
def f(x):
|
||||
return x
|
||||
|
||||
@ray.remote
|
||||
def g(*xs):
|
||||
return 1
|
||||
|
||||
# Kill the component on all nodes except the head node as the tasks
|
||||
# execute. Do this in a loop while submitting tasks between each
|
||||
# component failure.
|
||||
time.sleep(0.1)
|
||||
worker_nodes = cluster.list_all_nodes()[1:]
|
||||
assert len(worker_nodes) > 0
|
||||
for node in worker_nodes:
|
||||
process = node.all_processes[component_type][0].process
|
||||
# Submit a round of tasks with many dependencies.
|
||||
x = 1
|
||||
for _ in range(1000):
|
||||
x = f.remote(x)
|
||||
|
||||
xs = [g.remote(1)]
|
||||
for _ in range(100):
|
||||
xs.append(g.remote(*xs))
|
||||
xs.append(g.remote(1))
|
||||
|
||||
# Kill a component on one of the nodes.
|
||||
process.terminate()
|
||||
time.sleep(1)
|
||||
process.kill()
|
||||
process.wait()
|
||||
assert not process.poll() is None
|
||||
|
||||
# Make sure that we can still get the objects after the
|
||||
# executing tasks died.
|
||||
ray.get(x)
|
||||
ray.get(xs)
|
||||
|
||||
|
||||
def check_components_alive(cluster, component_type, check_component_alive):
|
||||
"""Check that a given component type is alive on all worker nodes."""
|
||||
worker_nodes = cluster.list_all_nodes()[1:]
|
||||
assert len(worker_nodes) > 0
|
||||
for node in worker_nodes:
|
||||
process = node.all_processes[component_type][0].process
|
||||
if check_component_alive:
|
||||
assert process.poll() is None
|
||||
else:
|
||||
print("waiting for " + component_type + " with PID " +
|
||||
str(process.pid) + "to terminate")
|
||||
process.wait()
|
||||
print("done waiting for " + component_type + " with PID " +
|
||||
str(process.pid) + "to terminate")
|
||||
assert not process.poll() is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster", [{
|
||||
"num_cpus": 8,
|
||||
"num_nodes": 4,
|
||||
"_internal_config": json.dumps({
|
||||
"num_heartbeats_timeout": 100
|
||||
}),
|
||||
}],
|
||||
indirect=True)
|
||||
def test_raylet_failed(ray_start_cluster):
|
||||
cluster = ray_start_cluster
|
||||
# Kill all raylets on worker nodes.
|
||||
_test_component_failed(cluster, ray_constants.PROCESS_TYPE_RAYLET)
|
||||
|
||||
# The plasma stores should still be alive on the worker nodes.
|
||||
check_components_alive(cluster, ray_constants.PROCESS_TYPE_PLASMA_STORE,
|
||||
True)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Hanging with new GCS API.")
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster", [{
|
||||
"num_cpus": 8,
|
||||
"num_nodes": 2,
|
||||
"_internal_config": json.dumps({
|
||||
"num_heartbeats_timeout": 100
|
||||
}),
|
||||
}],
|
||||
indirect=True)
|
||||
def test_plasma_store_failed(ray_start_cluster):
|
||||
cluster = ray_start_cluster
|
||||
# Kill all plasma stores on worker nodes.
|
||||
_test_component_failed(cluster, ray_constants.PROCESS_TYPE_PLASMA_STORE)
|
||||
|
||||
# No processes should be left alive on the worker nodes.
|
||||
check_components_alive(cluster, ray_constants.PROCESS_TYPE_PLASMA_STORE,
|
||||
False)
|
||||
check_components_alive(cluster, ray_constants.PROCESS_TYPE_RAYLET, False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_cluster", [{
|
||||
"num_cpus": 4,
|
||||
"num_nodes": 3,
|
||||
"do_init": True
|
||||
}],
|
||||
indirect=True)
|
||||
def test_actor_creation_node_failure(ray_start_cluster):
|
||||
# TODO(swang): Refactor test_raylet_failed, etc to reuse the below code.
|
||||
cluster = ray_start_cluster
|
||||
|
||||
@ray.remote
|
||||
class Child(object):
|
||||
def __init__(self, death_probability):
|
||||
self.death_probability = death_probability
|
||||
|
||||
def ping(self):
|
||||
# Exit process with some probability.
|
||||
exit_chance = np.random.rand()
|
||||
if exit_chance < self.death_probability:
|
||||
sys.exit(-1)
|
||||
|
||||
num_children = 50
|
||||
# Children actors will die about half the time.
|
||||
death_probability = 0.5
|
||||
|
||||
children = [Child.remote(death_probability) for _ in range(num_children)]
|
||||
while len(cluster.list_all_nodes()) > 1:
|
||||
for j in range(2):
|
||||
# Submit some tasks on the actors. About half of the actors will
|
||||
# fail.
|
||||
children_out = [child.ping.remote() for child in children]
|
||||
# Wait a while for all the tasks to complete. This should trigger
|
||||
# reconstruction for any actor creation tasks that were forwarded
|
||||
# to nodes that then failed.
|
||||
ready, _ = ray.wait(
|
||||
children_out, num_returns=len(children_out), timeout=5 * 60.0)
|
||||
assert len(ready) == len(children_out)
|
||||
|
||||
# Replace any actors that died.
|
||||
for i, out in enumerate(children_out):
|
||||
try:
|
||||
ray.get(out)
|
||||
except ray.exceptions.RayActorError:
|
||||
children[i] = Child.remote(death_probability)
|
||||
# Remove a node. Any actor creation tasks that were forwarded to this
|
||||
# node must be reconstructed.
|
||||
cluster.remove_node(cluster.list_all_nodes()[-1])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Hanging with new GCS API.")
|
||||
def test_driver_lives_sequential(ray_start_regular):
|
||||
ray.worker._global_node.kill_raylet()
|
||||
ray.worker._global_node.kill_plasma_store()
|
||||
ray.worker._global_node.kill_log_monitor()
|
||||
ray.worker._global_node.kill_monitor()
|
||||
ray.worker._global_node.kill_raylet_monitor()
|
||||
|
||||
# If the driver can reach the tearDown method, then it is still alive.
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Hanging with new GCS API.")
|
||||
def test_driver_lives_parallel(ray_start_regular):
|
||||
all_processes = ray.worker._global_node.all_processes
|
||||
process_infos = (all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_RAYLET] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_MONITOR] +
|
||||
all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR])
|
||||
assert len(process_infos) == 5
|
||||
|
||||
# Kill all the components in parallel.
|
||||
for process_info in process_infos:
|
||||
process_info.process.terminate()
|
||||
|
||||
time.sleep(0.1)
|
||||
for process_info in process_infos:
|
||||
process_info.process.kill()
|
||||
|
||||
for process_info in process_infos:
|
||||
process_info.process.wait()
|
||||
|
||||
# If the driver can reach the tearDown method, then it is still alive.
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
@@ -3,7 +3,7 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import ray
|
||||
from ray.tests.utils import run_string_as_driver
|
||||
from ray.test_utils import run_string_as_driver
|
||||
|
||||
|
||||
# This tests the queue transitions for infeasible tasks. This has been an issue
|
||||
@@ -48,3 +48,9 @@ f.remote()
|
||||
ray.get([
|
||||
f._submit(args=[], kwargs={}, resources={str(i): 1}) for i in range(3)
|
||||
])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -11,7 +11,7 @@ import time
|
||||
import warnings
|
||||
|
||||
import ray
|
||||
from ray.tests.cluster_utils import Cluster
|
||||
from ray.cluster_utils import Cluster
|
||||
|
||||
# TODO(yuhguo): This test file requires a lot of CPU/memory, and
|
||||
# better be put in Jenkins. However, it fails frequently in Jenkins, but
|
||||
@@ -325,3 +325,9 @@ def test_many_small_transfers(ray_start_cluster_with_resource):
|
||||
do_transfers()
|
||||
do_transfers()
|
||||
do_transfers()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -237,3 +237,11 @@ def test_session_create_multiple():
|
||||
"session-tests/commands-test", session_start,
|
||||
["first", "--a", "*", "--b", "*"])
|
||||
assert result.exit_code == 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
# Make subprocess happy in bazel.
|
||||
os.environ["LC_ALL"] = "en_US.UTF-8"
|
||||
os.environ["LANG"] = "en_US.UTF-8"
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -119,3 +119,8 @@ def test_queue(ray_start_regular):
|
||||
assert q.get() == item
|
||||
size -= 1
|
||||
assert q.qsize() == size
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -7,7 +7,7 @@ import pytest
|
||||
import redis
|
||||
|
||||
import ray
|
||||
from ray.tests.cluster_utils import Cluster
|
||||
from ray.cluster_utils import Cluster
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -61,3 +61,9 @@ class TestRedisPassword(object):
|
||||
|
||||
object_id = f.remote()
|
||||
ray.get(object_id)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -387,3 +387,9 @@ def test_small_receive_timeout(ray_start_regular):
|
||||
result_list = ray.experimental.signal.receive([a], timeout=small_timeout)
|
||||
|
||||
assert len(result_list) == 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -2,39 +2,12 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import pytest
|
||||
import sys
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.tests.cluster_utils import Cluster
|
||||
from ray.tests.utils import flat_errors
|
||||
import ray.ray_constants as ray_constants
|
||||
|
||||
|
||||
@pytest.fixture(params=[1, 4])
|
||||
def ray_start_sharded(request):
|
||||
num_redis_shards = request.param
|
||||
|
||||
if os.environ.get("RAY_USE_NEW_GCS") == "on":
|
||||
num_redis_shards = 1
|
||||
# For now, RAY_USE_NEW_GCS supports 1 shard, and credis supports
|
||||
# 1-node chain for that shard only.
|
||||
|
||||
# Start the Ray processes.
|
||||
ray.init(
|
||||
object_store_memory=int(0.5 * 10**9),
|
||||
num_cpus=10,
|
||||
num_redis_shards=num_redis_shards,
|
||||
redis_max_memory=10**7)
|
||||
|
||||
yield None
|
||||
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
from ray.cluster_utils import Cluster
|
||||
|
||||
|
||||
@pytest.fixture(params=[(1, 4), (4, 4)])
|
||||
@@ -105,71 +78,6 @@ def test_dependencies(ray_start_combination):
|
||||
assert cluster.remaining_processes_alive()
|
||||
|
||||
|
||||
def test_submitting_many_tasks(ray_start_sharded):
|
||||
@ray.remote
|
||||
def f(x):
|
||||
return 1
|
||||
|
||||
def g(n):
|
||||
x = 1
|
||||
for i in range(n):
|
||||
x = f.remote(x)
|
||||
return x
|
||||
|
||||
ray.get([g(1000) for _ in range(100)])
|
||||
assert ray.services.remaining_processes_alive()
|
||||
|
||||
|
||||
def test_submitting_many_actors_to_one(ray_start_sharded):
|
||||
@ray.remote
|
||||
class Actor(object):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def ping(self):
|
||||
return
|
||||
|
||||
@ray.remote
|
||||
class Worker(object):
|
||||
def __init__(self, actor):
|
||||
self.actor = actor
|
||||
|
||||
def ping(self):
|
||||
return ray.get(self.actor.ping.remote())
|
||||
|
||||
a = Actor.remote()
|
||||
workers = [Worker.remote(a) for _ in range(10)]
|
||||
for _ in range(10):
|
||||
out = ray.get([w.ping.remote() for w in workers])
|
||||
assert out == [None for _ in workers]
|
||||
|
||||
|
||||
def test_getting_and_putting(ray_start_sharded):
|
||||
for n in range(8):
|
||||
x = np.zeros(10**n)
|
||||
|
||||
for _ in range(100):
|
||||
ray.put(x)
|
||||
|
||||
x_id = ray.put(x)
|
||||
for _ in range(1000):
|
||||
ray.get(x_id)
|
||||
|
||||
assert ray.services.remaining_processes_alive()
|
||||
|
||||
|
||||
def test_getting_many_objects(ray_start_sharded):
|
||||
@ray.remote
|
||||
def f():
|
||||
return 1
|
||||
|
||||
n = 10**4 # TODO(pcm): replace by 10 ** 5 once this is faster.
|
||||
lst = ray.get([f.remote() for _ in range(n)])
|
||||
assert lst == n * [1]
|
||||
|
||||
assert ray.services.remaining_processes_alive()
|
||||
|
||||
|
||||
def test_wait(ray_start_combination):
|
||||
num_nodes, num_workers_per_scheduler, cluster = ray_start_combination
|
||||
num_workers = num_nodes * num_workers_per_scheduler
|
||||
@@ -197,360 +105,7 @@ def test_wait(ray_start_combination):
|
||||
assert cluster.remaining_processes_alive()
|
||||
|
||||
|
||||
@pytest.fixture(params=[1, 4])
|
||||
def ray_start_reconstruction(request):
|
||||
num_nodes = request.param
|
||||
|
||||
plasma_store_memory = int(0.5 * 10**9)
|
||||
|
||||
cluster = Cluster(
|
||||
initialize_head=True,
|
||||
head_node_args={
|
||||
"num_cpus": 1,
|
||||
"object_store_memory": plasma_store_memory // num_nodes,
|
||||
"redis_max_memory": 10**7,
|
||||
"_internal_config": json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200
|
||||
})
|
||||
})
|
||||
for i in range(num_nodes - 1):
|
||||
cluster.add_node(
|
||||
num_cpus=1,
|
||||
object_store_memory=plasma_store_memory // num_nodes,
|
||||
_internal_config=json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200
|
||||
}))
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
yield plasma_store_memory, num_nodes, cluster
|
||||
|
||||
# Clean up the Ray cluster.
|
||||
ray.shutdown()
|
||||
cluster.shutdown()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Failing with new GCS API on Linux.")
|
||||
def test_simple(ray_start_reconstruction):
|
||||
plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
|
||||
# Define the size of one task's return argument so that the combined
|
||||
# sum of all objects' sizes is at least twice the plasma stores'
|
||||
# combined allotted memory.
|
||||
num_objects = 100
|
||||
size = int(plasma_store_memory * 1.5 / (num_objects * 8))
|
||||
|
||||
# Define a remote task with no dependencies, which returns a numpy
|
||||
# array of the given size.
|
||||
@ray.remote
|
||||
def foo(i, size):
|
||||
array = np.zeros(size)
|
||||
array[0] = i
|
||||
return array
|
||||
|
||||
# Launch num_objects instances of the remote task.
|
||||
args = []
|
||||
for i in range(num_objects):
|
||||
args.append(foo.remote(i, size))
|
||||
|
||||
# Get each value to force each task to finish. After some number of
|
||||
# gets, old values should be evicted.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get each value again to force reconstruction.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get values sequentially, in chunks.
|
||||
num_chunks = 4 * num_nodes
|
||||
chunk = num_objects // num_chunks
|
||||
for i in range(num_chunks):
|
||||
values = ray.get(args[i * chunk:(i + 1) * chunk])
|
||||
del values
|
||||
|
||||
assert cluster.remaining_processes_alive()
|
||||
|
||||
|
||||
def sorted_random_indexes(total, output_num):
|
||||
random_indexes = [np.random.randint(total) for _ in range(output_num)]
|
||||
random_indexes.sort()
|
||||
return random_indexes
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Failing with new GCS API on Linux.")
|
||||
def test_recursive(ray_start_reconstruction):
|
||||
plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
|
||||
# Define the size of one task's return argument so that the combined
|
||||
# sum of all objects' sizes is at least twice the plasma stores'
|
||||
# combined allotted memory.
|
||||
num_objects = 100
|
||||
size = int(plasma_store_memory * 1.5 / (num_objects * 8))
|
||||
|
||||
# Define a root task with no dependencies, which returns a numpy array
|
||||
# of the given size.
|
||||
@ray.remote
|
||||
def no_dependency_task(size):
|
||||
array = np.zeros(size)
|
||||
return array
|
||||
|
||||
# Define a task with a single dependency, which returns its one
|
||||
# argument.
|
||||
@ray.remote
|
||||
def single_dependency(i, arg):
|
||||
arg = np.copy(arg)
|
||||
arg[0] = i
|
||||
return arg
|
||||
|
||||
# Launch num_objects instances of the remote task, each dependent on
|
||||
# the one before it.
|
||||
arg = no_dependency_task.remote(size)
|
||||
args = []
|
||||
for i in range(num_objects):
|
||||
arg = single_dependency.remote(i, arg)
|
||||
args.append(arg)
|
||||
|
||||
# Get each value to force each task to finish. After some number of
|
||||
# gets, old values should be evicted.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get each value again to force reconstruction.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get 10 values randomly.
|
||||
random_indexes = sorted_random_indexes(num_objects, 10)
|
||||
for i in random_indexes:
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get values sequentially, in chunks.
|
||||
num_chunks = 4 * num_nodes
|
||||
chunk = num_objects // num_chunks
|
||||
for i in range(num_chunks):
|
||||
values = ray.get(args[i * chunk:(i + 1) * chunk])
|
||||
del values
|
||||
|
||||
assert cluster.remaining_processes_alive()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This test often hangs or fails in CI.")
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Failing with new GCS API on Linux.")
|
||||
def test_multiple_recursive(ray_start_reconstruction):
|
||||
plasma_store_memory, _, cluster = ray_start_reconstruction
|
||||
# Define the size of one task's return argument so that the combined
|
||||
# sum of all objects' sizes is at least twice the plasma stores'
|
||||
# combined allotted memory.
|
||||
num_objects = 100
|
||||
size = plasma_store_memory * 2 // (num_objects * 8)
|
||||
|
||||
# Define a root task with no dependencies, which returns a numpy array
|
||||
# of the given size.
|
||||
@ray.remote
|
||||
def no_dependency_task(size):
|
||||
array = np.zeros(size)
|
||||
return array
|
||||
|
||||
# Define a task with multiple dependencies, which returns its first
|
||||
# argument.
|
||||
@ray.remote
|
||||
def multiple_dependency(i, arg1, arg2, arg3):
|
||||
arg1 = np.copy(arg1)
|
||||
arg1[0] = i
|
||||
return arg1
|
||||
|
||||
# Launch num_args instances of the root task. Then launch num_objects
|
||||
# instances of the multi-dependency remote task, each dependent on the
|
||||
# num_args tasks before it.
|
||||
num_args = 3
|
||||
args = []
|
||||
for i in range(num_args):
|
||||
arg = no_dependency_task.remote(size)
|
||||
args.append(arg)
|
||||
for i in range(num_objects):
|
||||
args.append(multiple_dependency.remote(i, *args[i:i + num_args]))
|
||||
|
||||
# Get each value to force each task to finish. After some number of
|
||||
# gets, old values should be evicted.
|
||||
args = args[num_args:]
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get each value again to force reconstruction.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get 10 values randomly.
|
||||
random_indexes = sorted_random_indexes(num_objects, 10)
|
||||
for i in random_indexes:
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
|
||||
assert cluster.remaining_processes_alive()
|
||||
|
||||
|
||||
def wait_for_errors(error_check):
|
||||
# Wait for errors from all the nondeterministic tasks.
|
||||
errors = []
|
||||
time_left = 100
|
||||
while time_left > 0:
|
||||
errors = flat_errors()
|
||||
if error_check(errors):
|
||||
break
|
||||
time_left -= 1
|
||||
time.sleep(1)
|
||||
|
||||
# Make sure that enough errors came through.
|
||||
assert error_check(errors)
|
||||
return errors
|
||||
|
||||
|
||||
@pytest.mark.skip("This test does not work yet.")
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Failing with new GCS API on Linux.")
|
||||
def test_nondeterministic_task(ray_start_reconstruction):
|
||||
plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
|
||||
# Define the size of one task's return argument so that the combined
|
||||
# sum of all objects' sizes is at least twice the plasma stores'
|
||||
# combined allotted memory.
|
||||
num_objects = 1000
|
||||
size = plasma_store_memory * 2 // (num_objects * 8)
|
||||
|
||||
# Define a nondeterministic remote task with no dependencies, which
|
||||
# returns a random numpy array of the given size. This task should
|
||||
# produce an error on the driver if it is ever reexecuted.
|
||||
@ray.remote
|
||||
def foo(i, size):
|
||||
array = np.random.rand(size)
|
||||
array[0] = i
|
||||
return array
|
||||
|
||||
# Define a deterministic remote task with no dependencies, which
|
||||
# returns a numpy array of zeros of the given size.
|
||||
@ray.remote
|
||||
def bar(i, size):
|
||||
array = np.zeros(size)
|
||||
array[0] = i
|
||||
return array
|
||||
|
||||
# Launch num_objects instances, half deterministic and half
|
||||
# nondeterministic.
|
||||
args = []
|
||||
for i in range(num_objects):
|
||||
if i % 2 == 0:
|
||||
args.append(foo.remote(i, size))
|
||||
else:
|
||||
args.append(bar.remote(i, size))
|
||||
|
||||
# Get each value to force each task to finish. After some number of
|
||||
# gets, old values should be evicted.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get each value again to force reconstruction.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
|
||||
def error_check(errors):
|
||||
if num_nodes == 1:
|
||||
# In a single-node setting, each object is evicted and
|
||||
# reconstructed exactly once, so exactly half the objects will
|
||||
# produce an error during reconstruction.
|
||||
min_errors = num_objects // 2
|
||||
else:
|
||||
# In a multinode setting, each object is evicted zero or one
|
||||
# times, so some of the nondeterministic tasks may not be
|
||||
# reexecuted.
|
||||
min_errors = 1
|
||||
return len(errors) >= min_errors
|
||||
|
||||
errors = wait_for_errors(error_check)
|
||||
# Make sure all the errors have the correct type.
|
||||
assert all(error["type"] == ray_constants.HASH_MISMATCH_PUSH_ERROR
|
||||
for error in errors)
|
||||
|
||||
assert cluster.remaining_processes_alive()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Failing with new GCS API on Linux.")
|
||||
@pytest.mark.skipif(
|
||||
sys.version_info < (3, 0), reason="This test requires Python 3.")
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_object_store_memory", [10**9], indirect=True)
|
||||
def test_driver_put_errors(ray_start_object_store_memory):
|
||||
plasma_store_memory = ray_start_object_store_memory
|
||||
# Define the size of one task's return argument so that the combined
|
||||
# sum of all objects' sizes is at least twice the plasma stores'
|
||||
# combined allotted memory.
|
||||
num_objects = 100
|
||||
size = plasma_store_memory * 2 // (num_objects * 8)
|
||||
|
||||
# Define a task with a single dependency, a numpy array, that returns
|
||||
# another array.
|
||||
@ray.remote
|
||||
def single_dependency(i, arg):
|
||||
arg = np.copy(arg)
|
||||
arg[0] = i
|
||||
return arg
|
||||
|
||||
# Launch num_objects instances of the remote task, each dependent on
|
||||
# the one before it. The first instance of the task takes a numpy array
|
||||
# as an argument, which is put into the object store.
|
||||
args = []
|
||||
arg = single_dependency.remote(0, np.zeros(size))
|
||||
for i in range(num_objects):
|
||||
arg = single_dependency.remote(i, arg)
|
||||
args.append(arg)
|
||||
# Get each value to force each task to finish. After some number of
|
||||
# gets, old values should be evicted.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
|
||||
# Get each value starting from the beginning to force reconstruction.
|
||||
# Currently, since we're not able to reconstruct `ray.put` objects that
|
||||
# were evicted and whose originating tasks are still running, this
|
||||
# for-loop should hang on its first iteration and push an error to the
|
||||
# driver.
|
||||
ray.worker.global_worker.raylet_client.fetch_or_reconstruct([args[0]],
|
||||
False)
|
||||
|
||||
def error_check(errors):
|
||||
return len(errors) > 1
|
||||
|
||||
errors = wait_for_errors(error_check)
|
||||
assert all(error["type"] == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR
|
||||
or "ray.exceptions.UnreconstructableError" in error["message"]
|
||||
for error in errors)
|
||||
|
||||
|
||||
# NOTE(swang): This test tries to launch 1000 workers and breaks.
|
||||
# TODO(rkn): This test needs to be updated to use pytest.
|
||||
# class WorkerPoolTests(unittest.TestCase):
|
||||
#
|
||||
# def tearDown(self):
|
||||
# ray.shutdown()
|
||||
#
|
||||
# def testBlockingTasks(self):
|
||||
# @ray.remote
|
||||
# def f(i, j):
|
||||
# return (i, j)
|
||||
#
|
||||
# @ray.remote
|
||||
# def g(i):
|
||||
# # Each instance of g submits and blocks on the result of another remote
|
||||
# # task.
|
||||
# object_ids = [f.remote(i, j) for j in range(10)]
|
||||
# return ray.get(object_ids)
|
||||
#
|
||||
# ray.init(num_workers=1)
|
||||
# ray.get([g.remote(i) for i in range(1000)])
|
||||
# ray.shutdown()
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -0,0 +1,379 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import pytest
|
||||
import sys
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.cluster_utils import Cluster
|
||||
from ray.test_utils import flat_errors
|
||||
import ray.ray_constants as ray_constants
|
||||
|
||||
|
||||
@pytest.fixture(params=[1, 4])
|
||||
def ray_start_reconstruction(request):
|
||||
num_nodes = request.param
|
||||
|
||||
plasma_store_memory = int(0.5 * 10**9)
|
||||
|
||||
cluster = Cluster(
|
||||
initialize_head=True,
|
||||
head_node_args={
|
||||
"num_cpus": 1,
|
||||
"object_store_memory": plasma_store_memory // num_nodes,
|
||||
"redis_max_memory": 10**7,
|
||||
"_internal_config": json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200
|
||||
})
|
||||
})
|
||||
for i in range(num_nodes - 1):
|
||||
cluster.add_node(
|
||||
num_cpus=1,
|
||||
object_store_memory=plasma_store_memory // num_nodes,
|
||||
_internal_config=json.dumps({
|
||||
"initial_reconstruction_timeout_milliseconds": 200
|
||||
}))
|
||||
ray.init(address=cluster.address)
|
||||
|
||||
yield plasma_store_memory, num_nodes, cluster
|
||||
|
||||
# Clean up the Ray cluster.
|
||||
ray.shutdown()
|
||||
cluster.shutdown()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Failing with new GCS API on Linux.")
|
||||
def test_simple(ray_start_reconstruction):
|
||||
plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
|
||||
# Define the size of one task's return argument so that the combined
|
||||
# sum of all objects' sizes is at least twice the plasma stores'
|
||||
# combined allotted memory.
|
||||
num_objects = 100
|
||||
size = int(plasma_store_memory * 1.5 / (num_objects * 8))
|
||||
|
||||
# Define a remote task with no dependencies, which returns a numpy
|
||||
# array of the given size.
|
||||
@ray.remote
|
||||
def foo(i, size):
|
||||
array = np.zeros(size)
|
||||
array[0] = i
|
||||
return array
|
||||
|
||||
# Launch num_objects instances of the remote task.
|
||||
args = []
|
||||
for i in range(num_objects):
|
||||
args.append(foo.remote(i, size))
|
||||
|
||||
# Get each value to force each task to finish. After some number of
|
||||
# gets, old values should be evicted.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get each value again to force reconstruction.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get values sequentially, in chunks.
|
||||
num_chunks = 4 * num_nodes
|
||||
chunk = num_objects // num_chunks
|
||||
for i in range(num_chunks):
|
||||
values = ray.get(args[i * chunk:(i + 1) * chunk])
|
||||
del values
|
||||
|
||||
assert cluster.remaining_processes_alive()
|
||||
|
||||
|
||||
def sorted_random_indexes(total, output_num):
|
||||
random_indexes = [np.random.randint(total) for _ in range(output_num)]
|
||||
random_indexes.sort()
|
||||
return random_indexes
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Failing with new GCS API on Linux.")
|
||||
def test_recursive(ray_start_reconstruction):
|
||||
plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
|
||||
# Define the size of one task's return argument so that the combined
|
||||
# sum of all objects' sizes is at least twice the plasma stores'
|
||||
# combined allotted memory.
|
||||
num_objects = 100
|
||||
size = int(plasma_store_memory * 1.5 / (num_objects * 8))
|
||||
|
||||
# Define a root task with no dependencies, which returns a numpy array
|
||||
# of the given size.
|
||||
@ray.remote
|
||||
def no_dependency_task(size):
|
||||
array = np.zeros(size)
|
||||
return array
|
||||
|
||||
# Define a task with a single dependency, which returns its one
|
||||
# argument.
|
||||
@ray.remote
|
||||
def single_dependency(i, arg):
|
||||
arg = np.copy(arg)
|
||||
arg[0] = i
|
||||
return arg
|
||||
|
||||
# Launch num_objects instances of the remote task, each dependent on
|
||||
# the one before it.
|
||||
arg = no_dependency_task.remote(size)
|
||||
args = []
|
||||
for i in range(num_objects):
|
||||
arg = single_dependency.remote(i, arg)
|
||||
args.append(arg)
|
||||
|
||||
# Get each value to force each task to finish. After some number of
|
||||
# gets, old values should be evicted.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get each value again to force reconstruction.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get 10 values randomly.
|
||||
random_indexes = sorted_random_indexes(num_objects, 10)
|
||||
for i in random_indexes:
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get values sequentially, in chunks.
|
||||
num_chunks = 4 * num_nodes
|
||||
chunk = num_objects // num_chunks
|
||||
for i in range(num_chunks):
|
||||
values = ray.get(args[i * chunk:(i + 1) * chunk])
|
||||
del values
|
||||
|
||||
assert cluster.remaining_processes_alive()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This test often hangs or fails in CI.")
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Failing with new GCS API on Linux.")
|
||||
def test_multiple_recursive(ray_start_reconstruction):
|
||||
plasma_store_memory, _, cluster = ray_start_reconstruction
|
||||
# Define the size of one task's return argument so that the combined
|
||||
# sum of all objects' sizes is at least twice the plasma stores'
|
||||
# combined allotted memory.
|
||||
num_objects = 100
|
||||
size = plasma_store_memory * 2 // (num_objects * 8)
|
||||
|
||||
# Define a root task with no dependencies, which returns a numpy array
|
||||
# of the given size.
|
||||
@ray.remote
|
||||
def no_dependency_task(size):
|
||||
array = np.zeros(size)
|
||||
return array
|
||||
|
||||
# Define a task with multiple dependencies, which returns its first
|
||||
# argument.
|
||||
@ray.remote
|
||||
def multiple_dependency(i, arg1, arg2, arg3):
|
||||
arg1 = np.copy(arg1)
|
||||
arg1[0] = i
|
||||
return arg1
|
||||
|
||||
# Launch num_args instances of the root task. Then launch num_objects
|
||||
# instances of the multi-dependency remote task, each dependent on the
|
||||
# num_args tasks before it.
|
||||
num_args = 3
|
||||
args = []
|
||||
for i in range(num_args):
|
||||
arg = no_dependency_task.remote(size)
|
||||
args.append(arg)
|
||||
for i in range(num_objects):
|
||||
args.append(multiple_dependency.remote(i, *args[i:i + num_args]))
|
||||
|
||||
# Get each value to force each task to finish. After some number of
|
||||
# gets, old values should be evicted.
|
||||
args = args[num_args:]
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get each value again to force reconstruction.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get 10 values randomly.
|
||||
random_indexes = sorted_random_indexes(num_objects, 10)
|
||||
for i in random_indexes:
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
|
||||
assert cluster.remaining_processes_alive()
|
||||
|
||||
|
||||
def wait_for_errors(error_check):
|
||||
# Wait for errors from all the nondeterministic tasks.
|
||||
errors = []
|
||||
time_left = 100
|
||||
while time_left > 0:
|
||||
errors = flat_errors()
|
||||
if error_check(errors):
|
||||
break
|
||||
time_left -= 1
|
||||
time.sleep(1)
|
||||
|
||||
# Make sure that enough errors came through.
|
||||
assert error_check(errors)
|
||||
return errors
|
||||
|
||||
|
||||
@pytest.mark.skip("This test does not work yet.")
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Failing with new GCS API on Linux.")
|
||||
def test_nondeterministic_task(ray_start_reconstruction):
|
||||
plasma_store_memory, num_nodes, cluster = ray_start_reconstruction
|
||||
# Define the size of one task's return argument so that the combined
|
||||
# sum of all objects' sizes is at least twice the plasma stores'
|
||||
# combined allotted memory.
|
||||
num_objects = 1000
|
||||
size = plasma_store_memory * 2 // (num_objects * 8)
|
||||
|
||||
# Define a nondeterministic remote task with no dependencies, which
|
||||
# returns a random numpy array of the given size. This task should
|
||||
# produce an error on the driver if it is ever reexecuted.
|
||||
@ray.remote
|
||||
def foo(i, size):
|
||||
array = np.random.rand(size)
|
||||
array[0] = i
|
||||
return array
|
||||
|
||||
# Define a deterministic remote task with no dependencies, which
|
||||
# returns a numpy array of zeros of the given size.
|
||||
@ray.remote
|
||||
def bar(i, size):
|
||||
array = np.zeros(size)
|
||||
array[0] = i
|
||||
return array
|
||||
|
||||
# Launch num_objects instances, half deterministic and half
|
||||
# nondeterministic.
|
||||
args = []
|
||||
for i in range(num_objects):
|
||||
if i % 2 == 0:
|
||||
args.append(foo.remote(i, size))
|
||||
else:
|
||||
args.append(bar.remote(i, size))
|
||||
|
||||
# Get each value to force each task to finish. After some number of
|
||||
# gets, old values should be evicted.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
# Get each value again to force reconstruction.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
|
||||
def error_check(errors):
|
||||
if num_nodes == 1:
|
||||
# In a single-node setting, each object is evicted and
|
||||
# reconstructed exactly once, so exactly half the objects will
|
||||
# produce an error during reconstruction.
|
||||
min_errors = num_objects // 2
|
||||
else:
|
||||
# In a multinode setting, each object is evicted zero or one
|
||||
# times, so some of the nondeterministic tasks may not be
|
||||
# reexecuted.
|
||||
min_errors = 1
|
||||
return len(errors) >= min_errors
|
||||
|
||||
errors = wait_for_errors(error_check)
|
||||
# Make sure all the errors have the correct type.
|
||||
assert all(error["type"] == ray_constants.HASH_MISMATCH_PUSH_ERROR
|
||||
for error in errors)
|
||||
|
||||
assert cluster.remaining_processes_alive()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
||||
reason="Failing with new GCS API on Linux.")
|
||||
@pytest.mark.skipif(
|
||||
sys.version_info < (3, 0), reason="This test requires Python 3.")
|
||||
@pytest.mark.parametrize(
|
||||
"ray_start_object_store_memory", [10**9], indirect=True)
|
||||
def test_driver_put_errors(ray_start_object_store_memory):
|
||||
plasma_store_memory = ray_start_object_store_memory
|
||||
# Define the size of one task's return argument so that the combined
|
||||
# sum of all objects' sizes is at least twice the plasma stores'
|
||||
# combined allotted memory.
|
||||
num_objects = 100
|
||||
size = plasma_store_memory * 2 // (num_objects * 8)
|
||||
|
||||
# Define a task with a single dependency, a numpy array, that returns
|
||||
# another array.
|
||||
@ray.remote
|
||||
def single_dependency(i, arg):
|
||||
arg = np.copy(arg)
|
||||
arg[0] = i
|
||||
return arg
|
||||
|
||||
# Launch num_objects instances of the remote task, each dependent on
|
||||
# the one before it. The first instance of the task takes a numpy array
|
||||
# as an argument, which is put into the object store.
|
||||
args = []
|
||||
arg = single_dependency.remote(0, np.zeros(size))
|
||||
for i in range(num_objects):
|
||||
arg = single_dependency.remote(i, arg)
|
||||
args.append(arg)
|
||||
# Get each value to force each task to finish. After some number of
|
||||
# gets, old values should be evicted.
|
||||
for i in range(num_objects):
|
||||
value = ray.get(args[i])
|
||||
assert value[0] == i
|
||||
|
||||
# Get each value starting from the beginning to force reconstruction.
|
||||
# Currently, since we're not able to reconstruct `ray.put` objects that
|
||||
# were evicted and whose originating tasks are still running, this
|
||||
# for-loop should hang on its first iteration and push an error to the
|
||||
# driver.
|
||||
ray.worker.global_worker.raylet_client.fetch_or_reconstruct([args[0]],
|
||||
False)
|
||||
|
||||
def error_check(errors):
|
||||
return len(errors) > 1
|
||||
|
||||
errors = wait_for_errors(error_check)
|
||||
assert all(error["type"] == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR
|
||||
or "ray.exceptions.UnreconstructableError" in error["message"]
|
||||
for error in errors)
|
||||
|
||||
|
||||
# NOTE(swang): This test tries to launch 1000 workers and breaks.
|
||||
# TODO(rkn): This test needs to be updated to use pytest.
|
||||
# class WorkerPoolTests(unittest.TestCase):
|
||||
#
|
||||
# def tearDown(self):
|
||||
# ray.shutdown()
|
||||
#
|
||||
# def testBlockingTasks(self):
|
||||
# @ray.remote
|
||||
# def f(i, j):
|
||||
# return (i, j)
|
||||
#
|
||||
# @ray.remote
|
||||
# def g(i):
|
||||
# # Each instance of g submits and blocks on the result of another remote
|
||||
# # task.
|
||||
# object_ids = [f.remote(i, j) for j in range(10)]
|
||||
# return ray.get(object_ids)
|
||||
#
|
||||
# ray.init(num_workers=1)
|
||||
# ray.get([g.remote(i) for i in range(1000)])
|
||||
# ray.shutdown()
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
@@ -0,0 +1,102 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import os
|
||||
import pytest
|
||||
|
||||
import ray
|
||||
|
||||
|
||||
@pytest.fixture(params=[1, 4])
|
||||
def ray_start_sharded(request):
|
||||
num_redis_shards = request.param
|
||||
|
||||
if os.environ.get("RAY_USE_NEW_GCS") == "on":
|
||||
num_redis_shards = 1
|
||||
# For now, RAY_USE_NEW_GCS supports 1 shard, and credis supports
|
||||
# 1-node chain for that shard only.
|
||||
|
||||
# Start the Ray processes.
|
||||
ray.init(
|
||||
object_store_memory=int(0.5 * 10**9),
|
||||
num_cpus=10,
|
||||
num_redis_shards=num_redis_shards,
|
||||
redis_max_memory=10**7)
|
||||
|
||||
yield None
|
||||
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
def test_submitting_many_tasks(ray_start_sharded):
|
||||
@ray.remote
|
||||
def f(x):
|
||||
return 1
|
||||
|
||||
def g(n):
|
||||
x = 1
|
||||
for i in range(n):
|
||||
x = f.remote(x)
|
||||
return x
|
||||
|
||||
ray.get([g(1000) for _ in range(100)])
|
||||
assert ray.services.remaining_processes_alive()
|
||||
|
||||
|
||||
def test_submitting_many_actors_to_one(ray_start_sharded):
|
||||
@ray.remote
|
||||
class Actor(object):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def ping(self):
|
||||
return
|
||||
|
||||
@ray.remote
|
||||
class Worker(object):
|
||||
def __init__(self, actor):
|
||||
self.actor = actor
|
||||
|
||||
def ping(self):
|
||||
return ray.get(self.actor.ping.remote())
|
||||
|
||||
a = Actor.remote()
|
||||
workers = [Worker.remote(a) for _ in range(10)]
|
||||
for _ in range(10):
|
||||
out = ray.get([w.ping.remote() for w in workers])
|
||||
assert out == [None for _ in workers]
|
||||
|
||||
|
||||
def test_getting_and_putting(ray_start_sharded):
|
||||
for n in range(8):
|
||||
x = np.zeros(10**n)
|
||||
|
||||
for _ in range(100):
|
||||
ray.put(x)
|
||||
|
||||
x_id = ray.put(x)
|
||||
for _ in range(1000):
|
||||
ray.get(x_id)
|
||||
|
||||
assert ray.services.remaining_processes_alive()
|
||||
|
||||
|
||||
def test_getting_many_objects(ray_start_sharded):
|
||||
@ray.remote
|
||||
def f():
|
||||
return 1
|
||||
|
||||
n = 10**4 # TODO(pcm): replace by 10 ** 5 once this is faster.
|
||||
lst = ray.get([f.remote() for _ in range(n)])
|
||||
assert lst == n * [1]
|
||||
|
||||
assert ray.services.remaining_processes_alive()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
@@ -7,7 +7,7 @@ import shutil
|
||||
import time
|
||||
import pytest
|
||||
import ray
|
||||
from ray.tests.cluster_utils import Cluster
|
||||
from ray.cluster_utils import Cluster
|
||||
|
||||
|
||||
def test_conn_cluster():
|
||||
@@ -150,3 +150,11 @@ def test_session_dir_uniqueness():
|
||||
session_dirs.add(ray.worker._global_node.get_session_dir_path)
|
||||
ray.shutdown()
|
||||
assert len(session_dirs) == 3
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
# Make subprocess happy in bazel.
|
||||
os.environ["LC_ALL"] = "en_US.UTF-8"
|
||||
os.environ["LANG"] = "en_US.UTF-8"
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -252,3 +252,9 @@ def test_remote_training_loss(ray_start_2_cpus):
|
||||
after_acc = sess.run(
|
||||
loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100])))
|
||||
assert before_acc < after_acc
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -44,4 +44,6 @@ class TestUnreconstructableErrors(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -33,3 +33,9 @@ def test_get_webui(shutdown_only):
|
||||
assert node_info["error"] is None
|
||||
assert node_info["result"] is not None
|
||||
assert isinstance(node_info["timestamp"], float)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
+38
-2
@@ -3,12 +3,14 @@ py_test(
|
||||
size = "medium",
|
||||
srcs = ["tests/test_actor_reuse.py"],
|
||||
tags = ["jenkins_only"],
|
||||
deps = [":tune_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_automl_searcher",
|
||||
size = "small",
|
||||
srcs = ["tests/test_automl_searcher.py"],
|
||||
deps = [":tune_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
@@ -77,19 +79,52 @@ py_test(
|
||||
deps = [":tune_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_run_experiment",
|
||||
size = "medium",
|
||||
srcs = ["tests/test_run_experiment.py"],
|
||||
deps = [":tune_lib"],
|
||||
tags = ["exclusive"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_trial_runner",
|
||||
size = "large",
|
||||
size = "medium",
|
||||
srcs = ["tests/test_trial_runner.py"],
|
||||
deps = [":tune_lib"],
|
||||
tags = ["exclusive"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_var",
|
||||
size = "medium",
|
||||
srcs = ["tests/test_var.py"],
|
||||
deps = [":tune_lib"],
|
||||
tags = ["exclusive"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_api",
|
||||
size = "medium",
|
||||
srcs = ["tests/test_api.py"],
|
||||
deps = [":tune_lib"],
|
||||
tags = ["exclusive"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_sync",
|
||||
size = "medium",
|
||||
srcs = ["tests/test_sync.py"],
|
||||
deps = [":tune_lib"],
|
||||
tags = ["exclusive"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_trial_scheduler",
|
||||
size = "medium",
|
||||
srcs = ["tests/test_trial_scheduler.py"],
|
||||
deps = [":tune_lib"],
|
||||
tags = ["exclusive"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
@@ -113,11 +148,12 @@ py_test(
|
||||
size = "medium",
|
||||
srcs = ["tests/test_tune_server.py"],
|
||||
deps = [":tune_lib"],
|
||||
tags = ["exclusive"],
|
||||
)
|
||||
|
||||
# This is a dummy test dependency that causes the above tests to be
|
||||
# re-run if any of these files changes.
|
||||
py_library(
|
||||
name="tune_lib",
|
||||
name = "tune_lib",
|
||||
srcs = glob(["**/*.py"], exclude=["tests/*.py"]),
|
||||
)
|
||||
|
||||
@@ -96,4 +96,6 @@ class ActorReuseTest(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -0,0 +1,797 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import copy
|
||||
import os
|
||||
import time
|
||||
import unittest
|
||||
|
||||
import ray
|
||||
from ray.rllib import _register_all
|
||||
|
||||
from ray import tune
|
||||
from ray.tune import Trainable, TuneError
|
||||
from ray.tune import register_env, register_trainable, run_experiments
|
||||
from ray.tune.schedulers import TrialScheduler, FIFOScheduler
|
||||
from ray.tune.trial import Trial
|
||||
from ray.tune.result import (TIMESTEPS_TOTAL, DONE, HOSTNAME, NODE_IP, PID,
|
||||
EPISODES_TOTAL, TRAINING_ITERATION,
|
||||
TIMESTEPS_THIS_ITER, TIME_THIS_ITER_S,
|
||||
TIME_TOTAL_S, TRIAL_ID, EXPERIMENT_TAG)
|
||||
from ray.tune.logger import Logger
|
||||
from ray.tune.util import pin_in_object_store, get_pinned_object, flatten_dict
|
||||
from ray.tune.experiment import Experiment
|
||||
from ray.tune.resources import Resources
|
||||
from ray.tune.suggest import grid_search
|
||||
from ray.tune.suggest.suggestion import _MockSuggestionAlgorithm
|
||||
|
||||
|
||||
class TrainableFunctionApiTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
ray.init(num_cpus=4, num_gpus=0, object_store_memory=150 * 1024 * 1024)
|
||||
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
_register_all() # re-register the evicted objects
|
||||
|
||||
def checkAndReturnConsistentLogs(self, results, sleep_per_iter=None):
|
||||
"""Checks logging is the same between APIs.
|
||||
|
||||
Ignore "DONE" for logging but checks that the
|
||||
scheduler is notified properly with the last result.
|
||||
"""
|
||||
class_results = copy.deepcopy(results)
|
||||
function_results = copy.deepcopy(results)
|
||||
|
||||
class_output = []
|
||||
function_output = []
|
||||
scheduler_notif = []
|
||||
|
||||
class MockScheduler(FIFOScheduler):
|
||||
def on_trial_complete(self, runner, trial, result):
|
||||
scheduler_notif.append(result)
|
||||
|
||||
class ClassAPILogger(Logger):
|
||||
def on_result(self, result):
|
||||
class_output.append(result)
|
||||
|
||||
class FunctionAPILogger(Logger):
|
||||
def on_result(self, result):
|
||||
function_output.append(result)
|
||||
|
||||
class _WrappedTrainable(Trainable):
|
||||
def _setup(self, config):
|
||||
del config
|
||||
self._result_iter = copy.deepcopy(class_results)
|
||||
|
||||
def _train(self):
|
||||
if sleep_per_iter:
|
||||
time.sleep(sleep_per_iter)
|
||||
res = self._result_iter.pop(0) # This should not fail
|
||||
if not self._result_iter: # Mark "Done" for last result
|
||||
res[DONE] = True
|
||||
return res
|
||||
|
||||
def _function_trainable(config, reporter):
|
||||
for result in function_results:
|
||||
if sleep_per_iter:
|
||||
time.sleep(sleep_per_iter)
|
||||
reporter(**result)
|
||||
|
||||
class_trainable_name = "class_trainable"
|
||||
register_trainable(class_trainable_name, _WrappedTrainable)
|
||||
|
||||
trials = run_experiments(
|
||||
{
|
||||
"function_api": {
|
||||
"run": _function_trainable,
|
||||
"loggers": [FunctionAPILogger],
|
||||
},
|
||||
"class_api": {
|
||||
"run": class_trainable_name,
|
||||
"loggers": [ClassAPILogger],
|
||||
},
|
||||
},
|
||||
raise_on_failed_trial=False,
|
||||
scheduler=MockScheduler())
|
||||
|
||||
# Ignore these fields
|
||||
NO_COMPARE_FIELDS = {
|
||||
HOSTNAME,
|
||||
NODE_IP,
|
||||
TRIAL_ID,
|
||||
EXPERIMENT_TAG,
|
||||
PID,
|
||||
TIME_THIS_ITER_S,
|
||||
TIME_TOTAL_S,
|
||||
DONE, # This is ignored because FunctionAPI has different handling
|
||||
"timestamp",
|
||||
"time_since_restore",
|
||||
"experiment_id",
|
||||
"date",
|
||||
}
|
||||
|
||||
self.assertEqual(len(class_output), len(results))
|
||||
self.assertEqual(len(function_output), len(results))
|
||||
|
||||
def as_comparable_result(result):
|
||||
return {
|
||||
k: v
|
||||
for k, v in result.items() if k not in NO_COMPARE_FIELDS
|
||||
}
|
||||
|
||||
function_comparable = [
|
||||
as_comparable_result(result) for result in function_output
|
||||
]
|
||||
class_comparable = [
|
||||
as_comparable_result(result) for result in class_output
|
||||
]
|
||||
|
||||
self.assertEqual(function_comparable, class_comparable)
|
||||
|
||||
self.assertEqual(sum(t.get(DONE) for t in scheduler_notif), 2)
|
||||
self.assertEqual(
|
||||
as_comparable_result(scheduler_notif[0]),
|
||||
as_comparable_result(scheduler_notif[1]))
|
||||
|
||||
# Make sure the last result is the same.
|
||||
self.assertEqual(
|
||||
as_comparable_result(trials[0].last_result),
|
||||
as_comparable_result(trials[1].last_result))
|
||||
|
||||
return function_output, trials
|
||||
|
||||
def testPinObject(self):
|
||||
X = pin_in_object_store("hello")
|
||||
|
||||
@ray.remote
|
||||
def f():
|
||||
return get_pinned_object(X)
|
||||
|
||||
self.assertEqual(ray.get(f.remote()), "hello")
|
||||
|
||||
def testFetchPinned(self):
|
||||
X = pin_in_object_store("hello")
|
||||
|
||||
def train(config, reporter):
|
||||
get_pinned_object(X)
|
||||
reporter(timesteps_total=100, done=True)
|
||||
|
||||
register_trainable("f1", train)
|
||||
[trial] = run_experiments({
|
||||
"foo": {
|
||||
"run": "f1",
|
||||
}
|
||||
})
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 100)
|
||||
|
||||
def testRegisterEnv(self):
|
||||
register_env("foo", lambda: None)
|
||||
self.assertRaises(TypeError, lambda: register_env("foo", 2))
|
||||
|
||||
def testRegisterEnvOverwrite(self):
|
||||
def train(config, reporter):
|
||||
reporter(timesteps_total=100, done=True)
|
||||
|
||||
def train2(config, reporter):
|
||||
reporter(timesteps_total=200, done=True)
|
||||
|
||||
register_trainable("f1", train)
|
||||
register_trainable("f1", train2)
|
||||
[trial] = run_experiments({
|
||||
"foo": {
|
||||
"run": "f1",
|
||||
}
|
||||
})
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 200)
|
||||
|
||||
def testRegisterTrainable(self):
|
||||
def train(config, reporter):
|
||||
pass
|
||||
|
||||
class A(object):
|
||||
pass
|
||||
|
||||
class B(Trainable):
|
||||
pass
|
||||
|
||||
register_trainable("foo", train)
|
||||
Experiment("test", train)
|
||||
register_trainable("foo", B)
|
||||
Experiment("test", B)
|
||||
self.assertRaises(TypeError, lambda: register_trainable("foo", B()))
|
||||
self.assertRaises(TuneError, lambda: Experiment("foo", B()))
|
||||
self.assertRaises(TypeError, lambda: register_trainable("foo", A))
|
||||
self.assertRaises(TypeError, lambda: Experiment("foo", A))
|
||||
|
||||
def testTrainableCallable(self):
|
||||
def dummy_fn(config, reporter, steps):
|
||||
reporter(timesteps_total=steps, done=True)
|
||||
|
||||
from functools import partial
|
||||
steps = 500
|
||||
register_trainable("test", partial(dummy_fn, steps=steps))
|
||||
[trial] = run_experiments({
|
||||
"foo": {
|
||||
"run": "test",
|
||||
}
|
||||
})
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], steps)
|
||||
[trial] = tune.run(partial(dummy_fn, steps=steps)).trials
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], steps)
|
||||
|
||||
def testBuiltInTrainableResources(self):
|
||||
class B(Trainable):
|
||||
@classmethod
|
||||
def default_resource_request(cls, config):
|
||||
return Resources(cpu=config["cpu"], gpu=config["gpu"])
|
||||
|
||||
def _train(self):
|
||||
return {"timesteps_this_iter": 1, "done": True}
|
||||
|
||||
register_trainable("B", B)
|
||||
|
||||
def f(cpus, gpus, queue_trials):
|
||||
return run_experiments(
|
||||
{
|
||||
"foo": {
|
||||
"run": "B",
|
||||
"config": {
|
||||
"cpu": cpus,
|
||||
"gpu": gpus,
|
||||
},
|
||||
}
|
||||
},
|
||||
queue_trials=queue_trials)[0]
|
||||
|
||||
# Should all succeed
|
||||
self.assertEqual(f(0, 0, False).status, Trial.TERMINATED)
|
||||
self.assertEqual(f(1, 0, True).status, Trial.TERMINATED)
|
||||
self.assertEqual(f(1, 0, True).status, Trial.TERMINATED)
|
||||
|
||||
# Too large resource request
|
||||
self.assertRaises(TuneError, lambda: f(100, 100, False))
|
||||
self.assertRaises(TuneError, lambda: f(0, 100, False))
|
||||
self.assertRaises(TuneError, lambda: f(100, 0, False))
|
||||
|
||||
# TODO(ekl) how can we test this is queued (hangs)?
|
||||
# f(100, 0, True)
|
||||
|
||||
def testRewriteEnv(self):
|
||||
def train(config, reporter):
|
||||
reporter(timesteps_total=1)
|
||||
|
||||
register_trainable("f1", train)
|
||||
|
||||
[trial] = run_experiments({
|
||||
"foo": {
|
||||
"run": "f1",
|
||||
"env": "CartPole-v0",
|
||||
}
|
||||
})
|
||||
self.assertEqual(trial.config["env"], "CartPole-v0")
|
||||
|
||||
def testConfigPurity(self):
|
||||
def train(config, reporter):
|
||||
assert config == {"a": "b"}, config
|
||||
reporter(timesteps_total=1)
|
||||
|
||||
register_trainable("f1", train)
|
||||
run_experiments({
|
||||
"foo": {
|
||||
"run": "f1",
|
||||
"config": {
|
||||
"a": "b"
|
||||
},
|
||||
}
|
||||
})
|
||||
|
||||
def testLogdir(self):
|
||||
def train(config, reporter):
|
||||
assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd()
|
||||
reporter(timesteps_total=1)
|
||||
|
||||
register_trainable("f1", train)
|
||||
run_experiments({
|
||||
"foo": {
|
||||
"run": "f1",
|
||||
"local_dir": "/tmp/logdir",
|
||||
"config": {
|
||||
"a": "b"
|
||||
},
|
||||
}
|
||||
})
|
||||
|
||||
def testLogdirStartingWithTilde(self):
|
||||
local_dir = "~/ray_results/local_dir"
|
||||
|
||||
def train(config, reporter):
|
||||
cwd = os.getcwd()
|
||||
assert cwd.startswith(os.path.expanduser(local_dir)), cwd
|
||||
assert not cwd.startswith("~"), cwd
|
||||
reporter(timesteps_total=1)
|
||||
|
||||
register_trainable("f1", train)
|
||||
run_experiments({
|
||||
"foo": {
|
||||
"run": "f1",
|
||||
"local_dir": local_dir,
|
||||
"config": {
|
||||
"a": "b"
|
||||
},
|
||||
}
|
||||
})
|
||||
|
||||
def testLongFilename(self):
|
||||
def train(config, reporter):
|
||||
assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd()
|
||||
reporter(timesteps_total=1)
|
||||
|
||||
register_trainable("f1", train)
|
||||
run_experiments({
|
||||
"foo": {
|
||||
"run": "f1",
|
||||
"local_dir": "/tmp/logdir",
|
||||
"config": {
|
||||
"a" * 50: tune.sample_from(lambda spec: 5.0 / 7),
|
||||
"b" * 50: tune.sample_from(lambda spec: "long" * 40),
|
||||
},
|
||||
}
|
||||
})
|
||||
|
||||
def testBadParams(self):
|
||||
def f():
|
||||
run_experiments({"foo": {}})
|
||||
|
||||
self.assertRaises(TuneError, f)
|
||||
|
||||
def testBadParams2(self):
|
||||
def f():
|
||||
run_experiments({
|
||||
"foo": {
|
||||
"run": "asdf",
|
||||
"bah": "this param is not allowed",
|
||||
}
|
||||
})
|
||||
|
||||
self.assertRaises(TuneError, f)
|
||||
|
||||
def testBadParams3(self):
|
||||
def f():
|
||||
run_experiments({
|
||||
"foo": {
|
||||
"run": grid_search("invalid grid search"),
|
||||
}
|
||||
})
|
||||
|
||||
self.assertRaises(TuneError, f)
|
||||
|
||||
def testBadParams4(self):
|
||||
def f():
|
||||
run_experiments({
|
||||
"foo": {
|
||||
"run": "asdf",
|
||||
}
|
||||
})
|
||||
|
||||
self.assertRaises(TuneError, f)
|
||||
|
||||
def testBadParams5(self):
|
||||
def f():
|
||||
run_experiments({"foo": {"run": "PPO", "stop": {"asdf": 1}}})
|
||||
|
||||
self.assertRaises(TuneError, f)
|
||||
|
||||
def testBadParams6(self):
|
||||
def f():
|
||||
run_experiments({
|
||||
"foo": {
|
||||
"run": "PPO",
|
||||
"resources_per_trial": {
|
||||
"asdf": 1
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
self.assertRaises(TuneError, f)
|
||||
|
||||
def testBadStoppingReturn(self):
|
||||
def train(config, reporter):
|
||||
reporter()
|
||||
|
||||
register_trainable("f1", train)
|
||||
|
||||
def f():
|
||||
run_experiments({
|
||||
"foo": {
|
||||
"run": "f1",
|
||||
"stop": {
|
||||
"time": 10
|
||||
},
|
||||
}
|
||||
})
|
||||
|
||||
self.assertRaises(TuneError, f)
|
||||
|
||||
def testNestedStoppingReturn(self):
|
||||
def train(config, reporter):
|
||||
for i in range(10):
|
||||
reporter(test={"test1": {"test2": i}})
|
||||
|
||||
with self.assertRaises(TuneError):
|
||||
[trial] = tune.run(
|
||||
train, stop={
|
||||
"test": {
|
||||
"test1": {
|
||||
"test2": 6
|
||||
}
|
||||
}
|
||||
}).trials
|
||||
[trial] = tune.run(train, stop={"test/test1/test2": 6}).trials
|
||||
self.assertEqual(trial.last_result["training_iteration"], 7)
|
||||
|
||||
def testStoppingFunction(self):
|
||||
def train(config, reporter):
|
||||
for i in range(10):
|
||||
reporter(test=i)
|
||||
|
||||
def stop(trial_id, result):
|
||||
return result["test"] > 6
|
||||
|
||||
[trial] = tune.run(train, stop=stop).trials
|
||||
self.assertEqual(trial.last_result["training_iteration"], 8)
|
||||
|
||||
def testStoppingMemberFunction(self):
|
||||
def train(config, reporter):
|
||||
for i in range(10):
|
||||
reporter(test=i)
|
||||
|
||||
class Stopper:
|
||||
def stop(self, trial_id, result):
|
||||
return result["test"] > 6
|
||||
|
||||
[trial] = tune.run(train, stop=Stopper().stop).trials
|
||||
self.assertEqual(trial.last_result["training_iteration"], 8)
|
||||
|
||||
def testBadStoppingFunction(self):
|
||||
def train(config, reporter):
|
||||
for i in range(10):
|
||||
reporter(test=i)
|
||||
|
||||
class Stopper:
|
||||
def stop(self, result):
|
||||
return result["test"] > 6
|
||||
|
||||
def stop(result):
|
||||
return result["test"] > 6
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
tune.run(train, stop=Stopper().stop)
|
||||
with self.assertRaises(ValueError):
|
||||
tune.run(train, stop=stop)
|
||||
|
||||
def testEarlyReturn(self):
|
||||
def train(config, reporter):
|
||||
reporter(timesteps_total=100, done=True)
|
||||
time.sleep(99999)
|
||||
|
||||
register_trainable("f1", train)
|
||||
[trial] = run_experiments({
|
||||
"foo": {
|
||||
"run": "f1",
|
||||
}
|
||||
})
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 100)
|
||||
|
||||
def testReporterNoUsage(self):
|
||||
def run_task(config, reporter):
|
||||
print("hello")
|
||||
|
||||
experiment = Experiment(run=run_task, name="ray_crash_repro")
|
||||
[trial] = ray.tune.run(experiment).trials
|
||||
print(trial.last_result)
|
||||
self.assertEqual(trial.last_result[DONE], True)
|
||||
|
||||
def testErrorReturn(self):
|
||||
def train(config, reporter):
|
||||
raise Exception("uh oh")
|
||||
|
||||
register_trainable("f1", train)
|
||||
|
||||
def f():
|
||||
run_experiments({
|
||||
"foo": {
|
||||
"run": "f1",
|
||||
}
|
||||
})
|
||||
|
||||
self.assertRaises(TuneError, f)
|
||||
|
||||
def testSuccess(self):
|
||||
def train(config, reporter):
|
||||
for i in range(100):
|
||||
reporter(timesteps_total=i)
|
||||
|
||||
register_trainable("f1", train)
|
||||
[trial] = run_experiments({
|
||||
"foo": {
|
||||
"run": "f1",
|
||||
}
|
||||
})
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99)
|
||||
|
||||
def testNoRaiseFlag(self):
|
||||
def train(config, reporter):
|
||||
raise Exception()
|
||||
|
||||
register_trainable("f1", train)
|
||||
|
||||
[trial] = run_experiments(
|
||||
{
|
||||
"foo": {
|
||||
"run": "f1",
|
||||
}
|
||||
}, raise_on_failed_trial=False)
|
||||
self.assertEqual(trial.status, Trial.ERROR)
|
||||
|
||||
def testReportInfinity(self):
|
||||
def train(config, reporter):
|
||||
for i in range(100):
|
||||
reporter(mean_accuracy=float("inf"))
|
||||
|
||||
register_trainable("f1", train)
|
||||
[trial] = run_experiments({
|
||||
"foo": {
|
||||
"run": "f1",
|
||||
}
|
||||
})
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertEqual(trial.last_result["mean_accuracy"], float("inf"))
|
||||
|
||||
def testNestedResults(self):
|
||||
def create_result(i):
|
||||
return {"test": {"1": {"2": {"3": i, "4": False}}}}
|
||||
|
||||
flattened_keys = list(flatten_dict(create_result(0)))
|
||||
|
||||
class _MockScheduler(FIFOScheduler):
|
||||
results = []
|
||||
|
||||
def on_trial_result(self, trial_runner, trial, result):
|
||||
self.results += [result]
|
||||
return TrialScheduler.CONTINUE
|
||||
|
||||
def on_trial_complete(self, trial_runner, trial, result):
|
||||
self.complete_result = result
|
||||
|
||||
def train(config, reporter):
|
||||
for i in range(100):
|
||||
reporter(**create_result(i))
|
||||
|
||||
algo = _MockSuggestionAlgorithm()
|
||||
scheduler = _MockScheduler()
|
||||
[trial] = tune.run(
|
||||
train,
|
||||
scheduler=scheduler,
|
||||
search_alg=algo,
|
||||
stop={
|
||||
"test/1/2/3": 20
|
||||
}).trials
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertEqual(trial.last_result["test"]["1"]["2"]["3"], 20)
|
||||
self.assertEqual(trial.last_result["test"]["1"]["2"]["4"], False)
|
||||
self.assertEqual(trial.last_result[TRAINING_ITERATION], 21)
|
||||
self.assertEqual(len(scheduler.results), 20)
|
||||
self.assertTrue(
|
||||
all(
|
||||
set(result) >= set(flattened_keys)
|
||||
for result in scheduler.results))
|
||||
self.assertTrue(set(scheduler.complete_result) >= set(flattened_keys))
|
||||
self.assertEqual(len(algo.results), 20)
|
||||
self.assertTrue(
|
||||
all(set(result) >= set(flattened_keys) for result in algo.results))
|
||||
with self.assertRaises(TuneError):
|
||||
[trial] = tune.run(train, stop={"1/2/3": 20})
|
||||
with self.assertRaises(TuneError):
|
||||
[trial] = tune.run(train, stop={"test": 1}).trials
|
||||
|
||||
def testReportTimeStep(self):
|
||||
# Test that no timestep count are logged if never the Trainable never
|
||||
# returns any.
|
||||
results1 = [dict(mean_accuracy=5, done=i == 99) for i in range(100)]
|
||||
logs1, _ = self.checkAndReturnConsistentLogs(results1)
|
||||
|
||||
self.assertTrue(all(log[TIMESTEPS_TOTAL] is None for log in logs1))
|
||||
|
||||
# Test that no timesteps_this_iter are logged if only timesteps_total
|
||||
# are returned.
|
||||
results2 = [dict(timesteps_total=5, done=i == 9) for i in range(10)]
|
||||
logs2, _ = self.checkAndReturnConsistentLogs(results2)
|
||||
|
||||
# Re-run the same trials but with added delay. This is to catch some
|
||||
# inconsistent timestep counting that was present in the multi-threaded
|
||||
# FunctionRunner. This part of the test can be removed once the
|
||||
# multi-threaded FunctionRunner is removed from ray/tune.
|
||||
# TODO: remove once the multi-threaded function runner is gone.
|
||||
logs2, _ = self.checkAndReturnConsistentLogs(results2, 0.5)
|
||||
|
||||
# check all timesteps_total report the same value
|
||||
self.assertTrue(all(log[TIMESTEPS_TOTAL] == 5 for log in logs2))
|
||||
# check that none of the logs report timesteps_this_iter
|
||||
self.assertFalse(
|
||||
any(hasattr(log, TIMESTEPS_THIS_ITER) for log in logs2))
|
||||
|
||||
# Test that timesteps_total and episodes_total are reported when
|
||||
# timesteps_this_iter and episodes_this_iter despite only return zeros.
|
||||
results3 = [
|
||||
dict(timesteps_this_iter=0, episodes_this_iter=0)
|
||||
for i in range(10)
|
||||
]
|
||||
logs3, _ = self.checkAndReturnConsistentLogs(results3)
|
||||
|
||||
self.assertTrue(all(log[TIMESTEPS_TOTAL] == 0 for log in logs3))
|
||||
self.assertTrue(all(log[EPISODES_TOTAL] == 0 for log in logs3))
|
||||
|
||||
# Test that timesteps_total and episodes_total are properly counted
|
||||
# when timesteps_this_iter and episodes_this_iter report non-zero
|
||||
# values.
|
||||
results4 = [
|
||||
dict(timesteps_this_iter=3, episodes_this_iter=i)
|
||||
for i in range(10)
|
||||
]
|
||||
logs4, _ = self.checkAndReturnConsistentLogs(results4)
|
||||
|
||||
# The last reported result should not be double-logged.
|
||||
self.assertEqual(logs4[-1][TIMESTEPS_TOTAL], 30)
|
||||
self.assertNotEqual(logs4[-2][TIMESTEPS_TOTAL],
|
||||
logs4[-1][TIMESTEPS_TOTAL])
|
||||
self.assertEqual(logs4[-1][EPISODES_TOTAL], 45)
|
||||
self.assertNotEqual(logs4[-2][EPISODES_TOTAL],
|
||||
logs4[-1][EPISODES_TOTAL])
|
||||
|
||||
def testAllValuesReceived(self):
|
||||
results1 = [
|
||||
dict(timesteps_total=(i + 1), my_score=i**2, done=i == 4)
|
||||
for i in range(5)
|
||||
]
|
||||
|
||||
logs1, _ = self.checkAndReturnConsistentLogs(results1)
|
||||
|
||||
# check if the correct number of results were reported
|
||||
self.assertEqual(len(logs1), len(results1))
|
||||
|
||||
def check_no_missing(reported_result, result):
|
||||
common_results = [reported_result[k] == result[k] for k in result]
|
||||
return all(common_results)
|
||||
|
||||
# check that no result was dropped or modified
|
||||
complete_results = [
|
||||
check_no_missing(log, result)
|
||||
for log, result in zip(logs1, results1)
|
||||
]
|
||||
self.assertTrue(all(complete_results))
|
||||
|
||||
# check if done was logged exactly once
|
||||
self.assertEqual(len([r for r in logs1 if r.get("done")]), 1)
|
||||
|
||||
def testNoDoneReceived(self):
|
||||
# repeat same test but without explicitly reporting done=True
|
||||
results1 = [
|
||||
dict(timesteps_total=(i + 1), my_score=i**2) for i in range(5)
|
||||
]
|
||||
|
||||
logs1, trials = self.checkAndReturnConsistentLogs(results1)
|
||||
|
||||
# check if the correct number of results were reported.
|
||||
self.assertEqual(len(logs1), len(results1))
|
||||
|
||||
def check_no_missing(reported_result, result):
|
||||
common_results = [reported_result[k] == result[k] for k in result]
|
||||
return all(common_results)
|
||||
|
||||
# check that no result was dropped or modified
|
||||
complete_results1 = [
|
||||
check_no_missing(log, result)
|
||||
for log, result in zip(logs1, results1)
|
||||
]
|
||||
self.assertTrue(all(complete_results1))
|
||||
|
||||
def testCheckpointDict(self):
|
||||
class TestTrain(Trainable):
|
||||
def _setup(self, config):
|
||||
self.state = {"hi": 1}
|
||||
|
||||
def _train(self):
|
||||
return {"timesteps_this_iter": 1, "done": True}
|
||||
|
||||
def _save(self, path):
|
||||
return self.state
|
||||
|
||||
def _restore(self, state):
|
||||
self.state = state
|
||||
|
||||
test_trainable = TestTrain()
|
||||
result = test_trainable.save()
|
||||
test_trainable.state["hi"] = 2
|
||||
test_trainable.restore(result)
|
||||
self.assertEqual(test_trainable.state["hi"], 1)
|
||||
|
||||
trials = run_experiments({
|
||||
"foo": {
|
||||
"run": TestTrain,
|
||||
"checkpoint_at_end": True
|
||||
}
|
||||
})
|
||||
for trial in trials:
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertTrue(trial.has_checkpoint())
|
||||
|
||||
def testMultipleCheckpoints(self):
|
||||
class TestTrain(Trainable):
|
||||
def _setup(self, config):
|
||||
self.state = {"hi": 1, "iter": 0}
|
||||
|
||||
def _train(self):
|
||||
self.state["iter"] += 1
|
||||
return {"timesteps_this_iter": 1, "done": True}
|
||||
|
||||
def _save(self, path):
|
||||
return self.state
|
||||
|
||||
def _restore(self, state):
|
||||
self.state = state
|
||||
|
||||
test_trainable = TestTrain()
|
||||
checkpoint_1 = test_trainable.save()
|
||||
test_trainable.train()
|
||||
checkpoint_2 = test_trainable.save()
|
||||
self.assertNotEqual(checkpoint_1, checkpoint_2)
|
||||
test_trainable.restore(checkpoint_2)
|
||||
self.assertEqual(test_trainable.state["iter"], 1)
|
||||
test_trainable.restore(checkpoint_1)
|
||||
self.assertEqual(test_trainable.state["iter"], 0)
|
||||
|
||||
trials = run_experiments({
|
||||
"foo": {
|
||||
"run": TestTrain,
|
||||
"checkpoint_at_end": True
|
||||
}
|
||||
})
|
||||
for trial in trials:
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertTrue(trial.has_checkpoint())
|
||||
|
||||
def testIterationCounter(self):
|
||||
def train(config, reporter):
|
||||
for i in range(100):
|
||||
reporter(itr=i, timesteps_this_iter=1)
|
||||
|
||||
register_trainable("exp", train)
|
||||
config = {
|
||||
"my_exp": {
|
||||
"run": "exp",
|
||||
"config": {
|
||||
"iterations": 100,
|
||||
},
|
||||
"stop": {
|
||||
"timesteps_total": 100
|
||||
},
|
||||
}
|
||||
}
|
||||
[trial] = run_experiments(config)
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertEqual(trial.last_result[TRAINING_ITERATION], 100)
|
||||
self.assertEqual(trial.last_result["itr"], 99)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
@@ -70,4 +70,6 @@ class AutoMLSearcherTest(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -109,4 +109,4 @@ class CheckpointManagerTest(unittest.TestCase):
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", "-s", __file__]))
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -13,8 +13,8 @@ import sys
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.rllib import _register_all
|
||||
from ray.tests.cluster_utils import Cluster
|
||||
from ray.tests.utils import run_string_as_driver_nonblocking
|
||||
from ray.cluster_utils import Cluster
|
||||
from ray.test_utils import run_string_as_driver_nonblocking
|
||||
from ray.tune.error import TuneError
|
||||
from ray.tune.ray_trial_executor import RayTrialExecutor
|
||||
from ray.tune.experiment import Experiment
|
||||
@@ -598,4 +598,4 @@ tune.run(
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", "-s", __file__]))
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -62,4 +62,6 @@ class ExperimentTest(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -204,4 +204,6 @@ class AnalysisSuite(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -64,4 +64,6 @@ class LoggerSuite(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -14,7 +14,7 @@ from ray.tune.registry import _global_registry, TRAINABLE_CLASS
|
||||
from ray.tune.suggest import BasicVariantGenerator
|
||||
from ray.tune.trial import Trial, Checkpoint
|
||||
from ray.tune.resources import Resources
|
||||
from ray.tests.cluster_utils import Cluster
|
||||
from ray.cluster_utils import Cluster
|
||||
|
||||
|
||||
class RayTrialExecutorTest(unittest.TestCase):
|
||||
@@ -190,4 +190,6 @@ class LocalModeExecutorTest(RayTrialExecutorTest):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -0,0 +1,241 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import unittest
|
||||
|
||||
import ray
|
||||
from ray.rllib import _register_all
|
||||
|
||||
from ray.tune.result import TIMESTEPS_TOTAL
|
||||
from ray.tune import Trainable, TuneError
|
||||
from ray.tune import register_trainable, run_experiments
|
||||
from ray.tune.logger import Logger
|
||||
from ray.tune.experiment import Experiment
|
||||
from ray.tune.trial import Trial, ExportFormat
|
||||
|
||||
|
||||
class RunExperimentTest(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
_register_all() # re-register the evicted objects
|
||||
|
||||
def testDict(self):
|
||||
def train(config, reporter):
|
||||
for i in range(100):
|
||||
reporter(timesteps_total=i)
|
||||
|
||||
register_trainable("f1", train)
|
||||
trials = run_experiments({
|
||||
"foo": {
|
||||
"run": "f1",
|
||||
},
|
||||
"bar": {
|
||||
"run": "f1",
|
||||
}
|
||||
})
|
||||
for trial in trials:
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99)
|
||||
|
||||
def testExperiment(self):
|
||||
def train(config, reporter):
|
||||
for i in range(100):
|
||||
reporter(timesteps_total=i)
|
||||
|
||||
register_trainable("f1", train)
|
||||
exp1 = Experiment(**{
|
||||
"name": "foo",
|
||||
"run": "f1",
|
||||
})
|
||||
[trial] = run_experiments(exp1)
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99)
|
||||
|
||||
def testExperimentList(self):
|
||||
def train(config, reporter):
|
||||
for i in range(100):
|
||||
reporter(timesteps_total=i)
|
||||
|
||||
register_trainable("f1", train)
|
||||
exp1 = Experiment(**{
|
||||
"name": "foo",
|
||||
"run": "f1",
|
||||
})
|
||||
exp2 = Experiment(**{
|
||||
"name": "bar",
|
||||
"run": "f1",
|
||||
})
|
||||
trials = run_experiments([exp1, exp2])
|
||||
for trial in trials:
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99)
|
||||
|
||||
def testAutoregisterTrainable(self):
|
||||
def train(config, reporter):
|
||||
for i in range(100):
|
||||
reporter(timesteps_total=i)
|
||||
|
||||
class B(Trainable):
|
||||
def _train(self):
|
||||
return {"timesteps_this_iter": 1, "done": True}
|
||||
|
||||
register_trainable("f1", train)
|
||||
trials = run_experiments({
|
||||
"foo": {
|
||||
"run": train,
|
||||
},
|
||||
"bar": {
|
||||
"run": B
|
||||
}
|
||||
})
|
||||
for trial in trials:
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
|
||||
def testCheckpointAtEnd(self):
|
||||
class train(Trainable):
|
||||
def _train(self):
|
||||
return {"timesteps_this_iter": 1, "done": True}
|
||||
|
||||
def _save(self, path):
|
||||
checkpoint = path + "/checkpoint"
|
||||
with open(checkpoint, "w") as f:
|
||||
f.write("OK")
|
||||
return checkpoint
|
||||
|
||||
trials = run_experiments({
|
||||
"foo": {
|
||||
"run": train,
|
||||
"checkpoint_at_end": True
|
||||
}
|
||||
})
|
||||
for trial in trials:
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertTrue(trial.has_checkpoint())
|
||||
|
||||
def testExportFormats(self):
|
||||
class train(Trainable):
|
||||
def _train(self):
|
||||
return {"timesteps_this_iter": 1, "done": True}
|
||||
|
||||
def _export_model(self, export_formats, export_dir):
|
||||
path = export_dir + "/exported"
|
||||
with open(path, "w") as f:
|
||||
f.write("OK")
|
||||
return {export_formats[0]: path}
|
||||
|
||||
trials = run_experiments({
|
||||
"foo": {
|
||||
"run": train,
|
||||
"export_formats": ["format"]
|
||||
}
|
||||
})
|
||||
for trial in trials:
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
self.assertTrue(
|
||||
os.path.exists(os.path.join(trial.logdir, "exported")))
|
||||
|
||||
def testInvalidExportFormats(self):
|
||||
class train(Trainable):
|
||||
def _train(self):
|
||||
return {"timesteps_this_iter": 1, "done": True}
|
||||
|
||||
def _export_model(self, export_formats, export_dir):
|
||||
ExportFormat.validate(export_formats)
|
||||
return {}
|
||||
|
||||
def fail_trial():
|
||||
run_experiments({
|
||||
"foo": {
|
||||
"run": train,
|
||||
"export_formats": ["format"]
|
||||
}
|
||||
})
|
||||
|
||||
self.assertRaises(TuneError, fail_trial)
|
||||
|
||||
def testCustomResources(self):
|
||||
ray.shutdown()
|
||||
ray.init(resources={"hi": 3})
|
||||
|
||||
class train(Trainable):
|
||||
def _train(self):
|
||||
return {"timesteps_this_iter": 1, "done": True}
|
||||
|
||||
trials = run_experiments({
|
||||
"foo": {
|
||||
"run": train,
|
||||
"resources_per_trial": {
|
||||
"cpu": 1,
|
||||
"custom_resources": {
|
||||
"hi": 2
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
for trial in trials:
|
||||
self.assertEqual(trial.status, Trial.TERMINATED)
|
||||
|
||||
def testCustomLogger(self):
|
||||
class CustomLogger(Logger):
|
||||
def on_result(self, result):
|
||||
with open(os.path.join(self.logdir, "test.log"), "w") as f:
|
||||
f.write("hi")
|
||||
|
||||
[trial] = run_experiments({
|
||||
"foo": {
|
||||
"run": "__fake",
|
||||
"stop": {
|
||||
"training_iteration": 1
|
||||
},
|
||||
"loggers": [CustomLogger]
|
||||
}
|
||||
})
|
||||
self.assertTrue(os.path.exists(os.path.join(trial.logdir, "test.log")))
|
||||
self.assertFalse(
|
||||
os.path.exists(os.path.join(trial.logdir, "params.json")))
|
||||
|
||||
[trial] = run_experiments({
|
||||
"foo": {
|
||||
"run": "__fake",
|
||||
"stop": {
|
||||
"training_iteration": 1
|
||||
}
|
||||
}
|
||||
})
|
||||
self.assertTrue(
|
||||
os.path.exists(os.path.join(trial.logdir, "params.json")))
|
||||
|
||||
[trial] = run_experiments({
|
||||
"foo": {
|
||||
"run": "__fake",
|
||||
"stop": {
|
||||
"training_iteration": 1
|
||||
},
|
||||
"loggers": []
|
||||
}
|
||||
})
|
||||
self.assertFalse(
|
||||
os.path.exists(os.path.join(trial.logdir, "params.json")))
|
||||
|
||||
def testCustomTrialString(self):
|
||||
[trial] = run_experiments({
|
||||
"foo": {
|
||||
"run": "__fake",
|
||||
"stop": {
|
||||
"training_iteration": 1
|
||||
},
|
||||
"trial_name_creator":
|
||||
lambda t: "{}_{}_321".format(t.trainable_name, t.trial_id)
|
||||
}
|
||||
})
|
||||
self.assertEquals(
|
||||
str(trial), "{}_{}_321".format(trial.trainable_name,
|
||||
trial.trial_id))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
@@ -0,0 +1,218 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import glob
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import ray
|
||||
from ray.rllib import _register_all
|
||||
|
||||
from ray import tune
|
||||
from ray.tune import TuneError
|
||||
from ray.tune.syncer import CommandBasedClient
|
||||
|
||||
if sys.version_info >= (3, 3):
|
||||
from unittest.mock import patch
|
||||
else:
|
||||
from mock import patch
|
||||
|
||||
|
||||
class TestSyncFunctionality(unittest.TestCase):
|
||||
def setUp(self):
|
||||
ray.init()
|
||||
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
_register_all() # re-register the evicted objects
|
||||
|
||||
@patch("ray.tune.syncer.S3_PREFIX", "test")
|
||||
def testNoUploadDir(self):
|
||||
"""No Upload Dir is given."""
|
||||
with self.assertRaises(AssertionError):
|
||||
[trial] = tune.run(
|
||||
"__fake",
|
||||
name="foo",
|
||||
max_failures=0,
|
||||
**{
|
||||
"stop": {
|
||||
"training_iteration": 1
|
||||
},
|
||||
"sync_to_cloud": "echo {source} {target}"
|
||||
}).trials
|
||||
|
||||
@patch("ray.tune.syncer.S3_PREFIX", "test")
|
||||
def testCloudProperString(self):
|
||||
with self.assertRaises(ValueError):
|
||||
[trial] = tune.run(
|
||||
"__fake",
|
||||
name="foo",
|
||||
max_failures=0,
|
||||
**{
|
||||
"stop": {
|
||||
"training_iteration": 1
|
||||
},
|
||||
"upload_dir": "test",
|
||||
"sync_to_cloud": "ls {target}"
|
||||
}).trials
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
[trial] = tune.run(
|
||||
"__fake",
|
||||
name="foo",
|
||||
max_failures=0,
|
||||
**{
|
||||
"stop": {
|
||||
"training_iteration": 1
|
||||
},
|
||||
"upload_dir": "test",
|
||||
"sync_to_cloud": "ls {source}"
|
||||
}).trials
|
||||
|
||||
tmpdir = tempfile.mkdtemp()
|
||||
logfile = os.path.join(tmpdir, "test.log")
|
||||
|
||||
[trial] = tune.run(
|
||||
"__fake",
|
||||
name="foo",
|
||||
max_failures=0,
|
||||
**{
|
||||
"stop": {
|
||||
"training_iteration": 1
|
||||
},
|
||||
"upload_dir": "test",
|
||||
"sync_to_cloud": "echo {source} {target} > " + logfile
|
||||
}).trials
|
||||
with open(logfile) as f:
|
||||
lines = f.read()
|
||||
self.assertTrue("test" in lines)
|
||||
shutil.rmtree(tmpdir)
|
||||
|
||||
def testClusterProperString(self):
|
||||
"""Tests that invalid commands throw.."""
|
||||
with self.assertRaises(TuneError):
|
||||
# This raises TuneError because logger is init in safe zone.
|
||||
[trial] = tune.run(
|
||||
"__fake",
|
||||
name="foo",
|
||||
max_failures=0,
|
||||
**{
|
||||
"stop": {
|
||||
"training_iteration": 1
|
||||
},
|
||||
"sync_to_driver": "ls {target}"
|
||||
}).trials
|
||||
|
||||
with self.assertRaises(TuneError):
|
||||
# This raises TuneError because logger is init in safe zone.
|
||||
[trial] = tune.run(
|
||||
"__fake",
|
||||
name="foo",
|
||||
max_failures=0,
|
||||
**{
|
||||
"stop": {
|
||||
"training_iteration": 1
|
||||
},
|
||||
"sync_to_driver": "ls {source}"
|
||||
}).trials
|
||||
|
||||
with patch.object(CommandBasedClient, "execute") as mock_fn:
|
||||
with patch("ray.services.get_node_ip_address") as mock_sync:
|
||||
mock_sync.return_value = "0.0.0.0"
|
||||
[trial] = tune.run(
|
||||
"__fake",
|
||||
name="foo",
|
||||
max_failures=0,
|
||||
**{
|
||||
"stop": {
|
||||
"training_iteration": 1
|
||||
},
|
||||
"sync_to_driver": "echo {source} {target}"
|
||||
}).trials
|
||||
self.assertGreater(mock_fn.call_count, 0)
|
||||
|
||||
def testCloudFunctions(self):
|
||||
tmpdir = tempfile.mkdtemp()
|
||||
tmpdir2 = tempfile.mkdtemp()
|
||||
os.mkdir(os.path.join(tmpdir2, "foo"))
|
||||
|
||||
def sync_func(local, remote):
|
||||
for filename in glob.glob(os.path.join(local, "*.json")):
|
||||
shutil.copy(filename, remote)
|
||||
|
||||
[trial] = tune.run(
|
||||
"__fake",
|
||||
name="foo",
|
||||
max_failures=0,
|
||||
local_dir=tmpdir,
|
||||
stop={
|
||||
"training_iteration": 1
|
||||
},
|
||||
upload_dir=tmpdir2,
|
||||
sync_to_cloud=sync_func).trials
|
||||
test_file_path = glob.glob(os.path.join(tmpdir2, "foo", "*.json"))
|
||||
self.assertTrue(test_file_path)
|
||||
shutil.rmtree(tmpdir)
|
||||
shutil.rmtree(tmpdir2)
|
||||
|
||||
def testClusterSyncFunction(self):
|
||||
def sync_func_driver(source, target):
|
||||
assert ":" in source, "Source {} not a remote path.".format(source)
|
||||
assert ":" not in target, "Target is supposed to be local."
|
||||
with open(os.path.join(target, "test.log2"), "w") as f:
|
||||
print("writing to", f.name)
|
||||
f.write(source)
|
||||
|
||||
[trial] = tune.run(
|
||||
"__fake",
|
||||
name="foo",
|
||||
max_failures=0,
|
||||
stop={
|
||||
"training_iteration": 1
|
||||
},
|
||||
sync_to_driver=sync_func_driver).trials
|
||||
test_file_path = os.path.join(trial.logdir, "test.log2")
|
||||
self.assertFalse(os.path.exists(test_file_path))
|
||||
|
||||
with patch("ray.services.get_node_ip_address") as mock_sync:
|
||||
mock_sync.return_value = "0.0.0.0"
|
||||
[trial] = tune.run(
|
||||
"__fake",
|
||||
name="foo",
|
||||
max_failures=0,
|
||||
stop={
|
||||
"training_iteration": 1
|
||||
},
|
||||
sync_to_driver=sync_func_driver).trials
|
||||
test_file_path = os.path.join(trial.logdir, "test.log2")
|
||||
self.assertTrue(os.path.exists(test_file_path))
|
||||
os.remove(test_file_path)
|
||||
|
||||
def testNoSync(self):
|
||||
"""Sync should not run on a single node."""
|
||||
|
||||
def sync_func(source, target):
|
||||
pass
|
||||
|
||||
with patch.object(CommandBasedClient, "execute") as mock_sync:
|
||||
[trial] = tune.run(
|
||||
"__fake",
|
||||
name="foo",
|
||||
max_failures=0,
|
||||
**{
|
||||
"stop": {
|
||||
"training_iteration": 1
|
||||
},
|
||||
"sync_to_driver": sync_func
|
||||
}).trials
|
||||
self.assertEqual(mock_sync.call_count, 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
@@ -85,4 +85,6 @@ class TrackApiTest(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1194,4 +1194,6 @@ class AsyncHyperBandSuite(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -13,7 +13,7 @@ import numpy as np
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.tests.utils import recursive_fnmatch
|
||||
from ray.test_utils import recursive_fnmatch
|
||||
from ray.tune.util import validate_save_restore
|
||||
from ray.rllib import _register_all
|
||||
from ray.tune.suggest.hyperopt import HyperOptSearch
|
||||
@@ -277,4 +277,6 @@ class SigOptWarmStartTest(AbstractWarmStartTest, unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -151,4 +151,6 @@ class SerialTuneRelativeLocalDirTest(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -144,4 +144,6 @@ class TuneServerSuite(unittest.TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -0,0 +1,319 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
import unittest
|
||||
|
||||
import ray
|
||||
from ray.rllib import _register_all
|
||||
|
||||
from ray import tune
|
||||
from ray.tune.result import DEFAULT_RESULTS_DIR
|
||||
from ray.tune.experiment import Experiment
|
||||
from ray.tune.suggest import grid_search, BasicVariantGenerator
|
||||
from ray.tune.suggest.suggestion import _MockSuggestionAlgorithm
|
||||
from ray.tune.suggest.variant_generator import (RecursiveDependencyError,
|
||||
resolve_nested_dict)
|
||||
|
||||
|
||||
class VariantGeneratorTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
ray.init()
|
||||
|
||||
def tearDown(self):
|
||||
ray.shutdown()
|
||||
_register_all() # re-register the evicted objects
|
||||
|
||||
def generate_trials(self, spec, name):
|
||||
suggester = BasicVariantGenerator()
|
||||
suggester.add_configurations({name: spec})
|
||||
return suggester.next_trials()
|
||||
|
||||
def testParseToTrials(self):
|
||||
trials = self.generate_trials({
|
||||
"run": "PPO",
|
||||
"num_samples": 2,
|
||||
"max_failures": 5,
|
||||
"config": {
|
||||
"env": "Pong-v0",
|
||||
"foo": "bar"
|
||||
},
|
||||
}, "tune-pong")
|
||||
trials = list(trials)
|
||||
self.assertEqual(len(trials), 2)
|
||||
self.assertTrue("PPO_Pong-v0" in str(trials[0]))
|
||||
self.assertEqual(trials[0].config, {"foo": "bar", "env": "Pong-v0"})
|
||||
self.assertEqual(trials[0].trainable_name, "PPO")
|
||||
self.assertEqual(trials[0].experiment_tag, "0")
|
||||
self.assertEqual(trials[0].max_failures, 5)
|
||||
self.assertEqual(trials[0].evaluated_params, {})
|
||||
self.assertEqual(trials[0].local_dir,
|
||||
os.path.join(DEFAULT_RESULTS_DIR, "tune-pong"))
|
||||
self.assertEqual(trials[1].experiment_tag, "1")
|
||||
|
||||
def testEval(self):
|
||||
trials = self.generate_trials({
|
||||
"run": "PPO",
|
||||
"config": {
|
||||
"foo": {
|
||||
"eval": "2 + 2"
|
||||
},
|
||||
},
|
||||
}, "eval")
|
||||
trials = list(trials)
|
||||
self.assertEqual(len(trials), 1)
|
||||
self.assertEqual(trials[0].config, {"foo": 4})
|
||||
self.assertEqual(trials[0].evaluated_params, {"foo": 4})
|
||||
self.assertEqual(trials[0].experiment_tag, "0_foo=4")
|
||||
|
||||
def testGridSearch(self):
|
||||
trials = self.generate_trials({
|
||||
"run": "PPO",
|
||||
"config": {
|
||||
"bar": {
|
||||
"grid_search": [True, False]
|
||||
},
|
||||
"foo": {
|
||||
"grid_search": [1, 2, 3]
|
||||
},
|
||||
"baz": "asd",
|
||||
},
|
||||
}, "grid_search")
|
||||
trials = list(trials)
|
||||
self.assertEqual(len(trials), 6)
|
||||
self.assertEqual(trials[0].config, {
|
||||
"bar": True,
|
||||
"foo": 1,
|
||||
"baz": "asd",
|
||||
})
|
||||
self.assertEqual(trials[0].evaluated_params, {
|
||||
"bar": True,
|
||||
"foo": 1,
|
||||
})
|
||||
self.assertEqual(trials[0].experiment_tag, "0_bar=True,foo=1")
|
||||
|
||||
self.assertEqual(trials[1].config, {
|
||||
"bar": False,
|
||||
"foo": 1,
|
||||
"baz": "asd",
|
||||
})
|
||||
self.assertEqual(trials[1].evaluated_params, {
|
||||
"bar": False,
|
||||
"foo": 1,
|
||||
})
|
||||
self.assertEqual(trials[1].experiment_tag, "1_bar=False,foo=1")
|
||||
|
||||
self.assertEqual(trials[2].config, {
|
||||
"bar": True,
|
||||
"foo": 2,
|
||||
"baz": "asd",
|
||||
})
|
||||
self.assertEqual(trials[2].evaluated_params, {
|
||||
"bar": True,
|
||||
"foo": 2,
|
||||
})
|
||||
|
||||
self.assertEqual(trials[3].config, {
|
||||
"bar": False,
|
||||
"foo": 2,
|
||||
"baz": "asd",
|
||||
})
|
||||
self.assertEqual(trials[3].evaluated_params, {
|
||||
"bar": False,
|
||||
"foo": 2,
|
||||
})
|
||||
|
||||
self.assertEqual(trials[4].config, {
|
||||
"bar": True,
|
||||
"foo": 3,
|
||||
"baz": "asd",
|
||||
})
|
||||
self.assertEqual(trials[4].evaluated_params, {
|
||||
"bar": True,
|
||||
"foo": 3,
|
||||
})
|
||||
|
||||
self.assertEqual(trials[5].config, {
|
||||
"bar": False,
|
||||
"foo": 3,
|
||||
"baz": "asd",
|
||||
})
|
||||
self.assertEqual(trials[5].evaluated_params, {
|
||||
"bar": False,
|
||||
"foo": 3,
|
||||
})
|
||||
|
||||
def testGridSearchAndEval(self):
|
||||
trials = self.generate_trials({
|
||||
"run": "PPO",
|
||||
"config": {
|
||||
"qux": tune.sample_from(lambda spec: 2 + 2),
|
||||
"bar": grid_search([True, False]),
|
||||
"foo": grid_search([1, 2, 3]),
|
||||
"baz": "asd",
|
||||
},
|
||||
}, "grid_eval")
|
||||
trials = list(trials)
|
||||
self.assertEqual(len(trials), 6)
|
||||
self.assertEqual(trials[0].config, {
|
||||
"bar": True,
|
||||
"foo": 1,
|
||||
"qux": 4,
|
||||
"baz": "asd",
|
||||
})
|
||||
self.assertEqual(trials[0].evaluated_params, {
|
||||
"bar": True,
|
||||
"foo": 1,
|
||||
"qux": 4,
|
||||
})
|
||||
self.assertEqual(trials[0].experiment_tag, "0_bar=True,foo=1,qux=4")
|
||||
|
||||
def testConditionResolution(self):
|
||||
trials = self.generate_trials({
|
||||
"run": "PPO",
|
||||
"config": {
|
||||
"x": 1,
|
||||
"y": tune.sample_from(lambda spec: spec.config.x + 1),
|
||||
"z": tune.sample_from(lambda spec: spec.config.y + 1),
|
||||
},
|
||||
}, "condition_resolution")
|
||||
trials = list(trials)
|
||||
self.assertEqual(len(trials), 1)
|
||||
self.assertEqual(trials[0].config, {"x": 1, "y": 2, "z": 3})
|
||||
self.assertEqual(trials[0].evaluated_params, {"y": 2, "z": 3})
|
||||
self.assertEqual(trials[0].experiment_tag, "0_y=2,z=3")
|
||||
|
||||
def testDependentLambda(self):
|
||||
trials = self.generate_trials({
|
||||
"run": "PPO",
|
||||
"config": {
|
||||
"x": grid_search([1, 2]),
|
||||
"y": tune.sample_from(lambda spec: spec.config.x * 100),
|
||||
},
|
||||
}, "dependent_lambda")
|
||||
trials = list(trials)
|
||||
self.assertEqual(len(trials), 2)
|
||||
self.assertEqual(trials[0].config, {"x": 1, "y": 100})
|
||||
self.assertEqual(trials[1].config, {"x": 2, "y": 200})
|
||||
|
||||
def testDependentGridSearch(self):
|
||||
trials = self.generate_trials({
|
||||
"run": "PPO",
|
||||
"config": {
|
||||
"x": grid_search([
|
||||
tune.sample_from(lambda spec: spec.config.y * 100),
|
||||
tune.sample_from(lambda spec: spec.config.y * 200)
|
||||
]),
|
||||
"y": tune.sample_from(lambda spec: 1),
|
||||
},
|
||||
}, "dependent_grid_search")
|
||||
trials = list(trials)
|
||||
self.assertEqual(len(trials), 2)
|
||||
self.assertEqual(trials[0].config, {"x": 100, "y": 1})
|
||||
self.assertEqual(trials[1].config, {"x": 200, "y": 1})
|
||||
|
||||
def testNestedValues(self):
|
||||
trials = self.generate_trials({
|
||||
"run": "PPO",
|
||||
"config": {
|
||||
"x": {
|
||||
"y": {
|
||||
"z": tune.sample_from(lambda spec: 1)
|
||||
}
|
||||
},
|
||||
"y": tune.sample_from(lambda spec: 12),
|
||||
"z": tune.sample_from(lambda spec: spec.config.x.y.z * 100),
|
||||
},
|
||||
}, "nested_values")
|
||||
trials = list(trials)
|
||||
self.assertEqual(len(trials), 1)
|
||||
self.assertEqual(trials[0].config, {
|
||||
"x": {
|
||||
"y": {
|
||||
"z": 1
|
||||
}
|
||||
},
|
||||
"y": 12,
|
||||
"z": 100
|
||||
})
|
||||
self.assertEqual(trials[0].evaluated_params, {
|
||||
"x/y/z": 1,
|
||||
"y": 12,
|
||||
"z": 100
|
||||
})
|
||||
|
||||
def testLogUniform(self):
|
||||
sampler = tune.loguniform(1e-10, 1e-1).func
|
||||
results = [sampler(None) for i in range(1000)]
|
||||
assert abs(np.log(min(results)) / np.log(10) - -10) < 0.1
|
||||
assert abs(np.log(max(results)) / np.log(10) - -1) < 0.1
|
||||
|
||||
sampler_e = tune.loguniform(np.e**-4, np.e, base=np.e).func
|
||||
results_e = [sampler_e(None) for i in range(1000)]
|
||||
assert abs(np.log(min(results_e)) - -4) < 0.1
|
||||
assert abs(np.log(max(results_e)) - 1) < 0.1
|
||||
|
||||
def test_resolve_dict(self):
|
||||
config = {
|
||||
"a": {
|
||||
"b": 1,
|
||||
"c": 2,
|
||||
},
|
||||
"b": {
|
||||
"a": 3
|
||||
}
|
||||
}
|
||||
resolved = resolve_nested_dict(config)
|
||||
for k, v in [(("a", "b"), 1), (("a", "c"), 2), (("b", "a"), 3)]:
|
||||
self.assertEqual(resolved.get(k), v)
|
||||
|
||||
def testRecursiveDep(self):
|
||||
try:
|
||||
list(
|
||||
self.generate_trials({
|
||||
"run": "PPO",
|
||||
"config": {
|
||||
"foo": tune.sample_from(lambda spec: spec.config.foo),
|
||||
},
|
||||
}, "recursive_dep"))
|
||||
except RecursiveDependencyError as e:
|
||||
assert "`foo` recursively depends on" in str(e), e
|
||||
else:
|
||||
assert False
|
||||
|
||||
def testMaxConcurrentSuggestions(self):
|
||||
"""Checks that next_trials() supports throttling."""
|
||||
experiment_spec = {
|
||||
"run": "PPO",
|
||||
"num_samples": 6,
|
||||
}
|
||||
experiments = [Experiment.from_json("test", experiment_spec)]
|
||||
|
||||
searcher = _MockSuggestionAlgorithm(max_concurrent=4)
|
||||
searcher.add_configurations(experiments)
|
||||
trials = searcher.next_trials()
|
||||
self.assertEqual(len(trials), 4)
|
||||
self.assertEqual(searcher.next_trials(), [])
|
||||
|
||||
finished_trial = trials.pop()
|
||||
searcher.on_trial_complete(finished_trial.trial_id)
|
||||
self.assertEqual(len(searcher.next_trials()), 1)
|
||||
|
||||
finished_trial = trials.pop()
|
||||
searcher.on_trial_complete(finished_trial.trial_id)
|
||||
|
||||
finished_trial = trials.pop()
|
||||
searcher.on_trial_complete(finished_trial.trial_id)
|
||||
|
||||
finished_trial = trials.pop()
|
||||
searcher.on_trial_complete(finished_trial.trial_id)
|
||||
self.assertEqual(len(searcher.next_trials()), 1)
|
||||
self.assertEqual(len(searcher.next_trials()), 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
Reference in New Issue
Block a user