diff --git a/.travis.yml b/.travis.yml index 3295df708..ee2de146f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -44,7 +44,6 @@ matrix: # Try generating Sphinx documentation. To do this, we need to install # Ray first. - ./ci/travis/install-dependencies.sh - - export PATH="$HOME/miniconda/bin:$PATH" - cd doc - pip install -q -r requirements-doc.txt - pip install -q yapf==0.23.0 @@ -154,27 +153,14 @@ install: - ./ci/suppress_output ./ci/travis/install-ray.sh - ./ci/suppress_output ./ci/travis/install-cython-examples.sh - - ./ci/suppress_output bash src/ray/test/run_gcs_tests.sh - - # core worker test. - - ./ci/suppress_output bash src/ray/test/run_core_worker_tests.sh - - # Raylet tests. - - ./ci/suppress_output bash src/ray/test/run_object_manager_tests.sh - - ./ci/suppress_output bazel test --build_tests_only --test_lang_filters=cc //:all - # Shutdown bazel to release the memory held by bazel. - - bazel shutdown - script: - - export PATH="$HOME/miniconda/bin:$PATH" - # The following is needed so cloudpickle can find some of the - # class definitions: The main module of tests that are run - # with pytest have the same name as the test file -- and this - # module is only found if the test directory is in the PYTHONPATH. - # - export PYTHONPATH="$PYTHONPATH:./ci/" + # raylet integration tests + - ./ci/suppress_output bash src/ray/test/run_gcs_tests.sh + - ./ci/suppress_output bash src/ray/test/run_core_worker_tests.sh + - ./ci/suppress_output bash src/ray/test/run_object_manager_tests.sh - # ray tune tests, except tests tagged jenkins only. 
- - if [ $RAY_CI_TUNE_AFFECTED == "1" ]; then bazel test --spawn_strategy=local --python_version=$BAZEL_PYTHON_VERSION --incompatible_allow_python_version_transitions=false --incompatible_py3_is_default=false --show_progress_rate_limit=120 --test_output=errors --test_tag_filters=-jenkins_only python/ray/tune/...; fi + # cc bazel tests + - ./ci/suppress_output bazel test --build_tests_only --show_progress_rate_limit=100 --test_output=errors //:all # ray serve tests - if [ $RAY_CI_SERVE_AFFECTED == "1" ]; then python -c 'import sys;exit(sys.version_info>=(3,5))' || python -m pytest -v --durations=5 --timeout=300 python/ray/experimental/serve/tests; fi @@ -184,7 +170,9 @@ script: # Python3.5+ only. Otherwise we will get `SyntaxError` regardless of how we set the tester. - if [ $RAY_CI_PYTHON_AFFECTED == "1" ]; then python -c 'import sys;exit(sys.version_info>=(3,5))' || python -m pytest -v --durations=5 --timeout=300 python/ray/experimental/test/async_test.py; fi - if [ $RAY_CI_PYTHON_AFFECTED == "1" ]; then python -c 'import sys;exit(sys.version_info>=(3,5))' || python -m pytest -v --durations=5 --timeout=300 python/ray/tests/py3_test.py; fi - - if [ $RAY_CI_PYTHON_AFFECTED == "1" ]; then python -m pytest -v --durations=10 --timeout=300 python/ray/tests --ignore=python/ray/tests/perf_integration_tests --ignore=python/ray/tests/py3_test.py; fi + + # py bazel tests, run using local strategy since PY2 breaks with sandbox + - bazel test --spawn_strategy=local --python_version=$BAZEL_PYTHON_VERSION --incompatible_allow_python_version_transitions=false --incompatible_py3_is_default=false --show_progress_rate_limit=100 --test_output=errors --test_tag_filters=-jenkins_only python/ray/... deploy: - provider: s3 diff --git a/BUILD.bazel b/BUILD.bazel index 8237fd719..4ae6c0492 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -899,6 +899,17 @@ filegroup( ], ) +# This is a dummy test dependency that causes the python tests to be +# re-run if any of these files changes. 
+py_library( + name = "ray_lib", + srcs = glob( + ["python/ray/**/*.py"], + exclude = ["python/ray/tests/*.py"], + ), + visibility = ["__subpackages__"], +) + genrule( name = "ray_pkg", srcs = [ diff --git a/ci/long_running_tests/workloads/many_drivers.py b/ci/long_running_tests/workloads/many_drivers.py index f9eae2f8d..7867f8124 100644 --- a/ci/long_running_tests/workloads/many_drivers.py +++ b/ci/long_running_tests/workloads/many_drivers.py @@ -8,7 +8,7 @@ import time import ray from ray.tests.cluster_utils import Cluster -from ray.tests.utils import run_string_as_driver +from ray.test_utils import run_string_as_driver num_redis_shards = 5 redis_max_memory = 10**8 diff --git a/python/ray/tests/cluster_utils.py b/python/ray/cluster_utils.py similarity index 100% rename from python/ray/tests/cluster_utils.py rename to python/ray/cluster_utils.py diff --git a/python/ray/ray_cluster_perf.py b/python/ray/ray_cluster_perf.py index 28869f023..823d7d72a 100644 --- a/python/ray/ray_cluster_perf.py +++ b/python/ray/ray_cluster_perf.py @@ -4,7 +4,7 @@ import time import numpy as np import ray -from ray.tests.cluster_utils import Cluster +from ray.cluster_utils import Cluster def main(): diff --git a/python/ray/tests/utils.py b/python/ray/test_utils.py similarity index 96% rename from python/ray/tests/utils.py rename to python/ray/test_utils.py index 06c476e6d..a3b937259 100644 --- a/python/ray/tests/utils.py +++ b/python/ray/test_utils.py @@ -2,6 +2,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import json import fnmatch import os import subprocess @@ -170,3 +171,11 @@ def recursive_fnmatch(dirpath, pattern): for filename in fnmatch.filter(filenames, pattern): matches.append(os.path.join(root, filename)) return matches + + +def generate_internal_config_map(**kwargs): + internal_config = json.dumps(kwargs) + ray_kwargs = { + "_internal_config": internal_config, + } + return ray_kwargs diff --git 
a/python/ray/tests/BUILD b/python/ray/tests/BUILD new file mode 100644 index 000000000..78356ef82 --- /dev/null +++ b/python/ray/tests/BUILD @@ -0,0 +1,297 @@ +py_test( + name = "test_actor", + size = "large", + srcs = ["test_actor.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_actor_resources", + size = "large", + srcs = ["test_actor_resources.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_actor_failures", + size = "large", + srcs = ["test_actor_failures.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_basic", + size = "large", + srcs = ["test_basic.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_advanced", + size = "large", + srcs = ["test_advanced.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_component_failures", + size = "large", + srcs = ["test_component_failures.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_multinode_failures", + size = "large", + srcs = ["test_multinode_failures.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_stress", + size = "large", + srcs = ["test_stress.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_stress_sharded", + size = "large", + srcs = ["test_stress_sharded.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_stress_failure", + size = "large", + srcs = ["test_stress_failure.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_array", + size = "medium", + srcs = ["test_array.py"], + deps = ["//:ray_lib"], + flaky = 1, +) + +py_test( + name = "test_autoscaler", + size = "small", + srcs = ["test_autoscaler.py"], + deps = ["//:ray_lib"], + flaky = 1, +) + +py_test( + name = "test_autoscaler_yaml", + size = "small", + srcs = ["test_autoscaler_yaml.py"], + deps = 
["//:ray_lib"], +) + +py_test( + name = "test_cython", + size = "small", + srcs = ["test_cython.py"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_debug_tools", + size = "small", + srcs = ["test_debug_tools.py"], + deps = ["//:ray_lib"], + flaky = 1, +) + +py_test( + name = "test_dynres", + size = "medium", + srcs = ["test_dynres.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_failure", + size = "medium", + srcs = ["test_failure.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_garbage_collection", + size = "medium", + srcs = ["test_garbage_collection.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_global_state", + size = "medium", + srcs = ["test_global_state.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_logical_graph", + size = "medium", + srcs = ["test_logical_graph.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_memory_limits", + size = "medium", + srcs = ["test_memory_limits.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_memory_scheduling", + size = "medium", + srcs = ["test_memory_scheduling.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_metrics", + size = "small", + srcs = ["test_metrics.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_microbenchmarks", + size = "medium", + srcs = ["test_microbenchmarks.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_mini", + size = "small", + srcs = ["test_mini.py"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_monitors", + size = "medium", + srcs = ["test_monitors.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_multi_node_2", + size = "medium", + srcs = ["test_multi_node_2.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + 
+py_test( + name = "test_multi_node", + size = "medium", + srcs = ["test_multi_node.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_node_manager", + size = "small", + srcs = ["test_node_manager.py"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_object_manager", + size = "medium", + srcs = ["test_object_manager.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_projects", + size = "small", + srcs = ["test_projects.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_queue", + size = "small", + srcs = ["test_queue.py"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_ray_init", + size = "medium", + srcs = ["test_ray_init.py"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_signal", + size = "medium", + srcs = ["test_signal.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_tempfile", + size = "small", + srcs = ["test_tempfile.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_tensorflow", + size = "medium", + srcs = ["test_tensorflow.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_unreconstructable_errors", + size = "medium", + srcs = ["test_unreconstructable_errors.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) + +py_test( + name = "test_webui", + size = "medium", + srcs = ["test_webui.py"], + tags = ["exclusive"], + deps = ["//:ray_lib"], +) diff --git a/python/ray/tests/__init__.py b/python/ray/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/python/ray/tests/conftest.py b/python/ray/tests/conftest.py index b6062f2af..1476b8642 100644 --- a/python/ray/tests/conftest.py +++ b/python/ray/tests/conftest.py @@ -8,7 +8,7 @@ import pytest import subprocess import ray -from ray.tests.cluster_utils import Cluster +from ray.cluster_utils import Cluster @pytest.fixture @@ -18,14 +18,6 @@ def 
shutdown_only(): ray.shutdown() -def generate_internal_config_map(**kwargs): - internal_config = json.dumps(kwargs) - ray_kwargs = { - "_internal_config": internal_config, - } - return ray_kwargs - - def get_default_fixure_internal_config(): internal_config = json.dumps({ "initial_reconstruction_timeout_milliseconds": 200, @@ -177,7 +169,7 @@ def two_node_cluster(): "initial_reconstruction_timeout_milliseconds": 200, "num_heartbeats_timeout": 10, }) - cluster = ray.tests.cluster_utils.Cluster( + cluster = ray.cluster_utils.Cluster( head_node_args={"_internal_config": internal_config}) for _ in range(2): remote_node = cluster.add_node( diff --git a/python/ray/tests/perf_integration_tests/__init__.py b/python/ray/tests/perf_integration_tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/python/ray/tests/py3_test.py b/python/ray/tests/py3_test.py index fed76e294..e171db5e4 100644 --- a/python/ray/tests/py3_test.py +++ b/python/ray/tests/py3_test.py @@ -8,8 +8,8 @@ import threading import pytest import ray -import ray.tests.cluster_utils -import ray.tests.utils +import ray.cluster_utils +import ray.test_utils @pytest.mark.parametrize( diff --git a/python/ray/tests/test_actor.py b/python/ray/tests/test_actor.py index f31068e2b..53c8957b3 100644 --- a/python/ray/tests/test_actor.py +++ b/python/ray/tests/test_actor.py @@ -2,8 +2,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections -import json import random import numpy as np import os @@ -12,84 +10,13 @@ try: import pytest_timeout except ImportError: pytest_timeout = None -import signal import sys import time import ray -import ray.ray_constants as ray_constants -import ray.tests.utils -import ray.tests.cluster_utils -from ray.tests.conftest import generate_internal_config_map -from ray.tests.utils import (relevant_errors, wait_for_condition, - wait_for_errors, wait_for_pid_to_exit, - run_string_as_driver) - - 
-@pytest.fixture -def ray_checkpointable_actor_cls(request): - checkpoint_dir = "/tmp/ray_temp_checkpoint_dir/" - if not os.path.isdir(checkpoint_dir): - os.mkdir(checkpoint_dir) - - class CheckpointableActor(ray.actor.Checkpointable): - def __init__(self): - self.value = 0 - self.resumed_from_checkpoint = False - self.checkpoint_dir = checkpoint_dir - - def node_id(self): - return ray.worker.global_worker.node.unique_id - - def increase(self): - self.value += 1 - return self.value - - def get(self): - return self.value - - def was_resumed_from_checkpoint(self): - return self.resumed_from_checkpoint - - def get_pid(self): - return os.getpid() - - def should_checkpoint(self, checkpoint_context): - # Checkpoint the actor when value is increased to 3. - should_checkpoint = self.value == 3 - return should_checkpoint - - def save_checkpoint(self, actor_id, checkpoint_id): - actor_id, checkpoint_id = actor_id.hex(), checkpoint_id.hex() - # Save checkpoint into a file. - with open(self.checkpoint_dir + actor_id, "a+") as f: - print(checkpoint_id, self.value, file=f) - - def load_checkpoint(self, actor_id, available_checkpoints): - actor_id = actor_id.hex() - filename = self.checkpoint_dir + actor_id - # Load checkpoint from the file. 
- if not os.path.isfile(filename): - return None - - available_checkpoint_ids = [ - c.checkpoint_id for c in available_checkpoints - ] - with open(filename, "r") as f: - for line in f: - checkpoint_id, value = line.strip().split(" ") - checkpoint_id = ray.ActorCheckpointID( - ray.utils.hex_to_binary(checkpoint_id)) - if checkpoint_id in available_checkpoint_ids: - self.value = int(value) - self.resumed_from_checkpoint = True - return checkpoint_id - return None - - def checkpoint_expired(self, actor_id, checkpoint_id): - pass - - return CheckpointableActor +import ray.test_utils +import ray.cluster_utils +from ray.test_utils import run_string_as_driver def test_actor_init_error_propagated(ray_start_regular): @@ -394,7 +321,7 @@ def test_actor_class_name(ray_start_regular): assert len(actor_keys) == 1 actor_class_info = r.hgetall(actor_keys[0]) assert actor_class_info[b"class_name"] == b"Foo" - assert actor_class_info[b"module"] == b"ray.tests.test_actor" + assert b"test_actor" in actor_class_info[b"module"] def test_actor_inheritance(ray_start_regular): @@ -486,13 +413,13 @@ def test_actor_deletion(ray_start_regular): a = Actor.remote() pid = ray.get(a.getpid.remote()) a = None - ray.tests.utils.wait_for_pid_to_exit(pid) + ray.test_utils.wait_for_pid_to_exit(pid) actors = [Actor.remote() for _ in range(10)] pids = ray.get([a.getpid.remote() for a in actors]) a = None actors = None - [ray.tests.utils.wait_for_pid_to_exit(pid) for pid in pids] + [ray.test_utils.wait_for_pid_to_exit(pid) for pid in pids] @pytest.mark.skipif( @@ -510,159 +437,6 @@ def test_actor_method_deletion(ray_start_regular): assert ray.get(Actor.remote().method.remote()) == 1 -def test_actor_deletion_with_gpus(shutdown_only): - ray.init( - num_cpus=1, num_gpus=1, object_store_memory=int(150 * 1024 * 1024)) - - # When an actor that uses a GPU exits, make sure that the GPU resources - # are released. 
- - @ray.remote(num_gpus=1) - class Actor(object): - def getpid(self): - return os.getpid() - - for _ in range(5): - # If we can successfully create an actor, that means that enough - # GPU resources are available. - a = Actor.remote() - ray.get(a.getpid.remote()) - - -def test_actor_state(ray_start_regular): - @ray.remote - class Counter(object): - def __init__(self): - self.value = 0 - - def increase(self): - self.value += 1 - - def value(self): - return self.value - - c1 = Counter.remote() - c1.increase.remote() - assert ray.get(c1.value.remote()) == 1 - - c2 = Counter.remote() - c2.increase.remote() - c2.increase.remote() - assert ray.get(c2.value.remote()) == 2 - - -def test_actor_class_methods(ray_start_regular): - class Foo(object): - x = 2 - - @classmethod - def as_remote(cls): - return ray.remote(cls) - - @classmethod - def f(cls): - return cls.x - - @classmethod - def g(cls, y): - return cls.x + y - - def echo(self, value): - return value - - a = Foo.as_remote().remote() - assert ray.get(a.echo.remote(2)) == 2 - assert ray.get(a.f.remote()) == 2 - assert ray.get(a.g.remote(2)) == 4 - - -def test_resource_assignment(shutdown_only): - """Test to make sure that we assign resource to actors at instantiation.""" - # This test will create 16 actors. Declaring this many CPUs initially will - # speed up the test because the workers will be started ahead of time. 
- ray.init( - num_cpus=16, - num_gpus=1, - resources={"Custom": 1}, - object_store_memory=int(150 * 1024 * 1024)) - - class Actor(object): - def __init__(self): - self.resources = ray.get_resource_ids() - - def get_actor_resources(self): - return self.resources - - def get_actor_method_resources(self): - return ray.get_resource_ids() - - decorator_resource_args = [{}, { - "num_cpus": 0.1 - }, { - "num_gpus": 0.1 - }, { - "resources": { - "Custom": 0.1 - } - }] - instantiation_resource_args = [{}, { - "num_cpus": 0.2 - }, { - "num_gpus": 0.2 - }, { - "resources": { - "Custom": 0.2 - } - }] - for decorator_args in decorator_resource_args: - for instantiation_args in instantiation_resource_args: - if len(decorator_args) == 0: - actor_class = ray.remote(Actor) - else: - actor_class = ray.remote(**decorator_args)(Actor) - actor = actor_class._remote(**instantiation_args) - actor_resources = ray.get(actor.get_actor_resources.remote()) - actor_method_resources = ray.get( - actor.get_actor_method_resources.remote()) - if len(decorator_args) == 0 and len(instantiation_args) == 0: - assert len(actor_resources) == 0, ( - "Actor should not be assigned resources.") - assert list(actor_method_resources.keys()) == [ - "CPU" - ], ("Actor method should only have CPUs") - assert actor_method_resources["CPU"][0][1] == 1, ( - "Actor method should default to one cpu.") - else: - if ("num_cpus" not in decorator_args - and "num_cpus" not in instantiation_args): - assert actor_resources["CPU"][0][1] == 1, ( - "Actor should default to one cpu.") - correct_resources = {} - defined_resources = decorator_args.copy() - defined_resources.update(instantiation_args) - for resource, value in defined_resources.items(): - if resource == "num_cpus": - correct_resources["CPU"] = value - elif resource == "num_gpus": - correct_resources["GPU"] = value - elif resource == "resources": - for custom_resource, amount in value.items(): - correct_resources[custom_resource] = amount - for resource, amount in 
correct_resources.items(): - assert (actor_resources[resource][0][0] == - actor_method_resources[resource][0][0]), ( - "Should have assigned same {} for both actor ", - "and actor method.".format(resource)) - assert (actor_resources[resource][0][ - 1] == actor_method_resources[resource][0][1]), ( - "Should have assigned same amount of {} for both ", - "actor and actor method.".format(resource)) - assert actor_resources[resource][0][1] == amount, ( - "Actor should have {amount} {resource} but has ", - "{amount} {resource}".format( - amount=amount, resource=resource)) - - def test_multiple_actors(ray_start_regular): @ray.remote class Counter(object): @@ -993,469 +767,6 @@ def test_actor_lifetime_load_balancing(ray_start_cluster): ray.get([actor.ping.remote() for actor in actors]) -@pytest.mark.skipif( - os.environ.get("RAY_USE_NEW_GCS") == "on", - reason="Failing with new GCS API on Linux.") -def test_actor_gpus(ray_start_cluster): - cluster = ray_start_cluster - num_nodes = 3 - num_gpus_per_raylet = 4 - for i in range(num_nodes): - cluster.add_node( - num_cpus=10 * num_gpus_per_raylet, num_gpus=num_gpus_per_raylet) - ray.init(address=cluster.address) - - @ray.remote(num_gpus=1) - class Actor1(object): - def __init__(self): - self.gpu_ids = ray.get_gpu_ids() - - def get_location_and_ids(self): - assert ray.get_gpu_ids() == self.gpu_ids - return (ray.worker.global_worker.node.unique_id, - tuple(self.gpu_ids)) - - # Create one actor per GPU. - actors = [Actor1.remote() for _ in range(num_nodes * num_gpus_per_raylet)] - # Make sure that no two actors are assigned to the same GPU. 
- locations_and_ids = ray.get( - [actor.get_location_and_ids.remote() for actor in actors]) - node_names = {location for location, gpu_id in locations_and_ids} - assert len(node_names) == num_nodes - location_actor_combinations = [] - for node_name in node_names: - for gpu_id in range(num_gpus_per_raylet): - location_actor_combinations.append((node_name, (gpu_id, ))) - assert set(locations_and_ids) == set(location_actor_combinations) - - # Creating a new actor should fail because all of the GPUs are being - # used. - a = Actor1.remote() - ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01) - assert ready_ids == [] - - -def test_actor_multiple_gpus(ray_start_cluster): - cluster = ray_start_cluster - num_nodes = 3 - num_gpus_per_raylet = 5 - for i in range(num_nodes): - cluster.add_node( - num_cpus=10 * num_gpus_per_raylet, num_gpus=num_gpus_per_raylet) - ray.init(address=cluster.address) - - @ray.remote(num_gpus=2) - class Actor1(object): - def __init__(self): - self.gpu_ids = ray.get_gpu_ids() - - def get_location_and_ids(self): - assert ray.get_gpu_ids() == self.gpu_ids - return (ray.worker.global_worker.node.unique_id, - tuple(self.gpu_ids)) - - # Create some actors. - actors1 = [Actor1.remote() for _ in range(num_nodes * 2)] - # Make sure that no two actors are assigned to the same GPU. - locations_and_ids = ray.get( - [actor.get_location_and_ids.remote() for actor in actors1]) - node_names = {location for location, gpu_id in locations_and_ids} - assert len(node_names) == num_nodes - - # Keep track of which GPU IDs are being used for each location. - gpus_in_use = {node_name: [] for node_name in node_names} - for location, gpu_ids in locations_and_ids: - gpus_in_use[location].extend(gpu_ids) - for node_name in node_names: - assert len(set(gpus_in_use[node_name])) == 4 - - # Creating a new actor should fail because all of the GPUs are being - # used. 
- a = Actor1.remote() - ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01) - assert ready_ids == [] - - # We should be able to create more actors that use only a single GPU. - @ray.remote(num_gpus=1) - class Actor2(object): - def __init__(self): - self.gpu_ids = ray.get_gpu_ids() - - def get_location_and_ids(self): - return (ray.worker.global_worker.node.unique_id, - tuple(self.gpu_ids)) - - # Create some actors. - actors2 = [Actor2.remote() for _ in range(num_nodes)] - # Make sure that no two actors are assigned to the same GPU. - locations_and_ids = ray.get( - [actor.get_location_and_ids.remote() for actor in actors2]) - names = {location for location, gpu_id in locations_and_ids} - assert node_names == names - for location, gpu_ids in locations_and_ids: - gpus_in_use[location].extend(gpu_ids) - for node_name in node_names: - assert len(gpus_in_use[node_name]) == 5 - assert set(gpus_in_use[node_name]) == set(range(5)) - - # Creating a new actor should fail because all of the GPUs are being - # used. - a = Actor2.remote() - ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01) - assert ready_ids == [] - - -def test_actor_different_numbers_of_gpus(ray_start_cluster): - # Test that we can create actors on two nodes that have different - # numbers of GPUs. - cluster = ray_start_cluster - cluster.add_node(num_cpus=10, num_gpus=0) - cluster.add_node(num_cpus=10, num_gpus=5) - cluster.add_node(num_cpus=10, num_gpus=10) - ray.init(address=cluster.address) - - @ray.remote(num_gpus=1) - class Actor1(object): - def __init__(self): - self.gpu_ids = ray.get_gpu_ids() - - def get_location_and_ids(self): - return (ray.worker.global_worker.node.unique_id, - tuple(self.gpu_ids)) - - # Create some actors. - actors = [Actor1.remote() for _ in range(0 + 5 + 10)] - # Make sure that no two actors are assigned to the same GPU. 
- locations_and_ids = ray.get( - [actor.get_location_and_ids.remote() for actor in actors]) - node_names = {location for location, gpu_id in locations_and_ids} - assert len(node_names) == 2 - for node_name in node_names: - node_gpu_ids = [ - gpu_id for location, gpu_id in locations_and_ids - if location == node_name - ] - assert len(node_gpu_ids) in [5, 10] - assert set(node_gpu_ids) == {(i, ) for i in range(len(node_gpu_ids))} - - # Creating a new actor should fail because all of the GPUs are being - # used. - a = Actor1.remote() - ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01) - assert ready_ids == [] - - -def test_actor_multiple_gpus_from_multiple_tasks(ray_start_cluster): - cluster = ray_start_cluster - num_nodes = 5 - num_gpus_per_raylet = 5 - for i in range(num_nodes): - cluster.add_node( - num_cpus=10 * num_gpus_per_raylet, - num_gpus=num_gpus_per_raylet, - _internal_config=json.dumps({ - "num_heartbeats_timeout": 1000 - })) - ray.init(address=cluster.address) - - @ray.remote - def create_actors(i, n): - @ray.remote(num_gpus=1) - class Actor(object): - def __init__(self, i, j): - self.gpu_ids = ray.get_gpu_ids() - - def get_location_and_ids(self): - return ((ray.worker.global_worker.node.unique_id), - tuple(self.gpu_ids)) - - def sleep(self): - time.sleep(100) - - # Create n actors. - actors = [] - for j in range(n): - actors.append(Actor.remote(i, j)) - - locations = ray.get( - [actor.get_location_and_ids.remote() for actor in actors]) - - # Put each actor to sleep for a long time to prevent them from getting - # terminated. - for actor in actors: - actor.sleep.remote() - - return locations - - all_locations = ray.get([ - create_actors.remote(i, num_gpus_per_raylet) for i in range(num_nodes) - ]) - - # Make sure that no two actors are assigned to the same GPU. 
- node_names = { - location - for locations in all_locations for location, gpu_id in locations - } - assert len(node_names) == num_nodes - - # Keep track of which GPU IDs are being used for each location. - gpus_in_use = {node_name: [] for node_name in node_names} - for locations in all_locations: - for location, gpu_ids in locations: - gpus_in_use[location].extend(gpu_ids) - for node_name in node_names: - assert len(set(gpus_in_use[node_name])) == num_gpus_per_raylet - - @ray.remote(num_gpus=1) - class Actor(object): - def __init__(self): - self.gpu_ids = ray.get_gpu_ids() - - def get_location_and_ids(self): - return (ray.worker.global_worker.node.unique_id, - tuple(self.gpu_ids)) - - # All the GPUs should be used up now. - a = Actor.remote() - ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01) - assert ready_ids == [] - - -@pytest.mark.skipif( - sys.version_info < (3, 0), reason="This test requires Python 3.") -def test_actors_and_tasks_with_gpus(ray_start_cluster): - cluster = ray_start_cluster - num_nodes = 3 - num_gpus_per_raylet = 2 - for i in range(num_nodes): - cluster.add_node( - num_cpus=num_gpus_per_raylet, num_gpus=num_gpus_per_raylet) - ray.init(address=cluster.address) - - def check_intervals_non_overlapping(list_of_intervals): - for i in range(len(list_of_intervals)): - for j in range(i): - first_interval = list_of_intervals[i] - second_interval = list_of_intervals[j] - # Check that list_of_intervals[i] and list_of_intervals[j] - # don't overlap. 
- assert first_interval[0] < first_interval[1] - assert second_interval[0] < second_interval[1] - intervals_nonoverlapping = ( - first_interval[1] <= second_interval[0] - or second_interval[1] <= first_interval[0]) - assert intervals_nonoverlapping, ( - "Intervals {} and {} are overlapping.".format( - first_interval, second_interval)) - - @ray.remote(num_gpus=1) - def f1(): - t1 = time.monotonic() - time.sleep(0.1) - t2 = time.monotonic() - gpu_ids = ray.get_gpu_ids() - assert len(gpu_ids) == 1 - assert gpu_ids[0] in range(num_gpus_per_raylet) - return (ray.worker.global_worker.node.unique_id, tuple(gpu_ids), - [t1, t2]) - - @ray.remote(num_gpus=2) - def f2(): - t1 = time.monotonic() - time.sleep(0.1) - t2 = time.monotonic() - gpu_ids = ray.get_gpu_ids() - assert len(gpu_ids) == 2 - assert gpu_ids[0] in range(num_gpus_per_raylet) - assert gpu_ids[1] in range(num_gpus_per_raylet) - return (ray.worker.global_worker.node.unique_id, tuple(gpu_ids), - [t1, t2]) - - @ray.remote(num_gpus=1) - class Actor1(object): - def __init__(self): - self.gpu_ids = ray.get_gpu_ids() - assert len(self.gpu_ids) == 1 - assert self.gpu_ids[0] in range(num_gpus_per_raylet) - - def get_location_and_ids(self): - assert ray.get_gpu_ids() == self.gpu_ids - return (ray.worker.global_worker.node.unique_id, - tuple(self.gpu_ids)) - - def locations_to_intervals_for_many_tasks(): - # Launch a bunch of GPU tasks. - locations_ids_and_intervals = ray.get( - [f1.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)] + - [f2.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)] + - [f1.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)]) - - locations_to_intervals = collections.defaultdict(lambda: []) - for location, gpu_ids, interval in locations_ids_and_intervals: - for gpu_id in gpu_ids: - locations_to_intervals[(location, gpu_id)].append(interval) - return locations_to_intervals - - # Run a bunch of GPU tasks. 
- locations_to_intervals = locations_to_intervals_for_many_tasks() - # For each GPU, verify that the set of tasks that used this specific - # GPU did not overlap in time. - for locations in locations_to_intervals: - check_intervals_non_overlapping(locations_to_intervals[locations]) - - # Create an actor that uses a GPU. - a = Actor1.remote() - actor_location = ray.get(a.get_location_and_ids.remote()) - actor_location = (actor_location[0], actor_location[1][0]) - # This check makes sure that actor_location is formatted the same way - # that the keys of locations_to_intervals are formatted. - assert actor_location in locations_to_intervals - - # Run a bunch of GPU tasks. - locations_to_intervals = locations_to_intervals_for_many_tasks() - # For each GPU, verify that the set of tasks that used this specific - # GPU did not overlap in time. - for locations in locations_to_intervals: - check_intervals_non_overlapping(locations_to_intervals[locations]) - # Make sure that the actor's GPU was not used. - assert actor_location not in locations_to_intervals - - # Create more actors to fill up all the GPUs. - more_actors = [ - Actor1.remote() for _ in range(num_nodes * num_gpus_per_raylet - 1) - ] - # Wait for the actors to finish being created. - ray.get([actor.get_location_and_ids.remote() for actor in more_actors]) - - # Now if we run some GPU tasks, they should not be scheduled. - results = [f1.remote() for _ in range(30)] - ready_ids, remaining_ids = ray.wait(results, timeout=1.0) - assert len(ready_ids) == 0 - - -def test_actors_and_tasks_with_gpus_version_two(shutdown_only): - # Create tasks and actors that both use GPUs and make sure that they - # are given different GPUs - num_gpus = 4 - - ray.init( - num_cpus=(num_gpus + 1), - num_gpus=num_gpus, - object_store_memory=int(150 * 1024 * 1024)) - - # The point of this actor is to record which GPU IDs have been seen. 
We - # can't just return them from the tasks, because the tasks don't return - # for a long time in order to make sure the GPU is not released - # prematurely. - @ray.remote - class RecordGPUs(object): - def __init__(self): - self.gpu_ids_seen = [] - self.num_calls = 0 - - def add_ids(self, gpu_ids): - self.gpu_ids_seen += gpu_ids - self.num_calls += 1 - - def get_gpu_ids_and_calls(self): - return self.gpu_ids_seen, self.num_calls - - @ray.remote(num_gpus=1) - def f(record_gpu_actor): - gpu_ids = ray.get_gpu_ids() - assert len(gpu_ids) == 1 - record_gpu_actor.add_ids.remote(gpu_ids) - # Sleep for a long time so that the GPU never gets released. This task - # will be killed by ray.shutdown() before it actually finishes. - time.sleep(1000) - - @ray.remote(num_gpus=1) - class Actor(object): - def __init__(self, record_gpu_actor): - self.gpu_ids = ray.get_gpu_ids() - assert len(self.gpu_ids) == 1 - record_gpu_actor.add_ids.remote(self.gpu_ids) - - def check_gpu_ids(self): - assert ray.get_gpu_ids() == self.gpu_ids - - record_gpu_actor = RecordGPUs.remote() - - actors = [] - actor_results = [] - for _ in range(num_gpus // 2): - f.remote(record_gpu_actor) - a = Actor.remote(record_gpu_actor) - actor_results.append(a.check_gpu_ids.remote()) - # Prevent the actor handle from going out of scope so that its GPU - # resources don't get released. - actors.append(a) - - # Make sure that the actor method calls succeeded. 
- ray.get(actor_results) - - start_time = time.time() - while time.time() - start_time < 30: - seen_gpu_ids, num_calls = ray.get( - record_gpu_actor.get_gpu_ids_and_calls.remote()) - if num_calls == num_gpus: - break - assert set(seen_gpu_ids) == set(range(num_gpus)) - - -def test_blocking_actor_task(shutdown_only): - ray.init( - num_cpus=1, num_gpus=1, object_store_memory=int(150 * 1024 * 1024)) - - @ray.remote(num_gpus=1) - def f(): - return 1 - - @ray.remote - class Foo(object): - def __init__(self): - pass - - def blocking_method(self): - ray.get(f.remote()) - - # Make sure we can execute a blocking actor method even if there is - # only one CPU. - actor = Foo.remote() - ray.get(actor.blocking_method.remote()) - - @ray.remote(num_cpus=1) - class CPUFoo(object): - def __init__(self): - pass - - def blocking_method(self): - ray.get(f.remote()) - - # Make sure that lifetime CPU resources are not released when actors - # block. - actor = CPUFoo.remote() - x_id = actor.blocking_method.remote() - ready_ids, remaining_ids = ray.wait([x_id], timeout=1.0) - assert ready_ids == [] - assert remaining_ids == [x_id] - - @ray.remote(num_gpus=1) - class GPUFoo(object): - def __init__(self): - pass - - def blocking_method(self): - ray.get(f.remote()) - - # Make sure that GPU resources are not released when actors block. 
- actor = GPUFoo.remote() - x_id = actor.blocking_method.remote() - ready_ids, remaining_ids = ray.wait([x_id], timeout=1.0) - assert ready_ids == [] - assert remaining_ids == [x_id] - - def test_exception_raised_when_actor_node_dies(ray_start_cluster_head): cluster = ray_start_cluster_head remote_node = cluster.add_node() @@ -2081,746 +1392,6 @@ def test_register_and_get_named_actors(ray_start_regular): assert ray.get(f2.method.remote()) == 4 -@pytest.mark.skipif( - sys.version_info < (3, 0), - reason="This test is currently failing on Python 2.7.") -def test_lifetime_and_transient_resources(ray_start_regular): - # This actor acquires resources only when running methods. - @ray.remote - class Actor1(object): - def method(self): - pass - - # This actor acquires resources for its lifetime. - @ray.remote(num_cpus=1) - class Actor2(object): - def method(self): - pass - - actor1s = [Actor1.remote() for _ in range(10)] - ray.get([a.method.remote() for a in actor1s]) - - actor2s = [Actor2.remote() for _ in range(2)] - results = [a.method.remote() for a in actor2s] - ready_ids, remaining_ids = ray.wait( - results, num_returns=len(results), timeout=5.0) - assert len(ready_ids) == 1 - - -def test_custom_label_placement(ray_start_cluster): - cluster = ray_start_cluster - cluster.add_node(num_cpus=2, resources={"CustomResource1": 2}) - cluster.add_node(num_cpus=2, resources={"CustomResource2": 2}) - ray.init(address=cluster.address) - - @ray.remote(resources={"CustomResource1": 1}) - class ResourceActor1(object): - def get_location(self): - return ray.worker.global_worker.node.unique_id - - @ray.remote(resources={"CustomResource2": 1}) - class ResourceActor2(object): - def get_location(self): - return ray.worker.global_worker.node.unique_id - - node_id = ray.worker.global_worker.node.unique_id - - # Create some actors. 
- actors1 = [ResourceActor1.remote() for _ in range(2)] - actors2 = [ResourceActor2.remote() for _ in range(2)] - locations1 = ray.get([a.get_location.remote() for a in actors1]) - locations2 = ray.get([a.get_location.remote() for a in actors2]) - for location in locations1: - assert location == node_id - for location in locations2: - assert location != node_id - - -def test_creating_more_actors_than_resources(shutdown_only): - ray.init(num_cpus=10, num_gpus=2, resources={"CustomResource1": 1}) - - @ray.remote(num_gpus=1) - class ResourceActor1(object): - def method(self): - return ray.get_gpu_ids()[0] - - @ray.remote(resources={"CustomResource1": 1}) - class ResourceActor2(object): - def method(self): - pass - - # Make sure the first two actors get created and the third one does - # not. - actor1 = ResourceActor1.remote() - result1 = actor1.method.remote() - ray.wait([result1]) - actor2 = ResourceActor1.remote() - result2 = actor2.method.remote() - ray.wait([result2]) - actor3 = ResourceActor1.remote() - result3 = actor3.method.remote() - ready_ids, _ = ray.wait([result3], timeout=0.2) - assert len(ready_ids) == 0 - - # By deleting actor1, we free up resources to create actor3. - del actor1 - - results = ray.get([result1, result2, result3]) - assert results[0] == results[2] - assert set(results) == {0, 1} - - # Make sure that when one actor goes out of scope a new actor is - # created because some resources have been freed up. - results = [] - for _ in range(3): - actor = ResourceActor2.remote() - object_id = actor.method.remote() - results.append(object_id) - # Wait for the task to execute. We do this because otherwise it may - # be possible for the __ray_terminate__ task to execute before the - # method. 
- ray.wait([object_id]) - - ray.get(results) - - -@pytest.mark.parametrize( - "ray_start_object_store_memory", [150 * 1024 * 1024], indirect=True) -def test_actor_eviction(ray_start_object_store_memory): - object_store_memory = ray_start_object_store_memory - - @ray.remote - class Actor(object): - def __init__(self): - pass - - def create_object(self, size): - return np.random.rand(size) - - a = Actor.remote() - # Submit enough methods on the actor so that they exceed the size of the - # object store. - objects = [] - num_objects = 20 - for _ in range(num_objects): - obj = a.create_object.remote(object_store_memory // num_objects) - objects.append(obj) - # Get each object once to make sure each object gets created. - ray.get(obj) - - # Get each object again. At this point, the earlier objects should have - # been evicted. - num_evicted, num_success = 0, 0 - for obj in objects: - try: - val = ray.get(obj) - assert isinstance(val, np.ndarray), val - num_success += 1 - except ray.exceptions.UnreconstructableError: - num_evicted += 1 - # Some objects should have been evicted, and some should still be in the - # object store. - assert num_evicted > 0 - assert num_success > 0 - - -def test_actor_reconstruction(ray_start_regular): - """Test actor reconstruction when actor process is killed.""" - - @ray.remote(max_reconstructions=1) - class ReconstructableActor(object): - """An actor that will be reconstructed at most once.""" - - def __init__(self): - self.value = 0 - - def increase(self, delay=0): - time.sleep(delay) - self.value += 1 - return self.value - - def get_pid(self): - return os.getpid() - - actor = ReconstructableActor.remote() - pid = ray.get(actor.get_pid.remote()) - # Call increase 3 times - for _ in range(3): - ray.get(actor.increase.remote()) - # Call increase again with some delay. - result = actor.increase.remote(delay=0.5) - # Sleep some time to wait for the above task to start execution. 
- time.sleep(0.2) - # Kill actor process, while the above task is still being executed. - os.kill(pid, signal.SIGKILL) - # Check that the above task didn't fail and the actor is reconstructed. - assert ray.get(result) == 4 - # Check that we can still call the actor. - assert ray.get(actor.increase.remote()) == 5 - # kill actor process one more time. - pid = ray.get(actor.get_pid.remote()) - os.kill(pid, signal.SIGKILL) - # The actor has exceeded max reconstructions, and this task should fail. - with pytest.raises(ray.exceptions.RayActorError): - ray.get(actor.increase.remote()) - - # Create another actor. - actor = ReconstructableActor.remote() - # Intentionlly exit the actor - actor.__ray_terminate__.remote() - # Check that the actor won't be reconstructed. - with pytest.raises(ray.exceptions.RayActorError): - ray.get(actor.increase.remote()) - - -def test_actor_reconstruction_without_task(ray_start_regular): - """Test a dead actor can be reconstructed without sending task to it.""" - - @ray.remote(max_reconstructions=1) - class ReconstructableActor(object): - def __init__(self, obj_ids): - for obj_id in obj_ids: - # Every time the actor gets constructed, - # put a new object in plasma store. - global_worker = ray.worker.global_worker - if not global_worker.core_worker.object_exists(obj_id): - global_worker.put_object(1, obj_id) - break - - def get_pid(self): - return os.getpid() - - obj_ids = [ray.ObjectID.from_random() for _ in range(2)] - actor = ReconstructableActor.remote(obj_ids) - # Kill the actor. - pid = ray.get(actor.get_pid.remote()) - os.kill(pid, signal.SIGKILL) - # Wait until the actor is reconstructed. 
- assert wait_for_condition( - lambda: ray.worker.global_worker.core_worker.object_exists(obj_ids[1]), - timeout_ms=5000) - - -def test_actor_reconstruction_on_node_failure(ray_start_cluster_head): - """Test actor reconstruction when node dies unexpectedly.""" - cluster = ray_start_cluster_head - max_reconstructions = 3 - # Add a few nodes to the cluster. - # Use custom resource to make sure the actor is only created on worker - # nodes, not on the head node. - for _ in range(max_reconstructions + 2): - cluster.add_node( - resources={"a": 1}, - _internal_config=json.dumps({ - "initial_reconstruction_timeout_milliseconds": 200, - "num_heartbeats_timeout": 10, - }), - ) - - def kill_node(node_id): - node_to_remove = None - for node in cluster.worker_nodes: - if node_id == node.unique_id: - node_to_remove = node - cluster.remove_node(node_to_remove) - - @ray.remote(max_reconstructions=max_reconstructions, resources={"a": 1}) - class MyActor(object): - def __init__(self): - self.value = 0 - - def increase(self): - self.value += 1 - return self.value - - def get_object_store_socket(self): - return ray.worker.global_worker.node.unique_id - - actor = MyActor.remote() - # Call increase 3 times. - for _ in range(3): - ray.get(actor.increase.remote()) - - for i in range(max_reconstructions): - object_store_socket = ray.get(actor.get_object_store_socket.remote()) - # Kill actor's node and the actor should be reconstructed - # on a different node. - kill_node(object_store_socket) - # Call increase again. - # Check that the actor is reconstructed and value is correct. - assert ray.get(actor.increase.remote()) == 4 + i - # Check that the actor is now on a different node. - assert object_store_socket != ray.get( - actor.get_object_store_socket.remote()) - - # kill the node again. - object_store_socket = ray.get(actor.get_object_store_socket.remote()) - kill_node(object_store_socket) - # The actor has exceeded max reconstructions, and this task should fail. 
- with pytest.raises(ray.exceptions.RayActorError): - ray.get(actor.increase.remote()) - - -# NOTE(hchen): we set initial_reconstruction_timeout_milliseconds to 1s for -# this test. Because if this value is too small, suprious task reconstruction -# may happen and cause the test fauilure. If the value is too large, this test -# could be very slow. We can remove this once we support dynamic timeout. -@pytest.mark.parametrize( - "ray_start_cluster_head", [ - generate_internal_config_map( - initial_reconstruction_timeout_milliseconds=1000) - ], - indirect=True) -def test_multiple_actor_reconstruction(ray_start_cluster_head): - cluster = ray_start_cluster_head - # This test can be made more stressful by increasing the numbers below. - # The total number of actors created will be - # num_actors_at_a_time * num_nodes. - num_nodes = 5 - num_actors_at_a_time = 3 - num_function_calls_at_a_time = 10 - - worker_nodes = [ - cluster.add_node( - num_cpus=3, - _internal_config=json.dumps({ - "initial_reconstruction_timeout_milliseconds": 200, - "num_heartbeats_timeout": 10, - })) for _ in range(num_nodes) - ] - - @ray.remote(max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION) - class SlowCounter(object): - def __init__(self): - self.x = 0 - - def inc(self, duration): - time.sleep(duration) - self.x += 1 - return self.x - - # Create some initial actors. - actors = [SlowCounter.remote() for _ in range(num_actors_at_a_time)] - - # Wait for the actors to start up. - time.sleep(1) - - # This is a mapping from actor handles to object IDs returned by - # methods on that actor. - result_ids = collections.defaultdict(lambda: []) - - # In a loop we are going to create some actors, run some methods, kill - # a raylet, and run some more methods. - for node in worker_nodes: - # Create some actors. - actors.extend( - [SlowCounter.remote() for _ in range(num_actors_at_a_time)]) - # Run some methods. 
- for j in range(len(actors)): - actor = actors[j] - for _ in range(num_function_calls_at_a_time): - result_ids[actor].append(actor.inc.remote(j**2 * 0.000001)) - # Kill a node. - cluster.remove_node(node) - - # Run some more methods. - for j in range(len(actors)): - actor = actors[j] - for _ in range(num_function_calls_at_a_time): - result_ids[actor].append(actor.inc.remote(j**2 * 0.000001)) - - # Get the results and check that they have the correct values. - for _, result_id_list in result_ids.items(): - results = list(range(1, len(result_id_list) + 1)) - assert ray.get(result_id_list) == results - - -def kill_actor(actor): - """A helper function that kills an actor process.""" - pid = ray.get(actor.get_pid.remote()) - os.kill(pid, signal.SIGKILL) - wait_for_pid_to_exit(pid) - - -def test_checkpointing(ray_start_regular, ray_checkpointable_actor_cls): - """Test actor checkpointing and restoring from a checkpoint.""" - actor = ray.remote( - max_reconstructions=2)(ray_checkpointable_actor_cls).remote() - # Call increase 3 times, triggering a checkpoint. - expected = 0 - for _ in range(3): - ray.get(actor.increase.remote()) - expected += 1 - # Assert that the actor wasn't resumed from a checkpoint. - assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False - # Kill actor process. - kill_actor(actor) - # Assert that the actor was resumed from a checkpoint and its value is - # still correct. - assert ray.get(actor.get.remote()) == expected - assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True - - # Submit some more tasks. These should get replayed since they happen after - # the checkpoint. - for _ in range(3): - ray.get(actor.increase.remote()) - expected += 1 - # Kill actor again and check that reconstruction still works after the - # actor resuming from a checkpoint. 
- kill_actor(actor) - assert ray.get(actor.get.remote()) == expected - assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True - - -def test_remote_checkpointing(ray_start_regular, ray_checkpointable_actor_cls): - """Test checkpointing of a remote actor through method invocation.""" - - # Define a class that exposes a method to save checkpoints. - class RemoteCheckpointableActor(ray_checkpointable_actor_cls): - def __init__(self): - super(RemoteCheckpointableActor, self).__init__() - self._should_checkpoint = False - - def checkpoint(self): - self._should_checkpoint = True - - def should_checkpoint(self, checkpoint_context): - should_checkpoint = self._should_checkpoint - self._should_checkpoint = False - return should_checkpoint - - cls = ray.remote(max_reconstructions=2)(RemoteCheckpointableActor) - actor = cls.remote() - # Call increase 3 times. - expected = 0 - for _ in range(3): - ray.get(actor.increase.remote()) - expected += 1 - # Call a checkpoint task. - actor.checkpoint.remote() - # Assert that the actor wasn't resumed from a checkpoint. - assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False - # Kill actor process. - kill_actor(actor) - # Assert that the actor was resumed from a checkpoint and its value is - # still correct. - assert ray.get(actor.get.remote()) == expected - assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True - - # Submit some more tasks. These should get replayed since they happen after - # the checkpoint. - for _ in range(3): - ray.get(actor.increase.remote()) - expected += 1 - # Kill actor again and check that reconstruction still works after the - # actor resuming from a checkpoint. 
- kill_actor(actor) - assert ray.get(actor.get.remote()) == expected - assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True - - -def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes, - ray_checkpointable_actor_cls): - """Test actor checkpointing on a remote node.""" - # Place the actor on the remote node. - cluster = ray_start_cluster_2_nodes - remote_node = list(cluster.worker_nodes) - actor_cls = ray.remote(max_reconstructions=1)(ray_checkpointable_actor_cls) - actor = actor_cls.remote() - while (ray.get(actor.node_id.remote()) != remote_node[0].unique_id): - actor = actor_cls.remote() - - # Call increase several times. - expected = 0 - for _ in range(6): - ray.get(actor.increase.remote()) - expected += 1 - # Assert that the actor wasn't resumed from a checkpoint. - assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False - # Kill actor process. - cluster.remove_node(remote_node[0]) - # Assert that the actor was resumed from a checkpoint and its value is - # still correct. - assert ray.get(actor.get.remote()) == expected - assert ray.get(actor.was_resumed_from_checkpoint.remote()) is True - - -def test_checkpointing_save_exception(ray_start_regular, - ray_checkpointable_actor_cls): - """Test actor can still be recovered if checkpoints fail to complete.""" - - @ray.remote(max_reconstructions=2) - class RemoteCheckpointableActor(ray_checkpointable_actor_cls): - def save_checkpoint(self, actor_id, checkpoint_context): - raise Exception("Intentional error saving checkpoint.") - - actor = RemoteCheckpointableActor.remote() - # Call increase 3 times, triggering a checkpoint that will fail. - expected = 0 - for _ in range(3): - ray.get(actor.increase.remote()) - expected += 1 - # Assert that the actor wasn't resumed from a checkpoint. - assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False - # Kill actor process. 
- kill_actor(actor) - # Assert that the actor still wasn't resumed from a checkpoint and its - # value is still correct. - assert ray.get(actor.get.remote()) == expected - assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False - - # Submit some more tasks. These should get replayed since they happen after - # the checkpoint. - for _ in range(3): - ray.get(actor.increase.remote()) - expected += 1 - # Kill actor again, and check that reconstruction still works and the actor - # wasn't resumed from a checkpoint. - kill_actor(actor) - assert ray.get(actor.get.remote()) == expected - assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False - - # Check that the checkpoint error was pushed to the driver. - wait_for_errors(ray_constants.CHECKPOINT_PUSH_ERROR, 1) - - -def test_checkpointing_load_exception(ray_start_regular, - ray_checkpointable_actor_cls): - """Test actor can still be recovered if checkpoints fail to load.""" - - @ray.remote(max_reconstructions=2) - class RemoteCheckpointableActor(ray_checkpointable_actor_cls): - def load_checkpoint(self, actor_id, checkpoints): - raise Exception("Intentional error loading checkpoint.") - - actor = RemoteCheckpointableActor.remote() - # Call increase 3 times, triggering a checkpoint that will succeed. - expected = 0 - for _ in range(3): - ray.get(actor.increase.remote()) - expected += 1 - # Assert that the actor wasn't resumed from a checkpoint because loading - # it failed. - assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False - # Kill actor process. - kill_actor(actor) - # Assert that the actor still wasn't resumed from a checkpoint and its - # value is still correct. - assert ray.get(actor.get.remote()) == expected - assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False - - # Submit some more tasks. These should get replayed since they happen after - # the checkpoint. 
- for _ in range(3): - ray.get(actor.increase.remote()) - expected += 1 - # Kill actor again, and check that reconstruction still works and the actor - # wasn't resumed from a checkpoint. - kill_actor(actor) - assert ray.get(actor.get.remote()) == expected - assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False - - # Check that the checkpoint error was pushed to the driver. - wait_for_errors(ray_constants.CHECKPOINT_PUSH_ERROR, 1) - - -@pytest.mark.parametrize( - "ray_start_regular", - # This overwrite currently isn't effective, - # see https://github.com/ray-project/ray/issues/3926. - [generate_internal_config_map(num_actor_checkpoints_to_keep=20)], - indirect=True, -) -def test_deleting_actor_checkpoint(ray_start_regular): - """Test deleting old actor checkpoints.""" - - @ray.remote - class CheckpointableActor(ray.actor.Checkpointable): - def __init__(self): - self.checkpoint_ids = [] - - def get_checkpoint_ids(self): - return self.checkpoint_ids - - def should_checkpoint(self, checkpoint_context): - # Save checkpoints after every task - return True - - def save_checkpoint(self, actor_id, checkpoint_id): - self.checkpoint_ids.append(checkpoint_id) - pass - - def load_checkpoint(self, actor_id, available_checkpoints): - pass - - def checkpoint_expired(self, actor_id, checkpoint_id): - assert checkpoint_id == self.checkpoint_ids[0] - del self.checkpoint_ids[0] - - actor = CheckpointableActor.remote() - for i in range(19): - assert len(ray.get(actor.get_checkpoint_ids.remote())) == i + 1 - for _ in range(20): - assert len(ray.get(actor.get_checkpoint_ids.remote())) == 20 - - -def test_bad_checkpointable_actor_class(): - """Test error raised if an actor class doesn't implement all abstract - methods in the Checkpointable interface.""" - - with pytest.raises(TypeError): - - @ray.remote - class BadCheckpointableActor(ray.actor.Checkpointable): - def should_checkpoint(self, checkpoint_context): - return True - - -def 
test_init_exception_in_checkpointable_actor(ray_start_regular, - ray_checkpointable_actor_cls): - # This test is similar to test_failure.py::test_failed_actor_init. - # This test is used to guarantee that checkpointable actor does not - # break the same logic. - error_message1 = "actor constructor failed" - error_message2 = "actor method failed" - - @ray.remote - class CheckpointableFailedActor(ray_checkpointable_actor_cls): - def __init__(self): - raise Exception(error_message1) - - def fail_method(self): - raise Exception(error_message2) - - def should_checkpoint(self, checkpoint_context): - return True - - a = CheckpointableFailedActor.remote() - - # Make sure that we get errors from a failed constructor. - wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1) - errors = relevant_errors(ray_constants.TASK_PUSH_ERROR) - assert len(errors) == 1 - assert error_message1 in errors[0]["message"] - - # Make sure that we get errors from a failed method. - a.fail_method.remote() - wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2) - errors = relevant_errors(ray_constants.TASK_PUSH_ERROR) - assert len(errors) == 2 - assert error_message1 in errors[1]["message"] - - -def test_decorated_method(ray_start_regular): - def method_invocation_decorator(f): - def new_f_invocation(args, kwargs): - # Split one argument into two. Return th kwargs without passing - # them into the actor. - return f([args[0], args[0]], {}), kwargs - - return new_f_invocation - - def method_execution_decorator(f): - def new_f_execution(self, b, c): - # Turn two arguments into one. 
- return f(self, b + c) - - new_f_execution.__ray_invocation_decorator__ = ( - method_invocation_decorator) - return new_f_execution - - @ray.remote - class Actor(object): - @method_execution_decorator - def decorated_method(self, x): - return x + 1 - - a = Actor.remote() - - object_id, extra = a.decorated_method.remote(3, kwarg=3) - assert isinstance(object_id, ray.ObjectID) - assert extra == {"kwarg": 3} - assert ray.get(object_id) == 7 # 2 * 3 + 1 - - -@pytest.mark.skipif( - pytest_timeout is None, - reason="Timeout package not installed; skipping test that may hang.") -@pytest.mark.timeout(20) -@pytest.mark.parametrize( - "ray_start_cluster", [{ - "num_cpus": 1, - "num_nodes": 2, - }], indirect=True) -def test_ray_wait_dead_actor(ray_start_cluster): - """Tests that methods completed by dead actors are returned as ready""" - cluster = ray_start_cluster - - @ray.remote(num_cpus=1) - class Actor(object): - def __init__(self): - pass - - def node_id(self): - return ray.worker.global_worker.node.unique_id - - def ping(self): - time.sleep(1) - - # Create some actors and wait for them to initialize. - num_nodes = len(cluster.list_all_nodes()) - actors = [Actor.remote() for _ in range(num_nodes)] - ray.get([actor.ping.remote() for actor in actors]) - - # Ping the actors and make sure the tasks complete. - ping_ids = [actor.ping.remote() for actor in actors] - ray.get(ping_ids) - # Evict the result from the node that we're about to kill. - remote_node = cluster.list_all_nodes()[-1] - remote_ping_id = None - for i, actor in enumerate(actors): - if ray.get(actor.node_id.remote()) == remote_node.unique_id: - remote_ping_id = ping_ids[i] - ray.internal.free([remote_ping_id], local_only=True) - cluster.remove_node(remote_node) - - # Repeatedly call ray.wait until the exception for the dead actor is - # received. 
- unready = ping_ids[:] - while unready: - _, unready = ray.wait(unready, timeout=0) - time.sleep(1) - - with pytest.raises(ray.exceptions.RayActorError): - ray.get(ping_ids) - - # Evict the result from the dead node. - ray.internal.free([remote_ping_id], local_only=True) - # Create an actor on the local node that will call ray.wait in a loop. - head_node_resource = "HEAD_NODE" - ray.experimental.set_resource(head_node_resource, 1) - - @ray.remote(num_cpus=0, resources={head_node_resource: 1}) - class ParentActor(object): - def __init__(self, ping_ids): - self.unready = ping_ids - - def wait(self): - _, self.unready = ray.wait(self.unready, timeout=0) - return len(self.unready) == 0 - - def ping(self): - return - - # Repeatedly call ray.wait through the local actor until the exception for - # the dead actor is received. - parent_actor = ParentActor.remote(ping_ids) - ray.get(parent_actor.ping.remote()) - failure_detected = False - while not failure_detected: - failure_detected = ray.get(parent_actor.wait.remote()) - - def test_detached_actor(ray_start_regular): @ray.remote class DetachedActor(object): @@ -2853,3 +1424,9 @@ ray.get(actor.ping.remote()) run_string_as_driver(driver_script) detached_actor = ray.experimental.get_actor(actor_name) assert ray.get(detached_actor.ping.remote()) == "pong" + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_actor_failures.py b/python/ray/tests/test_actor_failures.py new file mode 100644 index 000000000..a0d07b6f8 --- /dev/null +++ b/python/ray/tests/test_actor_failures.py @@ -0,0 +1,732 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import json +import numpy as np +import os +import pytest +try: + import pytest_timeout +except ImportError: + pytest_timeout = None +import signal +import sys +import time + +import ray +import ray.ray_constants as 
ray_constants +import ray.test_utils +import ray.cluster_utils +from ray.test_utils import (relevant_errors, wait_for_condition, + wait_for_errors, wait_for_pid_to_exit, + generate_internal_config_map) + + +@pytest.fixture +def ray_checkpointable_actor_cls(request): + checkpoint_dir = "/tmp/ray_temp_checkpoint_dir/" + if not os.path.isdir(checkpoint_dir): + os.mkdir(checkpoint_dir) + + class CheckpointableActor(ray.actor.Checkpointable): + def __init__(self): + self.value = 0 + self.resumed_from_checkpoint = False + self.checkpoint_dir = checkpoint_dir + + def node_id(self): + return ray.worker.global_worker.node.unique_id + + def increase(self): + self.value += 1 + return self.value + + def get(self): + return self.value + + def was_resumed_from_checkpoint(self): + return self.resumed_from_checkpoint + + def get_pid(self): + return os.getpid() + + def should_checkpoint(self, checkpoint_context): + # Checkpoint the actor when value is increased to 3. + should_checkpoint = self.value == 3 + return should_checkpoint + + def save_checkpoint(self, actor_id, checkpoint_id): + actor_id, checkpoint_id = actor_id.hex(), checkpoint_id.hex() + # Save checkpoint into a file. + with open(self.checkpoint_dir + actor_id, "a+") as f: + print(checkpoint_id, self.value, file=f) + + def load_checkpoint(self, actor_id, available_checkpoints): + actor_id = actor_id.hex() + filename = self.checkpoint_dir + actor_id + # Load checkpoint from the file. 
+ if not os.path.isfile(filename): + return None + + available_checkpoint_ids = [ + c.checkpoint_id for c in available_checkpoints + ] + with open(filename, "r") as f: + for line in f: + checkpoint_id, value = line.strip().split(" ") + checkpoint_id = ray.ActorCheckpointID( + ray.utils.hex_to_binary(checkpoint_id)) + if checkpoint_id in available_checkpoint_ids: + self.value = int(value) + self.resumed_from_checkpoint = True + return checkpoint_id + return None + + def checkpoint_expired(self, actor_id, checkpoint_id): + pass + + return CheckpointableActor + + +@pytest.mark.parametrize( + "ray_start_object_store_memory", [150 * 1024 * 1024], indirect=True) +def test_actor_eviction(ray_start_object_store_memory): + object_store_memory = ray_start_object_store_memory + + @ray.remote + class Actor(object): + def __init__(self): + pass + + def create_object(self, size): + return np.random.rand(size) + + a = Actor.remote() + # Submit enough methods on the actor so that they exceed the size of the + # object store. + objects = [] + num_objects = 20 + for _ in range(num_objects): + obj = a.create_object.remote(object_store_memory // num_objects) + objects.append(obj) + # Get each object once to make sure each object gets created. + ray.get(obj) + + # Get each object again. At this point, the earlier objects should have + # been evicted. + num_evicted, num_success = 0, 0 + for obj in objects: + try: + val = ray.get(obj) + assert isinstance(val, np.ndarray), val + num_success += 1 + except ray.exceptions.UnreconstructableError: + num_evicted += 1 + # Some objects should have been evicted, and some should still be in the + # object store. 
+ assert num_evicted > 0 + assert num_success > 0 + + +def test_actor_reconstruction(ray_start_regular): + """Test actor reconstruction when actor process is killed.""" + + @ray.remote(max_reconstructions=1) + class ReconstructableActor(object): + """An actor that will be reconstructed at most once.""" + + def __init__(self): + self.value = 0 + + def increase(self, delay=0): + time.sleep(delay) + self.value += 1 + return self.value + + def get_pid(self): + return os.getpid() + + actor = ReconstructableActor.remote() + pid = ray.get(actor.get_pid.remote()) + # Call increase 3 times + for _ in range(3): + ray.get(actor.increase.remote()) + # Call increase again with some delay. + result = actor.increase.remote(delay=0.5) + # Sleep some time to wait for the above task to start execution. + time.sleep(0.2) + # Kill actor process, while the above task is still being executed. + os.kill(pid, signal.SIGKILL) + # Check that the above task didn't fail and the actor is reconstructed. + assert ray.get(result) == 4 + # Check that we can still call the actor. + assert ray.get(actor.increase.remote()) == 5 + # kill actor process one more time. + pid = ray.get(actor.get_pid.remote()) + os.kill(pid, signal.SIGKILL) + # The actor has exceeded max reconstructions, and this task should fail. + with pytest.raises(ray.exceptions.RayActorError): + ray.get(actor.increase.remote()) + + # Create another actor. + actor = ReconstructableActor.remote() + # Intentionally exit the actor + actor.__ray_terminate__.remote() + # Check that the actor won't be reconstructed.
def test_actor_reconstruction_without_task(ray_start_regular):
    """Test a dead actor can be reconstructed without sending task to it."""

    @ray.remote(max_reconstructions=1)
    class ReconstructableActor(object):
        def __init__(self, obj_ids):
            # Each construction stores exactly one marker object: the first
            # ID in the list that does not exist yet. The second ID therefore
            # only appears once the constructor has run a second time.
            worker = ray.worker.global_worker
            for obj_id in obj_ids:
                if worker.core_worker.object_exists(obj_id):
                    continue
                worker.put_object(1, obj_id)
                break

        def get_pid(self):
            return os.getpid()

    marker_ids = [ray.ObjectID.from_random() for _ in range(2)]
    actor = ReconstructableActor.remote(marker_ids)

    # Kill the actor process.
    actor_pid = ray.get(actor.get_pid.remote())
    os.kill(actor_pid, signal.SIGKILL)

    def second_marker_exists():
        core_worker = ray.worker.global_worker.core_worker
        return core_worker.object_exists(marker_ids[1])

    # No further task is submitted; the second marker showing up is the
    # evidence that the actor was constructed again.
    assert wait_for_condition(second_marker_exists, timeout_ms=5000)
# NOTE(hchen): we set initial_reconstruction_timeout_milliseconds to 1s for
# this test. If this value is too small, spurious task reconstruction may
# happen and cause the test to fail. If the value is too large, this test
# could be very slow. We can remove this once we support dynamic timeout.
@pytest.mark.parametrize(
    "ray_start_cluster_head", [
        generate_internal_config_map(
            initial_reconstruction_timeout_milliseconds=1000)
    ],
    indirect=True)
def test_multiple_actor_reconstruction(ray_start_cluster_head):
    """Check actor method results stay in order while nodes are removed."""
    cluster = ray_start_cluster_head
    # Increase the numbers below to make the test more stressful. In total
    # num_actors_at_a_time * (num_nodes + 1) actors are created: one initial
    # batch plus one batch per worker node.
    num_nodes = 5
    num_actors_at_a_time = 3
    num_function_calls_at_a_time = 10

    node_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 200,
        "num_heartbeats_timeout": 10,
    })
    worker_nodes = [
        cluster.add_node(num_cpus=3, _internal_config=node_config)
        for _ in range(num_nodes)
    ]

    @ray.remote(max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION)
    class SlowCounter(object):
        def __init__(self):
            self.x = 0

        def inc(self, duration):
            time.sleep(duration)
            self.x += 1
            return self.x

    def spawn_actors():
        return [SlowCounter.remote() for _ in range(num_actors_at_a_time)]

    # Create some initial actors and give them time to start up.
    actors = spawn_actors()
    time.sleep(1)

    # Maps each actor handle to the object IDs returned by its methods, in
    # submission order.
    result_ids = collections.defaultdict(list)

    def submit_calls():
        # Queue a batch of calls on every actor created so far; the sleep
        # duration grows with the actor's position in the list.
        for index, handle in enumerate(actors):
            duration = index**2 * 0.000001
            for _ in range(num_function_calls_at_a_time):
                result_ids[handle].append(handle.inc.remote(duration))

    # For every worker node: create some actors, run some methods, remove
    # the node, and run some more methods.
    for node in worker_nodes:
        actors.extend(spawn_actors())
        submit_calls()
        cluster.remove_node(node)
        submit_calls()

    # Every actor must have produced the values 1, 2, ..., n in order.
    for result_id_list in result_ids.values():
        expected_values = list(range(1, len(result_id_list) + 1))
        assert ray.get(result_id_list) == expected_values
def test_checkpointing_on_node_failure(ray_start_cluster_2_nodes,
                                       ray_checkpointable_actor_cls):
    """Test actor checkpointing on a remote node."""
    cluster = ray_start_cluster_2_nodes
    worker_nodes = list(cluster.worker_nodes)
    target_node = worker_nodes[0]
    actor_cls = ray.remote(max_reconstructions=1)(ray_checkpointable_actor_cls)

    # Keep creating actors until one is placed on the remote node.
    actor = actor_cls.remote()
    while ray.get(actor.node_id.remote()) != target_node.unique_id:
        actor = actor_cls.remote()

    # Call increase several times.
    num_increases = 6
    for _ in range(num_increases):
        ray.get(actor.increase.remote())
    expected = num_increases

    # The actor has not been restarted yet, so it cannot have resumed from
    # a checkpoint.
    resumed = ray.get(actor.was_resumed_from_checkpoint.remote())
    assert resumed is False

    # Remove the node that hosts the actor.
    cluster.remove_node(target_node)

    # The actor must come back with the same value and report that it was
    # resumed from a checkpoint.
    assert ray.get(actor.get.remote()) == expected
    resumed = ray.get(actor.was_resumed_from_checkpoint.remote())
    assert resumed is True
def test_checkpointing_load_exception(ray_start_regular,
                                      ray_checkpointable_actor_cls):
    """Test actor can still be recovered if checkpoints fail to load."""

    @ray.remote(max_reconstructions=2)
    class RemoteCheckpointableActor(ray_checkpointable_actor_cls):
        def load_checkpoint(self, actor_id, checkpoints):
            raise Exception("Intentional error loading checkpoint.")

    actor = RemoteCheckpointableActor.remote()

    def increase(times):
        # Run `increase` synchronously `times` times and report how much
        # the counter is expected to have grown.
        for _ in range(times):
            ray.get(actor.increase.remote())
        return times

    def was_resumed():
        return ray.get(actor.was_resumed_from_checkpoint.remote())

    # Call increase 3 times; this is meant to trigger a checkpoint save
    # (the checkpoint policy lives in the fixture class).
    expected = increase(3)
    # Nothing has been restored so far.
    assert was_resumed() is False

    # Kill the actor process. Loading the checkpoint raises, so the actor
    # must still reach the right value without reporting a resume.
    kill_actor(actor)
    assert ray.get(actor.get.remote()) == expected
    assert was_resumed() is False

    # Submit some more tasks after the checkpoint, kill the actor again and
    # make sure reconstruction still works without a checkpoint resume.
    expected += increase(3)
    kill_actor(actor)
    assert ray.get(actor.get.remote()) == expected
    assert was_resumed() is False

    # Check that the checkpoint error was pushed to the driver.
    wait_for_errors(ray_constants.CHECKPOINT_PUSH_ERROR, 1)
def test_bad_checkpointable_actor_class():
    """Test error raised if an actor class doesn't implement all abstract
    methods in the Checkpointable interface."""

    with pytest.raises(TypeError):

        # Only one method of the interface is provided.
        class BadCheckpointableActor(ray.actor.Checkpointable):
            def should_checkpoint(self, checkpoint_context):
                return True

        # Same as decorating the class with @ray.remote.
        BadCheckpointableActor = ray.remote(BadCheckpointableActor)
def test_decorated_method(ray_start_regular):
    """Check that `__ray_invocation_decorator__` wraps method submission."""

    def method_invocation_decorator(f):
        def new_f_invocation(args, kwargs):
            # Split one argument into two. Return the kwargs without passing
            # them into the actor.
            first = args[0]
            submitted = f([first, first], {})
            return submitted, kwargs

        return new_f_invocation

    def method_execution_decorator(f):
        def new_f_execution(self, b, c):
            # Turn two arguments into one.
            combined = b + c
            return f(self, combined)

        # Attach the invocation-side wrapper to the execution-side wrapper.
        setattr(new_f_execution, "__ray_invocation_decorator__",
                method_invocation_decorator)
        return new_f_execution

    @ray.remote
    class Actor(object):
        @method_execution_decorator
        def decorated_method(self, x):
            return x + 1

    handle = Actor.remote()

    # The invocation wrapper returns a pair: the object ID of the submitted
    # task and the untouched keyword arguments.
    result = handle.decorated_method.remote(3, kwarg=3)
    object_id, extra = result
    assert isinstance(object_id, ray.ObjectID)
    assert extra == {"kwarg": 3}
    # The argument is duplicated on submission and summed on execution:
    # 2 * 3 + 1.
    assert ray.get(object_id) == 7
+ unready = ping_ids[:] + while unready: + _, unready = ray.wait(unready, timeout=0) + time.sleep(1) + + with pytest.raises(ray.exceptions.RayActorError): + ray.get(ping_ids) + + # Evict the result from the dead node. + ray.internal.free([remote_ping_id], local_only=True) + # Create an actor on the local node that will call ray.wait in a loop. + head_node_resource = "HEAD_NODE" + ray.experimental.set_resource(head_node_resource, 1) + + @ray.remote(num_cpus=0, resources={head_node_resource: 1}) + class ParentActor(object): + def __init__(self, ping_ids): + self.unready = ping_ids + + def wait(self): + _, self.unready = ray.wait(self.unready, timeout=0) + return len(self.unready) == 0 + + def ping(self): + return + + # Repeatedly call ray.wait through the local actor until the exception for + # the dead actor is received. + parent_actor = ParentActor.remote(ping_ids) + ray.get(parent_actor.ping.remote()) + failure_detected = False + while not failure_detected: + failure_detected = ray.get(parent_actor.wait.remote()) + + +if __name__ == "__main__": + import pytest + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_actor_resources.py b/python/ray/tests/test_actor_resources.py new file mode 100644 index 000000000..51cf34ec4 --- /dev/null +++ b/python/ray/tests/test_actor_resources.py @@ -0,0 +1,743 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import json +import os +import pytest +try: + import pytest_timeout +except ImportError: + pytest_timeout = None +import sys +import time + +import ray +import ray.test_utils +import ray.cluster_utils + + +def test_actor_deletion_with_gpus(shutdown_only): + ray.init( + num_cpus=1, num_gpus=1, object_store_memory=int(150 * 1024 * 1024)) + + # When an actor that uses a GPU exits, make sure that the GPU resources + # are released. 
def test_resource_assignment(shutdown_only):
    """Test to make sure that we assign resource to actors at instantiation.

    Every combination of resource options given to the ``ray.remote``
    decorator and to ``_remote`` at instantiation is tried. Options given
    at instantiation override the ones given to the decorator. For each
    combination, the resources seen in the actor's constructor and in an
    actor method are compared with each other and with the requested
    amounts.
    """
    # This test will create 16 actors. Declaring this many CPUs initially will
    # speed up the test because the workers will be started ahead of time.
    ray.init(
        num_cpus=16,
        num_gpus=1,
        resources={"Custom": 1},
        object_store_memory=int(150 * 1024 * 1024))

    class Actor(object):
        def __init__(self):
            self.resources = ray.get_resource_ids()

        def get_actor_resources(self):
            return self.resources

        def get_actor_method_resources(self):
            return ray.get_resource_ids()

    decorator_resource_args = [
        {},
        {"num_cpus": 0.1},
        {"num_gpus": 0.1},
        {"resources": {"Custom": 0.1}},
    ]
    instantiation_resource_args = [
        {},
        {"num_cpus": 0.2},
        {"num_gpus": 0.2},
        {"resources": {"Custom": 0.2}},
    ]
    for decorator_args in decorator_resource_args:
        for instantiation_args in instantiation_resource_args:
            if not decorator_args:
                actor_class = ray.remote(Actor)
            else:
                actor_class = ray.remote(**decorator_args)(Actor)
            actor = actor_class._remote(**instantiation_args)
            actor_resources = ray.get(actor.get_actor_resources.remote())
            actor_method_resources = ray.get(
                actor.get_actor_method_resources.remote())

            if not decorator_args and not instantiation_args:
                assert len(actor_resources) == 0, (
                    "Actor should not be assigned resources.")
                assert list(actor_method_resources) == ["CPU"], (
                    "Actor method should only have CPUs")
                assert actor_method_resources["CPU"][0][1] == 1, (
                    "Actor method should default to one cpu.")
                continue

            if ("num_cpus" not in decorator_args
                    and "num_cpus" not in instantiation_args):
                assert actor_resources["CPU"][0][1] == 1, (
                    "Actor should default to one cpu.")

            # Instantiation-time options take precedence over decorator
            # options.
            defined_resources = decorator_args.copy()
            defined_resources.update(instantiation_args)

            # Translate the keyword options into resource names.
            correct_resources = {}
            for option, value in defined_resources.items():
                if option == "num_cpus":
                    correct_resources["CPU"] = value
                elif option == "num_gpus":
                    correct_resources["GPU"] = value
                elif option == "resources":
                    correct_resources.update(value)

            for resource, amount in correct_resources.items():
                actor_id, actor_amount = actor_resources[resource][0][:2]
                method_id, method_amount = (
                    actor_method_resources[resource][0][:2])
                # The messages are single strings (adjacent literals, no
                # comma between the pieces) so that the placeholders are
                # actually filled in on failure.
                assert actor_id == method_id, (
                    "Should have assigned same {} for both actor "
                    "and actor method.".format(resource))
                assert actor_amount == method_amount, (
                    "Should have assigned same amount of {} for both "
                    "actor and actor method.".format(resource))
                assert actor_amount == amount, (
                    "Actor should have {expected} {resource} but has "
                    "{actual} {resource}".format(
                        expected=amount,
                        actual=actor_amount,
                        resource=resource))
def test_actor_multiple_gpus(ray_start_cluster):
    """Check GPU assignment for actors requesting two GPUs, then one GPU."""
    cluster = ray_start_cluster
    num_nodes = 3
    num_gpus_per_raylet = 5
    for _ in range(num_nodes):
        cluster.add_node(
            num_cpus=10 * num_gpus_per_raylet, num_gpus=num_gpus_per_raylet)
    ray.init(address=cluster.address)

    @ray.remote(num_gpus=2)
    class Actor1(object):
        def __init__(self):
            self.gpu_ids = ray.get_gpu_ids()

        def get_location_and_ids(self):
            assert ray.get_gpu_ids() == self.gpu_ids
            return (ray.worker.global_worker.node.unique_id,
                    tuple(self.gpu_ids))

    # Create two 2-GPU actors per node and find out where each one landed.
    actors1 = [Actor1.remote() for _ in range(num_nodes * 2)]
    placements = ray.get(
        [handle.get_location_and_ids.remote() for handle in actors1])
    node_names = set(node_id for node_id, _ in placements)
    assert len(node_names) == num_nodes

    # Keep track of which GPU IDs are being used on each node. No GPU may
    # be handed out twice, so each node must show four distinct IDs.
    gpus_in_use = dict((node_name, []) for node_name in node_names)
    for node_id, gpu_ids in placements:
        gpus_in_use[node_id].extend(gpu_ids)
    for node_name in node_names:
        assert len(set(gpus_in_use[node_name])) == 4

    # One more 2-GPU actor must not get scheduled.
    a = Actor1.remote()
    ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
    assert ready_ids == []

    # We should be able to create more actors that use only a single GPU.
    @ray.remote(num_gpus=1)
    class Actor2(object):
        def __init__(self):
            self.gpu_ids = ray.get_gpu_ids()

        def get_location_and_ids(self):
            return (ray.worker.global_worker.node.unique_id,
                    tuple(self.gpu_ids))

    # Create one 1-GPU actor per node; together with the earlier actors
    # every GPU on every node must now be used exactly once.
    actors2 = [Actor2.remote() for _ in range(num_nodes)]
    placements = ray.get(
        [handle.get_location_and_ids.remote() for handle in actors2])
    names = set(node_id for node_id, _ in placements)
    assert node_names == names
    for node_id, gpu_ids in placements:
        gpus_in_use[node_id].extend(gpu_ids)
    for node_name in node_names:
        used = gpus_in_use[node_name]
        assert len(used) == 5
        assert set(used) == set(range(5))

    # Creating a new actor should fail because all of the GPUs are being
    # used.
    a = Actor2.remote()
    ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
    assert ready_ids == []
def test_actor_multiple_gpus_from_multiple_tasks(ray_start_cluster):
    """Check GPU assignment for actors that are created from remote tasks.

    One remote task per node creates ``num_gpus_per_raylet`` single-GPU
    actors. The test then checks that the actors cover every node, that each
    node shows ``num_gpus_per_raylet`` distinct GPU IDs, and that one more
    GPU actor created from the driver is not ready shortly afterwards.
    """
    cluster = ray_start_cluster
    num_nodes = 5
    num_gpus_per_raylet = 5
    for i in range(num_nodes):
        cluster.add_node(
            num_cpus=10 * num_gpus_per_raylet,
            num_gpus=num_gpus_per_raylet,
            # NOTE(review): presumably a large heartbeat timeout keeps nodes
            # from being marked dead during the test -- confirm.
            _internal_config=json.dumps({
                "num_heartbeats_timeout": 1000
            }))
    ray.init(address=cluster.address)

    @ray.remote
    def create_actors(i, n):
        # Runs as a remote task: creates n single-GPU actors and returns a
        # list of (node unique_id, tuple of GPU IDs), one entry per actor.
        @ray.remote(num_gpus=1)
        class Actor(object):
            def __init__(self, i, j):
                # i and j are accepted but not used.
                self.gpu_ids = ray.get_gpu_ids()

            def get_location_and_ids(self):
                return ((ray.worker.global_worker.node.unique_id),
                        tuple(self.gpu_ids))

            def sleep(self):
                time.sleep(100)

        # Create n actors.
        actors = []
        for j in range(n):
            actors.append(Actor.remote(i, j))

        locations = ray.get(
            [actor.get_location_and_ids.remote() for actor in actors])

        # Put each actor to sleep for a long time to prevent them from getting
        # terminated.
        for actor in actors:
            actor.sleep.remote()

        return locations

    # One list of locations per create_actors task.
    all_locations = ray.get([
        create_actors.remote(i, num_gpus_per_raylet) for i in range(num_nodes)
    ])

    # Make sure that no two actors are assigned to the same GPU.
    node_names = {
        location
        for locations in all_locations for location, gpu_id in locations
    }
    assert len(node_names) == num_nodes

    # Keep track of which GPU IDs are being used for each location.
    gpus_in_use = {node_name: [] for node_name in node_names}
    for locations in all_locations:
        for location, gpu_ids in locations:
            gpus_in_use[location].extend(gpu_ids)
    for node_name in node_names:
        # Distinct GPU IDs per node must match the node's GPU count.
        assert len(set(gpus_in_use[node_name])) == num_gpus_per_raylet

    @ray.remote(num_gpus=1)
    class Actor(object):
        def __init__(self):
            self.gpu_ids = ray.get_gpu_ids()

        def get_location_and_ids(self):
            return (ray.worker.global_worker.node.unique_id,
                    tuple(self.gpu_ids))

    # All the GPUs should be used up now.
    a = Actor.remote()
    # The method call on the extra actor must not be ready within the short
    # timeout.
    ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=0.01)
    assert ready_ids == []
+ assert first_interval[0] < first_interval[1] + assert second_interval[0] < second_interval[1] + intervals_nonoverlapping = ( + first_interval[1] <= second_interval[0] + or second_interval[1] <= first_interval[0]) + assert intervals_nonoverlapping, ( + "Intervals {} and {} are overlapping.".format( + first_interval, second_interval)) + + @ray.remote(num_gpus=1) + def f1(): + t1 = time.monotonic() + time.sleep(0.1) + t2 = time.monotonic() + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 1 + assert gpu_ids[0] in range(num_gpus_per_raylet) + return (ray.worker.global_worker.node.unique_id, tuple(gpu_ids), + [t1, t2]) + + @ray.remote(num_gpus=2) + def f2(): + t1 = time.monotonic() + time.sleep(0.1) + t2 = time.monotonic() + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 2 + assert gpu_ids[0] in range(num_gpus_per_raylet) + assert gpu_ids[1] in range(num_gpus_per_raylet) + return (ray.worker.global_worker.node.unique_id, tuple(gpu_ids), + [t1, t2]) + + @ray.remote(num_gpus=1) + class Actor1(object): + def __init__(self): + self.gpu_ids = ray.get_gpu_ids() + assert len(self.gpu_ids) == 1 + assert self.gpu_ids[0] in range(num_gpus_per_raylet) + + def get_location_and_ids(self): + assert ray.get_gpu_ids() == self.gpu_ids + return (ray.worker.global_worker.node.unique_id, + tuple(self.gpu_ids)) + + def locations_to_intervals_for_many_tasks(): + # Launch a bunch of GPU tasks. + locations_ids_and_intervals = ray.get( + [f1.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)] + + [f2.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)] + + [f1.remote() for _ in range(5 * num_nodes * num_gpus_per_raylet)]) + + locations_to_intervals = collections.defaultdict(lambda: []) + for location, gpu_ids, interval in locations_ids_and_intervals: + for gpu_id in gpu_ids: + locations_to_intervals[(location, gpu_id)].append(interval) + return locations_to_intervals + + # Run a bunch of GPU tasks. 
+ locations_to_intervals = locations_to_intervals_for_many_tasks() + # For each GPU, verify that the set of tasks that used this specific + # GPU did not overlap in time. + for locations in locations_to_intervals: + check_intervals_non_overlapping(locations_to_intervals[locations]) + + # Create an actor that uses a GPU. + a = Actor1.remote() + actor_location = ray.get(a.get_location_and_ids.remote()) + actor_location = (actor_location[0], actor_location[1][0]) + # This check makes sure that actor_location is formatted the same way + # that the keys of locations_to_intervals are formatted. + assert actor_location in locations_to_intervals + + # Run a bunch of GPU tasks. + locations_to_intervals = locations_to_intervals_for_many_tasks() + # For each GPU, verify that the set of tasks that used this specific + # GPU did not overlap in time. + for locations in locations_to_intervals: + check_intervals_non_overlapping(locations_to_intervals[locations]) + # Make sure that the actor's GPU was not used. + assert actor_location not in locations_to_intervals + + # Create more actors to fill up all the GPUs. + more_actors = [ + Actor1.remote() for _ in range(num_nodes * num_gpus_per_raylet - 1) + ] + # Wait for the actors to finish being created. + ray.get([actor.get_location_and_ids.remote() for actor in more_actors]) + + # Now if we run some GPU tasks, they should not be scheduled. + results = [f1.remote() for _ in range(30)] + ready_ids, remaining_ids = ray.wait(results, timeout=1.0) + assert len(ready_ids) == 0 + + +def test_actors_and_tasks_with_gpus_version_two(shutdown_only): + # Create tasks and actors that both use GPUs and make sure that they + # are given different GPUs + num_gpus = 4 + + ray.init( + num_cpus=(num_gpus + 1), + num_gpus=num_gpus, + object_store_memory=int(150 * 1024 * 1024)) + + # The point of this actor is to record which GPU IDs have been seen. 
We + # can't just return them from the tasks, because the tasks don't return + # for a long time in order to make sure the GPU is not released + # prematurely. + @ray.remote + class RecordGPUs(object): + def __init__(self): + self.gpu_ids_seen = [] + self.num_calls = 0 + + def add_ids(self, gpu_ids): + self.gpu_ids_seen += gpu_ids + self.num_calls += 1 + + def get_gpu_ids_and_calls(self): + return self.gpu_ids_seen, self.num_calls + + @ray.remote(num_gpus=1) + def f(record_gpu_actor): + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 1 + record_gpu_actor.add_ids.remote(gpu_ids) + # Sleep for a long time so that the GPU never gets released. This task + # will be killed by ray.shutdown() before it actually finishes. + time.sleep(1000) + + @ray.remote(num_gpus=1) + class Actor(object): + def __init__(self, record_gpu_actor): + self.gpu_ids = ray.get_gpu_ids() + assert len(self.gpu_ids) == 1 + record_gpu_actor.add_ids.remote(self.gpu_ids) + + def check_gpu_ids(self): + assert ray.get_gpu_ids() == self.gpu_ids + + record_gpu_actor = RecordGPUs.remote() + + actors = [] + actor_results = [] + for _ in range(num_gpus // 2): + f.remote(record_gpu_actor) + a = Actor.remote(record_gpu_actor) + actor_results.append(a.check_gpu_ids.remote()) + # Prevent the actor handle from going out of scope so that its GPU + # resources don't get released. + actors.append(a) + + # Make sure that the actor method calls succeeded. 
+ ray.get(actor_results) + + start_time = time.time() + while time.time() - start_time < 30: + seen_gpu_ids, num_calls = ray.get( + record_gpu_actor.get_gpu_ids_and_calls.remote()) + if num_calls == num_gpus: + break + assert set(seen_gpu_ids) == set(range(num_gpus)) + + +def test_blocking_actor_task(shutdown_only): + ray.init( + num_cpus=1, num_gpus=1, object_store_memory=int(150 * 1024 * 1024)) + + @ray.remote(num_gpus=1) + def f(): + return 1 + + @ray.remote + class Foo(object): + def __init__(self): + pass + + def blocking_method(self): + ray.get(f.remote()) + + # Make sure we can execute a blocking actor method even if there is + # only one CPU. + actor = Foo.remote() + ray.get(actor.blocking_method.remote()) + + @ray.remote(num_cpus=1) + class CPUFoo(object): + def __init__(self): + pass + + def blocking_method(self): + ray.get(f.remote()) + + # Make sure that lifetime CPU resources are not released when actors + # block. + actor = CPUFoo.remote() + x_id = actor.blocking_method.remote() + ready_ids, remaining_ids = ray.wait([x_id], timeout=1.0) + assert ready_ids == [] + assert remaining_ids == [x_id] + + @ray.remote(num_gpus=1) + class GPUFoo(object): + def __init__(self): + pass + + def blocking_method(self): + ray.get(f.remote()) + + # Make sure that GPU resources are not released when actors block. + actor = GPUFoo.remote() + x_id = actor.blocking_method.remote() + ready_ids, remaining_ids = ray.wait([x_id], timeout=1.0) + assert ready_ids == [] + assert remaining_ids == [x_id] + + +@pytest.mark.skipif( + sys.version_info < (3, 0), + reason="This test is currently failing on Python 2.7.") +def test_lifetime_and_transient_resources(ray_start_regular): + # This actor acquires resources only when running methods. + @ray.remote + class Actor1(object): + def method(self): + pass + + # This actor acquires resources for its lifetime. 
+ @ray.remote(num_cpus=1) + class Actor2(object): + def method(self): + pass + + actor1s = [Actor1.remote() for _ in range(10)] + ray.get([a.method.remote() for a in actor1s]) + + actor2s = [Actor2.remote() for _ in range(2)] + results = [a.method.remote() for a in actor2s] + ready_ids, remaining_ids = ray.wait( + results, num_returns=len(results), timeout=5.0) + assert len(ready_ids) == 1 + + +def test_custom_label_placement(ray_start_cluster): + cluster = ray_start_cluster + cluster.add_node(num_cpus=2, resources={"CustomResource1": 2}) + cluster.add_node(num_cpus=2, resources={"CustomResource2": 2}) + ray.init(address=cluster.address) + + @ray.remote(resources={"CustomResource1": 1}) + class ResourceActor1(object): + def get_location(self): + return ray.worker.global_worker.node.unique_id + + @ray.remote(resources={"CustomResource2": 1}) + class ResourceActor2(object): + def get_location(self): + return ray.worker.global_worker.node.unique_id + + node_id = ray.worker.global_worker.node.unique_id + + # Create some actors. + actors1 = [ResourceActor1.remote() for _ in range(2)] + actors2 = [ResourceActor2.remote() for _ in range(2)] + locations1 = ray.get([a.get_location.remote() for a in actors1]) + locations2 = ray.get([a.get_location.remote() for a in actors2]) + for location in locations1: + assert location == node_id + for location in locations2: + assert location != node_id + + +def test_creating_more_actors_than_resources(shutdown_only): + ray.init(num_cpus=10, num_gpus=2, resources={"CustomResource1": 1}) + + @ray.remote(num_gpus=1) + class ResourceActor1(object): + def method(self): + return ray.get_gpu_ids()[0] + + @ray.remote(resources={"CustomResource1": 1}) + class ResourceActor2(object): + def method(self): + pass + + # Make sure the first two actors get created and the third one does + # not. 
+ actor1 = ResourceActor1.remote() + result1 = actor1.method.remote() + ray.wait([result1]) + actor2 = ResourceActor1.remote() + result2 = actor2.method.remote() + ray.wait([result2]) + actor3 = ResourceActor1.remote() + result3 = actor3.method.remote() + ready_ids, _ = ray.wait([result3], timeout=0.2) + assert len(ready_ids) == 0 + + # By deleting actor1, we free up resources to create actor3. + del actor1 + + results = ray.get([result1, result2, result3]) + assert results[0] == results[2] + assert set(results) == {0, 1} + + # Make sure that when one actor goes out of scope a new actor is + # created because some resources have been freed up. + results = [] + for _ in range(3): + actor = ResourceActor2.remote() + object_id = actor.method.remote() + results.append(object_id) + # Wait for the task to execute. We do this because otherwise it may + # be possible for the __ray_terminate__ task to execute before the + # method. + ray.wait([object_id]) + + ray.get(results) + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_advanced.py b/python/ray/tests/test_advanced.py new file mode 100644 index 000000000..2356590ee --- /dev/null +++ b/python/ray/tests/test_advanced.py @@ -0,0 +1,2171 @@ +# coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from concurrent.futures import ThreadPoolExecutor +import glob +import json +import logging +import os +import random +import setproctitle +import shutil +import six +import sys +import socket +import subprocess +import tempfile +import threading +import time + +import numpy as np +import pickle +import pytest + +import ray +from ray import signature +import ray.ray_constants as ray_constants +import ray.cluster_utils +import ray.test_utils + +from ray.test_utils import RayTestTimeoutException + +logger = logging.getLogger(__name__) + + +def 
test_wait_iterables(ray_start_regular): + @ray.remote + def f(delay): + time.sleep(delay) + return 1 + + objectids = (f.remote(1.0), f.remote(0.5), f.remote(0.5), f.remote(0.5)) + ready_ids, remaining_ids = ray.experimental.wait(objectids) + assert len(ready_ids) == 1 + assert len(remaining_ids) == 3 + + objectids = np.array( + [f.remote(1.0), + f.remote(0.5), + f.remote(0.5), + f.remote(0.5)]) + ready_ids, remaining_ids = ray.experimental.wait(objectids) + assert len(ready_ids) == 1 + assert len(remaining_ids) == 3 + + +def test_multiple_waits_and_gets(shutdown_only): + # It is important to use three workers here, so that the three tasks + # launched in this experiment can run at the same time. + ray.init(num_cpus=3) + + @ray.remote + def f(delay): + time.sleep(delay) + return 1 + + @ray.remote + def g(l): + # The argument l should be a list containing one object ID. + ray.wait([l[0]]) + + @ray.remote + def h(l): + # The argument l should be a list containing one object ID. + ray.get(l[0]) + + # Make sure that multiple wait requests involving the same object ID + # all return. + x = f.remote(1) + ray.get([g.remote([x]), g.remote([x])]) + + # Make sure that multiple get requests involving the same object ID all + # return. + x = f.remote(1) + ray.get([h.remote([x]), h.remote([x])]) + + +def test_caching_functions_to_run(shutdown_only): + # Test that we export functions to run on all workers before the driver + # is connected. 
+ def f(worker_info): + sys.path.append(1) + + ray.worker.global_worker.run_function_on_all_workers(f) + + def f(worker_info): + sys.path.append(2) + + ray.worker.global_worker.run_function_on_all_workers(f) + + def g(worker_info): + sys.path.append(3) + + ray.worker.global_worker.run_function_on_all_workers(g) + + def f(worker_info): + sys.path.append(4) + + ray.worker.global_worker.run_function_on_all_workers(f) + + ray.init(num_cpus=1) + + @ray.remote + def get_state(): + time.sleep(1) + return sys.path[-4], sys.path[-3], sys.path[-2], sys.path[-1] + + res1 = get_state.remote() + res2 = get_state.remote() + assert ray.get(res1) == (1, 2, 3, 4) + assert ray.get(res2) == (1, 2, 3, 4) + + # Clean up the path on the workers. + def f(worker_info): + sys.path.pop() + sys.path.pop() + sys.path.pop() + sys.path.pop() + + ray.worker.global_worker.run_function_on_all_workers(f) + + +def test_running_function_on_all_workers(ray_start_regular): + def f(worker_info): + sys.path.append("fake_directory") + + ray.worker.global_worker.run_function_on_all_workers(f) + + @ray.remote + def get_path1(): + return sys.path + + assert "fake_directory" == ray.get(get_path1.remote())[-1] + + def f(worker_info): + sys.path.pop(-1) + + ray.worker.global_worker.run_function_on_all_workers(f) + + # Create a second remote function to guarantee that when we call + # get_path2.remote(), the second function to run will have been run on + # the worker. + @ray.remote + def get_path2(): + return sys.path + + assert "fake_directory" not in ray.get(get_path2.remote()) + + +def test_profiling_api(ray_start_2_cpus): + @ray.remote + def f(): + with ray.profile("custom_event", extra_data={"name": "custom name"}): + pass + + ray.put(1) + object_id = f.remote() + ray.wait([object_id]) + ray.get(object_id) + + # Wait until all of the profiling information appears in the profile + # table. 
+ timeout_seconds = 20 + start_time = time.time() + while True: + profile_data = ray.timeline() + event_types = {event["cat"] for event in profile_data} + expected_types = [ + "task", + "task:deserialize_arguments", + "task:execute", + "task:store_outputs", + "wait_for_function", + "ray.get", + "ray.put", + "ray.wait", + "submit_task", + "fetch_and_run_function", + "register_remote_function", + "custom_event", # This is the custom one from ray.profile. + ] + + if all(expected_type in event_types + for expected_type in expected_types): + break + + if time.time() - start_time > timeout_seconds: + raise RayTestTimeoutException( + "Timed out while waiting for information in " + "profile table. Missing events: {}.".format( + set(expected_types) - set(event_types))) + + # The profiling information only flushes once every second. + time.sleep(1.1) + + +def test_wait_cluster(ray_start_cluster): + cluster = ray_start_cluster + cluster.add_node(num_cpus=1, resources={"RemoteResource": 1}) + cluster.add_node(num_cpus=1, resources={"RemoteResource": 1}) + ray.init(address=cluster.address) + + @ray.remote(resources={"RemoteResource": 1}) + def f(): + return + + # Make sure we have enough workers on the remote nodes to execute some + # tasks. + tasks = [f.remote() for _ in range(10)] + start = time.time() + ray.get(tasks) + end = time.time() + + # Submit some more tasks that can only be executed on the remote nodes. + tasks = [f.remote() for _ in range(10)] + # Sleep for a bit to let the tasks finish. + time.sleep((end - start) * 2) + _, unready = ray.wait(tasks, num_returns=len(tasks), timeout=0) + # All remote tasks should have finished. 
+ assert len(unready) == 0 + + +def test_object_transfer_dump(ray_start_cluster): + cluster = ray_start_cluster + + num_nodes = 3 + for i in range(num_nodes): + cluster.add_node(resources={str(i): 1}, object_store_memory=10**9) + ray.init(address=cluster.address) + + @ray.remote + def f(x): + return + + # These objects will live on different nodes. + object_ids = [ + f._remote(args=[1], resources={str(i): 1}) for i in range(num_nodes) + ] + + # Broadcast each object from each machine to each other machine. + for object_id in object_ids: + ray.get([ + f._remote(args=[object_id], resources={str(i): 1}) + for i in range(num_nodes) + ]) + + # The profiling information only flushes once every second. + time.sleep(1.1) + + transfer_dump = ray.object_transfer_timeline() + # Make sure the transfer dump can be serialized with JSON. + json.loads(json.dumps(transfer_dump)) + assert len(transfer_dump) >= num_nodes**2 + assert len({ + event["pid"] + for event in transfer_dump if event["name"] == "transfer_receive" + }) == num_nodes + assert len({ + event["pid"] + for event in transfer_dump if event["name"] == "transfer_send" + }) == num_nodes + + +def test_identical_function_names(ray_start_regular): + # Define a bunch of remote functions and make sure that we don't + # accidentally call an older version. 
+ + num_calls = 200 + + @ray.remote + def f(): + return 1 + + results1 = [f.remote() for _ in range(num_calls)] + + @ray.remote + def f(): + return 2 + + results2 = [f.remote() for _ in range(num_calls)] + + @ray.remote + def f(): + return 3 + + results3 = [f.remote() for _ in range(num_calls)] + + @ray.remote + def f(): + return 4 + + results4 = [f.remote() for _ in range(num_calls)] + + @ray.remote + def f(): + return 5 + + results5 = [f.remote() for _ in range(num_calls)] + + assert ray.get(results1) == num_calls * [1] + assert ray.get(results2) == num_calls * [2] + assert ray.get(results3) == num_calls * [3] + assert ray.get(results4) == num_calls * [4] + assert ray.get(results5) == num_calls * [5] + + @ray.remote + def g(): + return 1 + + @ray.remote # noqa: F811 + def g(): + return 2 + + @ray.remote # noqa: F811 + def g(): + return 3 + + @ray.remote # noqa: F811 + def g(): + return 4 + + @ray.remote # noqa: F811 + def g(): + return 5 + + result_values = ray.get([g.remote() for _ in range(num_calls)]) + assert result_values == num_calls * [5] + + +def test_illegal_api_calls(ray_start_regular): + + # Verify that we cannot call put on an ObjectID. + x = ray.put(1) + with pytest.raises(Exception): + ray.put(x) + # Verify that we cannot call get on a regular value. + with pytest.raises(Exception): + ray.get(3) + + +# TODO(hchen): This test currently doesn't work in Python 2. This is likely +# because plasma client isn't thread-safe. This needs to be fixed from the +# Arrow side. See #4107 for relevant discussions. +@pytest.mark.skipif(six.PY2, reason="Doesn't work in Python 2.") +def test_multithreading(ray_start_2_cpus): + # This test requires at least 2 CPUs to finish since the worker does not + # release resources when joining the threads. 
+ + def run_test_in_multi_threads(test_case, num_threads=10, num_repeats=25): + """A helper function that runs test cases in multiple threads.""" + + def wrapper(): + for _ in range(num_repeats): + test_case() + time.sleep(random.randint(0, 10) / 1000.0) + return "ok" + + executor = ThreadPoolExecutor(max_workers=num_threads) + futures = [executor.submit(wrapper) for _ in range(num_threads)] + for future in futures: + assert future.result() == "ok" + + @ray.remote + def echo(value, delay_ms=0): + if delay_ms > 0: + time.sleep(delay_ms / 1000.0) + return value + + def test_api_in_multi_threads(): + """Test using Ray api in multiple threads.""" + + @ray.remote + class Echo(object): + def echo(self, value): + return value + + # Test calling remote functions in multiple threads. + def test_remote_call(): + value = random.randint(0, 1000000) + result = ray.get(echo.remote(value)) + assert value == result + + run_test_in_multi_threads(test_remote_call) + + # Test multiple threads calling one actor. + actor = Echo.remote() + + def test_call_actor(): + value = random.randint(0, 1000000) + result = ray.get(actor.echo.remote(value)) + assert value == result + + run_test_in_multi_threads(test_call_actor) + + # Test put and get. + def test_put_and_get(): + value = random.randint(0, 1000000) + result = ray.get(ray.put(value)) + assert value == result + + run_test_in_multi_threads(test_put_and_get) + + # Test multiple threads waiting for objects. + num_wait_objects = 10 + objects = [ + echo.remote(i, delay_ms=10) for i in range(num_wait_objects) + ] + + def test_wait(): + ready, _ = ray.wait( + objects, + num_returns=len(objects), + timeout=1000.0, + ) + assert len(ready) == num_wait_objects + assert ray.get(ready) == list(range(num_wait_objects)) + + run_test_in_multi_threads(test_wait, num_repeats=1) + + # Run tests in a driver. + test_api_in_multi_threads() + + # Run tests in a worker. 
+ @ray.remote + def run_tests_in_worker(): + test_api_in_multi_threads() + return "ok" + + assert ray.get(run_tests_in_worker.remote()) == "ok" + + # Test actor that runs background threads. + @ray.remote + class MultithreadedActor(object): + def __init__(self): + self.lock = threading.Lock() + self.thread_results = [] + + def background_thread(self, wait_objects): + try: + # Test wait + ready, _ = ray.wait( + wait_objects, + num_returns=len(wait_objects), + timeout=1000.0, + ) + assert len(ready) == len(wait_objects) + for _ in range(20): + num = 10 + # Test remote call + results = [echo.remote(i) for i in range(num)] + assert ray.get(results) == list(range(num)) + # Test put and get + objects = [ray.put(i) for i in range(num)] + assert ray.get(objects) == list(range(num)) + time.sleep(random.randint(0, 10) / 1000.0) + except Exception as e: + with self.lock: + self.thread_results.append(e) + else: + with self.lock: + self.thread_results.append("ok") + + def spawn(self): + wait_objects = [echo.remote(i, delay_ms=10) for i in range(10)] + self.threads = [ + threading.Thread( + target=self.background_thread, args=(wait_objects, )) + for _ in range(20) + ] + [thread.start() for thread in self.threads] + + def join(self): + [thread.join() for thread in self.threads] + assert self.thread_results == ["ok"] * len(self.threads) + return "ok" + + actor = MultithreadedActor.remote() + actor.spawn.remote() + ray.get(actor.join.remote()) == "ok" + + +def test_free_objects_multi_node(ray_start_cluster): + # This test will do following: + # 1. Create 3 raylets that each hold an actor. + # 2. Each actor creates an object which is the deletion target. + # 3. Wait 0.1 second for the objects to be deleted. + # 4. Check that the deletion targets have been deleted. 
+ # Caution: if remote functions are used instead of actor methods, + # one raylet may create more than one worker to execute the + # tasks, so the flushing operations may be executed in different + # workers and the plasma client holding the deletion target + # may not be flushed. + cluster = ray_start_cluster + config = json.dumps({"object_manager_repeated_push_delay_ms": 1000}) + for i in range(3): + cluster.add_node( + num_cpus=1, + resources={"Custom{}".format(i): 1}, + _internal_config=config) + ray.init(address=cluster.address) + + class RawActor(object): + def get(self): + return ray.worker.global_worker.node.unique_id + + ActorOnNode0 = ray.remote(resources={"Custom0": 1})(RawActor) + ActorOnNode1 = ray.remote(resources={"Custom1": 1})(RawActor) + ActorOnNode2 = ray.remote(resources={"Custom2": 1})(RawActor) + + def create(actors): + a = actors[0].get.remote() + b = actors[1].get.remote() + c = actors[2].get.remote() + (l1, l2) = ray.wait([a, b, c], num_returns=3) + assert len(l1) == 3 + assert len(l2) == 0 + return (a, b, c) + + def run_one_test(actors, local_only, delete_creating_tasks): + (a, b, c) = create(actors) + # The three objects should be generated on different object stores. + assert ray.get(a) != ray.get(b) + assert ray.get(a) != ray.get(c) + assert ray.get(c) != ray.get(b) + ray.internal.free( + [a, b, c], + local_only=local_only, + delete_creating_tasks=delete_creating_tasks) + # Wait for the objects to be deleted. + time.sleep(0.1) + return (a, b, c) + + actors = [ + ActorOnNode0.remote(), + ActorOnNode1.remote(), + ActorOnNode2.remote() + ] + # Case 1: run this local_only=False. All 3 objects will be deleted. + (a, b, c) = run_one_test(actors, False, False) + (l1, l2) = ray.wait([a, b, c], timeout=0.01, num_returns=1) + # All the objects are deleted. + assert len(l1) == 0 + assert len(l2) == 3 + # Case 2: run this local_only=True. Only 1 object will be deleted. 
+ (a, b, c) = run_one_test(actors, True, False) + (l1, l2) = ray.wait([a, b, c], timeout=0.01, num_returns=3) + # One object is deleted and 2 objects are not. + assert len(l1) == 2 + assert len(l2) == 1 + # The deleted object will have the same store with the driver. + local_return = ray.worker.global_worker.node.unique_id + for object_id in l1: + assert ray.get(object_id) != local_return + + # Case3: These cases test the deleting creating tasks for the object. + (a, b, c) = run_one_test(actors, False, False) + task_table = ray.tasks() + for obj in [a, b, c]: + assert ray._raylet.compute_task_id(obj).hex() in task_table + + (a, b, c) = run_one_test(actors, False, True) + task_table = ray.tasks() + for obj in [a, b, c]: + assert ray._raylet.compute_task_id(obj).hex() not in task_table + + +def test_local_mode(shutdown_only): + @ray.remote + def local_mode_f(): + return np.array([0, 0]) + + @ray.remote + def local_mode_g(x): + x[0] = 1 + return x + + ray.init(local_mode=True) + + @ray.remote + def f(): + return np.ones([3, 4, 5]) + + xref = f.remote() + # Remote functions should return ObjectIDs. + assert isinstance(xref, ray.ObjectID) + assert np.alltrue(ray.get(xref) == np.ones([3, 4, 5])) + y = np.random.normal(size=[11, 12]) + # Check that ray.get(ray.put) is the identity. + assert np.alltrue(y == ray.get(ray.put(y))) + + # Make sure objects are immutable, this example is why we need to copy + # arguments before passing them into remote functions in python mode + aref = local_mode_f.remote() + assert np.alltrue(ray.get(aref) == np.array([0, 0])) + bref = local_mode_g.remote(ray.get(aref)) + # Make sure local_mode_g does not mutate aref. 
+ assert np.alltrue(ray.get(aref) == np.array([0, 0])) + assert np.alltrue(ray.get(bref) == np.array([1, 0])) + + # wait should return the first num_returns values passed in as the + # first list and the remaining values as the second list + num_returns = 5 + object_ids = [ray.put(i) for i in range(20)] + ready, remaining = ray.wait( + object_ids, num_returns=num_returns, timeout=None) + assert ready == object_ids[:num_returns] + assert remaining == object_ids[num_returns:] + + # Check that ray.put() and ray.internal.free() work in local mode. + + v1 = np.ones(10) + v2 = np.zeros(10) + + k1 = ray.put(v1) + assert np.alltrue(v1 == ray.get(k1)) + k2 = ray.put(v2) + assert np.alltrue(v2 == ray.get(k2)) + + ray.internal.free([k1, k2]) + with pytest.raises(Exception): + ray.get(k1) + with pytest.raises(Exception): + ray.get(k2) + + # Should fail silently. + ray.internal.free([k1, k2]) + + # Test actors in LOCAL_MODE. + + @ray.remote + class LocalModeTestClass(object): + def __init__(self, array): + self.array = array + + def set_array(self, array): + self.array = array + + def get_array(self): + return self.array + + def modify_and_set_array(self, array): + array[0] = -1 + self.array = array + + @ray.method(num_return_vals=3) + def returns_multiple(self): + return 1, 2, 3 + + test_actor = LocalModeTestClass.remote(np.arange(10)) + obj = test_actor.get_array.remote() + assert isinstance(obj, ray.ObjectID) + assert np.alltrue(ray.get(obj) == np.arange(10)) + + test_array = np.arange(10) + # Remote actor functions should not mutate arguments + test_actor.modify_and_set_array.remote(test_array) + assert np.alltrue(test_array == np.arange(10)) + # Remote actor functions should keep state + test_array[0] = -1 + assert np.alltrue(test_array == ray.get(test_actor.get_array.remote())) + + # Check that actor handles work in local mode. 
+ + @ray.remote + def use_actor_handle(handle): + array = np.ones(10) + handle.set_array.remote(array) + assert np.alltrue(array == ray.get(handle.get_array.remote())) + + ray.get(use_actor_handle.remote(test_actor)) + + # Check that exceptions are deferred until ray.get(). + + exception_str = "test_advanced remote task exception" + + @ray.remote + def throws(): + raise Exception(exception_str) + + obj = throws.remote() + with pytest.raises(Exception, match=exception_str): + ray.get(obj) + + # Check that multiple return values are handled properly. + + @ray.remote(num_return_vals=3) + def returns_multiple(): + return 1, 2, 3 + + obj1, obj2, obj3 = returns_multiple.remote() + assert ray.get(obj1) == 1 + assert ray.get(obj2) == 2 + assert ray.get(obj3) == 3 + assert ray.get([obj1, obj2, obj3]) == [1, 2, 3] + + obj1, obj2, obj3 = test_actor.returns_multiple.remote() + assert ray.get(obj1) == 1 + assert ray.get(obj2) == 2 + assert ray.get(obj3) == 3 + assert ray.get([obj1, obj2, obj3]) == [1, 2, 3] + + @ray.remote(num_return_vals=2) + def returns_multiple_throws(): + raise Exception(exception_str) + + obj1, obj2 = returns_multiple_throws.remote() + with pytest.raises(Exception, match=exception_str): + ray.get(obj) + ray.get(obj1) + with pytest.raises(Exception, match=exception_str): + ray.get(obj2) + + # Check that Actors are not overwritten by remote calls from different + # classes. + @ray.remote + class RemoteActor1(object): + def __init__(self): + pass + + def function1(self): + return 0 + + @ray.remote + class RemoteActor2(object): + def __init__(self): + pass + + def function2(self): + return 1 + + actor1 = RemoteActor1.remote() + _ = RemoteActor2.remote() + assert ray.get(actor1.function1.remote()) == 0 + + # Test passing ObjectIDs. 
+ @ray.remote + def direct_dep(input): + return input + + @ray.remote + def indirect_dep(input): + return ray.get(direct_dep.remote(input[0])) + + assert ray.get(indirect_dep.remote(["hello"])) == "hello" + + +def test_resource_constraints(shutdown_only): + num_workers = 20 + ray.init(num_cpus=10, num_gpus=2) + + @ray.remote(num_cpus=0) + def get_worker_id(): + time.sleep(0.1) + return os.getpid() + + # Attempt to wait for all of the workers to start up. + while True: + if len( + set( + ray.get([ + get_worker_id.remote() for _ in range(num_workers) + ]))) == num_workers: + break + + time_buffer = 2 + + # At most 10 copies of this can run at once. + @ray.remote(num_cpus=1) + def f(n): + time.sleep(n) + + start_time = time.time() + ray.get([f.remote(0.5) for _ in range(10)]) + duration = time.time() - start_time + assert duration < 0.5 + time_buffer + assert duration > 0.5 + + start_time = time.time() + ray.get([f.remote(0.5) for _ in range(11)]) + duration = time.time() - start_time + assert duration < 1 + time_buffer + assert duration > 1 + + @ray.remote(num_cpus=3) + def f(n): + time.sleep(n) + + start_time = time.time() + ray.get([f.remote(0.5) for _ in range(3)]) + duration = time.time() - start_time + assert duration < 0.5 + time_buffer + assert duration > 0.5 + + start_time = time.time() + ray.get([f.remote(0.5) for _ in range(4)]) + duration = time.time() - start_time + assert duration < 1 + time_buffer + assert duration > 1 + + @ray.remote(num_gpus=1) + def f(n): + time.sleep(n) + + start_time = time.time() + ray.get([f.remote(0.5) for _ in range(2)]) + duration = time.time() - start_time + assert duration < 0.5 + time_buffer + assert duration > 0.5 + + start_time = time.time() + ray.get([f.remote(0.5) for _ in range(3)]) + duration = time.time() - start_time + assert duration < 1 + time_buffer + assert duration > 1 + + start_time = time.time() + ray.get([f.remote(0.5) for _ in range(4)]) + duration = time.time() - start_time + assert duration < 1 + 
time_buffer + assert duration > 1 + + +def test_multi_resource_constraints(shutdown_only): + num_workers = 20 + ray.init(num_cpus=10, num_gpus=10) + + @ray.remote(num_cpus=0) + def get_worker_id(): + time.sleep(0.1) + return os.getpid() + + # Attempt to wait for all of the workers to start up. + while True: + if len( + set( + ray.get([ + get_worker_id.remote() for _ in range(num_workers) + ]))) == num_workers: + break + + @ray.remote(num_cpus=1, num_gpus=9) + def f(n): + time.sleep(n) + + @ray.remote(num_cpus=9, num_gpus=1) + def g(n): + time.sleep(n) + + time_buffer = 2 + + start_time = time.time() + ray.get([f.remote(0.5), g.remote(0.5)]) + duration = time.time() - start_time + assert duration < 0.5 + time_buffer + assert duration > 0.5 + + start_time = time.time() + ray.get([f.remote(0.5), f.remote(0.5)]) + duration = time.time() - start_time + assert duration < 1 + time_buffer + assert duration > 1 + + start_time = time.time() + ray.get([g.remote(0.5), g.remote(0.5)]) + duration = time.time() - start_time + assert duration < 1 + time_buffer + assert duration > 1 + + start_time = time.time() + ray.get([f.remote(0.5), f.remote(0.5), g.remote(0.5), g.remote(0.5)]) + duration = time.time() - start_time + assert duration < 1 + time_buffer + assert duration > 1 + + +def test_gpu_ids(shutdown_only): + num_gpus = 10 + ray.init(num_cpus=10, num_gpus=num_gpus) + + def get_gpu_ids(num_gpus_per_worker): + time.sleep(0.1) + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == num_gpus_per_worker + assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join( + [str(i) for i in gpu_ids])) + for gpu_id in gpu_ids: + assert gpu_id in range(num_gpus) + return gpu_ids + + f0 = ray.remote(num_gpus=0)(lambda: get_gpu_ids(0)) + f1 = ray.remote(num_gpus=1)(lambda: get_gpu_ids(1)) + f2 = ray.remote(num_gpus=2)(lambda: get_gpu_ids(2)) + f4 = ray.remote(num_gpus=4)(lambda: get_gpu_ids(4)) + f5 = ray.remote(num_gpus=5)(lambda: get_gpu_ids(5)) + + # Wait for all workers to start up. 
+ @ray.remote + def f(): + time.sleep(0.1) + return os.getpid() + + start_time = time.time() + while True: + if len(set(ray.get([f.remote() for _ in range(10)]))) == 10: + break + if time.time() > start_time + 10: + raise RayTestTimeoutException( + "Timed out while waiting for workers to start " + "up.") + + list_of_ids = ray.get([f0.remote() for _ in range(10)]) + assert list_of_ids == 10 * [[]] + + list_of_ids = ray.get([f1.remote() for _ in range(10)]) + set_of_ids = {tuple(gpu_ids) for gpu_ids in list_of_ids} + assert set_of_ids == {(i, ) for i in range(10)} + + list_of_ids = ray.get([f2.remote(), f4.remote(), f4.remote()]) + all_ids = [gpu_id for gpu_ids in list_of_ids for gpu_id in gpu_ids] + assert set(all_ids) == set(range(10)) + + # There are only 10 GPUs, and each task uses 5 GPUs, so there should only + # be 2 tasks scheduled at a given time. + t1 = time.time() + ray.get([f5.remote() for _ in range(20)]) + assert time.time() - t1 >= 10 * 0.1 + + # Test that actors have CUDA_VISIBLE_DEVICES set properly. + + @ray.remote + class Actor0(object): + def __init__(self): + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 0 + assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join( + [str(i) for i in gpu_ids])) + # Set self.x to make sure that we got here. + self.x = 1 + + def test(self): + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 0 + assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join( + [str(i) for i in gpu_ids])) + return self.x + + @ray.remote(num_gpus=1) + class Actor1(object): + def __init__(self): + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 1 + assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join( + [str(i) for i in gpu_ids])) + # Set self.x to make sure that we got here. 
+ self.x = 1 + + def test(self): + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 1 + assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join( + [str(i) for i in gpu_ids])) + return self.x + + a0 = Actor0.remote() + ray.get(a0.test.remote()) + + a1 = Actor1.remote() + ray.get(a1.test.remote()) + + +def test_zero_cpus(shutdown_only): + ray.init(num_cpus=0) + + # We should be able to execute a task that requires 0 CPU resources. + @ray.remote(num_cpus=0) + def f(): + return 1 + + ray.get(f.remote()) + + # We should be able to create an actor that requires 0 CPU resources. + @ray.remote(num_cpus=0) + class Actor(object): + def method(self): + pass + + a = Actor.remote() + x = a.method.remote() + ray.get(x) + + +def test_zero_cpus_actor(ray_start_cluster): + cluster = ray_start_cluster + cluster.add_node(num_cpus=0) + cluster.add_node(num_cpus=2) + ray.init(address=cluster.address) + + node_id = ray.worker.global_worker.node.unique_id + + @ray.remote + class Foo(object): + def method(self): + return ray.worker.global_worker.node.unique_id + + # Make sure tasks and actors run on the remote raylet. + a = Foo.remote() + assert ray.get(a.method.remote()) != node_id + + +def test_fractional_resources(shutdown_only): + ray.init(num_cpus=6, num_gpus=3, resources={"Custom": 1}) + + @ray.remote(num_gpus=0.5) + class Foo1(object): + def method(self): + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 1 + return gpu_ids[0] + + foos = [Foo1.remote() for _ in range(6)] + gpu_ids = ray.get([f.method.remote() for f in foos]) + for i in range(3): + assert gpu_ids.count(i) == 2 + del foos + + @ray.remote + class Foo2(object): + def method(self): + pass + + # Create an actor that requires 0.7 of the custom resource. + f1 = Foo2._remote([], {}, resources={"Custom": 0.7}) + ray.get(f1.method.remote()) + # Make sure that we cannot create an actor that requires 0.7 of the + # custom resource. TODO(rkn): Re-enable this once ray.wait is + # implemented. 
+ f2 = Foo2._remote([], {}, resources={"Custom": 0.7}) + ready, _ = ray.wait([f2.method.remote()], timeout=0.5) + assert len(ready) == 0 + # Make sure we can start an actor that requries only 0.3 of the custom + # resource. + f3 = Foo2._remote([], {}, resources={"Custom": 0.3}) + ray.get(f3.method.remote()) + + del f1, f3 + + # Make sure that we get exceptions if we submit tasks that require a + # fractional number of resources greater than 1. + + @ray.remote(num_cpus=1.5) + def test(): + pass + + with pytest.raises(ValueError): + test.remote() + + with pytest.raises(ValueError): + Foo2._remote([], {}, resources={"Custom": 1.5}) + + +def test_multiple_raylets(ray_start_cluster): + # This test will define a bunch of tasks that can only be assigned to + # specific raylets, and we will check that they are assigned + # to the correct raylets. + cluster = ray_start_cluster + cluster.add_node(num_cpus=11, num_gpus=0) + cluster.add_node(num_cpus=5, num_gpus=5) + cluster.add_node(num_cpus=10, num_gpus=1) + ray.init(address=cluster.address) + cluster.wait_for_nodes() + + # Define a bunch of remote functions that all return the socket name of + # the plasma store. Since there is a one-to-one correspondence between + # plasma stores and raylets (at least right now), this can be + # used to identify which raylet the task was assigned to. + + # This must be run on the zeroth raylet. + @ray.remote(num_cpus=11) + def run_on_0(): + return ray.worker.global_worker.node.plasma_store_socket_name + + # This must be run on the first raylet. + @ray.remote(num_gpus=2) + def run_on_1(): + return ray.worker.global_worker.node.plasma_store_socket_name + + # This must be run on the second raylet. + @ray.remote(num_cpus=6, num_gpus=1) + def run_on_2(): + return ray.worker.global_worker.node.plasma_store_socket_name + + # This can be run anywhere. 
+ @ray.remote(num_cpus=0, num_gpus=0) + def run_on_0_1_2(): + return ray.worker.global_worker.node.plasma_store_socket_name + + # This must be run on the first or second raylet. + @ray.remote(num_gpus=1) + def run_on_1_2(): + return ray.worker.global_worker.node.plasma_store_socket_name + + # This must be run on the zeroth or second raylet. + @ray.remote(num_cpus=8) + def run_on_0_2(): + return ray.worker.global_worker.node.plasma_store_socket_name + + def run_lots_of_tasks(): + names = [] + results = [] + for i in range(100): + index = np.random.randint(6) + if index == 0: + names.append("run_on_0") + results.append(run_on_0.remote()) + elif index == 1: + names.append("run_on_1") + results.append(run_on_1.remote()) + elif index == 2: + names.append("run_on_2") + results.append(run_on_2.remote()) + elif index == 3: + names.append("run_on_0_1_2") + results.append(run_on_0_1_2.remote()) + elif index == 4: + names.append("run_on_1_2") + results.append(run_on_1_2.remote()) + elif index == 5: + names.append("run_on_0_2") + results.append(run_on_0_2.remote()) + return names, results + + client_table = ray.nodes() + store_names = [] + store_names += [ + client["ObjectStoreSocketName"] for client in client_table + if client["Resources"].get("GPU", 0) == 0 + ] + store_names += [ + client["ObjectStoreSocketName"] for client in client_table + if client["Resources"].get("GPU", 0) == 5 + ] + store_names += [ + client["ObjectStoreSocketName"] for client in client_table + if client["Resources"].get("GPU", 0) == 1 + ] + assert len(store_names) == 3 + + def validate_names_and_results(names, results): + for name, result in zip(names, ray.get(results)): + if name == "run_on_0": + assert result in [store_names[0]] + elif name == "run_on_1": + assert result in [store_names[1]] + elif name == "run_on_2": + assert result in [store_names[2]] + elif name == "run_on_0_1_2": + assert (result in [ + store_names[0], store_names[1], store_names[2] + ]) + elif name == "run_on_1_2": + assert 
result in [store_names[1], store_names[2]] + elif name == "run_on_0_2": + assert result in [store_names[0], store_names[2]] + else: + raise Exception("This should be unreachable.") + assert set(ray.get(results)) == set(store_names) + + names, results = run_lots_of_tasks() + validate_names_and_results(names, results) + + # Make sure the same thing works when this is nested inside of a task. + + @ray.remote + def run_nested1(): + names, results = run_lots_of_tasks() + return names, results + + @ray.remote + def run_nested2(): + names, results = ray.get(run_nested1.remote()) + return names, results + + names, results = ray.get(run_nested2.remote()) + validate_names_and_results(names, results) + + +def test_custom_resources(ray_start_cluster): + cluster = ray_start_cluster + cluster.add_node(num_cpus=3, resources={"CustomResource": 0}) + cluster.add_node(num_cpus=3, resources={"CustomResource": 1}) + ray.init(address=cluster.address) + + @ray.remote + def f(): + time.sleep(0.001) + return ray.worker.global_worker.node.unique_id + + @ray.remote(resources={"CustomResource": 1}) + def g(): + time.sleep(0.001) + return ray.worker.global_worker.node.unique_id + + @ray.remote(resources={"CustomResource": 1}) + def h(): + ray.get([f.remote() for _ in range(5)]) + return ray.worker.global_worker.node.unique_id + + # The f tasks should be scheduled on both raylets. + assert len(set(ray.get([f.remote() for _ in range(50)]))) == 2 + + node_id = ray.worker.global_worker.node.unique_id + + # The g tasks should be scheduled only on the second raylet. + raylet_ids = set(ray.get([g.remote() for _ in range(50)])) + assert len(raylet_ids) == 1 + assert list(raylet_ids)[0] != node_id + + # Make sure that resource bookkeeping works when a task that uses a + # custom resources gets blocked. 
+ ray.get([h.remote() for _ in range(5)]) + + +def test_node_id_resource(ray_start_cluster): + cluster = ray_start_cluster + cluster.add_node(num_cpus=3) + cluster.add_node(num_cpus=3) + ray.init(address=cluster.address) + + local_node = ray.state.current_node_id() + + # Note that these will have the same IP in the test cluster + assert len(ray.state.node_ids()) == 2 + assert local_node in ray.state.node_ids() + + @ray.remote(resources={local_node: 1}) + def f(): + return ray.state.current_node_id() + + # Check the node id resource is automatically usable for scheduling. + assert ray.get(f.remote()) == ray.state.current_node_id() + + +def test_two_custom_resources(ray_start_cluster): + cluster = ray_start_cluster + cluster.add_node( + num_cpus=3, resources={ + "CustomResource1": 1, + "CustomResource2": 2 + }) + cluster.add_node( + num_cpus=3, resources={ + "CustomResource1": 3, + "CustomResource2": 4 + }) + ray.init(address=cluster.address) + + @ray.remote(resources={"CustomResource1": 1}) + def f(): + time.sleep(0.001) + return ray.worker.global_worker.node.unique_id + + @ray.remote(resources={"CustomResource2": 1}) + def g(): + time.sleep(0.001) + return ray.worker.global_worker.node.unique_id + + @ray.remote(resources={"CustomResource1": 1, "CustomResource2": 3}) + def h(): + time.sleep(0.001) + return ray.worker.global_worker.node.unique_id + + @ray.remote(resources={"CustomResource1": 4}) + def j(): + time.sleep(0.001) + return ray.worker.global_worker.node.unique_id + + @ray.remote(resources={"CustomResource3": 1}) + def k(): + time.sleep(0.001) + return ray.worker.global_worker.node.unique_id + + # The f and g tasks should be scheduled on both raylets. + assert len(set(ray.get([f.remote() for _ in range(50)]))) == 2 + assert len(set(ray.get([g.remote() for _ in range(50)]))) == 2 + + node_id = ray.worker.global_worker.node.unique_id + + # The h tasks should be scheduled only on the second raylet. 
+ raylet_ids = set(ray.get([h.remote() for _ in range(50)])) + assert len(raylet_ids) == 1 + assert list(raylet_ids)[0] != node_id + + # Make sure that tasks with unsatisfied custom resource requirements do + # not get scheduled. + ready_ids, remaining_ids = ray.wait([j.remote(), k.remote()], timeout=0.5) + assert ready_ids == [] + + +def test_many_custom_resources(shutdown_only): + num_custom_resources = 10000 + total_resources = { + str(i): np.random.randint(1, 7) + for i in range(num_custom_resources) + } + ray.init(num_cpus=5, resources=total_resources) + + def f(): + return 1 + + remote_functions = [] + for _ in range(20): + num_resources = np.random.randint(0, num_custom_resources + 1) + permuted_resources = np.random.permutation( + num_custom_resources)[:num_resources] + random_resources = { + str(i): total_resources[str(i)] + for i in permuted_resources + } + remote_function = ray.remote(resources=random_resources)(f) + remote_functions.append(remote_function) + + remote_functions.append(ray.remote(f)) + remote_functions.append(ray.remote(resources=total_resources)(f)) + + results = [] + for remote_function in remote_functions: + results.append(remote_function.remote()) + results.append(remote_function.remote()) + results.append(remote_function.remote()) + + ray.get(results) + + +# TODO: 5 retry attempts may be too little for Travis and we may need to +# increase it if this test begins to be flaky on Travis. 
+def test_zero_capacity_deletion_semantics(shutdown_only): + ray.init(num_cpus=2, num_gpus=1, resources={"test_resource": 1}) + + def test(): + resources = ray.available_resources() + MAX_RETRY_ATTEMPTS = 5 + retry_count = 0 + + del resources["memory"] + del resources["object_store_memory"] + for key in list(resources.keys()): + if key.startswith("node:"): + del resources[key] + + while resources and retry_count < MAX_RETRY_ATTEMPTS: + time.sleep(0.1) + resources = ray.available_resources() + retry_count += 1 + + if retry_count >= MAX_RETRY_ATTEMPTS: + raise RuntimeError( + "Resources were available even after five retries.", resources) + + return resources + + function = ray.remote( + num_cpus=2, num_gpus=1, resources={"test_resource": 1})(test) + cluster_resources = ray.get(function.remote()) + + # All cluster resources should be utilized and + # cluster_resources must be empty + assert cluster_resources == {} + + +@pytest.fixture +def save_gpu_ids_shutdown_only(): + # Record the curent value of this environment variable so that we can + # reset it after the test. + original_gpu_ids = os.environ.get("CUDA_VISIBLE_DEVICES", None) + + yield None + + # The code after the yield will run as teardown code. + ray.shutdown() + # Reset the environment variable. 
+ if original_gpu_ids is not None: + os.environ["CUDA_VISIBLE_DEVICES"] = original_gpu_ids + else: + del os.environ["CUDA_VISIBLE_DEVICES"] + + +def test_specific_gpus(save_gpu_ids_shutdown_only): + allowed_gpu_ids = [4, 5, 6] + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( + [str(i) for i in allowed_gpu_ids]) + ray.init(num_gpus=3) + + @ray.remote(num_gpus=1) + def f(): + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 1 + assert gpu_ids[0] in allowed_gpu_ids + + @ray.remote(num_gpus=2) + def g(): + gpu_ids = ray.get_gpu_ids() + assert len(gpu_ids) == 2 + assert gpu_ids[0] in allowed_gpu_ids + assert gpu_ids[1] in allowed_gpu_ids + + ray.get([f.remote() for _ in range(100)]) + ray.get([g.remote() for _ in range(100)]) + + +def test_blocking_tasks(ray_start_regular): + @ray.remote + def f(i, j): + return (i, j) + + @ray.remote + def g(i): + # Each instance of g submits and blocks on the result of another + # remote task. + object_ids = [f.remote(i, j) for j in range(2)] + return ray.get(object_ids) + + @ray.remote + def h(i): + # Each instance of g submits and blocks on the result of another + # remote task using ray.wait. + object_ids = [f.remote(i, j) for j in range(2)] + return ray.wait(object_ids, num_returns=len(object_ids)) + + ray.get([h.remote(i) for i in range(4)]) + + @ray.remote + def _sleep(i): + time.sleep(0.01) + return (i) + + @ray.remote + def sleep(): + # Each instance of sleep submits and blocks on the result of + # another remote task, which takes some time to execute. 
+ ray.get([_sleep.remote(i) for i in range(10)]) + + ray.get(sleep.remote()) + + +def test_max_call_tasks(ray_start_regular): + @ray.remote(max_calls=1) + def f(): + return os.getpid() + + pid = ray.get(f.remote()) + ray.test_utils.wait_for_pid_to_exit(pid) + + @ray.remote(max_calls=2) + def f(): + return os.getpid() + + pid1 = ray.get(f.remote()) + pid2 = ray.get(f.remote()) + assert pid1 == pid2 + ray.test_utils.wait_for_pid_to_exit(pid1) + + +def attempt_to_load_balance(remote_function, + args, + total_tasks, + num_nodes, + minimum_count, + num_attempts=100): + attempts = 0 + while attempts < num_attempts: + locations = ray.get( + [remote_function.remote(*args) for _ in range(total_tasks)]) + names = set(locations) + counts = [locations.count(name) for name in names] + logger.info("Counts are {}.".format(counts)) + if (len(names) == num_nodes + and all(count >= minimum_count for count in counts)): + break + attempts += 1 + assert attempts < num_attempts + + +def test_load_balancing(ray_start_cluster): + # This test ensures that tasks are being assigned to all raylets + # in a roughly equal manner. + cluster = ray_start_cluster + num_nodes = 3 + num_cpus = 7 + for _ in range(num_nodes): + cluster.add_node(num_cpus=num_cpus) + ray.init(address=cluster.address) + + @ray.remote + def f(): + time.sleep(0.01) + return ray.worker.global_worker.node.unique_id + + attempt_to_load_balance(f, [], 100, num_nodes, 10) + attempt_to_load_balance(f, [], 1000, num_nodes, 100) + + +def test_load_balancing_with_dependencies(ray_start_cluster): + # This test ensures that tasks are being assigned to all raylets in a + # roughly equal manner even when the tasks have dependencies. + cluster = ray_start_cluster + num_nodes = 3 + for _ in range(num_nodes): + cluster.add_node(num_cpus=1) + ray.init(address=cluster.address) + + @ray.remote + def f(x): + time.sleep(0.010) + return ray.worker.global_worker.node.unique_id + + # This object will be local to one of the raylets. 
Make sure + # this doesn't prevent tasks from being scheduled on other raylets. + x = ray.put(np.zeros(1000000)) + + attempt_to_load_balance(f, [x], 100, num_nodes, 25) + + +def wait_for_num_tasks(num_tasks, timeout=10): + start_time = time.time() + while time.time() - start_time < timeout: + if len(ray.tasks()) >= num_tasks: + return + time.sleep(0.1) + raise RayTestTimeoutException("Timed out while waiting for global state.") + + +def wait_for_num_objects(num_objects, timeout=10): + start_time = time.time() + while time.time() - start_time < timeout: + if len(ray.objects()) >= num_objects: + return + time.sleep(0.1) + raise RayTestTimeoutException("Timed out while waiting for global state.") + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="New GCS API doesn't have a Python API yet.") +def test_global_state_api(shutdown_only): + + error_message = ("The ray global state API cannot be used " + "before ray.init has been called.") + + with pytest.raises(Exception, match=error_message): + ray.objects() + + with pytest.raises(Exception, match=error_message): + ray.tasks() + + with pytest.raises(Exception, match=error_message): + ray.nodes() + + with pytest.raises(Exception, match=error_message): + ray.jobs() + + ray.init(num_cpus=5, num_gpus=3, resources={"CustomResource": 1}) + + assert ray.cluster_resources()["CPU"] == 5 + assert ray.cluster_resources()["GPU"] == 3 + assert ray.cluster_resources()["CustomResource"] == 1 + + assert ray.objects() == {} + + job_id = ray.utils.compute_job_id_from_driver( + ray.WorkerID(ray.worker.global_worker.worker_id)) + driver_task_id = ray.worker.global_worker.current_task_id.hex() + + # One task is put in the task table which corresponds to this driver. 
+ wait_for_num_tasks(1) + task_table = ray.tasks() + assert len(task_table) == 1 + assert driver_task_id == list(task_table.keys())[0] + task_spec = task_table[driver_task_id]["TaskSpec"] + nil_unique_id_hex = ray.UniqueID.nil().hex() + nil_actor_id_hex = ray.ActorID.nil().hex() + + assert task_spec["TaskID"] == driver_task_id + assert task_spec["ActorID"] == nil_actor_id_hex + assert task_spec["Args"] == [] + assert task_spec["JobID"] == job_id.hex() + assert task_spec["FunctionID"] == nil_unique_id_hex + assert task_spec["ReturnObjectIDs"] == [] + + client_table = ray.nodes() + node_ip_address = ray.worker.global_worker.node_ip_address + + assert len(client_table) == 1 + assert client_table[0]["NodeManagerAddress"] == node_ip_address + + @ray.remote + def f(*xs): + return 1 + + x_id = ray.put(1) + result_id = f.remote(1, "hi", x_id) + + # Wait for one additional task to complete. + wait_for_num_tasks(1 + 1) + task_table = ray.tasks() + assert len(task_table) == 1 + 1 + task_id_set = set(task_table.keys()) + task_id_set.remove(driver_task_id) + task_id = list(task_id_set)[0] + + task_spec = task_table[task_id]["TaskSpec"] + assert task_spec["ActorID"] == nil_actor_id_hex + assert task_spec["Args"] == [ + signature.DUMMY_TYPE, 1, signature.DUMMY_TYPE, "hi", + signature.DUMMY_TYPE, x_id + ] + assert task_spec["JobID"] == job_id.hex() + assert task_spec["ReturnObjectIDs"] == [result_id] + + assert task_table[task_id] == ray.tasks(task_id) + + # Wait for two objects, one for the x_id and one for result_id. 
+ wait_for_num_objects(2) + + def wait_for_object_table(): + timeout = 10 + start_time = time.time() + while time.time() - start_time < timeout: + object_table = ray.objects() + tables_ready = (object_table[x_id]["ManagerIDs"] is not None and + object_table[result_id]["ManagerIDs"] is not None) + if tables_ready: + return + time.sleep(0.1) + raise RayTestTimeoutException( + "Timed out while waiting for object table to " + "update.") + + object_table = ray.objects() + assert len(object_table) == 2 + + assert object_table[x_id] == ray.objects(x_id) + object_table_entry = ray.objects(result_id) + assert object_table[result_id] == object_table_entry + + job_table = ray.jobs() + + assert len(job_table) == 1 + assert job_table[0]["JobID"] == job_id.hex() + assert job_table[0]["NodeManagerAddress"] == node_ip_address + + +# TODO(rkn): Pytest actually has tools for capturing stdout and stderr, so we +# should use those, but they seem to conflict with Ray's use of faulthandler. +class CaptureOutputAndError(object): + """Capture stdout and stderr of some span. + + This can be used as follows. + + captured = {} + with CaptureOutputAndError(captured): + # Do stuff. + # Access captured["out"] and captured["err"]. 
+ """ + + def __init__(self, captured_output_and_error): + if sys.version_info >= (3, 0): + import io + self.output_buffer = io.StringIO() + self.error_buffer = io.StringIO() + else: + import cStringIO + self.output_buffer = cStringIO.StringIO() + self.error_buffer = cStringIO.StringIO() + self.captured_output_and_error = captured_output_and_error + + def __enter__(self): + sys.stdout.flush() + sys.stderr.flush() + self.old_stdout = sys.stdout + self.old_stderr = sys.stderr + sys.stdout = self.output_buffer + sys.stderr = self.error_buffer + + def __exit__(self, exc_type, exc_value, traceback): + sys.stdout.flush() + sys.stderr.flush() + sys.stdout = self.old_stdout + sys.stderr = self.old_stderr + self.captured_output_and_error["out"] = self.output_buffer.getvalue() + self.captured_output_and_error["err"] = self.error_buffer.getvalue() + + +def test_logging_to_driver(shutdown_only): + ray.init(num_cpus=1, log_to_driver=True) + + @ray.remote + def f(): + # It's important to make sure that these print statements occur even + # without calling sys.stdout.flush() and sys.stderr.flush(). + for i in range(100): + print(i) + print(100 + i, file=sys.stderr) + + captured = {} + with CaptureOutputAndError(captured): + ray.get(f.remote()) + time.sleep(1) + + output_lines = captured["out"] + for i in range(200): + assert str(i) in output_lines + + # TODO(rkn): Check that no additional logs appear beyond what we expect + # and that there are no duplicate logs. Once we address the issue + # described in https://github.com/ray-project/ray/pull/5462, we should + # also check that nothing is logged to stderr. 
+ + +def test_not_logging_to_driver(shutdown_only): + ray.init(num_cpus=1, log_to_driver=False) + + @ray.remote + def f(): + for i in range(100): + print(i) + print(100 + i, file=sys.stderr) + sys.stdout.flush() + sys.stderr.flush() + + captured = {} + with CaptureOutputAndError(captured): + ray.get(f.remote()) + time.sleep(1) + + output_lines = captured["out"] + assert len(output_lines) == 0 + + # TODO(rkn): Check that no additional logs appear beyond what we expect + # and that there are no duplicate logs. Once we address the issue + # described in https://github.com/ray-project/ray/pull/5462, we should + # also check that nothing is logged to stderr. + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="New GCS API doesn't have a Python API yet.") +def test_workers(shutdown_only): + num_workers = 3 + ray.init(num_cpus=num_workers) + + @ray.remote + def f(): + return id(ray.worker.global_worker), os.getpid() + + # Wait until all of the workers have started. + worker_ids = set() + while len(worker_ids) != num_workers: + worker_ids = set(ray.get([f.remote() for _ in range(10)])) + + +def test_specific_job_id(): + dummy_driver_id = ray.JobID.from_int(1) + ray.init(num_cpus=1, job_id=dummy_driver_id) + + # in driver + assert dummy_driver_id == ray._get_runtime_context().current_driver_id + + # in worker + @ray.remote + def f(): + return ray._get_runtime_context().current_driver_id + + assert dummy_driver_id == ray.get(f.remote()) + + ray.shutdown() + + +def test_object_id_properties(): + id_bytes = b"00112233445566778899" + object_id = ray.ObjectID(id_bytes) + assert object_id.binary() == id_bytes + object_id = ray.ObjectID.nil() + assert object_id.is_nil() + with pytest.raises(ValueError, match=r".*needs to have length 20.*"): + ray.ObjectID(id_bytes + b"1234") + with pytest.raises(ValueError, match=r".*needs to have length 20.*"): + ray.ObjectID(b"0123456789") + object_id = ray.ObjectID.from_random() + assert not object_id.is_nil() + 
assert object_id.binary() != id_bytes + id_dumps = pickle.dumps(object_id) + id_from_dumps = pickle.loads(id_dumps) + assert id_from_dumps == object_id + + +@pytest.fixture +def shutdown_only_with_initialization_check(): + yield None + # The code after the yield will run as teardown code. + ray.shutdown() + assert not ray.is_initialized() + + +def test_initialized(shutdown_only_with_initialization_check): + assert not ray.is_initialized() + ray.init(num_cpus=0) + assert ray.is_initialized() + + +def test_initialized_local_mode(shutdown_only_with_initialization_check): + assert not ray.is_initialized() + ray.init(num_cpus=0, local_mode=True) + assert ray.is_initialized() + + +def test_wait_reconstruction(shutdown_only): + ray.init(num_cpus=1, object_store_memory=int(10**8)) + + @ray.remote + def f(): + return np.zeros(6 * 10**7, dtype=np.uint8) + + x_id = f.remote() + ray.wait([x_id]) + ray.wait([f.remote()]) + assert not ray.worker.global_worker.core_worker.object_exists(x_id) + ready_ids, _ = ray.wait([x_id]) + assert len(ready_ids) == 1 + + +def test_ray_setproctitle(ray_start_2_cpus): + @ray.remote + class UniqueName(object): + def __init__(self): + assert setproctitle.getproctitle() == "ray_UniqueName:__init__()" + + def f(self): + assert setproctitle.getproctitle() == "ray_UniqueName:f()" + + @ray.remote + def unique_1(): + assert "unique_1" in setproctitle.getproctitle() + + actor = UniqueName.remote() + ray.get(actor.f.remote()) + ray.get(unique_1.remote()) + + +def test_duplicate_error_messages(shutdown_only): + ray.init(num_cpus=0) + + driver_id = ray.WorkerID.nil() + error_data = ray.gcs_utils.construct_error_message(driver_id, "test", + "message", 0) + + # Push the same message to the GCS twice (they are the same because we + # do not include a timestamp). 
+ + r = ray.worker.global_worker.redis_client + + r.execute_command("RAY.TABLE_APPEND", + ray.gcs_utils.TablePrefix.Value("ERROR_INFO"), + ray.gcs_utils.TablePubsub.Value("ERROR_INFO_PUBSUB"), + driver_id.binary(), error_data) + + # Before https://github.com/ray-project/ray/pull/3316 this would + # give an error + r.execute_command("RAY.TABLE_APPEND", + ray.gcs_utils.TablePrefix.Value("ERROR_INFO"), + ray.gcs_utils.TablePubsub.Value("ERROR_INFO_PUBSUB"), + driver_id.binary(), error_data) + + +@pytest.mark.skipif( + os.getenv("TRAVIS") is None, + reason="This test should only be run on Travis.") +def test_ray_stack(ray_start_2_cpus): + def unique_name_1(): + time.sleep(1000) + + @ray.remote + def unique_name_2(): + time.sleep(1000) + + @ray.remote + def unique_name_3(): + unique_name_1() + + unique_name_2.remote() + unique_name_3.remote() + + success = False + start_time = time.time() + while time.time() - start_time < 30: + # Attempt to parse the "ray stack" call. + output = ray.utils.decode(subprocess.check_output(["ray", "stack"])) + if ("unique_name_1" in output and "unique_name_2" in output + and "unique_name_3" in output): + success = True + break + + if not success: + raise Exception("Failed to find necessary information with " + "'ray stack'") + + +def test_pandas_parquet_serialization(): + # Only test this if pandas is installed + pytest.importorskip("pandas") + + import pandas as pd + import pyarrow as pa + import pyarrow.parquet as pq + + tempdir = tempfile.mkdtemp() + filename = os.path.join(tempdir, "parquet-test") + pd.DataFrame({"col1": [0, 1], "col2": [0, 1]}).to_parquet(filename) + with open(os.path.join(tempdir, "parquet-compression"), "wb") as f: + table = pa.Table.from_arrays([pa.array([1, 2, 3])], ["hello"]) + pq.write_table(table, f, compression="lz4") + # Clean up + shutil.rmtree(tempdir) + + +def test_socket_dir_not_existing(shutdown_only): + random_name = ray.ObjectID.from_random().hex() + temp_raylet_socket_dir = 
"/tmp/ray/tests/{}".format(random_name) + temp_raylet_socket_name = os.path.join(temp_raylet_socket_dir, + "raylet_socket") + ray.init(num_cpus=1, raylet_socket_name=temp_raylet_socket_name) + + +def test_raylet_is_robust_to_random_messages(ray_start_regular): + node_manager_address = None + node_manager_port = None + for client in ray.nodes(): + if "NodeManagerAddress" in client: + node_manager_address = client["NodeManagerAddress"] + node_manager_port = client["NodeManagerPort"] + assert node_manager_address + assert node_manager_port + # Try to bring down the node manager: + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.connect((node_manager_address, node_manager_port)) + s.send(1000 * b"asdf") + + @ray.remote + def f(): + return 1 + + assert ray.get(f.remote()) == 1 + + +def test_non_ascii_comment(ray_start_regular): + @ray.remote + def f(): + # 日本語 Japanese comment + return 1 + + assert ray.get(f.remote()) == 1 + + +def test_shutdown_disconnect_global_state(): + ray.init(num_cpus=0) + ray.shutdown() + + with pytest.raises(Exception) as e: + ray.objects() + assert str(e.value).endswith("ray.init has been called.") + + +@pytest.mark.parametrize( + "ray_start_object_store_memory", [150 * 1024 * 1024], indirect=True) +def test_put_pins_object(ray_start_object_store_memory): + x_id = ray.put("HI") + x_copy = ray.ObjectID(x_id.binary()) + assert ray.get(x_copy) == "HI" + + # x cannot be evicted since x_id pins it + for _ in range(10): + ray.put(np.zeros(10 * 1024 * 1024)) + assert ray.get(x_id) == "HI" + assert ray.get(x_copy) == "HI" + + # now it can be evicted since x_id pins it but x_copy does not + del x_id + for _ in range(10): + ray.put(np.zeros(10 * 1024 * 1024)) + with pytest.raises(ray.exceptions.UnreconstructableError): + ray.get(x_copy) + + # weakref put + y_id = ray.put("HI", weakref=True) + for _ in range(10): + ray.put(np.zeros(10 * 1024 * 1024)) + with pytest.raises(ray.exceptions.UnreconstructableError): + ray.get(y_id) + + @ray.remote + 
def check_no_buffer_ref(x): + assert x[0].get_buffer_ref() is None + + z_id = ray.put("HI") + assert z_id.get_buffer_ref() is not None + ray.get(check_no_buffer_ref.remote([z_id])) + + +@pytest.mark.parametrize( + "ray_start_object_store_memory", [150 * 1024 * 1024], indirect=True) +def test_redis_lru_with_set(ray_start_object_store_memory): + x = np.zeros(8 * 10**7, dtype=np.uint8) + x_id = ray.put(x, weakref=True) + + # Remove the object from the object table to simulate Redis LRU eviction. + removed = False + start_time = time.time() + while time.time() < start_time + 10: + if ray.state.state.redis_clients[0].delete(b"OBJECT" + + x_id.binary()) == 1: + removed = True + break + assert removed + + # Now evict the object from the object store. + ray.put(x) # This should not crash. + + +def test_decorated_function(ray_start_regular): + def function_invocation_decorator(f): + def new_f(args, kwargs): + # Reverse the arguments. + return f(args[::-1], {"d": 5}), kwargs + + return new_f + + def f(a, b, c, d=None): + return a, b, c, d + + f.__ray_invocation_decorator__ = function_invocation_decorator + f = ray.remote(f) + + result_id, kwargs = f.remote(1, 2, 3, d=4) + assert kwargs == {"d": 4} + assert ray.get(result_id) == (3, 2, 1, 5) + + +def test_get_postprocess(ray_start_regular): + def get_postprocessor(object_ids, values): + return [value for value in values if value > 0] + + ray.worker.global_worker._post_get_hooks.append(get_postprocessor) + + assert ray.get( + [ray.put(i) for i in [0, 1, 3, 5, -1, -3, 4]]) == [1, 3, 5, 4] + + +def test_export_after_shutdown(ray_start_regular): + # This test checks that we can use actor and remote function definitions + # across multiple Ray sessions. + + @ray.remote + def f(): + pass + + @ray.remote + class Actor(object): + def method(self): + pass + + ray.get(f.remote()) + a = Actor.remote() + ray.get(a.method.remote()) + + ray.shutdown() + + # Start Ray and use the remote function and actor again. 
+ ray.init(num_cpus=1) + ray.get(f.remote()) + a = Actor.remote() + ray.get(a.method.remote()) + + ray.shutdown() + + # Start Ray again and make sure that these definitions can be exported from + # workers. + ray.init(num_cpus=2) + + @ray.remote + def export_definitions_from_worker(remote_function, actor_class): + ray.get(remote_function.remote()) + actor_handle = actor_class.remote() + ray.get(actor_handle.method.remote()) + + ray.get(export_definitions_from_worker.remote(f, Actor)) + + +def test_invalid_unicode_in_worker_log(shutdown_only): + info = ray.init(num_cpus=1) + + logs_dir = os.path.join(info["session_dir"], "logs") + + # Wait till first worker log file is created. + while True: + log_file_paths = glob.glob("{}/worker*.out".format(logs_dir)) + if len(log_file_paths) == 0: + time.sleep(0.2) + else: + break + + with open(log_file_paths[0], "wb") as f: + f.write(b"\xe5abc\nline2\nline3\n") + f.write(b"\xe5abc\nline2\nline3\n") + f.write(b"\xe5abc\nline2\nline3\n") + f.flush() + + # Wait till the log monitor reads the file. + time.sleep(1.0) + + # Make sure that nothing has died. + assert ray.services.remaining_processes_alive() + + +@pytest.mark.skip(reason="This test is too expensive to run.") +def test_move_log_files_to_old(shutdown_only): + info = ray.init(num_cpus=1) + + logs_dir = os.path.join(info["session_dir"], "logs") + + @ray.remote + class Actor(object): + def f(self): + print("function f finished") + + # First create a temporary actor. + actors = [ + Actor.remote() for i in range(ray_constants.LOG_MONITOR_MAX_OPEN_FILES) + ] + ray.get([a.f.remote() for a in actors]) + + # Make sure no log files are in the "old" directory before the actors + # are killed. + assert len(glob.glob("{}/old/worker*.out".format(logs_dir))) == 0 + + # Now kill the actors so the files get moved to logs/old/. 
+ [a.__ray_terminate__.remote() for a in actors] + + while True: + log_file_paths = glob.glob("{}/old/worker*.out".format(logs_dir)) + if len(log_file_paths) > 0: + with open(log_file_paths[0], "r") as f: + assert "function f finished\n" in f.readlines() + break + + # Make sure that nothing has died. + assert ray.services.remaining_processes_alive() + + +if __name__ == "__main__": + import pytest + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_array.py b/python/ray/tests/test_array.py index a0b600737..688b2a878 100644 --- a/python/ray/tests/test_array.py +++ b/python/ray/tests/test_array.py @@ -10,7 +10,7 @@ import sys import ray import ray.experimental.array.remote as ra import ray.experimental.array.distributed as da -import ray.tests.cluster_utils +import ray.cluster_utils if sys.version_info >= (3, 0): from importlib import reload @@ -216,3 +216,9 @@ def test_distributed_array_methods(ray_start_cluster_2_nodes, reload_modules): d1 = np.random.randint(1, 35) d2 = np.random.randint(1, 35) test_dist_qr(d1, d2) + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_autoscaler.py b/python/ray/tests/test_autoscaler.py index bb1524c68..46108f645 100644 --- a/python/ray/tests/test_autoscaler.py +++ b/python/ray/tests/test_autoscaler.py @@ -17,7 +17,7 @@ from ray.autoscaler.autoscaler import StandardAutoscaler, LoadMetrics, \ from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_NODE_STATUS, \ STATUS_UP_TO_DATE, STATUS_UPDATE_FAILED from ray.autoscaler.node_provider import NODE_PROVIDERS, NodeProvider -from ray.tests.utils import RayTestTimeoutException +from ray.test_utils import RayTestTimeoutException import pytest @@ -1084,4 +1084,5 @@ class AutoscalingTest(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_autoscaler_yaml.py 
b/python/ray/tests/test_autoscaler_yaml.py index ac7d6657b..6b2886890 100644 --- a/python/ray/tests/test_autoscaler_yaml.py +++ b/python/ray/tests/test_autoscaler_yaml.py @@ -7,7 +7,7 @@ import unittest import yaml from ray.autoscaler.autoscaler import fillout_defaults, validate_config -from ray.tests.utils import recursive_fnmatch +from ray.test_utils import recursive_fnmatch RAY_PATH = os.path.abspath(os.path.join(__file__, "../../")) CONFIG_PATHS = recursive_fnmatch( @@ -31,4 +31,6 @@ class AutoscalingConfigTest(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_basic.py b/python/ray/tests/test_basic.py index 1ce5bf856..16aef4830 100644 --- a/python/ray/tests/test_basic.py +++ b/python/ray/tests/test_basic.py @@ -4,37 +4,22 @@ from __future__ import division from __future__ import print_function import collections -from concurrent.futures import ThreadPoolExecutor -import glob import io import json import logging -import os -import random import re -import setproctitle -import shutil -import six -import socket import string -import subprocess import sys -import tempfile import threading import time import numpy as np -import pickle import pytest import ray -from ray import signature from ray.exceptions import RayTimeoutError -import ray.ray_constants as ray_constants -import ray.tests.cluster_utils -import ray.tests.utils - -from ray.tests.utils import RayTestTimeoutException +import ray.cluster_utils +import ray.test_utils logger = logging.getLogger(__name__) @@ -1562,2194 +1547,7 @@ def test_wait(ray_start_regular): ray.wait([1]) -def test_wait_iterables(ray_start_regular): - @ray.remote - def f(delay): - time.sleep(delay) - return 1 - - objectids = (f.remote(1.0), f.remote(0.5), f.remote(0.5), f.remote(0.5)) - ready_ids, remaining_ids = ray.experimental.wait(objectids) - assert len(ready_ids) == 1 - assert len(remaining_ids) == 3 
- - objectids = np.array( - [f.remote(1.0), - f.remote(0.5), - f.remote(0.5), - f.remote(0.5)]) - ready_ids, remaining_ids = ray.experimental.wait(objectids) - assert len(ready_ids) == 1 - assert len(remaining_ids) == 3 - - -def test_multiple_waits_and_gets(shutdown_only): - # It is important to use three workers here, so that the three tasks - # launched in this experiment can run at the same time. - ray.init(num_cpus=3) - - @ray.remote - def f(delay): - time.sleep(delay) - return 1 - - @ray.remote - def g(l): - # The argument l should be a list containing one object ID. - ray.wait([l[0]]) - - @ray.remote - def h(l): - # The argument l should be a list containing one object ID. - ray.get(l[0]) - - # Make sure that multiple wait requests involving the same object ID - # all return. - x = f.remote(1) - ray.get([g.remote([x]), g.remote([x])]) - - # Make sure that multiple get requests involving the same object ID all - # return. - x = f.remote(1) - ray.get([h.remote([x]), h.remote([x])]) - - -def test_caching_functions_to_run(shutdown_only): - # Test that we export functions to run on all workers before the driver - # is connected. - def f(worker_info): - sys.path.append(1) - - ray.worker.global_worker.run_function_on_all_workers(f) - - def f(worker_info): - sys.path.append(2) - - ray.worker.global_worker.run_function_on_all_workers(f) - - def g(worker_info): - sys.path.append(3) - - ray.worker.global_worker.run_function_on_all_workers(g) - - def f(worker_info): - sys.path.append(4) - - ray.worker.global_worker.run_function_on_all_workers(f) - - ray.init(num_cpus=1) - - @ray.remote - def get_state(): - time.sleep(1) - return sys.path[-4], sys.path[-3], sys.path[-2], sys.path[-1] - - res1 = get_state.remote() - res2 = get_state.remote() - assert ray.get(res1) == (1, 2, 3, 4) - assert ray.get(res2) == (1, 2, 3, 4) - - # Clean up the path on the workers. 
- def f(worker_info): - sys.path.pop() - sys.path.pop() - sys.path.pop() - sys.path.pop() - - ray.worker.global_worker.run_function_on_all_workers(f) - - -def test_running_function_on_all_workers(ray_start_regular): - def f(worker_info): - sys.path.append("fake_directory") - - ray.worker.global_worker.run_function_on_all_workers(f) - - @ray.remote - def get_path1(): - return sys.path - - assert "fake_directory" == ray.get(get_path1.remote())[-1] - - def f(worker_info): - sys.path.pop(-1) - - ray.worker.global_worker.run_function_on_all_workers(f) - - # Create a second remote function to guarantee that when we call - # get_path2.remote(), the second function to run will have been run on - # the worker. - @ray.remote - def get_path2(): - return sys.path - - assert "fake_directory" not in ray.get(get_path2.remote()) - - -def test_profiling_api(ray_start_2_cpus): - @ray.remote - def f(): - with ray.profile("custom_event", extra_data={"name": "custom name"}): - pass - - ray.put(1) - object_id = f.remote() - ray.wait([object_id]) - ray.get(object_id) - - # Wait until all of the profiling information appears in the profile - # table. - timeout_seconds = 20 - start_time = time.time() - while True: - profile_data = ray.timeline() - event_types = {event["cat"] for event in profile_data} - expected_types = [ - "task", - "task:deserialize_arguments", - "task:execute", - "task:store_outputs", - "wait_for_function", - "ray.get", - "ray.put", - "ray.wait", - "submit_task", - "fetch_and_run_function", - "register_remote_function", - "custom_event", # This is the custom one from ray.profile. - ] - - if all(expected_type in event_types - for expected_type in expected_types): - break - - if time.time() - start_time > timeout_seconds: - raise RayTestTimeoutException( - "Timed out while waiting for information in " - "profile table. Missing events: {}.".format( - set(expected_types) - set(event_types))) - - # The profiling information only flushes once every second. 
- time.sleep(1.1) - - -def test_wait_cluster(ray_start_cluster): - cluster = ray_start_cluster - cluster.add_node(num_cpus=1, resources={"RemoteResource": 1}) - cluster.add_node(num_cpus=1, resources={"RemoteResource": 1}) - ray.init(address=cluster.address) - - @ray.remote(resources={"RemoteResource": 1}) - def f(): - return - - # Make sure we have enough workers on the remote nodes to execute some - # tasks. - tasks = [f.remote() for _ in range(10)] - start = time.time() - ray.get(tasks) - end = time.time() - - # Submit some more tasks that can only be executed on the remote nodes. - tasks = [f.remote() for _ in range(10)] - # Sleep for a bit to let the tasks finish. - time.sleep((end - start) * 2) - _, unready = ray.wait(tasks, num_returns=len(tasks), timeout=0) - # All remote tasks should have finished. - assert len(unready) == 0 - - -def test_object_transfer_dump(ray_start_cluster): - cluster = ray_start_cluster - - num_nodes = 3 - for i in range(num_nodes): - cluster.add_node(resources={str(i): 1}, object_store_memory=10**9) - ray.init(address=cluster.address) - - @ray.remote - def f(x): - return - - # These objects will live on different nodes. - object_ids = [ - f._remote(args=[1], resources={str(i): 1}) for i in range(num_nodes) - ] - - # Broadcast each object from each machine to each other machine. - for object_id in object_ids: - ray.get([ - f._remote(args=[object_id], resources={str(i): 1}) - for i in range(num_nodes) - ]) - - # The profiling information only flushes once every second. - time.sleep(1.1) - - transfer_dump = ray.object_transfer_timeline() - # Make sure the transfer dump can be serialized with JSON. 
- json.loads(json.dumps(transfer_dump)) - assert len(transfer_dump) >= num_nodes**2 - assert len({ - event["pid"] - for event in transfer_dump if event["name"] == "transfer_receive" - }) == num_nodes - assert len({ - event["pid"] - for event in transfer_dump if event["name"] == "transfer_send" - }) == num_nodes - - -def test_identical_function_names(ray_start_regular): - # Define a bunch of remote functions and make sure that we don't - # accidentally call an older version. - - num_calls = 200 - - @ray.remote - def f(): - return 1 - - results1 = [f.remote() for _ in range(num_calls)] - - @ray.remote - def f(): - return 2 - - results2 = [f.remote() for _ in range(num_calls)] - - @ray.remote - def f(): - return 3 - - results3 = [f.remote() for _ in range(num_calls)] - - @ray.remote - def f(): - return 4 - - results4 = [f.remote() for _ in range(num_calls)] - - @ray.remote - def f(): - return 5 - - results5 = [f.remote() for _ in range(num_calls)] - - assert ray.get(results1) == num_calls * [1] - assert ray.get(results2) == num_calls * [2] - assert ray.get(results3) == num_calls * [3] - assert ray.get(results4) == num_calls * [4] - assert ray.get(results5) == num_calls * [5] - - @ray.remote - def g(): - return 1 - - @ray.remote # noqa: F811 - def g(): - return 2 - - @ray.remote # noqa: F811 - def g(): - return 3 - - @ray.remote # noqa: F811 - def g(): - return 4 - - @ray.remote # noqa: F811 - def g(): - return 5 - - result_values = ray.get([g.remote() for _ in range(num_calls)]) - assert result_values == num_calls * [5] - - -def test_illegal_api_calls(ray_start_regular): - - # Verify that we cannot call put on an ObjectID. - x = ray.put(1) - with pytest.raises(Exception): - ray.put(x) - # Verify that we cannot call get on a regular value. - with pytest.raises(Exception): - ray.get(3) - - -# TODO(hchen): This test currently doesn't work in Python 2. This is likely -# because plasma client isn't thread-safe. This needs to be fixed from the -# Arrow side. 
See #4107 for relevant discussions. -@pytest.mark.skipif(six.PY2, reason="Doesn't work in Python 2.") -def test_multithreading(ray_start_2_cpus): - # This test requires at least 2 CPUs to finish since the worker does not - # release resources when joining the threads. - - def run_test_in_multi_threads(test_case, num_threads=10, num_repeats=25): - """A helper function that runs test cases in multiple threads.""" - - def wrapper(): - for _ in range(num_repeats): - test_case() - time.sleep(random.randint(0, 10) / 1000.0) - return "ok" - - executor = ThreadPoolExecutor(max_workers=num_threads) - futures = [executor.submit(wrapper) for _ in range(num_threads)] - for future in futures: - assert future.result() == "ok" - - @ray.remote - def echo(value, delay_ms=0): - if delay_ms > 0: - time.sleep(delay_ms / 1000.0) - return value - - def test_api_in_multi_threads(): - """Test using Ray api in multiple threads.""" - - @ray.remote - class Echo(object): - def echo(self, value): - return value - - # Test calling remote functions in multiple threads. - def test_remote_call(): - value = random.randint(0, 1000000) - result = ray.get(echo.remote(value)) - assert value == result - - run_test_in_multi_threads(test_remote_call) - - # Test multiple threads calling one actor. - actor = Echo.remote() - - def test_call_actor(): - value = random.randint(0, 1000000) - result = ray.get(actor.echo.remote(value)) - assert value == result - - run_test_in_multi_threads(test_call_actor) - - # Test put and get. - def test_put_and_get(): - value = random.randint(0, 1000000) - result = ray.get(ray.put(value)) - assert value == result - - run_test_in_multi_threads(test_put_and_get) - - # Test multiple threads waiting for objects. 
- num_wait_objects = 10 - objects = [ - echo.remote(i, delay_ms=10) for i in range(num_wait_objects) - ] - - def test_wait(): - ready, _ = ray.wait( - objects, - num_returns=len(objects), - timeout=1000.0, - ) - assert len(ready) == num_wait_objects - assert ray.get(ready) == list(range(num_wait_objects)) - - run_test_in_multi_threads(test_wait, num_repeats=1) - - # Run tests in a driver. - test_api_in_multi_threads() - - # Run tests in a worker. - @ray.remote - def run_tests_in_worker(): - test_api_in_multi_threads() - return "ok" - - assert ray.get(run_tests_in_worker.remote()) == "ok" - - # Test actor that runs background threads. - @ray.remote - class MultithreadedActor(object): - def __init__(self): - self.lock = threading.Lock() - self.thread_results = [] - - def background_thread(self, wait_objects): - try: - # Test wait - ready, _ = ray.wait( - wait_objects, - num_returns=len(wait_objects), - timeout=1000.0, - ) - assert len(ready) == len(wait_objects) - for _ in range(20): - num = 10 - # Test remote call - results = [echo.remote(i) for i in range(num)] - assert ray.get(results) == list(range(num)) - # Test put and get - objects = [ray.put(i) for i in range(num)] - assert ray.get(objects) == list(range(num)) - time.sleep(random.randint(0, 10) / 1000.0) - except Exception as e: - with self.lock: - self.thread_results.append(e) - else: - with self.lock: - self.thread_results.append("ok") - - def spawn(self): - wait_objects = [echo.remote(i, delay_ms=10) for i in range(10)] - self.threads = [ - threading.Thread( - target=self.background_thread, args=(wait_objects, )) - for _ in range(20) - ] - [thread.start() for thread in self.threads] - - def join(self): - [thread.join() for thread in self.threads] - assert self.thread_results == ["ok"] * len(self.threads) - return "ok" - - actor = MultithreadedActor.remote() - actor.spawn.remote() - ray.get(actor.join.remote()) == "ok" - - -def test_free_objects_multi_node(ray_start_cluster): - # This test will do 
following: - # 1. Create 3 raylets that each hold an actor. - # 2. Each actor creates an object which is the deletion target. - # 3. Wait 0.1 second for the objects to be deleted. - # 4. Check that the deletion targets have been deleted. - # Caution: if remote functions are used instead of actor methods, - # one raylet may create more than one worker to execute the - # tasks, so the flushing operations may be executed in different - # workers and the plasma client holding the deletion target - # may not be flushed. - cluster = ray_start_cluster - config = json.dumps({"object_manager_repeated_push_delay_ms": 1000}) - for i in range(3): - cluster.add_node( - num_cpus=1, - resources={"Custom{}".format(i): 1}, - _internal_config=config) - ray.init(address=cluster.address) - - class RawActor(object): - def get(self): - return ray.worker.global_worker.node.unique_id - - ActorOnNode0 = ray.remote(resources={"Custom0": 1})(RawActor) - ActorOnNode1 = ray.remote(resources={"Custom1": 1})(RawActor) - ActorOnNode2 = ray.remote(resources={"Custom2": 1})(RawActor) - - def create(actors): - a = actors[0].get.remote() - b = actors[1].get.remote() - c = actors[2].get.remote() - (l1, l2) = ray.wait([a, b, c], num_returns=3) - assert len(l1) == 3 - assert len(l2) == 0 - return (a, b, c) - - def run_one_test(actors, local_only, delete_creating_tasks): - (a, b, c) = create(actors) - # The three objects should be generated on different object stores. - assert ray.get(a) != ray.get(b) - assert ray.get(a) != ray.get(c) - assert ray.get(c) != ray.get(b) - ray.internal.free( - [a, b, c], - local_only=local_only, - delete_creating_tasks=delete_creating_tasks) - # Wait for the objects to be deleted. - time.sleep(0.1) - return (a, b, c) - - actors = [ - ActorOnNode0.remote(), - ActorOnNode1.remote(), - ActorOnNode2.remote() - ] - # Case 1: run this local_only=False. All 3 objects will be deleted. 
- (a, b, c) = run_one_test(actors, False, False) - (l1, l2) = ray.wait([a, b, c], timeout=0.01, num_returns=1) - # All the objects are deleted. - assert len(l1) == 0 - assert len(l2) == 3 - # Case 2: run this local_only=True. Only 1 object will be deleted. - (a, b, c) = run_one_test(actors, True, False) - (l1, l2) = ray.wait([a, b, c], timeout=0.01, num_returns=3) - # One object is deleted and 2 objects are not. - assert len(l1) == 2 - assert len(l2) == 1 - # The deleted object will have the same store with the driver. - local_return = ray.worker.global_worker.node.unique_id - for object_id in l1: - assert ray.get(object_id) != local_return - - # Case3: These cases test the deleting creating tasks for the object. - (a, b, c) = run_one_test(actors, False, False) - task_table = ray.tasks() - for obj in [a, b, c]: - assert ray._raylet.compute_task_id(obj).hex() in task_table - - (a, b, c) = run_one_test(actors, False, True) - task_table = ray.tasks() - for obj in [a, b, c]: - assert ray._raylet.compute_task_id(obj).hex() not in task_table - - -def test_local_mode(shutdown_only): - @ray.remote - def local_mode_f(): - return np.array([0, 0]) - - @ray.remote - def local_mode_g(x): - x[0] = 1 - return x - - ray.init(local_mode=True) - - @ray.remote - def f(): - return np.ones([3, 4, 5]) - - xref = f.remote() - # Remote functions should return ObjectIDs. - assert isinstance(xref, ray.ObjectID) - assert np.alltrue(ray.get(xref) == np.ones([3, 4, 5])) - y = np.random.normal(size=[11, 12]) - # Check that ray.get(ray.put) is the identity. - assert np.alltrue(y == ray.get(ray.put(y))) - - # Make sure objects are immutable, this example is why we need to copy - # arguments before passing them into remote functions in python mode - aref = local_mode_f.remote() - assert np.alltrue(ray.get(aref) == np.array([0, 0])) - bref = local_mode_g.remote(ray.get(aref)) - # Make sure local_mode_g does not mutate aref. 
- assert np.alltrue(ray.get(aref) == np.array([0, 0])) - assert np.alltrue(ray.get(bref) == np.array([1, 0])) - - # wait should return the first num_returns values passed in as the - # first list and the remaining values as the second list - num_returns = 5 - object_ids = [ray.put(i) for i in range(20)] - ready, remaining = ray.wait( - object_ids, num_returns=num_returns, timeout=None) - assert ready == object_ids[:num_returns] - assert remaining == object_ids[num_returns:] - - # Check that ray.put() and ray.internal.free() work in local mode. - - v1 = np.ones(10) - v2 = np.zeros(10) - - k1 = ray.put(v1) - assert np.alltrue(v1 == ray.get(k1)) - k2 = ray.put(v2) - assert np.alltrue(v2 == ray.get(k2)) - - ray.internal.free([k1, k2]) - with pytest.raises(Exception): - ray.get(k1) - with pytest.raises(Exception): - ray.get(k2) - - # Should fail silently. - ray.internal.free([k1, k2]) - - # Test actors in LOCAL_MODE. - - @ray.remote - class LocalModeTestClass(object): - def __init__(self, array): - self.array = array - - def set_array(self, array): - self.array = array - - def get_array(self): - return self.array - - def modify_and_set_array(self, array): - array[0] = -1 - self.array = array - - @ray.method(num_return_vals=3) - def returns_multiple(self): - return 1, 2, 3 - - test_actor = LocalModeTestClass.remote(np.arange(10)) - obj = test_actor.get_array.remote() - assert isinstance(obj, ray.ObjectID) - assert np.alltrue(ray.get(obj) == np.arange(10)) - - test_array = np.arange(10) - # Remote actor functions should not mutate arguments - test_actor.modify_and_set_array.remote(test_array) - assert np.alltrue(test_array == np.arange(10)) - # Remote actor functions should keep state - test_array[0] = -1 - assert np.alltrue(test_array == ray.get(test_actor.get_array.remote())) - - # Check that actor handles work in local mode. 
- - @ray.remote - def use_actor_handle(handle): - array = np.ones(10) - handle.set_array.remote(array) - assert np.alltrue(array == ray.get(handle.get_array.remote())) - - ray.get(use_actor_handle.remote(test_actor)) - - # Check that exceptions are deferred until ray.get(). - - exception_str = "test_basic remote task exception" - - @ray.remote - def throws(): - raise Exception(exception_str) - - obj = throws.remote() - with pytest.raises(Exception, match=exception_str): - ray.get(obj) - - # Check that multiple return values are handled properly. - - @ray.remote(num_return_vals=3) - def returns_multiple(): - return 1, 2, 3 - - obj1, obj2, obj3 = returns_multiple.remote() - assert ray.get(obj1) == 1 - assert ray.get(obj2) == 2 - assert ray.get(obj3) == 3 - assert ray.get([obj1, obj2, obj3]) == [1, 2, 3] - - obj1, obj2, obj3 = test_actor.returns_multiple.remote() - assert ray.get(obj1) == 1 - assert ray.get(obj2) == 2 - assert ray.get(obj3) == 3 - assert ray.get([obj1, obj2, obj3]) == [1, 2, 3] - - @ray.remote(num_return_vals=2) - def returns_multiple_throws(): - raise Exception(exception_str) - - obj1, obj2 = returns_multiple_throws.remote() - with pytest.raises(Exception, match=exception_str): - ray.get(obj) - ray.get(obj1) - with pytest.raises(Exception, match=exception_str): - ray.get(obj2) - - # Check that Actors are not overwritten by remote calls from different - # classes. - @ray.remote - class RemoteActor1(object): - def __init__(self): - pass - - def function1(self): - return 0 - - @ray.remote - class RemoteActor2(object): - def __init__(self): - pass - - def function2(self): - return 1 - - actor1 = RemoteActor1.remote() - _ = RemoteActor2.remote() - assert ray.get(actor1.function1.remote()) == 0 - - # Test passing ObjectIDs. 
- @ray.remote - def direct_dep(input): - return input - - @ray.remote - def indirect_dep(input): - return ray.get(direct_dep.remote(input[0])) - - assert ray.get(indirect_dep.remote(["hello"])) == "hello" - - -def test_resource_constraints(shutdown_only): - num_workers = 20 - ray.init(num_cpus=10, num_gpus=2) - - @ray.remote(num_cpus=0) - def get_worker_id(): - time.sleep(0.1) - return os.getpid() - - # Attempt to wait for all of the workers to start up. - while True: - if len( - set( - ray.get([ - get_worker_id.remote() for _ in range(num_workers) - ]))) == num_workers: - break - - time_buffer = 2 - - # At most 10 copies of this can run at once. - @ray.remote(num_cpus=1) - def f(n): - time.sleep(n) - - start_time = time.time() - ray.get([f.remote(0.5) for _ in range(10)]) - duration = time.time() - start_time - assert duration < 0.5 + time_buffer - assert duration > 0.5 - - start_time = time.time() - ray.get([f.remote(0.5) for _ in range(11)]) - duration = time.time() - start_time - assert duration < 1 + time_buffer - assert duration > 1 - - @ray.remote(num_cpus=3) - def f(n): - time.sleep(n) - - start_time = time.time() - ray.get([f.remote(0.5) for _ in range(3)]) - duration = time.time() - start_time - assert duration < 0.5 + time_buffer - assert duration > 0.5 - - start_time = time.time() - ray.get([f.remote(0.5) for _ in range(4)]) - duration = time.time() - start_time - assert duration < 1 + time_buffer - assert duration > 1 - - @ray.remote(num_gpus=1) - def f(n): - time.sleep(n) - - start_time = time.time() - ray.get([f.remote(0.5) for _ in range(2)]) - duration = time.time() - start_time - assert duration < 0.5 + time_buffer - assert duration > 0.5 - - start_time = time.time() - ray.get([f.remote(0.5) for _ in range(3)]) - duration = time.time() - start_time - assert duration < 1 + time_buffer - assert duration > 1 - - start_time = time.time() - ray.get([f.remote(0.5) for _ in range(4)]) - duration = time.time() - start_time - assert duration < 1 + 
time_buffer - assert duration > 1 - - -def test_multi_resource_constraints(shutdown_only): - num_workers = 20 - ray.init(num_cpus=10, num_gpus=10) - - @ray.remote(num_cpus=0) - def get_worker_id(): - time.sleep(0.1) - return os.getpid() - - # Attempt to wait for all of the workers to start up. - while True: - if len( - set( - ray.get([ - get_worker_id.remote() for _ in range(num_workers) - ]))) == num_workers: - break - - @ray.remote(num_cpus=1, num_gpus=9) - def f(n): - time.sleep(n) - - @ray.remote(num_cpus=9, num_gpus=1) - def g(n): - time.sleep(n) - - time_buffer = 2 - - start_time = time.time() - ray.get([f.remote(0.5), g.remote(0.5)]) - duration = time.time() - start_time - assert duration < 0.5 + time_buffer - assert duration > 0.5 - - start_time = time.time() - ray.get([f.remote(0.5), f.remote(0.5)]) - duration = time.time() - start_time - assert duration < 1 + time_buffer - assert duration > 1 - - start_time = time.time() - ray.get([g.remote(0.5), g.remote(0.5)]) - duration = time.time() - start_time - assert duration < 1 + time_buffer - assert duration > 1 - - start_time = time.time() - ray.get([f.remote(0.5), f.remote(0.5), g.remote(0.5), g.remote(0.5)]) - duration = time.time() - start_time - assert duration < 1 + time_buffer - assert duration > 1 - - -def test_gpu_ids(shutdown_only): - num_gpus = 10 - ray.init(num_cpus=10, num_gpus=num_gpus) - - def get_gpu_ids(num_gpus_per_worker): - time.sleep(0.1) - gpu_ids = ray.get_gpu_ids() - assert len(gpu_ids) == num_gpus_per_worker - assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join( - [str(i) for i in gpu_ids])) - for gpu_id in gpu_ids: - assert gpu_id in range(num_gpus) - return gpu_ids - - f0 = ray.remote(num_gpus=0)(lambda: get_gpu_ids(0)) - f1 = ray.remote(num_gpus=1)(lambda: get_gpu_ids(1)) - f2 = ray.remote(num_gpus=2)(lambda: get_gpu_ids(2)) - f4 = ray.remote(num_gpus=4)(lambda: get_gpu_ids(4)) - f5 = ray.remote(num_gpus=5)(lambda: get_gpu_ids(5)) - - # Wait for all workers to start up. 
- @ray.remote - def f(): - time.sleep(0.1) - return os.getpid() - - start_time = time.time() - while True: - if len(set(ray.get([f.remote() for _ in range(10)]))) == 10: - break - if time.time() > start_time + 10: - raise RayTestTimeoutException( - "Timed out while waiting for workers to start " - "up.") - - list_of_ids = ray.get([f0.remote() for _ in range(10)]) - assert list_of_ids == 10 * [[]] - - list_of_ids = ray.get([f1.remote() for _ in range(10)]) - set_of_ids = {tuple(gpu_ids) for gpu_ids in list_of_ids} - assert set_of_ids == {(i, ) for i in range(10)} - - list_of_ids = ray.get([f2.remote(), f4.remote(), f4.remote()]) - all_ids = [gpu_id for gpu_ids in list_of_ids for gpu_id in gpu_ids] - assert set(all_ids) == set(range(10)) - - # There are only 10 GPUs, and each task uses 5 GPUs, so there should only - # be 2 tasks scheduled at a given time. - t1 = time.time() - ray.get([f5.remote() for _ in range(20)]) - assert time.time() - t1 >= 10 * 0.1 - - # Test that actors have CUDA_VISIBLE_DEVICES set properly. - - @ray.remote - class Actor0(object): - def __init__(self): - gpu_ids = ray.get_gpu_ids() - assert len(gpu_ids) == 0 - assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join( - [str(i) for i in gpu_ids])) - # Set self.x to make sure that we got here. - self.x = 1 - - def test(self): - gpu_ids = ray.get_gpu_ids() - assert len(gpu_ids) == 0 - assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join( - [str(i) for i in gpu_ids])) - return self.x - - @ray.remote(num_gpus=1) - class Actor1(object): - def __init__(self): - gpu_ids = ray.get_gpu_ids() - assert len(gpu_ids) == 1 - assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join( - [str(i) for i in gpu_ids])) - # Set self.x to make sure that we got here. 
- self.x = 1 - - def test(self): - gpu_ids = ray.get_gpu_ids() - assert len(gpu_ids) == 1 - assert (os.environ["CUDA_VISIBLE_DEVICES"] == ",".join( - [str(i) for i in gpu_ids])) - return self.x - - a0 = Actor0.remote() - ray.get(a0.test.remote()) - - a1 = Actor1.remote() - ray.get(a1.test.remote()) - - -def test_zero_cpus(shutdown_only): - ray.init(num_cpus=0) - - # We should be able to execute a task that requires 0 CPU resources. - @ray.remote(num_cpus=0) - def f(): - return 1 - - ray.get(f.remote()) - - # We should be able to create an actor that requires 0 CPU resources. - @ray.remote(num_cpus=0) - class Actor(object): - def method(self): - pass - - a = Actor.remote() - x = a.method.remote() - ray.get(x) - - -def test_zero_cpus_actor(ray_start_cluster): - cluster = ray_start_cluster - cluster.add_node(num_cpus=0) - cluster.add_node(num_cpus=2) - ray.init(address=cluster.address) - - node_id = ray.worker.global_worker.node.unique_id - - @ray.remote - class Foo(object): - def method(self): - return ray.worker.global_worker.node.unique_id - - # Make sure tasks and actors run on the remote raylet. - a = Foo.remote() - assert ray.get(a.method.remote()) != node_id - - -def test_fractional_resources(shutdown_only): - ray.init(num_cpus=6, num_gpus=3, resources={"Custom": 1}) - - @ray.remote(num_gpus=0.5) - class Foo1(object): - def method(self): - gpu_ids = ray.get_gpu_ids() - assert len(gpu_ids) == 1 - return gpu_ids[0] - - foos = [Foo1.remote() for _ in range(6)] - gpu_ids = ray.get([f.method.remote() for f in foos]) - for i in range(3): - assert gpu_ids.count(i) == 2 - del foos - - @ray.remote - class Foo2(object): - def method(self): - pass - - # Create an actor that requires 0.7 of the custom resource. - f1 = Foo2._remote([], {}, resources={"Custom": 0.7}) - ray.get(f1.method.remote()) - # Make sure that we cannot create an actor that requires 0.7 of the - # custom resource. TODO(rkn): Re-enable this once ray.wait is - # implemented. 
- f2 = Foo2._remote([], {}, resources={"Custom": 0.7}) - ready, _ = ray.wait([f2.method.remote()], timeout=0.5) - assert len(ready) == 0 - # Make sure we can start an actor that requries only 0.3 of the custom - # resource. - f3 = Foo2._remote([], {}, resources={"Custom": 0.3}) - ray.get(f3.method.remote()) - - del f1, f3 - - # Make sure that we get exceptions if we submit tasks that require a - # fractional number of resources greater than 1. - - @ray.remote(num_cpus=1.5) - def test(): - pass - - with pytest.raises(ValueError): - test.remote() - - with pytest.raises(ValueError): - Foo2._remote([], {}, resources={"Custom": 1.5}) - - -def test_multiple_raylets(ray_start_cluster): - # This test will define a bunch of tasks that can only be assigned to - # specific raylets, and we will check that they are assigned - # to the correct raylets. - cluster = ray_start_cluster - cluster.add_node(num_cpus=11, num_gpus=0) - cluster.add_node(num_cpus=5, num_gpus=5) - cluster.add_node(num_cpus=10, num_gpus=1) - ray.init(address=cluster.address) - cluster.wait_for_nodes() - - # Define a bunch of remote functions that all return the socket name of - # the plasma store. Since there is a one-to-one correspondence between - # plasma stores and raylets (at least right now), this can be - # used to identify which raylet the task was assigned to. - - # This must be run on the zeroth raylet. - @ray.remote(num_cpus=11) - def run_on_0(): - return ray.worker.global_worker.node.plasma_store_socket_name - - # This must be run on the first raylet. - @ray.remote(num_gpus=2) - def run_on_1(): - return ray.worker.global_worker.node.plasma_store_socket_name - - # This must be run on the second raylet. - @ray.remote(num_cpus=6, num_gpus=1) - def run_on_2(): - return ray.worker.global_worker.node.plasma_store_socket_name - - # This can be run anywhere. 
- @ray.remote(num_cpus=0, num_gpus=0) - def run_on_0_1_2(): - return ray.worker.global_worker.node.plasma_store_socket_name - - # This must be run on the first or second raylet. - @ray.remote(num_gpus=1) - def run_on_1_2(): - return ray.worker.global_worker.node.plasma_store_socket_name - - # This must be run on the zeroth or second raylet. - @ray.remote(num_cpus=8) - def run_on_0_2(): - return ray.worker.global_worker.node.plasma_store_socket_name - - def run_lots_of_tasks(): - names = [] - results = [] - for i in range(100): - index = np.random.randint(6) - if index == 0: - names.append("run_on_0") - results.append(run_on_0.remote()) - elif index == 1: - names.append("run_on_1") - results.append(run_on_1.remote()) - elif index == 2: - names.append("run_on_2") - results.append(run_on_2.remote()) - elif index == 3: - names.append("run_on_0_1_2") - results.append(run_on_0_1_2.remote()) - elif index == 4: - names.append("run_on_1_2") - results.append(run_on_1_2.remote()) - elif index == 5: - names.append("run_on_0_2") - results.append(run_on_0_2.remote()) - return names, results - - client_table = ray.nodes() - store_names = [] - store_names += [ - client["ObjectStoreSocketName"] for client in client_table - if client["Resources"].get("GPU", 0) == 0 - ] - store_names += [ - client["ObjectStoreSocketName"] for client in client_table - if client["Resources"].get("GPU", 0) == 5 - ] - store_names += [ - client["ObjectStoreSocketName"] for client in client_table - if client["Resources"].get("GPU", 0) == 1 - ] - assert len(store_names) == 3 - - def validate_names_and_results(names, results): - for name, result in zip(names, ray.get(results)): - if name == "run_on_0": - assert result in [store_names[0]] - elif name == "run_on_1": - assert result in [store_names[1]] - elif name == "run_on_2": - assert result in [store_names[2]] - elif name == "run_on_0_1_2": - assert (result in [ - store_names[0], store_names[1], store_names[2] - ]) - elif name == "run_on_1_2": - assert 
result in [store_names[1], store_names[2]] - elif name == "run_on_0_2": - assert result in [store_names[0], store_names[2]] - else: - raise Exception("This should be unreachable.") - assert set(ray.get(results)) == set(store_names) - - names, results = run_lots_of_tasks() - validate_names_and_results(names, results) - - # Make sure the same thing works when this is nested inside of a task. - - @ray.remote - def run_nested1(): - names, results = run_lots_of_tasks() - return names, results - - @ray.remote - def run_nested2(): - names, results = ray.get(run_nested1.remote()) - return names, results - - names, results = ray.get(run_nested2.remote()) - validate_names_and_results(names, results) - - -def test_custom_resources(ray_start_cluster): - cluster = ray_start_cluster - cluster.add_node(num_cpus=3, resources={"CustomResource": 0}) - cluster.add_node(num_cpus=3, resources={"CustomResource": 1}) - ray.init(address=cluster.address) - - @ray.remote - def f(): - time.sleep(0.001) - return ray.worker.global_worker.node.unique_id - - @ray.remote(resources={"CustomResource": 1}) - def g(): - time.sleep(0.001) - return ray.worker.global_worker.node.unique_id - - @ray.remote(resources={"CustomResource": 1}) - def h(): - ray.get([f.remote() for _ in range(5)]) - return ray.worker.global_worker.node.unique_id - - # The f tasks should be scheduled on both raylets. - assert len(set(ray.get([f.remote() for _ in range(50)]))) == 2 - - node_id = ray.worker.global_worker.node.unique_id - - # The g tasks should be scheduled only on the second raylet. - raylet_ids = set(ray.get([g.remote() for _ in range(50)])) - assert len(raylet_ids) == 1 - assert list(raylet_ids)[0] != node_id - - # Make sure that resource bookkeeping works when a task that uses a - # custom resources gets blocked. 
- ray.get([h.remote() for _ in range(5)]) - - -def test_node_id_resource(ray_start_cluster): - cluster = ray_start_cluster - cluster.add_node(num_cpus=3) - cluster.add_node(num_cpus=3) - ray.init(address=cluster.address) - - local_node = ray.state.current_node_id() - - # Note that these will have the same IP in the test cluster - assert len(ray.state.node_ids()) == 2 - assert local_node in ray.state.node_ids() - - @ray.remote(resources={local_node: 1}) - def f(): - return ray.state.current_node_id() - - # Check the node id resource is automatically usable for scheduling. - assert ray.get(f.remote()) == ray.state.current_node_id() - - -def test_two_custom_resources(ray_start_cluster): - cluster = ray_start_cluster - cluster.add_node( - num_cpus=3, resources={ - "CustomResource1": 1, - "CustomResource2": 2 - }) - cluster.add_node( - num_cpus=3, resources={ - "CustomResource1": 3, - "CustomResource2": 4 - }) - ray.init(address=cluster.address) - - @ray.remote(resources={"CustomResource1": 1}) - def f(): - time.sleep(0.001) - return ray.worker.global_worker.node.unique_id - - @ray.remote(resources={"CustomResource2": 1}) - def g(): - time.sleep(0.001) - return ray.worker.global_worker.node.unique_id - - @ray.remote(resources={"CustomResource1": 1, "CustomResource2": 3}) - def h(): - time.sleep(0.001) - return ray.worker.global_worker.node.unique_id - - @ray.remote(resources={"CustomResource1": 4}) - def j(): - time.sleep(0.001) - return ray.worker.global_worker.node.unique_id - - @ray.remote(resources={"CustomResource3": 1}) - def k(): - time.sleep(0.001) - return ray.worker.global_worker.node.unique_id - - # The f and g tasks should be scheduled on both raylets. - assert len(set(ray.get([f.remote() for _ in range(50)]))) == 2 - assert len(set(ray.get([g.remote() for _ in range(50)]))) == 2 - - node_id = ray.worker.global_worker.node.unique_id - - # The h tasks should be scheduled only on the second raylet. 
- raylet_ids = set(ray.get([h.remote() for _ in range(50)])) - assert len(raylet_ids) == 1 - assert list(raylet_ids)[0] != node_id - - # Make sure that tasks with unsatisfied custom resource requirements do - # not get scheduled. - ready_ids, remaining_ids = ray.wait([j.remote(), k.remote()], timeout=0.5) - assert ready_ids == [] - - -def test_many_custom_resources(shutdown_only): - num_custom_resources = 10000 - total_resources = { - str(i): np.random.randint(1, 7) - for i in range(num_custom_resources) - } - ray.init(num_cpus=5, resources=total_resources) - - def f(): - return 1 - - remote_functions = [] - for _ in range(20): - num_resources = np.random.randint(0, num_custom_resources + 1) - permuted_resources = np.random.permutation( - num_custom_resources)[:num_resources] - random_resources = { - str(i): total_resources[str(i)] - for i in permuted_resources - } - remote_function = ray.remote(resources=random_resources)(f) - remote_functions.append(remote_function) - - remote_functions.append(ray.remote(f)) - remote_functions.append(ray.remote(resources=total_resources)(f)) - - results = [] - for remote_function in remote_functions: - results.append(remote_function.remote()) - results.append(remote_function.remote()) - results.append(remote_function.remote()) - - ray.get(results) - - -# TODO: 5 retry attempts may be too little for Travis and we may need to -# increase it if this test begins to be flaky on Travis. 
-def test_zero_capacity_deletion_semantics(shutdown_only): - ray.init(num_cpus=2, num_gpus=1, resources={"test_resource": 1}) - - def test(): - resources = ray.available_resources() - MAX_RETRY_ATTEMPTS = 5 - retry_count = 0 - - del resources["memory"] - del resources["object_store_memory"] - for key in list(resources.keys()): - if key.startswith("node:"): - del resources[key] - - while resources and retry_count < MAX_RETRY_ATTEMPTS: - time.sleep(0.1) - resources = ray.available_resources() - retry_count += 1 - - if retry_count >= MAX_RETRY_ATTEMPTS: - raise RuntimeError( - "Resources were available even after five retries.", resources) - - return resources - - function = ray.remote( - num_cpus=2, num_gpus=1, resources={"test_resource": 1})(test) - cluster_resources = ray.get(function.remote()) - - # All cluster resources should be utilized and - # cluster_resources must be empty - assert cluster_resources == {} - - -@pytest.fixture -def save_gpu_ids_shutdown_only(): - # Record the curent value of this environment variable so that we can - # reset it after the test. - original_gpu_ids = os.environ.get("CUDA_VISIBLE_DEVICES", None) - - yield None - - # The code after the yield will run as teardown code. - ray.shutdown() - # Reset the environment variable. 
- if original_gpu_ids is not None: - os.environ["CUDA_VISIBLE_DEVICES"] = original_gpu_ids - else: - del os.environ["CUDA_VISIBLE_DEVICES"] - - -def test_specific_gpus(save_gpu_ids_shutdown_only): - allowed_gpu_ids = [4, 5, 6] - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( - [str(i) for i in allowed_gpu_ids]) - ray.init(num_gpus=3) - - @ray.remote(num_gpus=1) - def f(): - gpu_ids = ray.get_gpu_ids() - assert len(gpu_ids) == 1 - assert gpu_ids[0] in allowed_gpu_ids - - @ray.remote(num_gpus=2) - def g(): - gpu_ids = ray.get_gpu_ids() - assert len(gpu_ids) == 2 - assert gpu_ids[0] in allowed_gpu_ids - assert gpu_ids[1] in allowed_gpu_ids - - ray.get([f.remote() for _ in range(100)]) - ray.get([g.remote() for _ in range(100)]) - - -def test_blocking_tasks(ray_start_regular): - @ray.remote - def f(i, j): - return (i, j) - - @ray.remote - def g(i): - # Each instance of g submits and blocks on the result of another - # remote task. - object_ids = [f.remote(i, j) for j in range(2)] - return ray.get(object_ids) - - @ray.remote - def h(i): - # Each instance of g submits and blocks on the result of another - # remote task using ray.wait. - object_ids = [f.remote(i, j) for j in range(2)] - return ray.wait(object_ids, num_returns=len(object_ids)) - - ray.get([h.remote(i) for i in range(4)]) - - @ray.remote - def _sleep(i): - time.sleep(0.01) - return (i) - - @ray.remote - def sleep(): - # Each instance of sleep submits and blocks on the result of - # another remote task, which takes some time to execute. 
- ray.get([_sleep.remote(i) for i in range(10)]) - - ray.get(sleep.remote()) - - -def test_max_call_tasks(ray_start_regular): - @ray.remote(max_calls=1) - def f(): - return os.getpid() - - pid = ray.get(f.remote()) - ray.tests.utils.wait_for_pid_to_exit(pid) - - @ray.remote(max_calls=2) - def f(): - return os.getpid() - - pid1 = ray.get(f.remote()) - pid2 = ray.get(f.remote()) - assert pid1 == pid2 - ray.tests.utils.wait_for_pid_to_exit(pid1) - - -def attempt_to_load_balance(remote_function, - args, - total_tasks, - num_nodes, - minimum_count, - num_attempts=100): - attempts = 0 - while attempts < num_attempts: - locations = ray.get( - [remote_function.remote(*args) for _ in range(total_tasks)]) - names = set(locations) - counts = [locations.count(name) for name in names] - logger.info("Counts are {}.".format(counts)) - if (len(names) == num_nodes - and all(count >= minimum_count for count in counts)): - break - attempts += 1 - assert attempts < num_attempts - - -def test_load_balancing(ray_start_cluster): - # This test ensures that tasks are being assigned to all raylets - # in a roughly equal manner. - cluster = ray_start_cluster - num_nodes = 3 - num_cpus = 7 - for _ in range(num_nodes): - cluster.add_node(num_cpus=num_cpus) - ray.init(address=cluster.address) - - @ray.remote - def f(): - time.sleep(0.01) - return ray.worker.global_worker.node.unique_id - - attempt_to_load_balance(f, [], 100, num_nodes, 10) - attempt_to_load_balance(f, [], 1000, num_nodes, 100) - - -def test_load_balancing_with_dependencies(ray_start_cluster): - # This test ensures that tasks are being assigned to all raylets in a - # roughly equal manner even when the tasks have dependencies. - cluster = ray_start_cluster - num_nodes = 3 - for _ in range(num_nodes): - cluster.add_node(num_cpus=1) - ray.init(address=cluster.address) - - @ray.remote - def f(x): - time.sleep(0.010) - return ray.worker.global_worker.node.unique_id - - # This object will be local to one of the raylets. 
Make sure - # this doesn't prevent tasks from being scheduled on other raylets. - x = ray.put(np.zeros(1000000)) - - attempt_to_load_balance(f, [x], 100, num_nodes, 25) - - -def wait_for_num_tasks(num_tasks, timeout=10): - start_time = time.time() - while time.time() - start_time < timeout: - if len(ray.tasks()) >= num_tasks: - return - time.sleep(0.1) - raise RayTestTimeoutException("Timed out while waiting for global state.") - - -def wait_for_num_objects(num_objects, timeout=10): - start_time = time.time() - while time.time() - start_time < timeout: - if len(ray.objects()) >= num_objects: - return - time.sleep(0.1) - raise RayTestTimeoutException("Timed out while waiting for global state.") - - -@pytest.mark.skipif( - os.environ.get("RAY_USE_NEW_GCS") == "on", - reason="New GCS API doesn't have a Python API yet.") -def test_global_state_api(shutdown_only): - - error_message = ("The ray global state API cannot be used " - "before ray.init has been called.") - - with pytest.raises(Exception, match=error_message): - ray.objects() - - with pytest.raises(Exception, match=error_message): - ray.tasks() - - with pytest.raises(Exception, match=error_message): - ray.nodes() - - with pytest.raises(Exception, match=error_message): - ray.jobs() - - ray.init(num_cpus=5, num_gpus=3, resources={"CustomResource": 1}) - - assert ray.cluster_resources()["CPU"] == 5 - assert ray.cluster_resources()["GPU"] == 3 - assert ray.cluster_resources()["CustomResource"] == 1 - - assert ray.objects() == {} - - job_id = ray.utils.compute_job_id_from_driver( - ray.WorkerID(ray.worker.global_worker.worker_id)) - driver_task_id = ray.worker.global_worker.current_task_id.hex() - - # One task is put in the task table which corresponds to this driver. 
- wait_for_num_tasks(1) - task_table = ray.tasks() - assert len(task_table) == 1 - assert driver_task_id == list(task_table.keys())[0] - task_spec = task_table[driver_task_id]["TaskSpec"] - nil_unique_id_hex = ray.UniqueID.nil().hex() - nil_actor_id_hex = ray.ActorID.nil().hex() - - assert task_spec["TaskID"] == driver_task_id - assert task_spec["ActorID"] == nil_actor_id_hex - assert task_spec["Args"] == [] - assert task_spec["JobID"] == job_id.hex() - assert task_spec["FunctionID"] == nil_unique_id_hex - assert task_spec["ReturnObjectIDs"] == [] - - client_table = ray.nodes() - node_ip_address = ray.worker.global_worker.node_ip_address - - assert len(client_table) == 1 - assert client_table[0]["NodeManagerAddress"] == node_ip_address - - @ray.remote - def f(*xs): - return 1 - - x_id = ray.put(1) - result_id = f.remote(1, "hi", x_id) - - # Wait for one additional task to complete. - wait_for_num_tasks(1 + 1) - task_table = ray.tasks() - assert len(task_table) == 1 + 1 - task_id_set = set(task_table.keys()) - task_id_set.remove(driver_task_id) - task_id = list(task_id_set)[0] - - task_spec = task_table[task_id]["TaskSpec"] - assert task_spec["ActorID"] == nil_actor_id_hex - assert task_spec["Args"] == [ - signature.DUMMY_TYPE, 1, signature.DUMMY_TYPE, "hi", - signature.DUMMY_TYPE, x_id - ] - assert task_spec["JobID"] == job_id.hex() - assert task_spec["ReturnObjectIDs"] == [result_id] - - assert task_table[task_id] == ray.tasks(task_id) - - # Wait for two objects, one for the x_id and one for result_id. 
- wait_for_num_objects(2) - - def wait_for_object_table(): - timeout = 10 - start_time = time.time() - while time.time() - start_time < timeout: - object_table = ray.objects() - tables_ready = (object_table[x_id]["ManagerIDs"] is not None and - object_table[result_id]["ManagerIDs"] is not None) - if tables_ready: - return - time.sleep(0.1) - raise RayTestTimeoutException( - "Timed out while waiting for object table to " - "update.") - - object_table = ray.objects() - assert len(object_table) == 2 - - assert object_table[x_id] == ray.objects(x_id) - object_table_entry = ray.objects(result_id) - assert object_table[result_id] == object_table_entry - - job_table = ray.jobs() - - assert len(job_table) == 1 - assert job_table[0]["JobID"] == job_id.hex() - assert job_table[0]["NodeManagerAddress"] == node_ip_address - - -# TODO(rkn): Pytest actually has tools for capturing stdout and stderr, so we -# should use those, but they seem to conflict with Ray's use of faulthandler. -class CaptureOutputAndError(object): - """Capture stdout and stderr of some span. - - This can be used as follows. - - captured = {} - with CaptureOutputAndError(captured): - # Do stuff. - # Access captured["out"] and captured["err"]. 
- """ - - def __init__(self, captured_output_and_error): - if sys.version_info >= (3, 0): - import io - self.output_buffer = io.StringIO() - self.error_buffer = io.StringIO() - else: - import cStringIO - self.output_buffer = cStringIO.StringIO() - self.error_buffer = cStringIO.StringIO() - self.captured_output_and_error = captured_output_and_error - - def __enter__(self): - sys.stdout.flush() - sys.stderr.flush() - self.old_stdout = sys.stdout - self.old_stderr = sys.stderr - sys.stdout = self.output_buffer - sys.stderr = self.error_buffer - - def __exit__(self, exc_type, exc_value, traceback): - sys.stdout.flush() - sys.stderr.flush() - sys.stdout = self.old_stdout - sys.stderr = self.old_stderr - self.captured_output_and_error["out"] = self.output_buffer.getvalue() - self.captured_output_and_error["err"] = self.error_buffer.getvalue() - - -def test_logging_to_driver(shutdown_only): - ray.init(num_cpus=1, log_to_driver=True) - - @ray.remote - def f(): - # It's important to make sure that these print statements occur even - # without calling sys.stdout.flush() and sys.stderr.flush(). - for i in range(100): - print(i) - print(100 + i, file=sys.stderr) - - captured = {} - with CaptureOutputAndError(captured): - ray.get(f.remote()) - time.sleep(1) - - output_lines = captured["out"] - for i in range(200): - assert str(i) in output_lines - - # TODO(rkn): Check that no additional logs appear beyond what we expect - # and that there are no duplicate logs. Once we address the issue - # described in https://github.com/ray-project/ray/pull/5462, we should - # also check that nothing is logged to stderr. 
- - -def test_not_logging_to_driver(shutdown_only): - ray.init(num_cpus=1, log_to_driver=False) - - @ray.remote - def f(): - for i in range(100): - print(i) - print(100 + i, file=sys.stderr) - sys.stdout.flush() - sys.stderr.flush() - - captured = {} - with CaptureOutputAndError(captured): - ray.get(f.remote()) - time.sleep(1) - - output_lines = captured["out"] - assert len(output_lines) == 0 - - # TODO(rkn): Check that no additional logs appear beyond what we expect - # and that there are no duplicate logs. Once we address the issue - # described in https://github.com/ray-project/ray/pull/5462, we should - # also check that nothing is logged to stderr. - - -@pytest.mark.skipif( - os.environ.get("RAY_USE_NEW_GCS") == "on", - reason="New GCS API doesn't have a Python API yet.") -def test_workers(shutdown_only): - num_workers = 3 - ray.init(num_cpus=num_workers) - - @ray.remote - def f(): - return id(ray.worker.global_worker), os.getpid() - - # Wait until all of the workers have started. - worker_ids = set() - while len(worker_ids) != num_workers: - worker_ids = set(ray.get([f.remote() for _ in range(10)])) - - -def test_specific_job_id(): - dummy_driver_id = ray.JobID.from_int(1) - ray.init(num_cpus=1, job_id=dummy_driver_id) - - # in driver - assert dummy_driver_id == ray._get_runtime_context().current_driver_id - - # in worker - @ray.remote - def f(): - return ray._get_runtime_context().current_driver_id - - assert dummy_driver_id == ray.get(f.remote()) - - ray.shutdown() - - -def test_object_id_properties(): - id_bytes = b"00112233445566778899" - object_id = ray.ObjectID(id_bytes) - assert object_id.binary() == id_bytes - object_id = ray.ObjectID.nil() - assert object_id.is_nil() - with pytest.raises(ValueError, match=r".*needs to have length 20.*"): - ray.ObjectID(id_bytes + b"1234") - with pytest.raises(ValueError, match=r".*needs to have length 20.*"): - ray.ObjectID(b"0123456789") - object_id = ray.ObjectID.from_random() - assert not object_id.is_nil() - 
assert object_id.binary() != id_bytes - id_dumps = pickle.dumps(object_id) - id_from_dumps = pickle.loads(id_dumps) - assert id_from_dumps == object_id - - -@pytest.fixture -def shutdown_only_with_initialization_check(): - yield None - # The code after the yield will run as teardown code. - ray.shutdown() - assert not ray.is_initialized() - - -def test_initialized(shutdown_only_with_initialization_check): - assert not ray.is_initialized() - ray.init(num_cpus=0) - assert ray.is_initialized() - - -def test_initialized_local_mode(shutdown_only_with_initialization_check): - assert not ray.is_initialized() - ray.init(num_cpus=0, local_mode=True) - assert ray.is_initialized() - - -def test_wait_reconstruction(shutdown_only): - ray.init(num_cpus=1, object_store_memory=int(10**8)) - - @ray.remote - def f(): - return np.zeros(6 * 10**7, dtype=np.uint8) - - x_id = f.remote() - ray.wait([x_id]) - ray.wait([f.remote()]) - assert not ray.worker.global_worker.core_worker.object_exists(x_id) - ready_ids, _ = ray.wait([x_id]) - assert len(ready_ids) == 1 - - -def test_ray_setproctitle(ray_start_2_cpus): - @ray.remote - class UniqueName(object): - def __init__(self): - assert setproctitle.getproctitle() == "ray_UniqueName:__init__()" - - def f(self): - assert setproctitle.getproctitle() == "ray_UniqueName:f()" - - @ray.remote - def unique_1(): - assert setproctitle.getproctitle( - ) == "ray_worker:ray.tests.test_basic.unique_1()" - - actor = UniqueName.remote() - ray.get(actor.f.remote()) - ray.get(unique_1.remote()) - - -def test_duplicate_error_messages(shutdown_only): - ray.init(num_cpus=0) - - driver_id = ray.WorkerID.nil() - error_data = ray.gcs_utils.construct_error_message(driver_id, "test", - "message", 0) - - # Push the same message to the GCS twice (they are the same because we - # do not include a timestamp). 
- - r = ray.worker.global_worker.redis_client - - r.execute_command("RAY.TABLE_APPEND", - ray.gcs_utils.TablePrefix.Value("ERROR_INFO"), - ray.gcs_utils.TablePubsub.Value("ERROR_INFO_PUBSUB"), - driver_id.binary(), error_data) - - # Before https://github.com/ray-project/ray/pull/3316 this would - # give an error - r.execute_command("RAY.TABLE_APPEND", - ray.gcs_utils.TablePrefix.Value("ERROR_INFO"), - ray.gcs_utils.TablePubsub.Value("ERROR_INFO_PUBSUB"), - driver_id.binary(), error_data) - - -@pytest.mark.skipif( - os.getenv("TRAVIS") is None, - reason="This test should only be run on Travis.") -def test_ray_stack(ray_start_2_cpus): - def unique_name_1(): - time.sleep(1000) - - @ray.remote - def unique_name_2(): - time.sleep(1000) - - @ray.remote - def unique_name_3(): - unique_name_1() - - unique_name_2.remote() - unique_name_3.remote() - - success = False - start_time = time.time() - while time.time() - start_time < 30: - # Attempt to parse the "ray stack" call. - output = ray.utils.decode(subprocess.check_output(["ray", "stack"])) - if ("unique_name_1" in output and "unique_name_2" in output - and "unique_name_3" in output): - success = True - break - - if not success: - raise Exception("Failed to find necessary information with " - "'ray stack'") - - -def test_pandas_parquet_serialization(): - # Only test this if pandas is installed - pytest.importorskip("pandas") - - import pandas as pd - import pyarrow as pa - import pyarrow.parquet as pq - - tempdir = tempfile.mkdtemp() - filename = os.path.join(tempdir, "parquet-test") - pd.DataFrame({"col1": [0, 1], "col2": [0, 1]}).to_parquet(filename) - with open(os.path.join(tempdir, "parquet-compression"), "wb") as f: - table = pa.Table.from_arrays([pa.array([1, 2, 3])], ["hello"]) - pq.write_table(table, f, compression="lz4") - # Clean up - shutil.rmtree(tempdir) - - -def test_socket_dir_not_existing(shutdown_only): - random_name = ray.ObjectID.from_random().hex() - temp_raylet_socket_dir = 
"/tmp/ray/tests/{}".format(random_name) - temp_raylet_socket_name = os.path.join(temp_raylet_socket_dir, - "raylet_socket") - ray.init(num_cpus=1, raylet_socket_name=temp_raylet_socket_name) - - -def test_raylet_is_robust_to_random_messages(ray_start_regular): - node_manager_address = None - node_manager_port = None - for client in ray.nodes(): - if "NodeManagerAddress" in client: - node_manager_address = client["NodeManagerAddress"] - node_manager_port = client["NodeManagerPort"] - assert node_manager_address - assert node_manager_port - # Try to bring down the node manager: - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.connect((node_manager_address, node_manager_port)) - s.send(1000 * b"asdf") - - @ray.remote - def f(): - return 1 - - assert ray.get(f.remote()) == 1 - - -def test_non_ascii_comment(ray_start_regular): - @ray.remote - def f(): - # 日本語 Japanese comment - return 1 - - assert ray.get(f.remote()) == 1 - - -@ray.remote -def echo(x): - return x - - -@ray.remote -class WithConstructor(object): - def __init__(self, data): - self.data = data - - def get_data(self): - return self.data - - -@ray.remote -class WithoutConstructor(object): - def set_data(self, data): - self.data = data - - def get_data(self): - return self.data - - -class BaseClass(object): - def __init__(self, data): - self.data = data - - def get_data(self): - return self.data - - -@ray.remote -class DerivedClass(BaseClass): - def __init__(self, data): - # Due to different behaviors of super in Python 2 and Python 3, - # we use BaseClass directly here. - BaseClass.__init__(self, data) - - -def test_load_code_from_local(shutdown_only): - ray.init(load_code_from_local=True, num_cpus=4) - message = "foo" - # Test normal function. - assert ray.get(echo.remote(message)) == message - # Test actor class with constructor. - actor = WithConstructor.remote(1) - assert ray.get(actor.get_data.remote()) == 1 - # Test actor class without constructor. 
- actor = WithoutConstructor.remote() - actor.set_data.remote(1) - assert ray.get(actor.get_data.remote()) == 1 - # Test derived actor class. - actor = DerivedClass.remote(1) - assert ray.get(actor.get_data.remote()) == 1 - # Test using ray.remote decorator on raw classes. - base_actor_class = ray.remote(num_cpus=1)(BaseClass) - base_actor = base_actor_class.remote(message) - assert ray.get(base_actor.get_data.remote()) == message - - -def test_shutdown_disconnect_global_state(): - ray.init(num_cpus=0) - ray.shutdown() - - with pytest.raises(Exception) as e: - ray.objects() - assert str(e.value).endswith("ray.init has been called.") - - -@pytest.mark.parametrize( - "ray_start_object_store_memory", [150 * 1024 * 1024], indirect=True) -def test_put_pins_object(ray_start_object_store_memory): - x_id = ray.put("HI") - x_copy = ray.ObjectID(x_id.binary()) - assert ray.get(x_copy) == "HI" - - # x cannot be evicted since x_id pins it - for _ in range(10): - ray.put(np.zeros(10 * 1024 * 1024)) - assert ray.get(x_id) == "HI" - assert ray.get(x_copy) == "HI" - - # now it can be evicted since x_id pins it but x_copy does not - del x_id - for _ in range(10): - ray.put(np.zeros(10 * 1024 * 1024)) - with pytest.raises(ray.exceptions.UnreconstructableError): - ray.get(x_copy) - - # weakref put - y_id = ray.put("HI", weakref=True) - for _ in range(10): - ray.put(np.zeros(10 * 1024 * 1024)) - with pytest.raises(ray.exceptions.UnreconstructableError): - ray.get(y_id) - - @ray.remote - def check_no_buffer_ref(x): - assert x[0].get_buffer_ref() is None - - z_id = ray.put("HI") - assert z_id.get_buffer_ref() is not None - ray.get(check_no_buffer_ref.remote([z_id])) - - -@pytest.mark.parametrize( - "ray_start_object_store_memory", [150 * 1024 * 1024], indirect=True) -def test_redis_lru_with_set(ray_start_object_store_memory): - x = np.zeros(8 * 10**7, dtype=np.uint8) - x_id = ray.put(x, weakref=True) - - # Remove the object from the object table to simulate Redis LRU eviction. 
- removed = False - start_time = time.time() - while time.time() < start_time + 10: - if ray.state.state.redis_clients[0].delete(b"OBJECT" + - x_id.binary()) == 1: - removed = True - break - assert removed - - # Now evict the object from the object store. - ray.put(x) # This should not crash. - - -def test_decorated_function(ray_start_regular): - def function_invocation_decorator(f): - def new_f(args, kwargs): - # Reverse the arguments. - return f(args[::-1], {"d": 5}), kwargs - - return new_f - - def f(a, b, c, d=None): - return a, b, c, d - - f.__ray_invocation_decorator__ = function_invocation_decorator - f = ray.remote(f) - - result_id, kwargs = f.remote(1, 2, 3, d=4) - assert kwargs == {"d": 4} - assert ray.get(result_id) == (3, 2, 1, 5) - - -def test_get_postprocess(ray_start_regular): - def get_postprocessor(object_ids, values): - return [value for value in values if value > 0] - - ray.worker.global_worker._post_get_hooks.append(get_postprocessor) - - assert ray.get( - [ray.put(i) for i in [0, 1, 3, 5, -1, -3, 4]]) == [1, 3, 5, 4] - - -def test_export_after_shutdown(ray_start_regular): - # This test checks that we can use actor and remote function definitions - # across multiple Ray sessions. - - @ray.remote - def f(): - pass - - @ray.remote - class Actor(object): - def method(self): - pass - - ray.get(f.remote()) - a = Actor.remote() - ray.get(a.method.remote()) - - ray.shutdown() - - # Start Ray and use the remote function and actor again. - ray.init(num_cpus=1) - ray.get(f.remote()) - a = Actor.remote() - ray.get(a.method.remote()) - - ray.shutdown() - - # Start Ray again and make sure that these definitions can be exported from - # workers. 
- ray.init(num_cpus=2) - - @ray.remote - def export_definitions_from_worker(remote_function, actor_class): - ray.get(remote_function.remote()) - actor_handle = actor_class.remote() - ray.get(actor_handle.method.remote()) - - ray.get(export_definitions_from_worker.remote(f, Actor)) - - -def test_invalid_unicode_in_worker_log(shutdown_only): - info = ray.init(num_cpus=1) - - logs_dir = os.path.join(info["session_dir"], "logs") - - # Wait till first worker log file is created. - while True: - log_file_paths = glob.glob("{}/worker*.out".format(logs_dir)) - if len(log_file_paths) == 0: - time.sleep(0.2) - else: - break - - with open(log_file_paths[0], "wb") as f: - f.write(b"\xe5abc\nline2\nline3\n") - f.write(b"\xe5abc\nline2\nline3\n") - f.write(b"\xe5abc\nline2\nline3\n") - f.flush() - - # Wait till the log monitor reads the file. - time.sleep(1.0) - - # Make sure that nothing has died. - assert ray.services.remaining_processes_alive() - - -@pytest.mark.skip(reason="This test is too expensive to run.") -def test_move_log_files_to_old(shutdown_only): - info = ray.init(num_cpus=1) - - logs_dir = os.path.join(info["session_dir"], "logs") - - @ray.remote - class Actor(object): - def f(self): - print("function f finished") - - # First create a temporary actor. - actors = [ - Actor.remote() for i in range(ray_constants.LOG_MONITOR_MAX_OPEN_FILES) - ] - ray.get([a.f.remote() for a in actors]) - - # Make sure no log files are in the "old" directory before the actors - # are killed. - assert len(glob.glob("{}/old/worker*.out".format(logs_dir))) == 0 - - # Now kill the actors so the files get moved to logs/old/. - [a.__ray_terminate__.remote() for a in actors] - - while True: - log_file_paths = glob.glob("{}/old/worker*.out".format(logs_dir)) - if len(log_file_paths) > 0: - with open(log_file_paths[0], "r") as f: - assert "function f finished\n" in f.readlines() - break - - # Make sure that nothing has died. 
- assert ray.services.remaining_processes_alive() +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_batched_queue.py b/python/ray/tests/test_batched_queue.py deleted file mode 100644 index 0a52e4144..000000000 --- a/python/ray/tests/test_batched_queue.py +++ /dev/null @@ -1,56 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import time - -import ray -from ray.experimental.streaming.batched_queue import BatchedQueue - - -@ray.remote -class Reader(object): - def __init__(self, queue): - self.queue = queue - self.num_reads = 0 - self.start = time.time() - - def read(self, read_slowly): - expected_value = 0 - for _ in range(1000): - x = self.queue.read_next() - assert x == expected_value, (x, expected_value) - expected_value += 1 - self.num_reads += 1 - if read_slowly: - time.sleep(0.001) - - -def test_batched_queue(ray_start_regular): - # Batched queue parameters - max_queue_size = 10000 # Max number of batches in queue - max_batch_size = 1000 # Max number of elements per batch - batch_timeout = 0.001 # 1ms flush timeout - prefetch_depth = 10 # Number of batches to prefetch from plasma - background_flush = False # Don't use daemon thread for flushing - # Two tests: one with a big queue and slow reader, and - # a second one with a small queue and a faster reader - for read_slowly in [True, False]: - # Construct the batched queue - queue = BatchedQueue( - max_size=max_queue_size, - max_batch_size=max_batch_size, - max_batch_time=batch_timeout, - prefetch_depth=prefetch_depth, - background_flush=background_flush) - # Create and start the reader - reader = Reader.remote(queue) - object_id = reader.read.remote(read_slowly=read_slowly) - value = 0 - for _ in range(1000): - queue.put_next(value) - value += 1 - queue._flush_writes() - ray.get(object_id) - # Test once more with a very small queue size and a faster reader 
- max_queue_size = 10 diff --git a/python/ray/tests/test_component_failures.py b/python/ray/tests/test_component_failures.py index b71b30bfb..c92c50e6a 100644 --- a/python/ray/tests/test_component_failures.py +++ b/python/ray/tests/test_component_failures.py @@ -13,9 +13,9 @@ import pytest import ray import ray.ray_constants as ray_constants -from ray.tests.cluster_utils import Cluster -from ray.tests.utils import (run_string_as_driver_nonblocking, - RayTestTimeoutException) +from ray.cluster_utils import Cluster +from ray.test_utils import (run_string_as_driver_nonblocking, + RayTestTimeoutException) # This test checks that when a worker dies in the middle of a get, the plasma @@ -441,3 +441,8 @@ def test_driver_lives_parallel(ray_start_regular): process_info.process.wait() # If the driver can reach the tearDown method, then it is still alive. + + +if __name__ == "__main__": + import pytest + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_credis.py b/python/ray/tests/test_credis.py deleted file mode 100644 index 455bcbab0..000000000 --- a/python/ray/tests/test_credis.py +++ /dev/null @@ -1,42 +0,0 @@ -from __future__ import absolute_import, division, print_function - -import os -import unittest - -import redis - -import ray - - -def parse_client(addr_port_str): - address, redis_port = addr_port_str.split(":") - return redis.StrictRedis(host=address, port=redis_port) - - -@unittest.skipIf(not os.environ.get("RAY_USE_NEW_GCS", False), - "Tests functionality of the new GCS.") -class CredisTest(unittest.TestCase): - def setUp(self): - self.config = ray.init(num_cpus=0) - - def tearDown(self): - ray.shutdown() - - def test_credis_started(self): - assert "redis_address" in self.config - primary = parse_client(self.config["redis_address"]) - assert primary.ping() is True - member = primary.lrange("RedisShards", 0, -1)[0] - shard = parse_client(member.decode()) - - # Check that primary has loaded credis's master module. 
- chain = primary.execute_command("MASTER.GET_CHAIN") - assert len(chain) == 1 - - # Check that the shard has loaded credis' member module. - assert chain[0] == member - assert shard.execute_command("MEMBER.SN") == -1 - - -if __name__ == "__main__": - unittest.main(verbosity=2) diff --git a/python/ray/tests/test_cython.py b/python/ray/tests/test_cython.py index e7ad36fed..04e770d59 100644 --- a/python/ray/tests/test_cython.py +++ b/python/ray/tests/test_cython.py @@ -54,4 +54,6 @@ class CythonTest(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_debug_tools.py b/python/ray/tests/test_debug_tools.py index 1c6e4668c..45a694627 100644 --- a/python/ray/tests/test_debug_tools.py +++ b/python/ray/tests/test_debug_tools.py @@ -47,3 +47,12 @@ def test_raylet_gdb(ray_gdb_start): stdout=subprocess.PIPE, stderr=subprocess.PIPE) assert pgrep_command.communicate()[0] + + +if __name__ == "__main__": + import pytest + import sys + # Make subprocess happy in bazel. 
+ os.environ["LC_ALL"] = "en_US.UTF-8" + os.environ["LANG"] = "en_US.UTF-8" + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_dynres.py b/python/ray/tests/test_dynres.py index e3c73189b..50d782c37 100644 --- a/python/ray/tests/test_dynres.py +++ b/python/ray/tests/test_dynres.py @@ -6,8 +6,8 @@ import logging import time import ray -import ray.tests.cluster_utils -import ray.tests.utils +import ray.cluster_utils +import ray.test_utils logger = logging.getLogger(__name__) @@ -605,3 +605,9 @@ def test_release_cpus_when_actor_creation_task_blocking(shutdown_only): result = wait_until(assert_available_resources, 1000) assert result is True + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index bc5c38e98..48f670c54 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -15,8 +15,8 @@ import redis import ray import ray.ray_constants as ray_constants -from ray.tests.cluster_utils import Cluster -from ray.tests.utils import ( +from ray.cluster_utils import Cluster +from ray.test_utils import ( relevant_errors, wait_for_errors, RayTestTimeoutException, @@ -903,3 +903,8 @@ def test_direct_call_serialized_id(ray_start_cluster): obj = small_object.remote() ray.get(get.remote([obj])) + + +if __name__ == "__main__": + import pytest + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_garbage_collection.py b/python/ray/tests/test_garbage_collection.py index fe5ec0ce2..bc2e20565 100644 --- a/python/ray/tests/test_garbage_collection.py +++ b/python/ray/tests/test_garbage_collection.py @@ -10,8 +10,8 @@ import logging import pytest import ray -import ray.tests.cluster_utils -import ray.tests.utils +import ray.cluster_utils +import ray.test_utils logger = logging.getLogger(__name__) @@ -78,3 +78,9 @@ def test_pending_task_dependency(shutdown_only): ray.put(np_array) 
ray.get(oid) + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_global_state.py b/python/ray/tests/test_global_state.py index db71fc69c..4089c538a 100644 --- a/python/ray/tests/test_global_state.py +++ b/python/ray/tests/test_global_state.py @@ -78,3 +78,9 @@ def test_add_remove_cluster_resources(ray_start_cluster_head): nodes += [cluster.add_node(num_cpus=1)] cluster.wait_for_nodes() assert ray.cluster_resources()["CPU"] == 6 + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_logical_graph.py b/python/ray/tests/test_logical_graph.py index 54578ae9c..1cfe3f232 100644 --- a/python/ray/tests/test_logical_graph.py +++ b/python/ray/tests/test_logical_graph.py @@ -202,3 +202,9 @@ def test_channel_generation(): def test_wordcount(): """Tests a simple streaming wordcount.""" pass + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_memory_limits.py b/python/ray/tests/test_memory_limits.py index 47ccfad09..98f9890e3 100644 --- a/python/ray/tests/test_memory_limits.py +++ b/python/ray/tests/test_memory_limits.py @@ -86,4 +86,6 @@ class TestMemoryLimits(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_memory_scheduling.py b/python/ray/tests/test_memory_scheduling.py index 36faa8075..ebcebd54b 100644 --- a/python/ray/tests/test_memory_scheduling.py +++ b/python/ray/tests/test_memory_scheduling.py @@ -152,4 +152,6 @@ class TestMemoryScheduling(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_metrics.py b/python/ray/tests/test_metrics.py index 
c1c26334d..5977b16f1 100644 --- a/python/ray/tests/test_metrics.py +++ b/python/ray/tests/test_metrics.py @@ -10,7 +10,7 @@ import time import ray from ray.core.generated import node_manager_pb2 from ray.core.generated import node_manager_pb2_grpc -from ray.tests.utils import RayTestTimeoutException +from ray.test_utils import RayTestTimeoutException def test_worker_stats(ray_start_regular): @@ -53,5 +53,13 @@ def test_worker_stats(ray_start_regular): if p.info["pid"] in pids ] for process in processes: - assert "python" in process or "ray" in process + # TODO(ekl) why does travis/mi end up in the process list + assert ("python" in process or "ray" in process + or "travis" in process) break + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_microbenchmarks.py b/python/ray/tests/test_microbenchmarks.py index e42e0fba4..ba8b6b7d1 100644 --- a/python/ray/tests/test_microbenchmarks.py +++ b/python/ray/tests/test_microbenchmarks.py @@ -2,7 +2,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os import pytest import time import numpy as np @@ -104,9 +103,11 @@ def test_cache(ray_start_regular): d = time.time() - c if d > 1.5 * b: - if os.getenv("TRAVIS") is None: - raise Exception("The caching test was too slow. " - "d = {}, b = {}".format(d, b)) - else: - print("WARNING: The caching test was too slow. " - "d = {}, b = {}".format(d, b)) + print("WARNING: The caching test was too slow. 
" + "d = {}, b = {}".format(d, b)) + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_monitors.py b/python/ray/tests/test_monitors.py index 5b3c66627..db2d458e3 100644 --- a/python/ray/tests/test_monitors.py +++ b/python/ray/tests/test_monitors.py @@ -107,3 +107,12 @@ def test_cleanup_on_driver_exit_single_redis_shard(): def test_cleanup_on_driver_exit_many_redis_shards(): _test_cleanup_on_driver_exit(num_redis_shards=5) _test_cleanup_on_driver_exit(num_redis_shards=31) + + +if __name__ == "__main__": + import pytest + import sys + # Make subprocess happy in bazel. + os.environ["LC_ALL"] = "en_US.UTF-8" + os.environ["LANG"] = "en_US.UTF-8" + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_multi_node.py b/python/ray/tests/test_multi_node.py index 8324d4ab2..aa03d37ff 100644 --- a/python/ray/tests/test_multi_node.py +++ b/python/ray/tests/test_multi_node.py @@ -8,7 +8,7 @@ import subprocess import time import ray -from ray.tests.utils import ( +from ray.test_utils import ( RayTestTimeoutException, run_string_as_driver, run_string_as_driver_nonblocking, @@ -615,3 +615,12 @@ def test_use_pickle(call_ray_start): return (3, "world") assert ray.get(f.remote(x)) == (3, "world") + + +if __name__ == "__main__": + import pytest + import sys + # Make subprocess happy in bazel. 
+ os.environ["LC_ALL"] = "en_US.UTF-8" + os.environ["LANG"] = "en_US.UTF-8" + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_multi_node_2.py b/python/ray/tests/test_multi_node_2.py index 4149a4fa6..fba8925b0 100644 --- a/python/ray/tests/test_multi_node_2.py +++ b/python/ray/tests/test_multi_node_2.py @@ -9,8 +9,8 @@ import time import ray import ray.ray_constants as ray_constants from ray.monitor import Monitor -from ray.tests.cluster_utils import Cluster -from ray.tests.conftest import generate_internal_config_map +from ray.cluster_utils import Cluster +from ray.test_utils import generate_internal_config_map logger = logging.getLogger(__name__) @@ -221,3 +221,9 @@ def test_worker_plasma_store_failure(ray_start_cluster_head): worker.kill_plasma_store() worker.all_processes[ray_constants.PROCESS_TYPE_RAYLET][0].process.wait() assert not worker.any_processes_alive(), worker.live_processes() + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_multinode_failures.py b/python/ray/tests/test_multinode_failures.py new file mode 100644 index 000000000..32d1c1a7b --- /dev/null +++ b/python/ray/tests/test_multinode_failures.py @@ -0,0 +1,273 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json +import os +import signal +import sys +import time + +import numpy as np +import pytest + +import ray +import ray.ray_constants as ray_constants +from ray.cluster_utils import Cluster +from ray.test_utils import RayTestTimeoutException + + +@pytest.fixture(params=[(1, 4), (4, 4)]) +def ray_start_workers_separate_multinode(request): + num_nodes = request.param[0] + num_initial_workers = request.param[1] + # Start the Ray processes. 
+ cluster = Cluster() + for _ in range(num_nodes): + cluster.add_node(num_cpus=num_initial_workers) + ray.init(address=cluster.address) + + yield num_nodes, num_initial_workers + # The code after the yield will run as teardown code. + ray.shutdown() + cluster.shutdown() + + +def test_worker_failed(ray_start_workers_separate_multinode): + num_nodes, num_initial_workers = (ray_start_workers_separate_multinode) + + @ray.remote + def get_pids(): + time.sleep(0.25) + return os.getpid() + + start_time = time.time() + pids = set() + while len(pids) < num_nodes * num_initial_workers: + new_pids = ray.get([ + get_pids.remote() + for _ in range(2 * num_nodes * num_initial_workers) + ]) + for pid in new_pids: + pids.add(pid) + if time.time() - start_time > 60: + raise RayTestTimeoutException( + "Timed out while waiting to get worker PIDs.") + + @ray.remote + def f(x): + time.sleep(0.5) + return x + + # Submit more tasks than there are workers so that all workers and + # cores are utilized. + object_ids = [f.remote(i) for i in range(num_initial_workers * num_nodes)] + object_ids += [f.remote(object_id) for object_id in object_ids] + # Allow the tasks some time to begin executing. + time.sleep(0.1) + # Kill the workers as the tasks execute. + for pid in pids: + os.kill(pid, signal.SIGKILL) + time.sleep(0.1) + # Make sure that we either get the object or we get an appropriate + # exception. + for object_id in object_ids: + try: + ray.get(object_id) + except (ray.exceptions.RayTaskError, ray.exceptions.RayWorkerError): + pass + + +def _test_component_failed(cluster, component_type): + """Kill a component on all worker nodes and check workload succeeds.""" + # Submit many tasks with many dependencies. + @ray.remote + def f(x): + return x + + @ray.remote + def g(*xs): + return 1 + + # Kill the component on all nodes except the head node as the tasks + # execute. Do this in a loop while submitting tasks between each + # component failure. 
+ time.sleep(0.1) + worker_nodes = cluster.list_all_nodes()[1:] + assert len(worker_nodes) > 0 + for node in worker_nodes: + process = node.all_processes[component_type][0].process + # Submit a round of tasks with many dependencies. + x = 1 + for _ in range(1000): + x = f.remote(x) + + xs = [g.remote(1)] + for _ in range(100): + xs.append(g.remote(*xs)) + xs.append(g.remote(1)) + + # Kill a component on one of the nodes. + process.terminate() + time.sleep(1) + process.kill() + process.wait() + assert not process.poll() is None + + # Make sure that we can still get the objects after the + # executing tasks died. + ray.get(x) + ray.get(xs) + + +def check_components_alive(cluster, component_type, check_component_alive): + """Check that a given component type is alive on all worker nodes.""" + worker_nodes = cluster.list_all_nodes()[1:] + assert len(worker_nodes) > 0 + for node in worker_nodes: + process = node.all_processes[component_type][0].process + if check_component_alive: + assert process.poll() is None + else: + print("waiting for " + component_type + " with PID " + + str(process.pid) + "to terminate") + process.wait() + print("done waiting for " + component_type + " with PID " + + str(process.pid) + "to terminate") + assert not process.poll() is None + + +@pytest.mark.parametrize( + "ray_start_cluster", [{ + "num_cpus": 8, + "num_nodes": 4, + "_internal_config": json.dumps({ + "num_heartbeats_timeout": 100 + }), + }], + indirect=True) +def test_raylet_failed(ray_start_cluster): + cluster = ray_start_cluster + # Kill all raylets on worker nodes. + _test_component_failed(cluster, ray_constants.PROCESS_TYPE_RAYLET) + + # The plasma stores should still be alive on the worker nodes. 
+ check_components_alive(cluster, ray_constants.PROCESS_TYPE_PLASMA_STORE, + True) + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +@pytest.mark.parametrize( + "ray_start_cluster", [{ + "num_cpus": 8, + "num_nodes": 2, + "_internal_config": json.dumps({ + "num_heartbeats_timeout": 100 + }), + }], + indirect=True) +def test_plasma_store_failed(ray_start_cluster): + cluster = ray_start_cluster + # Kill all plasma stores on worker nodes. + _test_component_failed(cluster, ray_constants.PROCESS_TYPE_PLASMA_STORE) + + # No processes should be left alive on the worker nodes. + check_components_alive(cluster, ray_constants.PROCESS_TYPE_PLASMA_STORE, + False) + check_components_alive(cluster, ray_constants.PROCESS_TYPE_RAYLET, False) + + +@pytest.mark.parametrize( + "ray_start_cluster", [{ + "num_cpus": 4, + "num_nodes": 3, + "do_init": True + }], + indirect=True) +def test_actor_creation_node_failure(ray_start_cluster): + # TODO(swang): Refactor test_raylet_failed, etc to reuse the below code. + cluster = ray_start_cluster + + @ray.remote + class Child(object): + def __init__(self, death_probability): + self.death_probability = death_probability + + def ping(self): + # Exit process with some probability. + exit_chance = np.random.rand() + if exit_chance < self.death_probability: + sys.exit(-1) + + num_children = 50 + # Children actors will die about half the time. + death_probability = 0.5 + + children = [Child.remote(death_probability) for _ in range(num_children)] + while len(cluster.list_all_nodes()) > 1: + for j in range(2): + # Submit some tasks on the actors. About half of the actors will + # fail. + children_out = [child.ping.remote() for child in children] + # Wait a while for all the tasks to complete. This should trigger + # reconstruction for any actor creation tasks that were forwarded + # to nodes that then failed. 
+ ready, _ = ray.wait( + children_out, num_returns=len(children_out), timeout=5 * 60.0) + assert len(ready) == len(children_out) + + # Replace any actors that died. + for i, out in enumerate(children_out): + try: + ray.get(out) + except ray.exceptions.RayActorError: + children[i] = Child.remote(death_probability) + # Remove a node. Any actor creation tasks that were forwarded to this + # node must be reconstructed. + cluster.remove_node(cluster.list_all_nodes()[-1]) + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_driver_lives_sequential(ray_start_regular): + ray.worker._global_node.kill_raylet() + ray.worker._global_node.kill_plasma_store() + ray.worker._global_node.kill_log_monitor() + ray.worker._global_node.kill_monitor() + ray.worker._global_node.kill_raylet_monitor() + + # If the driver can reach the tearDown method, then it is still alive. + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Hanging with new GCS API.") +def test_driver_lives_parallel(ray_start_regular): + all_processes = ray.worker._global_node.all_processes + process_infos = (all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] + + all_processes[ray_constants.PROCESS_TYPE_RAYLET] + + all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] + + all_processes[ray_constants.PROCESS_TYPE_MONITOR] + + all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR]) + assert len(process_infos) == 5 + + # Kill all the components in parallel. + for process_info in process_infos: + process_info.process.terminate() + + time.sleep(0.1) + for process_info in process_infos: + process_info.process.kill() + + for process_info in process_infos: + process_info.process.wait() + + # If the driver can reach the tearDown method, then it is still alive. 
+ + +if __name__ == "__main__": + import pytest + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_node_manager.py b/python/ray/tests/test_node_manager.py index d382e49ec..b8afecc52 100644 --- a/python/ray/tests/test_node_manager.py +++ b/python/ray/tests/test_node_manager.py @@ -3,7 +3,7 @@ from __future__ import division from __future__ import print_function import ray -from ray.tests.utils import run_string_as_driver +from ray.test_utils import run_string_as_driver # This tests the queue transitions for infeasible tasks. This has been an issue @@ -48,3 +48,9 @@ f.remote() ray.get([ f._submit(args=[], kwargs={}, resources={str(i): 1}) for i in range(3) ]) + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_object_manager.py b/python/ray/tests/test_object_manager.py index 61b2a7a16..e2892c9d2 100644 --- a/python/ray/tests/test_object_manager.py +++ b/python/ray/tests/test_object_manager.py @@ -11,7 +11,7 @@ import time import warnings import ray -from ray.tests.cluster_utils import Cluster +from ray.cluster_utils import Cluster # TODO(yuhguo): This test file requires a lot of CPU/memory, and # better be put in Jenkins. However, it fails frequently in Jenkins, but @@ -325,3 +325,9 @@ def test_many_small_transfers(ray_start_cluster_with_resource): do_transfers() do_transfers() do_transfers() + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_projects.py b/python/ray/tests/test_projects.py index 2f862a854..0b836e594 100644 --- a/python/ray/tests/test_projects.py +++ b/python/ray/tests/test_projects.py @@ -237,3 +237,11 @@ def test_session_create_multiple(): "session-tests/commands-test", session_start, ["first", "--a", "*", "--b", "*"]) assert result.exit_code == 1 + + +if __name__ == "__main__": + import sys + # Make subprocess happy in bazel. 
+ os.environ["LC_ALL"] = "en_US.UTF-8" + os.environ["LANG"] = "en_US.UTF-8" + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_queue.py b/python/ray/tests/test_queue.py index 5bcb6b9f3..69c991554 100644 --- a/python/ray/tests/test_queue.py +++ b/python/ray/tests/test_queue.py @@ -119,3 +119,8 @@ def test_queue(ray_start_regular): assert q.get() == item size -= 1 assert q.qsize() == size + + +if __name__ == "__main__": + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_ray_init.py b/python/ray/tests/test_ray_init.py index 050215755..8ce3f07d8 100644 --- a/python/ray/tests/test_ray_init.py +++ b/python/ray/tests/test_ray_init.py @@ -7,7 +7,7 @@ import pytest import redis import ray -from ray.tests.cluster_utils import Cluster +from ray.cluster_utils import Cluster @pytest.fixture @@ -61,3 +61,9 @@ class TestRedisPassword(object): object_id = f.remote() ray.get(object_id) + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_signal.py b/python/ray/tests/test_signal.py index 066281d15..de238ff36 100644 --- a/python/ray/tests/test_signal.py +++ b/python/ray/tests/test_signal.py @@ -387,3 +387,9 @@ def test_small_receive_timeout(ray_start_regular): result_list = ray.experimental.signal.receive([a], timeout=small_timeout) assert len(result_list) == 1 + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_stress.py b/python/ray/tests/test_stress.py index 0ab8ec501..be43cfc8c 100644 --- a/python/ray/tests/test_stress.py +++ b/python/ray/tests/test_stress.py @@ -2,39 +2,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import json import numpy as np -import os import pytest -import sys import time import ray -from ray.tests.cluster_utils import Cluster -from ray.tests.utils import 
flat_errors -import ray.ray_constants as ray_constants - - -@pytest.fixture(params=[1, 4]) -def ray_start_sharded(request): - num_redis_shards = request.param - - if os.environ.get("RAY_USE_NEW_GCS") == "on": - num_redis_shards = 1 - # For now, RAY_USE_NEW_GCS supports 1 shard, and credis supports - # 1-node chain for that shard only. - - # Start the Ray processes. - ray.init( - object_store_memory=int(0.5 * 10**9), - num_cpus=10, - num_redis_shards=num_redis_shards, - redis_max_memory=10**7) - - yield None - - # The code after the yield will run as teardown code. - ray.shutdown() +from ray.cluster_utils import Cluster @pytest.fixture(params=[(1, 4), (4, 4)]) @@ -105,71 +78,6 @@ def test_dependencies(ray_start_combination): assert cluster.remaining_processes_alive() -def test_submitting_many_tasks(ray_start_sharded): - @ray.remote - def f(x): - return 1 - - def g(n): - x = 1 - for i in range(n): - x = f.remote(x) - return x - - ray.get([g(1000) for _ in range(100)]) - assert ray.services.remaining_processes_alive() - - -def test_submitting_many_actors_to_one(ray_start_sharded): - @ray.remote - class Actor(object): - def __init__(self): - pass - - def ping(self): - return - - @ray.remote - class Worker(object): - def __init__(self, actor): - self.actor = actor - - def ping(self): - return ray.get(self.actor.ping.remote()) - - a = Actor.remote() - workers = [Worker.remote(a) for _ in range(10)] - for _ in range(10): - out = ray.get([w.ping.remote() for w in workers]) - assert out == [None for _ in workers] - - -def test_getting_and_putting(ray_start_sharded): - for n in range(8): - x = np.zeros(10**n) - - for _ in range(100): - ray.put(x) - - x_id = ray.put(x) - for _ in range(1000): - ray.get(x_id) - - assert ray.services.remaining_processes_alive() - - -def test_getting_many_objects(ray_start_sharded): - @ray.remote - def f(): - return 1 - - n = 10**4 # TODO(pcm): replace by 10 ** 5 once this is faster. 
- lst = ray.get([f.remote() for _ in range(n)]) - assert lst == n * [1] - - assert ray.services.remaining_processes_alive() - - def test_wait(ray_start_combination): num_nodes, num_workers_per_scheduler, cluster = ray_start_combination num_workers = num_nodes * num_workers_per_scheduler @@ -197,360 +105,7 @@ def test_wait(ray_start_combination): assert cluster.remaining_processes_alive() -@pytest.fixture(params=[1, 4]) -def ray_start_reconstruction(request): - num_nodes = request.param - - plasma_store_memory = int(0.5 * 10**9) - - cluster = Cluster( - initialize_head=True, - head_node_args={ - "num_cpus": 1, - "object_store_memory": plasma_store_memory // num_nodes, - "redis_max_memory": 10**7, - "_internal_config": json.dumps({ - "initial_reconstruction_timeout_milliseconds": 200 - }) - }) - for i in range(num_nodes - 1): - cluster.add_node( - num_cpus=1, - object_store_memory=plasma_store_memory // num_nodes, - _internal_config=json.dumps({ - "initial_reconstruction_timeout_milliseconds": 200 - })) - ray.init(address=cluster.address) - - yield plasma_store_memory, num_nodes, cluster - - # Clean up the Ray cluster. - ray.shutdown() - cluster.shutdown() - - -@pytest.mark.skipif( - os.environ.get("RAY_USE_NEW_GCS") == "on", - reason="Failing with new GCS API on Linux.") -def test_simple(ray_start_reconstruction): - plasma_store_memory, num_nodes, cluster = ray_start_reconstruction - # Define the size of one task's return argument so that the combined - # sum of all objects' sizes is at least twice the plasma stores' - # combined allotted memory. - num_objects = 100 - size = int(plasma_store_memory * 1.5 / (num_objects * 8)) - - # Define a remote task with no dependencies, which returns a numpy - # array of the given size. - @ray.remote - def foo(i, size): - array = np.zeros(size) - array[0] = i - return array - - # Launch num_objects instances of the remote task. 
- args = [] - for i in range(num_objects): - args.append(foo.remote(i, size)) - - # Get each value to force each task to finish. After some number of - # gets, old values should be evicted. - for i in range(num_objects): - value = ray.get(args[i]) - assert value[0] == i - # Get each value again to force reconstruction. - for i in range(num_objects): - value = ray.get(args[i]) - assert value[0] == i - # Get values sequentially, in chunks. - num_chunks = 4 * num_nodes - chunk = num_objects // num_chunks - for i in range(num_chunks): - values = ray.get(args[i * chunk:(i + 1) * chunk]) - del values - - assert cluster.remaining_processes_alive() - - -def sorted_random_indexes(total, output_num): - random_indexes = [np.random.randint(total) for _ in range(output_num)] - random_indexes.sort() - return random_indexes - - -@pytest.mark.skipif( - os.environ.get("RAY_USE_NEW_GCS") == "on", - reason="Failing with new GCS API on Linux.") -def test_recursive(ray_start_reconstruction): - plasma_store_memory, num_nodes, cluster = ray_start_reconstruction - # Define the size of one task's return argument so that the combined - # sum of all objects' sizes is at least twice the plasma stores' - # combined allotted memory. - num_objects = 100 - size = int(plasma_store_memory * 1.5 / (num_objects * 8)) - - # Define a root task with no dependencies, which returns a numpy array - # of the given size. - @ray.remote - def no_dependency_task(size): - array = np.zeros(size) - return array - - # Define a task with a single dependency, which returns its one - # argument. - @ray.remote - def single_dependency(i, arg): - arg = np.copy(arg) - arg[0] = i - return arg - - # Launch num_objects instances of the remote task, each dependent on - # the one before it. - arg = no_dependency_task.remote(size) - args = [] - for i in range(num_objects): - arg = single_dependency.remote(i, arg) - args.append(arg) - - # Get each value to force each task to finish. 
After some number of - # gets, old values should be evicted. - for i in range(num_objects): - value = ray.get(args[i]) - assert value[0] == i - # Get each value again to force reconstruction. - for i in range(num_objects): - value = ray.get(args[i]) - assert value[0] == i - # Get 10 values randomly. - random_indexes = sorted_random_indexes(num_objects, 10) - for i in random_indexes: - value = ray.get(args[i]) - assert value[0] == i - # Get values sequentially, in chunks. - num_chunks = 4 * num_nodes - chunk = num_objects // num_chunks - for i in range(num_chunks): - values = ray.get(args[i * chunk:(i + 1) * chunk]) - del values - - assert cluster.remaining_processes_alive() - - -@pytest.mark.skip(reason="This test often hangs or fails in CI.") -@pytest.mark.skipif( - os.environ.get("RAY_USE_NEW_GCS") == "on", - reason="Failing with new GCS API on Linux.") -def test_multiple_recursive(ray_start_reconstruction): - plasma_store_memory, _, cluster = ray_start_reconstruction - # Define the size of one task's return argument so that the combined - # sum of all objects' sizes is at least twice the plasma stores' - # combined allotted memory. - num_objects = 100 - size = plasma_store_memory * 2 // (num_objects * 8) - - # Define a root task with no dependencies, which returns a numpy array - # of the given size. - @ray.remote - def no_dependency_task(size): - array = np.zeros(size) - return array - - # Define a task with multiple dependencies, which returns its first - # argument. - @ray.remote - def multiple_dependency(i, arg1, arg2, arg3): - arg1 = np.copy(arg1) - arg1[0] = i - return arg1 - - # Launch num_args instances of the root task. Then launch num_objects - # instances of the multi-dependency remote task, each dependent on the - # num_args tasks before it. 
- num_args = 3 - args = [] - for i in range(num_args): - arg = no_dependency_task.remote(size) - args.append(arg) - for i in range(num_objects): - args.append(multiple_dependency.remote(i, *args[i:i + num_args])) - - # Get each value to force each task to finish. After some number of - # gets, old values should be evicted. - args = args[num_args:] - for i in range(num_objects): - value = ray.get(args[i]) - assert value[0] == i - # Get each value again to force reconstruction. - for i in range(num_objects): - value = ray.get(args[i]) - assert value[0] == i - # Get 10 values randomly. - random_indexes = sorted_random_indexes(num_objects, 10) - for i in random_indexes: - value = ray.get(args[i]) - assert value[0] == i - - assert cluster.remaining_processes_alive() - - -def wait_for_errors(error_check): - # Wait for errors from all the nondeterministic tasks. - errors = [] - time_left = 100 - while time_left > 0: - errors = flat_errors() - if error_check(errors): - break - time_left -= 1 - time.sleep(1) - - # Make sure that enough errors came through. - assert error_check(errors) - return errors - - -@pytest.mark.skip("This test does not work yet.") -@pytest.mark.skipif( - os.environ.get("RAY_USE_NEW_GCS") == "on", - reason="Failing with new GCS API on Linux.") -def test_nondeterministic_task(ray_start_reconstruction): - plasma_store_memory, num_nodes, cluster = ray_start_reconstruction - # Define the size of one task's return argument so that the combined - # sum of all objects' sizes is at least twice the plasma stores' - # combined allotted memory. - num_objects = 1000 - size = plasma_store_memory * 2 // (num_objects * 8) - - # Define a nondeterministic remote task with no dependencies, which - # returns a random numpy array of the given size. This task should - # produce an error on the driver if it is ever reexecuted. 
- @ray.remote - def foo(i, size): - array = np.random.rand(size) - array[0] = i - return array - - # Define a deterministic remote task with no dependencies, which - # returns a numpy array of zeros of the given size. - @ray.remote - def bar(i, size): - array = np.zeros(size) - array[0] = i - return array - - # Launch num_objects instances, half deterministic and half - # nondeterministic. - args = [] - for i in range(num_objects): - if i % 2 == 0: - args.append(foo.remote(i, size)) - else: - args.append(bar.remote(i, size)) - - # Get each value to force each task to finish. After some number of - # gets, old values should be evicted. - for i in range(num_objects): - value = ray.get(args[i]) - assert value[0] == i - # Get each value again to force reconstruction. - for i in range(num_objects): - value = ray.get(args[i]) - assert value[0] == i - - def error_check(errors): - if num_nodes == 1: - # In a single-node setting, each object is evicted and - # reconstructed exactly once, so exactly half the objects will - # produce an error during reconstruction. - min_errors = num_objects // 2 - else: - # In a multinode setting, each object is evicted zero or one - # times, so some of the nondeterministic tasks may not be - # reexecuted. - min_errors = 1 - return len(errors) >= min_errors - - errors = wait_for_errors(error_check) - # Make sure all the errors have the correct type. 
- assert all(error["type"] == ray_constants.HASH_MISMATCH_PUSH_ERROR - for error in errors) - - assert cluster.remaining_processes_alive() - - -@pytest.mark.skipif( - os.environ.get("RAY_USE_NEW_GCS") == "on", - reason="Failing with new GCS API on Linux.") -@pytest.mark.skipif( - sys.version_info < (3, 0), reason="This test requires Python 3.") -@pytest.mark.parametrize( - "ray_start_object_store_memory", [10**9], indirect=True) -def test_driver_put_errors(ray_start_object_store_memory): - plasma_store_memory = ray_start_object_store_memory - # Define the size of one task's return argument so that the combined - # sum of all objects' sizes is at least twice the plasma stores' - # combined allotted memory. - num_objects = 100 - size = plasma_store_memory * 2 // (num_objects * 8) - - # Define a task with a single dependency, a numpy array, that returns - # another array. - @ray.remote - def single_dependency(i, arg): - arg = np.copy(arg) - arg[0] = i - return arg - - # Launch num_objects instances of the remote task, each dependent on - # the one before it. The first instance of the task takes a numpy array - # as an argument, which is put into the object store. - args = [] - arg = single_dependency.remote(0, np.zeros(size)) - for i in range(num_objects): - arg = single_dependency.remote(i, arg) - args.append(arg) - # Get each value to force each task to finish. After some number of - # gets, old values should be evicted. - for i in range(num_objects): - value = ray.get(args[i]) - assert value[0] == i - - # Get each value starting from the beginning to force reconstruction. - # Currently, since we're not able to reconstruct `ray.put` objects that - # were evicted and whose originating tasks are still running, this - # for-loop should hang on its first iteration and push an error to the - # driver. 
- ray.worker.global_worker.raylet_client.fetch_or_reconstruct([args[0]], - False) - - def error_check(errors): - return len(errors) > 1 - - errors = wait_for_errors(error_check) - assert all(error["type"] == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR - or "ray.exceptions.UnreconstructableError" in error["message"] - for error in errors) - - -# NOTE(swang): This test tries to launch 1000 workers and breaks. -# TODO(rkn): This test needs to be updated to use pytest. -# class WorkerPoolTests(unittest.TestCase): -# -# def tearDown(self): -# ray.shutdown() -# -# def testBlockingTasks(self): -# @ray.remote -# def f(i, j): -# return (i, j) -# -# @ray.remote -# def g(i): -# # Each instance of g submits and blocks on the result of another remote -# # task. -# object_ids = [f.remote(i, j) for j in range(10)] -# return ray.get(object_ids) -# -# ray.init(num_workers=1) -# ray.get([g.remote(i) for i in range(1000)]) -# ray.shutdown() +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_stress_failure.py b/python/ray/tests/test_stress_failure.py new file mode 100644 index 000000000..2b8ab89e5 --- /dev/null +++ b/python/ray/tests/test_stress_failure.py @@ -0,0 +1,379 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json +import numpy as np +import os +import pytest +import sys +import time + +import ray +from ray.cluster_utils import Cluster +from ray.test_utils import flat_errors +import ray.ray_constants as ray_constants + + +@pytest.fixture(params=[1, 4]) +def ray_start_reconstruction(request): + num_nodes = request.param + + plasma_store_memory = int(0.5 * 10**9) + + cluster = Cluster( + initialize_head=True, + head_node_args={ + "num_cpus": 1, + "object_store_memory": plasma_store_memory // num_nodes, + "redis_max_memory": 10**7, + "_internal_config": json.dumps({ + "initial_reconstruction_timeout_milliseconds": 200 + 
}) + }) + for i in range(num_nodes - 1): + cluster.add_node( + num_cpus=1, + object_store_memory=plasma_store_memory // num_nodes, + _internal_config=json.dumps({ + "initial_reconstruction_timeout_milliseconds": 200 + })) + ray.init(address=cluster.address) + + yield plasma_store_memory, num_nodes, cluster + + # Clean up the Ray cluster. + ray.shutdown() + cluster.shutdown() + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Failing with new GCS API on Linux.") +def test_simple(ray_start_reconstruction): + plasma_store_memory, num_nodes, cluster = ray_start_reconstruction + # Define the size of one task's return argument so that the combined + # sum of all objects' sizes is at least twice the plasma stores' + # combined allotted memory. + num_objects = 100 + size = int(plasma_store_memory * 1.5 / (num_objects * 8)) + + # Define a remote task with no dependencies, which returns a numpy + # array of the given size. + @ray.remote + def foo(i, size): + array = np.zeros(size) + array[0] = i + return array + + # Launch num_objects instances of the remote task. + args = [] + for i in range(num_objects): + args.append(foo.remote(i, size)) + + # Get each value to force each task to finish. After some number of + # gets, old values should be evicted. + for i in range(num_objects): + value = ray.get(args[i]) + assert value[0] == i + # Get each value again to force reconstruction. + for i in range(num_objects): + value = ray.get(args[i]) + assert value[0] == i + # Get values sequentially, in chunks. 
+ num_chunks = 4 * num_nodes + chunk = num_objects // num_chunks + for i in range(num_chunks): + values = ray.get(args[i * chunk:(i + 1) * chunk]) + del values + + assert cluster.remaining_processes_alive() + + +def sorted_random_indexes(total, output_num): + random_indexes = [np.random.randint(total) for _ in range(output_num)] + random_indexes.sort() + return random_indexes + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Failing with new GCS API on Linux.") +def test_recursive(ray_start_reconstruction): + plasma_store_memory, num_nodes, cluster = ray_start_reconstruction + # Define the size of one task's return argument so that the combined + # sum of all objects' sizes is at least twice the plasma stores' + # combined allotted memory. + num_objects = 100 + size = int(plasma_store_memory * 1.5 / (num_objects * 8)) + + # Define a root task with no dependencies, which returns a numpy array + # of the given size. + @ray.remote + def no_dependency_task(size): + array = np.zeros(size) + return array + + # Define a task with a single dependency, which returns its one + # argument. + @ray.remote + def single_dependency(i, arg): + arg = np.copy(arg) + arg[0] = i + return arg + + # Launch num_objects instances of the remote task, each dependent on + # the one before it. + arg = no_dependency_task.remote(size) + args = [] + for i in range(num_objects): + arg = single_dependency.remote(i, arg) + args.append(arg) + + # Get each value to force each task to finish. After some number of + # gets, old values should be evicted. + for i in range(num_objects): + value = ray.get(args[i]) + assert value[0] == i + # Get each value again to force reconstruction. + for i in range(num_objects): + value = ray.get(args[i]) + assert value[0] == i + # Get 10 values randomly. + random_indexes = sorted_random_indexes(num_objects, 10) + for i in random_indexes: + value = ray.get(args[i]) + assert value[0] == i + # Get values sequentially, in chunks. 
+ num_chunks = 4 * num_nodes + chunk = num_objects // num_chunks + for i in range(num_chunks): + values = ray.get(args[i * chunk:(i + 1) * chunk]) + del values + + assert cluster.remaining_processes_alive() + + +@pytest.mark.skip(reason="This test often hangs or fails in CI.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Failing with new GCS API on Linux.") +def test_multiple_recursive(ray_start_reconstruction): + plasma_store_memory, _, cluster = ray_start_reconstruction + # Define the size of one task's return argument so that the combined + # sum of all objects' sizes is at least twice the plasma stores' + # combined allotted memory. + num_objects = 100 + size = plasma_store_memory * 2 // (num_objects * 8) + + # Define a root task with no dependencies, which returns a numpy array + # of the given size. + @ray.remote + def no_dependency_task(size): + array = np.zeros(size) + return array + + # Define a task with multiple dependencies, which returns its first + # argument. + @ray.remote + def multiple_dependency(i, arg1, arg2, arg3): + arg1 = np.copy(arg1) + arg1[0] = i + return arg1 + + # Launch num_args instances of the root task. Then launch num_objects + # instances of the multi-dependency remote task, each dependent on the + # num_args tasks before it. + num_args = 3 + args = [] + for i in range(num_args): + arg = no_dependency_task.remote(size) + args.append(arg) + for i in range(num_objects): + args.append(multiple_dependency.remote(i, *args[i:i + num_args])) + + # Get each value to force each task to finish. After some number of + # gets, old values should be evicted. + args = args[num_args:] + for i in range(num_objects): + value = ray.get(args[i]) + assert value[0] == i + # Get each value again to force reconstruction. + for i in range(num_objects): + value = ray.get(args[i]) + assert value[0] == i + # Get 10 values randomly. 
+ random_indexes = sorted_random_indexes(num_objects, 10) + for i in random_indexes: + value = ray.get(args[i]) + assert value[0] == i + + assert cluster.remaining_processes_alive() + + +def wait_for_errors(error_check): + # Wait for errors from all the nondeterministic tasks. + errors = [] + time_left = 100 + while time_left > 0: + errors = flat_errors() + if error_check(errors): + break + time_left -= 1 + time.sleep(1) + + # Make sure that enough errors came through. + assert error_check(errors) + return errors + + +@pytest.mark.skip("This test does not work yet.") +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Failing with new GCS API on Linux.") +def test_nondeterministic_task(ray_start_reconstruction): + plasma_store_memory, num_nodes, cluster = ray_start_reconstruction + # Define the size of one task's return argument so that the combined + # sum of all objects' sizes is at least twice the plasma stores' + # combined allotted memory. + num_objects = 1000 + size = plasma_store_memory * 2 // (num_objects * 8) + + # Define a nondeterministic remote task with no dependencies, which + # returns a random numpy array of the given size. This task should + # produce an error on the driver if it is ever reexecuted. + @ray.remote + def foo(i, size): + array = np.random.rand(size) + array[0] = i + return array + + # Define a deterministic remote task with no dependencies, which + # returns a numpy array of zeros of the given size. + @ray.remote + def bar(i, size): + array = np.zeros(size) + array[0] = i + return array + + # Launch num_objects instances, half deterministic and half + # nondeterministic. + args = [] + for i in range(num_objects): + if i % 2 == 0: + args.append(foo.remote(i, size)) + else: + args.append(bar.remote(i, size)) + + # Get each value to force each task to finish. After some number of + # gets, old values should be evicted. 
+ for i in range(num_objects): + value = ray.get(args[i]) + assert value[0] == i + # Get each value again to force reconstruction. + for i in range(num_objects): + value = ray.get(args[i]) + assert value[0] == i + + def error_check(errors): + if num_nodes == 1: + # In a single-node setting, each object is evicted and + # reconstructed exactly once, so exactly half the objects will + # produce an error during reconstruction. + min_errors = num_objects // 2 + else: + # In a multinode setting, each object is evicted zero or one + # times, so some of the nondeterministic tasks may not be + # reexecuted. + min_errors = 1 + return len(errors) >= min_errors + + errors = wait_for_errors(error_check) + # Make sure all the errors have the correct type. + assert all(error["type"] == ray_constants.HASH_MISMATCH_PUSH_ERROR + for error in errors) + + assert cluster.remaining_processes_alive() + + +@pytest.mark.skipif( + os.environ.get("RAY_USE_NEW_GCS") == "on", + reason="Failing with new GCS API on Linux.") +@pytest.mark.skipif( + sys.version_info < (3, 0), reason="This test requires Python 3.") +@pytest.mark.parametrize( + "ray_start_object_store_memory", [10**9], indirect=True) +def test_driver_put_errors(ray_start_object_store_memory): + plasma_store_memory = ray_start_object_store_memory + # Define the size of one task's return argument so that the combined + # sum of all objects' sizes is at least twice the plasma stores' + # combined allotted memory. + num_objects = 100 + size = plasma_store_memory * 2 // (num_objects * 8) + + # Define a task with a single dependency, a numpy array, that returns + # another array. + @ray.remote + def single_dependency(i, arg): + arg = np.copy(arg) + arg[0] = i + return arg + + # Launch num_objects instances of the remote task, each dependent on + # the one before it. The first instance of the task takes a numpy array + # as an argument, which is put into the object store. 
+ args = [] + arg = single_dependency.remote(0, np.zeros(size)) + for i in range(num_objects): + arg = single_dependency.remote(i, arg) + args.append(arg) + # Get each value to force each task to finish. After some number of + # gets, old values should be evicted. + for i in range(num_objects): + value = ray.get(args[i]) + assert value[0] == i + + # Get each value starting from the beginning to force reconstruction. + # Currently, since we're not able to reconstruct `ray.put` objects that + # were evicted and whose originating tasks are still running, this + # for-loop should hang on its first iteration and push an error to the + # driver. + ray.worker.global_worker.raylet_client.fetch_or_reconstruct([args[0]], + False) + + def error_check(errors): + return len(errors) > 1 + + errors = wait_for_errors(error_check) + assert all(error["type"] == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR + or "ray.exceptions.UnreconstructableError" in error["message"] + for error in errors) + + +# NOTE(swang): This test tries to launch 1000 workers and breaks. +# TODO(rkn): This test needs to be updated to use pytest. +# class WorkerPoolTests(unittest.TestCase): +# +# def tearDown(self): +# ray.shutdown() +# +# def testBlockingTasks(self): +# @ray.remote +# def f(i, j): +# return (i, j) +# +# @ray.remote +# def g(i): +# # Each instance of g submits and blocks on the result of another remote +# # task. 
+# object_ids = [f.remote(i, j) for j in range(10)] +# return ray.get(object_ids) +# +# ray.init(num_workers=1) +# ray.get([g.remote(i) for i in range(1000)]) +# ray.shutdown() + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_stress_sharded.py b/python/ray/tests/test_stress_sharded.py new file mode 100644 index 000000000..259f6fc2a --- /dev/null +++ b/python/ray/tests/test_stress_sharded.py @@ -0,0 +1,102 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import os +import pytest + +import ray + + +@pytest.fixture(params=[1, 4]) +def ray_start_sharded(request): + num_redis_shards = request.param + + if os.environ.get("RAY_USE_NEW_GCS") == "on": + num_redis_shards = 1 + # For now, RAY_USE_NEW_GCS supports 1 shard, and credis supports + # 1-node chain for that shard only. + + # Start the Ray processes. + ray.init( + object_store_memory=int(0.5 * 10**9), + num_cpus=10, + num_redis_shards=num_redis_shards, + redis_max_memory=10**7) + + yield None + + # The code after the yield will run as teardown code. 
+ ray.shutdown() + + +def test_submitting_many_tasks(ray_start_sharded): + @ray.remote + def f(x): + return 1 + + def g(n): + x = 1 + for i in range(n): + x = f.remote(x) + return x + + ray.get([g(1000) for _ in range(100)]) + assert ray.services.remaining_processes_alive() + + +def test_submitting_many_actors_to_one(ray_start_sharded): + @ray.remote + class Actor(object): + def __init__(self): + pass + + def ping(self): + return + + @ray.remote + class Worker(object): + def __init__(self, actor): + self.actor = actor + + def ping(self): + return ray.get(self.actor.ping.remote()) + + a = Actor.remote() + workers = [Worker.remote(a) for _ in range(10)] + for _ in range(10): + out = ray.get([w.ping.remote() for w in workers]) + assert out == [None for _ in workers] + + +def test_getting_and_putting(ray_start_sharded): + for n in range(8): + x = np.zeros(10**n) + + for _ in range(100): + ray.put(x) + + x_id = ray.put(x) + for _ in range(1000): + ray.get(x_id) + + assert ray.services.remaining_processes_alive() + + +def test_getting_many_objects(ray_start_sharded): + @ray.remote + def f(): + return 1 + + n = 10**4 # TODO(pcm): replace by 10 ** 5 once this is faster. 
+ lst = ray.get([f.remote() for _ in range(n)]) + assert lst == n * [1] + + assert ray.services.remaining_processes_alive() + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_tempfile.py b/python/ray/tests/test_tempfile.py index 9981dd526..21de4e5d6 100644 --- a/python/ray/tests/test_tempfile.py +++ b/python/ray/tests/test_tempfile.py @@ -7,7 +7,7 @@ import shutil import time import pytest import ray -from ray.tests.cluster_utils import Cluster +from ray.cluster_utils import Cluster def test_conn_cluster(): @@ -150,3 +150,11 @@ def test_session_dir_uniqueness(): session_dirs.add(ray.worker._global_node.get_session_dir_path) ray.shutdown() assert len(session_dirs) == 3 + + +if __name__ == "__main__": + import sys + # Make subprocess happy in bazel. + os.environ["LC_ALL"] = "en_US.UTF-8" + os.environ["LANG"] = "en_US.UTF-8" + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_tensorflow.py b/python/ray/tests/test_tensorflow.py index c7bfe6ec4..9e15f8616 100644 --- a/python/ray/tests/test_tensorflow.py +++ b/python/ray/tests/test_tensorflow.py @@ -252,3 +252,9 @@ def test_remote_training_loss(ray_start_2_cpus): after_acc = sess.run( loss, feed_dict=dict(zip(placeholders, [[2] * 100, [4] * 100]))) assert before_acc < after_acc + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_unreconstructable_errors.py b/python/ray/tests/test_unreconstructable_errors.py index 7438c11c1..8886edd08 100644 --- a/python/ray/tests/test_unreconstructable_errors.py +++ b/python/ray/tests/test_unreconstructable_errors.py @@ -44,4 +44,6 @@ class TestUnreconstructableErrors(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_webui.py b/python/ray/tests/test_webui.py index 
56f28e25e..78a5ec723 100644 --- a/python/ray/tests/test_webui.py +++ b/python/ray/tests/test_webui.py @@ -33,3 +33,9 @@ def test_get_webui(shutdown_only): assert node_info["error"] is None assert node_info["result"] is not None assert isinstance(node_info["timestamp"], float) + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/BUILD b/python/ray/tune/BUILD index 3b1c9840f..d28f0e4ea 100644 --- a/python/ray/tune/BUILD +++ b/python/ray/tune/BUILD @@ -3,12 +3,14 @@ py_test( size = "medium", srcs = ["tests/test_actor_reuse.py"], tags = ["jenkins_only"], + deps = [":tune_lib"], ) py_test( name = "test_automl_searcher", size = "small", srcs = ["tests/test_automl_searcher.py"], + deps = [":tune_lib"], ) py_test( @@ -77,19 +79,52 @@ py_test( deps = [":tune_lib"], ) +py_test( + name = "test_run_experiment", + size = "medium", + srcs = ["tests/test_run_experiment.py"], + deps = [":tune_lib"], + tags = ["exclusive"], +) + py_test( name = "test_trial_runner", - size = "large", + size = "medium", srcs = ["tests/test_trial_runner.py"], deps = [":tune_lib"], tags = ["exclusive"], ) +py_test( + name = "test_var", + size = "medium", + srcs = ["tests/test_var.py"], + deps = [":tune_lib"], + tags = ["exclusive"], +) + +py_test( + name = "test_api", + size = "medium", + srcs = ["tests/test_api.py"], + deps = [":tune_lib"], + tags = ["exclusive"], +) + +py_test( + name = "test_sync", + size = "medium", + srcs = ["tests/test_sync.py"], + deps = [":tune_lib"], + tags = ["exclusive"], +) + py_test( name = "test_trial_scheduler", size = "medium", srcs = ["tests/test_trial_scheduler.py"], deps = [":tune_lib"], + tags = ["exclusive"], ) py_test( @@ -113,11 +148,12 @@ py_test( size = "medium", srcs = ["tests/test_tune_server.py"], deps = [":tune_lib"], + tags = ["exclusive"], ) # This is a dummy test dependency that causes the above tests to be # re-run if any of these files changes. 
py_library( - name="tune_lib", + name = "tune_lib", srcs = glob(["**/*.py"], exclude=["tests/*.py"]), ) diff --git a/python/ray/tune/tests/test_actor_reuse.py b/python/ray/tune/tests/test_actor_reuse.py index 227f81515..c3a4b2442 100644 --- a/python/ray/tune/tests/test_actor_reuse.py +++ b/python/ray/tune/tests/test_actor_reuse.py @@ -96,4 +96,6 @@ class ActorReuseTest(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_api.py b/python/ray/tune/tests/test_api.py new file mode 100644 index 000000000..f7d34d662 --- /dev/null +++ b/python/ray/tune/tests/test_api.py @@ -0,0 +1,797 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import os +import time +import unittest + +import ray +from ray.rllib import _register_all + +from ray import tune +from ray.tune import Trainable, TuneError +from ray.tune import register_env, register_trainable, run_experiments +from ray.tune.schedulers import TrialScheduler, FIFOScheduler +from ray.tune.trial import Trial +from ray.tune.result import (TIMESTEPS_TOTAL, DONE, HOSTNAME, NODE_IP, PID, + EPISODES_TOTAL, TRAINING_ITERATION, + TIMESTEPS_THIS_ITER, TIME_THIS_ITER_S, + TIME_TOTAL_S, TRIAL_ID, EXPERIMENT_TAG) +from ray.tune.logger import Logger +from ray.tune.util import pin_in_object_store, get_pinned_object, flatten_dict +from ray.tune.experiment import Experiment +from ray.tune.resources import Resources +from ray.tune.suggest import grid_search +from ray.tune.suggest.suggestion import _MockSuggestionAlgorithm + + +class TrainableFunctionApiTest(unittest.TestCase): + def setUp(self): + ray.init(num_cpus=4, num_gpus=0, object_store_memory=150 * 1024 * 1024) + + def tearDown(self): + ray.shutdown() + _register_all() # re-register the evicted objects + + def checkAndReturnConsistentLogs(self, results, 
sleep_per_iter=None): + """Checks logging is the same between APIs. + + Ignore "DONE" for logging but checks that the + scheduler is notified properly with the last result. + """ + class_results = copy.deepcopy(results) + function_results = copy.deepcopy(results) + + class_output = [] + function_output = [] + scheduler_notif = [] + + class MockScheduler(FIFOScheduler): + def on_trial_complete(self, runner, trial, result): + scheduler_notif.append(result) + + class ClassAPILogger(Logger): + def on_result(self, result): + class_output.append(result) + + class FunctionAPILogger(Logger): + def on_result(self, result): + function_output.append(result) + + class _WrappedTrainable(Trainable): + def _setup(self, config): + del config + self._result_iter = copy.deepcopy(class_results) + + def _train(self): + if sleep_per_iter: + time.sleep(sleep_per_iter) + res = self._result_iter.pop(0) # This should not fail + if not self._result_iter: # Mark "Done" for last result + res[DONE] = True + return res + + def _function_trainable(config, reporter): + for result in function_results: + if sleep_per_iter: + time.sleep(sleep_per_iter) + reporter(**result) + + class_trainable_name = "class_trainable" + register_trainable(class_trainable_name, _WrappedTrainable) + + trials = run_experiments( + { + "function_api": { + "run": _function_trainable, + "loggers": [FunctionAPILogger], + }, + "class_api": { + "run": class_trainable_name, + "loggers": [ClassAPILogger], + }, + }, + raise_on_failed_trial=False, + scheduler=MockScheduler()) + + # Ignore these fields + NO_COMPARE_FIELDS = { + HOSTNAME, + NODE_IP, + TRIAL_ID, + EXPERIMENT_TAG, + PID, + TIME_THIS_ITER_S, + TIME_TOTAL_S, + DONE, # This is ignored because FunctionAPI has different handling + "timestamp", + "time_since_restore", + "experiment_id", + "date", + } + + self.assertEqual(len(class_output), len(results)) + self.assertEqual(len(function_output), len(results)) + + def as_comparable_result(result): + return { + k: v + for k, v 
in result.items() if k not in NO_COMPARE_FIELDS + } + + function_comparable = [ + as_comparable_result(result) for result in function_output + ] + class_comparable = [ + as_comparable_result(result) for result in class_output + ] + + self.assertEqual(function_comparable, class_comparable) + + self.assertEqual(sum(t.get(DONE) for t in scheduler_notif), 2) + self.assertEqual( + as_comparable_result(scheduler_notif[0]), + as_comparable_result(scheduler_notif[1])) + + # Make sure the last result is the same. + self.assertEqual( + as_comparable_result(trials[0].last_result), + as_comparable_result(trials[1].last_result)) + + return function_output, trials + + def testPinObject(self): + X = pin_in_object_store("hello") + + @ray.remote + def f(): + return get_pinned_object(X) + + self.assertEqual(ray.get(f.remote()), "hello") + + def testFetchPinned(self): + X = pin_in_object_store("hello") + + def train(config, reporter): + get_pinned_object(X) + reporter(timesteps_total=100, done=True) + + register_trainable("f1", train) + [trial] = run_experiments({ + "foo": { + "run": "f1", + } + }) + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 100) + + def testRegisterEnv(self): + register_env("foo", lambda: None) + self.assertRaises(TypeError, lambda: register_env("foo", 2)) + + def testRegisterEnvOverwrite(self): + def train(config, reporter): + reporter(timesteps_total=100, done=True) + + def train2(config, reporter): + reporter(timesteps_total=200, done=True) + + register_trainable("f1", train) + register_trainable("f1", train2) + [trial] = run_experiments({ + "foo": { + "run": "f1", + } + }) + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 200) + + def testRegisterTrainable(self): + def train(config, reporter): + pass + + class A(object): + pass + + class B(Trainable): + pass + + register_trainable("foo", train) + Experiment("test", train) + 
register_trainable("foo", B) + Experiment("test", B) + self.assertRaises(TypeError, lambda: register_trainable("foo", B())) + self.assertRaises(TuneError, lambda: Experiment("foo", B())) + self.assertRaises(TypeError, lambda: register_trainable("foo", A)) + self.assertRaises(TypeError, lambda: Experiment("foo", A)) + + def testTrainableCallable(self): + def dummy_fn(config, reporter, steps): + reporter(timesteps_total=steps, done=True) + + from functools import partial + steps = 500 + register_trainable("test", partial(dummy_fn, steps=steps)) + [trial] = run_experiments({ + "foo": { + "run": "test", + } + }) + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], steps) + [trial] = tune.run(partial(dummy_fn, steps=steps)).trials + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], steps) + + def testBuiltInTrainableResources(self): + class B(Trainable): + @classmethod + def default_resource_request(cls, config): + return Resources(cpu=config["cpu"], gpu=config["gpu"]) + + def _train(self): + return {"timesteps_this_iter": 1, "done": True} + + register_trainable("B", B) + + def f(cpus, gpus, queue_trials): + return run_experiments( + { + "foo": { + "run": "B", + "config": { + "cpu": cpus, + "gpu": gpus, + }, + } + }, + queue_trials=queue_trials)[0] + + # Should all succeed + self.assertEqual(f(0, 0, False).status, Trial.TERMINATED) + self.assertEqual(f(1, 0, True).status, Trial.TERMINATED) + self.assertEqual(f(1, 0, True).status, Trial.TERMINATED) + + # Too large resource request + self.assertRaises(TuneError, lambda: f(100, 100, False)) + self.assertRaises(TuneError, lambda: f(0, 100, False)) + self.assertRaises(TuneError, lambda: f(100, 0, False)) + + # TODO(ekl) how can we test this is queued (hangs)? 
+ # f(100, 0, True) + + def testRewriteEnv(self): + def train(config, reporter): + reporter(timesteps_total=1) + + register_trainable("f1", train) + + [trial] = run_experiments({ + "foo": { + "run": "f1", + "env": "CartPole-v0", + } + }) + self.assertEqual(trial.config["env"], "CartPole-v0") + + def testConfigPurity(self): + def train(config, reporter): + assert config == {"a": "b"}, config + reporter(timesteps_total=1) + + register_trainable("f1", train) + run_experiments({ + "foo": { + "run": "f1", + "config": { + "a": "b" + }, + } + }) + + def testLogdir(self): + def train(config, reporter): + assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd() + reporter(timesteps_total=1) + + register_trainable("f1", train) + run_experiments({ + "foo": { + "run": "f1", + "local_dir": "/tmp/logdir", + "config": { + "a": "b" + }, + } + }) + + def testLogdirStartingWithTilde(self): + local_dir = "~/ray_results/local_dir" + + def train(config, reporter): + cwd = os.getcwd() + assert cwd.startswith(os.path.expanduser(local_dir)), cwd + assert not cwd.startswith("~"), cwd + reporter(timesteps_total=1) + + register_trainable("f1", train) + run_experiments({ + "foo": { + "run": "f1", + "local_dir": local_dir, + "config": { + "a": "b" + }, + } + }) + + def testLongFilename(self): + def train(config, reporter): + assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd() + reporter(timesteps_total=1) + + register_trainable("f1", train) + run_experiments({ + "foo": { + "run": "f1", + "local_dir": "/tmp/logdir", + "config": { + "a" * 50: tune.sample_from(lambda spec: 5.0 / 7), + "b" * 50: tune.sample_from(lambda spec: "long" * 40), + }, + } + }) + + def testBadParams(self): + def f(): + run_experiments({"foo": {}}) + + self.assertRaises(TuneError, f) + + def testBadParams2(self): + def f(): + run_experiments({ + "foo": { + "run": "asdf", + "bah": "this param is not allowed", + } + }) + + self.assertRaises(TuneError, f) + + def testBadParams3(self): + def f(): + run_experiments({ + "foo": { + 
"run": grid_search("invalid grid search"), + } + }) + + self.assertRaises(TuneError, f) + + def testBadParams4(self): + def f(): + run_experiments({ + "foo": { + "run": "asdf", + } + }) + + self.assertRaises(TuneError, f) + + def testBadParams5(self): + def f(): + run_experiments({"foo": {"run": "PPO", "stop": {"asdf": 1}}}) + + self.assertRaises(TuneError, f) + + def testBadParams6(self): + def f(): + run_experiments({ + "foo": { + "run": "PPO", + "resources_per_trial": { + "asdf": 1 + } + } + }) + + self.assertRaises(TuneError, f) + + def testBadStoppingReturn(self): + def train(config, reporter): + reporter() + + register_trainable("f1", train) + + def f(): + run_experiments({ + "foo": { + "run": "f1", + "stop": { + "time": 10 + }, + } + }) + + self.assertRaises(TuneError, f) + + def testNestedStoppingReturn(self): + def train(config, reporter): + for i in range(10): + reporter(test={"test1": {"test2": i}}) + + with self.assertRaises(TuneError): + [trial] = tune.run( + train, stop={ + "test": { + "test1": { + "test2": 6 + } + } + }).trials + [trial] = tune.run(train, stop={"test/test1/test2": 6}).trials + self.assertEqual(trial.last_result["training_iteration"], 7) + + def testStoppingFunction(self): + def train(config, reporter): + for i in range(10): + reporter(test=i) + + def stop(trial_id, result): + return result["test"] > 6 + + [trial] = tune.run(train, stop=stop).trials + self.assertEqual(trial.last_result["training_iteration"], 8) + + def testStoppingMemberFunction(self): + def train(config, reporter): + for i in range(10): + reporter(test=i) + + class Stopper: + def stop(self, trial_id, result): + return result["test"] > 6 + + [trial] = tune.run(train, stop=Stopper().stop).trials + self.assertEqual(trial.last_result["training_iteration"], 8) + + def testBadStoppingFunction(self): + def train(config, reporter): + for i in range(10): + reporter(test=i) + + class Stopper: + def stop(self, result): + return result["test"] > 6 + + def stop(result): + return 
result["test"] > 6 + + with self.assertRaises(ValueError): + tune.run(train, stop=Stopper().stop) + with self.assertRaises(ValueError): + tune.run(train, stop=stop) + + def testEarlyReturn(self): + def train(config, reporter): + reporter(timesteps_total=100, done=True) + time.sleep(99999) + + register_trainable("f1", train) + [trial] = run_experiments({ + "foo": { + "run": "f1", + } + }) + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 100) + + def testReporterNoUsage(self): + def run_task(config, reporter): + print("hello") + + experiment = Experiment(run=run_task, name="ray_crash_repro") + [trial] = ray.tune.run(experiment).trials + print(trial.last_result) + self.assertEqual(trial.last_result[DONE], True) + + def testErrorReturn(self): + def train(config, reporter): + raise Exception("uh oh") + + register_trainable("f1", train) + + def f(): + run_experiments({ + "foo": { + "run": "f1", + } + }) + + self.assertRaises(TuneError, f) + + def testSuccess(self): + def train(config, reporter): + for i in range(100): + reporter(timesteps_total=i) + + register_trainable("f1", train) + [trial] = run_experiments({ + "foo": { + "run": "f1", + } + }) + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) + + def testNoRaiseFlag(self): + def train(config, reporter): + raise Exception() + + register_trainable("f1", train) + + [trial] = run_experiments( + { + "foo": { + "run": "f1", + } + }, raise_on_failed_trial=False) + self.assertEqual(trial.status, Trial.ERROR) + + def testReportInfinity(self): + def train(config, reporter): + for i in range(100): + reporter(mean_accuracy=float("inf")) + + register_trainable("f1", train) + [trial] = run_experiments({ + "foo": { + "run": "f1", + } + }) + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertEqual(trial.last_result["mean_accuracy"], float("inf")) + + def testNestedResults(self): + def create_result(i): + 
return {"test": {"1": {"2": {"3": i, "4": False}}}} + + flattened_keys = list(flatten_dict(create_result(0))) + + class _MockScheduler(FIFOScheduler): + results = [] + + def on_trial_result(self, trial_runner, trial, result): + self.results += [result] + return TrialScheduler.CONTINUE + + def on_trial_complete(self, trial_runner, trial, result): + self.complete_result = result + + def train(config, reporter): + for i in range(100): + reporter(**create_result(i)) + + algo = _MockSuggestionAlgorithm() + scheduler = _MockScheduler() + [trial] = tune.run( + train, + scheduler=scheduler, + search_alg=algo, + stop={ + "test/1/2/3": 20 + }).trials + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertEqual(trial.last_result["test"]["1"]["2"]["3"], 20) + self.assertEqual(trial.last_result["test"]["1"]["2"]["4"], False) + self.assertEqual(trial.last_result[TRAINING_ITERATION], 21) + self.assertEqual(len(scheduler.results), 20) + self.assertTrue( + all( + set(result) >= set(flattened_keys) + for result in scheduler.results)) + self.assertTrue(set(scheduler.complete_result) >= set(flattened_keys)) + self.assertEqual(len(algo.results), 20) + self.assertTrue( + all(set(result) >= set(flattened_keys) for result in algo.results)) + with self.assertRaises(TuneError): + [trial] = tune.run(train, stop={"1/2/3": 20}) + with self.assertRaises(TuneError): + [trial] = tune.run(train, stop={"test": 1}).trials + + def testReportTimeStep(self): + # Test that no timestep count are logged if never the Trainable never + # returns any. + results1 = [dict(mean_accuracy=5, done=i == 99) for i in range(100)] + logs1, _ = self.checkAndReturnConsistentLogs(results1) + + self.assertTrue(all(log[TIMESTEPS_TOTAL] is None for log in logs1)) + + # Test that no timesteps_this_iter are logged if only timesteps_total + # are returned. 
+ results2 = [dict(timesteps_total=5, done=i == 9) for i in range(10)] + logs2, _ = self.checkAndReturnConsistentLogs(results2) + + # Re-run the same trials but with added delay. This is to catch some + # inconsistent timestep counting that was present in the multi-threaded + # FunctionRunner. This part of the test can be removed once the + # multi-threaded FunctionRunner is removed from ray/tune. + # TODO: remove once the multi-threaded function runner is gone. + logs2, _ = self.checkAndReturnConsistentLogs(results2, 0.5) + + # check all timesteps_total report the same value + self.assertTrue(all(log[TIMESTEPS_TOTAL] == 5 for log in logs2)) + # check that none of the logs report timesteps_this_iter + self.assertFalse( + any(hasattr(log, TIMESTEPS_THIS_ITER) for log in logs2)) + + # Test that timesteps_total and episodes_total are reported when + # timesteps_this_iter and episodes_this_iter despite only return zeros. + results3 = [ + dict(timesteps_this_iter=0, episodes_this_iter=0) + for i in range(10) + ] + logs3, _ = self.checkAndReturnConsistentLogs(results3) + + self.assertTrue(all(log[TIMESTEPS_TOTAL] == 0 for log in logs3)) + self.assertTrue(all(log[EPISODES_TOTAL] == 0 for log in logs3)) + + # Test that timesteps_total and episodes_total are properly counted + # when timesteps_this_iter and episodes_this_iter report non-zero + # values. + results4 = [ + dict(timesteps_this_iter=3, episodes_this_iter=i) + for i in range(10) + ] + logs4, _ = self.checkAndReturnConsistentLogs(results4) + + # The last reported result should not be double-logged. 
+ self.assertEqual(logs4[-1][TIMESTEPS_TOTAL], 30) + self.assertNotEqual(logs4[-2][TIMESTEPS_TOTAL], + logs4[-1][TIMESTEPS_TOTAL]) + self.assertEqual(logs4[-1][EPISODES_TOTAL], 45) + self.assertNotEqual(logs4[-2][EPISODES_TOTAL], + logs4[-1][EPISODES_TOTAL]) + + def testAllValuesReceived(self): + results1 = [ + dict(timesteps_total=(i + 1), my_score=i**2, done=i == 4) + for i in range(5) + ] + + logs1, _ = self.checkAndReturnConsistentLogs(results1) + + # check if the correct number of results were reported + self.assertEqual(len(logs1), len(results1)) + + def check_no_missing(reported_result, result): + common_results = [reported_result[k] == result[k] for k in result] + return all(common_results) + + # check that no result was dropped or modified + complete_results = [ + check_no_missing(log, result) + for log, result in zip(logs1, results1) + ] + self.assertTrue(all(complete_results)) + + # check if done was logged exactly once + self.assertEqual(len([r for r in logs1 if r.get("done")]), 1) + + def testNoDoneReceived(self): + # repeat same test but without explicitly reporting done=True + results1 = [ + dict(timesteps_total=(i + 1), my_score=i**2) for i in range(5) + ] + + logs1, trials = self.checkAndReturnConsistentLogs(results1) + + # check if the correct number of results were reported. 
+ self.assertEqual(len(logs1), len(results1)) + + def check_no_missing(reported_result, result): + common_results = [reported_result[k] == result[k] for k in result] + return all(common_results) + + # check that no result was dropped or modified + complete_results1 = [ + check_no_missing(log, result) + for log, result in zip(logs1, results1) + ] + self.assertTrue(all(complete_results1)) + + def testCheckpointDict(self): + class TestTrain(Trainable): + def _setup(self, config): + self.state = {"hi": 1} + + def _train(self): + return {"timesteps_this_iter": 1, "done": True} + + def _save(self, path): + return self.state + + def _restore(self, state): + self.state = state + + test_trainable = TestTrain() + result = test_trainable.save() + test_trainable.state["hi"] = 2 + test_trainable.restore(result) + self.assertEqual(test_trainable.state["hi"], 1) + + trials = run_experiments({ + "foo": { + "run": TestTrain, + "checkpoint_at_end": True + } + }) + for trial in trials: + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertTrue(trial.has_checkpoint()) + + def testMultipleCheckpoints(self): + class TestTrain(Trainable): + def _setup(self, config): + self.state = {"hi": 1, "iter": 0} + + def _train(self): + self.state["iter"] += 1 + return {"timesteps_this_iter": 1, "done": True} + + def _save(self, path): + return self.state + + def _restore(self, state): + self.state = state + + test_trainable = TestTrain() + checkpoint_1 = test_trainable.save() + test_trainable.train() + checkpoint_2 = test_trainable.save() + self.assertNotEqual(checkpoint_1, checkpoint_2) + test_trainable.restore(checkpoint_2) + self.assertEqual(test_trainable.state["iter"], 1) + test_trainable.restore(checkpoint_1) + self.assertEqual(test_trainable.state["iter"], 0) + + trials = run_experiments({ + "foo": { + "run": TestTrain, + "checkpoint_at_end": True + } + }) + for trial in trials: + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertTrue(trial.has_checkpoint()) + + def 
testIterationCounter(self): + def train(config, reporter): + for i in range(100): + reporter(itr=i, timesteps_this_iter=1) + + register_trainable("exp", train) + config = { + "my_exp": { + "run": "exp", + "config": { + "iterations": 100, + }, + "stop": { + "timesteps_total": 100 + }, + } + } + [trial] = run_experiments(config) + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertEqual(trial.last_result[TRAINING_ITERATION], 100) + self.assertEqual(trial.last_result["itr"], 99) + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_automl_searcher.py b/python/ray/tune/tests/test_automl_searcher.py index 49f03af87..05cd4b574 100644 --- a/python/ray/tune/tests/test_automl_searcher.py +++ b/python/ray/tune/tests/test_automl_searcher.py @@ -70,4 +70,6 @@ class AutoMLSearcherTest(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_checkpoint_manager.py b/python/ray/tune/tests/test_checkpoint_manager.py index ec6741e12..9211607b8 100644 --- a/python/ray/tune/tests/test_checkpoint_manager.py +++ b/python/ray/tune/tests/test_checkpoint_manager.py @@ -109,4 +109,4 @@ class CheckpointManagerTest(unittest.TestCase): if __name__ == "__main__": import pytest import sys - sys.exit(pytest.main(["-v", "-s", __file__])) + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_cluster.py b/python/ray/tune/tests/test_cluster.py index cf1024db9..726c25aeb 100644 --- a/python/ray/tune/tests/test_cluster.py +++ b/python/ray/tune/tests/test_cluster.py @@ -13,8 +13,8 @@ import sys import ray from ray import tune from ray.rllib import _register_all -from ray.tests.cluster_utils import Cluster -from ray.tests.utils import run_string_as_driver_nonblocking +from ray.cluster_utils import Cluster +from ray.test_utils import 
run_string_as_driver_nonblocking from ray.tune.error import TuneError from ray.tune.ray_trial_executor import RayTrialExecutor from ray.tune.experiment import Experiment @@ -598,4 +598,4 @@ tune.run( if __name__ == "__main__": import pytest import sys - sys.exit(pytest.main(["-v", "-s", __file__])) + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_experiment.py b/python/ray/tune/tests/test_experiment.py index 841295680..355d20253 100644 --- a/python/ray/tune/tests/test_experiment.py +++ b/python/ray/tune/tests/test_experiment.py @@ -62,4 +62,6 @@ class ExperimentTest(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_experiment_analysis.py b/python/ray/tune/tests/test_experiment_analysis.py index b1c830359..72de55818 100644 --- a/python/ray/tune/tests/test_experiment_analysis.py +++ b/python/ray/tune/tests/test_experiment_analysis.py @@ -204,4 +204,6 @@ class AnalysisSuite(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_logger.py b/python/ray/tune/tests/test_logger.py index 0ead18b96..30a7e491e 100644 --- a/python/ray/tune/tests/test_logger.py +++ b/python/ray/tune/tests/test_logger.py @@ -64,4 +64,6 @@ class LoggerSuite(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_ray_trial_executor.py b/python/ray/tune/tests/test_ray_trial_executor.py index 12e49f505..1c91e813f 100644 --- a/python/ray/tune/tests/test_ray_trial_executor.py +++ b/python/ray/tune/tests/test_ray_trial_executor.py @@ -14,7 +14,7 @@ from ray.tune.registry import _global_registry, TRAINABLE_CLASS from ray.tune.suggest import BasicVariantGenerator from 
ray.tune.trial import Trial, Checkpoint from ray.tune.resources import Resources -from ray.tests.cluster_utils import Cluster +from ray.cluster_utils import Cluster class RayTrialExecutorTest(unittest.TestCase): @@ -190,4 +190,6 @@ class LocalModeExecutorTest(RayTrialExecutorTest): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_run_experiment.py b/python/ray/tune/tests/test_run_experiment.py new file mode 100644 index 000000000..9ce25d39c --- /dev/null +++ b/python/ray/tune/tests/test_run_experiment.py @@ -0,0 +1,241 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import unittest + +import ray +from ray.rllib import _register_all + +from ray.tune.result import TIMESTEPS_TOTAL +from ray.tune import Trainable, TuneError +from ray.tune import register_trainable, run_experiments +from ray.tune.logger import Logger +from ray.tune.experiment import Experiment +from ray.tune.trial import Trial, ExportFormat + + +class RunExperimentTest(unittest.TestCase): + def tearDown(self): + ray.shutdown() + _register_all() # re-register the evicted objects + + def testDict(self): + def train(config, reporter): + for i in range(100): + reporter(timesteps_total=i) + + register_trainable("f1", train) + trials = run_experiments({ + "foo": { + "run": "f1", + }, + "bar": { + "run": "f1", + } + }) + for trial in trials: + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) + + def testExperiment(self): + def train(config, reporter): + for i in range(100): + reporter(timesteps_total=i) + + register_trainable("f1", train) + exp1 = Experiment(**{ + "name": "foo", + "run": "f1", + }) + [trial] = run_experiments(exp1) + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) + + def 
testExperimentList(self): + def train(config, reporter): + for i in range(100): + reporter(timesteps_total=i) + + register_trainable("f1", train) + exp1 = Experiment(**{ + "name": "foo", + "run": "f1", + }) + exp2 = Experiment(**{ + "name": "bar", + "run": "f1", + }) + trials = run_experiments([exp1, exp2]) + for trial in trials: + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) + + def testAutoregisterTrainable(self): + def train(config, reporter): + for i in range(100): + reporter(timesteps_total=i) + + class B(Trainable): + def _train(self): + return {"timesteps_this_iter": 1, "done": True} + + register_trainable("f1", train) + trials = run_experiments({ + "foo": { + "run": train, + }, + "bar": { + "run": B + } + }) + for trial in trials: + self.assertEqual(trial.status, Trial.TERMINATED) + + def testCheckpointAtEnd(self): + class train(Trainable): + def _train(self): + return {"timesteps_this_iter": 1, "done": True} + + def _save(self, path): + checkpoint = path + "/checkpoint" + with open(checkpoint, "w") as f: + f.write("OK") + return checkpoint + + trials = run_experiments({ + "foo": { + "run": train, + "checkpoint_at_end": True + } + }) + for trial in trials: + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertTrue(trial.has_checkpoint()) + + def testExportFormats(self): + class train(Trainable): + def _train(self): + return {"timesteps_this_iter": 1, "done": True} + + def _export_model(self, export_formats, export_dir): + path = export_dir + "/exported" + with open(path, "w") as f: + f.write("OK") + return {export_formats[0]: path} + + trials = run_experiments({ + "foo": { + "run": train, + "export_formats": ["format"] + } + }) + for trial in trials: + self.assertEqual(trial.status, Trial.TERMINATED) + self.assertTrue( + os.path.exists(os.path.join(trial.logdir, "exported"))) + + def testInvalidExportFormats(self): + class train(Trainable): + def _train(self): + return 
{"timesteps_this_iter": 1, "done": True} + + def _export_model(self, export_formats, export_dir): + ExportFormat.validate(export_formats) + return {} + + def fail_trial(): + run_experiments({ + "foo": { + "run": train, + "export_formats": ["format"] + } + }) + + self.assertRaises(TuneError, fail_trial) + + def testCustomResources(self): + ray.shutdown() + ray.init(resources={"hi": 3}) + + class train(Trainable): + def _train(self): + return {"timesteps_this_iter": 1, "done": True} + + trials = run_experiments({ + "foo": { + "run": train, + "resources_per_trial": { + "cpu": 1, + "custom_resources": { + "hi": 2 + } + } + } + }) + for trial in trials: + self.assertEqual(trial.status, Trial.TERMINATED) + + def testCustomLogger(self): + class CustomLogger(Logger): + def on_result(self, result): + with open(os.path.join(self.logdir, "test.log"), "w") as f: + f.write("hi") + + [trial] = run_experiments({ + "foo": { + "run": "__fake", + "stop": { + "training_iteration": 1 + }, + "loggers": [CustomLogger] + } + }) + self.assertTrue(os.path.exists(os.path.join(trial.logdir, "test.log"))) + self.assertFalse( + os.path.exists(os.path.join(trial.logdir, "params.json"))) + + [trial] = run_experiments({ + "foo": { + "run": "__fake", + "stop": { + "training_iteration": 1 + } + } + }) + self.assertTrue( + os.path.exists(os.path.join(trial.logdir, "params.json"))) + + [trial] = run_experiments({ + "foo": { + "run": "__fake", + "stop": { + "training_iteration": 1 + }, + "loggers": [] + } + }) + self.assertFalse( + os.path.exists(os.path.join(trial.logdir, "params.json"))) + + def testCustomTrialString(self): + [trial] = run_experiments({ + "foo": { + "run": "__fake", + "stop": { + "training_iteration": 1 + }, + "trial_name_creator": + lambda t: "{}_{}_321".format(t.trainable_name, t.trial_id) + } + }) + self.assertEquals( + str(trial), "{}_{}_321".format(trial.trainable_name, + trial.trial_id)) + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", 
__file__])) diff --git a/python/ray/tune/tests/test_sync.py b/python/ray/tune/tests/test_sync.py new file mode 100644 index 000000000..b103236ed --- /dev/null +++ b/python/ray/tune/tests/test_sync.py @@ -0,0 +1,218 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import glob +import os +import shutil +import sys +import tempfile +import unittest + +import ray +from ray.rllib import _register_all + +from ray import tune +from ray.tune import TuneError +from ray.tune.syncer import CommandBasedClient + +if sys.version_info >= (3, 3): + from unittest.mock import patch +else: + from mock import patch + + +class TestSyncFunctionality(unittest.TestCase): + def setUp(self): + ray.init() + + def tearDown(self): + ray.shutdown() + _register_all() # re-register the evicted objects + + @patch("ray.tune.syncer.S3_PREFIX", "test") + def testNoUploadDir(self): + """No Upload Dir is given.""" + with self.assertRaises(AssertionError): + [trial] = tune.run( + "__fake", + name="foo", + max_failures=0, + **{ + "stop": { + "training_iteration": 1 + }, + "sync_to_cloud": "echo {source} {target}" + }).trials + + @patch("ray.tune.syncer.S3_PREFIX", "test") + def testCloudProperString(self): + with self.assertRaises(ValueError): + [trial] = tune.run( + "__fake", + name="foo", + max_failures=0, + **{ + "stop": { + "training_iteration": 1 + }, + "upload_dir": "test", + "sync_to_cloud": "ls {target}" + }).trials + + with self.assertRaises(ValueError): + [trial] = tune.run( + "__fake", + name="foo", + max_failures=0, + **{ + "stop": { + "training_iteration": 1 + }, + "upload_dir": "test", + "sync_to_cloud": "ls {source}" + }).trials + + tmpdir = tempfile.mkdtemp() + logfile = os.path.join(tmpdir, "test.log") + + [trial] = tune.run( + "__fake", + name="foo", + max_failures=0, + **{ + "stop": { + "training_iteration": 1 + }, + "upload_dir": "test", + "sync_to_cloud": "echo {source} {target} > " + logfile + }).trials + with 
open(logfile) as f: + lines = f.read() + self.assertTrue("test" in lines) + shutil.rmtree(tmpdir) + + def testClusterProperString(self): + """Tests that invalid commands throw..""" + with self.assertRaises(TuneError): + # This raises TuneError because logger is init in safe zone. + [trial] = tune.run( + "__fake", + name="foo", + max_failures=0, + **{ + "stop": { + "training_iteration": 1 + }, + "sync_to_driver": "ls {target}" + }).trials + + with self.assertRaises(TuneError): + # This raises TuneError because logger is init in safe zone. + [trial] = tune.run( + "__fake", + name="foo", + max_failures=0, + **{ + "stop": { + "training_iteration": 1 + }, + "sync_to_driver": "ls {source}" + }).trials + + with patch.object(CommandBasedClient, "execute") as mock_fn: + with patch("ray.services.get_node_ip_address") as mock_sync: + mock_sync.return_value = "0.0.0.0" + [trial] = tune.run( + "__fake", + name="foo", + max_failures=0, + **{ + "stop": { + "training_iteration": 1 + }, + "sync_to_driver": "echo {source} {target}" + }).trials + self.assertGreater(mock_fn.call_count, 0) + + def testCloudFunctions(self): + tmpdir = tempfile.mkdtemp() + tmpdir2 = tempfile.mkdtemp() + os.mkdir(os.path.join(tmpdir2, "foo")) + + def sync_func(local, remote): + for filename in glob.glob(os.path.join(local, "*.json")): + shutil.copy(filename, remote) + + [trial] = tune.run( + "__fake", + name="foo", + max_failures=0, + local_dir=tmpdir, + stop={ + "training_iteration": 1 + }, + upload_dir=tmpdir2, + sync_to_cloud=sync_func).trials + test_file_path = glob.glob(os.path.join(tmpdir2, "foo", "*.json")) + self.assertTrue(test_file_path) + shutil.rmtree(tmpdir) + shutil.rmtree(tmpdir2) + + def testClusterSyncFunction(self): + def sync_func_driver(source, target): + assert ":" in source, "Source {} not a remote path.".format(source) + assert ":" not in target, "Target is supposed to be local." 
+ with open(os.path.join(target, "test.log2"), "w") as f: + print("writing to", f.name) + f.write(source) + + [trial] = tune.run( + "__fake", + name="foo", + max_failures=0, + stop={ + "training_iteration": 1 + }, + sync_to_driver=sync_func_driver).trials + test_file_path = os.path.join(trial.logdir, "test.log2") + self.assertFalse(os.path.exists(test_file_path)) + + with patch("ray.services.get_node_ip_address") as mock_sync: + mock_sync.return_value = "0.0.0.0" + [trial] = tune.run( + "__fake", + name="foo", + max_failures=0, + stop={ + "training_iteration": 1 + }, + sync_to_driver=sync_func_driver).trials + test_file_path = os.path.join(trial.logdir, "test.log2") + self.assertTrue(os.path.exists(test_file_path)) + os.remove(test_file_path) + + def testNoSync(self): + """Sync should not run on a single node.""" + + def sync_func(source, target): + pass + + with patch.object(CommandBasedClient, "execute") as mock_sync: + [trial] = tune.run( + "__fake", + name="foo", + max_failures=0, + **{ + "stop": { + "training_iteration": 1 + }, + "sync_to_driver": sync_func + }).trials + self.assertEqual(mock_sync.call_count, 0) + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_track.py b/python/ray/tune/tests/test_track.py index 7bfc8683e..6bbe4ee1c 100644 --- a/python/ray/tune/tests/test_track.py +++ b/python/ray/tune/tests/test_track.py @@ -85,4 +85,6 @@ class TrackApiTest(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_trial_runner.py b/python/ray/tune/tests/test_trial_runner.py index 6c89ebce1..770e2fd19 100644 --- a/python/ray/tune/tests/test_trial_runner.py +++ b/python/ray/tune/tests/test_trial_runner.py @@ -2,41 +2,28 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import copy 
-import glob import os -import numpy as np import shutil import sys import tempfile -import time import unittest import ray from ray.rllib import _register_all from ray import tune -from ray.tune import Trainable, TuneError -from ray.tune import register_env, register_trainable, run_experiments +from ray.tune import TuneError, register_trainable from ray.tune.ray_trial_executor import RayTrialExecutor from ray.tune.schedulers import TrialScheduler, FIFOScheduler +from ray.tune.result import DONE from ray.tune.registry import _global_registry, TRAINABLE_CLASS -from ray.tune.result import ( - DEFAULT_RESULTS_DIR, TIMESTEPS_TOTAL, DONE, HOSTNAME, NODE_IP, PID, - EPISODES_TOTAL, TRAINING_ITERATION, TIMESTEPS_THIS_ITER, TIME_THIS_ITER_S, - TIME_TOTAL_S, TRIAL_ID, EXPERIMENT_TAG) -from ray.tune.logger import Logger -from ray.tune.syncer import CommandBasedClient -from ray.tune.util import pin_in_object_store, get_pinned_object, flatten_dict from ray.tune.experiment import Experiment -from ray.tune.trial import Trial, ExportFormat +from ray.tune.trial import Trial from ray.tune.trial_runner import TrialRunner from ray.tune.resources import Resources, json_to_resources, resources_to_json -from ray.tune.suggest import grid_search, BasicVariantGenerator +from ray.tune.suggest import BasicVariantGenerator from ray.tune.suggest.suggestion import (_MockSuggestionAlgorithm, SuggestionAlgorithm) -from ray.tune.suggest.variant_generator import (RecursiveDependencyError, - resolve_nested_dict) if sys.version_info >= (3, 3): from unittest.mock import patch @@ -44,1474 +31,6 @@ else: from mock import patch -class TrainableFunctionApiTest(unittest.TestCase): - def setUp(self): - ray.init(num_cpus=4, num_gpus=0, object_store_memory=150 * 1024 * 1024) - - def tearDown(self): - ray.shutdown() - _register_all() # re-register the evicted objects - - def checkAndReturnConsistentLogs(self, results, sleep_per_iter=None): - """Checks logging is the same between APIs. 
- - Ignore "DONE" for logging but checks that the - scheduler is notified properly with the last result. - """ - class_results = copy.deepcopy(results) - function_results = copy.deepcopy(results) - - class_output = [] - function_output = [] - scheduler_notif = [] - - class MockScheduler(FIFOScheduler): - def on_trial_complete(self, runner, trial, result): - scheduler_notif.append(result) - - class ClassAPILogger(Logger): - def on_result(self, result): - class_output.append(result) - - class FunctionAPILogger(Logger): - def on_result(self, result): - function_output.append(result) - - class _WrappedTrainable(Trainable): - def _setup(self, config): - del config - self._result_iter = copy.deepcopy(class_results) - - def _train(self): - if sleep_per_iter: - time.sleep(sleep_per_iter) - res = self._result_iter.pop(0) # This should not fail - if not self._result_iter: # Mark "Done" for last result - res[DONE] = True - return res - - def _function_trainable(config, reporter): - for result in function_results: - if sleep_per_iter: - time.sleep(sleep_per_iter) - reporter(**result) - - class_trainable_name = "class_trainable" - register_trainable(class_trainable_name, _WrappedTrainable) - - trials = run_experiments( - { - "function_api": { - "run": _function_trainable, - "loggers": [FunctionAPILogger], - }, - "class_api": { - "run": class_trainable_name, - "loggers": [ClassAPILogger], - }, - }, - raise_on_failed_trial=False, - scheduler=MockScheduler()) - - # Ignore these fields - NO_COMPARE_FIELDS = { - HOSTNAME, - NODE_IP, - TRIAL_ID, - EXPERIMENT_TAG, - PID, - TIME_THIS_ITER_S, - TIME_TOTAL_S, - DONE, # This is ignored because FunctionAPI has different handling - "timestamp", - "time_since_restore", - "experiment_id", - "date", - } - - self.assertEqual(len(class_output), len(results)) - self.assertEqual(len(function_output), len(results)) - - def as_comparable_result(result): - return { - k: v - for k, v in result.items() if k not in NO_COMPARE_FIELDS - } - - 
function_comparable = [ - as_comparable_result(result) for result in function_output - ] - class_comparable = [ - as_comparable_result(result) for result in class_output - ] - - self.assertEqual(function_comparable, class_comparable) - - self.assertEqual(sum(t.get(DONE) for t in scheduler_notif), 2) - self.assertEqual( - as_comparable_result(scheduler_notif[0]), - as_comparable_result(scheduler_notif[1])) - - # Make sure the last result is the same. - self.assertEqual( - as_comparable_result(trials[0].last_result), - as_comparable_result(trials[1].last_result)) - - return function_output, trials - - def testPinObject(self): - X = pin_in_object_store("hello") - - @ray.remote - def f(): - return get_pinned_object(X) - - self.assertEqual(ray.get(f.remote()), "hello") - - def testFetchPinned(self): - X = pin_in_object_store("hello") - - def train(config, reporter): - get_pinned_object(X) - reporter(timesteps_total=100, done=True) - - register_trainable("f1", train) - [trial] = run_experiments({ - "foo": { - "run": "f1", - } - }) - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 100) - - def testRegisterEnv(self): - register_env("foo", lambda: None) - self.assertRaises(TypeError, lambda: register_env("foo", 2)) - - def testRegisterEnvOverwrite(self): - def train(config, reporter): - reporter(timesteps_total=100, done=True) - - def train2(config, reporter): - reporter(timesteps_total=200, done=True) - - register_trainable("f1", train) - register_trainable("f1", train2) - [trial] = run_experiments({ - "foo": { - "run": "f1", - } - }) - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 200) - - def testRegisterTrainable(self): - def train(config, reporter): - pass - - class A(object): - pass - - class B(Trainable): - pass - - register_trainable("foo", train) - Experiment("test", train) - register_trainable("foo", B) - Experiment("test", B) - 
self.assertRaises(TypeError, lambda: register_trainable("foo", B())) - self.assertRaises(TuneError, lambda: Experiment("foo", B())) - self.assertRaises(TypeError, lambda: register_trainable("foo", A)) - self.assertRaises(TypeError, lambda: Experiment("foo", A)) - - def testTrainableCallable(self): - def dummy_fn(config, reporter, steps): - reporter(timesteps_total=steps, done=True) - - from functools import partial - steps = 500 - register_trainable("test", partial(dummy_fn, steps=steps)) - [trial] = run_experiments({ - "foo": { - "run": "test", - } - }) - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], steps) - [trial] = tune.run(partial(dummy_fn, steps=steps)).trials - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], steps) - - def testBuiltInTrainableResources(self): - class B(Trainable): - @classmethod - def default_resource_request(cls, config): - return Resources(cpu=config["cpu"], gpu=config["gpu"]) - - def _train(self): - return {"timesteps_this_iter": 1, "done": True} - - register_trainable("B", B) - - def f(cpus, gpus, queue_trials): - return run_experiments( - { - "foo": { - "run": "B", - "config": { - "cpu": cpus, - "gpu": gpus, - }, - } - }, - queue_trials=queue_trials)[0] - - # Should all succeed - self.assertEqual(f(0, 0, False).status, Trial.TERMINATED) - self.assertEqual(f(1, 0, True).status, Trial.TERMINATED) - self.assertEqual(f(1, 0, True).status, Trial.TERMINATED) - - # Too large resource request - self.assertRaises(TuneError, lambda: f(100, 100, False)) - self.assertRaises(TuneError, lambda: f(0, 100, False)) - self.assertRaises(TuneError, lambda: f(100, 0, False)) - - # TODO(ekl) how can we test this is queued (hangs)? 
- # f(100, 0, True) - - def testRewriteEnv(self): - def train(config, reporter): - reporter(timesteps_total=1) - - register_trainable("f1", train) - - [trial] = run_experiments({ - "foo": { - "run": "f1", - "env": "CartPole-v0", - } - }) - self.assertEqual(trial.config["env"], "CartPole-v0") - - def testConfigPurity(self): - def train(config, reporter): - assert config == {"a": "b"}, config - reporter(timesteps_total=1) - - register_trainable("f1", train) - run_experiments({ - "foo": { - "run": "f1", - "config": { - "a": "b" - }, - } - }) - - def testLogdir(self): - def train(config, reporter): - assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd() - reporter(timesteps_total=1) - - register_trainable("f1", train) - run_experiments({ - "foo": { - "run": "f1", - "local_dir": "/tmp/logdir", - "config": { - "a": "b" - }, - } - }) - - def testLogdirStartingWithTilde(self): - local_dir = "~/ray_results/local_dir" - - def train(config, reporter): - cwd = os.getcwd() - assert cwd.startswith(os.path.expanduser(local_dir)), cwd - assert not cwd.startswith("~"), cwd - reporter(timesteps_total=1) - - register_trainable("f1", train) - run_experiments({ - "foo": { - "run": "f1", - "local_dir": local_dir, - "config": { - "a": "b" - }, - } - }) - - def testLongFilename(self): - def train(config, reporter): - assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd() - reporter(timesteps_total=1) - - register_trainable("f1", train) - run_experiments({ - "foo": { - "run": "f1", - "local_dir": "/tmp/logdir", - "config": { - "a" * 50: tune.sample_from(lambda spec: 5.0 / 7), - "b" * 50: tune.sample_from(lambda spec: "long" * 40), - }, - } - }) - - def testBadParams(self): - def f(): - run_experiments({"foo": {}}) - - self.assertRaises(TuneError, f) - - def testBadParams2(self): - def f(): - run_experiments({ - "foo": { - "run": "asdf", - "bah": "this param is not allowed", - } - }) - - self.assertRaises(TuneError, f) - - def testBadParams3(self): - def f(): - run_experiments({ - "foo": { - 
"run": grid_search("invalid grid search"), - } - }) - - self.assertRaises(TuneError, f) - - def testBadParams4(self): - def f(): - run_experiments({ - "foo": { - "run": "asdf", - } - }) - - self.assertRaises(TuneError, f) - - def testBadParams5(self): - def f(): - run_experiments({"foo": {"run": "PPO", "stop": {"asdf": 1}}}) - - self.assertRaises(TuneError, f) - - def testBadParams6(self): - def f(): - run_experiments({ - "foo": { - "run": "PPO", - "resources_per_trial": { - "asdf": 1 - } - } - }) - - self.assertRaises(TuneError, f) - - def testBadStoppingReturn(self): - def train(config, reporter): - reporter() - - register_trainable("f1", train) - - def f(): - run_experiments({ - "foo": { - "run": "f1", - "stop": { - "time": 10 - }, - } - }) - - self.assertRaises(TuneError, f) - - def testNestedStoppingReturn(self): - def train(config, reporter): - for i in range(10): - reporter(test={"test1": {"test2": i}}) - - with self.assertRaises(TuneError): - [trial] = tune.run( - train, stop={ - "test": { - "test1": { - "test2": 6 - } - } - }).trials - [trial] = tune.run(train, stop={"test/test1/test2": 6}).trials - self.assertEqual(trial.last_result["training_iteration"], 7) - - def testStoppingFunction(self): - def train(config, reporter): - for i in range(10): - reporter(test=i) - - def stop(trial_id, result): - return result["test"] > 6 - - [trial] = tune.run(train, stop=stop).trials - self.assertEqual(trial.last_result["training_iteration"], 8) - - def testStoppingMemberFunction(self): - def train(config, reporter): - for i in range(10): - reporter(test=i) - - class Stopper: - def stop(self, trial_id, result): - return result["test"] > 6 - - [trial] = tune.run(train, stop=Stopper().stop).trials - self.assertEqual(trial.last_result["training_iteration"], 8) - - def testBadStoppingFunction(self): - def train(config, reporter): - for i in range(10): - reporter(test=i) - - class Stopper: - def stop(self, result): - return result["test"] > 6 - - def stop(result): - return 
result["test"] > 6 - - with self.assertRaises(ValueError): - tune.run(train, stop=Stopper().stop) - with self.assertRaises(ValueError): - tune.run(train, stop=stop) - - def testEarlyReturn(self): - def train(config, reporter): - reporter(timesteps_total=100, done=True) - time.sleep(99999) - - register_trainable("f1", train) - [trial] = run_experiments({ - "foo": { - "run": "f1", - } - }) - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 100) - - def testReporterNoUsage(self): - def run_task(config, reporter): - print("hello") - - experiment = Experiment(run=run_task, name="ray_crash_repro") - [trial] = ray.tune.run(experiment).trials - print(trial.last_result) - self.assertEqual(trial.last_result[DONE], True) - - def testErrorReturn(self): - def train(config, reporter): - raise Exception("uh oh") - - register_trainable("f1", train) - - def f(): - run_experiments({ - "foo": { - "run": "f1", - } - }) - - self.assertRaises(TuneError, f) - - def testSuccess(self): - def train(config, reporter): - for i in range(100): - reporter(timesteps_total=i) - - register_trainable("f1", train) - [trial] = run_experiments({ - "foo": { - "run": "f1", - } - }) - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) - - def testNoRaiseFlag(self): - def train(config, reporter): - raise Exception() - - register_trainable("f1", train) - - [trial] = run_experiments( - { - "foo": { - "run": "f1", - } - }, raise_on_failed_trial=False) - self.assertEqual(trial.status, Trial.ERROR) - - def testReportInfinity(self): - def train(config, reporter): - for i in range(100): - reporter(mean_accuracy=float("inf")) - - register_trainable("f1", train) - [trial] = run_experiments({ - "foo": { - "run": "f1", - } - }) - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result["mean_accuracy"], float("inf")) - - def testNestedResults(self): - def create_result(i): - 
return {"test": {"1": {"2": {"3": i, "4": False}}}} - - flattened_keys = list(flatten_dict(create_result(0))) - - class _MockScheduler(FIFOScheduler): - results = [] - - def on_trial_result(self, trial_runner, trial, result): - self.results += [result] - return TrialScheduler.CONTINUE - - def on_trial_complete(self, trial_runner, trial, result): - self.complete_result = result - - def train(config, reporter): - for i in range(100): - reporter(**create_result(i)) - - algo = _MockSuggestionAlgorithm() - scheduler = _MockScheduler() - [trial] = tune.run( - train, - scheduler=scheduler, - search_alg=algo, - stop={ - "test/1/2/3": 20 - }).trials - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result["test"]["1"]["2"]["3"], 20) - self.assertEqual(trial.last_result["test"]["1"]["2"]["4"], False) - self.assertEqual(trial.last_result[TRAINING_ITERATION], 21) - self.assertEqual(len(scheduler.results), 20) - self.assertTrue( - all( - set(result) >= set(flattened_keys) - for result in scheduler.results)) - self.assertTrue(set(scheduler.complete_result) >= set(flattened_keys)) - self.assertEqual(len(algo.results), 20) - self.assertTrue( - all(set(result) >= set(flattened_keys) for result in algo.results)) - with self.assertRaises(TuneError): - [trial] = tune.run(train, stop={"1/2/3": 20}) - with self.assertRaises(TuneError): - [trial] = tune.run(train, stop={"test": 1}).trials - - def testReportTimeStep(self): - # Test that no timestep count are logged if never the Trainable never - # returns any. - results1 = [dict(mean_accuracy=5, done=i == 99) for i in range(100)] - logs1, _ = self.checkAndReturnConsistentLogs(results1) - - self.assertTrue(all(log[TIMESTEPS_TOTAL] is None for log in logs1)) - - # Test that no timesteps_this_iter are logged if only timesteps_total - # are returned. 
- results2 = [dict(timesteps_total=5, done=i == 9) for i in range(10)] - logs2, _ = self.checkAndReturnConsistentLogs(results2) - - # Re-run the same trials but with added delay. This is to catch some - # inconsistent timestep counting that was present in the multi-threaded - # FunctionRunner. This part of the test can be removed once the - # multi-threaded FunctionRunner is removed from ray/tune. - # TODO: remove once the multi-threaded function runner is gone. - logs2, _ = self.checkAndReturnConsistentLogs(results2, 0.5) - - # check all timesteps_total report the same value - self.assertTrue(all(log[TIMESTEPS_TOTAL] == 5 for log in logs2)) - # check that none of the logs report timesteps_this_iter - self.assertFalse( - any(hasattr(log, TIMESTEPS_THIS_ITER) for log in logs2)) - - # Test that timesteps_total and episodes_total are reported when - # timesteps_this_iter and episodes_this_iter despite only return zeros. - results3 = [ - dict(timesteps_this_iter=0, episodes_this_iter=0) - for i in range(10) - ] - logs3, _ = self.checkAndReturnConsistentLogs(results3) - - self.assertTrue(all(log[TIMESTEPS_TOTAL] == 0 for log in logs3)) - self.assertTrue(all(log[EPISODES_TOTAL] == 0 for log in logs3)) - - # Test that timesteps_total and episodes_total are properly counted - # when timesteps_this_iter and episodes_this_iter report non-zero - # values. - results4 = [ - dict(timesteps_this_iter=3, episodes_this_iter=i) - for i in range(10) - ] - logs4, _ = self.checkAndReturnConsistentLogs(results4) - - # The last reported result should not be double-logged. 
- self.assertEqual(logs4[-1][TIMESTEPS_TOTAL], 30) - self.assertNotEqual(logs4[-2][TIMESTEPS_TOTAL], - logs4[-1][TIMESTEPS_TOTAL]) - self.assertEqual(logs4[-1][EPISODES_TOTAL], 45) - self.assertNotEqual(logs4[-2][EPISODES_TOTAL], - logs4[-1][EPISODES_TOTAL]) - - def testAllValuesReceived(self): - results1 = [ - dict(timesteps_total=(i + 1), my_score=i**2, done=i == 4) - for i in range(5) - ] - - logs1, _ = self.checkAndReturnConsistentLogs(results1) - - # check if the correct number of results were reported - self.assertEqual(len(logs1), len(results1)) - - def check_no_missing(reported_result, result): - common_results = [reported_result[k] == result[k] for k in result] - return all(common_results) - - # check that no result was dropped or modified - complete_results = [ - check_no_missing(log, result) - for log, result in zip(logs1, results1) - ] - self.assertTrue(all(complete_results)) - - # check if done was logged exactly once - self.assertEqual(len([r for r in logs1 if r.get("done")]), 1) - - def testNoDoneReceived(self): - # repeat same test but without explicitly reporting done=True - results1 = [ - dict(timesteps_total=(i + 1), my_score=i**2) for i in range(5) - ] - - logs1, trials = self.checkAndReturnConsistentLogs(results1) - - # check if the correct number of results were reported. 
- self.assertEqual(len(logs1), len(results1)) - - def check_no_missing(reported_result, result): - common_results = [reported_result[k] == result[k] for k in result] - return all(common_results) - - # check that no result was dropped or modified - complete_results1 = [ - check_no_missing(log, result) - for log, result in zip(logs1, results1) - ] - self.assertTrue(all(complete_results1)) - - def testCheckpointDict(self): - class TestTrain(Trainable): - def _setup(self, config): - self.state = {"hi": 1} - - def _train(self): - return {"timesteps_this_iter": 1, "done": True} - - def _save(self, path): - return self.state - - def _restore(self, state): - self.state = state - - test_trainable = TestTrain() - result = test_trainable.save() - test_trainable.state["hi"] = 2 - test_trainable.restore(result) - self.assertEqual(test_trainable.state["hi"], 1) - - trials = run_experiments({ - "foo": { - "run": TestTrain, - "checkpoint_at_end": True - } - }) - for trial in trials: - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertTrue(trial.has_checkpoint()) - - def testMultipleCheckpoints(self): - class TestTrain(Trainable): - def _setup(self, config): - self.state = {"hi": 1, "iter": 0} - - def _train(self): - self.state["iter"] += 1 - return {"timesteps_this_iter": 1, "done": True} - - def _save(self, path): - return self.state - - def _restore(self, state): - self.state = state - - test_trainable = TestTrain() - checkpoint_1 = test_trainable.save() - test_trainable.train() - checkpoint_2 = test_trainable.save() - self.assertNotEqual(checkpoint_1, checkpoint_2) - test_trainable.restore(checkpoint_2) - self.assertEqual(test_trainable.state["iter"], 1) - test_trainable.restore(checkpoint_1) - self.assertEqual(test_trainable.state["iter"], 0) - - trials = run_experiments({ - "foo": { - "run": TestTrain, - "checkpoint_at_end": True - } - }) - for trial in trials: - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertTrue(trial.has_checkpoint()) - - def 
testIterationCounter(self): - def train(config, reporter): - for i in range(100): - reporter(itr=i, timesteps_this_iter=1) - - register_trainable("exp", train) - config = { - "my_exp": { - "run": "exp", - "config": { - "iterations": 100, - }, - "stop": { - "timesteps_total": 100 - }, - } - } - [trial] = run_experiments(config) - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TRAINING_ITERATION], 100) - self.assertEqual(trial.last_result["itr"], 99) - - -class RunExperimentTest(unittest.TestCase): - def tearDown(self): - ray.shutdown() - _register_all() # re-register the evicted objects - - def testDict(self): - def train(config, reporter): - for i in range(100): - reporter(timesteps_total=i) - - register_trainable("f1", train) - trials = run_experiments({ - "foo": { - "run": "f1", - }, - "bar": { - "run": "f1", - } - }) - for trial in trials: - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) - - def testExperiment(self): - def train(config, reporter): - for i in range(100): - reporter(timesteps_total=i) - - register_trainable("f1", train) - exp1 = Experiment(**{ - "name": "foo", - "run": "f1", - }) - [trial] = run_experiments(exp1) - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) - - def testExperimentList(self): - def train(config, reporter): - for i in range(100): - reporter(timesteps_total=i) - - register_trainable("f1", train) - exp1 = Experiment(**{ - "name": "foo", - "run": "f1", - }) - exp2 = Experiment(**{ - "name": "bar", - "run": "f1", - }) - trials = run_experiments([exp1, exp2]) - for trial in trials: - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 99) - - def testAutoregisterTrainable(self): - def train(config, reporter): - for i in range(100): - reporter(timesteps_total=i) - - class B(Trainable): - def _train(self): - return 
{"timesteps_this_iter": 1, "done": True} - - register_trainable("f1", train) - trials = run_experiments({ - "foo": { - "run": train, - }, - "bar": { - "run": B - } - }) - for trial in trials: - self.assertEqual(trial.status, Trial.TERMINATED) - - def testCheckpointAtEnd(self): - class train(Trainable): - def _train(self): - return {"timesteps_this_iter": 1, "done": True} - - def _save(self, path): - checkpoint = path + "/checkpoint" - with open(checkpoint, "w") as f: - f.write("OK") - return checkpoint - - trials = run_experiments({ - "foo": { - "run": train, - "checkpoint_at_end": True - } - }) - for trial in trials: - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertTrue(trial.has_checkpoint()) - - def testExportFormats(self): - class train(Trainable): - def _train(self): - return {"timesteps_this_iter": 1, "done": True} - - def _export_model(self, export_formats, export_dir): - path = export_dir + "/exported" - with open(path, "w") as f: - f.write("OK") - return {export_formats[0]: path} - - trials = run_experiments({ - "foo": { - "run": train, - "export_formats": ["format"] - } - }) - for trial in trials: - self.assertEqual(trial.status, Trial.TERMINATED) - self.assertTrue( - os.path.exists(os.path.join(trial.logdir, "exported"))) - - def testInvalidExportFormats(self): - class train(Trainable): - def _train(self): - return {"timesteps_this_iter": 1, "done": True} - - def _export_model(self, export_formats, export_dir): - ExportFormat.validate(export_formats) - return {} - - def fail_trial(): - run_experiments({ - "foo": { - "run": train, - "export_formats": ["format"] - } - }) - - self.assertRaises(TuneError, fail_trial) - - def testCustomResources(self): - ray.shutdown() - ray.init(resources={"hi": 3}) - - class train(Trainable): - def _train(self): - return {"timesteps_this_iter": 1, "done": True} - - trials = run_experiments({ - "foo": { - "run": train, - "resources_per_trial": { - "cpu": 1, - "custom_resources": { - "hi": 2 - } - } - } - }) - 
for trial in trials: - self.assertEqual(trial.status, Trial.TERMINATED) - - def testCustomLogger(self): - class CustomLogger(Logger): - def on_result(self, result): - with open(os.path.join(self.logdir, "test.log"), "w") as f: - f.write("hi") - - [trial] = run_experiments({ - "foo": { - "run": "__fake", - "stop": { - "training_iteration": 1 - }, - "loggers": [CustomLogger] - } - }) - self.assertTrue(os.path.exists(os.path.join(trial.logdir, "test.log"))) - self.assertFalse( - os.path.exists(os.path.join(trial.logdir, "params.json"))) - - [trial] = run_experiments({ - "foo": { - "run": "__fake", - "stop": { - "training_iteration": 1 - } - } - }) - self.assertTrue( - os.path.exists(os.path.join(trial.logdir, "params.json"))) - - [trial] = run_experiments({ - "foo": { - "run": "__fake", - "stop": { - "training_iteration": 1 - }, - "loggers": [] - } - }) - self.assertFalse( - os.path.exists(os.path.join(trial.logdir, "params.json"))) - - def testCustomTrialString(self): - [trial] = run_experiments({ - "foo": { - "run": "__fake", - "stop": { - "training_iteration": 1 - }, - "trial_name_creator": - lambda t: "{}_{}_321".format(t.trainable_name, t.trial_id) - } - }) - self.assertEquals( - str(trial), "{}_{}_321".format(trial.trainable_name, - trial.trial_id)) - - -class TestSyncFunctionality(unittest.TestCase): - def setUp(self): - ray.init() - - def tearDown(self): - ray.shutdown() - _register_all() # re-register the evicted objects - - @patch("ray.tune.syncer.S3_PREFIX", "test") - def testNoUploadDir(self): - """No Upload Dir is given.""" - with self.assertRaises(AssertionError): - [trial] = tune.run( - "__fake", - name="foo", - max_failures=0, - **{ - "stop": { - "training_iteration": 1 - }, - "sync_to_cloud": "echo {source} {target}" - }).trials - - @patch("ray.tune.syncer.S3_PREFIX", "test") - def testCloudProperString(self): - with self.assertRaises(ValueError): - [trial] = tune.run( - "__fake", - name="foo", - max_failures=0, - **{ - "stop": { - 
"training_iteration": 1 - }, - "upload_dir": "test", - "sync_to_cloud": "ls {target}" - }).trials - - with self.assertRaises(ValueError): - [trial] = tune.run( - "__fake", - name="foo", - max_failures=0, - **{ - "stop": { - "training_iteration": 1 - }, - "upload_dir": "test", - "sync_to_cloud": "ls {source}" - }).trials - - tmpdir = tempfile.mkdtemp() - logfile = os.path.join(tmpdir, "test.log") - - [trial] = tune.run( - "__fake", - name="foo", - max_failures=0, - **{ - "stop": { - "training_iteration": 1 - }, - "upload_dir": "test", - "sync_to_cloud": "echo {source} {target} > " + logfile - }).trials - with open(logfile) as f: - lines = f.read() - self.assertTrue("test" in lines) - shutil.rmtree(tmpdir) - - def testClusterProperString(self): - """Tests that invalid commands throw..""" - with self.assertRaises(TuneError): - # This raises TuneError because logger is init in safe zone. - [trial] = tune.run( - "__fake", - name="foo", - max_failures=0, - **{ - "stop": { - "training_iteration": 1 - }, - "sync_to_driver": "ls {target}" - }).trials - - with self.assertRaises(TuneError): - # This raises TuneError because logger is init in safe zone. 
- [trial] = tune.run( - "__fake", - name="foo", - max_failures=0, - **{ - "stop": { - "training_iteration": 1 - }, - "sync_to_driver": "ls {source}" - }).trials - - with patch.object(CommandBasedClient, "execute") as mock_fn: - with patch("ray.services.get_node_ip_address") as mock_sync: - mock_sync.return_value = "0.0.0.0" - [trial] = tune.run( - "__fake", - name="foo", - max_failures=0, - **{ - "stop": { - "training_iteration": 1 - }, - "sync_to_driver": "echo {source} {target}" - }).trials - self.assertGreater(mock_fn.call_count, 0) - - def testCloudFunctions(self): - tmpdir = tempfile.mkdtemp() - tmpdir2 = tempfile.mkdtemp() - os.mkdir(os.path.join(tmpdir2, "foo")) - - def sync_func(local, remote): - for filename in glob.glob(os.path.join(local, "*.json")): - shutil.copy(filename, remote) - - [trial] = tune.run( - "__fake", - name="foo", - max_failures=0, - local_dir=tmpdir, - stop={ - "training_iteration": 1 - }, - upload_dir=tmpdir2, - sync_to_cloud=sync_func).trials - test_file_path = glob.glob(os.path.join(tmpdir2, "foo", "*.json")) - self.assertTrue(test_file_path) - shutil.rmtree(tmpdir) - shutil.rmtree(tmpdir2) - - def testClusterSyncFunction(self): - def sync_func_driver(source, target): - assert ":" in source, "Source {} not a remote path.".format(source) - assert ":" not in target, "Target is supposed to be local." 
- with open(os.path.join(target, "test.log2"), "w") as f: - print("writing to", f.name) - f.write(source) - - [trial] = tune.run( - "__fake", - name="foo", - max_failures=0, - stop={ - "training_iteration": 1 - }, - sync_to_driver=sync_func_driver).trials - test_file_path = os.path.join(trial.logdir, "test.log2") - self.assertFalse(os.path.exists(test_file_path)) - - with patch("ray.services.get_node_ip_address") as mock_sync: - mock_sync.return_value = "0.0.0.0" - [trial] = tune.run( - "__fake", - name="foo", - max_failures=0, - stop={ - "training_iteration": 1 - }, - sync_to_driver=sync_func_driver).trials - test_file_path = os.path.join(trial.logdir, "test.log2") - self.assertTrue(os.path.exists(test_file_path)) - os.remove(test_file_path) - - def testNoSync(self): - """Sync should not run on a single node.""" - - def sync_func(source, target): - pass - - with patch.object(CommandBasedClient, "execute") as mock_sync: - [trial] = tune.run( - "__fake", - name="foo", - max_failures=0, - **{ - "stop": { - "training_iteration": 1 - }, - "sync_to_driver": sync_func - }).trials - self.assertEqual(mock_sync.call_count, 0) - - -class VariantGeneratorTest(unittest.TestCase): - def setUp(self): - ray.init() - - def tearDown(self): - ray.shutdown() - _register_all() # re-register the evicted objects - - def generate_trials(self, spec, name): - suggester = BasicVariantGenerator() - suggester.add_configurations({name: spec}) - return suggester.next_trials() - - def testParseToTrials(self): - trials = self.generate_trials({ - "run": "PPO", - "num_samples": 2, - "max_failures": 5, - "config": { - "env": "Pong-v0", - "foo": "bar" - }, - }, "tune-pong") - trials = list(trials) - self.assertEqual(len(trials), 2) - self.assertTrue("PPO_Pong-v0" in str(trials[0])) - self.assertEqual(trials[0].config, {"foo": "bar", "env": "Pong-v0"}) - self.assertEqual(trials[0].trainable_name, "PPO") - self.assertEqual(trials[0].experiment_tag, "0") - self.assertEqual(trials[0].max_failures, 5) - 
self.assertEqual(trials[0].evaluated_params, {}) - self.assertEqual(trials[0].local_dir, - os.path.join(DEFAULT_RESULTS_DIR, "tune-pong")) - self.assertEqual(trials[1].experiment_tag, "1") - - def testEval(self): - trials = self.generate_trials({ - "run": "PPO", - "config": { - "foo": { - "eval": "2 + 2" - }, - }, - }, "eval") - trials = list(trials) - self.assertEqual(len(trials), 1) - self.assertEqual(trials[0].config, {"foo": 4}) - self.assertEqual(trials[0].evaluated_params, {"foo": 4}) - self.assertEqual(trials[0].experiment_tag, "0_foo=4") - - def testGridSearch(self): - trials = self.generate_trials({ - "run": "PPO", - "config": { - "bar": { - "grid_search": [True, False] - }, - "foo": { - "grid_search": [1, 2, 3] - }, - "baz": "asd", - }, - }, "grid_search") - trials = list(trials) - self.assertEqual(len(trials), 6) - self.assertEqual(trials[0].config, { - "bar": True, - "foo": 1, - "baz": "asd", - }) - self.assertEqual(trials[0].evaluated_params, { - "bar": True, - "foo": 1, - }) - self.assertEqual(trials[0].experiment_tag, "0_bar=True,foo=1") - - self.assertEqual(trials[1].config, { - "bar": False, - "foo": 1, - "baz": "asd", - }) - self.assertEqual(trials[1].evaluated_params, { - "bar": False, - "foo": 1, - }) - self.assertEqual(trials[1].experiment_tag, "1_bar=False,foo=1") - - self.assertEqual(trials[2].config, { - "bar": True, - "foo": 2, - "baz": "asd", - }) - self.assertEqual(trials[2].evaluated_params, { - "bar": True, - "foo": 2, - }) - - self.assertEqual(trials[3].config, { - "bar": False, - "foo": 2, - "baz": "asd", - }) - self.assertEqual(trials[3].evaluated_params, { - "bar": False, - "foo": 2, - }) - - self.assertEqual(trials[4].config, { - "bar": True, - "foo": 3, - "baz": "asd", - }) - self.assertEqual(trials[4].evaluated_params, { - "bar": True, - "foo": 3, - }) - - self.assertEqual(trials[5].config, { - "bar": False, - "foo": 3, - "baz": "asd", - }) - self.assertEqual(trials[5].evaluated_params, { - "bar": False, - "foo": 3, - }) - - def 
testGridSearchAndEval(self): - trials = self.generate_trials({ - "run": "PPO", - "config": { - "qux": tune.sample_from(lambda spec: 2 + 2), - "bar": grid_search([True, False]), - "foo": grid_search([1, 2, 3]), - "baz": "asd", - }, - }, "grid_eval") - trials = list(trials) - self.assertEqual(len(trials), 6) - self.assertEqual(trials[0].config, { - "bar": True, - "foo": 1, - "qux": 4, - "baz": "asd", - }) - self.assertEqual(trials[0].evaluated_params, { - "bar": True, - "foo": 1, - "qux": 4, - }) - self.assertEqual(trials[0].experiment_tag, "0_bar=True,foo=1,qux=4") - - def testConditionResolution(self): - trials = self.generate_trials({ - "run": "PPO", - "config": { - "x": 1, - "y": tune.sample_from(lambda spec: spec.config.x + 1), - "z": tune.sample_from(lambda spec: spec.config.y + 1), - }, - }, "condition_resolution") - trials = list(trials) - self.assertEqual(len(trials), 1) - self.assertEqual(trials[0].config, {"x": 1, "y": 2, "z": 3}) - self.assertEqual(trials[0].evaluated_params, {"y": 2, "z": 3}) - self.assertEqual(trials[0].experiment_tag, "0_y=2,z=3") - - def testDependentLambda(self): - trials = self.generate_trials({ - "run": "PPO", - "config": { - "x": grid_search([1, 2]), - "y": tune.sample_from(lambda spec: spec.config.x * 100), - }, - }, "dependent_lambda") - trials = list(trials) - self.assertEqual(len(trials), 2) - self.assertEqual(trials[0].config, {"x": 1, "y": 100}) - self.assertEqual(trials[1].config, {"x": 2, "y": 200}) - - def testDependentGridSearch(self): - trials = self.generate_trials({ - "run": "PPO", - "config": { - "x": grid_search([ - tune.sample_from(lambda spec: spec.config.y * 100), - tune.sample_from(lambda spec: spec.config.y * 200) - ]), - "y": tune.sample_from(lambda spec: 1), - }, - }, "dependent_grid_search") - trials = list(trials) - self.assertEqual(len(trials), 2) - self.assertEqual(trials[0].config, {"x": 100, "y": 1}) - self.assertEqual(trials[1].config, {"x": 200, "y": 1}) - - def testNestedValues(self): - trials = 
self.generate_trials({ - "run": "PPO", - "config": { - "x": { - "y": { - "z": tune.sample_from(lambda spec: 1) - } - }, - "y": tune.sample_from(lambda spec: 12), - "z": tune.sample_from(lambda spec: spec.config.x.y.z * 100), - }, - }, "nested_values") - trials = list(trials) - self.assertEqual(len(trials), 1) - self.assertEqual(trials[0].config, { - "x": { - "y": { - "z": 1 - } - }, - "y": 12, - "z": 100 - }) - self.assertEqual(trials[0].evaluated_params, { - "x/y/z": 1, - "y": 12, - "z": 100 - }) - - def testLogUniform(self): - sampler = tune.loguniform(1e-10, 1e-1).func - results = [sampler(None) for i in range(1000)] - assert abs(np.log(min(results)) / np.log(10) - -10) < 0.1 - assert abs(np.log(max(results)) / np.log(10) - -1) < 0.1 - - sampler_e = tune.loguniform(np.e**-4, np.e, base=np.e).func - results_e = [sampler_e(None) for i in range(1000)] - assert abs(np.log(min(results_e)) - -4) < 0.1 - assert abs(np.log(max(results_e)) - 1) < 0.1 - - def test_resolve_dict(self): - config = { - "a": { - "b": 1, - "c": 2, - }, - "b": { - "a": 3 - } - } - resolved = resolve_nested_dict(config) - for k, v in [(("a", "b"), 1), (("a", "c"), 2), (("b", "a"), 3)]: - self.assertEqual(resolved.get(k), v) - - def testRecursiveDep(self): - try: - list( - self.generate_trials({ - "run": "PPO", - "config": { - "foo": tune.sample_from(lambda spec: spec.config.foo), - }, - }, "recursive_dep")) - except RecursiveDependencyError as e: - assert "`foo` recursively depends on" in str(e), e - else: - assert False - - def testMaxConcurrentSuggestions(self): - """Checks that next_trials() supports throttling.""" - experiment_spec = { - "run": "PPO", - "num_samples": 6, - } - experiments = [Experiment.from_json("test", experiment_spec)] - - searcher = _MockSuggestionAlgorithm(max_concurrent=4) - searcher.add_configurations(experiments) - trials = searcher.next_trials() - self.assertEqual(len(trials), 4) - self.assertEqual(searcher.next_trials(), []) - - finished_trial = trials.pop() - 
searcher.on_trial_complete(finished_trial.trial_id) - self.assertEqual(len(searcher.next_trials()), 1) - - finished_trial = trials.pop() - searcher.on_trial_complete(finished_trial.trial_id) - - finished_trial = trials.pop() - searcher.on_trial_complete(finished_trial.trial_id) - - finished_trial = trials.pop() - searcher.on_trial_complete(finished_trial.trial_id) - self.assertEqual(len(searcher.next_trials()), 1) - self.assertEqual(len(searcher.next_trials()), 0) - - def create_mock_components(): class _MockScheduler(FIFOScheduler): errored_trials = [] @@ -2586,4 +1105,6 @@ class ResourcesTest(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_trial_scheduler.py b/python/ray/tune/tests/test_trial_scheduler.py index 559f9de4d..4ca60236e 100644 --- a/python/ray/tune/tests/test_trial_scheduler.py +++ b/python/ray/tune/tests/test_trial_scheduler.py @@ -1194,4 +1194,6 @@ class AsyncHyperBandSuite(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_tune_restore.py b/python/ray/tune/tests/test_tune_restore.py index 769c1bdae..b24a755a5 100644 --- a/python/ray/tune/tests/test_tune_restore.py +++ b/python/ray/tune/tests/test_tune_restore.py @@ -13,7 +13,7 @@ import numpy as np import ray from ray import tune -from ray.tests.utils import recursive_fnmatch +from ray.test_utils import recursive_fnmatch from ray.tune.util import validate_save_restore from ray.rllib import _register_all from ray.tune.suggest.hyperopt import HyperOptSearch @@ -277,4 +277,6 @@ class SigOptWarmStartTest(AbstractWarmStartTest, unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_tune_save_restore.py 
b/python/ray/tune/tests/test_tune_save_restore.py index f474ad664..358436bed 100644 --- a/python/ray/tune/tests/test_tune_save_restore.py +++ b/python/ray/tune/tests/test_tune_save_restore.py @@ -151,4 +151,6 @@ class SerialTuneRelativeLocalDirTest(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_tune_server.py b/python/ray/tune/tests/test_tune_server.py index 24bbef5a9..9f9d6e91e 100644 --- a/python/ray/tune/tests/test_tune_server.py +++ b/python/ray/tune/tests/test_tune_server.py @@ -144,4 +144,6 @@ class TuneServerSuite(unittest.TestCase): if __name__ == "__main__": - unittest.main(verbosity=2) + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_var.py b/python/ray/tune/tests/test_var.py new file mode 100644 index 000000000..8a0fba749 --- /dev/null +++ b/python/ray/tune/tests/test_var.py @@ -0,0 +1,319 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import numpy as np +import unittest + +import ray +from ray.rllib import _register_all + +from ray import tune +from ray.tune.result import DEFAULT_RESULTS_DIR +from ray.tune.experiment import Experiment +from ray.tune.suggest import grid_search, BasicVariantGenerator +from ray.tune.suggest.suggestion import _MockSuggestionAlgorithm +from ray.tune.suggest.variant_generator import (RecursiveDependencyError, + resolve_nested_dict) + + +class VariantGeneratorTest(unittest.TestCase): + def setUp(self): + ray.init() + + def tearDown(self): + ray.shutdown() + _register_all() # re-register the evicted objects + + def generate_trials(self, spec, name): + suggester = BasicVariantGenerator() + suggester.add_configurations({name: spec}) + return suggester.next_trials() + + def testParseToTrials(self): + trials = self.generate_trials({ + "run": "PPO", + 
"num_samples": 2, + "max_failures": 5, + "config": { + "env": "Pong-v0", + "foo": "bar" + }, + }, "tune-pong") + trials = list(trials) + self.assertEqual(len(trials), 2) + self.assertTrue("PPO_Pong-v0" in str(trials[0])) + self.assertEqual(trials[0].config, {"foo": "bar", "env": "Pong-v0"}) + self.assertEqual(trials[0].trainable_name, "PPO") + self.assertEqual(trials[0].experiment_tag, "0") + self.assertEqual(trials[0].max_failures, 5) + self.assertEqual(trials[0].evaluated_params, {}) + self.assertEqual(trials[0].local_dir, + os.path.join(DEFAULT_RESULTS_DIR, "tune-pong")) + self.assertEqual(trials[1].experiment_tag, "1") + + def testEval(self): + trials = self.generate_trials({ + "run": "PPO", + "config": { + "foo": { + "eval": "2 + 2" + }, + }, + }, "eval") + trials = list(trials) + self.assertEqual(len(trials), 1) + self.assertEqual(trials[0].config, {"foo": 4}) + self.assertEqual(trials[0].evaluated_params, {"foo": 4}) + self.assertEqual(trials[0].experiment_tag, "0_foo=4") + + def testGridSearch(self): + trials = self.generate_trials({ + "run": "PPO", + "config": { + "bar": { + "grid_search": [True, False] + }, + "foo": { + "grid_search": [1, 2, 3] + }, + "baz": "asd", + }, + }, "grid_search") + trials = list(trials) + self.assertEqual(len(trials), 6) + self.assertEqual(trials[0].config, { + "bar": True, + "foo": 1, + "baz": "asd", + }) + self.assertEqual(trials[0].evaluated_params, { + "bar": True, + "foo": 1, + }) + self.assertEqual(trials[0].experiment_tag, "0_bar=True,foo=1") + + self.assertEqual(trials[1].config, { + "bar": False, + "foo": 1, + "baz": "asd", + }) + self.assertEqual(trials[1].evaluated_params, { + "bar": False, + "foo": 1, + }) + self.assertEqual(trials[1].experiment_tag, "1_bar=False,foo=1") + + self.assertEqual(trials[2].config, { + "bar": True, + "foo": 2, + "baz": "asd", + }) + self.assertEqual(trials[2].evaluated_params, { + "bar": True, + "foo": 2, + }) + + self.assertEqual(trials[3].config, { + "bar": False, + "foo": 2, + "baz": 
"asd", + }) + self.assertEqual(trials[3].evaluated_params, { + "bar": False, + "foo": 2, + }) + + self.assertEqual(trials[4].config, { + "bar": True, + "foo": 3, + "baz": "asd", + }) + self.assertEqual(trials[4].evaluated_params, { + "bar": True, + "foo": 3, + }) + + self.assertEqual(trials[5].config, { + "bar": False, + "foo": 3, + "baz": "asd", + }) + self.assertEqual(trials[5].evaluated_params, { + "bar": False, + "foo": 3, + }) + + def testGridSearchAndEval(self): + trials = self.generate_trials({ + "run": "PPO", + "config": { + "qux": tune.sample_from(lambda spec: 2 + 2), + "bar": grid_search([True, False]), + "foo": grid_search([1, 2, 3]), + "baz": "asd", + }, + }, "grid_eval") + trials = list(trials) + self.assertEqual(len(trials), 6) + self.assertEqual(trials[0].config, { + "bar": True, + "foo": 1, + "qux": 4, + "baz": "asd", + }) + self.assertEqual(trials[0].evaluated_params, { + "bar": True, + "foo": 1, + "qux": 4, + }) + self.assertEqual(trials[0].experiment_tag, "0_bar=True,foo=1,qux=4") + + def testConditionResolution(self): + trials = self.generate_trials({ + "run": "PPO", + "config": { + "x": 1, + "y": tune.sample_from(lambda spec: spec.config.x + 1), + "z": tune.sample_from(lambda spec: spec.config.y + 1), + }, + }, "condition_resolution") + trials = list(trials) + self.assertEqual(len(trials), 1) + self.assertEqual(trials[0].config, {"x": 1, "y": 2, "z": 3}) + self.assertEqual(trials[0].evaluated_params, {"y": 2, "z": 3}) + self.assertEqual(trials[0].experiment_tag, "0_y=2,z=3") + + def testDependentLambda(self): + trials = self.generate_trials({ + "run": "PPO", + "config": { + "x": grid_search([1, 2]), + "y": tune.sample_from(lambda spec: spec.config.x * 100), + }, + }, "dependent_lambda") + trials = list(trials) + self.assertEqual(len(trials), 2) + self.assertEqual(trials[0].config, {"x": 1, "y": 100}) + self.assertEqual(trials[1].config, {"x": 2, "y": 200}) + + def testDependentGridSearch(self): + trials = self.generate_trials({ + "run": "PPO", 
+ "config": { + "x": grid_search([ + tune.sample_from(lambda spec: spec.config.y * 100), + tune.sample_from(lambda spec: spec.config.y * 200) + ]), + "y": tune.sample_from(lambda spec: 1), + }, + }, "dependent_grid_search") + trials = list(trials) + self.assertEqual(len(trials), 2) + self.assertEqual(trials[0].config, {"x": 100, "y": 1}) + self.assertEqual(trials[1].config, {"x": 200, "y": 1}) + + def testNestedValues(self): + trials = self.generate_trials({ + "run": "PPO", + "config": { + "x": { + "y": { + "z": tune.sample_from(lambda spec: 1) + } + }, + "y": tune.sample_from(lambda spec: 12), + "z": tune.sample_from(lambda spec: spec.config.x.y.z * 100), + }, + }, "nested_values") + trials = list(trials) + self.assertEqual(len(trials), 1) + self.assertEqual(trials[0].config, { + "x": { + "y": { + "z": 1 + } + }, + "y": 12, + "z": 100 + }) + self.assertEqual(trials[0].evaluated_params, { + "x/y/z": 1, + "y": 12, + "z": 100 + }) + + def testLogUniform(self): + sampler = tune.loguniform(1e-10, 1e-1).func + results = [sampler(None) for i in range(1000)] + assert abs(np.log(min(results)) / np.log(10) - -10) < 0.1 + assert abs(np.log(max(results)) / np.log(10) - -1) < 0.1 + + sampler_e = tune.loguniform(np.e**-4, np.e, base=np.e).func + results_e = [sampler_e(None) for i in range(1000)] + assert abs(np.log(min(results_e)) - -4) < 0.1 + assert abs(np.log(max(results_e)) - 1) < 0.1 + + def test_resolve_dict(self): + config = { + "a": { + "b": 1, + "c": 2, + }, + "b": { + "a": 3 + } + } + resolved = resolve_nested_dict(config) + for k, v in [(("a", "b"), 1), (("a", "c"), 2), (("b", "a"), 3)]: + self.assertEqual(resolved.get(k), v) + + def testRecursiveDep(self): + try: + list( + self.generate_trials({ + "run": "PPO", + "config": { + "foo": tune.sample_from(lambda spec: spec.config.foo), + }, + }, "recursive_dep")) + except RecursiveDependencyError as e: + assert "`foo` recursively depends on" in str(e), e + else: + assert False + + def testMaxConcurrentSuggestions(self): 
+ """Checks that next_trials() supports throttling.""" + experiment_spec = { + "run": "PPO", + "num_samples": 6, + } + experiments = [Experiment.from_json("test", experiment_spec)] + + searcher = _MockSuggestionAlgorithm(max_concurrent=4) + searcher.add_configurations(experiments) + trials = searcher.next_trials() + self.assertEqual(len(trials), 4) + self.assertEqual(searcher.next_trials(), []) + + finished_trial = trials.pop() + searcher.on_trial_complete(finished_trial.trial_id) + self.assertEqual(len(searcher.next_trials()), 1) + + finished_trial = trials.pop() + searcher.on_trial_complete(finished_trial.trial_id) + + finished_trial = trials.pop() + searcher.on_trial_complete(finished_trial.trial_id) + + finished_trial = trials.pop() + searcher.on_trial_complete(finished_trial.trial_id) + self.assertEqual(len(searcher.next_trials()), 1) + self.assertEqual(len(searcher.next_trials()), 0) + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/train.py b/rllib/train.py index 907cfc785..a632b85f1 100755 --- a/rllib/train.py +++ b/rllib/train.py @@ -8,7 +8,7 @@ import argparse import yaml import ray -from ray.tests.cluster_utils import Cluster +from ray.cluster_utils import Cluster from ray.tune.config_parser import make_parser from ray.tune.result import DEFAULT_RESULTS_DIR from ray.tune.resources import resources_to_json