diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index 09d021902..a93acc629 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -691,6 +691,7 @@ def stop(force, verbose, log_style, log_color): ["log_monitor.py", False], ["reporter.py", False], ["dashboard.py", False], + ["new_dashboard/agent.py", False], ["ray_process_reaper.py", False], ] diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index acc323563..ad9ab4ec0 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -12,8 +12,10 @@ py_test_module_list( files = [ "test_async.py", "test_actor.py", - "test_actor_failures.py", "test_actor_advanced.py", + "test_advanced_3.py", + "test_actor_failures.py", + "test_actor_resources.py", "test_advanced.py", "test_advanced_2.py", "test_array.py", @@ -22,10 +24,14 @@ py_test_module_list( "test_basic_2.py", "test_cancel.py", "test_cli.py", + "test_component_failures_2.py", "test_component_failures_3.py", "test_error_ray_not_initialized.py", "test_gcs_fault_tolerance.py", "test_iter.py", + "test_joblib.py", + "test_global_state.py", + "test_global_gc.py", "test_mldataset.py", ], size = "medium", @@ -36,13 +42,7 @@ py_test_module_list( py_test_module_list( files = [ - "test_actor_resources.py", - "test_advanced_3.py", - "test_component_failures_2.py", - "test_dynres.py", - "test_global_gc.py", - "test_global_state.py", - "test_joblib.py", + "test_dynres.py", # dyn res not implemented ], size = "medium", extra_srcs = SRCS, @@ -55,14 +55,22 @@ py_test_module_list( "test_memory_limits.py", "test_memory_scheduling.py", "test_metrics.py", + "test_multi_node.py", "test_multi_node_2.py", + "test_multi_tenancy.py", + "test_multinode_failures.py", "test_multinode_failures_2.py", "test_multiprocessing.py", + "test_object_manager.py", "test_object_spilling.py", "test_output.py", + "test_reconstruction.py", + "test_reference_counting.py", "test_reference_counting_2.py", "test_resource_demand_scheduler.py", "test_serialization.py", + "test_stress.py", + "test_stress_sharded.py", "test_tensorflow.py", "test_unreconstructable_errors.py", ], @@ -72,23 +80,6 @@ py_test_module_list( deps = ["//:ray_lib"], ) -py_test_module_list( - files = [ - "test_multinode_failures.py", - "test_multi_node.py", - "test_object_manager.py", - "test_reconstruction.py", - "test_reference_counting.py", - "test_stress.py", - "test_stress_sharded.py", - "test_multi_tenancy.py", - ], - size = "medium", - extra_srcs = SRCS, - tags = ["exclusive", "medium_size_python_tests_k_to_z", "new_scheduler_broken"], - deps = ["//:ray_lib"], -) - py_test_module_list( files = [ "test_actor_pool.py", @@ -125,6 +116,7 @@ py_test_module_list( py_test_module_list( files = [ + "test_failure.py", "test_stress_failure.py", ], size = "large", @@ -135,8 +127,7 @@ py_test_module_list( py_test_module_list( files = [ - "test_failure.py", - "test_placement_group.py", + "test_placement_group.py", # placement groups not implemented ], size = "large", extra_srcs = SRCS, diff --git a/python/ray/tests/test_advanced_3.py b/python/ray/tests/test_advanced_3.py index 6e279bcd4..b40d95d8b 100644 --- a/python/ray/tests/test_advanced_3.py +++ b/python/ray/tests/test_advanced_3.py @@ -21,7 +21,8 @@ import setproctitle import subprocess from ray.test_utils import (check_call_ray, RayTestTimeoutException, - wait_for_condition, wait_for_num_actors) + wait_for_condition, wait_for_num_actors, + new_scheduler_enabled) logger = logging.getLogger(__name__) @@ -93,6 +94,7 @@ def test_local_scheduling_first(ray_start_cluster): assert local() +@pytest.mark.skipif(new_scheduler_enabled(), reason="flakes more often") def test_load_balancing_with_dependencies(ray_start_cluster): # This test ensures that tasks are being assigned to all raylets in a # roughly equal manner even when the tasks have dependencies. diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index ff4c3a810..1c01977e4 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -22,6 +22,7 @@ from ray.test_utils import ( init_error_pubsub, get_error_message, Semaphore, + new_scheduler_enabled, ) @@ -662,6 +663,7 @@ def test_warning_for_resource_deadlock(error_pubsub, shutdown_only): assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR +@pytest.mark.skipif(new_scheduler_enabled(), reason="broken") def test_warning_for_infeasible_tasks(ray_start_regular, error_pubsub): p = error_pubsub # Check that we get warning messages for infeasible tasks. @@ -687,6 +689,7 @@ def test_warning_for_infeasible_tasks(ray_start_regular, error_pubsub): assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR +@pytest.mark.skipif(new_scheduler_enabled(), reason="broken") def test_warning_for_infeasible_zero_cpu_actor(shutdown_only): # Check that we cannot place an actor on a 0 CPU machine and that we get an # infeasibility warning (even though the actor creation task itself @@ -953,6 +956,7 @@ def test_raylet_crash_when_get(ray_start_regular): thread.join() +@pytest.mark.skipif(new_scheduler_enabled(), reason="broken") def test_connect_with_disconnected_node(shutdown_only): config = { "num_heartbeats_timeout": 50, diff --git a/python/ray/tests/test_global_gc.py b/python/ray/tests/test_global_gc.py index 76512f691..685edacf0 100644 --- a/python/ray/tests/test_global_gc.py +++ b/python/ray/tests/test_global_gc.py @@ -9,7 +9,7 @@ import pytest import ray import ray.cluster_utils -from ray.test_utils import wait_for_condition +from ray.test_utils import wait_for_condition, new_scheduler_enabled from ray.internal.internal_api import global_gc logger = logging.getLogger(__name__) @@ -129,6 +129,7 @@ def test_global_gc_when_full(shutdown_only): gc.enable() +@pytest.mark.skipif(new_scheduler_enabled(), reason="hangs") def test_global_gc_actors(shutdown_only): ray.init(num_cpus=1) diff --git a/python/ray/tests/test_global_state.py b/python/ray/tests/test_global_state.py index e5c90ee68..967d6d7ea 100644 --- a/python/ray/tests/test_global_state.py +++ b/python/ray/tests/test_global_state.py @@ -8,6 +8,7 @@ import time import ray import ray.ray_constants import ray.test_utils +from ray.test_utils import new_scheduler_enabled from ray._raylet import GlobalStateAccessor @@ -143,6 +144,7 @@ def test_global_state_actor_entry(ray_start_regular): @pytest.mark.parametrize("max_shapes", [0, 2, -1]) +@pytest.mark.skipif(new_scheduler_enabled(), reason="broken") def test_load_report(shutdown_only, max_shapes): resource1 = "A" resource2 = "B" @@ -213,6 +215,7 @@ def test_load_report(shutdown_only, max_shapes): global_state_accessor.disconnect() +@pytest.mark.skipif(new_scheduler_enabled(), reason="broken") def test_placement_group_load_report(ray_start_cluster): cluster = ray_start_cluster # Add a head node that doesn't have gpu resource. @@ -281,6 +284,7 @@ def test_placement_group_load_report(ray_start_cluster): global_state_accessor.disconnect() +@pytest.mark.skipif(new_scheduler_enabled(), reason="broken") def test_backlog_report(shutdown_only): cluster = ray.init( num_cpus=1, _system_config={ diff --git a/python/ray/tests/test_multi_node.py b/python/ray/tests/test_multi_node.py index cb206112d..582193266 100644 --- a/python/ray/tests/test_multi_node.py +++ b/python/ray/tests/test_multi_node.py @@ -9,7 +9,7 @@ from ray.test_utils import ( RayTestTimeoutException, check_call_ray, run_string_as_driver, run_string_as_driver_nonblocking, wait_for_children_of_pid, wait_for_children_of_pid_to_exit, wait_for_condition, kill_process_by_name, - Semaphore, init_error_pubsub, get_error_message) + Semaphore, init_error_pubsub, get_error_message, new_scheduler_enabled) def test_remote_raylet_cleanup(ray_start_cluster): @@ -139,6 +139,7 @@ print("success") assert "success" in out +@pytest.mark.skipif(new_scheduler_enabled(), reason="hangs") def test_driver_exiting_quickly(call_ray_start): # This test will create some drivers that submit some tasks and then # exit without waiting for the tasks to complete. @@ -304,6 +305,7 @@ ray.get([a.log.remote(), f.remote()]) "--min-worker-port=0 --max-worker-port=0 --port 0" ], indirect=True) +@pytest.mark.skipif(new_scheduler_enabled(), reason="hangs") def test_drivers_release_resources(call_ray_start): address = call_ray_start diff --git a/python/ray/tests/test_multi_node_2.py b/python/ray/tests/test_multi_node_2.py index 7a4eeb0e6..0579a0c41 100644 --- a/python/ray/tests/test_multi_node_2.py +++ b/python/ray/tests/test_multi_node_2.py @@ -6,7 +6,8 @@ import ray import ray.ray_constants as ray_constants from ray.monitor import Monitor from ray.cluster_utils import Cluster -from ray.test_utils import generate_system_config_map, SignalActor +from ray.test_utils import generate_system_config_map, SignalActor, \ + new_scheduler_enabled logger = logging.getLogger(__name__) @@ -117,6 +118,7 @@ def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30): "num_cpus": 2, }], indirect=True) +@pytest.mark.skipif(new_scheduler_enabled(), reason="fails") def test_heartbeats_single(ray_start_cluster_head): """Unit test for `Cluster.wait_for_nodes`. diff --git a/python/ray/tests/test_multi_tenancy.py b/python/ray/tests/test_multi_tenancy.py index 74fa2e5b2..c3e440235 100644 --- a/python/ray/tests/test_multi_tenancy.py +++ b/python/ray/tests/test_multi_tenancy.py @@ -10,9 +10,9 @@ import ray import ray.test_utils from ray.core.generated import common_pb2 from ray.core.generated import node_manager_pb2, node_manager_pb2_grpc -from ray.test_utils import (wait_for_condition, wait_for_pid_to_exit, - run_string_as_driver, - run_string_as_driver_nonblocking) +from ray.test_utils import ( + wait_for_condition, wait_for_pid_to_exit, run_string_as_driver, + run_string_as_driver_nonblocking, new_scheduler_enabled) def get_workers(): @@ -207,6 +207,7 @@ def test_worker_capping_run_chained_tasks(shutdown_only): assert len(get_workers()) == 2 +@pytest.mark.skipif(new_scheduler_enabled(), reason="fails") def test_worker_capping_fifo(shutdown_only): # Start 2 initial workers by setting num_cpus to 2. info = ray.init(num_cpus=2) @@ -250,6 +251,7 @@ ray.shutdown() assert worker2.pid == get_workers()[0].pid +@pytest.mark.skipif(new_scheduler_enabled(), reason="raylet hang 100% cpu") def test_worker_registration_failure_after_driver_exit(shutdown_only): info = ray.init(num_cpus=1) diff --git a/python/ray/tests/test_reconstruction.py b/python/ray/tests/test_reconstruction.py index 382225cea..b471964c2 100644 --- a/python/ray/tests/test_reconstruction.py +++ b/python/ray/tests/test_reconstruction.py @@ -9,6 +9,7 @@ import ray from ray.test_utils import ( wait_for_condition, wait_for_pid_to_exit, + new_scheduler_enabled, ) SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM @@ -487,6 +488,7 @@ def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled): raise e.as_instanceof_cause() +@pytest.mark.skipif(new_scheduler_enabled(), reason="hangs") def test_reconstruction_stress(ray_start_cluster): config = { "num_heartbeats_timeout": 10, diff --git a/python/ray/tests/test_reference_counting.py b/python/ray/tests/test_reference_counting.py index ba6a4b067..a0dc10f6b 100644 --- a/python/ray/tests/test_reference_counting.py +++ b/python/ray/tests/test_reference_counting.py @@ -10,7 +10,8 @@ import pytest import ray import ray.cluster_utils -from ray.test_utils import SignalActor, put_object, wait_for_condition +from ray.test_utils import SignalActor, put_object, wait_for_condition, \ + new_scheduler_enabled logger = logging.getLogger(__name__) @@ -166,6 +167,7 @@ def test_dependency_refcounts(ray_start_regular): check_refcounts({}) +@pytest.mark.skipif(new_scheduler_enabled(), reason="dynres notimpl") def test_actor_creation_task(ray_start_regular): @ray.remote def large_object():