From 4f02ad4ef9d4277d100fcff1c05578a99c5c71d2 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Sat, 5 Sep 2020 13:14:07 -0700 Subject: [PATCH] [core] Disable GCS reconnect (#10579) * Set default GCS retries to 1 * Fix cc test --- python/ray/tests/test_gcs_fault_tolerance.py | 18 ++++++++++++------ python/ray/tests/test_multi_node.py | 18 ++++++++++++++++-- src/ray/common/ray_config_def.h | 2 +- .../test/service_based_gcs_client_test.cc | 6 +++++- 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/python/ray/tests/test_gcs_fault_tolerance.py b/python/ray/tests/test_gcs_fault_tolerance.py index 836749049..b6585cabd 100644 --- a/python/ray/tests/test_gcs_fault_tolerance.py +++ b/python/ray/tests/test_gcs_fault_tolerance.py @@ -21,8 +21,10 @@ def increase(x): @pytest.mark.parametrize( - "ray_start_regular", - [generate_system_config_map(num_heartbeats_timeout=20)], + "ray_start_regular", [ + generate_system_config_map( + num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60) + ], indirect=True) def test_gcs_server_restart(ray_start_regular): actor1 = Increase.remote() @@ -44,8 +46,10 @@ def test_gcs_server_restart(ray_start_regular): @pytest.mark.parametrize( - "ray_start_regular", - [generate_system_config_map(num_heartbeats_timeout=20)], + "ray_start_regular", [ + generate_system_config_map( + num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60) + ], indirect=True) def test_gcs_server_restart_during_actor_creation(ray_start_regular): ids = [] @@ -63,8 +67,10 @@ def test_gcs_server_restart_during_actor_creation(ray_start_regular): @pytest.mark.parametrize( - "ray_start_cluster_head", - [generate_system_config_map(num_heartbeats_timeout=20)], + "ray_start_cluster_head", [ + generate_system_config_map( + num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60) + ], indirect=True) def test_node_failure_detector_when_gcs_server_restart(ray_start_cluster_head): """Checks that the node failure detector is correct when gcs server restart. diff --git a/python/ray/tests/test_multi_node.py b/python/ray/tests/test_multi_node.py index 22474a3e1..4247e340f 100644 --- a/python/ray/tests/test_multi_node.py +++ b/python/ray/tests/test_multi_node.py @@ -8,8 +8,22 @@ import ray from ray.test_utils import ( RayTestTimeoutException, check_call_ray, run_string_as_driver, run_string_as_driver_nonblocking, wait_for_children_of_pid, - wait_for_children_of_pid_to_exit, kill_process_by_name, Semaphore, - init_error_pubsub, get_error_message) + wait_for_children_of_pid_to_exit, wait_for_condition, kill_process_by_name, + Semaphore, init_error_pubsub, get_error_message) + + +def test_remote_raylet_cleanup(ray_start_cluster): + cluster = ray_start_cluster + cluster.add_node() + cluster.add_node() + cluster.add_node() + cluster.wait_for_nodes() + + def remote_raylets_dead(): + return not cluster.remaining_processes_alive() + + cluster.remove_node(cluster.head_node, allow_graceful=False) + wait_for_condition(remote_raylets_dead) def test_error_isolation(call_ray_start): diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index 08280b9b6..9b99dc9b7 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -247,7 +247,7 @@ RAY_CONFIG(uint32_t, cancellation_retry_ms, 2000) RAY_CONFIG(int64_t, ping_gcs_rpc_server_interval_milliseconds, 1000) /// Maximum number of times to retry ping gcs rpc server when gcs server restarts. -RAY_CONFIG(int32_t, ping_gcs_rpc_server_max_retries, 600) +RAY_CONFIG(int32_t, ping_gcs_rpc_server_max_retries, 1) /// Whether start the Plasma Store as a Raylet thread. RAY_CONFIG(bool, plasma_store_as_thread, false) diff --git a/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc b/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc index f49d98377..64f044f4c 100644 --- a/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc +++ b/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc @@ -25,7 +25,11 @@ namespace ray { class ServiceBasedGcsClientTest : public ::testing::Test { public: - ServiceBasedGcsClientTest() { TestSetupUtil::StartUpRedisServers(std::vector()); } + ServiceBasedGcsClientTest() { + RayConfig::instance().initialize( + {{"ping_gcs_rpc_server_max_retries", std::to_string(60)}}); + TestSetupUtil::StartUpRedisServers(std::vector()); + } virtual ~ServiceBasedGcsClientTest() { TestSetupUtil::ShutDownRedisServers(); }