From 35f7d84dbe0c4c2bfe8117727eeda5256fe2c3a2 Mon Sep 17 00:00:00 2001 From: Tao Wang Date: Mon, 14 Dec 2020 16:58:40 +0800 Subject: [PATCH] Revert heartbeat interval to keep ci stable (#12836) * Revert heartbeat interval to keep ci stable * fix missing one --- python/ray/tests/test_actor_resources.py | 2 +- python/ray/tests/test_advanced_3.py | 7 +++---- python/ray/tests/test_component_failures_2.py | 2 +- python/ray/tests/test_gcs_fault_tolerance.py | 2 +- python/ray/tests/test_multi_node_2.py | 6 +++--- python/ray/tests/test_placement_group.py | 6 +++--- src/ray/common/ray_config_def.h | 4 ++-- 7 files changed, 14 insertions(+), 15 deletions(-) diff --git a/python/ray/tests/test_actor_resources.py b/python/ray/tests/test_actor_resources.py index 65357fb8c..f0c254c5f 100644 --- a/python/ray/tests/test_actor_resources.py +++ b/python/ray/tests/test_actor_resources.py @@ -237,7 +237,7 @@ def test_actor_multiple_gpus_from_multiple_tasks(ray_start_cluster): cluster.add_node( num_cpus=10 * num_gpus_per_raylet, num_gpus=num_gpus_per_raylet, - _system_config={"num_heartbeats_timeout": 100} if i == 0 else {}) + _system_config={"num_heartbeats_timeout": 1000} if i == 0 else {}) ray.init(address=cluster.address) @ray.remote diff --git a/python/ray/tests/test_advanced_3.py b/python/ray/tests/test_advanced_3.py index 4fffeb5b0..7f1e8e639 100644 --- a/python/ray/tests/test_advanced_3.py +++ b/python/ray/tests/test_advanced_3.py @@ -610,10 +610,9 @@ def test_lease_request_leak(shutdown_only): del obj_ref ray.get(tasks) - def _no_objects(): - return len(ray.objects()) == 0 - - wait_for_condition(_no_objects, timeout=10) + time.sleep( + 1) # Sleep for an amount longer than the reconstruction timeout. + assert len(ray.objects()) == 0, ray.objects() @pytest.mark.parametrize( diff --git a/python/ray/tests/test_component_failures_2.py b/python/ray/tests/test_component_failures_2.py index c45156ba6..2235c5745 100644 --- a/python/ray/tests/test_component_failures_2.py +++ b/python/ray/tests/test_component_failures_2.py @@ -143,7 +143,7 @@ def check_components_alive(cluster, component_type, check_component_alive): "num_cpus": 8, "num_nodes": 4, "_system_config": { - "num_heartbeats_timeout": 10 + "num_heartbeats_timeout": 100 }, }], indirect=True) diff --git a/python/ray/tests/test_gcs_fault_tolerance.py b/python/ray/tests/test_gcs_fault_tolerance.py index d13dbf75a..32f20d42a 100644 --- a/python/ray/tests/test_gcs_fault_tolerance.py +++ b/python/ray/tests/test_gcs_fault_tolerance.py @@ -76,7 +76,7 @@ def test_gcs_server_restart_during_actor_creation(ray_start_regular): @pytest.mark.parametrize( "ray_start_cluster_head", [ generate_system_config_map( - num_heartbeats_timeout=2, ping_gcs_rpc_server_max_retries=60) + num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60) ], indirect=True) def test_node_failure_detector_when_gcs_server_restart(ray_start_cluster_head): diff --git a/python/ray/tests/test_multi_node_2.py b/python/ray/tests/test_multi_node_2.py index d53d243c0..6578bdeb9 100644 --- a/python/ray/tests/test_multi_node_2.py +++ b/python/ray/tests/test_multi_node_2.py @@ -34,7 +34,7 @@ def test_shutdown(): @pytest.mark.parametrize( "ray_start_cluster_head", [ generate_system_config_map( - num_heartbeats_timeout=2, object_timeout_milliseconds=12345) + num_heartbeats_timeout=20, object_timeout_milliseconds=12345) ], indirect=True) def test_system_config(ray_start_cluster_head): @@ -52,12 +52,12 @@ def test_system_config(ray_start_cluster_head): @ray.remote def f(): assert ray._config.object_timeout_milliseconds() == 12345 - assert ray._config.num_heartbeats_timeout() == 2 + assert ray._config.num_heartbeats_timeout() == 20 ray.get([f.remote() for _ in range(5)]) cluster.remove_node(worker, allow_graceful=False) - time.sleep(0.9) + time.sleep(1) assert ray.cluster_resources()["CPU"] == 2 time.sleep(2) diff --git a/python/ray/tests/test_placement_group.py b/python/ray/tests/test_placement_group.py index 628e1ed85..a82c449a5 100644 --- a/python/ray/tests/test_placement_group.py +++ b/python/ray/tests/test_placement_group.py @@ -1172,7 +1172,7 @@ ray.shutdown() @pytest.mark.parametrize( "ray_start_cluster_head", [ generate_system_config_map( - num_heartbeats_timeout=3, ping_gcs_rpc_server_max_retries=60) + num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60) ], indirect=True) def test_create_placement_group_after_gcs_server_restart( @@ -1210,7 +1210,7 @@ def test_create_placement_group_after_gcs_server_restart( @pytest.mark.parametrize( "ray_start_cluster_head", [ generate_system_config_map( - num_heartbeats_timeout=3, ping_gcs_rpc_server_max_retries=60) + num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60) ], indirect=True) def test_create_actor_with_placement_group_after_gcs_server_restart( @@ -1234,7 +1234,7 @@ def test_create_actor_with_placement_group_after_gcs_server_restart( @pytest.mark.parametrize( "ray_start_cluster_head", [ generate_system_config_map( - num_heartbeats_timeout=3, ping_gcs_rpc_server_max_retries=60) + num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60) ], indirect=True) def test_create_placement_group_during_gcs_server_restart( diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index 3d9630a9d..a1af82469 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -35,11 +35,11 @@ RAY_CONFIG(int64_t, ray_cookie, 0x5241590000000000) RAY_CONFIG(int64_t, handler_warning_timeout_ms, 1000) /// The duration between heartbeats sent by the raylets. -RAY_CONFIG(int64_t, raylet_heartbeat_timeout_milliseconds, 1000) +RAY_CONFIG(int64_t, raylet_heartbeat_timeout_milliseconds, 100) /// If a component has not sent a heartbeat in the last num_heartbeats_timeout /// heartbeat intervals, the raylet monitor process will report /// it as dead to the db_client table. -RAY_CONFIG(int64_t, num_heartbeats_timeout, 30) +RAY_CONFIG(int64_t, num_heartbeats_timeout, 300) /// For a raylet, if the last heartbeat was sent more than this many /// heartbeat periods ago, then a warning will be logged that the heartbeat /// handler is drifting.