[core] Disable GCS reconnect (#10579)

* Set default GCS retries to 1

* Fix cc test
This commit is contained in:
Stephanie Wang
2020-09-05 13:14:07 -07:00
committed by GitHub
parent 54215ff287
commit 4f02ad4ef9
4 changed files with 34 additions and 10 deletions
+12 -6
View File
@@ -21,8 +21,10 @@ def increase(x):
@pytest.mark.parametrize(
"ray_start_regular",
[generate_system_config_map(num_heartbeats_timeout=20)],
"ray_start_regular", [
generate_system_config_map(
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
],
indirect=True)
def test_gcs_server_restart(ray_start_regular):
actor1 = Increase.remote()
@@ -44,8 +46,10 @@ def test_gcs_server_restart(ray_start_regular):
@pytest.mark.parametrize(
"ray_start_regular",
[generate_system_config_map(num_heartbeats_timeout=20)],
"ray_start_regular", [
generate_system_config_map(
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
],
indirect=True)
def test_gcs_server_restart_during_actor_creation(ray_start_regular):
ids = []
@@ -63,8 +67,10 @@ def test_gcs_server_restart_during_actor_creation(ray_start_regular):
@pytest.mark.parametrize(
"ray_start_cluster_head",
[generate_system_config_map(num_heartbeats_timeout=20)],
"ray_start_cluster_head", [
generate_system_config_map(
num_heartbeats_timeout=20, ping_gcs_rpc_server_max_retries=60)
],
indirect=True)
def test_node_failure_detector_when_gcs_server_restart(ray_start_cluster_head):
"""Checks that the node failure detector is correct when gcs server restart.
+16 -2
View File
@@ -8,8 +8,22 @@ import ray
from ray.test_utils import (
RayTestTimeoutException, check_call_ray, run_string_as_driver,
run_string_as_driver_nonblocking, wait_for_children_of_pid,
wait_for_children_of_pid_to_exit, kill_process_by_name, Semaphore,
init_error_pubsub, get_error_message)
wait_for_children_of_pid_to_exit, wait_for_condition, kill_process_by_name,
Semaphore, init_error_pubsub, get_error_message)
def test_remote_raylet_cleanup(ray_start_cluster):
    """Remove the head node non-gracefully and wait for the rest to exit.

    Three extra nodes are added to the cluster provided by the
    ``ray_start_cluster`` fixture. Once the nodes are up, the head node is
    removed with ``allow_graceful=False`` and the test waits until the
    cluster no longer reports any remaining processes alive.
    """
    cluster = ray_start_cluster
    for _ in range(3):
        cluster.add_node()
    cluster.wait_for_nodes()

    # Take the head node down without a graceful shutdown.
    cluster.remove_node(cluster.head_node, allow_graceful=False)

    def no_processes_left():
        # True once the cluster stops reporting its remaining processes
        # as alive.
        still_alive = cluster.remaining_processes_alive()
        return not still_alive

    wait_for_condition(no_processes_left)
def test_error_isolation(call_ray_start):