Skip dead nodes to avoid connection timeout. (#4154)

This commit is contained in:
Yuhong Guo
2019-03-03 05:11:19 +08:00
committed by Philipp Moritz
parent 9950f63e8c
commit 6f46edca51
9 changed files with 131 additions and 38 deletions
+1
View File
@@ -56,6 +56,7 @@ MONITOR_DIED_ERROR = "monitor_died"
LOG_MONITOR_DIED_ERROR = "log_monitor_died"
REPORTER_DIED_ERROR = "reporter_died"
DASHBOARD_DIED_ERROR = "dashboard_died"
# Error type tag for raylet connection errors.
# NOTE(review): presumably reported when connecting to another raylet
# fails -- confirm where this tag is pushed.
RAYLET_CONNECTION_ERROR = "raylet_connection_error"
# Abort autoscaling if more than this number of errors are encountered. This
# is a safety feature to prevent e.g. runaway node launches.
+5 -3
View File
@@ -102,7 +102,7 @@ class Cluster(object):
return node
def remove_node(self, node):
def remove_node(self, node, allow_graceful=False):
"""Kills all processes associated with worker node.
Args:
@@ -110,11 +110,13 @@ class Cluster(object):
will be removed.
"""
if self.head_node == node:
self.head_node.kill_all_processes(check_alive=False)
self.head_node.kill_all_processes(
check_alive=False, allow_graceful=allow_graceful)
self.head_node = None
# TODO(rliaw): Do we need to kill all worker processes?
else:
node.kill_all_processes(check_alive=False)
node.kill_all_processes(
check_alive=False, allow_graceful=allow_graceful)
self.worker_nodes.remove(node)
assert not node.any_processes_alive(), (
+28
View File
@@ -722,3 +722,31 @@ def test_raylet_crash_when_get(ray_start_regular):
with pytest.raises(Exception, match=r".*Connection closed unexpectedly.*"):
ray.get(nonexistent_id)
thread.join()
def test_connect_with_disconnected_node(shutdown_only):
    """Check error reporting when nodes are removed from a running cluster.

    Nodes removed without a graceful shutdown (SIGKILL) must each produce
    one REMOVED_NODE_ERROR, a node removed gracefully (SIGTERM) must not
    produce another one, and no RAYLET_CONNECTION_ERROR may be reported.

    Args:
        shutdown_only: pytest fixture requested by this test; it is not
            referenced in the body.
    """
    # NOTE(review): presumably these values make the monitor give up on a
    # silent node after 50 missed 10 ms heartbeats -- confirm.
    internal_config = json.dumps({
        "num_heartbeats_timeout": 50,
        "heartbeat_timeout_milliseconds": 10,
    })
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _internal_config=internal_config)
    ray.init(redis_address=cluster.redis_address)

    # No node has been removed yet, so no removal error may exist.
    removed_errors = relevant_errors(ray_constants.REMOVED_NODE_ERROR)
    assert len(removed_errors) == 0

    # Each of these nodes is killed by SIGKILL, so ray_monitor marks it as
    # dead and the number of reported removals grows by one per node.
    for expected_removed in (1, 2):
        killed_node = cluster.add_node(
            num_cpus=0, _internal_config=internal_config)
        cluster.remove_node(killed_node, allow_graceful=False)
        wait_for_errors(
            ray_constants.REMOVED_NODE_ERROR, expected_removed, timeout=2)

    # This node is killed by SIGTERM; ray_monitor does not mark it again,
    # so waiting for a third removal error has to time out.
    graceful_node = cluster.add_node(
        num_cpus=0, _internal_config=internal_config)
    cluster.remove_node(graceful_node, allow_graceful=True)
    with pytest.raises(Exception, match='Timing out of wait.'):
        wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 3, timeout=2)

    # Connecting must skip dead nodes: no connection error is reported.
    connection_errors = relevant_errors(
        ray_constants.RAYLET_CONNECTION_ERROR)
    assert len(connection_errors) == 0