mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 23:23:15 +08:00
Skip dead nodes to avoid connection timeout. (#4154)
This commit is contained in:
committed by
Philipp Moritz
parent
9950f63e8c
commit
6f46edca51
@@ -56,6 +56,7 @@ MONITOR_DIED_ERROR = "monitor_died"
|
||||
LOG_MONITOR_DIED_ERROR = "log_monitor_died"
|
||||
REPORTER_DIED_ERROR = "reporter_died"
|
||||
DASHBOARD_DIED_ERROR = "dashboard_died"
|
||||
RAYLET_CONNECTION_ERROR = "raylet_connection_error"
|
||||
|
||||
# Abort autoscaling if more than this number of errors are encountered. This
|
||||
# is a safety feature to prevent e.g. runaway node launches.
|
||||
|
||||
@@ -102,7 +102,7 @@ class Cluster(object):
|
||||
|
||||
return node
|
||||
|
||||
def remove_node(self, node):
|
||||
def remove_node(self, node, allow_graceful=False):
|
||||
"""Kills all processes associated with worker node.
|
||||
|
||||
Args:
|
||||
@@ -110,11 +110,13 @@ class Cluster(object):
|
||||
will be removed.
|
||||
"""
|
||||
if self.head_node == node:
|
||||
self.head_node.kill_all_processes(check_alive=False)
|
||||
self.head_node.kill_all_processes(
|
||||
check_alive=False, allow_graceful=allow_graceful)
|
||||
self.head_node = None
|
||||
# TODO(rliaw): Do we need to kill all worker processes?
|
||||
else:
|
||||
node.kill_all_processes(check_alive=False)
|
||||
node.kill_all_processes(
|
||||
check_alive=False, allow_graceful=allow_graceful)
|
||||
self.worker_nodes.remove(node)
|
||||
|
||||
assert not node.any_processes_alive(), (
|
||||
|
||||
@@ -722,3 +722,31 @@ def test_raylet_crash_when_get(ray_start_regular):
|
||||
with pytest.raises(Exception, match=r".*Connection closed unexpectedly.*"):
|
||||
ray.get(nonexistent_id)
|
||||
thread.join()
|
||||
|
||||
|
||||
def test_connect_with_disconnected_node(shutdown_only):
    """Check that removed nodes do not produce raylet connection errors.

    A cluster is started with a short heartbeat timeout. Two nodes are
    removed non-gracefully and one gracefully; after each step the test
    checks the number of REMOVED_NODE_ERROR entries, and finally that no
    RAYLET_CONNECTION_ERROR entry exists.
    """
    internal_config = json.dumps({
        "num_heartbeats_timeout": 50,
        "heartbeat_timeout_milliseconds": 10,
    })
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _internal_config=internal_config)
    ray.init(redis_address=cluster.redis_address)

    # No node has been removed yet, so no removal errors are expected.
    assert len(relevant_errors(ray_constants.REMOVED_NODE_ERROR)) == 0

    # Each of these nodes is killed by SIGKILL, so ray_monitor will mark it
    # as dead: the removed-node error count is expected to reach 1, then 2.
    for expected_errors in (1, 2):
        dead_node = cluster.add_node(
            num_cpus=0, _internal_config=internal_config)
        cluster.remove_node(dead_node, allow_graceful=False)
        wait_for_errors(
            ray_constants.REMOVED_NODE_ERROR, expected_errors, timeout=2)

    # This node is killed by SIGTERM, so ray_monitor will not mark it again;
    # waiting for a third removal error is therefore expected to time out.
    removing_node = cluster.add_node(
        num_cpus=0, _internal_config=internal_config)
    cluster.remove_node(removing_node, allow_graceful=True)
    with pytest.raises(Exception, match=('Timing out of wait.')):
        wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 3, timeout=2)

    # There is no connection error to a dead node.
    connection_errors = relevant_errors(
        ray_constants.RAYLET_CONNECTION_ERROR)
    assert len(connection_errors) == 0
|
||||
|
||||
Reference in New Issue
Block a user