From e66ddab190455850f824650e028ac1f861a6ef92 Mon Sep 17 00:00:00 2001 From: Gekho457 <62982571+Gekho457@users.noreply.github.com> Date: Tue, 24 Nov 2020 12:13:15 -0500 Subject: [PATCH] [autoscaler/k8s] Handle unavailable k8s API (#12283) --- python/ray/autoscaler/_private/autoscaler.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/ray/autoscaler/_private/autoscaler.py b/python/ray/autoscaler/_private/autoscaler.py index c77d33dfc..b918df7f3 100644 --- a/python/ray/autoscaler/_private/autoscaler.py +++ b/python/ray/autoscaler/_private/autoscaler.py @@ -1,5 +1,6 @@ from collections import defaultdict, namedtuple from typing import Any, Optional, Dict, List +from urllib3.exceptions import MaxRetryError import copy import logging import math @@ -130,7 +131,13 @@ class StandardAutoscaler: if _internal_kv_initialized(): _internal_kv_put( DEBUG_AUTOSCALING_ERROR, str(e), overwrite=True) - self.num_failures += 1 + # Don't abort the autoscaler if the K8s API server is down. + # https://github.com/ray-project/ray/issues/12255 + is_k8s_connection_error = ( + self.config["provider"]["type"] == "kubernetes" + and isinstance(e, MaxRetryError)) + if not is_k8s_connection_error: + self.num_failures += 1 if self.num_failures > self.max_failures: logger.critical("StandardAutoscaler: " "Too many errors, abort.")