From 589555455548bca0aeaf434cdeb83f3bf4999446 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 24 Nov 2020 12:16:01 -0800 Subject: [PATCH] [autoscaler] Raise node "start" deadline to 900s, make configurable (#12316) --- python/ray/autoscaler/_private/command_runner.py | 4 ++-- python/ray/autoscaler/_private/constants.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/ray/autoscaler/_private/command_runner.py b/python/ray/autoscaler/_private/command_runner.py index 99367f96a..52ead65cd 100644 --- a/python/ray/autoscaler/_private/command_runner.py +++ b/python/ray/autoscaler/_private/command_runner.py @@ -14,7 +14,8 @@ import warnings from ray.autoscaler.command_runner import CommandRunnerInterface from ray.autoscaler._private.constants import \ DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES,\ - DEFAULT_OBJECT_STORE_MEMORY_PROPORTION + DEFAULT_OBJECT_STORE_MEMORY_PROPORTION, \ + NODE_START_WAIT_S from ray.autoscaler._private.docker import check_bind_mounts_cmd, \ check_docker_running_cmd, \ check_docker_image, \ @@ -33,7 +34,6 @@ from ray.autoscaler._private.constants import RAY_HOME logger = logging.getLogger(__name__) # How long to wait for a node to start, in seconds -NODE_START_WAIT_S = 300 HASH_MAX_LENGTH = 10 KUBECTL_RSYNC = os.path.join( os.path.dirname(os.path.abspath(__file__)), "kubernetes/kubectl-rsync.sh") diff --git a/python/ray/autoscaler/_private/constants.py b/python/ray/autoscaler/_private/constants.py index 3007eedc6..ac0e97124 100644 --- a/python/ray/autoscaler/_private/constants.py +++ b/python/ray/autoscaler/_private/constants.py @@ -12,6 +12,9 @@ def env_integer(key, default): return default +# How long to wait for a node to start, in seconds +NODE_START_WAIT_S = env_integer("AUTOSCALER_NODE_START_WAIT_S", 900) + # Abort autoscaling if more than this number of errors are encountered. This # is a safety feature to prevent e.g. runaway node launches. AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)