[autoscaler] Raise node "start" deadline to 900s, make configurable (#12316)

This commit is contained in:
Eric Liang
2020-11-24 12:16:01 -08:00
committed by GitHub
parent 4ada3e4c99
commit 5895554555
2 changed files with 5 additions and 2 deletions
@@ -14,7 +14,8 @@ import warnings
from ray.autoscaler.command_runner import CommandRunnerInterface
from ray.autoscaler._private.constants import \
DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES,\
DEFAULT_OBJECT_STORE_MEMORY_PROPORTION
DEFAULT_OBJECT_STORE_MEMORY_PROPORTION, \
NODE_START_WAIT_S
from ray.autoscaler._private.docker import check_bind_mounts_cmd, \
check_docker_running_cmd, \
check_docker_image, \
@@ -33,7 +34,6 @@ from ray.autoscaler._private.constants import RAY_HOME
logger = logging.getLogger(__name__)
# How long to wait for a node to start, in seconds
NODE_START_WAIT_S = 300
HASH_MAX_LENGTH = 10
KUBECTL_RSYNC = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "kubernetes/kubectl-rsync.sh")
@@ -12,6 +12,9 @@ def env_integer(key, default):
return default
# How long to wait for a node to start, in seconds. Defaults to 900; can be
# overridden by setting AUTOSCALER_NODE_START_WAIT_S.
NODE_START_WAIT_S = env_integer("AUTOSCALER_NODE_START_WAIT_S", 900)
# Abort autoscaling if more than this number of errors are encountered. This
# is a safety feature to prevent e.g. runaway node launches.
AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)