[autoscaler] Remove faulty assert that breaks during downscaling, pull configs from env (#2006)

* fixes

* coment out test

* Update ray_constants.py

* Update autoscaler_test.py

* Update ray_constants.py

* lint

* lint
This commit is contained in:
Eric Liang
2018-05-15 12:47:11 -07:00
committed by GitHub
parent 825c227c2b
commit 3f1dd29eab
3 changed files with 19 additions and 16 deletions
+3 -3
View File
@@ -142,6 +142,7 @@ class LoadMetrics(object):
def prune(mapping):
unwanted = set(mapping) - active_ips
for unwanted_key in unwanted:
print("Removed mapping", unwanted_key, mapping[unwanted_key])
del mapping[unwanted_key]
if unwanted:
print("Removed {} stale ip mappings: {} not in {}".format(
@@ -454,9 +455,8 @@ class StandardAutoscaler(object):
TAG_RAY_NODE_STATUS: "Uninitialized",
TAG_RAY_LAUNCH_CONFIG: self.launch_hash,
}, count)
# TODO(ekl) be less conservative in this check
assert len(self.workers()) > num_before, \
"Num nodes failed to increase after creating a new node"
if len(self.workers()) <= num_before:
print("Warning: Num nodes failed to increase after node creation")
def workers(self):
return self.provider.nodes(tag_filters={
+16 -5
View File
@@ -3,19 +3,30 @@ from __future__ import division
from __future__ import print_function
"""Ray constants used in the Python code."""
import os
def env_integer(key, default):
if key in os.environ:
return int(os.environ(key))
return default
# Abort autoscaling if more than this number of errors are encountered. This
# is a safety feature to prevent e.g. runaway node launches.
AUTOSCALER_MAX_NUM_FAILURES = 5
AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)
# Max number of nodes to launch at a time.
AUTOSCALER_MAX_CONCURRENT_LAUNCHES = 10
AUTOSCALER_MAX_CONCURRENT_LAUNCHES = env_integer(
"AUTOSCALER_MAX_CONCURRENT_LAUNCHES", 10)
# Interval at which to perform autoscaling updates.
AUTOSCALER_UPDATE_INTERVAL_S = 5
AUTOSCALER_UPDATE_INTERVAL_S = env_integer("AUTOSCALER_UPDATE_INTERVAL_S", 5)
# The autoscaler will attempt to restart Ray on nodes it hasn't heard from
# in more than this interval.
AUTOSCALER_HEARTBEAT_TIMEOUT_S = 30
AUTOSCALER_HEARTBEAT_TIMEOUT_S = env_integer("AUTOSCALER_HEARTBEAT_TIMEOUT_S",
30)
# Max number of retries to AWS (default is 5, time increases exponentially)
BOTO_MAX_RETRIES = 12
BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)