From 3f1dd29eab31a3dae090139729feb481dda02827 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 15 May 2018 12:47:11 -0700 Subject: [PATCH] [autoscaler] Remove faulty assert that breaks during downscaling, pull configs from env (#2006) * fixes * coment out test * Update ray_constants.py * Update autoscaler_test.py * Update ray_constants.py * lint * lint --- python/ray/autoscaler/autoscaler.py | 6 +++--- python/ray/ray_constants.py | 21 ++++++++++++++++----- test/autoscaler_test.py | 8 -------- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/python/ray/autoscaler/autoscaler.py b/python/ray/autoscaler/autoscaler.py index d5a5336f4..502b4c6c7 100644 --- a/python/ray/autoscaler/autoscaler.py +++ b/python/ray/autoscaler/autoscaler.py @@ -142,6 +142,7 @@ class LoadMetrics(object): def prune(mapping): unwanted = set(mapping) - active_ips for unwanted_key in unwanted: + print("Removed mapping", unwanted_key, mapping[unwanted_key]) del mapping[unwanted_key] if unwanted: print("Removed {} stale ip mappings: {} not in {}".format( @@ -454,9 +455,8 @@ class StandardAutoscaler(object): TAG_RAY_NODE_STATUS: "Uninitialized", TAG_RAY_LAUNCH_CONFIG: self.launch_hash, }, count) - # TODO(ekl) be less conservative in this check - assert len(self.workers()) > num_before, \ - "Num nodes failed to increase after creating a new node" + if len(self.workers()) <= num_before: + print("Warning: Num nodes failed to increase after node creation") def workers(self): return self.provider.nodes(tag_filters={ diff --git a/python/ray/ray_constants.py b/python/ray/ray_constants.py index 4f337ad41..7e5df9650 100644 --- a/python/ray/ray_constants.py +++ b/python/ray/ray_constants.py @@ -3,19 +3,30 @@ from __future__ import division from __future__ import print_function """Ray constants used in the Python code.""" +import os + + +def env_integer(key, default): + if key in os.environ: + return int(os.environ(key)) + return default + + # Abort autoscaling if more than this number of errors are encountered. This # is a safety feature to prevent e.g. runaway node launches. -AUTOSCALER_MAX_NUM_FAILURES = 5 +AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5) # Max number of nodes to launch at a time. -AUTOSCALER_MAX_CONCURRENT_LAUNCHES = 10 +AUTOSCALER_MAX_CONCURRENT_LAUNCHES = env_integer( + "AUTOSCALER_MAX_CONCURRENT_LAUNCHES", 10) # Interval at which to perform autoscaling updates. -AUTOSCALER_UPDATE_INTERVAL_S = 5 +AUTOSCALER_UPDATE_INTERVAL_S = env_integer("AUTOSCALER_UPDATE_INTERVAL_S", 5) # The autoscaler will attempt to restart Ray on nodes it hasn't heard from # in more than this interval. -AUTOSCALER_HEARTBEAT_TIMEOUT_S = 30 +AUTOSCALER_HEARTBEAT_TIMEOUT_S = env_integer("AUTOSCALER_HEARTBEAT_TIMEOUT_S", + 30) # Max number of retries to AWS (default is 5, time increases exponentially) -BOTO_MAX_RETRIES = 12 +BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12) diff --git a/test/autoscaler_test.py b/test/autoscaler_test.py index c3b84ca96..c56bd3ce5 100644 --- a/test/autoscaler_test.py +++ b/test/autoscaler_test.py @@ -365,14 +365,6 @@ class AutoscalingTest(unittest.TestCase): autoscaler.update() self.assertRaises(Exception, autoscaler.update) - def testAbortOnCreationFailures(self): - config_path = self.write_config(SMALL_CLUSTER) - self.provider = MockProvider() - self.provider.fail_creates = True - autoscaler = StandardAutoscaler( - config_path, LoadMetrics(), max_failures=0, update_interval_s=0) - self.assertRaises(AssertionError, autoscaler.update) - def testLaunchNewNodeOnOutOfBandTerminate(self): config_path = self.write_config(SMALL_CLUSTER) self.provider = MockProvider()