From 962f18756b3ed81276e4466c457db13cd2cc4646 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Mon, 10 Dec 2018 11:58:27 -0800 Subject: [PATCH] [autoscaler] Use fixed timestamp to check against health timeouts (#3503) --- python/ray/autoscaler/autoscaler.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/python/ray/autoscaler/autoscaler.py b/python/ray/autoscaler/autoscaler.py index 9c4a452ee..a806e3b62 100644 --- a/python/ray/autoscaler/autoscaler.py +++ b/python/ray/autoscaler/autoscaler.py @@ -362,12 +362,14 @@ class StandardAutoscaler(object): raise e def _update(self): + now = time.time() + # Throttle autoscaling updates to this interval to avoid exceeding # rate limits on API calls. - if time.time() - self.last_update_time < self.update_interval_s: + if now - self.last_update_time < self.update_interval_s: return - self.last_update_time = time.time() + self.last_update_time = now num_pending = self.num_launches_pending.value nodes = self.workers() logger.info(self.info_string(nodes)) @@ -377,7 +379,7 @@ class StandardAutoscaler(object): # Terminate any idle or out of date nodes last_used = self.load_metrics.last_used_time_by_ip - horizon = time.time() - (60 * self.config["idle_timeout_minutes"]) + horizon = now - (60 * self.config["idle_timeout_minutes"]) num_terminated = 0 for node_id in nodes: node_ip = self.provider.internal_ip(node_id) @@ -441,7 +443,7 @@ class StandardAutoscaler(object): # Attempt to recover unhealthy nodes for node_id in nodes: - self.recover_if_needed(node_id) + self.recover_if_needed(node_id, now) def reload_config(self, errors_fatal=False): try: @@ -488,14 +490,14 @@ class StandardAutoscaler(object): return False return True - def recover_if_needed(self, node_id): + def recover_if_needed(self, node_id, now): if not self.can_update(node_id): return key = self.provider.internal_ip(node_id) if key not in self.load_metrics.last_heartbeat_time_by_ip: - self.load_metrics.last_heartbeat_time_by_ip[key] = time.time() + self.load_metrics.last_heartbeat_time_by_ip[key] = now last_heartbeat_time = self.load_metrics.last_heartbeat_time_by_ip[key] - delta = time.time() - last_heartbeat_time + delta = now - last_heartbeat_time if delta < AUTOSCALER_HEARTBEAT_TIMEOUT_S: return logger.warning("StandardAutoscaler: No heartbeat from node "