mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 11:53:32 +08:00
[autoscaler] Remove faulty assert that breaks during downscaling, pull configs from env (#2006)
* fixes * comment out test * Update ray_constants.py * Update autoscaler_test.py * Update ray_constants.py * lint * lint
This commit is contained in:
@@ -142,6 +142,7 @@ class LoadMetrics(object):
|
||||
def prune(mapping):
|
||||
unwanted = set(mapping) - active_ips
|
||||
for unwanted_key in unwanted:
|
||||
print("Removed mapping", unwanted_key, mapping[unwanted_key])
|
||||
del mapping[unwanted_key]
|
||||
if unwanted:
|
||||
print("Removed {} stale ip mappings: {} not in {}".format(
|
||||
@@ -454,9 +455,8 @@ class StandardAutoscaler(object):
|
||||
TAG_RAY_NODE_STATUS: "Uninitialized",
|
||||
TAG_RAY_LAUNCH_CONFIG: self.launch_hash,
|
||||
}, count)
|
||||
# TODO(ekl) be less conservative in this check
|
||||
assert len(self.workers()) > num_before, \
|
||||
"Num nodes failed to increase after creating a new node"
|
||||
if len(self.workers()) <= num_before:
|
||||
print("Warning: Num nodes failed to increase after node creation")
|
||||
|
||||
def workers(self):
|
||||
return self.provider.nodes(tag_filters={
|
||||
|
||||
@@ -3,19 +3,30 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
"""Ray constants used in the Python code."""
|
||||
|
||||
import os
|
||||
|
||||
|
||||
def env_integer(key, default):
    """Return the integer value of the environment variable ``key``.

    Args:
        key: Name of the environment variable to look up.
        default: Value returned unchanged when ``key`` is not set.

    Returns:
        ``int(os.environ[key])`` when the variable is set, else ``default``.

    Raises:
        ValueError: If the variable is set but its value is not a valid
            integer literal.
    """
    if key in os.environ:
        # os.environ is a mapping, not a callable: it must be indexed with
        # [], otherwise every configured override raises TypeError.
        return int(os.environ[key])
    return default
|
||||
|
||||
|
||||
# Abort autoscaling if more than this number of errors are encountered. This
|
||||
# is a safety feature to prevent e.g. runaway node launches.
|
||||
AUTOSCALER_MAX_NUM_FAILURES = 5
|
||||
AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)
|
||||
|
||||
# Max number of nodes to launch at a time.
|
||||
AUTOSCALER_MAX_CONCURRENT_LAUNCHES = 10
|
||||
AUTOSCALER_MAX_CONCURRENT_LAUNCHES = env_integer(
|
||||
"AUTOSCALER_MAX_CONCURRENT_LAUNCHES", 10)
|
||||
|
||||
# Interval at which to perform autoscaling updates.
|
||||
AUTOSCALER_UPDATE_INTERVAL_S = 5
|
||||
AUTOSCALER_UPDATE_INTERVAL_S = env_integer("AUTOSCALER_UPDATE_INTERVAL_S", 5)
|
||||
|
||||
# The autoscaler will attempt to restart Ray on nodes it hasn't heard from
|
||||
# in more than this interval.
|
||||
AUTOSCALER_HEARTBEAT_TIMEOUT_S = 30
|
||||
AUTOSCALER_HEARTBEAT_TIMEOUT_S = env_integer("AUTOSCALER_HEARTBEAT_TIMEOUT_S",
|
||||
30)
|
||||
|
||||
# Max number of retries to AWS (default is 5, time increases exponentially)
|
||||
BOTO_MAX_RETRIES = 12
|
||||
BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)
|
||||
|
||||
Reference in New Issue
Block a user