From b8811cbe3418ab0d3ea10deaa54947d5bb26cecf Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 16 Jan 2018 16:11:09 -0800 Subject: [PATCH] [autoscaling] increase connect timeout, boto retries, and check subnet conf (#1422) * some autoscaling config tweaks * Sun Jan 14 13:56:55 PST 2018 * Mon Jan 15 14:21:09 PST 2018 * increase backoff * Mon Jan 15 14:40:47 PST 2018 * check boto version --- doc/source/autoscaling.rst | 2 +- python/ray/autoscaler/aws/config.py | 40 ++++++++++++++++------ python/ray/autoscaler/aws/node_provider.py | 6 +++- python/ray/autoscaler/updater.py | 6 ++-- python/ray/ray_constants.py | 3 ++ 5 files changed, 41 insertions(+), 16 deletions(-) diff --git a/doc/source/autoscaling.rst b/doc/source/autoscaling.rst index a882abb75..bfe16d3fa 100644 --- a/doc/source/autoscaling.rst +++ b/doc/source/autoscaling.rst @@ -41,7 +41,7 @@ Autoscaling Ray clusters come with a load-based auto-scaler. When cluster resource usage exceeds a configurable threshold (80% by default), new nodes will be launched up the specified ``max_workers`` limit. When nodes are idle for more than a timeout, they will be removed, down to the ``min_workers`` limit. The head node is never removed. -The default idle timeout is 5 minutes. This is because in AWS there is a minimum billing charge of 5 minutes per instance, after which usage is billed by the second. +The default idle timeout is 5 minutes. This is to prevent excessive node churn which could impact performance and increase costs (in AWS there is a minimum billing charge of 1 minute per instance, after which usage is billed by the second). Monitoring cluster status ------------------------- diff --git a/python/ray/autoscaler/aws/config.py b/python/ray/autoscaler/aws/config.py index 765bc8570..616f13f79 100644 --- a/python/ray/autoscaler/aws/config.py +++ b/python/ray/autoscaler/aws/config.py @@ -2,26 +2,35 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from distutils.version import StrictVersion import json import logging import os import time import boto3 +from botocore.config import Config + +from ray.ray_constants import BOTO_MAX_RETRIES RAY = "ray-autoscaler" DEFAULT_RAY_INSTANCE_PROFILE = RAY DEFAULT_RAY_IAM_ROLE = RAY SECURITY_GROUP_TEMPLATE = RAY + "-{}" +assert StrictVersion(boto3.__version__) >= StrictVersion("1.4.8"), \ + "Boto3 version >= 1.4.8 required, try `pip install -U boto3`" -def key_pair(i): + +def key_pair(i, region): """Returns the ith default (aws_key_pair_name, key_pair_path).""" if i == 0: - return RAY, os.path.expanduser("~/.ssh/{}.pem".format(RAY)) + return ( + "{}_{}".format(RAY, region), + os.path.expanduser("~/.ssh/{}_{}.pem".format(RAY, region))) return ( - "{}_{}".format(RAY, i), - os.path.expanduser("~/.ssh/{}_{}.pem".format(RAY, i))) + "{}_{}_{}".format(RAY, i, region), + os.path.expanduser("~/.ssh/{}_{}_{}.pem".format(RAY, i, region))) # Suppress excessive connection dropped logs from boto @@ -103,7 +112,7 @@ def _configure_key_pair(config): # Try a few times to get or create a good key pair. for i in range(10): - key_name, key_path = key_pair(i) + key_name, key_path = key_pair(i, config["provider"]["region"]) key = _get_key(key_name, config) # Found a good key. @@ -135,14 +144,16 @@ def _configure_key_pair(config): def _configure_subnet(config): ec2 = _resource("ec2", config) subnets = sorted( - [s for s in ec2.subnets.all() if s.state == "available"], + [s for s in ec2.subnets.all() + if s.state == "available" and s.map_public_ip_on_launch], reverse=True, # sort from Z-A key=lambda subnet: subnet.availability_zone) if not subnets: raise Exception( - "No subnets found, try manually creating an instance in " + "No usable subnets found, try manually creating an instance in " "your specified region to populate the list of subnets " - "and trying this again.") + "and trying this again. Note that the subnet must map public IPs " + "on instance launch.") if "availability_zone" in config["provider"]: default_subnet = next((s for s in subnets if s.availability_zone == @@ -150,7 +161,7 @@ def _configure_subnet(config): None) if not default_subnet: raise Exception( - "No available subnets matching availability zone {} " + "No usable subnets matching availability zone {} " "found. Choose a different availability zone or try " "manually creating an instance in your specified region " "to populate the list of subnets and trying this again." @@ -159,11 +170,15 @@ def _configure_subnet(config): default_subnet = subnets[0] if "SubnetId" not in config["head_node"]: + assert default_subnet.map_public_ip_on_launch, \ + "The chosen subnet must map nodes with public IPs on launch" config["head_node"]["SubnetId"] = default_subnet.id print("SubnetId not specified for head node, using {} in {}".format( default_subnet.id, default_subnet.availability_zone)) if "SubnetId" not in config["worker_nodes"]: + assert default_subnet.map_public_ip_on_launch, \ + "The chosen subnet must map nodes with public IPs on launch" config["worker_nodes"]["SubnetId"] = default_subnet.id print("SubnetId not specified for workers, using {} in {}".format( default_subnet.id, default_subnet.availability_zone)) @@ -260,8 +275,11 @@ def _get_key(key_name, config): def _client(name, config): - return boto3.client(name, config["provider"]["region"]) + boto_config = Config(retries=dict(max_attempts=BOTO_MAX_RETRIES)) + return boto3.client(name, config["provider"]["region"], config=boto_config) def _resource(name, config): - return boto3.resource(name, config["provider"]["region"]) + boto_config = Config(retries=dict(max_attempts=BOTO_MAX_RETRIES)) + return boto3.resource( + name, config["provider"]["region"], config=boto_config) diff --git a/python/ray/autoscaler/aws/node_provider.py b/python/ray/autoscaler/aws/node_provider.py index ca6576ac7..ac91593a3 100644 --- a/python/ray/autoscaler/aws/node_provider.py +++ b/python/ray/autoscaler/aws/node_provider.py @@ -3,15 +3,19 @@ from __future__ import division from __future__ import print_function import boto3 +from botocore.config import Config from ray.autoscaler.node_provider import NodeProvider from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME +from ray.ray_constants import BOTO_MAX_RETRIES class AWSNodeProvider(NodeProvider): def __init__(self, provider_config, cluster_name): NodeProvider.__init__(self, provider_config, cluster_name) - self.ec2 = boto3.resource("ec2", region_name=provider_config["region"]) + config = Config(retries=dict(max_attempts=BOTO_MAX_RETRIES)) + self.ec2 = boto3.resource( + "ec2", region_name=provider_config["region"], config=config) # Cache of node objects from the last nodes() call. This avoids # excessive DescribeInstances requests. diff --git a/python/ray/autoscaler/updater.py b/python/ray/autoscaler/updater.py index d200a522b..5b8fbd066 100644 --- a/python/ray/autoscaler/updater.py +++ b/python/ray/autoscaler/updater.py @@ -90,7 +90,7 @@ class NodeUpdater(object): self.ssh_ip = self.provider.external_ip(self.node_id) if self.ssh_ip is not None: break - time.sleep(5) + time.sleep(10) assert self.ssh_ip is not None, "Unable to find IP of node" # Wait for SSH access @@ -135,7 +135,7 @@ class NodeUpdater(object): "mkdir -p {}".format(os.path.dirname(remote_path))) self.process_runner.check_call([ "rsync", "-e", "ssh -i {} ".format(self.ssh_private_key) + - "-o ConnectTimeout=60s -o StrictHostKeyChecking=no", + "-o ConnectTimeout=120s -o StrictHostKeyChecking=no", "--delete", "-avz", "{}".format(local_path), "{}@{}:{}".format(self.ssh_user, self.ssh_ip, remote_path) ], stdout=self.stdout, stderr=self.stderr) @@ -146,7 +146,7 @@ class NodeUpdater(object): for cmd in self.setup_cmds: self.ssh_cmd(cmd, verbose=True) - def ssh_cmd(self, cmd, connect_timeout=60, redirect=None, verbose=False): + def ssh_cmd(self, cmd, connect_timeout=120, redirect=None, verbose=False): if verbose: print( "NodeUpdater: running {} on {}...".format( diff --git a/python/ray/ray_constants.py b/python/ray/ray_constants.py index 30e239cf0..2ce692c61 100644 --- a/python/ray/ray_constants.py +++ b/python/ray/ray_constants.py @@ -18,3 +18,6 @@ AUTOSCALER_UPDATE_INTERVAL_S = 5 # The autoscaler will attempt to restart Ray on nodes it hasn't heard from # in more than this interval. AUTOSCALER_HEARTBEAT_TIMEOUT_S = 30 + +# Max number of retries to AWS (default is 5, time increases exponentially) +BOTO_MAX_RETRIES = 12