[autoscaler] Azure deployment fixes (#11613)

Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
This commit is contained in:
Scott Graham
2020-10-27 18:27:18 -04:00
committed by GitHub
parent 293483ed0b
commit c4ae94d60b
14 changed files with 108 additions and 77 deletions
@@ -10,18 +10,19 @@
}
},
"variables": {
"Contributor": "[concat('/subscriptions/', subscription().subscriptionId, '/providers/Microsoft.Authorization/roleDefinitions/', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]"
"Contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
"location": "[resourceGroup().location]"
},
"resources": [
{
"type": "Microsoft.ManagedIdentity/userAssignedIdentities",
"apiVersion": "2018-11-30",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"name": "ray-msi-user-identity"
},
{
"type": "Microsoft.Authorization/roleAssignments",
"apiVersion": "2018-09-01-preview",
"apiVersion": "2020-04-01-preview",
"name": "[guid(resourceGroup().id)]",
"properties": {
"principalId": "[reference('ray-msi-user-identity').principalId]",
@@ -37,7 +38,7 @@
"type": "Microsoft.Network/networkSecurityGroups",
"apiVersion": "2019-02-01",
"name": "ray-nsg",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"properties": {
"securityRules": [
{
@@ -60,7 +61,7 @@
"type": "Microsoft.Network/virtualNetworks",
"apiVersion": "2019-11-01",
"name": "ray-vnet",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"properties": {
"addressSpace": {
"addressPrefixes": [
@@ -85,20 +85,21 @@
}
},
"variables": {
"publicIpAddressName": "[concat(parameters('vmName'), '-ip' )]",
"location": "[resourceGroup().location]",
"networkInterfaceNamePrivate": "[concat(parameters('vmName'),'-nic')]",
"networkInterfaceNamePublic": "[concat(parameters('vmName'),'-nic-public')]",
"networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]",
"networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]",
"subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]",
"osDiskType": "Standard_LRS"
"osDiskType": "Standard_LRS",
"publicIpAddressName": "[concat(parameters('vmName'), '-ip' )]",
"subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]"
},
"resources": [
{
"type": "Microsoft.Network/networkInterfaces",
"apiVersion": "2018-10-01",
"apiVersion": "2020-06-01",
"name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"dependsOn": [
"[resourceId('Microsoft.Network/publicIpAddresses/', concat(variables('publicIpAddressName'), copyIndex()))]"
],
@@ -129,9 +130,9 @@
},
{
"type": "Microsoft.Network/networkInterfaces",
"apiVersion": "2018-10-01",
"apiVersion": "2020-06-01",
"name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"copy": {
"name": "NICPrivateCopy",
"count": "[parameters('vmCount')]"
@@ -158,7 +159,7 @@
"type": "Microsoft.Network/publicIpAddresses",
"apiVersion": "2019-02-01",
"name": "[concat(variables('publicIpAddressName'), copyIndex())]",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"properties": {
"publicIpAllocationMethod": "Static",
"publicIPAddressVersion": "IPv4"
@@ -177,7 +178,7 @@
"type": "Microsoft.Compute/virtualMachines",
"apiVersion": "2019-03-01",
"name": "[concat(parameters('vmName'), copyIndex())]",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"dependsOn": [
"[resourceId('Microsoft.Network/networkInterfaces/', concat(variables('networkInterfaceName'), copyIndex()))]"
],
@@ -1,7 +1,7 @@
import json
import logging
from pathlib import Path
import random
import os
from azure.common.client_factory import get_client_from_cli_profile
from azure.mgmt.resource import ResourceManagementClient
@@ -55,8 +55,8 @@ def _configure_resource_group(config):
resource_group_name=resource_group, parameters=params)
# load the template file
current_path = os.path.dirname(os.path.abspath(__file__))
template_path = os.path.join(current_path, "azure-config-template.json")
current_path = Path(__file__).parent
template_path = current_path.joinpath("azure-config-template.json")
with open(template_path, "r") as template_fp:
template = json.load(template_fp)
@@ -86,16 +86,17 @@ def _configure_resource_group(config):
def _configure_key_pair(config):
ssh_user = config["auth"]["ssh_user"]
public_key = None
# search if the keys exist
for key_type in ["ssh_private_key", "ssh_public_key"]:
try:
key_path = os.path.expanduser(config["auth"][key_type])
key_path = Path(config["auth"][key_type]).expanduser()
except KeyError:
raise Exception("Config must define {}".format(key_type))
except TypeError:
raise Exception("Invalid config value for {}".format(key_type))
assert os.path.exists(key_path), (
assert key_path.is_file(), (
"Could not find ssh key: {}".format(key_path))
if key_type == "ssh_public_key":
@@ -1,6 +1,6 @@
import json
import logging
import os
from pathlib import Path
from threading import RLock
from uuid import uuid4
@@ -178,8 +178,8 @@ class AzureNodeProvider(NodeProvider):
resource_group = self.provider_config["resource_group"]
# load the template file
current_path = os.path.dirname(os.path.abspath(__file__))
template_path = os.path.join(current_path, "azure-vm-template.json")
current_path = Path(__file__).parent
template_path = current_path.joinpath("azure-vm-template.json")
with open(template_path, "r") as template_fp:
template = json.load(template_fp)
+5 -2
View File
@@ -49,6 +49,7 @@ auth:
# you must specify paths to matching private and public key pair files
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
ssh_private_key: ~/.ssh/id_rsa
# changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub
# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
@@ -64,7 +65,7 @@ head_node:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.02.01
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
@@ -74,7 +75,7 @@ worker_nodes:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.02.01
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
@@ -86,6 +87,7 @@ worker_nodes:
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
}
# Files or directories to copy from the head node to the worker nodes. The format is a
@@ -118,6 +120,7 @@ setup_commands:
# Note: if you're developing Ray, you probably want to create a custom VM image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
@@ -62,6 +62,7 @@ auth:
# you must specify paths to matching private and public key pair files
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
ssh_private_key: ~/.ssh/id_rsa
# changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub
# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
@@ -77,7 +78,7 @@ head_node:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.02.01
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
@@ -87,7 +88,7 @@ worker_nodes:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.02.01
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
@@ -99,6 +100,7 @@ worker_nodes:
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
}
# Files or directories to copy from the head node to the worker nodes. The format is a
@@ -130,12 +132,14 @@ initialization_commands:
- touch ~/.sudo_as_admin_successful
# List of shell commands to run to set up nodes.
setup_commands: []
setup_commands:
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly build of Ray (as opposed to the latest release)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
@@ -52,6 +52,7 @@ auth:
# you must specify paths to matching private and public key pair files
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
ssh_private_key: ~/.ssh/id_rsa
# changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default
@@ -71,6 +72,7 @@ worker_nodes:
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
}
# List of shell commands to run to set up nodes.
+5 -2
View File
@@ -60,6 +60,7 @@ auth:
# you must specify paths to matching private and public key pair files
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
ssh_private_key: ~/.ssh/id_rsa
# changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default
@@ -71,7 +72,7 @@ head_node:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: "1804"
imageVersion: 20.02.01
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields using defaults.yaml
@@ -82,13 +83,14 @@ worker_nodes:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: "1804"
imageVersion: 20.02.01
imageVersion: 20.07.06
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
}
# List of commands that will be run before `setup_commands`. If docker is
@@ -103,6 +105,7 @@ setup_commands:
# Note: if you're developing Ray, you probably want to create a custom VM image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
@@ -17,4 +17,5 @@ auth:
# you must specify paths to matching private and public key pair files
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
ssh_private_key: ~/.ssh/id_rsa
# changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub
+15 -9
View File
@@ -71,19 +71,23 @@ generated_python_directories = [
optional_ray_files = ["ray/nightly-wheels.yaml"]
ray_autoscaler_files = [
"ray/autoscaler/aws/defaults.yaml", "ray/autoscaler/azure/defaults.yaml",
"ray/autoscaler/azure/azure-vm-template.json",
"ray/autoscaler/azure/azure-config-template.json",
"ray/autoscaler/gcp/defaults.yaml", "ray/autoscaler/local/defaults.yaml",
"ray/autoscaler/aws/defaults.yaml",
"ray/autoscaler/azure/defaults.yaml",
"ray/autoscaler/_private/azure/azure-vm-template.json",
"ray/autoscaler/_private/azure/azure-config-template.json",
"ray/autoscaler/gcp/defaults.yaml",
"ray/autoscaler/local/defaults.yaml",
"ray/autoscaler/kubernetes/defaults.yaml",
"ray/autoscaler/kubernetes/kubectl-rsync.sh",
"ray/autoscaler/staroid/defaults.yaml", "ray/autoscaler/ray-schema.json"
"ray/autoscaler/staroid/defaults.yaml",
"ray/autoscaler/ray-schema.json",
]
ray_project_files = [
"ray/projects/schema.json", "ray/projects/templates/cluster_template.yaml",
"ray/projects/schema.json",
"ray/projects/templates/cluster_template.yaml",
"ray/projects/templates/project_template.yaml",
"ray/projects/templates/requirements.txt"
"ray/projects/templates/requirements.txt",
]
ray_dashboard_files = [
@@ -105,8 +109,10 @@ extras = {
"dataclasses; python_version < '3.7'"
],
"tune": [
"tabulate", "tensorboardX", "pandas",
"dataclasses; python_version < '3.7'"
"dataclasses; python_version < '3.7'",
"pandas",
"tabulate",
"tensorboardX",
]
}