[autoscaler] Switch to ARM for Azure deployment (#7717)

* switch to ARM templates for config and VMs

* switch to ARM templates for config and VMs

* auto-formatting

* addressed Scott's comment

* added missing imports

* fixed gpu templates
fixed wheel reference

* added missing reference

* cleanup wording and yamls

* Update doc/source/autoscaling.rst

Co-Authored-By: Scott Graham <5720537+gramhagen@users.noreply.github.com>

Co-authored-by: Ubuntu <marcozo@marcozodev2.zqvgrdyupqrudayw1il1agipig.jx.internal.cloudapp.net>
Co-authored-by: Scott Graham <5720537+gramhagen@users.noreply.github.com>
This commit is contained in:
Markus Cozowicz
2020-04-04 00:51:56 +02:00
committed by GitHub
parent 1d4823c0ec
commit b853df7a3b
9 changed files with 496 additions and 357 deletions
@@ -0,0 +1,81 @@
{
  "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
  "contentVersion": "1.0.0.0",
  "parameters": {
    "subnet": {
      "type": "string",
      "metadata": {
        "description": "The subnet to be used"
      }
    }
  },
  "variables": {
    "Contributor": "[concat('/subscriptions/', subscription().subscriptionId, '/providers/Microsoft.Authorization/roleDefinitions/', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]"
  },
  "resources": [
    {
      "type": "Microsoft.ManagedIdentity/userAssignedIdentities",
      "apiVersion": "2018-11-30",
      "name": "ray-msi-user-identity",
      "location": "[resourceGroup().location]"
    },
    {
      "type": "Microsoft.Authorization/roleAssignments",
      "apiVersion": "2018-09-01-preview",
      "name": "[guid(resourceGroup().id)]",
      "dependsOn": [
        "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', 'ray-msi-user-identity')]"
      ],
      "properties": {
        "roleDefinitionId": "[variables('Contributor')]",
        "principalId": "[reference('ray-msi-user-identity').principalId]",
        "principalType": "ServicePrincipal",
        "scope": "[resourceGroup().id]"
      }
    },
    {
      "type": "Microsoft.Network/networkSecurityGroups",
      "apiVersion": "2019-02-01",
      "name": "ray-nsg",
      "location": "[resourceGroup().location]",
      "properties": {
        "securityRules": [
          {
            "name": "SSH",
            "properties": {
              "priority": 1000,
              "direction": "Inbound",
              "access": "Allow",
              "protocol": "TCP",
              "sourceAddressPrefix": "*",
              "sourcePortRange": "*",
              "destinationAddressPrefix": "*",
              "destinationPortRange": "22"
            }
          }
        ]
      }
    },
    {
      "type": "Microsoft.Network/virtualNetworks",
      "apiVersion": "2019-11-01",
      "name": "ray-vnet",
      "location": "[resourceGroup().location]",
      "properties": {
        "addressSpace": {
          "addressPrefixes": [
            "[parameters('subnet')]"
          ]
        },
        "subnets": [
          {
            "name": "ray-subnet",
            "properties": {
              "addressPrefix": "[parameters('subnet')]"
            }
          }
        ]
      }
    }
  ]
}
@@ -0,0 +1,243 @@
{
  "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
  "contentVersion": "1.0.0.0",
  "parameters": {
    "vmName": {
      "type": "string",
      "metadata": {
        "description": "Base name of the Virtual Machines; the copy index is appended to it."
      }
    },
    "adminUsername": {
      "type": "string",
      "metadata": {
        "description": "Username for the Virtual Machine."
      }
    },
    "publicKey": {
      "type": "securestring",
      "metadata": {
        "description": "SSH public key written to the admin user's authorized_keys"
      }
    },
    "imagePublisher": {
      "type": "string",
      "metadata": {
        "description": "The publisher of the VM image"
      }
    },
    "imageOffer": {
      "type": "string",
      "metadata": {
        "description": "The offer of the VM image"
      }
    },
    "imageSku": {
      "type": "string",
      "metadata": {
        "description": "The sku of the VM image"
      }
    },
    "imageVersion": {
      "type": "string",
      "metadata": {
        "description": "The version of the VM image"
      }
    },
    "vmSize": {
      "type": "string",
      "metadata": {
        "description": "The size of the VM"
      }
    },
    "vmTags": {
      "type": "object",
      "metadata": {
        "description": "Tags for the VM"
      }
    },
    "vmCount": {
      "type": "int",
      "metadata": {
        "description": "Number of VMs to deploy"
      }
    },
    "provisionPublicIp": {
      "type": "bool",
      "defaultValue": true,
      "metadata": {
        "description": "If true, creates a public IP address for each VM"
      }
    }
  },
  "variables": {
    "publicIpAddressName": "[concat(parameters('vmName'), '-ip')]",
    "networkInterfaceNamePrivate": "[concat(parameters('vmName'), '-nic')]",
    "networkInterfaceNamePublic": "[concat(parameters('vmName'), '-nic-public')]",
    "networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]",
    "networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]",
    "subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]",
    "osDiskType": "Standard_LRS"
  },
  "resources": [
    {
      "type": "Microsoft.Network/networkInterfaces",
      "apiVersion": "2018-10-01",
      "name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]",
      "location": "[resourceGroup().location]",
      "condition": "[parameters('provisionPublicIp')]",
      "dependsOn": [
        "[resourceId('Microsoft.Network/publicIPAddresses', concat(variables('publicIpAddressName'), copyIndex()))]"
      ],
      "copy": {
        "name": "NICPublicCopy",
        "count": "[parameters('vmCount')]"
      },
      "properties": {
        "ipConfigurations": [
          {
            "name": "[variables('networkIpConfig')]",
            "properties": {
              "subnet": {
                "id": "[variables('subnetRef')]"
              },
              "privateIPAllocationMethod": "Dynamic",
              "publicIPAddress": {
                "id": "[resourceId('Microsoft.Network/publicIPAddresses', concat(variables('publicIpAddressName'), copyIndex()))]"
              }
            }
          }
        ],
        "networkSecurityGroup": {
          "id": "[resourceId('Microsoft.Network/networkSecurityGroups', 'ray-nsg')]"
        }
      }
    },
    {
      "type": "Microsoft.Network/networkInterfaces",
      "apiVersion": "2018-10-01",
      "name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]",
      "location": "[resourceGroup().location]",
      "condition": "[not(parameters('provisionPublicIp'))]",
      "copy": {
        "name": "NICPrivateCopy",
        "count": "[parameters('vmCount')]"
      },
      "properties": {
        "ipConfigurations": [
          {
            "name": "[variables('networkIpConfig')]",
            "properties": {
              "subnet": {
                "id": "[variables('subnetRef')]"
              },
              "privateIPAllocationMethod": "Dynamic"
            }
          }
        ],
        "networkSecurityGroup": {
          "id": "[resourceId('Microsoft.Network/networkSecurityGroups', 'ray-nsg')]"
        }
      }
    },
    {
      "type": "Microsoft.Network/publicIPAddresses",
      "apiVersion": "2019-02-01",
      "name": "[concat(variables('publicIpAddressName'), copyIndex())]",
      "location": "[resourceGroup().location]",
      "condition": "[parameters('provisionPublicIp')]",
      "copy": {
        "name": "PublicIpCopy",
        "count": "[parameters('vmCount')]"
      },
      "sku": {
        "name": "Basic",
        "tier": "Regional"
      },
      "properties": {
        "publicIPAllocationMethod": "Static",
        "publicIPAddressVersion": "IPv4"
      }
    },
    {
      "type": "Microsoft.Compute/virtualMachines",
      "apiVersion": "2019-03-01",
      "name": "[concat(parameters('vmName'), copyIndex())]",
      "location": "[resourceGroup().location]",
      "dependsOn": [
        "[resourceId('Microsoft.Network/networkInterfaces', concat(variables('networkInterfaceName'), copyIndex()))]"
      ],
      "copy": {
        "name": "VmCopy",
        "count": "[parameters('vmCount')]"
      },
      "tags": "[parameters('vmTags')]",
      "properties": {
        "hardwareProfile": {
          "vmSize": "[parameters('vmSize')]"
        },
        "storageProfile": {
          "osDisk": {
            "createOption": "FromImage",
            "managedDisk": {
              "storageAccountType": "[variables('osDiskType')]"
            }
          },
          "imageReference": {
            "publisher": "[parameters('imagePublisher')]",
            "offer": "[parameters('imageOffer')]",
            "sku": "[parameters('imageSku')]",
            "version": "[parameters('imageVersion')]"
          }
        },
        "networkProfile": {
          "networkInterfaces": [
            {
              "id": "[resourceId('Microsoft.Network/networkInterfaces', concat(variables('networkInterfaceName'), copyIndex()))]"
            }
          ]
        },
        "osProfile": {
          "computerName": "[concat(parameters('vmName'), copyIndex())]",
          "adminUsername": "[parameters('adminUsername')]",
          "linuxConfiguration": {
            "disablePasswordAuthentication": true,
            "ssh": {
              "publicKeys": [
                {
                  "path": "[concat('/home/', parameters('adminUsername'), '/.ssh/authorized_keys')]",
                  "keyData": "[parameters('publicKey')]"
                }
              ]
            }
          }
        }
      },
      "identity": {
        "type": "UserAssigned",
        "userAssignedIdentities": {
          "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', 'ray-msi-user-identity')]": {}
        }
      }
    }
  ],
  "outputs": {
    "publicIp": {
      "type": "array",
      "condition": "[parameters('provisionPublicIp')]",
      "copy": {
        "count": "[parameters('vmCount')]",
        "input": "[reference(concat(variables('publicIpAddressName'), copyIndex())).ipAddress]"
      }
    },
    "privateIp": {
      "type": "array",
      "copy": {
        "count": "[parameters('vmCount')]",
        "input": "[reference(concat(variables('networkInterfaceName'), copyIndex())).ipConfigurations[0].properties.privateIPAddress]"
      }
    }
  }
}
+27 -148
View File
@@ -1,14 +1,11 @@
import json
import logging
import random
import os
import time
import uuid
from azure.common.exceptions import CloudError, AuthenticationError
from azure.common.client_factory import get_client_from_cli_profile
from azure.mgmt.authorization import AuthorizationManagementClient
from azure.mgmt.network import NetworkManagementClient
from azure.mgmt.resource import ResourceManagementClient
from azure.mgmt.msi import ManagedServiceIdentityClient
from azure.mgmt.resource.resources.models import DeploymentMode
RETRIES = 30
MSI_NAME = "ray-msi-user-identity"
@@ -20,10 +17,8 @@ logger = logging.getLogger(__name__)
def bootstrap_azure(config):
config = _configure_resource_group(config)
config = _configure_msi_user(config)
config = _configure_key_pair(config)
config = _configure_network(config)
config = _configure_resource_group(config)
return config
@@ -59,67 +54,36 @@ def _configure_resource_group(config):
resource_client.resource_groups.create_or_update(
resource_group_name=resource_group, parameters=params)
return config
# load the template
template_path = os.path.join(
os.path.dirname(__file__), "azure-config-template.json")
with open(template_path, "r") as template_file_fd:
template = json.load(template_file_fd)
# choose a random subnet
random.seed(resource_group)
# start at 1 to avoid most likely collision at 0
parameters = {"subnet": "10.{}.0.0/16".format(random.randint(1, 254))}
def _configure_msi_user(config):
msi_client = _get_client(ManagedServiceIdentityClient, config)
resource_client = _get_client(ResourceManagementClient, config)
auth_client = _get_client(AuthorizationManagementClient, config)
deployment_properties = {
"mode": DeploymentMode.incremental,
"template": template,
"parameters": {k: {
"value": v
}
for k, v in parameters.items()}
}
resource_group = config["provider"]["resource_group"]
location = config["provider"]["location"]
resource_group_id = resource_client.resource_groups.get(resource_group).id
try:
identity = msi_client.user_assigned_identities.list_by_resource_group(
resource_group_name=resource_group,
filter="name eq '{}'".format(MSI_NAME)).next()
logger.info("Found MSI User Assigned Identity: %s", MSI_NAME)
except StopIteration:
logger.info("Creating MSI User Assigned Identity: %s", MSI_NAME)
identity = msi_client.user_assigned_identities.create_or_update(
resource_group_name=resource_group,
resource_name=MSI_NAME,
location=location)
identity_id = identity.id
principal_id = identity.principal_id
config["provider"]["msi_identity_id"] = identity_id
config["provider"]["msi_identity_principal_id"] = principal_id
# assign Contributor role for MSI User Identity to resource group
role_id = auth_client.role_definitions.list(
scope=resource_group_id, filter="roleName eq 'Contributor'").next().id
role_params = {"role_definition_id": role_id, "principal_id": principal_id}
for _ in range(RETRIES):
try:
filter_expr = "principalId eq '{}'".format(principal_id)
assignments = auth_client.role_assignments.list_for_scope(
scope=resource_group_id, filter=filter_expr)
if any(a.role_definition_id == role_id for a in assignments):
break
auth_client.role_assignments.create(
scope=resource_group_id,
role_assignment_name=uuid.uuid4(),
parameters=role_params)
logger.info("Assigning Contributor Role to MSI User")
except CloudError as ce:
if ce.inner_exception.error == "PrincipalNotFound":
time.sleep(5)
else:
raise Exception(
"Failed to create contributor role assignment (timeout)")
deployment_async_operation = resource_client.deployments.create_or_update(
resource_group, "ray-config", deployment_properties)
deployment_async_operation.wait()
return config
def _configure_key_pair(config):
ssh_user = config["auth"]["ssh_user"]
# search if the keys exist
for key_type in ["ssh_private_key", "ssh_public_key"]:
try:
key_path = os.path.expanduser(config["auth"][key_type])
@@ -135,93 +99,8 @@ def _configure_key_pair(config):
with open(key_path, "r") as f:
public_key = f.read()
os_profile = {
"admin_username": ssh_user,
"computer_name": None,
"linux_configuration": {
"disable_password_authentication": True,
"ssh": {
"public_keys": [{
"key_data": public_key,
"path": "/home/{}/.ssh/authorized_keys".format(ssh_user)
}]
}
}
}
for node_type in ["head_node", "worker_nodes"]:
config[node_type]["os_profile"] = os_profile
return config
def _configure_network(config):
# skip this if subnet is manually set in configuration yaml
if "subnet_id" in config["provider"]:
return config
location = config["provider"]["location"]
resource_group = config["provider"]["resource_group"]
network_client = _get_client(NetworkManagementClient, config)
vnets = []
for _ in range(RETRIES):
try:
vnets = list(
network_client.virtual_networks.list(
resource_group_name=resource_group,
filter="name eq '{}'".format(VNET_NAME)))
break
except CloudError:
time.sleep(1)
except AuthenticationError:
# wait for service principal authorization to populate
time.sleep(1)
# can't update vnet if subnet already exists
if not vnets:
# create vnet
logger.info("Creating/Updating VNet: %s", VNET_NAME)
vnet_params = {
"location": location,
"address_space": {
"address_prefixes": ["10.0.0.0/16"]
}
}
network_client.virtual_networks.create_or_update(
resource_group_name=resource_group,
virtual_network_name=VNET_NAME,
parameters=vnet_params).wait()
# create subnet
logger.info("Creating/Updating Subnet: %s", SUBNET_NAME)
subnet_params = {"address_prefix": "10.0.0.0/24"}
subnet = network_client.subnets.create_or_update(
resource_group_name=resource_group,
virtual_network_name=VNET_NAME,
subnet_name=SUBNET_NAME,
subnet_parameters=subnet_params).result()
config["provider"]["subnet_id"] = subnet.id
# create network security group
logger.info("Creating/Updating Network Security Group: %s", NSG_NAME)
nsg_params = {
"location": location,
"security_rules": [{
"protocol": "Tcp",
"source_port_range": "*",
"source_address_prefix": "*",
"destination_port_range": "22",
"destination_address_prefix": "*",
"access": "Allow",
"priority": 300,
"direction": "Inbound",
"name": "ssh_rule"
}]
}
network_client.network_security_groups.create_or_update(
resource_group_name=resource_group,
network_security_group_name=NSG_NAME,
parameters=nsg_params).wait()
config[node_type]["azure_arm_parameters"]["adminUsername"] = ssh_user
config[node_type]["azure_arm_parameters"]["publicKey"] = public_key
return config
+20 -50
View File
@@ -65,54 +65,25 @@ auth:
ssh_private_key: ~/.ssh/id_rsa
ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields
# The Azure Python SDK client expects slug_style property names
# For more documentation on available fields, see:
# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
# Note: the Azure Python SDK expects all parameter keys to be in slug_style
# the styles of parameter values are not changed
# Provider-specific config for the head node, e.g. instance type.
head_node:
hardware_profile:
vm_size: Standard_D2s_v3
storage_profile:
os_disk:
create_option: FromImage
caching: ReadWrite
image_reference:
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
publisher: microsoft-dsvm
offer: ubuntu-1804
sku: 1804-gen2
version: 20.02.01
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.02.01
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields
# Documentation on fields used can be found here:
# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
# Note: the Azure Python SDK expects all parameter keys to be in slug_style
# the styles of parameter values are not changed
# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
hardware_profile:
vm_size: Standard_F2s_v2
storage_profile:
os_disk:
create_option: FromImage
caching: ReadWrite
image_reference:
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
publisher: microsoft-dsvm
offer: ubuntu-1804
sku: 1804-gen
version: 20.02.01
# You can provision additional disk space as follows
# data_disks:
# - disk_size_gb: 1024
# run workers on spot instances by default
priority: Spot
eviction_policy: Deallocate
billing_profile:
max_price: -1
azure_arm_parameters:
vmSize: Standard_D2s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.02.01
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -133,10 +104,9 @@ setup_commands:
# Note: if you're developing Ray, you probably want to create a VM image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# change to use environment desired
#- echo "conda activate py37_pytorch" >> ~/.bashrc
#- echo "conda activate py37_tensorflow" >> ~/.bashrc
- pip install -U https://sdgraystorage.blob.core.windows.net/ray-wheels/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
@@ -144,7 +114,7 @@ setup_commands:
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install azure-cli-core azure-core azure-mgmt-authorization azure-mgmt-compute azure-mgmt-msi azure-mgmt-network
- pip install azure-cli-core azure-core azure-mgmt-authorization azure-mgmt-network azure-mgmt-compute azure-mgmt-msi
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
@@ -60,26 +60,16 @@ auth:
ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields
# The Azure Python SDK client expects slug_style property names
# For more documentation on available fields, see:
# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
# Note: the Azure Python SDK expects all parameter keys to be in slug_style
# the styles of parameter values are not changed
# Ray will auto-configure unspecified fields using example-full.yaml
head_node:
hardware_profile:
vm_size: Standard_NC6s_v3
azure_arm_parameters:
vmSize: Standard_NC6s_v3
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields
# The Azure Python SDK client expects slug_style property names
# For more documentation on available fields, see:
# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
# Note: the Azure Python SDK expects all parameter keys to be in slug_style
# the styles of parameter values are not changed
# Ray will auto-configure unspecified fields using example-full.yaml
worker_nodes:
hardware_profile:
vm_size: Standard_NC6s_v3
azure_arm_parameters:
vmSize: Standard_NC6s_v3
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -90,7 +80,7 @@ file_mounts: {
# List of shell commands to run to set up nodes.
setup_commands:
- pip install -U https://sdgraystorage.blob.core.windows.net/ray-wheels/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
+17 -44
View File
@@ -66,53 +66,26 @@ auth:
ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields
# The Azure Python SDK client expects slug_style property names
# For more documentation on available fields, see:
# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
# Note: the Azure Python SDK expects all parameter keys to be in slug_style
# the styles of parameter values are not changed
# Ray will auto-configure unspecified fields using example-full.yaml
head_node:
hardware_profile:
vm_size: Standard_NC6
storage_profile:
os_disk:
create_option: FromImage
caching: ReadWrite
image_reference:
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
publisher: microsoft-dsvm
offer: ubuntu-1804
sku: 1804
version: 20.02.01
azure_arm_parameters:
vmSize: Standard_NC6
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804
imageVersion: 20.02.01
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields
# Documentation on fields used can be found here:
# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
# Note: the Azure Python SDK expects all parameter keys to be in slug_style
# the styles of parameter values are not changed
# Ray will auto-configure unspecified fields using example-full.yaml
worker_nodes:
hardware_profile:
vm_size: Standard_NC6
storage_profile:
os_disk:
create_option: FromImage
caching: ReadWrite
image_reference:
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
publisher: microsoft-dsvm
offer: ubuntu-1804
sku: 1804
version: 20.02.01
# You can provision additional disk space as follows
# data_disks:
# - disk_size_gb: 1024
# run workers on spot instances by default
priority: Spot
eviction_policy: Deallocate
billing_profile:
max_price: -1
azure_arm_parameters:
vmSize: Standard_NC6
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804
imageVersion: 20.02.01
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -135,7 +108,7 @@ setup_commands:
# below with a git checkout <your_sha> (and possibly a recompile).
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://sdgraystorage.blob.core.windows.net/ray-wheels/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
+85 -88
View File
@@ -1,4 +1,6 @@
import json
import logging
import os
from threading import RLock
from uuid import uuid4
@@ -6,7 +8,9 @@ from azure.common.client_factory import get_client_from_cli_profile
from msrestazure.azure_active_directory import MSIAuthentication
from azure.mgmt.compute import ComputeManagementClient
from azure.mgmt.network import NetworkManagementClient
from azure.mgmt.compute.models import ResourceIdentityType
from azure.mgmt.resource import ResourceManagementClient
from azure.mgmt.resource.resources.models import DeploymentMode
from knack.util import CLIError
from ray.autoscaler.node_provider import NodeProvider
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME, TAG_RAY_NODE_NAME
@@ -50,15 +54,21 @@ class AzureNodeProvider(NodeProvider):
client_class=ComputeManagementClient, **kwargs)
self.network_client = get_client_from_cli_profile(
client_class=NetworkManagementClient, **kwargs)
except Exception:
logger.info(
"CLI profile authentication failed. Trying MSI", exc_info=True)
self.resource_client = get_client_from_cli_profile(
client_class=ResourceManagementClient, **kwargs)
except CLIError as e:
if str(e) != "Please run 'az login' to setup account.":
raise
else:
logger.info("CLI profile authentication failed. Trying MSI")
credentials = MSIAuthentication()
self.compute_client = ComputeManagementClient(
credentials=credentials, **kwargs)
self.network_client = NetworkManagementClient(
credentials=credentials, **kwargs)
credentials = MSIAuthentication()
self.compute_client = ComputeManagementClient(
credentials=credentials, **kwargs)
self.network_client = NetworkManagementClient(
credentials=credentials, **kwargs)
self.resource_client = ResourceManagementClient(
credentials=credentials, **kwargs)
self.lock = RLock()
@@ -164,79 +174,43 @@ class AzureNodeProvider(NodeProvider):
def create_node(self, node_config, tags, count):
"""Creates a number of nodes within the namespace."""
# TODO: restart deallocated nodes if possible
location = self.provider_config["location"]
resource_group = self.provider_config["resource_group"]
subnet_id = self.provider_config["subnet_id"]
config = node_config.copy()
config_tags = config.get("tags", {})
# load the template
template_path = os.path.join(
os.path.dirname(__file__), "azure-vm-template.json")
with open(template_path, "r") as template_file_fd:
template = json.load(template_file_fd)
# get the tags
config_tags = node_config.get("tags", {}).copy()
config_tags.update(tags)
config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
config["tags"] = config_tags
config["location"] = location
name_tag = config_tags.get(TAG_RAY_NODE_NAME, "node")
unique_id = uuid4().hex[:VM_NAME_UUID_LEN]
for _ in range(count):
unique_id = uuid4().hex[:VM_NAME_UUID_LEN]
vm_name = "{name}-{id}".format(name=name_tag, id=unique_id)
config["os_profile"]["computer_name"] = vm_name
parameters = node_config["azure_arm_parameters"].copy()
parameters["vmName"] = "{name}-{id}".format(
name=name_tag, id=unique_id)
parameters["provisionPublicIp"] = not self.provider_config.get(
"use_internal_ips", False)
parameters["vmTags"] = config_tags
parameters["vmCount"] = count
try:
assert len(vm_name) <= VM_NAME_MAX_LEN
except AssertionError as e:
e.args += ("name", vm_name)
raise
ip_configuration = {"name": uuid4(), "subnet": {"id": subnet_id}}
if not self.provider_config.get("use_internal_ips", False):
# create public ip address
public_ip_addess_params = {
"location": location,
"public_ip_allocation_method": "Dynamic"
}
public_ip_address = (
self.network_client.public_ip_addresses.create_or_update(
resource_group_name=resource_group,
public_ip_address_name="{}-ip".format(vm_name),
parameters=public_ip_addess_params).result())
ip_configuration["public_ip_address"] = public_ip_address
nic_params = {
"location": location,
"ip_configurations": [ip_configuration]
deployment_properties = {
"mode": DeploymentMode.incremental,
"template": template,
"parameters": {k: {
"value": v
}
nic = self.network_client.network_interfaces.create_or_update(
resource_group_name=resource_group,
network_interface_name="{}-nic".format(vm_name),
parameters=nic_params).result()
for k, v in parameters.items()}
}
# update vm config with network parameters
config["network_profile"] = {
"network_interfaces": [{
"id": nic.id
}]
}
config["identity"] = {
"type": ResourceIdentityType.user_assigned,
"user_assigned_identities": [{
# zero-documentation.. *sigh*
"key": self.provider_config["msi_identity_id"],
"value": {
"principal_id": self.provider_config[
"msi_identity_principal_id"],
"client_id": self.provider_config["msi_identity_id"]
}
}]
}
# TODO: do we need to wait or fire and forget is fine?
self.compute_client.virtual_machines.create_or_update(
resource_group_name=self.provider_config["resource_group"],
vm_name=vm_name,
parameters=config)
# TODO: we could get the private/public ips back directly
self.resource_client.deployments.create_or_update(
resource_group, "ray-vm-{}".format(name_tag),
deployment_properties).wait()
@synchronized
def set_node_tags(self, node_id, tags):
@@ -252,34 +226,57 @@ class AzureNodeProvider(NodeProvider):
def terminate_node(self, node_id):
"""Terminates the specified node. This will delete the VM and
associated resources (NIC, IP, Storage) for the specified node."""
# self.compute_client.virtual_machines.deallocate(
# resource_group_name=self.provider_config["resource_group"],
# vm_name=node_id)
resource_group = self.provider_config["resource_group"]
nodes = self._get_filtered_nodes(
tag_filters={TAG_RAY_CLUSTER_NAME: self.cluster_name})
for node, metadata in nodes.items():
# gather disks to delete later
vm = self.compute_client.virtual_machines.get(
resource_group_name=resource_group, vm_name=node)
disks = {d.name for d in vm.storage_profile.data_disks}
disks.add(vm.storage_profile.os_disk.name)
try:
# get metadata for node
metadata = self._get_node(node_id)
except KeyError:
# node no longer exists
return
# TODO: deallocate instead of delete to allow possible reuse
# self.compute_client.virtual_machines.deallocate(
# resource_group_name=resource_group,
# vm_name=node_id)
# gather disks to delete later
vm = self.compute_client.virtual_machines.get(
resource_group_name=resource_group, vm_name=node_id)
disks = {d.name for d in vm.storage_profile.data_disks}
disks.add(vm.storage_profile.os_disk.name)
try:
# delete machine, must wait for this to complete
self.compute_client.virtual_machines.delete(
resource_group_name=resource_group, vm_name=node).wait()
resource_group_name=resource_group, vm_name=node_id).wait()
except Exception as e:
logger.warning("Failed to delete VM: {}".format(e))
try:
# delete nic
self.network_client.network_interfaces.delete(
resource_group_name=resource_group,
network_interface_name=metadata["nic_name"])
# delete ip address
if "public_ip_name" in metadata:
except Exception as e:
logger.warning("Failed to delete nic: {}".format(e))
# delete ip address
if "public_ip_name" in metadata:
try:
self.network_client.public_ip_addresses.delete(
resource_group_name=resource_group,
public_ip_address_name=metadata["public_ip_name"])
# delete disks
for disk in disks:
except Exception as e:
logger.warning("Failed to delete public ip: {}".format(e))
# delete disks
for disk in disks:
try:
self.compute_client.disks.delete(
resource_group_name=resource_group, disk_name=disk)
except Exception as e:
logger.warning("Failed to delete disk: {}".format(e))
def _get_node(self, node_id):
self._get_filtered_nodes({}) # Side effect: updates cache
+2
View File
@@ -42,6 +42,8 @@ optional_ray_files = []
ray_autoscaler_files = [
"ray/autoscaler/aws/example-full.yaml",
"ray/autoscaler/azure/example-full.yaml",
"ray/autoscaler/azure/azure-vm-template.json",
"ray/autoscaler/azure/azure-config-template.json",
"ray/autoscaler/gcp/example-full.yaml",
"ray/autoscaler/local/example-full.yaml",
"ray/autoscaler/kubernetes/example-full.yaml",