diff --git a/doc/source/autoscaling.rst b/doc/source/autoscaling.rst
index 5043609dc..96b3b60b6 100644
--- a/doc/source/autoscaling.rst
+++ b/doc/source/autoscaling.rst
@@ -59,8 +59,12 @@ Test that it works by running the following commands from your local machine:
# Get a remote screen on the head node.
$ ray attach ray/python/ray/autoscaler/azure/example-full.yaml
- $ source activate tensorflow_p36
- $ # Try running a Ray program with 'ray.init(address="auto")'.
+ # Test the Ray setup.
+ # Start a login shell so that the conda environment can be activated.
+ $ exec bash -l
+ $ conda activate py37_tensorflow
+ $ python -c 'import ray; ray.init(address="auto")'
+ $ exit
# Tear down the cluster.
$ ray down ray/python/ray/autoscaler/azure/example-full.yaml
@@ -69,26 +73,26 @@ Azure Portal
Alternatively, you can deploy a cluster using Azure portal directly. Please note that auto scaling is done using Azure VM Scale Sets and not through
the Ray autoscaler. This will deploy `Azure Data Science VMs (DSVM) `_
-for both the head node and an auto-scale cluster managed by `Azure Virtual Machine Scale Sets `_.
-The head node conviently exposes both SSH as well as JupyterLab.
+for both the head node and the auto-scalable cluster managed by `Azure Virtual Machine Scale Sets `_.
+The head node conveniently exposes both SSH as well as JupyterLab.
.. image:: https://aka.ms/deploytoazurebutton
:target: https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fray-project%2Fray%2Fmaster%2Fdoc%2Fazure%2Fazure-ray-template.json
:alt: Deploy to Azure
-Once the template is successfully deploy the deployment output page provides the ssh command to connect and the link to the JupyterHub on the head node (username/password as specified on the template input).
-Use the following code connect to the Ray cluster.
+Once the template is successfully deployed, the deployment output page provides the SSH command to connect and the link to the JupyterHub on the head node (username/password as specified on the template input).
+Use the following code in a Jupyter notebook to connect to the Ray cluster.
.. code-block:: python
import ray
ray.init(address='auto')
-Note that on each node the `azure-init.sh `_ script is executed and performs
+Note that on each node the `azure-init.sh `_ script is executed and performs the following actions:
-1. activate one of the conda environments available on DSVM
-2. install Ray and any other user-specified dependencies
-3. setup of a systemd task (``/lib/systemd/system/ray.service``) which starting ray in head or worker mode
+1. Activates one of the conda environments available on DSVM
+2. Installs Ray and any other user-specified dependencies
+3. Sets up a systemd task (``/lib/systemd/system/ray.service``) to start Ray in head or worker mode
GCP
~~~
diff --git a/python/ray/autoscaler/azure/azure-config-template.json b/python/ray/autoscaler/azure/azure-config-template.json
new file mode 100644
index 000000000..65c671c82
--- /dev/null
+++ b/python/ray/autoscaler/azure/azure-config-template.json
@@ -0,0 +1,81 @@
+{
+ "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
+ "contentVersion": "1.0.0.0",
+ "parameters": {
+ "subnet": {
+ "type": "string",
+ "metadata": {
+ "description": "The subnet to be used"
+ }
+ }
+ },
+ "variables": {
+ "Contributor": "[concat('/subscriptions/', subscription().subscriptionId, '/providers/Microsoft.Authorization/roleDefinitions/', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]"
+ },
+ "resources": [
+ {
+ "type": "Microsoft.ManagedIdentity/userAssignedIdentities",
+ "apiVersion": "2018-11-30",
+ "location": "[resourceGroup().location]",
+ "name": "ray-msi-user-identity"
+ },
+ {
+ "type": "Microsoft.Authorization/roleAssignments",
+ "apiVersion": "2018-09-01-preview",
+ "name": "[guid(resourceGroup().id)]",
+ "properties": {
+ "principalId": "[reference('ray-msi-user-identity').principalId]",
+ "roleDefinitionId": "[variables('Contributor')]",
+ "scope": "[resourceGroup().id]",
+ "principalType": "ServicePrincipal"
+ },
+ "dependsOn": [
+ "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', 'ray-msi-user-identity')]"
+ ]
+ },
+ {
+ "type": "Microsoft.Network/networkSecurityGroups",
+ "apiVersion": "2019-02-01",
+ "name": "ray-nsg",
+ "location": "[resourceGroup().location]",
+ "properties": {
+ "securityRules": [
+ {
+ "name": "SSH",
+ "properties": {
+ "priority": 1000,
+ "protocol": "TCP",
+ "access": "Allow",
+ "direction": "Inbound",
+ "sourceAddressPrefix": "*",
+ "sourcePortRange": "*",
+ "destinationAddressPrefix": "*",
+ "destinationPortRange": "22"
+ }
+ }
+ ]
+ }
+ },
+ {
+ "type": "Microsoft.Network/virtualNetworks",
+ "apiVersion": "2019-11-01",
+ "name": "ray-vnet",
+ "location": "[resourceGroup().location]",
+ "properties": {
+ "addressSpace": {
+ "addressPrefixes": [
+ "[parameters('subnet')]"
+ ]
+ },
+ "subnets": [
+ {
+ "name": "ray-subnet",
+ "properties": {
+ "addressPrefix": "[parameters('subnet')]"
+ }
+ }
+ ]
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/python/ray/autoscaler/azure/azure-vm-template.json b/python/ray/autoscaler/azure/azure-vm-template.json
new file mode 100644
index 000000000..9410d87db
--- /dev/null
+++ b/python/ray/autoscaler/azure/azure-vm-template.json
@@ -0,0 +1,243 @@
+{
+ "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
+ "contentVersion": "1.0.0.0",
+ "parameters": {
+ "vmName": {
+ "type": "string",
+ "metadata": {
+ "description": "The name of your Virtual Machine."
+ }
+ },
+ "adminUsername": {
+ "type": "string",
+ "metadata": {
+ "description": "Username for the Virtual Machine."
+ }
+ },
+ "publicKey": {
+ "type": "securestring",
+ "metadata": {
+ "description": "SSH public key for the Virtual Machine"
+ }
+ },
+ "imagePublisher": {
+ "type": "string",
+ "metadata": {
+ "description": "The publisher of the VM image"
+ }
+ },
+ "imageOffer": {
+ "type": "string",
+ "metadata": {
+ "description": "The offer of the VM image"
+ }
+ },
+ "imageSku": {
+ "type": "string",
+ "metadata": {
+ "description": "The sku of the VM image"
+ }
+ },
+ "imageVersion": {
+ "type": "string",
+ "metadata": {
+ "description": "The version of the VM image"
+ }
+ },
+ "vmSize": {
+ "type": "string",
+ "metadata": {
+ "description": "The size of the VM"
+ }
+ },
+ "vmTags": {
+ "type": "object",
+ "metadata": {
+ "description": "Tags for the VM"
+ }
+ },
+ "vmCount": {
+ "type": "int",
+ "metadata": {
+ "description": "Number of VMs to deploy"
+ }
+ },
+ "provisionPublicIp": {
+ "type": "bool",
+ "defaultValue": true,
+ "metadata": {
+ "description": "If true, creates a public IP address"
+ }
+ }
+ },
+ "variables": {
+ "publicIpAddressName": "[concat(parameters('vmName'), '-ip' )]",
+ "networkInterfaceNamePrivate": "[concat(parameters('vmName'),'-nic')]",
+ "networkInterfaceNamePublic": "[concat(parameters('vmName'),'-nic-public')]",
+ "networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]",
+ "networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]",
+ "subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]",
+ "osDiskType": "Standard_LRS"
+ },
+ "resources": [
+ {
+ "type": "Microsoft.Network/networkInterfaces",
+ "apiVersion": "2018-10-01",
+ "name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]",
+ "location": "[resourceGroup().location]",
+ "dependsOn": [
+ "[resourceId('Microsoft.Network/publicIpAddresses/', concat(variables('publicIpAddressName'), copyIndex()))]"
+ ],
+ "copy": {
+ "name": "NICPublicCopy",
+ "count": "[parameters('vmCount')]"
+ },
+ "properties": {
+ "ipConfigurations": [
+ {
+ "name": "[variables('networkIpConfig')]",
+ "properties": {
+ "subnet": {
+ "id": "[variables('subnetRef')]"
+ },
+ "privateIPAllocationMethod": "Dynamic",
+ "publicIpAddress": {
+ "id": "[resourceId('Microsoft.Network/publicIPAddresses', concat(variables('publicIPAddressName'), copyIndex()))]"
+ }
+ }
+ }
+ ],
+ "networkSecurityGroup": {
+ "id": "[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]"
+ }
+ },
+ "condition": "[parameters('provisionPublicIp')]"
+ },
+ {
+ "type": "Microsoft.Network/networkInterfaces",
+ "apiVersion": "2018-10-01",
+ "name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]",
+ "location": "[resourceGroup().location]",
+ "copy": {
+ "name": "NICPrivateCopy",
+ "count": "[parameters('vmCount')]"
+ },
+ "properties": {
+ "ipConfigurations": [
+ {
+ "name": "[variables('networkIpConfig')]",
+ "properties": {
+ "subnet": {
+ "id": "[variables('subnetRef')]"
+ },
+ "privateIPAllocationMethod": "Dynamic"
+ }
+ }
+ ],
+ "networkSecurityGroup": {
+ "id": "[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]"
+ }
+ },
+ "condition": "[not(parameters('provisionPublicIp'))]"
+ },
+ {
+ "type": "Microsoft.Network/publicIpAddresses",
+ "apiVersion": "2019-02-01",
+ "name": "[concat(variables('publicIpAddressName'), copyIndex())]",
+ "location": "[resourceGroup().location]",
+ "properties": {
+ "publicIpAllocationMethod": "Static",
+ "publicIPAddressVersion": "IPv4"
+ },
+ "copy": {
+ "name": "PublicIpCopy",
+ "count": "[parameters('vmCount')]"
+ },
+ "sku": {
+ "name": "Basic",
+ "tier": "Regional"
+ },
+ "condition": "[parameters('provisionPublicIp')]"
+ },
+ {
+ "type": "Microsoft.Compute/virtualMachines",
+ "apiVersion": "2019-03-01",
+ "name": "[concat(parameters('vmName'), copyIndex())]",
+ "location": "[resourceGroup().location]",
+ "dependsOn": [
+ "[resourceId('Microsoft.Network/networkInterfaces/', concat(variables('networkInterfaceName'), copyIndex()))]"
+ ],
+ "copy": {
+ "name": "VmCopy",
+ "count": "[parameters('vmCount')]"
+ },
+ "tags": "[parameters('vmTags')]",
+ "properties": {
+ "hardwareProfile": {
+ "vmSize": "[parameters('vmSize')]"
+ },
+ "storageProfile": {
+ "osDisk": {
+ "createOption": "fromImage",
+ "managedDisk": {
+ "storageAccountType": "[variables('osDiskType')]"
+ }
+ },
+ "imageReference": {
+ "publisher": "[parameters('imagePublisher')]",
+ "offer": "[parameters('imageOffer')]",
+ "sku": "[parameters('imageSku')]",
+ "version": "[parameters('imageVersion')]"
+ }
+ },
+ "networkProfile": {
+ "networkInterfaces": [
+ {
+ "id": "[resourceId('Microsoft.Network/networkInterfaces', concat(variables('networkInterfaceName'), copyIndex()))]"
+ }
+ ]
+ },
+ "osProfile": {
+ "computerName": "[concat(parameters('vmName'), copyIndex())]",
+ "adminUsername": "[parameters('adminUsername')]",
+ "adminPassword": "[parameters('publicKey')]",
+ "linuxConfiguration": {
+ "disablePasswordAuthentication": true,
+ "ssh": {
+ "publicKeys": [
+ {
+ "path": "[concat('/home/', parameters('adminUsername'), '/.ssh/authorized_keys')]",
+ "keyData": "[parameters('publicKey')]"
+ }
+ ]
+ }
+ }
+ }
+ },
+ "identity": {
+ "type": "UserAssigned",
+ "userAssignedIdentities": {
+ "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', 'ray-msi-user-identity')]": {
+ }
+ }
+ }
+ }
+ ],
+ "outputs": {
+ "publicIp": {
+ "type": "array",
+ "copy": {
+ "count": "[parameters('vmCount')]",
+ "input": "[reference(concat(variables('publicIpAddressName'), copyIndex())).ipAddress]"
+ },
+ "condition": "[parameters('provisionPublicIp')]"
+ },
+ "privateIp": {
+ "type": "array",
+ "copy": {
+ "count": "[parameters('vmCount')]",
+ "input": "[reference(concat(variables('networkInterfaceName'), copyIndex())).ipConfigurations[0].properties.privateIPAddress]"
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/python/ray/autoscaler/azure/config.py b/python/ray/autoscaler/azure/config.py
index 844471af8..0be524a2c 100644
--- a/python/ray/autoscaler/azure/config.py
+++ b/python/ray/autoscaler/azure/config.py
@@ -1,14 +1,11 @@
+import json
import logging
+import random
import os
-import time
-import uuid
-from azure.common.exceptions import CloudError, AuthenticationError
from azure.common.client_factory import get_client_from_cli_profile
-from azure.mgmt.authorization import AuthorizationManagementClient
-from azure.mgmt.network import NetworkManagementClient
from azure.mgmt.resource import ResourceManagementClient
-from azure.mgmt.msi import ManagedServiceIdentityClient
+from azure.mgmt.resource.resources.models import DeploymentMode
RETRIES = 30
MSI_NAME = "ray-msi-user-identity"
@@ -20,10 +17,8 @@ logger = logging.getLogger(__name__)
def bootstrap_azure(config):
- config = _configure_resource_group(config)
- config = _configure_msi_user(config)
config = _configure_key_pair(config)
- config = _configure_network(config)
+ config = _configure_resource_group(config)
return config
@@ -59,67 +54,36 @@ def _configure_resource_group(config):
resource_client.resource_groups.create_or_update(
resource_group_name=resource_group, parameters=params)
- return config
+ # load the template
+ template_path = os.path.join(
+ os.path.dirname(__file__), "azure-config-template.json")
+ with open(template_path, "r") as template_file_fd:
+ template = json.load(template_file_fd)
+ # choose a random subnet
+ random.seed(resource_group)
+ # start at 1 to avoid most likely collision at 0
+ parameters = {"subnet": "10.{}.0.0/16".format(random.randint(1, 254))}
-def _configure_msi_user(config):
- msi_client = _get_client(ManagedServiceIdentityClient, config)
- resource_client = _get_client(ResourceManagementClient, config)
- auth_client = _get_client(AuthorizationManagementClient, config)
+ deployment_properties = {
+ "mode": DeploymentMode.incremental,
+ "template": template,
+ "parameters": {k: {
+ "value": v
+ }
+ for k, v in parameters.items()}
+ }
- resource_group = config["provider"]["resource_group"]
- location = config["provider"]["location"]
-
- resource_group_id = resource_client.resource_groups.get(resource_group).id
- try:
- identity = msi_client.user_assigned_identities.list_by_resource_group(
- resource_group_name=resource_group,
- filter="name eq '{}'".format(MSI_NAME)).next()
- logger.info("Found MSI User Assigned Identity: %s", MSI_NAME)
- except StopIteration:
- logger.info("Creating MSI User Assigned Identity: %s", MSI_NAME)
- identity = msi_client.user_assigned_identities.create_or_update(
- resource_group_name=resource_group,
- resource_name=MSI_NAME,
- location=location)
-
- identity_id = identity.id
- principal_id = identity.principal_id
- config["provider"]["msi_identity_id"] = identity_id
- config["provider"]["msi_identity_principal_id"] = principal_id
-
- # assign Contributor role for MSI User Identity to resource group
- role_id = auth_client.role_definitions.list(
- scope=resource_group_id, filter="roleName eq 'Contributor'").next().id
- role_params = {"role_definition_id": role_id, "principal_id": principal_id}
-
- for _ in range(RETRIES):
- try:
- filter_expr = "principalId eq '{}'".format(principal_id)
- assignments = auth_client.role_assignments.list_for_scope(
- scope=resource_group_id, filter=filter_expr)
-
- if any(a.role_definition_id == role_id for a in assignments):
- break
-
- auth_client.role_assignments.create(
- scope=resource_group_id,
- role_assignment_name=uuid.uuid4(),
- parameters=role_params)
- logger.info("Assigning Contributor Role to MSI User")
- except CloudError as ce:
- if ce.inner_exception.error == "PrincipalNotFound":
- time.sleep(5)
- else:
- raise Exception(
- "Failed to create contributor role assignment (timeout)")
+ deployment_async_operation = resource_client.deployments.create_or_update(
+ resource_group, "ray-config", deployment_properties)
+ deployment_async_operation.wait()
return config
def _configure_key_pair(config):
ssh_user = config["auth"]["ssh_user"]
-
+ # search if the keys exist
for key_type in ["ssh_private_key", "ssh_public_key"]:
try:
key_path = os.path.expanduser(config["auth"][key_type])
@@ -135,93 +99,8 @@ def _configure_key_pair(config):
with open(key_path, "r") as f:
public_key = f.read()
- os_profile = {
- "admin_username": ssh_user,
- "computer_name": None,
- "linux_configuration": {
- "disable_password_authentication": True,
- "ssh": {
- "public_keys": [{
- "key_data": public_key,
- "path": "/home/{}/.ssh/authorized_keys".format(ssh_user)
- }]
- }
- }
- }
for node_type in ["head_node", "worker_nodes"]:
- config[node_type]["os_profile"] = os_profile
-
- return config
-
-
-def _configure_network(config):
- # skip this if subnet is manually set in configuration yaml
- if "subnet_id" in config["provider"]:
- return config
-
- location = config["provider"]["location"]
- resource_group = config["provider"]["resource_group"]
- network_client = _get_client(NetworkManagementClient, config)
-
- vnets = []
- for _ in range(RETRIES):
- try:
- vnets = list(
- network_client.virtual_networks.list(
- resource_group_name=resource_group,
- filter="name eq '{}'".format(VNET_NAME)))
- break
- except CloudError:
- time.sleep(1)
- except AuthenticationError:
- # wait for service principal authorization to populate
- time.sleep(1)
-
- # can't update vnet if subnet already exists
- if not vnets:
- # create vnet
- logger.info("Creating/Updating VNet: %s", VNET_NAME)
- vnet_params = {
- "location": location,
- "address_space": {
- "address_prefixes": ["10.0.0.0/16"]
- }
- }
- network_client.virtual_networks.create_or_update(
- resource_group_name=resource_group,
- virtual_network_name=VNET_NAME,
- parameters=vnet_params).wait()
-
- # create subnet
- logger.info("Creating/Updating Subnet: %s", SUBNET_NAME)
- subnet_params = {"address_prefix": "10.0.0.0/24"}
- subnet = network_client.subnets.create_or_update(
- resource_group_name=resource_group,
- virtual_network_name=VNET_NAME,
- subnet_name=SUBNET_NAME,
- subnet_parameters=subnet_params).result()
-
- config["provider"]["subnet_id"] = subnet.id
-
- # create network security group
- logger.info("Creating/Updating Network Security Group: %s", NSG_NAME)
- nsg_params = {
- "location": location,
- "security_rules": [{
- "protocol": "Tcp",
- "source_port_range": "*",
- "source_address_prefix": "*",
- "destination_port_range": "22",
- "destination_address_prefix": "*",
- "access": "Allow",
- "priority": 300,
- "direction": "Inbound",
- "name": "ssh_rule"
- }]
- }
- network_client.network_security_groups.create_or_update(
- resource_group_name=resource_group,
- network_security_group_name=NSG_NAME,
- parameters=nsg_params).wait()
+ config[node_type]["azure_arm_parameters"]["adminUsername"] = ssh_user
+ config[node_type]["azure_arm_parameters"]["publicKey"] = public_key
return config
diff --git a/python/ray/autoscaler/azure/example-full.yaml b/python/ray/autoscaler/azure/example-full.yaml
index 3149ee40f..bd93481a8 100644
--- a/python/ray/autoscaler/azure/example-full.yaml
+++ b/python/ray/autoscaler/azure/example-full.yaml
@@ -65,54 +65,25 @@ auth:
ssh_private_key: ~/.ssh/id_rsa
ssh_public_key: ~/.ssh/id_rsa.pub
-# Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields
-# The Azure Python SDK client expects slug_style property names
-# For more documentation on available fields, see:
-# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
-# Note: the Azure Python SDK expects all parameter keys to be in slug_style
-# the styles of parameter values are not changed
+# Provider-specific config for the head node, e.g. instance type.
head_node:
- hardware_profile:
- vm_size: Standard_D2s_v3
- storage_profile:
- os_disk:
- create_option: FromImage
- caching: ReadWrite
- image_reference:
- # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
- publisher: microsoft-dsvm
- offer: ubuntu-1804
- sku: 1804-gen2
- version: 20.02.01
+ azure_arm_parameters:
+ vmSize: Standard_D2s_v3
+ # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
+ imagePublisher: microsoft-dsvm
+ imageOffer: ubuntu-1804
+ imageSku: 1804-gen2
+ imageVersion: 20.02.01
-# Provider-specific config for worker nodes, e.g. instance type. By default
-# Ray will auto-configure unspecified fields
-# Documentation on fields used can be found here:
-# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
-# Note: the Azure Python SDK expects all parameter keys to be in slug_style
-# the styles of parameter values are not changed
+# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
- hardware_profile:
- vm_size: Standard_F2s_v2
- storage_profile:
- os_disk:
- create_option: FromImage
- caching: ReadWrite
- image_reference:
- # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
- publisher: microsoft-dsvm
- offer: ubuntu-1804
- sku: 1804-gen
- version: 20.02.01
- # You can provision additional disk space as follows
- # data_disks:
- # - disk_size_gb: 1024
- # run workers on spot instances by default
- priority: Spot
- eviction_policy: Deallocate
- billing_profile:
- max_price: -1
+ azure_arm_parameters:
+ vmSize: Standard_D2s_v3
+ # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
+ imagePublisher: microsoft-dsvm
+ imageOffer: ubuntu-1804
+ imageSku: 1804-gen2
+ imageVersion: 20.02.01
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -133,10 +104,9 @@ setup_commands:
# Note: if you're developing Ray, you probably want to create an AMI that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout (and possibly a recompile).
- # change to use environment desired
- #- echo "conda activate py37_pytorch" >> ~/.bashrc
- #- echo "conda activate py37_tensorflow" >> ~/.bashrc
- - pip install -U https://sdgraystorage.blob.core.windows.net/ray-wheels/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
+ # - echo 'conda activate py37_pytorch' >> ~/.bashrc
+ - echo 'conda activate py37_tensorflow' >> ~/.bashrc
+ - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
@@ -144,7 +114,7 @@ setup_commands:
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- - pip install azure-cli-core azure-core azure-mgmt-authorization azure-mgmt-compute azure-mgmt-msi azure-mgmt-network
+ - pip install azure-cli-core azure-core azure-mgmt-authorization azure-mgmt-network azure-mgmt-compute azure-mgmt-msi
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
diff --git a/python/ray/autoscaler/azure/example-gpu-docker.yaml b/python/ray/autoscaler/azure/example-gpu-docker.yaml
index 6240d50ac..52ef692eb 100644
--- a/python/ray/autoscaler/azure/example-gpu-docker.yaml
+++ b/python/ray/autoscaler/azure/example-gpu-docker.yaml
@@ -60,26 +60,16 @@ auth:
ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields
-# The Azure Python SDK client expects slug_style property names
-# For more documentation on available fields, see:
-# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
-# Note: the Azure Python SDK expects all parameter keys to be in slug_style
-# the styles of parameter values are not changed
+# Ray will auto-configure unspecified fields using example-full.yaml
head_node:
- hardware_profile:
- vm_size: Standard_NC6s_v3
+ azure_arm_parameters:
+ vmSize: Standard_NC6s_v3
# Provider-specific config for worker nodes, e.g. instance type. By default
-# Ray will auto-configure unspecified fields
-# The Azure Python SDK client expects slug_style property names
-# For more documentation on available fields, see:
-# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
-# Note: the Azure Python SDK expects all parameter keys to be in slug_style
-# the styles of parameter values are not changed
+# Ray will auto-configure unspecified fields using example-full.yaml
worker_nodes:
- hardware_profile:
- vm_size: Standard_NC6s_v3
+ azure_arm_parameters:
+ vmSize: Standard_NC6s_v3
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -90,7 +80,7 @@ file_mounts: {
# List of shell commands to run to set up nodes.
setup_commands:
- - pip install -U https://sdgraystorage.blob.core.windows.net/ray-wheels/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
+ - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
diff --git a/python/ray/autoscaler/azure/example-gpu.yaml b/python/ray/autoscaler/azure/example-gpu.yaml
index ef87d1d67..cfe415bd8 100644
--- a/python/ray/autoscaler/azure/example-gpu.yaml
+++ b/python/ray/autoscaler/azure/example-gpu.yaml
@@ -66,53 +66,26 @@ auth:
ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default
-# Ray will auto-configure unspecified fields
-# The Azure Python SDK client expects slug_style property names
-# For more documentation on available fields, see:
-# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
-# Note: the Azure Python SDK expects all parameter keys to be in slug_style
-# the styles of parameter values are not changed
+# Ray will auto-configure unspecified fields using example-full.yaml
head_node:
- hardware_profile:
- vm_size: Standard_NC6
- storage_profile:
- os_disk:
- create_option: FromImage
- caching: ReadWrite
- image_reference:
- # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
- publisher: microsoft-dsvm
- offer: ubuntu-1804
- sku: 1804
- version: 20.02.01
+ azure_arm_parameters:
+ vmSize: Standard_NC6
+ # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
+ imagePublisher: microsoft-dsvm
+ imageOffer: ubuntu-1804
+ imageSku: 1804
+ imageVersion: 20.02.01
# Provider-specific config for worker nodes, e.g. instance type. By default
-# Ray will auto-configure unspecified fields
-# Documentation on fields used can be found here:
-# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
-# Note: the Azure Python SDK expects all parameter keys to be in slug_style
-# the styles of parameter values are not changed
+# Ray will auto-configure unspecified fields using example-full.yaml
worker_nodes:
- hardware_profile:
- vm_size: Standard_NC6
- storage_profile:
- os_disk:
- create_option: FromImage
- caching: ReadWrite
- image_reference:
- # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
- publisher: microsoft-dsvm
- offer: ubuntu-1804
- sku: 1804
- version: 20.02.01
- # You can provision additional disk space as follows
- # data_disks:
- # - disk_size_gb: 1024
- # run workers on spot instances by default
- priority: Spot
- eviction_policy: Deallocate
- billing_profile:
- max_price: -1
+ azure_arm_parameters:
+ vmSize: Standard_NC6
+ # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
+ imagePublisher: microsoft-dsvm
+ imageOffer: ubuntu-1804
+ imageSku: 1804
+ imageVersion: 20.02.01
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -135,7 +108,7 @@ setup_commands:
# below with a git checkout (and possibly a recompile).
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- - pip install -U https://sdgraystorage.blob.core.windows.net/ray-wheels/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
+ - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
# Consider uncommenting these if you also want to run apt-get commands during setup
# - sudo pkill -9 apt-get || true
# - sudo pkill -9 dpkg || true
diff --git a/python/ray/autoscaler/azure/node_provider.py b/python/ray/autoscaler/azure/node_provider.py
index 68cdae055..3d24bdb33 100644
--- a/python/ray/autoscaler/azure/node_provider.py
+++ b/python/ray/autoscaler/azure/node_provider.py
@@ -1,4 +1,6 @@
+import json
import logging
+import os
from threading import RLock
from uuid import uuid4
@@ -6,7 +8,9 @@ from azure.common.client_factory import get_client_from_cli_profile
from msrestazure.azure_active_directory import MSIAuthentication
from azure.mgmt.compute import ComputeManagementClient
from azure.mgmt.network import NetworkManagementClient
-from azure.mgmt.compute.models import ResourceIdentityType
+from azure.mgmt.resource import ResourceManagementClient
+from azure.mgmt.resource.resources.models import DeploymentMode
+from knack.util import CLIError
from ray.autoscaler.node_provider import NodeProvider
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME, TAG_RAY_NODE_NAME
@@ -50,15 +54,21 @@ class AzureNodeProvider(NodeProvider):
client_class=ComputeManagementClient, **kwargs)
self.network_client = get_client_from_cli_profile(
client_class=NetworkManagementClient, **kwargs)
- except Exception:
- logger.info(
- "CLI profile authentication failed. Trying MSI", exc_info=True)
+ self.resource_client = get_client_from_cli_profile(
+ client_class=ResourceManagementClient, **kwargs)
+ except CLIError as e:
+ if str(e) != "Please run 'az login' to setup account.":
+ raise
+ else:
+ logger.info("CLI profile authentication failed. Trying MSI")
- credentials = MSIAuthentication()
- self.compute_client = ComputeManagementClient(
- credentials=credentials, **kwargs)
- self.network_client = NetworkManagementClient(
- credentials=credentials, **kwargs)
+ credentials = MSIAuthentication()
+ self.compute_client = ComputeManagementClient(
+ credentials=credentials, **kwargs)
+ self.network_client = NetworkManagementClient(
+ credentials=credentials, **kwargs)
+ self.resource_client = ResourceManagementClient(
+ credentials=credentials, **kwargs)
self.lock = RLock()
@@ -164,79 +174,43 @@ class AzureNodeProvider(NodeProvider):
def create_node(self, node_config, tags, count):
"""Creates a number of nodes within the namespace."""
# TODO: restart deallocated nodes if possible
- location = self.provider_config["location"]
resource_group = self.provider_config["resource_group"]
- subnet_id = self.provider_config["subnet_id"]
- config = node_config.copy()
- config_tags = config.get("tags", {})
+ # load the template
+ template_path = os.path.join(
+ os.path.dirname(__file__), "azure-vm-template.json")
+ with open(template_path, "r") as template_file_fd:
+ template = json.load(template_file_fd)
+
+ # get the tags
+ config_tags = node_config.get("tags", {}).copy()
config_tags.update(tags)
config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
- config["tags"] = config_tags
- config["location"] = location
name_tag = config_tags.get(TAG_RAY_NODE_NAME, "node")
+ unique_id = uuid4().hex[:VM_NAME_UUID_LEN]
- for _ in range(count):
- unique_id = uuid4().hex[:VM_NAME_UUID_LEN]
- vm_name = "{name}-{id}".format(name=name_tag, id=unique_id)
- config["os_profile"]["computer_name"] = vm_name
+ parameters = node_config["azure_arm_parameters"].copy()
+ parameters["vmName"] = "{name}-{id}".format(
+ name=name_tag, id=unique_id)
+ parameters["provisionPublicIp"] = not self.provider_config.get(
+ "use_internal_ips", False)
+ parameters["vmTags"] = config_tags
+ parameters["vmCount"] = count
- try:
- assert len(vm_name) <= VM_NAME_MAX_LEN
- except AssertionError as e:
- e.args += ("name", vm_name)
- raise
-
- ip_configuration = {"name": uuid4(), "subnet": {"id": subnet_id}}
-
- if not self.provider_config.get("use_internal_ips", False):
- # create public ip address
- public_ip_addess_params = {
- "location": location,
- "public_ip_allocation_method": "Dynamic"
- }
- public_ip_address = (
- self.network_client.public_ip_addresses.create_or_update(
- resource_group_name=resource_group,
- public_ip_address_name="{}-ip".format(vm_name),
- parameters=public_ip_addess_params).result())
- ip_configuration["public_ip_address"] = public_ip_address
-
- nic_params = {
- "location": location,
- "ip_configurations": [ip_configuration]
+ deployment_properties = {
+ "mode": DeploymentMode.incremental,
+ "template": template,
+ "parameters": {k: {
+ "value": v
}
- nic = self.network_client.network_interfaces.create_or_update(
- resource_group_name=resource_group,
- network_interface_name="{}-nic".format(vm_name),
- parameters=nic_params).result()
+ for k, v in parameters.items()}
+ }
- # update vm config with network parameters
- config["network_profile"] = {
- "network_interfaces": [{
- "id": nic.id
- }]
- }
-
- config["identity"] = {
- "type": ResourceIdentityType.user_assigned,
- "user_assigned_identities": [{
- # zero-documentation.. *sigh*
- "key": self.provider_config["msi_identity_id"],
- "value": {
- "principal_id": self.provider_config[
- "msi_identity_principal_id"],
- "client_id": self.provider_config["msi_identity_id"]
- }
- }]
- }
-
- # TODO: do we need to wait or fire and forget is fine?
- self.compute_client.virtual_machines.create_or_update(
- resource_group_name=self.provider_config["resource_group"],
- vm_name=vm_name,
- parameters=config)
+ # TODO: we could get the private/public ips back directly
+ self.resource_client.deployments.create_or_update(
+ resource_group, "ray-vm-{}".format(name_tag),
+ deployment_properties).wait()
@synchronized
def set_node_tags(self, node_id, tags):
@@ -252,34 +226,57 @@ class AzureNodeProvider(NodeProvider):
def terminate_node(self, node_id):
"""Terminates the specified node. This will delete the VM and
associated resources (NIC, IP, Storage) for the specified node."""
- # self.compute_client.virtual_machines.deallocate(
- # resource_group_name=self.provider_config["resource_group"],
- # vm_name=node_id)
+
resource_group = self.provider_config["resource_group"]
- nodes = self._get_filtered_nodes(
- tag_filters={TAG_RAY_CLUSTER_NAME: self.cluster_name})
- for node, metadata in nodes.items():
- # gather disks to delete later
- vm = self.compute_client.virtual_machines.get(
- resource_group_name=resource_group, vm_name=node)
- disks = {d.name for d in vm.storage_profile.data_disks}
- disks.add(vm.storage_profile.os_disk.name)
+ try:
+ # get metadata for node
+ metadata = self._get_node(node_id)
+ except KeyError:
+ # node no longer exists
+ return
+
+ # TODO: deallocate instead of delete to allow possible reuse
+ # self.compute_client.virtual_machines.deallocate(
+ # resource_group_name=resource_group,
+ # vm_name=node_id)
+
+ # gather disks to delete later
+ vm = self.compute_client.virtual_machines.get(
+ resource_group_name=resource_group, vm_name=node_id)
+ disks = {d.name for d in vm.storage_profile.data_disks}
+ disks.add(vm.storage_profile.os_disk.name)
+
+ try:
# delete machine, must wait for this to complete
self.compute_client.virtual_machines.delete(
- resource_group_name=resource_group, vm_name=node).wait()
+ resource_group_name=resource_group, vm_name=node_id).wait()
+ except Exception as e:
+ logger.warning("Failed to delete VM: {}".format(e))
+
+ try:
# delete nic
self.network_client.network_interfaces.delete(
resource_group_name=resource_group,
network_interface_name=metadata["nic_name"])
- # delete ip address
- if "public_ip_name" in metadata:
+ except Exception as e:
+ logger.warning("Failed to delete nic: {}".format(e))
+
+ # delete ip address
+ if "public_ip_name" in metadata:
+ try:
self.network_client.public_ip_addresses.delete(
resource_group_name=resource_group,
public_ip_address_name=metadata["public_ip_name"])
- # delete disks
- for disk in disks:
+ except Exception as e:
+ logger.warning("Failed to delete public ip: {}".format(e))
+
+ # delete disks
+ for disk in disks:
+ try:
self.compute_client.disks.delete(
resource_group_name=resource_group, disk_name=disk)
+ except Exception as e:
+ logger.warning("Failed to delete disk: {}".format(e))
def _get_node(self, node_id):
self._get_filtered_nodes({}) # Side effect: updates cache
diff --git a/python/setup.py b/python/setup.py
index 1013832e0..9a95a4d74 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -42,6 +42,8 @@ optional_ray_files = []
ray_autoscaler_files = [
"ray/autoscaler/aws/example-full.yaml",
"ray/autoscaler/azure/example-full.yaml",
+ "ray/autoscaler/azure/azure-vm-template.json",
+ "ray/autoscaler/azure/azure-config-template.json",
"ray/autoscaler/gcp/example-full.yaml",
"ray/autoscaler/local/example-full.yaml",
"ray/autoscaler/kubernetes/example-full.yaml",