diff --git a/doc/source/autoscaling.rst b/doc/source/autoscaling.rst index 5043609dc..96b3b60b6 100644 --- a/doc/source/autoscaling.rst +++ b/doc/source/autoscaling.rst @@ -59,8 +59,12 @@ Test that it works by running the following commands from your local machine: # Get a remote screen on the head node. $ ray attach ray/python/ray/autoscaler/azure/example-full.yaml - $ source activate tensorflow_p36 - $ # Try running a Ray program with 'ray.init(address="auto")'. + # Test the Ray setup. + # enable conda environment + $ exec bash -l + $ conda activate py37_tensorflow + $ python -c 'import ray; ray.init(address="auto")' + $ exit # Tear down the cluster. $ ray down ray/python/ray/autoscaler/azure/example-full.yaml @@ -69,26 +73,26 @@ Azure Portal Alternatively, you can deploy a cluster using Azure portal directly. Please note that auto scaling is done using Azure VM Scale Sets and not through the Ray autoscaler. This will deploy `Azure Data Science VMs (DSVM) `_ -for both the head node and an auto-scale cluster managed by `Azure Virtual Machine Scale Sets `_. -The head node conviently exposes both SSH as well as JupyterLab. +for both the head node and the auto-scalable cluster managed by `Azure Virtual Machine Scale Sets `_. +The head node conveniently exposes both SSH and JupyterLab. .. image:: https://aka.ms/deploytoazurebutton :target: https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fray-project%2Fray%2Fmaster%2Fdoc%2Fazure%2Fazure-ray-template.json :alt: Deploy to Azure -Once the template is successfully deploy the deployment output page provides the ssh command to connect and the link to the JupyterHub on the head node (username/password as specified on the template input). -Use the following code connect to the Ray cluster. 
+Once the template is successfully deployed, the deployment output page provides the SSH command to connect and the link to the JupyterHub on the head node (username/password as specified on the template input). +Use the following code in a Jupyter notebook to connect to the Ray cluster. .. code-block:: python import ray ray.init(address='auto') -Note that on each node the `azure-init.sh `_ script is executed and performs +Note that on each node the `azure-init.sh `_ script is executed and performs the following actions: -1. activate one of the conda environments available on DSVM -2. install Ray and any other user-specified dependencies -3. setup of a systemd task (``/lib/systemd/system/ray.service``) which starting ray in head or worker mode +1. Activates one of the conda environments available on DSVM +2. Installs Ray and any other user-specified dependencies +3. Sets up a systemd task (``/lib/systemd/system/ray.service``) to start Ray in head or worker mode GCP ~~~ diff --git a/python/ray/autoscaler/azure/azure-config-template.json b/python/ray/autoscaler/azure/azure-config-template.json new file mode 100644 index 000000000..65c671c82 --- /dev/null +++ b/python/ray/autoscaler/azure/azure-config-template.json @@ -0,0 +1,81 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "subnet": { + "type": "string", + "metadata": { + "description": "The subnet to be used" + } + } + }, + "variables": { + "Contributor": "[concat('/subscriptions/', subscription().subscriptionId, '/providers/Microsoft.Authorization/roleDefinitions/', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]" + }, + "resources": [ + { + "type": "Microsoft.ManagedIdentity/userAssignedIdentities", + "apiVersion": "2018-11-30", + "location": "[resourceGroup().location]", + "name": "ray-msi-user-identity" + }, + { + "type": "Microsoft.Authorization/roleAssignments", + "apiVersion": "2018-09-01-preview", + "name": 
"[guid(resourceGroup().id)]", + "properties": { + "principalId": "[reference('ray-msi-user-identity').principalId]", + "roleDefinitionId": "[variables('Contributor')]", + "scope": "[resourceGroup().id]", + "principalType": "ServicePrincipal" + }, + "dependsOn": [ + "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', 'ray-msi-user-identity')]" + ] + }, + { + "type": "Microsoft.Network/networkSecurityGroups", + "apiVersion": "2019-02-01", + "name": "ray-nsg", + "location": "[resourceGroup().location]", + "properties": { + "securityRules": [ + { + "name": "SSH", + "properties": { + "priority": 1000, + "protocol": "TCP", + "access": "Allow", + "direction": "Inbound", + "sourceAddressPrefix": "*", + "sourcePortRange": "*", + "destinationAddressPrefix": "*", + "destinationPortRange": "22" + } + } + ] + } + }, + { + "type": "Microsoft.Network/virtualNetworks", + "apiVersion": "2019-11-01", + "name": "ray-vnet", + "location": "[resourceGroup().location]", + "properties": { + "addressSpace": { + "addressPrefixes": [ + "[parameters('subnet')]" + ] + }, + "subnets": [ + { + "name": "ray-subnet", + "properties": { + "addressPrefix": "[parameters('subnet')]" + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/python/ray/autoscaler/azure/azure-vm-template.json b/python/ray/autoscaler/azure/azure-vm-template.json new file mode 100644 index 000000000..9410d87db --- /dev/null +++ b/python/ray/autoscaler/azure/azure-vm-template.json @@ -0,0 +1,243 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "vmName": { + "type": "string", + "metadata": { + "description": "The name of you Virtual Machine." + } + }, + "adminUsername": { + "type": "string", + "metadata": { + "description": "Username for the Virtual Machine." 
+ } + }, + "publicKey": { + "type": "securestring", + "metadata": { + "description": "SSH Key for the Virtual Machine" + } + }, + "imagePublisher": { + "type": "string", + "metadata": { + "description": "The publisher of the VM image" + } + }, + "imageOffer": { + "type": "string", + "metadata": { + "description": "The offer of the VM image" + } + }, + "imageSku": { + "type": "string", + "metadata": { + "description": "The sku of the VM image" + } + }, + "imageVersion": { + "type": "string", + "metadata": { + "description": "The version of the VM image" + } + }, + "vmSize": { + "type": "string", + "metadata": { + "description": "The size of the VM" + } + }, + "vmTags": { + "type": "object", + "metadata": { + "description": "Tags for the VM" + } + }, + "vmCount": { + "type": "int", + "metadata": { + "description": "Number of VMs to deploy" + } + }, + "provisionPublicIp": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "If true creates a public ip" + } + } + }, + "variables": { + "publicIpAddressName": "[concat(parameters('vmName'), '-ip' )]", + "networkInterfaceNamePrivate": "[concat(parameters('vmName'),'-nic')]", + "networkInterfaceNamePublic": "[concat(parameters('vmName'),'-nic-public')]", + "networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]", + "networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]", + "subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]", + "osDiskType": "Standard_LRS" + }, + "resources": [ + { + "type": "Microsoft.Network/networkInterfaces", + "apiVersion": "2018-10-01", + "name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]", + "location": "[resourceGroup().location]", + "dependsOn": [ + "[resourceId('Microsoft.Network/publicIpAddresses/', concat(variables('publicIpAddressName'), copyIndex()))]" + ], + "copy": { + "name": "NICPublicCopy", + 
"count": "[parameters('vmCount')]" + }, + "properties": { + "ipConfigurations": [ + { + "name": "[variables('networkIpConfig')]", + "properties": { + "subnet": { + "id": "[variables('subnetRef')]" + }, + "privateIPAllocationMethod": "Dynamic", + "publicIpAddress": { + "id": "[resourceId('Microsoft.Network/publicIPAddresses', concat(variables('publicIPAddressName'), copyIndex()))]" + } + } + } + ], + "networkSecurityGroup": { + "id": "[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]" + } + }, + "condition": "[parameters('provisionPublicIp')]" + }, + { + "type": "Microsoft.Network/networkInterfaces", + "apiVersion": "2018-10-01", + "name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]", + "location": "[resourceGroup().location]", + "copy": { + "name": "NICPrivateCopy", + "count": "[parameters('vmCount')]" + }, + "properties": { + "ipConfigurations": [ + { + "name": "[variables('networkIpConfig')]", + "properties": { + "subnet": { + "id": "[variables('subnetRef')]" + }, + "privateIPAllocationMethod": "Dynamic" + } + } + ], + "networkSecurityGroup": { + "id": "[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]" + } + }, + "condition": "[not(parameters('provisionPublicIp'))]" + }, + { + "type": "Microsoft.Network/publicIpAddresses", + "apiVersion": "2019-02-01", + "name": "[concat(variables('publicIpAddressName'), copyIndex())]", + "location": "[resourceGroup().location]", + "properties": { + "publicIpAllocationMethod": "Static", + "publicIPAddressVersion": "IPv4" + }, + "copy": { + "name": "PublicIpCopy", + "count": "[parameters('vmCount')]" + }, + "sku": { + "name": "Basic", + "tier": "Regional" + }, + "condition": "[parameters('provisionPublicIp')]" + }, + { + "type": "Microsoft.Compute/virtualMachines", + "apiVersion": "2019-03-01", + "name": "[concat(parameters('vmName'), copyIndex())]", + "location": "[resourceGroup().location]", + "dependsOn": [ + "[resourceId('Microsoft.Network/networkInterfaces/', 
concat(variables('networkInterfaceName'), copyIndex()))]" + ], + "copy": { + "name": "VmCopy", + "count": "[parameters('vmCount')]" + }, + "tags": "[parameters('vmTags')]", + "properties": { + "hardwareProfile": { + "vmSize": "[parameters('vmSize')]" + }, + "storageProfile": { + "osDisk": { + "createOption": "fromImage", + "managedDisk": { + "storageAccountType": "[variables('osDiskType')]" + } + }, + "imageReference": { + "publisher": "[parameters('imagePublisher')]", + "offer": "[parameters('imageOffer')]", + "sku": "[parameters('imageSku')]", + "version": "[parameters('imageVersion')]" + } + }, + "networkProfile": { + "networkInterfaces": [ + { + "id": "[resourceId('Microsoft.Network/networkInterfaces', concat(variables('networkInterfaceName'), copyIndex()))]" + } + ] + }, + "osProfile": { + "computerName": "[concat(parameters('vmName'), copyIndex())]", + "adminUsername": "[parameters('adminUsername')]", + "adminPassword": "[parameters('publicKey')]", + "linuxConfiguration": { + "disablePasswordAuthentication": true, + "ssh": { + "publicKeys": [ + { + "path": "[concat('/home/', parameters('adminUsername'), '/.ssh/authorized_keys')]", + "keyData": "[parameters('publicKey')]" + } + ] + } + } + } + }, + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": { + "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', 'ray-msi-user-identity')]": { + } + } + } + } + ], + "outputs": { + "publicIp": { + "type": "array", + "copy": { + "count": "[parameters('vmCount')]", + "input": "[reference(concat(variables('publicIpAddressName'), copyIndex())).ipAddress]" + }, + "condition": "[parameters('provisionPublicIp')]" + }, + "privateIp": { + "type": "array", + "copy": { + "count": "[parameters('vmCount')]", + "input": "[reference(concat(variables('networkInterfaceName'), copyIndex())).ipConfigurations[0].properties.privateIPAddress]" + } + } + } +} \ No newline at end of file diff --git a/python/ray/autoscaler/azure/config.py 
b/python/ray/autoscaler/azure/config.py index 844471af8..0be524a2c 100644 --- a/python/ray/autoscaler/azure/config.py +++ b/python/ray/autoscaler/azure/config.py @@ -1,14 +1,11 @@ +import json import logging +import random import os -import time -import uuid -from azure.common.exceptions import CloudError, AuthenticationError from azure.common.client_factory import get_client_from_cli_profile -from azure.mgmt.authorization import AuthorizationManagementClient -from azure.mgmt.network import NetworkManagementClient from azure.mgmt.resource import ResourceManagementClient -from azure.mgmt.msi import ManagedServiceIdentityClient +from azure.mgmt.resource.resources.models import DeploymentMode RETRIES = 30 MSI_NAME = "ray-msi-user-identity" @@ -20,10 +17,8 @@ logger = logging.getLogger(__name__) def bootstrap_azure(config): - config = _configure_resource_group(config) - config = _configure_msi_user(config) config = _configure_key_pair(config) - config = _configure_network(config) + config = _configure_resource_group(config) return config @@ -59,67 +54,36 @@ def _configure_resource_group(config): resource_client.resource_groups.create_or_update( resource_group_name=resource_group, parameters=params) - return config + # load the template + template_path = os.path.join( + os.path.dirname(__file__), "azure-config-template.json") + with open(template_path, "r") as template_file_fd: + template = json.load(template_file_fd) + # choose a random subnet + random.seed(resource_group) + # start at 1 to avoid most likely collision at 0 + parameters = {"subnet": "10.{}.0.0/16".format(random.randint(1, 254))} -def _configure_msi_user(config): - msi_client = _get_client(ManagedServiceIdentityClient, config) - resource_client = _get_client(ResourceManagementClient, config) - auth_client = _get_client(AuthorizationManagementClient, config) + deployment_properties = { + "mode": DeploymentMode.incremental, + "template": template, + "parameters": {k: { + "value": v + } + for k, v in 
parameters.items()} + } - resource_group = config["provider"]["resource_group"] - location = config["provider"]["location"] - - resource_group_id = resource_client.resource_groups.get(resource_group).id - try: - identity = msi_client.user_assigned_identities.list_by_resource_group( - resource_group_name=resource_group, - filter="name eq '{}'".format(MSI_NAME)).next() - logger.info("Found MSI User Assigned Identity: %s", MSI_NAME) - except StopIteration: - logger.info("Creating MSI User Assigned Identity: %s", MSI_NAME) - identity = msi_client.user_assigned_identities.create_or_update( - resource_group_name=resource_group, - resource_name=MSI_NAME, - location=location) - - identity_id = identity.id - principal_id = identity.principal_id - config["provider"]["msi_identity_id"] = identity_id - config["provider"]["msi_identity_principal_id"] = principal_id - - # assign Contributor role for MSI User Identity to resource group - role_id = auth_client.role_definitions.list( - scope=resource_group_id, filter="roleName eq 'Contributor'").next().id - role_params = {"role_definition_id": role_id, "principal_id": principal_id} - - for _ in range(RETRIES): - try: - filter_expr = "principalId eq '{}'".format(principal_id) - assignments = auth_client.role_assignments.list_for_scope( - scope=resource_group_id, filter=filter_expr) - - if any(a.role_definition_id == role_id for a in assignments): - break - - auth_client.role_assignments.create( - scope=resource_group_id, - role_assignment_name=uuid.uuid4(), - parameters=role_params) - logger.info("Assigning Contributor Role to MSI User") - except CloudError as ce: - if ce.inner_exception.error == "PrincipalNotFound": - time.sleep(5) - else: - raise Exception( - "Failed to create contributor role assignment (timeout)") + deployment_async_operation = resource_client.deployments.create_or_update( + resource_group, "ray-config", deployment_properties) + deployment_async_operation.wait() return config def _configure_key_pair(config): 
ssh_user = config["auth"]["ssh_user"] - + # search if the keys exist for key_type in ["ssh_private_key", "ssh_public_key"]: try: key_path = os.path.expanduser(config["auth"][key_type]) @@ -135,93 +99,8 @@ def _configure_key_pair(config): with open(key_path, "r") as f: public_key = f.read() - os_profile = { - "admin_username": ssh_user, - "computer_name": None, - "linux_configuration": { - "disable_password_authentication": True, - "ssh": { - "public_keys": [{ - "key_data": public_key, - "path": "/home/{}/.ssh/authorized_keys".format(ssh_user) - }] - } - } - } for node_type in ["head_node", "worker_nodes"]: - config[node_type]["os_profile"] = os_profile - - return config - - -def _configure_network(config): - # skip this if subnet is manually set in configuration yaml - if "subnet_id" in config["provider"]: - return config - - location = config["provider"]["location"] - resource_group = config["provider"]["resource_group"] - network_client = _get_client(NetworkManagementClient, config) - - vnets = [] - for _ in range(RETRIES): - try: - vnets = list( - network_client.virtual_networks.list( - resource_group_name=resource_group, - filter="name eq '{}'".format(VNET_NAME))) - break - except CloudError: - time.sleep(1) - except AuthenticationError: - # wait for service principal authorization to populate - time.sleep(1) - - # can't update vnet if subnet already exists - if not vnets: - # create vnet - logger.info("Creating/Updating VNet: %s", VNET_NAME) - vnet_params = { - "location": location, - "address_space": { - "address_prefixes": ["10.0.0.0/16"] - } - } - network_client.virtual_networks.create_or_update( - resource_group_name=resource_group, - virtual_network_name=VNET_NAME, - parameters=vnet_params).wait() - - # create subnet - logger.info("Creating/Updating Subnet: %s", SUBNET_NAME) - subnet_params = {"address_prefix": "10.0.0.0/24"} - subnet = network_client.subnets.create_or_update( - resource_group_name=resource_group, - virtual_network_name=VNET_NAME, - 
subnet_name=SUBNET_NAME, - subnet_parameters=subnet_params).result() - - config["provider"]["subnet_id"] = subnet.id - - # create network security group - logger.info("Creating/Updating Network Security Group: %s", NSG_NAME) - nsg_params = { - "location": location, - "security_rules": [{ - "protocol": "Tcp", - "source_port_range": "*", - "source_address_prefix": "*", - "destination_port_range": "22", - "destination_address_prefix": "*", - "access": "Allow", - "priority": 300, - "direction": "Inbound", - "name": "ssh_rule" - }] - } - network_client.network_security_groups.create_or_update( - resource_group_name=resource_group, - network_security_group_name=NSG_NAME, - parameters=nsg_params).wait() + config[node_type]["azure_arm_parameters"]["adminUsername"] = ssh_user + config[node_type]["azure_arm_parameters"]["publicKey"] = public_key return config diff --git a/python/ray/autoscaler/azure/example-full.yaml b/python/ray/autoscaler/azure/example-full.yaml index 3149ee40f..bd93481a8 100644 --- a/python/ray/autoscaler/azure/example-full.yaml +++ b/python/ray/autoscaler/azure/example-full.yaml @@ -65,54 +65,25 @@ auth: ssh_private_key: ~/.ssh/id_rsa ssh_public_key: ~/.ssh/id_rsa.pub -# Provider-specific config for the head node, e.g. instance type. By default -# Ray will auto-configure unspecified fields -# The Azure Python SDK client expects slug_style property names -# For more documentation on available fields, see: -# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python -# Note: the Azure Python SDK expects all parameter keys to be in slug_style -# the styles of parameter values are not changed +# Provider-specific config for the head node, e.g. instance type. 
head_node: - hardware_profile: - vm_size: Standard_D2s_v3 - storage_profile: - os_disk: - create_option: FromImage - caching: ReadWrite - image_reference: - # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage - publisher: microsoft-dsvm - offer: ubuntu-1804 - sku: 1804-gen2 - version: 20.02.01 + azure_arm_parameters: + vmSize: Standard_D2s_v3 + # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage + imagePublisher: microsoft-dsvm + imageOffer: ubuntu-1804 + imageSku: 1804-gen2 + imageVersion: 20.02.01 -# Provider-specific config for worker nodes, e.g. instance type. By default -# Ray will auto-configure unspecified fields -# Documentation on fields used can be found here: -# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python -# Note: the Azure Python SDK expects all parameter keys to be in slug_style -# the styles of parameter values are not changed +# Provider-specific config for worker nodes, e.g. instance type. worker_nodes: - hardware_profile: - vm_size: Standard_F2s_v2 - storage_profile: - os_disk: - create_option: FromImage - caching: ReadWrite - image_reference: - # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage - publisher: microsoft-dsvm - offer: ubuntu-1804 - sku: 1804-gen - version: 20.02.01 - # You can provision additional disk space as follows - # data_disks: - # - disk_size_gb: 1024 - # run workers on spot instances by default - priority: Spot - eviction_policy: Deallocate - billing_profile: - max_price: -1 + azure_arm_parameters: + vmSize: Standard_D2s_v3 + # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage + imagePublisher: microsoft-dsvm + imageOffer: ubuntu-1804 + imageSku: 1804-gen2 + imageVersion: 20.02.01 # Files or directories to copy to the head and worker nodes. The format is a # dictionary from REMOTE_PATH: LOCAL_PATH, e.g. 
@@ -133,10 +104,9 @@ setup_commands: # Note: if you're developing Ray, you probably want to create an AMI that # has your Ray repo pre-cloned. Then, you can replace the pip installs # below with a git checkout (and possibly a recompile). - # change to use environment desired - #- echo "conda activate py37_pytorch" >> ~/.bashrc - #- echo "conda activate py37_tensorflow" >> ~/.bashrc - - pip install -U https://sdgraystorage.blob.core.windows.net/ray-wheels/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl + # - echo 'conda activate py37_pytorch' >> ~/.bashrc + - echo 'conda activate py37_tensorflow' >> ~/.bashrc + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl # Consider uncommenting these if you also want to run apt-get commands during setup # - sudo pkill -9 apt-get || true # - sudo pkill -9 dpkg || true @@ -144,7 +114,7 @@ setup_commands: # Custom commands that will be run on the head node after common setup. head_setup_commands: - - pip install azure-cli-core azure-core azure-mgmt-authorization azure-mgmt-compute azure-mgmt-msi azure-mgmt-network + - pip install azure-cli-core azure-core azure-mgmt-authorization azure-mgmt-network azure-mgmt-compute azure-mgmt-msi # Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] diff --git a/python/ray/autoscaler/azure/example-gpu-docker.yaml b/python/ray/autoscaler/azure/example-gpu-docker.yaml index 6240d50ac..52ef692eb 100644 --- a/python/ray/autoscaler/azure/example-gpu-docker.yaml +++ b/python/ray/autoscaler/azure/example-gpu-docker.yaml @@ -60,26 +60,16 @@ auth: ssh_public_key: ~/.ssh/id_rsa.pub # Provider-specific config for the head node, e.g. instance type. 
By default -# Ray will auto-configure unspecified fields -# The Azure Python SDK client expects slug_style property names -# For more documentation on available fields, see: -# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python -# Note: the Azure Python SDK expects all parameter keys to be in slug_style -# the styles of parameter values are not changed +# Ray will auto-configure unspecified fields using example-full.yaml head_node: - hardware_profile: - vm_size: Standard_NC6s_v3 + azure_arm_parameters: + vmSize: Standard_NC6s_v3 # Provider-specific config for worker nodes, e.g. instance type. By default -# Ray will auto-configure unspecified fields -# The Azure Python SDK client expects slug_style property names -# For more documentation on available fields, see: -# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python -# Note: the Azure Python SDK expects all parameter keys to be in slug_style -# the styles of parameter values are not changed +# Ray will auto-configure unspecified fields using example-full.yaml worker_nodes: - hardware_profile: - vm_size: Standard_NC6s_v3 + azure_arm_parameters: + vmSize: Standard_NC6s_v3 # Files or directories to copy to the head and worker nodes. The format is a # dictionary from REMOTE_PATH: LOCAL_PATH, e.g. @@ -90,7 +80,7 @@ file_mounts: { # List of shell commands to run to set up nodes. setup_commands: - - pip install -U https://sdgraystorage.blob.core.windows.net/ray-wheels/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl # Custom commands that will be run on the head node after common setup. 
head_setup_commands: diff --git a/python/ray/autoscaler/azure/example-gpu.yaml b/python/ray/autoscaler/azure/example-gpu.yaml index ef87d1d67..cfe415bd8 100644 --- a/python/ray/autoscaler/azure/example-gpu.yaml +++ b/python/ray/autoscaler/azure/example-gpu.yaml @@ -66,53 +66,26 @@ auth: ssh_public_key: ~/.ssh/id_rsa.pub # Provider-specific config for the head node, e.g. instance type. By default -# Ray will auto-configure unspecified fields -# The Azure Python SDK client expects slug_style property names -# For more documentation on available fields, see: -# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python -# Note: the Azure Python SDK expects all parameter keys to be in slug_style -# the styles of parameter values are not changed +# Ray will auto-configure unspecified fields using example-full.yaml head_node: - hardware_profile: - vm_size: Standard_NC6 - storage_profile: - os_disk: - create_option: FromImage - caching: ReadWrite - image_reference: - # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage - publisher: microsoft-dsvm - offer: ubuntu-1804 - sku: 1804 - version: 20.02.01 + azure_arm_parameters: + vmSize: Standard_NC6 + # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage + imagePublisher: microsoft-dsvm + imageOffer: ubuntu-1804 + imageSku: 1804 + imageVersion: 20.02.01 # Provider-specific config for worker nodes, e.g. instance type. 
By default -# Ray will auto-configure unspecified fields -# Documentation on fields used can be found here: -# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python -# Note: the Azure Python SDK expects all parameter keys to be in slug_style -# the styles of parameter values are not changed +# Ray will auto-configure unspecified fields using example-full.yaml worker_nodes: - hardware_profile: - vm_size: Standard_NC6 - storage_profile: - os_disk: - create_option: FromImage - caching: ReadWrite - image_reference: - # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage - publisher: microsoft-dsvm - offer: ubuntu-1804 - sku: 1804 - version: 20.02.01 - # You can provision additional disk space as follows - # data_disks: - # - disk_size_gb: 1024 - # run workers on spot instances by default - priority: Spot - eviction_policy: Deallocate - billing_profile: - max_price: -1 + azure_arm_parameters: + vmSize: Standard_NC6 + # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage + imagePublisher: microsoft-dsvm + imageOffer: ubuntu-1804 + imageSku: 1804 + imageVersion: 20.02.01 # Files or directories to copy to the head and worker nodes. The format is a # dictionary from REMOTE_PATH: LOCAL_PATH, e.g. @@ -135,7 +108,7 @@ setup_commands: # below with a git checkout (and possibly a recompile). 
# - echo 'conda activate py37_pytorch' >> ~/.bashrc - echo 'conda activate py37_tensorflow' >> ~/.bashrc - - pip install -U https://sdgraystorage.blob.core.windows.net/ray-wheels/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl # Consider uncommenting these if you also want to run apt-get commands during setup # - sudo pkill -9 apt-get || true # - sudo pkill -9 dpkg || true diff --git a/python/ray/autoscaler/azure/node_provider.py b/python/ray/autoscaler/azure/node_provider.py index 68cdae055..3d24bdb33 100644 --- a/python/ray/autoscaler/azure/node_provider.py +++ b/python/ray/autoscaler/azure/node_provider.py @@ -1,4 +1,6 @@ +import json import logging +import os from threading import RLock from uuid import uuid4 @@ -6,7 +8,9 @@ from azure.common.client_factory import get_client_from_cli_profile from msrestazure.azure_active_directory import MSIAuthentication from azure.mgmt.compute import ComputeManagementClient from azure.mgmt.network import NetworkManagementClient -from azure.mgmt.compute.models import ResourceIdentityType +from azure.mgmt.resource import ResourceManagementClient +from azure.mgmt.resource.resources.models import DeploymentMode +from knack.util import CLIError from ray.autoscaler.node_provider import NodeProvider from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME, TAG_RAY_NODE_NAME @@ -50,15 +54,21 @@ class AzureNodeProvider(NodeProvider): client_class=ComputeManagementClient, **kwargs) self.network_client = get_client_from_cli_profile( client_class=NetworkManagementClient, **kwargs) - except Exception: - logger.info( - "CLI profile authentication failed. 
Trying MSI", exc_info=True) + self.resource_client = get_client_from_cli_profile( + client_class=ResourceManagementClient, **kwargs) + except CLIError as e: + if str(e) != "Please run 'az login' to setup account.": + raise + else: + logger.info("CLI profile authentication failed. Trying MSI") - credentials = MSIAuthentication() - self.compute_client = ComputeManagementClient( - credentials=credentials, **kwargs) - self.network_client = NetworkManagementClient( - credentials=credentials, **kwargs) + credentials = MSIAuthentication() + self.compute_client = ComputeManagementClient( + credentials=credentials, **kwargs) + self.network_client = NetworkManagementClient( + credentials=credentials, **kwargs) + self.resource_client = ResourceManagementClient( + credentials=credentials, **kwargs) self.lock = RLock() @@ -164,79 +174,43 @@ class AzureNodeProvider(NodeProvider): def create_node(self, node_config, tags, count): """Creates a number of nodes within the namespace.""" # TODO: restart deallocated nodes if possible - location = self.provider_config["location"] resource_group = self.provider_config["resource_group"] - subnet_id = self.provider_config["subnet_id"] - config = node_config.copy() - config_tags = config.get("tags", {}) + # load the template + template_path = os.path.join( + os.path.dirname(__file__), "azure-vm-template.json") + with open(template_path, "r") as template_file_fd: + template = json.load(template_file_fd) + + # get the tags + config_tags = node_config.get("tags", {}).copy() config_tags.update(tags) config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name - config["tags"] = config_tags - config["location"] = location name_tag = config_tags.get(TAG_RAY_NODE_NAME, "node") + unique_id = uuid4().hex[:VM_NAME_UUID_LEN] - for _ in range(count): - unique_id = uuid4().hex[:VM_NAME_UUID_LEN] - vm_name = "{name}-{id}".format(name=name_tag, id=unique_id) - config["os_profile"]["computer_name"] = vm_name + parameters = 
node_config["azure_arm_parameters"].copy() + parameters["vmName"] = "{name}-{id}".format( + name=name_tag, id=unique_id) + parameters["provisionPublicIp"] = not self.provider_config.get( + "use_internal_ips", False) + parameters["vmTags"] = config_tags + parameters["vmCount"] = count - try: - assert len(vm_name) <= VM_NAME_MAX_LEN - except AssertionError as e: - e.args += ("name", vm_name) - raise - - ip_configuration = {"name": uuid4(), "subnet": {"id": subnet_id}} - - if not self.provider_config.get("use_internal_ips", False): - # create public ip address - public_ip_addess_params = { - "location": location, - "public_ip_allocation_method": "Dynamic" - } - public_ip_address = ( - self.network_client.public_ip_addresses.create_or_update( - resource_group_name=resource_group, - public_ip_address_name="{}-ip".format(vm_name), - parameters=public_ip_addess_params).result()) - ip_configuration["public_ip_address"] = public_ip_address - - nic_params = { - "location": location, - "ip_configurations": [ip_configuration] + deployment_properties = { + "mode": DeploymentMode.incremental, + "template": template, + "parameters": {k: { + "value": v } - nic = self.network_client.network_interfaces.create_or_update( - resource_group_name=resource_group, - network_interface_name="{}-nic".format(vm_name), - parameters=nic_params).result() + for k, v in parameters.items()} + } - # update vm config with network parameters - config["network_profile"] = { - "network_interfaces": [{ - "id": nic.id - }] - } - - config["identity"] = { - "type": ResourceIdentityType.user_assigned, - "user_assigned_identities": [{ - # zero-documentation.. *sigh* - "key": self.provider_config["msi_identity_id"], - "value": { - "principal_id": self.provider_config[ - "msi_identity_principal_id"], - "client_id": self.provider_config["msi_identity_id"] - } - }] - } - - # TODO: do we need to wait or fire and forget is fine? 
- self.compute_client.virtual_machines.create_or_update( - resource_group_name=self.provider_config["resource_group"], - vm_name=vm_name, - parameters=config) + # TODO: we could get the private/public ips back directly + self.resource_client.deployments.create_or_update( + resource_group, "ray-vm-{}".format(name_tag), + deployment_properties).wait() @synchronized def set_node_tags(self, node_id, tags): @@ -252,34 +226,57 @@ class AzureNodeProvider(NodeProvider): def terminate_node(self, node_id): """Terminates the specified node. This will delete the VM and associated resources (NIC, IP, Storage) for the specified node.""" - # self.compute_client.virtual_machines.deallocate( - # resource_group_name=self.provider_config["resource_group"], - # vm_name=node_id) + resource_group = self.provider_config["resource_group"] - nodes = self._get_filtered_nodes( - tag_filters={TAG_RAY_CLUSTER_NAME: self.cluster_name}) - for node, metadata in nodes.items(): - # gather disks to delete later - vm = self.compute_client.virtual_machines.get( - resource_group_name=resource_group, vm_name=node) - disks = {d.name for d in vm.storage_profile.data_disks} - disks.add(vm.storage_profile.os_disk.name) + try: + # get metadata for node + metadata = self._get_node(node_id) + except KeyError: + # node no longer exists + return + + # TODO: deallocate instead of delete to allow possible reuse + # self.compute_client.virtual_machines.deallocate( + # resource_group_name=resource_group, + # vm_name=node_id) + + # gather disks to delete later + vm = self.compute_client.virtual_machines.get( + resource_group_name=resource_group, vm_name=node_id) + disks = {d.name for d in vm.storage_profile.data_disks} + disks.add(vm.storage_profile.os_disk.name) + + try: # delete machine, must wait for this to complete self.compute_client.virtual_machines.delete( - resource_group_name=resource_group, vm_name=node).wait() + resource_group_name=resource_group, vm_name=node_id).wait() + except Exception as e: + 
logger.warning("Failed to delete VM: {}".format(e)) + + try: # delete nic self.network_client.network_interfaces.delete( resource_group_name=resource_group, network_interface_name=metadata["nic_name"]) - # delete ip address - if "public_ip_name" in metadata: + except Exception as e: + logger.warning("Failed to delete nic: {}".format(e)) + + # delete ip address + if "public_ip_name" in metadata: + try: self.network_client.public_ip_addresses.delete( resource_group_name=resource_group, public_ip_address_name=metadata["public_ip_name"]) - # delete disks - for disk in disks: + except Exception as e: + logger.warning("Failed to delete public ip: {}".format(e)) + + # delete disks + for disk in disks: + try: self.compute_client.disks.delete( resource_group_name=resource_group, disk_name=disk) + except Exception as e: + logger.warning("Failed to delete disk: {}".format(e)) def _get_node(self, node_id): self._get_filtered_nodes({}) # Side effect: updates cache diff --git a/python/setup.py b/python/setup.py index 1013832e0..9a95a4d74 100644 --- a/python/setup.py +++ b/python/setup.py @@ -42,6 +42,8 @@ optional_ray_files = [] ray_autoscaler_files = [ "ray/autoscaler/aws/example-full.yaml", "ray/autoscaler/azure/example-full.yaml", + "ray/autoscaler/azure/azure-vm-template.json", + "ray/autoscaler/azure/azure-config-template.json", "ray/autoscaler/gcp/example-full.yaml", "ray/autoscaler/local/example-full.yaml", "ray/autoscaler/kubernetes/example-full.yaml",