mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 19:00:36 +08:00
[autoscaler] Switch to ARM for Azure deployment (#7717)
* switch to ARM templates for config and VMs * switch to ARM templates for config and VMs * auto-formatting * addressed Scotts comment * added missing imports * fixed gpu templates fixed wheel reference * added missing reference * cleanup wording and yamls * Update doc/source/autoscaling.rst Co-Authored-By: Scott Graham <5720537+gramhagen@users.noreply.github.com> Co-authored-by: Ubuntu <marcozo@marcozodev2.zqvgrdyupqrudayw1il1agipig.jx.internal.cloudapp.net> Co-authored-by: Scott Graham <5720537+gramhagen@users.noreply.github.com>
This commit is contained in:
+14
-10
@@ -59,8 +59,12 @@ Test that it works by running the following commands from your local machine:
|
||||
|
||||
# Get a remote screen on the head node.
|
||||
$ ray attach ray/python/ray/autoscaler/azure/example-full.yaml
|
||||
$ source activate tensorflow_p36
|
||||
$ # Try running a Ray program with 'ray.init(address="auto")'.
|
||||
# test ray setup
|
||||
# enable conda environment
|
||||
$ exec bash -l
|
||||
$ conda activate py37_tensorflow
|
||||
$ python -c 'import ray; ray.init()'
|
||||
$ exit
|
||||
# Tear down the cluster.
|
||||
$ ray down ray/python/ray/autoscaler/azure/example-full.yaml
|
||||
|
||||
@@ -69,26 +73,26 @@ Azure Portal
|
||||
|
||||
Alternatively, you can deploy a cluster using Azure portal directly. Please note that auto scaling is done using Azure VM Scale Sets and not through
|
||||
the Ray autoscaler. This will deploy `Azure Data Science VMs (DSVM) <https://azure.microsoft.com/en-us/services/virtual-machines/data-science-virtual-machines/>`_
|
||||
for both the head node and an auto-scale cluster managed by `Azure Virtual Machine Scale Sets <https://azure.microsoft.com/en-us/services/virtual-machine-scale-sets/>`_.
|
||||
The head node conviently exposes both SSH as well as JupyterLab.
|
||||
for both the head node and the auto-scalable cluster managed by `Azure Virtual Machine Scale Sets <https://azure.microsoft.com/en-us/services/virtual-machine-scale-sets/>`_.
|
||||
The head node conveniently exposes both SSH as well as JupyterLab.
|
||||
|
||||
.. image:: https://aka.ms/deploytoazurebutton
|
||||
:target: https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fray-project%2Fray%2Fmaster%2Fdoc%2Fazure%2Fazure-ray-template.json
|
||||
:alt: Deploy to Azure
|
||||
|
||||
Once the template is successfully deploy the deployment output page provides the ssh command to connect and the link to the JupyterHub on the head node (username/password as specified on the template input).
|
||||
Use the following code connect to the Ray cluster.
|
||||
Once the template is successfully deployed, the deployment output page provides the SSH command to connect and the link to the JupyterHub on the head node (username/password as specified in the template input).
|
||||
Use the following code in a Jupyter notebook to connect to the Ray cluster.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import ray
|
||||
ray.init(address='auto')
|
||||
|
||||
Note that on each node the `azure-init.sh <https://github.com/ray-project/ray/blob/master/doc/azure/azure-init.sh>`_ script is executed and performs
|
||||
Note that on each node the `azure-init.sh <https://github.com/ray-project/ray/blob/master/doc/azure/azure-init.sh>`_ script is executed and performs the following actions:
|
||||
|
||||
1. activate one of the conda environments available on DSVM
|
||||
2. install Ray and any other user-specified dependencies
|
||||
3. setup of a systemd task (``/lib/systemd/system/ray.service``) which starting ray in head or worker mode
|
||||
1. Activates one of the conda environments available on DSVM
|
||||
2. Installs Ray and any other user-specified dependencies
|
||||
3. Sets up a systemd task (``/lib/systemd/system/ray.service``) to start Ray in head or worker mode
|
||||
|
||||
GCP
|
||||
~~~
|
||||
|
||||
@@ -0,0 +1,81 @@
|
||||
{
|
||||
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
|
||||
"contentVersion": "1.0.0.0",
|
||||
"parameters": {
|
||||
"subnet": {
|
||||
"type": "string",
|
||||
"metadata": {
|
||||
"description": "The subnet to be used"
|
||||
}
|
||||
}
|
||||
},
|
||||
"variables": {
|
||||
"Contributor": "[concat('/subscriptions/', subscription().subscriptionId, '/providers/Microsoft.Authorization/roleDefinitions/', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]"
|
||||
},
|
||||
"resources": [
|
||||
{
|
||||
"type": "Microsoft.ManagedIdentity/userAssignedIdentities",
|
||||
"apiVersion": "2018-11-30",
|
||||
"location": "[resourceGroup().location]",
|
||||
"name": "ray-msi-user-identity"
|
||||
},
|
||||
{
|
||||
"type": "Microsoft.Authorization/roleAssignments",
|
||||
"apiVersion": "2018-09-01-preview",
|
||||
"name": "[guid(resourceGroup().id)]",
|
||||
"properties": {
|
||||
"principalId": "[reference('ray-msi-user-identity').principalId]",
|
||||
"roleDefinitionId": "[variables('Contributor')]",
|
||||
"scope": "[resourceGroup().id]",
|
||||
"principalType": "ServicePrincipal"
|
||||
},
|
||||
"dependsOn": [
|
||||
"[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', 'ray-msi-user-identity')]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "Microsoft.Network/networkSecurityGroups",
|
||||
"apiVersion": "2019-02-01",
|
||||
"name": "ray-nsg",
|
||||
"location": "[resourceGroup().location]",
|
||||
"properties": {
|
||||
"securityRules": [
|
||||
{
|
||||
"name": "SSH",
|
||||
"properties": {
|
||||
"priority": 1000,
|
||||
"protocol": "TCP",
|
||||
"access": "Allow",
|
||||
"direction": "Inbound",
|
||||
"sourceAddressPrefix": "*",
|
||||
"sourcePortRange": "*",
|
||||
"destinationAddressPrefix": "*",
|
||||
"destinationPortRange": "22"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Microsoft.Network/virtualNetworks",
|
||||
"apiVersion": "2019-11-01",
|
||||
"name": "ray-vnet",
|
||||
"location": "[resourceGroup().location]",
|
||||
"properties": {
|
||||
"addressSpace": {
|
||||
"addressPrefixes": [
|
||||
"[parameters('subnet')]"
|
||||
]
|
||||
},
|
||||
"subnets": [
|
||||
{
|
||||
"name": "ray-subnet",
|
||||
"properties": {
|
||||
"addressPrefix": "[parameters('subnet')]"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,243 @@
|
||||
{
|
||||
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
|
||||
"contentVersion": "1.0.0.0",
|
||||
"parameters": {
|
||||
"vmName": {
|
||||
"type": "string",
|
||||
"metadata": {
|
||||
"description": "The name of your Virtual Machine."
|
||||
}
|
||||
},
|
||||
"adminUsername": {
|
||||
"type": "string",
|
||||
"metadata": {
|
||||
"description": "Username for the Virtual Machine."
|
||||
}
|
||||
},
|
||||
"publicKey": {
|
||||
"type": "securestring",
|
||||
"metadata": {
|
||||
"description": "SSH Key for the Virtual Machine"
|
||||
}
|
||||
},
|
||||
"imagePublisher": {
|
||||
"type": "string",
|
||||
"metadata": {
|
||||
"description": "The publisher of the VM image"
|
||||
}
|
||||
},
|
||||
"imageOffer": {
|
||||
"type": "string",
|
||||
"metadata": {
|
||||
"description": "The offer of the VM image"
|
||||
}
|
||||
},
|
||||
"imageSku": {
|
||||
"type": "string",
|
||||
"metadata": {
|
||||
"description": "The sku of the VM image"
|
||||
}
|
||||
},
|
||||
"imageVersion": {
|
||||
"type": "string",
|
||||
"metadata": {
|
||||
"description": "The version of the VM image"
|
||||
}
|
||||
},
|
||||
"vmSize": {
|
||||
"type": "string",
|
||||
"metadata": {
|
||||
"description": "The size of the VM"
|
||||
}
|
||||
},
|
||||
"vmTags": {
|
||||
"type": "object",
|
||||
"metadata": {
|
||||
"description": "Tags for the VM"
|
||||
}
|
||||
},
|
||||
"vmCount": {
|
||||
"type": "int",
|
||||
"metadata": {
|
||||
"description": "Number of VMs to deploy"
|
||||
}
|
||||
},
|
||||
"provisionPublicIp": {
|
||||
"type": "bool",
|
||||
"defaultValue": true,
|
||||
"metadata": {
|
||||
"description": "If true creates a public ip"
|
||||
}
|
||||
}
|
||||
},
|
||||
"variables": {
|
||||
"publicIpAddressName": "[concat(parameters('vmName'), '-ip' )]",
|
||||
"networkInterfaceNamePrivate": "[concat(parameters('vmName'),'-nic')]",
|
||||
"networkInterfaceNamePublic": "[concat(parameters('vmName'),'-nic-public')]",
|
||||
"networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]",
|
||||
"networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]",
|
||||
"subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]",
|
||||
"osDiskType": "Standard_LRS"
|
||||
},
|
||||
"resources": [
|
||||
{
|
||||
"type": "Microsoft.Network/networkInterfaces",
|
||||
"apiVersion": "2018-10-01",
|
||||
"name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"dependsOn": [
|
||||
"[resourceId('Microsoft.Network/publicIpAddresses/', concat(variables('publicIpAddressName'), copyIndex()))]"
|
||||
],
|
||||
"copy": {
|
||||
"name": "NICPublicCopy",
|
||||
"count": "[parameters('vmCount')]"
|
||||
},
|
||||
"properties": {
|
||||
"ipConfigurations": [
|
||||
{
|
||||
"name": "[variables('networkIpConfig')]",
|
||||
"properties": {
|
||||
"subnet": {
|
||||
"id": "[variables('subnetRef')]"
|
||||
},
|
||||
"privateIPAllocationMethod": "Dynamic",
|
||||
"publicIpAddress": {
|
||||
"id": "[resourceId('Microsoft.Network/publicIPAddresses', concat(variables('publicIPAddressName'), copyIndex()))]"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"networkSecurityGroup": {
|
||||
"id": "[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]"
|
||||
}
|
||||
},
|
||||
"condition": "[parameters('provisionPublicIp')]"
|
||||
},
|
||||
{
|
||||
"type": "Microsoft.Network/networkInterfaces",
|
||||
"apiVersion": "2018-10-01",
|
||||
"name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"copy": {
|
||||
"name": "NICPrivateCopy",
|
||||
"count": "[parameters('vmCount')]"
|
||||
},
|
||||
"properties": {
|
||||
"ipConfigurations": [
|
||||
{
|
||||
"name": "[variables('networkIpConfig')]",
|
||||
"properties": {
|
||||
"subnet": {
|
||||
"id": "[variables('subnetRef')]"
|
||||
},
|
||||
"privateIPAllocationMethod": "Dynamic"
|
||||
}
|
||||
}
|
||||
],
|
||||
"networkSecurityGroup": {
|
||||
"id": "[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]"
|
||||
}
|
||||
},
|
||||
"condition": "[not(parameters('provisionPublicIp'))]"
|
||||
},
|
||||
{
|
||||
"type": "Microsoft.Network/publicIpAddresses",
|
||||
"apiVersion": "2019-02-01",
|
||||
"name": "[concat(variables('publicIpAddressName'), copyIndex())]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"properties": {
|
||||
"publicIpAllocationMethod": "Static",
|
||||
"publicIPAddressVersion": "IPv4"
|
||||
},
|
||||
"copy": {
|
||||
"name": "PublicIpCopy",
|
||||
"count": "[parameters('vmCount')]"
|
||||
},
|
||||
"sku": {
|
||||
"name": "Basic",
|
||||
"tier": "Regional"
|
||||
},
|
||||
"condition": "[parameters('provisionPublicIp')]"
|
||||
},
|
||||
{
|
||||
"type": "Microsoft.Compute/virtualMachines",
|
||||
"apiVersion": "2019-03-01",
|
||||
"name": "[concat(parameters('vmName'), copyIndex())]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"dependsOn": [
|
||||
"[resourceId('Microsoft.Network/networkInterfaces/', concat(variables('networkInterfaceName'), copyIndex()))]"
|
||||
],
|
||||
"copy": {
|
||||
"name": "VmCopy",
|
||||
"count": "[parameters('vmCount')]"
|
||||
},
|
||||
"tags": "[parameters('vmTags')]",
|
||||
"properties": {
|
||||
"hardwareProfile": {
|
||||
"vmSize": "[parameters('vmSize')]"
|
||||
},
|
||||
"storageProfile": {
|
||||
"osDisk": {
|
||||
"createOption": "fromImage",
|
||||
"managedDisk": {
|
||||
"storageAccountType": "[variables('osDiskType')]"
|
||||
}
|
||||
},
|
||||
"imageReference": {
|
||||
"publisher": "[parameters('imagePublisher')]",
|
||||
"offer": "[parameters('imageOffer')]",
|
||||
"sku": "[parameters('imageSku')]",
|
||||
"version": "[parameters('imageVersion')]"
|
||||
}
|
||||
},
|
||||
"networkProfile": {
|
||||
"networkInterfaces": [
|
||||
{
|
||||
"id": "[resourceId('Microsoft.Network/networkInterfaces', concat(variables('networkInterfaceName'), copyIndex()))]"
|
||||
}
|
||||
]
|
||||
},
|
||||
"osProfile": {
|
||||
"computerName": "[concat(parameters('vmName'), copyIndex())]",
|
||||
"adminUsername": "[parameters('adminUsername')]",
|
||||
"adminPassword": "[parameters('publicKey')]",
|
||||
"linuxConfiguration": {
|
||||
"disablePasswordAuthentication": true,
|
||||
"ssh": {
|
||||
"publicKeys": [
|
||||
{
|
||||
"path": "[concat('/home/', parameters('adminUsername'), '/.ssh/authorized_keys')]",
|
||||
"keyData": "[parameters('publicKey')]"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"identity": {
|
||||
"type": "UserAssigned",
|
||||
"userAssignedIdentities": {
|
||||
"[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', 'ray-msi-user-identity')]": {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"outputs": {
|
||||
"publicIp": {
|
||||
"type": "array",
|
||||
"copy": {
|
||||
"count": "[parameters('vmCount')]",
|
||||
"input": "[reference(concat(variables('publicIpAddressName'), copyIndex())).ipAddress]"
|
||||
},
|
||||
"condition": "[parameters('provisionPublicIp')]"
|
||||
},
|
||||
"privateIp": {
|
||||
"type": "array",
|
||||
"copy": {
|
||||
"count": "[parameters('vmCount')]",
|
||||
"input": "[reference(concat(variables('networkInterfaceName'), copyIndex())).ipConfigurations[0].properties.privateIPAddress]"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,14 +1,11 @@
|
||||
import json
|
||||
import logging
|
||||
import random
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
|
||||
from azure.common.exceptions import CloudError, AuthenticationError
|
||||
from azure.common.client_factory import get_client_from_cli_profile
|
||||
from azure.mgmt.authorization import AuthorizationManagementClient
|
||||
from azure.mgmt.network import NetworkManagementClient
|
||||
from azure.mgmt.resource import ResourceManagementClient
|
||||
from azure.mgmt.msi import ManagedServiceIdentityClient
|
||||
from azure.mgmt.resource.resources.models import DeploymentMode
|
||||
|
||||
RETRIES = 30
|
||||
MSI_NAME = "ray-msi-user-identity"
|
||||
@@ -20,10 +17,8 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def bootstrap_azure(config):
|
||||
config = _configure_resource_group(config)
|
||||
config = _configure_msi_user(config)
|
||||
config = _configure_key_pair(config)
|
||||
config = _configure_network(config)
|
||||
config = _configure_resource_group(config)
|
||||
return config
|
||||
|
||||
|
||||
@@ -59,67 +54,36 @@ def _configure_resource_group(config):
|
||||
resource_client.resource_groups.create_or_update(
|
||||
resource_group_name=resource_group, parameters=params)
|
||||
|
||||
return config
|
||||
# load the template
|
||||
template_path = os.path.join(
|
||||
os.path.dirname(__file__), "azure-config-template.json")
|
||||
with open(template_path, "r") as template_file_fd:
|
||||
template = json.load(template_file_fd)
|
||||
|
||||
# choose a random subnet
|
||||
random.seed(resource_group)
|
||||
# start at 1 to avoid most likely collision at 0
|
||||
parameters = {"subnet": "10.{}.0.0/16".format(random.randint(1, 254))}
|
||||
|
||||
def _configure_msi_user(config):
|
||||
msi_client = _get_client(ManagedServiceIdentityClient, config)
|
||||
resource_client = _get_client(ResourceManagementClient, config)
|
||||
auth_client = _get_client(AuthorizationManagementClient, config)
|
||||
deployment_properties = {
|
||||
"mode": DeploymentMode.incremental,
|
||||
"template": template,
|
||||
"parameters": {k: {
|
||||
"value": v
|
||||
}
|
||||
for k, v in parameters.items()}
|
||||
}
|
||||
|
||||
resource_group = config["provider"]["resource_group"]
|
||||
location = config["provider"]["location"]
|
||||
|
||||
resource_group_id = resource_client.resource_groups.get(resource_group).id
|
||||
try:
|
||||
identity = msi_client.user_assigned_identities.list_by_resource_group(
|
||||
resource_group_name=resource_group,
|
||||
filter="name eq '{}'".format(MSI_NAME)).next()
|
||||
logger.info("Found MSI User Assigned Identity: %s", MSI_NAME)
|
||||
except StopIteration:
|
||||
logger.info("Creating MSI User Assigned Identity: %s", MSI_NAME)
|
||||
identity = msi_client.user_assigned_identities.create_or_update(
|
||||
resource_group_name=resource_group,
|
||||
resource_name=MSI_NAME,
|
||||
location=location)
|
||||
|
||||
identity_id = identity.id
|
||||
principal_id = identity.principal_id
|
||||
config["provider"]["msi_identity_id"] = identity_id
|
||||
config["provider"]["msi_identity_principal_id"] = principal_id
|
||||
|
||||
# assign Contributor role for MSI User Identity to resource group
|
||||
role_id = auth_client.role_definitions.list(
|
||||
scope=resource_group_id, filter="roleName eq 'Contributor'").next().id
|
||||
role_params = {"role_definition_id": role_id, "principal_id": principal_id}
|
||||
|
||||
for _ in range(RETRIES):
|
||||
try:
|
||||
filter_expr = "principalId eq '{}'".format(principal_id)
|
||||
assignments = auth_client.role_assignments.list_for_scope(
|
||||
scope=resource_group_id, filter=filter_expr)
|
||||
|
||||
if any(a.role_definition_id == role_id for a in assignments):
|
||||
break
|
||||
|
||||
auth_client.role_assignments.create(
|
||||
scope=resource_group_id,
|
||||
role_assignment_name=uuid.uuid4(),
|
||||
parameters=role_params)
|
||||
logger.info("Assigning Contributor Role to MSI User")
|
||||
except CloudError as ce:
|
||||
if ce.inner_exception.error == "PrincipalNotFound":
|
||||
time.sleep(5)
|
||||
else:
|
||||
raise Exception(
|
||||
"Failed to create contributor role assignment (timeout)")
|
||||
deployment_async_operation = resource_client.deployments.create_or_update(
|
||||
resource_group, "ray-config", deployment_properties)
|
||||
deployment_async_operation.wait()
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _configure_key_pair(config):
|
||||
ssh_user = config["auth"]["ssh_user"]
|
||||
|
||||
# search if the keys exist
|
||||
for key_type in ["ssh_private_key", "ssh_public_key"]:
|
||||
try:
|
||||
key_path = os.path.expanduser(config["auth"][key_type])
|
||||
@@ -135,93 +99,8 @@ def _configure_key_pair(config):
|
||||
with open(key_path, "r") as f:
|
||||
public_key = f.read()
|
||||
|
||||
os_profile = {
|
||||
"admin_username": ssh_user,
|
||||
"computer_name": None,
|
||||
"linux_configuration": {
|
||||
"disable_password_authentication": True,
|
||||
"ssh": {
|
||||
"public_keys": [{
|
||||
"key_data": public_key,
|
||||
"path": "/home/{}/.ssh/authorized_keys".format(ssh_user)
|
||||
}]
|
||||
}
|
||||
}
|
||||
}
|
||||
for node_type in ["head_node", "worker_nodes"]:
|
||||
config[node_type]["os_profile"] = os_profile
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _configure_network(config):
|
||||
# skip this if subnet is manually set in configuration yaml
|
||||
if "subnet_id" in config["provider"]:
|
||||
return config
|
||||
|
||||
location = config["provider"]["location"]
|
||||
resource_group = config["provider"]["resource_group"]
|
||||
network_client = _get_client(NetworkManagementClient, config)
|
||||
|
||||
vnets = []
|
||||
for _ in range(RETRIES):
|
||||
try:
|
||||
vnets = list(
|
||||
network_client.virtual_networks.list(
|
||||
resource_group_name=resource_group,
|
||||
filter="name eq '{}'".format(VNET_NAME)))
|
||||
break
|
||||
except CloudError:
|
||||
time.sleep(1)
|
||||
except AuthenticationError:
|
||||
# wait for service principal authorization to populate
|
||||
time.sleep(1)
|
||||
|
||||
# can't update vnet if subnet already exists
|
||||
if not vnets:
|
||||
# create vnet
|
||||
logger.info("Creating/Updating VNet: %s", VNET_NAME)
|
||||
vnet_params = {
|
||||
"location": location,
|
||||
"address_space": {
|
||||
"address_prefixes": ["10.0.0.0/16"]
|
||||
}
|
||||
}
|
||||
network_client.virtual_networks.create_or_update(
|
||||
resource_group_name=resource_group,
|
||||
virtual_network_name=VNET_NAME,
|
||||
parameters=vnet_params).wait()
|
||||
|
||||
# create subnet
|
||||
logger.info("Creating/Updating Subnet: %s", SUBNET_NAME)
|
||||
subnet_params = {"address_prefix": "10.0.0.0/24"}
|
||||
subnet = network_client.subnets.create_or_update(
|
||||
resource_group_name=resource_group,
|
||||
virtual_network_name=VNET_NAME,
|
||||
subnet_name=SUBNET_NAME,
|
||||
subnet_parameters=subnet_params).result()
|
||||
|
||||
config["provider"]["subnet_id"] = subnet.id
|
||||
|
||||
# create network security group
|
||||
logger.info("Creating/Updating Network Security Group: %s", NSG_NAME)
|
||||
nsg_params = {
|
||||
"location": location,
|
||||
"security_rules": [{
|
||||
"protocol": "Tcp",
|
||||
"source_port_range": "*",
|
||||
"source_address_prefix": "*",
|
||||
"destination_port_range": "22",
|
||||
"destination_address_prefix": "*",
|
||||
"access": "Allow",
|
||||
"priority": 300,
|
||||
"direction": "Inbound",
|
||||
"name": "ssh_rule"
|
||||
}]
|
||||
}
|
||||
network_client.network_security_groups.create_or_update(
|
||||
resource_group_name=resource_group,
|
||||
network_security_group_name=NSG_NAME,
|
||||
parameters=nsg_params).wait()
|
||||
config[node_type]["azure_arm_parameters"]["adminUsername"] = ssh_user
|
||||
config[node_type]["azure_arm_parameters"]["publicKey"] = public_key
|
||||
|
||||
return config
|
||||
|
||||
@@ -65,54 +65,25 @@ auth:
|
||||
ssh_private_key: ~/.ssh/id_rsa
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields
|
||||
# The Azure Python SDK client expects slug_style property names
|
||||
# For more documentation on available fields, see:
|
||||
# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
|
||||
# Note: the Azure Python SDK expects all parameter keys to be in slug_style
|
||||
# the styles of parameter values are not changed
|
||||
# Provider-specific config for the head node, e.g. instance type.
|
||||
head_node:
|
||||
hardware_profile:
|
||||
vm_size: Standard_D2s_v3
|
||||
storage_profile:
|
||||
os_disk:
|
||||
create_option: FromImage
|
||||
caching: ReadWrite
|
||||
image_reference:
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
publisher: microsoft-dsvm
|
||||
offer: ubuntu-1804
|
||||
sku: 1804-gen2
|
||||
version: 20.02.01
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_D2s_v3
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.02.01
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields
|
||||
# Documentation on fields used can be found here:
|
||||
# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
|
||||
# Note: the Azure Python SDK expects all parameter keys to be in slug_style
|
||||
# the styles of parameter values are not changed
|
||||
# Provider-specific config for worker nodes, e.g. instance type.
|
||||
worker_nodes:
|
||||
hardware_profile:
|
||||
vm_size: Standard_F2s_v2
|
||||
storage_profile:
|
||||
os_disk:
|
||||
create_option: FromImage
|
||||
caching: ReadWrite
|
||||
image_reference:
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
publisher: microsoft-dsvm
|
||||
offer: ubuntu-1804
|
||||
sku: 1804-gen
|
||||
version: 20.02.01
|
||||
# You can provision additional disk space as follows
|
||||
# data_disks:
|
||||
# - disk_size_gb: 1024
|
||||
# run workers on spot instances by default
|
||||
priority: Spot
|
||||
eviction_policy: Deallocate
|
||||
billing_profile:
|
||||
max_price: -1
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_D2s_v3
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.02.01
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
@@ -133,10 +104,9 @@ setup_commands:
|
||||
# Note: if you're developing Ray, you probably want to create a VM image that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# change to use environment desired
|
||||
#- echo "conda activate py37_pytorch" >> ~/.bashrc
|
||||
#- echo "conda activate py37_tensorflow" >> ~/.bashrc
|
||||
- pip install -U https://sdgraystorage.blob.core.windows.net/ray-wheels/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
|
||||
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
|
||||
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
|
||||
# Consider uncommenting these if you also want to run apt-get commands during setup
|
||||
# - sudo pkill -9 apt-get || true
|
||||
# - sudo pkill -9 dpkg || true
|
||||
@@ -144,7 +114,7 @@ setup_commands:
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
- pip install azure-cli-core azure-core azure-mgmt-authorization azure-mgmt-compute azure-mgmt-msi azure-mgmt-network
|
||||
- pip install azure-cli-core azure-core azure-mgmt-authorization azure-mgmt-network azure-mgmt-compute azure-mgmt-msi
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
@@ -60,26 +60,16 @@ auth:
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields
|
||||
# The Azure Python SDK client expects slug_style property names
|
||||
# For more documentation on available fields, see:
|
||||
# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
|
||||
# Note: the Azure Python SDK expects all parameter keys to be in slug_style
|
||||
# the styles of parameter values are not changed
|
||||
# Ray will auto-configure unspecified fields using example-full.yaml
|
||||
head_node:
|
||||
hardware_profile:
|
||||
vm_size: Standard_NC6s_v3
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_NC6s_v3
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields
|
||||
# The Azure Python SDK client expects slug_style property names
|
||||
# For more documentation on available fields, see:
|
||||
# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
|
||||
# Note: the Azure Python SDK expects all parameter keys to be in slug_style
|
||||
# the styles of parameter values are not changed
|
||||
# Ray will auto-configure unspecified fields using example-full.yaml
|
||||
worker_nodes:
|
||||
hardware_profile:
|
||||
vm_size: Standard_NC6s_v3
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_NC6s_v3
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
@@ -90,7 +80,7 @@ file_mounts: {
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
- pip install -U https://sdgraystorage.blob.core.windows.net/ray-wheels/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
|
||||
@@ -66,53 +66,26 @@ auth:
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields
|
||||
# The Azure Python SDK client expects slug_style property names
|
||||
# For more documentation on available fields, see:
|
||||
# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
|
||||
# Note: the Azure Python SDK expects all parameter keys to be in slug_style
|
||||
# the styles of parameter values are not changed
|
||||
# Ray will auto-configure unspecified fields using example-full.yaml
|
||||
head_node:
|
||||
hardware_profile:
|
||||
vm_size: Standard_NC6
|
||||
storage_profile:
|
||||
os_disk:
|
||||
create_option: FromImage
|
||||
caching: ReadWrite
|
||||
image_reference:
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
publisher: microsoft-dsvm
|
||||
offer: ubuntu-1804
|
||||
sku: 1804
|
||||
version: 20.02.01
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_NC6
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804
|
||||
imageVersion: 20.02.01
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields
|
||||
# Documentation on fields used can be found here:
|
||||
# https://docs.microsoft.com/en-us/python/api/overview/azure/virtualmachines?view=azure-python
|
||||
# Note: the Azure Python SDK expects all parameter keys to be in slug_style
|
||||
# the styles of parameter values are not changed
|
||||
# Ray will auto-configure unspecified fields using example-full.yaml
|
||||
worker_nodes:
|
||||
hardware_profile:
|
||||
vm_size: Standard_NC6
|
||||
storage_profile:
|
||||
os_disk:
|
||||
create_option: FromImage
|
||||
caching: ReadWrite
|
||||
image_reference:
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
publisher: microsoft-dsvm
|
||||
offer: ubuntu-1804
|
||||
sku: 1804
|
||||
version: 20.02.01
|
||||
# You can provision additional disk space as follows
|
||||
# data_disks:
|
||||
# - disk_size_gb: 1024
|
||||
# run workers on spot instances by default
|
||||
priority: Spot
|
||||
eviction_policy: Deallocate
|
||||
billing_profile:
|
||||
max_price: -1
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_NC6
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804
|
||||
imageVersion: 20.02.01
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
@@ -135,7 +108,7 @@ setup_commands:
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
|
||||
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
|
||||
- pip install -U https://sdgraystorage.blob.core.windows.net/ray-wheels/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
|
||||
# Consider uncommenting these if you also want to run apt-get commands during setup
|
||||
# - sudo pkill -9 apt-get || true
|
||||
# - sudo pkill -9 dpkg || true
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from threading import RLock
|
||||
from uuid import uuid4
|
||||
|
||||
@@ -6,7 +8,9 @@ from azure.common.client_factory import get_client_from_cli_profile
|
||||
from msrestazure.azure_active_directory import MSIAuthentication
|
||||
from azure.mgmt.compute import ComputeManagementClient
|
||||
from azure.mgmt.network import NetworkManagementClient
|
||||
from azure.mgmt.compute.models import ResourceIdentityType
|
||||
from azure.mgmt.resource import ResourceManagementClient
|
||||
from azure.mgmt.resource.resources.models import DeploymentMode
|
||||
from knack.util import CLIError
|
||||
|
||||
from ray.autoscaler.node_provider import NodeProvider
|
||||
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME, TAG_RAY_NODE_NAME
|
||||
@@ -50,15 +54,21 @@ class AzureNodeProvider(NodeProvider):
|
||||
client_class=ComputeManagementClient, **kwargs)
|
||||
self.network_client = get_client_from_cli_profile(
|
||||
client_class=NetworkManagementClient, **kwargs)
|
||||
except Exception:
|
||||
logger.info(
|
||||
"CLI profile authentication failed. Trying MSI", exc_info=True)
|
||||
self.resource_client = get_client_from_cli_profile(
|
||||
client_class=ResourceManagementClient, **kwargs)
|
||||
except CLIError as e:
|
||||
if str(e) != "Please run 'az login' to setup account.":
|
||||
raise
|
||||
else:
|
||||
logger.info("CLI profile authentication failed. Trying MSI")
|
||||
|
||||
credentials = MSIAuthentication()
|
||||
self.compute_client = ComputeManagementClient(
|
||||
credentials=credentials, **kwargs)
|
||||
self.network_client = NetworkManagementClient(
|
||||
credentials=credentials, **kwargs)
|
||||
credentials = MSIAuthentication()
|
||||
self.compute_client = ComputeManagementClient(
|
||||
credentials=credentials, **kwargs)
|
||||
self.network_client = NetworkManagementClient(
|
||||
credentials=credentials, **kwargs)
|
||||
self.resource_client = ResourceManagementClient(
|
||||
credentials=credentials, **kwargs)
|
||||
|
||||
self.lock = RLock()
|
||||
|
||||
@@ -164,79 +174,43 @@ class AzureNodeProvider(NodeProvider):
|
||||
def create_node(self, node_config, tags, count):
|
||||
"""Creates a number of nodes within the namespace."""
|
||||
# TODO: restart deallocated nodes if possible
|
||||
location = self.provider_config["location"]
|
||||
resource_group = self.provider_config["resource_group"]
|
||||
subnet_id = self.provider_config["subnet_id"]
|
||||
|
||||
config = node_config.copy()
|
||||
config_tags = config.get("tags", {})
|
||||
# load the template
|
||||
template_path = os.path.join(
|
||||
os.path.dirname(__file__), "azure-vm-template.json")
|
||||
with open(template_path, "r") as template_file_fd:
|
||||
template = json.load(template_file_fd)
|
||||
|
||||
# get the tags
|
||||
config_tags = node_config.get("tags", {}).copy()
|
||||
config_tags.update(tags)
|
||||
config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
|
||||
|
||||
config["tags"] = config_tags
|
||||
config["location"] = location
|
||||
name_tag = config_tags.get(TAG_RAY_NODE_NAME, "node")
|
||||
unique_id = uuid4().hex[:VM_NAME_UUID_LEN]
|
||||
|
||||
for _ in range(count):
|
||||
unique_id = uuid4().hex[:VM_NAME_UUID_LEN]
|
||||
vm_name = "{name}-{id}".format(name=name_tag, id=unique_id)
|
||||
config["os_profile"]["computer_name"] = vm_name
|
||||
parameters = node_config["azure_arm_parameters"].copy()
|
||||
parameters["vmName"] = "{name}-{id}".format(
|
||||
name=name_tag, id=unique_id)
|
||||
parameters["provisionPublicIp"] = not self.provider_config.get(
|
||||
"use_internal_ips", False)
|
||||
parameters["vmTags"] = config_tags
|
||||
parameters["vmCount"] = count
|
||||
|
||||
try:
|
||||
assert len(vm_name) <= VM_NAME_MAX_LEN
|
||||
except AssertionError as e:
|
||||
e.args += ("name", vm_name)
|
||||
raise
|
||||
|
||||
ip_configuration = {"name": uuid4(), "subnet": {"id": subnet_id}}
|
||||
|
||||
if not self.provider_config.get("use_internal_ips", False):
|
||||
# create public ip address
|
||||
public_ip_addess_params = {
|
||||
"location": location,
|
||||
"public_ip_allocation_method": "Dynamic"
|
||||
}
|
||||
public_ip_address = (
|
||||
self.network_client.public_ip_addresses.create_or_update(
|
||||
resource_group_name=resource_group,
|
||||
public_ip_address_name="{}-ip".format(vm_name),
|
||||
parameters=public_ip_addess_params).result())
|
||||
ip_configuration["public_ip_address"] = public_ip_address
|
||||
|
||||
nic_params = {
|
||||
"location": location,
|
||||
"ip_configurations": [ip_configuration]
|
||||
deployment_properties = {
|
||||
"mode": DeploymentMode.incremental,
|
||||
"template": template,
|
||||
"parameters": {k: {
|
||||
"value": v
|
||||
}
|
||||
nic = self.network_client.network_interfaces.create_or_update(
|
||||
resource_group_name=resource_group,
|
||||
network_interface_name="{}-nic".format(vm_name),
|
||||
parameters=nic_params).result()
|
||||
for k, v in parameters.items()}
|
||||
}
|
||||
|
||||
# update vm config with network parameters
|
||||
config["network_profile"] = {
|
||||
"network_interfaces": [{
|
||||
"id": nic.id
|
||||
}]
|
||||
}
|
||||
|
||||
config["identity"] = {
|
||||
"type": ResourceIdentityType.user_assigned,
|
||||
"user_assigned_identities": [{
|
||||
# NOTE: the expected layout of this entry is not documented
|
||||
"key": self.provider_config["msi_identity_id"],
|
||||
"value": {
|
||||
"principal_id": self.provider_config[
|
||||
"msi_identity_principal_id"],
|
||||
"client_id": self.provider_config["msi_identity_id"]
|
||||
}
|
||||
}]
|
||||
}
|
||||
|
||||
# TODO: do we need to wait here, or is fire-and-forget fine?
|
||||
self.compute_client.virtual_machines.create_or_update(
|
||||
resource_group_name=self.provider_config["resource_group"],
|
||||
vm_name=vm_name,
|
||||
parameters=config)
|
||||
# TODO: we could get the private/public ips back directly
|
||||
self.resource_client.deployments.create_or_update(
|
||||
resource_group, "ray-vm-{}".format(name_tag),
|
||||
deployment_properties).wait()
|
||||
|
||||
@synchronized
|
||||
def set_node_tags(self, node_id, tags):
|
||||
@@ -252,34 +226,57 @@ class AzureNodeProvider(NodeProvider):
|
||||
def terminate_node(self, node_id):
|
||||
"""Terminates the specified node. This will delete the VM and
|
||||
associated resources (NIC, IP, Storage) for the specified node."""
|
||||
# self.compute_client.virtual_machines.deallocate(
|
||||
# resource_group_name=self.provider_config["resource_group"],
|
||||
# vm_name=node_id)
|
||||
|
||||
resource_group = self.provider_config["resource_group"]
|
||||
nodes = self._get_filtered_nodes(
|
||||
tag_filters={TAG_RAY_CLUSTER_NAME: self.cluster_name})
|
||||
for node, metadata in nodes.items():
|
||||
# gather disks to delete later
|
||||
vm = self.compute_client.virtual_machines.get(
|
||||
resource_group_name=resource_group, vm_name=node)
|
||||
disks = {d.name for d in vm.storage_profile.data_disks}
|
||||
disks.add(vm.storage_profile.os_disk.name)
|
||||
try:
|
||||
# get metadata for node
|
||||
metadata = self._get_node(node_id)
|
||||
except KeyError:
|
||||
# node no longer exists
|
||||
return
|
||||
|
||||
# TODO: deallocate instead of delete to allow possible reuse
|
||||
# self.compute_client.virtual_machines.deallocate(
|
||||
# resource_group_name=resource_group,
|
||||
# vm_name=node_id)
|
||||
|
||||
# gather disks to delete later
|
||||
vm = self.compute_client.virtual_machines.get(
|
||||
resource_group_name=resource_group, vm_name=node_id)
|
||||
disks = {d.name for d in vm.storage_profile.data_disks}
|
||||
disks.add(vm.storage_profile.os_disk.name)
|
||||
|
||||
try:
|
||||
# delete machine, must wait for this to complete
|
||||
self.compute_client.virtual_machines.delete(
|
||||
resource_group_name=resource_group, vm_name=node).wait()
|
||||
resource_group_name=resource_group, vm_name=node_id).wait()
|
||||
except Exception as e:
|
||||
logger.warning("Failed to delete VM: {}".format(e))
|
||||
|
||||
try:
|
||||
# delete nic
|
||||
self.network_client.network_interfaces.delete(
|
||||
resource_group_name=resource_group,
|
||||
network_interface_name=metadata["nic_name"])
|
||||
# delete ip address
|
||||
if "public_ip_name" in metadata:
|
||||
except Exception as e:
|
||||
logger.warning("Failed to delete nic: {}".format(e))
|
||||
|
||||
# delete ip address
|
||||
if "public_ip_name" in metadata:
|
||||
try:
|
||||
self.network_client.public_ip_addresses.delete(
|
||||
resource_group_name=resource_group,
|
||||
public_ip_address_name=metadata["public_ip_name"])
|
||||
# delete disks
|
||||
for disk in disks:
|
||||
except Exception as e:
|
||||
logger.warning("Failed to delete public ip: {}".format(e))
|
||||
|
||||
# delete disks
|
||||
for disk in disks:
|
||||
try:
|
||||
self.compute_client.disks.delete(
|
||||
resource_group_name=resource_group, disk_name=disk)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to delete disk: {}".format(e))
|
||||
|
||||
def _get_node(self, node_id):
|
||||
self._get_filtered_nodes({}) # Side effect: updates cache
|
||||
|
||||
@@ -42,6 +42,8 @@ optional_ray_files = []
|
||||
ray_autoscaler_files = [
|
||||
"ray/autoscaler/aws/example-full.yaml",
|
||||
"ray/autoscaler/azure/example-full.yaml",
|
||||
"ray/autoscaler/azure/azure-vm-template.json",
|
||||
"ray/autoscaler/azure/azure-config-template.json",
|
||||
"ray/autoscaler/gcp/example-full.yaml",
|
||||
"ray/autoscaler/local/example-full.yaml",
|
||||
"ray/autoscaler/kubernetes/example-full.yaml",
|
||||
|
||||
Reference in New Issue
Block a user