mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 09:12:56 +08:00
[autoscaler] Azure deployment fixes (#11613)
Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
This commit is contained in:
@@ -10,18 +10,19 @@
|
||||
}
|
||||
},
|
||||
"variables": {
|
||||
"Contributor": "[concat('/subscriptions/', subscription().subscriptionId, '/providers/Microsoft.Authorization/roleDefinitions/', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]"
|
||||
"Contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
|
||||
"location": "[resourceGroup().location]"
|
||||
},
|
||||
"resources": [
|
||||
{
|
||||
"type": "Microsoft.ManagedIdentity/userAssignedIdentities",
|
||||
"apiVersion": "2018-11-30",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"name": "ray-msi-user-identity"
|
||||
},
|
||||
{
|
||||
"type": "Microsoft.Authorization/roleAssignments",
|
||||
"apiVersion": "2018-09-01-preview",
|
||||
"apiVersion": "2020-04-01-preview",
|
||||
"name": "[guid(resourceGroup().id)]",
|
||||
"properties": {
|
||||
"principalId": "[reference('ray-msi-user-identity').principalId]",
|
||||
@@ -37,7 +38,7 @@
|
||||
"type": "Microsoft.Network/networkSecurityGroups",
|
||||
"apiVersion": "2019-02-01",
|
||||
"name": "ray-nsg",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"properties": {
|
||||
"securityRules": [
|
||||
{
|
||||
@@ -60,7 +61,7 @@
|
||||
"type": "Microsoft.Network/virtualNetworks",
|
||||
"apiVersion": "2019-11-01",
|
||||
"name": "ray-vnet",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"properties": {
|
||||
"addressSpace": {
|
||||
"addressPrefixes": [
|
||||
|
||||
@@ -85,20 +85,21 @@
|
||||
}
|
||||
},
|
||||
"variables": {
|
||||
"publicIpAddressName": "[concat(parameters('vmName'), '-ip' )]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"networkInterfaceNamePrivate": "[concat(parameters('vmName'),'-nic')]",
|
||||
"networkInterfaceNamePublic": "[concat(parameters('vmName'),'-nic-public')]",
|
||||
"networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]",
|
||||
"networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]",
|
||||
"subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]",
|
||||
"osDiskType": "Standard_LRS"
|
||||
"osDiskType": "Standard_LRS",
|
||||
"publicIpAddressName": "[concat(parameters('vmName'), '-ip' )]",
|
||||
"subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]"
|
||||
},
|
||||
"resources": [
|
||||
{
|
||||
"type": "Microsoft.Network/networkInterfaces",
|
||||
"apiVersion": "2018-10-01",
|
||||
"apiVersion": "2020-06-01",
|
||||
"name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"dependsOn": [
|
||||
"[resourceId('Microsoft.Network/publicIpAddresses/', concat(variables('publicIpAddressName'), copyIndex()))]"
|
||||
],
|
||||
@@ -129,9 +130,9 @@
|
||||
},
|
||||
{
|
||||
"type": "Microsoft.Network/networkInterfaces",
|
||||
"apiVersion": "2018-10-01",
|
||||
"apiVersion": "2020-06-01",
|
||||
"name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"copy": {
|
||||
"name": "NICPrivateCopy",
|
||||
"count": "[parameters('vmCount')]"
|
||||
@@ -158,7 +159,7 @@
|
||||
"type": "Microsoft.Network/publicIpAddresses",
|
||||
"apiVersion": "2019-02-01",
|
||||
"name": "[concat(variables('publicIpAddressName'), copyIndex())]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"properties": {
|
||||
"publicIpAllocationMethod": "Static",
|
||||
"publicIPAddressVersion": "IPv4"
|
||||
@@ -177,7 +178,7 @@
|
||||
"type": "Microsoft.Compute/virtualMachines",
|
||||
"apiVersion": "2019-03-01",
|
||||
"name": "[concat(parameters('vmName'), copyIndex())]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"dependsOn": [
|
||||
"[resourceId('Microsoft.Network/networkInterfaces/', concat(variables('networkInterfaceName'), copyIndex()))]"
|
||||
],
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import random
|
||||
import os
|
||||
|
||||
from azure.common.client_factory import get_client_from_cli_profile
|
||||
from azure.mgmt.resource import ResourceManagementClient
|
||||
@@ -55,8 +55,8 @@ def _configure_resource_group(config):
|
||||
resource_group_name=resource_group, parameters=params)
|
||||
|
||||
# load the template file
|
||||
current_path = os.path.dirname(os.path.abspath(__file__))
|
||||
template_path = os.path.join(current_path, "azure-config-template.json")
|
||||
current_path = Path(__file__).parent
|
||||
template_path = current_path.joinpath("azure-config-template.json")
|
||||
with open(template_path, "r") as template_fp:
|
||||
template = json.load(template_fp)
|
||||
|
||||
@@ -86,16 +86,17 @@ def _configure_resource_group(config):
|
||||
|
||||
def _configure_key_pair(config):
|
||||
ssh_user = config["auth"]["ssh_user"]
|
||||
public_key = None
|
||||
# search if the keys exist
|
||||
for key_type in ["ssh_private_key", "ssh_public_key"]:
|
||||
try:
|
||||
key_path = os.path.expanduser(config["auth"][key_type])
|
||||
key_path = Path(config["auth"][key_type]).expanduser()
|
||||
except KeyError:
|
||||
raise Exception("Config must define {}".format(key_type))
|
||||
except TypeError:
|
||||
raise Exception("Invalid config value for {}".format(key_type))
|
||||
|
||||
assert os.path.exists(key_path), (
|
||||
assert key_path.is_file(), (
|
||||
"Could not find ssh key: {}".format(key_path))
|
||||
|
||||
if key_type == "ssh_public_key":
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from threading import RLock
|
||||
from uuid import uuid4
|
||||
|
||||
@@ -178,8 +178,8 @@ class AzureNodeProvider(NodeProvider):
|
||||
resource_group = self.provider_config["resource_group"]
|
||||
|
||||
# load the template file
|
||||
current_path = os.path.dirname(os.path.abspath(__file__))
|
||||
template_path = os.path.join(current_path, "azure-vm-template.json")
|
||||
current_path = Path(__file__).parent
|
||||
template_path = current_path.joinpath("azure-vm-template.json")
|
||||
with open(template_path, "r") as template_fp:
|
||||
template = json.load(template_fp)
|
||||
|
||||
|
||||
@@ -49,6 +49,7 @@ auth:
|
||||
# you must specify paths to matching private and public key pair files
|
||||
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
|
||||
ssh_private_key: ~/.ssh/id_rsa
|
||||
# If you change this path, update the matching public key entry in file_mounts as well
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
|
||||
@@ -64,7 +65,7 @@ head_node:
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.02.01
|
||||
imageVersion: 20.07.06
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type.
|
||||
worker_nodes:
|
||||
@@ -74,7 +75,7 @@ worker_nodes:
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.02.01
|
||||
imageVersion: 20.07.06
|
||||
# optionally set priority to use Spot instances
|
||||
priority: Spot
|
||||
# set a maximum price for spot instances if desired
|
||||
@@ -86,6 +87,7 @@ worker_nodes:
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
|
||||
}
|
||||
|
||||
# Files or directories to copy from the head node to the worker nodes. The format is a
|
||||
@@ -118,6 +120,7 @@ setup_commands:
|
||||
# Note: if you're developing Ray, you probably want to create an Azure VM image that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
|
||||
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
|
||||
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
|
||||
|
||||
@@ -62,6 +62,7 @@ auth:
|
||||
# you must specify paths to matching private and public key pair files
|
||||
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
|
||||
ssh_private_key: ~/.ssh/id_rsa
|
||||
# If you change this path, update the matching public key entry in file_mounts as well
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
|
||||
@@ -77,7 +78,7 @@ head_node:
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.02.01
|
||||
imageVersion: 20.07.06
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type.
|
||||
worker_nodes:
|
||||
@@ -87,7 +88,7 @@ worker_nodes:
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.02.01
|
||||
imageVersion: 20.07.06
|
||||
# optionally set priority to use Spot instances
|
||||
priority: Spot
|
||||
# set a maximum price for spot instances if desired
|
||||
@@ -99,6 +100,7 @@ worker_nodes:
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
|
||||
}
|
||||
|
||||
# Files or directories to copy from the head node to the worker nodes. The format is a
|
||||
@@ -130,12 +132,14 @@ initialization_commands:
|
||||
- touch ~/.sudo_as_admin_successful
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands: []
|
||||
setup_commands:
|
||||
# Note: if you're developing Ray, you probably want to create a Docker image that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# Uncomment the following line if you want to run the nightly version of Ray (as opposed to the latest release)
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
|
||||
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
|
||||
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
|
||||
@@ -52,6 +52,7 @@ auth:
|
||||
# you must specify paths to matching private and public key pair files
|
||||
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
|
||||
ssh_private_key: ~/.ssh/id_rsa
|
||||
# If you change this path, update the matching public key entry in file_mounts as well
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
@@ -71,6 +72,7 @@ worker_nodes:
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
|
||||
}
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
|
||||
@@ -60,6 +60,7 @@ auth:
|
||||
# you must specify paths to matching private and public key pair files
|
||||
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
|
||||
ssh_private_key: ~/.ssh/id_rsa
|
||||
# If you change this path, update the matching public key entry in file_mounts as well
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
@@ -71,7 +72,7 @@ head_node:
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: "1804"
|
||||
imageVersion: 20.02.01
|
||||
imageVersion: 20.07.06
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields using defaults.yaml
|
||||
@@ -82,13 +83,14 @@ worker_nodes:
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: "1804"
|
||||
imageVersion: 20.02.01
|
||||
imageVersion: 20.07.06
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
|
||||
}
|
||||
|
||||
# List of commands that will be run before `setup_commands`. If docker is
|
||||
@@ -103,6 +105,7 @@ setup_commands:
|
||||
# Note: if you're developing Ray, you probably want to create an Azure VM image that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
|
||||
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
|
||||
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
|
||||
|
||||
@@ -17,4 +17,5 @@ auth:
|
||||
# you must specify paths to matching private and public key pair files
|
||||
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
|
||||
ssh_private_key: ~/.ssh/id_rsa
|
||||
# If you change this path, update the matching public key entry in file_mounts as well
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
+15
-9
@@ -71,19 +71,23 @@ generated_python_directories = [
|
||||
optional_ray_files = ["ray/nightly-wheels.yaml"]
|
||||
|
||||
ray_autoscaler_files = [
|
||||
"ray/autoscaler/aws/defaults.yaml", "ray/autoscaler/azure/defaults.yaml",
|
||||
"ray/autoscaler/azure/azure-vm-template.json",
|
||||
"ray/autoscaler/azure/azure-config-template.json",
|
||||
"ray/autoscaler/gcp/defaults.yaml", "ray/autoscaler/local/defaults.yaml",
|
||||
"ray/autoscaler/aws/defaults.yaml",
|
||||
"ray/autoscaler/azure/defaults.yaml",
|
||||
"ray/autoscaler/_private/azure/azure-vm-template.json",
|
||||
"ray/autoscaler/_private/azure/azure-config-template.json",
|
||||
"ray/autoscaler/gcp/defaults.yaml",
|
||||
"ray/autoscaler/local/defaults.yaml",
|
||||
"ray/autoscaler/kubernetes/defaults.yaml",
|
||||
"ray/autoscaler/kubernetes/kubectl-rsync.sh",
|
||||
"ray/autoscaler/staroid/defaults.yaml", "ray/autoscaler/ray-schema.json"
|
||||
"ray/autoscaler/staroid/defaults.yaml",
|
||||
"ray/autoscaler/ray-schema.json",
|
||||
]
|
||||
|
||||
ray_project_files = [
|
||||
"ray/projects/schema.json", "ray/projects/templates/cluster_template.yaml",
|
||||
"ray/projects/schema.json",
|
||||
"ray/projects/templates/cluster_template.yaml",
|
||||
"ray/projects/templates/project_template.yaml",
|
||||
"ray/projects/templates/requirements.txt"
|
||||
"ray/projects/templates/requirements.txt",
|
||||
]
|
||||
|
||||
ray_dashboard_files = [
|
||||
@@ -105,8 +109,10 @@ extras = {
|
||||
"dataclasses; python_version < '3.7'"
|
||||
],
|
||||
"tune": [
|
||||
"tabulate", "tensorboardX", "pandas",
|
||||
"dataclasses; python_version < '3.7'"
|
||||
"dataclasses; python_version < '3.7'",
|
||||
"pandas",
|
||||
"tabulate",
|
||||
"tensorboardX",
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user