mirror of
https://github.com/wassname/ray.git
synced 2026-07-05 07:56:05 +08:00
[autoscaler] Azure deployment fixes (#11613)
Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
This commit is contained in:
@@ -1777,6 +1777,7 @@ filegroup(
|
||||
"python/ray/*.py",
|
||||
"python/ray/autoscaler/*.py",
|
||||
"python/ray/autoscaler/_private/*.py",
|
||||
"python/ray/autoscaler/_private/azure/*.json",
|
||||
"python/ray/autoscaler/aws/defaults.yaml",
|
||||
"python/ray/autoscaler/azure/defaults.yaml",
|
||||
"python/ray/autoscaler/gcp/defaults.yaml",
|
||||
|
||||
@@ -13,17 +13,21 @@ sudo -u "$USERNAME" -i /bin/bash -l -c "conda activate $CONDA_ENV; pip install $
|
||||
echo "Setting up service scripts..."
|
||||
cat > /home/"$USERNAME"/ray-head.sh << EOM
|
||||
#!/bin/bash
|
||||
|
||||
eval "$(conda shell.bash hook)"
|
||||
conda activate $CONDA_ENV
|
||||
|
||||
NUM_GPUS=\`nvidia-smi -L | wc -l\`
|
||||
|
||||
ray stop
|
||||
ulimit -n 65536
|
||||
ray start --head -port=6379 --object-manager-port=8076 --num-gpus=\$NUM_GPUS --block --webui-host 0.0.0.0
|
||||
ray start --head --port=6379 --object-manager-port=8076 --num-gpus=\$NUM_GPUS --block --dashboard-host 0.0.0.0
|
||||
EOM
|
||||
|
||||
cat > /home/"$USERNAME"/ray-worker.sh << EOM
|
||||
#!/bin/bash
|
||||
|
||||
eval "$(conda shell.bash hook)"
|
||||
conda activate $CONDA_ENV
|
||||
|
||||
NUM_GPUS=\`nvidia-smi -L | wc -l\`
|
||||
@@ -42,6 +46,7 @@ EOM
|
||||
cat > /home/"$USERNAME"/tensorboard.sh << EOM
|
||||
#!/bin/bash
|
||||
|
||||
eval "$(conda shell.bash hook)"
|
||||
conda activate $CONDA_ENV
|
||||
mkdir -p /home/$USERNAME/ray_results
|
||||
|
||||
|
||||
@@ -55,6 +55,7 @@
|
||||
"type": "int",
|
||||
"defaultValue": 1,
|
||||
"minValue": 0,
|
||||
"maxValue": 1000,
|
||||
"metadata": {
|
||||
"description": "Initial number of worker nodes"
|
||||
}
|
||||
@@ -63,6 +64,7 @@
|
||||
"type": "int",
|
||||
"defaultValue": 1,
|
||||
"minValue": 0,
|
||||
"maxValue": 1000,
|
||||
"metadata": {
|
||||
"description": "Minimum number of worker nodes"
|
||||
}
|
||||
@@ -71,6 +73,7 @@
|
||||
"type": "int",
|
||||
"defaultValue": 1,
|
||||
"minValue": 0,
|
||||
"maxValue": 1000,
|
||||
"metadata": {
|
||||
"description": "Maximum number of worker nodes"
|
||||
}
|
||||
@@ -107,30 +110,35 @@
|
||||
},
|
||||
"variables": {
|
||||
"azureScriptInitUrl": "https://raw.githubusercontent.com/ray-project/ray/master/doc/azure/azure-init.sh",
|
||||
"location": "[resourceGroup().location]",
|
||||
"vmName": "ray-node",
|
||||
"subnetWorkers": "10.32.0.0/16",
|
||||
"subnetHead": "10.33.0.0/16",
|
||||
"publicIpAddressName": "[concat(variables('vmName'), '-ip' )]",
|
||||
"networkIpConfig": "[guid(resourceGroup().id, variables('vmName'))]",
|
||||
"subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]",
|
||||
"subnetHeadRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet-head')]",
|
||||
"subnetName": "ray-subnet",
|
||||
"subnetHeadName": "ray-subnet-head",
|
||||
"subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', variables('vNetName'), variables('subnetName'))]",
|
||||
"subnetHeadRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', variables('vNetName'), variables('subnetHeadName'))]",
|
||||
"osDiskType": "Standard_LRS",
|
||||
"vmNameHead": "[concat(variables('vmName'), '-head')]",
|
||||
"vmNameWorker": "[concat(variables('vmName'), '-workers')]",
|
||||
"networkInterfaceName": "[concat(variables('vmName'), '-nic')]",
|
||||
"networkSecurityGroupName": "ray-nsg",
|
||||
"vNetName": "ray-vnet",
|
||||
"subnetNetwork": "[split(variables('subnetHead'), '/')[0]]",
|
||||
"headInternalIP": "[concat(substring(variables('subnetNetwork'), 0, lastIndexOf(variables('subnetNetwork'), '.')), '.5')]",
|
||||
"imagePublisher": "microsoft-dsvm",
|
||||
"imageOffer": "ubuntu-1804",
|
||||
"imageSku": "1804",
|
||||
"imageVersion": "latest"
|
||||
"imageVersion": "20.07.06"
|
||||
},
|
||||
"resources": [
|
||||
{
|
||||
"type": "Microsoft.Network/networkSecurityGroups",
|
||||
"apiVersion": "2019-02-01",
|
||||
"name": "ray-nsg",
|
||||
"location": "[resourceGroup().location]",
|
||||
"name": "[variables('networkSecurityGroupName')]",
|
||||
"location": "[variables('location')]",
|
||||
"properties": {
|
||||
"securityRules": [
|
||||
{
|
||||
@@ -191,8 +199,8 @@
|
||||
{
|
||||
"type": "Microsoft.Network/virtualNetworks",
|
||||
"apiVersion": "2019-11-01",
|
||||
"name": "ray-vnet",
|
||||
"location": "[resourceGroup().location]",
|
||||
"name": "[variables('vNetName')]",
|
||||
"location": "[variables('location')]",
|
||||
"properties": {
|
||||
"addressSpace": {
|
||||
"addressPrefixes": [
|
||||
@@ -202,13 +210,13 @@
|
||||
},
|
||||
"subnets": [
|
||||
{
|
||||
"name": "ray-subnet",
|
||||
"name": "[variables('subnetName')]",
|
||||
"properties": {
|
||||
"addressPrefix": "[variables('subnetWorkers')]"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "ray-subnet-head",
|
||||
"name": "[variables('subnetHeadName')]",
|
||||
"properties": {
|
||||
"addressPrefix": "[variables('subnetHead')]"
|
||||
}
|
||||
@@ -220,7 +228,7 @@
|
||||
"type": "Microsoft.Network/publicIpAddresses",
|
||||
"apiVersion": "2019-02-01",
|
||||
"name": "[variables('publicIpAddressName')]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"properties": {
|
||||
"publicIpAllocationMethod": "Static",
|
||||
"publicIPAddressVersion": "IPv4"
|
||||
@@ -232,12 +240,12 @@
|
||||
},
|
||||
{
|
||||
"type": "Microsoft.Network/networkInterfaces",
|
||||
"apiVersion": "2018-10-01",
|
||||
"apiVersion": "2020-06-01",
|
||||
"name": "[variables('networkInterfaceName')]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"dependsOn": [
|
||||
"[resourceId('Microsoft.Network/publicIpAddresses/', variables('publicIpAddressName'))]",
|
||||
"[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]"
|
||||
"[resourceId('Microsoft.Network/publicIpAddresses', variables('publicIpAddressName'))]",
|
||||
"[resourceId('Microsoft.Network/networkSecurityGroups', variables('networkSecurityGroupName'))]"
|
||||
],
|
||||
"properties": {
|
||||
"ipConfigurations": [
|
||||
@@ -256,17 +264,17 @@
|
||||
}
|
||||
],
|
||||
"networkSecurityGroup": {
|
||||
"id": "[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]"
|
||||
"id": "[resourceId('Microsoft.Network/networkSecurityGroups', variables('networkSecurityGroupName'))]"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Microsoft.Compute/virtualMachines",
|
||||
"apiVersion": "2019-07-01",
|
||||
"apiVersion": "2020-06-01",
|
||||
"name": "[variables('vmNameHead')]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"dependsOn": [
|
||||
"[resourceId('Microsoft.Network/networkInterfaces/', variables('networkInterfaceName'))]"
|
||||
"[resourceId('Microsoft.Network/networkInterfaces', variables('networkInterfaceName'))]"
|
||||
],
|
||||
"properties": {
|
||||
"hardwareProfile": {
|
||||
@@ -315,10 +323,10 @@
|
||||
{
|
||||
"type": "Microsoft.Compute/virtualMachines/extensions",
|
||||
"name": "[concat(variables('vmNameHead'), '/HeadNodeInitScript')]",
|
||||
"apiVersion": "2017-03-30",
|
||||
"location": "[resourceGroup().location]",
|
||||
"apiVersion": "2020-06-01",
|
||||
"location": "[variables('location')]",
|
||||
"dependsOn": [
|
||||
"[concat('Microsoft.Compute/virtualMachines/', variables('vmNameHead'))]"
|
||||
"[resourceId('Microsoft.Compute/virtualMachines', variables('vmNameHead'))]"
|
||||
],
|
||||
"properties": {
|
||||
"publisher": "Microsoft.Azure.Extensions",
|
||||
@@ -338,10 +346,10 @@
|
||||
{
|
||||
"type": "Microsoft.Compute/virtualMachineScaleSets",
|
||||
"name": "[variables('vmNameWorker')]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"apiVersion": "2019-07-01",
|
||||
"dependsOn": [
|
||||
"Microsoft.Network/virtualNetworks/ray-vnet"
|
||||
"[resourceId('Microsoft.Network/virtualNetworks', variables('vNetName'))]"
|
||||
],
|
||||
"sku": {
|
||||
"name": "[parameters('workerNodeSize')]",
|
||||
@@ -430,13 +438,13 @@
|
||||
"type": "Microsoft.Insights/autoscaleSettings",
|
||||
"apiVersion": "2015-04-01",
|
||||
"name": "cpuautoscale",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"dependsOn": [
|
||||
"[concat('Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]"
|
||||
"[resourceId('Microsoft.Compute/virtualMachineScaleSets', variables('vmNameWorker'))]"
|
||||
],
|
||||
"properties": {
|
||||
"name": "cpuautoscale",
|
||||
"targetResourceUri": "[concat('/subscriptions/',subscription().subscriptionId, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
|
||||
"targetResourceUri": "[concat(resourceGroup().id, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
|
||||
"enabled": true,
|
||||
"profiles": [
|
||||
{
|
||||
@@ -450,8 +458,7 @@
|
||||
{
|
||||
"metricTrigger": {
|
||||
"metricName": "Percentage CPU",
|
||||
"metricNamespace": "",
|
||||
"metricResourceUri": "[concat('/subscriptions/',subscription().subscriptionId, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
|
||||
"metricResourceUri": "[concat(resourceGroup().id, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
|
||||
"timeGrain": "PT1M",
|
||||
"statistic": "Average",
|
||||
"timeWindow": "PT10M",
|
||||
@@ -469,8 +476,7 @@
|
||||
{
|
||||
"metricTrigger": {
|
||||
"metricName": "Percentage CPU",
|
||||
"metricNamespace": "",
|
||||
"metricResourceUri": "[concat('/subscriptions/',subscription().subscriptionId, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
|
||||
"metricResourceUri": "[concat(resourceGroup().id, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
|
||||
"timeGrain": "PT1M",
|
||||
"statistic": "Average",
|
||||
"timeWindow": "PT30M",
|
||||
|
||||
@@ -48,7 +48,7 @@ AWS/GCP/Azure
|
||||
See :ref:`aws-cluster` for recipes on customizing AWS clusters.
|
||||
.. group-tab:: Azure
|
||||
|
||||
First, install the Azure CLI (``pip install azure-cli azure-core``) then login using (``az login``).
|
||||
First, install the Azure CLI (``pip install azure-cli``), then log in with ``az login``.
|
||||
|
||||
Set the subscription to use from the command line (``az account set -s <subscription_id>``) or by modifying the provider section of the provided config, e.g. ``ray/python/ray/autoscaler/azure/example-full.yaml``.
|
||||
|
||||
@@ -65,10 +65,7 @@ AWS/GCP/Azure
|
||||
# Get a remote screen on the head node.
|
||||
$ ray attach ray/python/ray/autoscaler/azure/example-full.yaml
|
||||
# test ray setup
|
||||
# enable conda environment
|
||||
$ exec bash -l
|
||||
$ conda activate py37_tensorflow
|
||||
$ python -c 'import ray; ray.init()'
|
||||
$ python -c 'import ray; ray.init(address="auto")'
|
||||
$ exit
|
||||
# Tear down the cluster.
|
||||
$ ray down ray/python/ray/autoscaler/azure/example-full.yaml
|
||||
@@ -83,8 +80,8 @@ AWS/GCP/Azure
|
||||
:target: https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fray-project%2Fray%2Fmaster%2Fdoc%2Fazure%2Fazure-ray-template.json
|
||||
:alt: Deploy to Azure
|
||||
|
||||
Once the template is successfully deployed the deployment output page provides the ssh command to connect and the link to the JupyterHub on the head node (username/password as specified on the template input).
|
||||
Use the following code in a Jupyter notebook to connect to the Ray cluster.
|
||||
Once the template is successfully deployed the deployment Outputs page provides the ssh command to connect and the link to the JupyterHub on the head node (username/password as specified on the template input).
|
||||
Use the following code in a Jupyter notebook (using the conda environment specified in the template input, py37_tensorflow by default) to connect to the Ray cluster.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
||||
@@ -10,18 +10,19 @@
|
||||
}
|
||||
},
|
||||
"variables": {
|
||||
"Contributor": "[concat('/subscriptions/', subscription().subscriptionId, '/providers/Microsoft.Authorization/roleDefinitions/', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]"
|
||||
"Contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
|
||||
"location": "[resourceGroup().location]"
|
||||
},
|
||||
"resources": [
|
||||
{
|
||||
"type": "Microsoft.ManagedIdentity/userAssignedIdentities",
|
||||
"apiVersion": "2018-11-30",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"name": "ray-msi-user-identity"
|
||||
},
|
||||
{
|
||||
"type": "Microsoft.Authorization/roleAssignments",
|
||||
"apiVersion": "2018-09-01-preview",
|
||||
"apiVersion": "2020-04-01-preview",
|
||||
"name": "[guid(resourceGroup().id)]",
|
||||
"properties": {
|
||||
"principalId": "[reference('ray-msi-user-identity').principalId]",
|
||||
@@ -37,7 +38,7 @@
|
||||
"type": "Microsoft.Network/networkSecurityGroups",
|
||||
"apiVersion": "2019-02-01",
|
||||
"name": "ray-nsg",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"properties": {
|
||||
"securityRules": [
|
||||
{
|
||||
@@ -60,7 +61,7 @@
|
||||
"type": "Microsoft.Network/virtualNetworks",
|
||||
"apiVersion": "2019-11-01",
|
||||
"name": "ray-vnet",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"properties": {
|
||||
"addressSpace": {
|
||||
"addressPrefixes": [
|
||||
|
||||
@@ -85,20 +85,21 @@
|
||||
}
|
||||
},
|
||||
"variables": {
|
||||
"publicIpAddressName": "[concat(parameters('vmName'), '-ip' )]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"networkInterfaceNamePrivate": "[concat(parameters('vmName'),'-nic')]",
|
||||
"networkInterfaceNamePublic": "[concat(parameters('vmName'),'-nic-public')]",
|
||||
"networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]",
|
||||
"networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]",
|
||||
"subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]",
|
||||
"osDiskType": "Standard_LRS"
|
||||
"osDiskType": "Standard_LRS",
|
||||
"publicIpAddressName": "[concat(parameters('vmName'), '-ip' )]",
|
||||
"subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]"
|
||||
},
|
||||
"resources": [
|
||||
{
|
||||
"type": "Microsoft.Network/networkInterfaces",
|
||||
"apiVersion": "2018-10-01",
|
||||
"apiVersion": "2020-06-01",
|
||||
"name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"dependsOn": [
|
||||
"[resourceId('Microsoft.Network/publicIpAddresses/', concat(variables('publicIpAddressName'), copyIndex()))]"
|
||||
],
|
||||
@@ -129,9 +130,9 @@
|
||||
},
|
||||
{
|
||||
"type": "Microsoft.Network/networkInterfaces",
|
||||
"apiVersion": "2018-10-01",
|
||||
"apiVersion": "2020-06-01",
|
||||
"name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"copy": {
|
||||
"name": "NICPrivateCopy",
|
||||
"count": "[parameters('vmCount')]"
|
||||
@@ -158,7 +159,7 @@
|
||||
"type": "Microsoft.Network/publicIpAddresses",
|
||||
"apiVersion": "2019-02-01",
|
||||
"name": "[concat(variables('publicIpAddressName'), copyIndex())]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"properties": {
|
||||
"publicIpAllocationMethod": "Static",
|
||||
"publicIPAddressVersion": "IPv4"
|
||||
@@ -177,7 +178,7 @@
|
||||
"type": "Microsoft.Compute/virtualMachines",
|
||||
"apiVersion": "2019-03-01",
|
||||
"name": "[concat(parameters('vmName'), copyIndex())]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"location": "[variables('location')]",
|
||||
"dependsOn": [
|
||||
"[resourceId('Microsoft.Network/networkInterfaces/', concat(variables('networkInterfaceName'), copyIndex()))]"
|
||||
],
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import random
|
||||
import os
|
||||
|
||||
from azure.common.client_factory import get_client_from_cli_profile
|
||||
from azure.mgmt.resource import ResourceManagementClient
|
||||
@@ -55,8 +55,8 @@ def _configure_resource_group(config):
|
||||
resource_group_name=resource_group, parameters=params)
|
||||
|
||||
# load the template file
|
||||
current_path = os.path.dirname(os.path.abspath(__file__))
|
||||
template_path = os.path.join(current_path, "azure-config-template.json")
|
||||
current_path = Path(__file__).parent
|
||||
template_path = current_path.joinpath("azure-config-template.json")
|
||||
with open(template_path, "r") as template_fp:
|
||||
template = json.load(template_fp)
|
||||
|
||||
@@ -86,16 +86,17 @@ def _configure_resource_group(config):
|
||||
|
||||
def _configure_key_pair(config):
|
||||
ssh_user = config["auth"]["ssh_user"]
|
||||
public_key = None
|
||||
# search if the keys exist
|
||||
for key_type in ["ssh_private_key", "ssh_public_key"]:
|
||||
try:
|
||||
key_path = os.path.expanduser(config["auth"][key_type])
|
||||
key_path = Path(config["auth"][key_type]).expanduser()
|
||||
except KeyError:
|
||||
raise Exception("Config must define {}".format(key_type))
|
||||
except TypeError:
|
||||
raise Exception("Invalid config value for {}".format(key_type))
|
||||
|
||||
assert os.path.exists(key_path), (
|
||||
assert key_path.is_file(), (
|
||||
"Could not find ssh key: {}".format(key_path))
|
||||
|
||||
if key_type == "ssh_public_key":
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from threading import RLock
|
||||
from uuid import uuid4
|
||||
|
||||
@@ -178,8 +178,8 @@ class AzureNodeProvider(NodeProvider):
|
||||
resource_group = self.provider_config["resource_group"]
|
||||
|
||||
# load the template file
|
||||
current_path = os.path.dirname(os.path.abspath(__file__))
|
||||
template_path = os.path.join(current_path, "azure-vm-template.json")
|
||||
current_path = Path(__file__).parent
|
||||
template_path = current_path.joinpath("azure-vm-template.json")
|
||||
with open(template_path, "r") as template_fp:
|
||||
template = json.load(template_fp)
|
||||
|
||||
|
||||
@@ -49,6 +49,7 @@ auth:
|
||||
# you must specify paths to matching private and public key pair files
|
||||
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
|
||||
ssh_private_key: ~/.ssh/id_rsa
|
||||
# changes to this should match what is specified in file_mounts
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
|
||||
@@ -64,7 +65,7 @@ head_node:
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.02.01
|
||||
imageVersion: 20.07.06
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type.
|
||||
worker_nodes:
|
||||
@@ -74,7 +75,7 @@ worker_nodes:
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.02.01
|
||||
imageVersion: 20.07.06
|
||||
# optionally set priority to use Spot instances
|
||||
priority: Spot
|
||||
# set a maximum price for spot instances if desired
|
||||
@@ -86,6 +87,7 @@ worker_nodes:
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
|
||||
}
|
||||
|
||||
# Files or directories to copy from the head node to the worker nodes. The format is a
|
||||
@@ -118,6 +120,7 @@ setup_commands:
|
||||
# Note: if you're developing Ray, you probably want to create a VM image that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
|
||||
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
|
||||
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
|
||||
|
||||
@@ -62,6 +62,7 @@ auth:
|
||||
# you must specify paths to matching private and public key pair files
|
||||
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
|
||||
ssh_private_key: ~/.ssh/id_rsa
|
||||
# changes to this should match what is specified in file_mounts
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
|
||||
@@ -77,7 +78,7 @@ head_node:
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.02.01
|
||||
imageVersion: 20.07.06
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type.
|
||||
worker_nodes:
|
||||
@@ -87,7 +88,7 @@ worker_nodes:
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.02.01
|
||||
imageVersion: 20.07.06
|
||||
# optionally set priority to use Spot instances
|
||||
priority: Spot
|
||||
# set a maximum price for spot instances if desired
|
||||
@@ -99,6 +100,7 @@ worker_nodes:
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
|
||||
}
|
||||
|
||||
# Files or directories to copy from the head node to the worker nodes. The format is a
|
||||
@@ -130,12 +132,14 @@ initialization_commands:
|
||||
- touch ~/.sudo_as_admin_successful
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands: []
|
||||
setup_commands:
|
||||
# Note: if you're developing Ray, you probably want to create a Docker image that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
|
||||
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
|
||||
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
|
||||
@@ -52,6 +52,7 @@ auth:
|
||||
# you must specify paths to matching private and public key pair files
|
||||
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
|
||||
ssh_private_key: ~/.ssh/id_rsa
|
||||
# changes to this should match what is specified in file_mounts
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
@@ -71,6 +72,7 @@ worker_nodes:
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
|
||||
}
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
|
||||
@@ -60,6 +60,7 @@ auth:
|
||||
# you must specify paths to matching private and public key pair files
|
||||
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
|
||||
ssh_private_key: ~/.ssh/id_rsa
|
||||
# changes to this should match what is specified in file_mounts
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
@@ -71,7 +72,7 @@ head_node:
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: "1804"
|
||||
imageVersion: 20.02.01
|
||||
imageVersion: 20.07.06
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields using defaults.yaml
|
||||
@@ -82,13 +83,14 @@ worker_nodes:
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: "1804"
|
||||
imageVersion: 20.02.01
|
||||
imageVersion: 20.07.06
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
|
||||
}
|
||||
|
||||
# List of commands that will be run before `setup_commands`. If docker is
|
||||
@@ -103,6 +105,7 @@ setup_commands:
|
||||
# Note: if you're developing Ray, you probably want to create a VM image that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
|
||||
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
|
||||
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
|
||||
|
||||
@@ -17,4 +17,5 @@ auth:
|
||||
# you must specify paths to matching private and public key pair files
|
||||
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
|
||||
ssh_private_key: ~/.ssh/id_rsa
|
||||
# changes to this should match what is specified in file_mounts
|
||||
ssh_public_key: ~/.ssh/id_rsa.pub
|
||||
|
||||
+15
-9
@@ -71,19 +71,23 @@ generated_python_directories = [
|
||||
optional_ray_files = ["ray/nightly-wheels.yaml"]
|
||||
|
||||
ray_autoscaler_files = [
|
||||
"ray/autoscaler/aws/defaults.yaml", "ray/autoscaler/azure/defaults.yaml",
|
||||
"ray/autoscaler/azure/azure-vm-template.json",
|
||||
"ray/autoscaler/azure/azure-config-template.json",
|
||||
"ray/autoscaler/gcp/defaults.yaml", "ray/autoscaler/local/defaults.yaml",
|
||||
"ray/autoscaler/aws/defaults.yaml",
|
||||
"ray/autoscaler/azure/defaults.yaml",
|
||||
"ray/autoscaler/_private/azure/azure-vm-template.json",
|
||||
"ray/autoscaler/_private/azure/azure-config-template.json",
|
||||
"ray/autoscaler/gcp/defaults.yaml",
|
||||
"ray/autoscaler/local/defaults.yaml",
|
||||
"ray/autoscaler/kubernetes/defaults.yaml",
|
||||
"ray/autoscaler/kubernetes/kubectl-rsync.sh",
|
||||
"ray/autoscaler/staroid/defaults.yaml", "ray/autoscaler/ray-schema.json"
|
||||
"ray/autoscaler/staroid/defaults.yaml",
|
||||
"ray/autoscaler/ray-schema.json",
|
||||
]
|
||||
|
||||
ray_project_files = [
|
||||
"ray/projects/schema.json", "ray/projects/templates/cluster_template.yaml",
|
||||
"ray/projects/schema.json",
|
||||
"ray/projects/templates/cluster_template.yaml",
|
||||
"ray/projects/templates/project_template.yaml",
|
||||
"ray/projects/templates/requirements.txt"
|
||||
"ray/projects/templates/requirements.txt",
|
||||
]
|
||||
|
||||
ray_dashboard_files = [
|
||||
@@ -105,8 +109,10 @@ extras = {
|
||||
"dataclasses; python_version < '3.7'"
|
||||
],
|
||||
"tune": [
|
||||
"tabulate", "tensorboardX", "pandas",
|
||||
"dataclasses; python_version < '3.7'"
|
||||
"dataclasses; python_version < '3.7'",
|
||||
"pandas",
|
||||
"tabulate",
|
||||
"tensorboardX",
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user