[autoscaler] Azure deployment fixes (#11613)

Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
This commit is contained in:
Scott Graham
2020-10-27 18:27:18 -04:00
committed by GitHub
parent 293483ed0b
commit c4ae94d60b
14 changed files with 108 additions and 77 deletions
+1
View File
@@ -1777,6 +1777,7 @@ filegroup(
"python/ray/*.py",
"python/ray/autoscaler/*.py",
"python/ray/autoscaler/_private/*.py",
"python/ray/autoscaler/_private/azure/*.json",
"python/ray/autoscaler/aws/defaults.yaml",
"python/ray/autoscaler/azure/defaults.yaml",
"python/ray/autoscaler/gcp/defaults.yaml",
+6 -1
View File
@@ -13,17 +13,21 @@ sudo -u "$USERNAME" -i /bin/bash -l -c "conda activate $CONDA_ENV; pip install $
echo "Setting up service scripts..."
cat > /home/"$USERNAME"/ray-head.sh << EOM
#!/bin/bash
eval "$(conda shell.bash hook)"
conda activate $CONDA_ENV
NUM_GPUS=\`nvidia-smi -L | wc -l\`
ray stop
ulimit -n 65536
ray start --head -port=6379 --object-manager-port=8076 --num-gpus=\$NUM_GPUS --block --webui-host 0.0.0.0
ray start --head --port=6379 --object-manager-port=8076 --num-gpus=\$NUM_GPUS --block --dashboard-host 0.0.0.0
EOM
cat > /home/"$USERNAME"/ray-worker.sh << EOM
#!/bin/bash
eval "$(conda shell.bash hook)"
conda activate $CONDA_ENV
NUM_GPUS=\`nvidia-smi -L | wc -l\`
@@ -42,6 +46,7 @@ EOM
cat > /home/"$USERNAME"/tensorboard.sh << EOM
#!/bin/bash
eval "$(conda shell.bash hook)"
conda activate $CONDA_ENV
mkdir -p /home/$USERNAME/ray_results
+36 -30
View File
@@ -55,6 +55,7 @@
"type": "int",
"defaultValue": 1,
"minValue": 0,
"maxValue": 1000,
"metadata": {
"description": "Initial number of worker nodes"
}
@@ -63,6 +64,7 @@
"type": "int",
"defaultValue": 1,
"minValue": 0,
"maxValue": 1000,
"metadata": {
"description": "Minimum number of worker nodes"
}
@@ -71,6 +73,7 @@
"type": "int",
"defaultValue": 1,
"minValue": 0,
"maxValue": 1000,
"metadata": {
"description": "Maximum number of worker nodes"
}
@@ -107,30 +110,35 @@
},
"variables": {
"azureScriptInitUrl": "https://raw.githubusercontent.com/ray-project/ray/master/doc/azure/azure-init.sh",
"location": "[resourceGroup().location]",
"vmName": "ray-node",
"subnetWorkers": "10.32.0.0/16",
"subnetHead": "10.33.0.0/16",
"publicIpAddressName": "[concat(variables('vmName'), '-ip' )]",
"networkIpConfig": "[guid(resourceGroup().id, variables('vmName'))]",
"subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]",
"subnetHeadRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet-head')]",
"subnetName": "ray-subnet",
"subnetHeadName": "ray-subnet-head",
"subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', variables('vNetName'), variables('subnetName'))]",
"subnetHeadRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', variables('vNetName'), variables('subnetHeadName'))]",
"osDiskType": "Standard_LRS",
"vmNameHead": "[concat(variables('vmName'), '-head')]",
"vmNameWorker": "[concat(variables('vmName'), '-workers')]",
"networkInterfaceName": "[concat(variables('vmName'), '-nic')]",
"networkSecurityGroupName": "ray-nsg",
"vNetName": "ray-vnet",
"subnetNetwork": "[split(variables('subnetHead'), '/')[0]]",
"headInternalIP": "[concat(substring(variables('subnetNetwork'), 0, lastIndexOf(variables('subnetNetwork'), '.')), '.5')]",
"imagePublisher": "microsoft-dsvm",
"imageOffer": "ubuntu-1804",
"imageSku": "1804",
"imageVersion": "latest"
"imageVersion": "20.07.06"
},
"resources": [
{
"type": "Microsoft.Network/networkSecurityGroups",
"apiVersion": "2019-02-01",
"name": "ray-nsg",
"location": "[resourceGroup().location]",
"name": "[variables('networkSecurityGroupName')]",
"location": "[variables('location')]",
"properties": {
"securityRules": [
{
@@ -191,8 +199,8 @@
{
"type": "Microsoft.Network/virtualNetworks",
"apiVersion": "2019-11-01",
"name": "ray-vnet",
"location": "[resourceGroup().location]",
"name": "[variables('vNetName')]",
"location": "[variables('location')]",
"properties": {
"addressSpace": {
"addressPrefixes": [
@@ -202,13 +210,13 @@
},
"subnets": [
{
"name": "ray-subnet",
"name": "[variables('subnetName')]",
"properties": {
"addressPrefix": "[variables('subnetWorkers')]"
}
},
{
"name": "ray-subnet-head",
"name": "[variables('subnetHeadName')]",
"properties": {
"addressPrefix": "[variables('subnetHead')]"
}
@@ -220,7 +228,7 @@
"type": "Microsoft.Network/publicIpAddresses",
"apiVersion": "2019-02-01",
"name": "[variables('publicIpAddressName')]",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"properties": {
"publicIpAllocationMethod": "Static",
"publicIPAddressVersion": "IPv4"
@@ -232,12 +240,12 @@
},
{
"type": "Microsoft.Network/networkInterfaces",
"apiVersion": "2018-10-01",
"apiVersion": "2020-06-01",
"name": "[variables('networkInterfaceName')]",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"dependsOn": [
"[resourceId('Microsoft.Network/publicIpAddresses/', variables('publicIpAddressName'))]",
"[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]"
"[resourceId('Microsoft.Network/publicIpAddresses', variables('publicIpAddressName'))]",
"[resourceId('Microsoft.Network/networkSecurityGroups', variables('networkSecurityGroupName'))]"
],
"properties": {
"ipConfigurations": [
@@ -256,17 +264,17 @@
}
],
"networkSecurityGroup": {
"id": "[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]"
"id": "[resourceId('Microsoft.Network/networkSecurityGroups', variables('networkSecurityGroupName'))]"
}
}
},
{
"type": "Microsoft.Compute/virtualMachines",
"apiVersion": "2019-07-01",
"apiVersion": "2020-06-01",
"name": "[variables('vmNameHead')]",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"dependsOn": [
"[resourceId('Microsoft.Network/networkInterfaces/', variables('networkInterfaceName'))]"
"[resourceId('Microsoft.Network/networkInterfaces', variables('networkInterfaceName'))]"
],
"properties": {
"hardwareProfile": {
@@ -315,10 +323,10 @@
{
"type": "Microsoft.Compute/virtualMachines/extensions",
"name": "[concat(variables('vmNameHead'), '/HeadNodeInitScript')]",
"apiVersion": "2017-03-30",
"location": "[resourceGroup().location]",
"apiVersion": "2020-06-01",
"location": "[variables('location')]",
"dependsOn": [
"[concat('Microsoft.Compute/virtualMachines/', variables('vmNameHead'))]"
"[resourceId('Microsoft.Compute/virtualMachines', variables('vmNameHead'))]"
],
"properties": {
"publisher": "Microsoft.Azure.Extensions",
@@ -338,10 +346,10 @@
{
"type": "Microsoft.Compute/virtualMachineScaleSets",
"name": "[variables('vmNameWorker')]",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"apiVersion": "2019-07-01",
"dependsOn": [
"Microsoft.Network/virtualNetworks/ray-vnet"
"[resourceId('Microsoft.Network/virtualNetworks', variables('vNetName'))]"
],
"sku": {
"name": "[parameters('workerNodeSize')]",
@@ -430,13 +438,13 @@
"type": "Microsoft.Insights/autoscaleSettings",
"apiVersion": "2015-04-01",
"name": "cpuautoscale",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"dependsOn": [
"[concat('Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]"
"[resourceId('Microsoft.Compute/virtualMachineScaleSets', variables('vmNameWorker'))]"
],
"properties": {
"name": "cpuautoscale",
"targetResourceUri": "[concat('/subscriptions/',subscription().subscriptionId, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
"targetResourceUri": "[concat(resourceGroup().id, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
"enabled": true,
"profiles": [
{
@@ -450,8 +458,7 @@
{
"metricTrigger": {
"metricName": "Percentage CPU",
"metricNamespace": "",
"metricResourceUri": "[concat('/subscriptions/',subscription().subscriptionId, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
"metricResourceUri": "[concat(resourceGroup().id, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
"timeGrain": "PT1M",
"statistic": "Average",
"timeWindow": "PT10M",
@@ -469,8 +476,7 @@
{
"metricTrigger": {
"metricName": "Percentage CPU",
"metricNamespace": "",
"metricResourceUri": "[concat('/subscriptions/',subscription().subscriptionId, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
"metricResourceUri": "[concat(resourceGroup().id, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]",
"timeGrain": "PT1M",
"statistic": "Average",
"timeWindow": "PT30M",
+4 -7
View File
@@ -48,7 +48,7 @@ AWS/GCP/Azure
See :ref:`aws-cluster` for recipes on customizing AWS clusters.
.. group-tab:: Azure
First, install the Azure CLI (``pip install azure-cli azure-core``) then login using (``az login``).
First, install the Azure CLI (``pip install azure-cli``), then log in using ``az login``.
Set the subscription to use from the command line (``az account set -s <subscription_id>``) or by modifying the provider section of the provided config, e.g. ``ray/python/ray/autoscaler/azure/example-full.yaml``.
@@ -65,10 +65,7 @@ AWS/GCP/Azure
# Get a remote screen on the head node.
$ ray attach ray/python/ray/autoscaler/azure/example-full.yaml
# test ray setup
# enable conda environment
$ exec bash -l
$ conda activate py37_tensorflow
$ python -c 'import ray; ray.init()'
$ python -c 'import ray; ray.init(address="auto")'
$ exit
# Tear down the cluster.
$ ray down ray/python/ray/autoscaler/azure/example-full.yaml
@@ -83,8 +80,8 @@ AWS/GCP/Azure
:target: https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fray-project%2Fray%2Fmaster%2Fdoc%2Fazure%2Fazure-ray-template.json
:alt: Deploy to Azure
Once the template is successfully deployed the deployment output page provides the ssh command to connect and the link to the JupyterHub on the head node (username/password as specified on the template input).
Use the following code in a Jupyter notebook to connect to the Ray cluster.
Once the template is successfully deployed, the deployment Outputs page provides the SSH command to connect and the link to JupyterHub on the head node (username/password as specified in the template input).
Use the following code in a Jupyter notebook (using the conda environment specified in the template input, ``py37_tensorflow`` by default) to connect to the Ray cluster.
.. code-block:: python
@@ -10,18 +10,19 @@
}
},
"variables": {
"Contributor": "[concat('/subscriptions/', subscription().subscriptionId, '/providers/Microsoft.Authorization/roleDefinitions/', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]"
"Contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
"location": "[resourceGroup().location]"
},
"resources": [
{
"type": "Microsoft.ManagedIdentity/userAssignedIdentities",
"apiVersion": "2018-11-30",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"name": "ray-msi-user-identity"
},
{
"type": "Microsoft.Authorization/roleAssignments",
"apiVersion": "2018-09-01-preview",
"apiVersion": "2020-04-01-preview",
"name": "[guid(resourceGroup().id)]",
"properties": {
"principalId": "[reference('ray-msi-user-identity').principalId]",
@@ -37,7 +38,7 @@
"type": "Microsoft.Network/networkSecurityGroups",
"apiVersion": "2019-02-01",
"name": "ray-nsg",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"properties": {
"securityRules": [
{
@@ -60,7 +61,7 @@
"type": "Microsoft.Network/virtualNetworks",
"apiVersion": "2019-11-01",
"name": "ray-vnet",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"properties": {
"addressSpace": {
"addressPrefixes": [
@@ -85,20 +85,21 @@
}
},
"variables": {
"publicIpAddressName": "[concat(parameters('vmName'), '-ip' )]",
"location": "[resourceGroup().location]",
"networkInterfaceNamePrivate": "[concat(parameters('vmName'),'-nic')]",
"networkInterfaceNamePublic": "[concat(parameters('vmName'),'-nic-public')]",
"networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]",
"networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]",
"subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]",
"osDiskType": "Standard_LRS"
"osDiskType": "Standard_LRS",
"publicIpAddressName": "[concat(parameters('vmName'), '-ip' )]",
"subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]"
},
"resources": [
{
"type": "Microsoft.Network/networkInterfaces",
"apiVersion": "2018-10-01",
"apiVersion": "2020-06-01",
"name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"dependsOn": [
"[resourceId('Microsoft.Network/publicIpAddresses/', concat(variables('publicIpAddressName'), copyIndex()))]"
],
@@ -129,9 +130,9 @@
},
{
"type": "Microsoft.Network/networkInterfaces",
"apiVersion": "2018-10-01",
"apiVersion": "2020-06-01",
"name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"copy": {
"name": "NICPrivateCopy",
"count": "[parameters('vmCount')]"
@@ -158,7 +159,7 @@
"type": "Microsoft.Network/publicIpAddresses",
"apiVersion": "2019-02-01",
"name": "[concat(variables('publicIpAddressName'), copyIndex())]",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"properties": {
"publicIpAllocationMethod": "Static",
"publicIPAddressVersion": "IPv4"
@@ -177,7 +178,7 @@
"type": "Microsoft.Compute/virtualMachines",
"apiVersion": "2019-03-01",
"name": "[concat(parameters('vmName'), copyIndex())]",
"location": "[resourceGroup().location]",
"location": "[variables('location')]",
"dependsOn": [
"[resourceId('Microsoft.Network/networkInterfaces/', concat(variables('networkInterfaceName'), copyIndex()))]"
],
@@ -1,7 +1,7 @@
import json
import logging
from pathlib import Path
import random
import os
from azure.common.client_factory import get_client_from_cli_profile
from azure.mgmt.resource import ResourceManagementClient
@@ -55,8 +55,8 @@ def _configure_resource_group(config):
resource_group_name=resource_group, parameters=params)
# load the template file
current_path = os.path.dirname(os.path.abspath(__file__))
template_path = os.path.join(current_path, "azure-config-template.json")
current_path = Path(__file__).parent
template_path = current_path.joinpath("azure-config-template.json")
with open(template_path, "r") as template_fp:
template = json.load(template_fp)
@@ -86,16 +86,17 @@ def _configure_resource_group(config):
def _configure_key_pair(config):
ssh_user = config["auth"]["ssh_user"]
public_key = None
# search if the keys exist
for key_type in ["ssh_private_key", "ssh_public_key"]:
try:
key_path = os.path.expanduser(config["auth"][key_type])
key_path = Path(config["auth"][key_type]).expanduser()
except KeyError:
raise Exception("Config must define {}".format(key_type))
except TypeError:
raise Exception("Invalid config value for {}".format(key_type))
assert os.path.exists(key_path), (
assert key_path.is_file(), (
"Could not find ssh key: {}".format(key_path))
if key_type == "ssh_public_key":
@@ -1,6 +1,6 @@
import json
import logging
import os
from pathlib import Path
from threading import RLock
from uuid import uuid4
@@ -178,8 +178,8 @@ class AzureNodeProvider(NodeProvider):
resource_group = self.provider_config["resource_group"]
# load the template file
current_path = os.path.dirname(os.path.abspath(__file__))
template_path = os.path.join(current_path, "azure-vm-template.json")
current_path = Path(__file__).parent
template_path = current_path.joinpath("azure-vm-template.json")
with open(template_path, "r") as template_fp:
template = json.load(template_fp)
+5 -2
View File
@@ -49,6 +49,7 @@ auth:
# you must specify paths to matching private and public key pair files
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
ssh_private_key: ~/.ssh/id_rsa
# changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub
# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
@@ -64,7 +65,7 @@ head_node:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.02.01
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
@@ -74,7 +75,7 @@ worker_nodes:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.02.01
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
@@ -86,6 +87,7 @@ worker_nodes:
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
}
# Files or directories to copy from the head node to the worker nodes. The format is a
@@ -118,6 +120,7 @@ setup_commands:
# Note: if you're developing Ray, you probably want to create a custom VM image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
@@ -62,6 +62,7 @@ auth:
# you must specify paths to matching private and public key pair files
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
ssh_private_key: ~/.ssh/id_rsa
# changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub
# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
@@ -77,7 +78,7 @@ head_node:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.02.01
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
@@ -87,7 +88,7 @@ worker_nodes:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.02.01
imageVersion: 20.07.06
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
@@ -99,6 +100,7 @@ worker_nodes:
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
}
# Files or directories to copy from the head node to the worker nodes. The format is a
@@ -130,12 +132,14 @@ initialization_commands:
- touch ~/.sudo_as_admin_successful
# List of shell commands to run to set up nodes.
setup_commands: []
setup_commands:
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest)
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
@@ -52,6 +52,7 @@ auth:
# you must specify paths to matching private and public key pair files
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
ssh_private_key: ~/.ssh/id_rsa
# changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default
@@ -71,6 +72,7 @@ worker_nodes:
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
}
# List of shell commands to run to set up nodes.
+5 -2
View File
@@ -60,6 +60,7 @@ auth:
# you must specify paths to matching private and public key pair files
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
ssh_private_key: ~/.ssh/id_rsa
# changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub
# Provider-specific config for the head node, e.g. instance type. By default
@@ -71,7 +72,7 @@ head_node:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: "1804"
imageVersion: 20.02.01
imageVersion: 20.07.06
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields using defaults.yaml
@@ -82,13 +83,14 @@ worker_nodes:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: "1804"
imageVersion: 20.02.01
imageVersion: 20.07.06
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
"/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
}
# List of commands that will be run before `setup_commands`. If docker is
@@ -103,6 +105,7 @@ setup_commands:
# Note: if you're developing Ray, you probably want to create a custom VM image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl
@@ -17,4 +17,5 @@ auth:
# you must specify paths to matching private and public key pair files
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
ssh_private_key: ~/.ssh/id_rsa
# changes to this should match what is specified in file_mounts
ssh_public_key: ~/.ssh/id_rsa.pub
+15 -9
View File
@@ -71,19 +71,23 @@ generated_python_directories = [
optional_ray_files = ["ray/nightly-wheels.yaml"]
ray_autoscaler_files = [
"ray/autoscaler/aws/defaults.yaml", "ray/autoscaler/azure/defaults.yaml",
"ray/autoscaler/azure/azure-vm-template.json",
"ray/autoscaler/azure/azure-config-template.json",
"ray/autoscaler/gcp/defaults.yaml", "ray/autoscaler/local/defaults.yaml",
"ray/autoscaler/aws/defaults.yaml",
"ray/autoscaler/azure/defaults.yaml",
"ray/autoscaler/_private/azure/azure-vm-template.json",
"ray/autoscaler/_private/azure/azure-config-template.json",
"ray/autoscaler/gcp/defaults.yaml",
"ray/autoscaler/local/defaults.yaml",
"ray/autoscaler/kubernetes/defaults.yaml",
"ray/autoscaler/kubernetes/kubectl-rsync.sh",
"ray/autoscaler/staroid/defaults.yaml", "ray/autoscaler/ray-schema.json"
"ray/autoscaler/staroid/defaults.yaml",
"ray/autoscaler/ray-schema.json",
]
ray_project_files = [
"ray/projects/schema.json", "ray/projects/templates/cluster_template.yaml",
"ray/projects/schema.json",
"ray/projects/templates/cluster_template.yaml",
"ray/projects/templates/project_template.yaml",
"ray/projects/templates/requirements.txt"
"ray/projects/templates/requirements.txt",
]
ray_dashboard_files = [
@@ -105,8 +109,10 @@ extras = {
"dataclasses; python_version < '3.7'"
],
"tune": [
"tabulate", "tensorboardX", "pandas",
"dataclasses; python_version < '3.7'"
"dataclasses; python_version < '3.7'",
"pandas",
"tabulate",
"tensorboardX",
]
}