diff --git a/BUILD.bazel b/BUILD.bazel index bde834fef..6494aa563 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1777,6 +1777,7 @@ filegroup( "python/ray/*.py", "python/ray/autoscaler/*.py", "python/ray/autoscaler/_private/*.py", + "python/ray/autoscaler/_private/azure/*.json", "python/ray/autoscaler/aws/defaults.yaml", "python/ray/autoscaler/azure/defaults.yaml", "python/ray/autoscaler/gcp/defaults.yaml", diff --git a/doc/azure/azure-init.sh b/doc/azure/azure-init.sh index 6a2eecd06..0add7a6b2 100755 --- a/doc/azure/azure-init.sh +++ b/doc/azure/azure-init.sh @@ -13,17 +13,21 @@ sudo -u "$USERNAME" -i /bin/bash -l -c "conda activate $CONDA_ENV; pip install $ echo "Setting up service scripts..." cat > /home/"$USERNAME"/ray-head.sh << EOM #!/bin/bash + +eval "$(conda shell.bash hook)" conda activate $CONDA_ENV NUM_GPUS=\`nvidia-smi -L | wc -l\` ray stop ulimit -n 65536 -ray start --head -port=6379 --object-manager-port=8076 --num-gpus=\$NUM_GPUS --block --webui-host 0.0.0.0 +ray start --head --port=6379 --object-manager-port=8076 --num-gpus=\$NUM_GPUS --block --dashboard-host 0.0.0.0 EOM cat > /home/"$USERNAME"/ray-worker.sh << EOM #!/bin/bash + +eval "$(conda shell.bash hook)" conda activate $CONDA_ENV NUM_GPUS=\`nvidia-smi -L | wc -l\` @@ -42,6 +46,7 @@ EOM cat > /home/"$USERNAME"/tensorboard.sh << EOM #!/bin/bash +eval "$(conda shell.bash hook)" conda activate $CONDA_ENV mkdir -p /home/$USERNAME/ray_results diff --git a/doc/azure/azure-ray-template.json b/doc/azure/azure-ray-template.json index 33b5e2db9..be02b6fed 100644 --- a/doc/azure/azure-ray-template.json +++ b/doc/azure/azure-ray-template.json @@ -55,6 +55,7 @@ "type": "int", "defaultValue": 1, "minValue": 0, + "maxValue": 1000, "metadata": { "description": "Initial number of worker nodes" } @@ -63,6 +64,7 @@ "type": "int", "defaultValue": 1, "minValue": 0, + "maxValue": 1000, "metadata": { "description": "Minimum number of worker nodes" } @@ -71,6 +73,7 @@ "type": "int", "defaultValue": 1, "minValue": 0, + "maxValue": 1000, "metadata": { "description": "Maximum number of worker nodes" } @@ -107,30 +110,35 @@ }, "variables": { "azureScriptInitUrl": "https://raw.githubusercontent.com/ray-project/ray/master/doc/azure/azure-init.sh", + "location": "[resourceGroup().location]", "vmName": "ray-node", "subnetWorkers": "10.32.0.0/16", "subnetHead": "10.33.0.0/16", "publicIpAddressName": "[concat(variables('vmName'), '-ip' )]", "networkIpConfig": "[guid(resourceGroup().id, variables('vmName'))]", - "subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]", - "subnetHeadRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet-head')]", + "subnetName": "ray-subnet", + "subnetHeadName": "ray-subnet-head", + "subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', variables('vNetName'), variables('subnetName'))]", + "subnetHeadRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', variables('vNetName'), variables('subnetHeadName'))]", "osDiskType": "Standard_LRS", "vmNameHead": "[concat(variables('vmName'), '-head')]", "vmNameWorker": "[concat(variables('vmName'), '-workers')]", "networkInterfaceName": "[concat(variables('vmName'), '-nic')]", + "networkSecurityGroupName": "ray-nsg", + "vNetName": "ray-vnet", "subnetNetwork": "[split(variables('subnetHead'), '/')[0]]", "headInternalIP": "[concat(substring(variables('subnetNetwork'), 0, lastIndexOf(variables('subnetNetwork'), '.')), '.5')]", "imagePublisher": "microsoft-dsvm", "imageOffer": "ubuntu-1804", "imageSku": "1804", - "imageVersion": "latest" + "imageVersion": "20.07.06" }, "resources": [ { "type": "Microsoft.Network/networkSecurityGroups", "apiVersion": "2019-02-01", - "name": "ray-nsg", - "location": "[resourceGroup().location]", + "name": "[variables('networkSecurityGroupName')]", + "location": "[variables('location')]", "properties": { "securityRules": [ { @@ -191,8 +199,8 @@ { "type": "Microsoft.Network/virtualNetworks", "apiVersion": "2019-11-01", - "name": "ray-vnet", - "location": "[resourceGroup().location]", + "name": "[variables('vNetName')]", + "location": "[variables('location')]", "properties": { "addressSpace": { "addressPrefixes": [ @@ -202,13 +210,13 @@ }, "subnets": [ { - "name": "ray-subnet", + "name": "[variables('subnetName')]", "properties": { "addressPrefix": "[variables('subnetWorkers')]" } }, { - "name": "ray-subnet-head", + "name": "[variables('subnetHeadName')]", "properties": { "addressPrefix": "[variables('subnetHead')]" } @@ -220,7 +228,7 @@ "type": "Microsoft.Network/publicIpAddresses", "apiVersion": "2019-02-01", "name": "[variables('publicIpAddressName')]", - "location": "[resourceGroup().location]", + "location": "[variables('location')]", "properties": { "publicIpAllocationMethod": "Static", "publicIPAddressVersion": "IPv4" @@ -232,12 +240,12 @@ }, { "type": "Microsoft.Network/networkInterfaces", - "apiVersion": "2018-10-01", + "apiVersion": "2020-06-01", "name": "[variables('networkInterfaceName')]", - "location": "[resourceGroup().location]", + "location": "[variables('location')]", "dependsOn": [ - "[resourceId('Microsoft.Network/publicIpAddresses/', variables('publicIpAddressName'))]", - "[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]" + "[resourceId('Microsoft.Network/publicIpAddresses', variables('publicIpAddressName'))]", + "[resourceId('Microsoft.Network/networkSecurityGroups', variables('networkSecurityGroupName'))]" ], "properties": { "ipConfigurations": [ @@ -256,17 +264,17 @@ } ], "networkSecurityGroup": { - "id": "[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]" + "id": "[resourceId('Microsoft.Network/networkSecurityGroups', variables('networkSecurityGroupName'))]" } } }, { "type": "Microsoft.Compute/virtualMachines", - "apiVersion": "2019-07-01", + "apiVersion": "2020-06-01", "name": "[variables('vmNameHead')]", - "location": "[resourceGroup().location]", + "location": "[variables('location')]", "dependsOn": [ - "[resourceId('Microsoft.Network/networkInterfaces/', variables('networkInterfaceName'))]" + "[resourceId('Microsoft.Network/networkInterfaces', variables('networkInterfaceName'))]" ], "properties": { "hardwareProfile": { @@ -315,10 +323,10 @@ { "type": "Microsoft.Compute/virtualMachines/extensions", "name": "[concat(variables('vmNameHead'), '/HeadNodeInitScript')]", - "apiVersion": "2017-03-30", - "location": "[resourceGroup().location]", + "apiVersion": "2020-06-01", + "location": "[variables('location')]", "dependsOn": [ - "[concat('Microsoft.Compute/virtualMachines/', variables('vmNameHead'))]" + "[resourceId('Microsoft.Compute/virtualMachines', variables('vmNameHead'))]" ], "properties": { "publisher": "Microsoft.Azure.Extensions", @@ -338,10 +346,10 @@ { "type": "Microsoft.Compute/virtualMachineScaleSets", "name": "[variables('vmNameWorker')]", - "location": "[resourceGroup().location]", + "location": "[variables('location')]", "apiVersion": "2019-07-01", "dependsOn": [ - "Microsoft.Network/virtualNetworks/ray-vnet" + "[resourceId('Microsoft.Network/virtualNetworks', variables('vNetName'))]" ], "sku": { "name": "[parameters('workerNodeSize')]", @@ -430,13 +438,13 @@ "type": "Microsoft.Insights/autoscaleSettings", "apiVersion": "2015-04-01", "name": "cpuautoscale", - "location": "[resourceGroup().location]", + "location": "[variables('location')]", "dependsOn": [ - "[concat('Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]" + "[resourceId('Microsoft.Compute/virtualMachineScaleSets', variables('vmNameWorker'))]" ], "properties": { "name": "cpuautoscale", - "targetResourceUri": "[concat('/subscriptions/',subscription().subscriptionId, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]", + "targetResourceUri": "[concat(resourceGroup().id, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]", "enabled": true, "profiles": [ { @@ -450,8 +458,7 @@ { "metricTrigger": { "metricName": "Percentage CPU", - "metricNamespace": "", - "metricResourceUri": "[concat('/subscriptions/',subscription().subscriptionId, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]", + "metricResourceUri": "[concat(resourceGroup().id, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]", "timeGrain": "PT1M", "statistic": "Average", "timeWindow": "PT10M", @@ -469,8 +476,7 @@ { "metricTrigger": { "metricName": "Percentage CPU", - "metricNamespace": "", - "metricResourceUri": "[concat('/subscriptions/',subscription().subscriptionId, '/resourceGroups/', resourceGroup().name, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]", + "metricResourceUri": "[concat(resourceGroup().id, '/providers/Microsoft.Compute/virtualMachineScaleSets/', variables('vmNameWorker'))]", "timeGrain": "PT1M", "statistic": "Average", "timeWindow": "PT30M", diff --git a/doc/source/cluster/cloud.rst b/doc/source/cluster/cloud.rst index f4790ea56..0e00db54c 100644 --- a/doc/source/cluster/cloud.rst +++ b/doc/source/cluster/cloud.rst @@ -48,7 +48,7 @@ AWS/GCP/Azure See :ref:`aws-cluster` for recipes on customizing AWS clusters. .. group-tab:: Azure - First, install the Azure CLI (``pip install azure-cli azure-core``) then login using (``az login``). + First, install the Azure CLI (``pip install azure-cli``) then login using (``az login``). Set the subscription to use from the command line (``az account set -s ``) or by modifying the provider section of the config provided e.g: `ray/python/ray/autoscaler/azure/example-full.yaml` @@ -65,10 +65,7 @@ AWS/GCP/Azure # Get a remote screen on the head node. $ ray attach ray/python/ray/autoscaler/azure/example-full.yaml # test ray setup - # enable conda environment - $ exec bash -l - $ conda activate py37_tensorflow - $ python -c 'import ray; ray.init()' + $ python -c 'import ray; ray.init(address="auto")' $ exit # Tear down the cluster. $ ray down ray/python/ray/autoscaler/azure/example-full.yaml @@ -83,8 +80,8 @@ AWS/GCP/Azure :target: https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fray-project%2Fray%2Fmaster%2Fdoc%2Fazure%2Fazure-ray-template.json :alt: Deploy to Azure - Once the template is successfully deployed the deployment output page provides the ssh command to connect and the link to the JupyterHub on the head node (username/password as specified on the template input). - Use the following code in a Jupyter notebook to connect to the Ray cluster. + Once the template is successfully deployed the deployment Outputs page provides the ssh command to connect and the link to the JupyterHub on the head node (username/password as specified on the template input). + Use the following code in a Jupyter notebook (using the conda environment specified in the template input, py37_tensorflow by default) to connect to the Ray cluster. .. code-block:: python diff --git a/python/ray/autoscaler/_private/azure/azure-config-template.json b/python/ray/autoscaler/_private/azure/azure-config-template.json index 11939b76a..41cc11a38 100644 --- a/python/ray/autoscaler/_private/azure/azure-config-template.json +++ b/python/ray/autoscaler/_private/azure/azure-config-template.json @@ -10,18 +10,19 @@ } }, "variables": { - "Contributor": "[concat('/subscriptions/', subscription().subscriptionId, '/providers/Microsoft.Authorization/roleDefinitions/', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]" + "Contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]", + "location": "[resourceGroup().location]" }, "resources": [ { "type": "Microsoft.ManagedIdentity/userAssignedIdentities", "apiVersion": "2018-11-30", - "location": "[resourceGroup().location]", + "location": "[variables('location')]", "name": "ray-msi-user-identity" }, { "type": "Microsoft.Authorization/roleAssignments", - "apiVersion": "2018-09-01-preview", + "apiVersion": "2020-04-01-preview", "name": "[guid(resourceGroup().id)]", "properties": { "principalId": "[reference('ray-msi-user-identity').principalId]", @@ -37,7 +38,7 @@ "type": "Microsoft.Network/networkSecurityGroups", "apiVersion": "2019-02-01", "name": "ray-nsg", - "location": "[resourceGroup().location]", + "location": "[variables('location')]", "properties": { "securityRules": [ { @@ -60,7 +61,7 @@ "type": "Microsoft.Network/virtualNetworks", "apiVersion": "2019-11-01", "name": "ray-vnet", - "location": "[resourceGroup().location]", + "location": "[variables('location')]", "properties": { "addressSpace": { "addressPrefixes": [ diff --git a/python/ray/autoscaler/_private/azure/azure-vm-template.json b/python/ray/autoscaler/_private/azure/azure-vm-template.json index aadd115c6..3c2ac2a66 100644 --- a/python/ray/autoscaler/_private/azure/azure-vm-template.json +++ b/python/ray/autoscaler/_private/azure/azure-vm-template.json @@ -85,20 +85,21 @@ } }, "variables": { - "publicIpAddressName": "[concat(parameters('vmName'), '-ip' )]", + "location": "[resourceGroup().location]", "networkInterfaceNamePrivate": "[concat(parameters('vmName'),'-nic')]", "networkInterfaceNamePublic": "[concat(parameters('vmName'),'-nic-public')]", "networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]", "networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]", - "subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]", - "osDiskType": "Standard_LRS" + "osDiskType": "Standard_LRS", + "publicIpAddressName": "[concat(parameters('vmName'), '-ip' )]", + "subnetRef": "[resourceId('Microsoft.Network/virtualNetworks/subnets', 'ray-vnet', 'ray-subnet')]" }, "resources": [ { "type": "Microsoft.Network/networkInterfaces", - "apiVersion": "2018-10-01", + "apiVersion": "2020-06-01", "name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]", - "location": "[resourceGroup().location]", + "location": "[variables('location')]", "dependsOn": [ "[resourceId('Microsoft.Network/publicIpAddresses/', concat(variables('publicIpAddressName'), copyIndex()))]" ], @@ -129,9 +130,9 @@ }, { "type": "Microsoft.Network/networkInterfaces", - "apiVersion": "2018-10-01", + "apiVersion": "2020-06-01", "name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]", - "location": "[resourceGroup().location]", + "location": "[variables('location')]", "copy": { "name": "NICPrivateCopy", "count": "[parameters('vmCount')]" @@ -158,7 +159,7 @@ "type": "Microsoft.Network/publicIpAddresses", "apiVersion": "2019-02-01", "name": "[concat(variables('publicIpAddressName'), copyIndex())]", - "location": "[resourceGroup().location]", + "location": "[variables('location')]", "properties": { "publicIpAllocationMethod": "Static", "publicIPAddressVersion": "IPv4" @@ -177,7 +178,7 @@ "type": "Microsoft.Compute/virtualMachines", "apiVersion": "2019-03-01", "name": "[concat(parameters('vmName'), copyIndex())]", - "location": "[resourceGroup().location]", + "location": "[variables('location')]", "dependsOn": [ "[resourceId('Microsoft.Network/networkInterfaces/', concat(variables('networkInterfaceName'), copyIndex()))]" ], diff --git a/python/ray/autoscaler/_private/azure/config.py b/python/ray/autoscaler/_private/azure/config.py index 9c32bbb2b..ad2918b21 100644 --- a/python/ray/autoscaler/_private/azure/config.py +++ b/python/ray/autoscaler/_private/azure/config.py @@ -1,7 +1,7 @@ import json import logging +from pathlib import Path import random -import os from azure.common.client_factory import get_client_from_cli_profile from azure.mgmt.resource import ResourceManagementClient @@ -55,8 +55,8 @@ def _configure_resource_group(config): resource_group_name=resource_group, parameters=params) # load the template file - current_path = os.path.dirname(os.path.abspath(__file__)) - template_path = os.path.join(current_path, "azure-config-template.json") + current_path = Path(__file__).parent + template_path = current_path.joinpath("azure-config-template.json") with open(template_path, "r") as template_fp: template = json.load(template_fp) @@ -86,16 +86,17 @@ def _configure_resource_group(config): def _configure_key_pair(config): ssh_user = config["auth"]["ssh_user"] + public_key = None # search if the keys exist for key_type in ["ssh_private_key", "ssh_public_key"]: try: - key_path = os.path.expanduser(config["auth"][key_type]) + key_path = Path(config["auth"][key_type]).expanduser() except KeyError: raise Exception("Config must define {}".format(key_type)) except TypeError: raise Exception("Invalid config value for {}".format(key_type)) - assert os.path.exists(key_path), ( + assert key_path.is_file(), ( "Could not find ssh key: {}".format(key_path)) if key_type == "ssh_public_key": diff --git a/python/ray/autoscaler/_private/azure/node_provider.py b/python/ray/autoscaler/_private/azure/node_provider.py index b4a160268..2b8aae556 100644 --- a/python/ray/autoscaler/_private/azure/node_provider.py +++ b/python/ray/autoscaler/_private/azure/node_provider.py @@ -1,6 +1,6 @@ import json import logging -import os +from pathlib import Path from threading import RLock from uuid import uuid4 @@ -178,8 +178,8 @@ class AzureNodeProvider(NodeProvider): resource_group = self.provider_config["resource_group"] # load the template file - current_path = os.path.dirname(os.path.abspath(__file__)) - template_path = os.path.join(current_path, "azure-vm-template.json") + current_path = Path(__file__).parent + template_path = current_path.joinpath("azure-vm-template.json") with open(template_path, "r") as template_fp: template = json.load(template_fp) diff --git a/python/ray/autoscaler/azure/defaults.yaml b/python/ray/autoscaler/azure/defaults.yaml index 2098bbcad..8fbf11930 100644 --- a/python/ray/autoscaler/azure/defaults.yaml +++ b/python/ray/autoscaler/azure/defaults.yaml @@ -49,6 +49,7 @@ auth: # you must specify paths to matching private and public key pair files # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair ssh_private_key: ~/.ssh/id_rsa + # changes to this should match what is specified in file_mounts ssh_public_key: ~/.ssh/id_rsa.pub # More specific customization to node configurations can be made using the ARM template azure-vm-template.json file @@ -64,7 +65,7 @@ head_node: imagePublisher: microsoft-dsvm imageOffer: ubuntu-1804 imageSku: 1804-gen2 - imageVersion: 20.02.01 + imageVersion: 20.07.06 # Provider-specific config for worker nodes, e.g. instance type. worker_nodes: @@ -74,7 +75,7 @@ worker_nodes: imagePublisher: microsoft-dsvm imageOffer: ubuntu-1804 imageSku: 1804-gen2 - imageVersion: 20.02.01 + imageVersion: 20.07.06 # optionally set priority to use Spot instances priority: Spot # set a maximum price for spot instances if desired @@ -86,6 +87,7 @@ worker_nodes: file_mounts: { # "/path1/on/remote/machine": "/path1/on/local/machine", # "/path2/on/remote/machine": "/path2/on/local/machine", + "/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub" } # Files or directories to copy from the head node to the worker nodes. The format is a @@ -118,6 +120,7 @@ setup_commands: # Note: if you're developing Ray, you probably want to create an AMI that # has your Ray repo pre-cloned. Then, you can replace the pip installs # below with a git checkout (and possibly a recompile). + - echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc # - echo 'conda activate py37_pytorch' >> ~/.bashrc - echo 'conda activate py37_tensorflow' >> ~/.bashrc - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl diff --git a/python/ray/autoscaler/azure/example-full.yaml b/python/ray/autoscaler/azure/example-full.yaml index 36c0a0dc5..b007b0b85 100644 --- a/python/ray/autoscaler/azure/example-full.yaml +++ b/python/ray/autoscaler/azure/example-full.yaml @@ -62,6 +62,7 @@ auth: # you must specify paths to matching private and public key pair files # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair ssh_private_key: ~/.ssh/id_rsa + # changes to this should match what is specified in file_mounts ssh_public_key: ~/.ssh/id_rsa.pub # More specific customization to node configurations can be made using the ARM template azure-vm-template.json file @@ -77,7 +78,7 @@ head_node: imagePublisher: microsoft-dsvm imageOffer: ubuntu-1804 imageSku: 1804-gen2 - imageVersion: 20.02.01 + imageVersion: 20.07.06 # Provider-specific config for worker nodes, e.g. instance type. worker_nodes: @@ -87,7 +88,7 @@ worker_nodes: imagePublisher: microsoft-dsvm imageOffer: ubuntu-1804 imageSku: 1804-gen2 - imageVersion: 20.02.01 + imageVersion: 20.07.06 # optionally set priority to use Spot instances priority: Spot # set a maximum price for spot instances if desired @@ -99,6 +100,7 @@ worker_nodes: file_mounts: { # "/path1/on/remote/machine": "/path1/on/local/machine", # "/path2/on/remote/machine": "/path2/on/local/machine", + "/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub" } # Files or directories to copy from the head node to the worker nodes. The format is a @@ -130,12 +132,14 @@ initialization_commands: - touch ~/.sudo_as_admin_successful # List of shell commands to run to set up nodes. -setup_commands: [] +setup_commands: # Note: if you're developing Ray, you probably want to create a Docker image that # has your Ray repo pre-cloned. Then, you can replace the pip installs # below with a git checkout (and possibly a recompile). # Uncomment the following line if you want to run the nightly version of ray (as opposed to the latest) - # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl + - echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc + - echo 'conda activate py37_tensorflow' >> ~/.bashrc + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl # Custom commands that will be run on the head node after common setup. head_setup_commands: diff --git a/python/ray/autoscaler/azure/example-gpu-docker.yaml b/python/ray/autoscaler/azure/example-gpu-docker.yaml index d70b457c2..264c8f229 100644 --- a/python/ray/autoscaler/azure/example-gpu-docker.yaml +++ b/python/ray/autoscaler/azure/example-gpu-docker.yaml @@ -52,6 +52,7 @@ auth: # you must specify paths to matching private and public key pair files # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair ssh_private_key: ~/.ssh/id_rsa + # changes to this should match what is specified in file_mounts ssh_public_key: ~/.ssh/id_rsa.pub # Provider-specific config for the head node, e.g. instance type. By default @@ -71,6 +72,7 @@ worker_nodes: file_mounts: { # "/path1/on/remote/machine": "/path1/on/local/machine", # "/path2/on/remote/machine": "/path2/on/local/machine", + "/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub" } # List of shell commands to run to set up nodes. diff --git a/python/ray/autoscaler/azure/example-gpu.yaml b/python/ray/autoscaler/azure/example-gpu.yaml index 77cccdda7..55133e50b 100644 --- a/python/ray/autoscaler/azure/example-gpu.yaml +++ b/python/ray/autoscaler/azure/example-gpu.yaml @@ -60,6 +60,7 @@ auth: # you must specify paths to matching private and public key pair files # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair ssh_private_key: ~/.ssh/id_rsa + # changes to this should match what is specified in file_mounts ssh_public_key: ~/.ssh/id_rsa.pub # Provider-specific config for the head node, e.g. instance type. By default @@ -71,7 +72,7 @@ head_node: imagePublisher: microsoft-dsvm imageOffer: ubuntu-1804 imageSku: "1804" - imageVersion: 20.02.01 + imageVersion: 20.07.06 # Provider-specific config for worker nodes, e.g. instance type. By default # Ray will auto-configure unspecified fields using defaults.yaml @@ -82,13 +83,14 @@ worker_nodes: imagePublisher: microsoft-dsvm imageOffer: ubuntu-1804 imageSku: "1804" - imageVersion: 20.02.01 + imageVersion: 20.07.06 # Files or directories to copy to the head and worker nodes. The format is a # dictionary from REMOTE_PATH: LOCAL_PATH, e.g. file_mounts: { # "/path1/on/remote/machine": "/path1/on/local/machine", # "/path2/on/remote/machine": "/path2/on/local/machine", + "/home/ubuntu/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub" } # List of commands that will be run before `setup_commands`. If docker is @@ -103,6 +105,7 @@ setup_commands: # Note: if you're developing Ray, you probably want to create an AMI that # has your Ray repo pre-cloned. Then, you can replace the pip installs # below with a git checkout (and possibly a recompile). + - echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc # - echo 'conda activate py37_pytorch' >> ~/.bashrc - echo 'conda activate py37_tensorflow' >> ~/.bashrc - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.1.0.dev0-cp37-cp37m-manylinux1_x86_64.whl diff --git a/python/ray/autoscaler/azure/example-minimal.yaml b/python/ray/autoscaler/azure/example-minimal.yaml index 7d5c11fcd..4965628e6 100644 --- a/python/ray/autoscaler/azure/example-minimal.yaml +++ b/python/ray/autoscaler/azure/example-minimal.yaml @@ -17,4 +17,5 @@ auth: # you must specify paths to matching private and public key pair files # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair ssh_private_key: ~/.ssh/id_rsa + # changes to this should match what is specified in file_mounts ssh_public_key: ~/.ssh/id_rsa.pub diff --git a/python/setup.py b/python/setup.py index 23f303678..0ed74933e 100644 --- a/python/setup.py +++ b/python/setup.py @@ -71,19 +71,23 @@ generated_python_directories = [ optional_ray_files = ["ray/nightly-wheels.yaml"] ray_autoscaler_files = [ - "ray/autoscaler/aws/defaults.yaml", "ray/autoscaler/azure/defaults.yaml", - "ray/autoscaler/azure/azure-vm-template.json", - "ray/autoscaler/azure/azure-config-template.json", - "ray/autoscaler/gcp/defaults.yaml", "ray/autoscaler/local/defaults.yaml", + "ray/autoscaler/aws/defaults.yaml", + "ray/autoscaler/azure/defaults.yaml", + "ray/autoscaler/_private/azure/azure-vm-template.json", + "ray/autoscaler/_private/azure/azure-config-template.json", + "ray/autoscaler/gcp/defaults.yaml", + "ray/autoscaler/local/defaults.yaml", "ray/autoscaler/kubernetes/defaults.yaml", "ray/autoscaler/kubernetes/kubectl-rsync.sh", - "ray/autoscaler/staroid/defaults.yaml", "ray/autoscaler/ray-schema.json" + "ray/autoscaler/staroid/defaults.yaml", + "ray/autoscaler/ray-schema.json", ] ray_project_files = [ - "ray/projects/schema.json", "ray/projects/templates/cluster_template.yaml", + "ray/projects/schema.json", + "ray/projects/templates/cluster_template.yaml", "ray/projects/templates/project_template.yaml", - "ray/projects/templates/requirements.txt" + "ray/projects/templates/requirements.txt", ] ray_dashboard_files = [ @@ -105,8 +109,10 @@ extras = { "dataclasses; python_version < '3.7'" ], "tune": [ - "tabulate", "tensorboardX", "pandas", - "dataclasses; python_version < '3.7'" + "dataclasses; python_version < '3.7'", + "pandas", + "tabulate", + "tensorboardX", ] }