From 0dc01d8c1e7c356947cf8a5cbb231755436d9d6e Mon Sep 17 00:00:00 2001 From: Scott Graham <5720537+gramhagen@users.noreply.github.com> Date: Fri, 24 Apr 2020 20:03:55 -0400 Subject: [PATCH] [autoscaler] Azure versioning (#8168) --- .../azure/azure-config-template.json | 10 ++++- python/ray/autoscaler/azure/config.py | 37 ++++++++------- python/ray/autoscaler/azure/example-full.yaml | 2 +- .../autoscaler/azure/example-gpu-docker.yaml | 2 +- python/ray/autoscaler/azure/example-gpu.yaml | 2 +- python/ray/autoscaler/azure/node_provider.py | 45 ++++++++++--------- 6 files changed, 56 insertions(+), 42 deletions(-) diff --git a/python/ray/autoscaler/azure/azure-config-template.json b/python/ray/autoscaler/azure/azure-config-template.json index 65c671c82..11939b76a 100644 --- a/python/ray/autoscaler/azure/azure-config-template.json +++ b/python/ray/autoscaler/azure/azure-config-template.json @@ -71,11 +71,17 @@ { "name": "ray-subnet", "properties": { - "addressPrefix": "[parameters('subnet')]" + "addressPrefix": "[parameters('subnet')]", + "networkSecurityGroup": { + "id": "[resourceId('Microsoft.Network/networkSecurityGroups','ray-nsg')]" + } } } ] - } + }, + "dependsOn": [ + "[resourceId('Microsoft.Network/networkSecurityGroups', 'ray-nsg')]" + ] } ] } \ No newline at end of file diff --git a/python/ray/autoscaler/azure/config.py b/python/ray/autoscaler/azure/config.py index 0be524a2c..9c32bbb2b 100644 --- a/python/ray/autoscaler/azure/config.py +++ b/python/ray/autoscaler/azure/config.py @@ -54,29 +54,32 @@ def _configure_resource_group(config): resource_client.resource_groups.create_or_update( resource_group_name=resource_group, parameters=params) - # load the template - template_path = os.path.join( - os.path.dirname(__file__), "azure-config-template.json") - with open(template_path, "r") as template_file_fd: - template = json.load(template_file_fd) + # load the template file + current_path = os.path.dirname(os.path.abspath(__file__)) + template_path = os.path.join(current_path, "azure-config-template.json") + with open(template_path, "r") as template_fp: + template = json.load(template_fp) - # choose a random subnet + # choose a random subnet, skipping most common value of 0 random.seed(resource_group) - # start at 1 to avoid most likely collision at 0 - parameters = {"subnet": "10.{}.0.0/16".format(random.randint(1, 254))} + subnet_mask = "10.{}.0.0/16".format(random.randint(1, 254)) - deployment_properties = { - "mode": DeploymentMode.incremental, - "template": template, - "parameters": {k: { - "value": v + parameters = { + "properties": { + "mode": DeploymentMode.incremental, + "template": template, + "parameters": { + "subnet": { + "value": subnet_mask + } + } } - for k, v in parameters.items()} } - deployment_async_operation = resource_client.deployments.create_or_update( - resource_group, "ray-config", deployment_properties) - deployment_async_operation.wait() + resource_client.deployments.create_or_update( + resource_group_name=resource_group, + deployment_name="ray-config", + parameters=parameters).wait() return config diff --git a/python/ray/autoscaler/azure/example-full.yaml b/python/ray/autoscaler/azure/example-full.yaml index bd93481a8..971a6a505 100644 --- a/python/ray/autoscaler/azure/example-full.yaml +++ b/python/ray/autoscaler/azure/example-full.yaml @@ -114,7 +114,7 @@ setup_commands: # Custom commands that will be run on the head node after common setup. head_setup_commands: - - pip install azure-cli-core azure-core azure-mgmt-authorization azure-mgmt-network azure-mgmt-compute azure-mgmt-msi + - pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0 # Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] diff --git a/python/ray/autoscaler/azure/example-gpu-docker.yaml b/python/ray/autoscaler/azure/example-gpu-docker.yaml index 52ef692eb..bbca008b1 100644 --- a/python/ray/autoscaler/azure/example-gpu-docker.yaml +++ b/python/ray/autoscaler/azure/example-gpu-docker.yaml @@ -84,7 +84,7 @@ setup_commands: # Custom commands that will be run on the head node after common setup. head_setup_commands: - - pip install azure-cli-core azure-core azure-mgmt-authorization azure-mgmt-compute azure-mgmt-msi azure-mgmt-network + - pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0 # Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] diff --git a/python/ray/autoscaler/azure/example-gpu.yaml b/python/ray/autoscaler/azure/example-gpu.yaml index cfe415bd8..afb3319cf 100644 --- a/python/ray/autoscaler/azure/example-gpu.yaml +++ b/python/ray/autoscaler/azure/example-gpu.yaml @@ -116,7 +116,7 @@ setup_commands: # Custom commands that will be run on the head node after common setup. head_setup_commands: - - pip install azure-cli-core azure-core azure-mgmt-authorization azure-mgmt-network azure-mgmt-compute azure-mgmt-msi + - pip install azure-cli-core==2.4.0 azure-mgmt-compute==12.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.1.0 # Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] diff --git a/python/ray/autoscaler/azure/node_provider.py b/python/ray/autoscaler/azure/node_provider.py index 3d24bdb33..62b09393f 100644 --- a/python/ray/autoscaler/azure/node_provider.py +++ b/python/ray/autoscaler/azure/node_provider.py @@ -176,11 +176,11 @@ class AzureNodeProvider(NodeProvider): # TODO: restart deallocated nodes if possible resource_group = self.provider_config["resource_group"] - # load the template - template_path = os.path.join( - os.path.dirname(__file__), "azure-vm-template.json") - with open(template_path, "r") as template_file_fd: - template = json.load(template_file_fd) + # load the template file + current_path = os.path.dirname(os.path.abspath(__file__)) + template_path = os.path.join(current_path, "azure-vm-template.json") + with open(template_path, "r") as template_fp: + template = json.load(template_fp) # get the tags config_tags = node_config.get("tags", {}).copy() @@ -189,28 +189,33 @@ class AzureNodeProvider(NodeProvider): name_tag = config_tags.get(TAG_RAY_NODE_NAME, "node") unique_id = uuid4().hex[:VM_NAME_UUID_LEN] + vm_name = "{name}-{id}".format(name=name_tag, id=unique_id) + use_internal_ips = self.provider_config.get("use_internal_ips", False) - parameters = node_config["azure_arm_parameters"].copy() - parameters["vmName"] = "{name}-{id}".format( - name=name_tag, id=unique_id) - parameters["provisionPublicIp"] = not self.provider_config.get( - "use_internal_ips", False) - parameters["vmTags"] = config_tags - parameters["vmCount"] = count + template_params = node_config["azure_arm_parameters"].copy() + template_params["vmName"] = vm_name + template_params["provisionPublicIp"] = not use_internal_ips + template_params["vmTags"] = config_tags + template_params["vmCount"] = count - deployment_properties = { - "mode": DeploymentMode.incremental, - "template": template, - "parameters": {k: { - "value": v + parameters = { + "properties": { + "mode": DeploymentMode.incremental, + "template": template, + "parameters": { + key: { + "value": value + } + for key, value in template_params.items() + } } - for k, v in parameters.items()} } # TODO: we could get the private/public ips back directly self.resource_client.deployments.create_or_update( - resource_group, "ray-vm-{}".format(name_tag), - deployment_properties).wait() + resource_group_name=resource_group, + deployment_name="ray-vm-{}".format(name_tag), + parameters=parameters).wait() @synchronized def set_node_tags(self, node_id, tags):