From 3cb7a7401d6a955e3b78b0b7e1b5570817fa40a4 Mon Sep 17 00:00:00 2001 From: Scott Graham <5720537+gramhagen@users.noreply.github.com> Date: Tue, 28 Feb 2023 14:21:09 -0500 Subject: [PATCH] [azure][autoscaler] Fix Azure autoscaler node naming and deletion delays (#31645) This reverts prior changes to node naming which led to non-unique names, causing constant node refreshing Currently the Azure autoscaler blocks on node destruction, so that was removed in this change Related issue number Closes #31538 Closes #25971 --------- Signed-off-by: Scott Graham Co-authored-by: Scott Graham Signed-off-by: elliottower --- .../_private/_azure/azure-config-template.json | 12 ++++++------ python/ray/autoscaler/_private/_azure/config.py | 8 +++++--- .../autoscaler/_private/_azure/node_provider.py | 15 +++++++++------ 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/python/ray/autoscaler/_private/_azure/azure-config-template.json b/python/ray/autoscaler/_private/_azure/azure-config-template.json index 8a50b09f46cb2..ec050bb6757ed 100644 --- a/python/ray/autoscaler/_private/_azure/azure-config-template.json +++ b/python/ray/autoscaler/_private/_azure/azure-config-template.json @@ -2,7 +2,7 @@ "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", "contentVersion": "1.0.0.0", "parameters": { - "uniqueId": { + "clusterId": { "type": "string", "metadata": { "description": "Unique string appended to resource names to isolate resources from different ray clusters." @@ -18,12 +18,12 @@ "variables": { "contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]", "location": "[resourceGroup().location]", - "msiName": "[concat('ray-msi-', parameters('uniqueId'))]", - "roleAssignmentName": "[concat('ray-ra-', parameters('uniqueId'))]", - "nsgName": "[concat('ray-nsg-', parameters('uniqueId'))]", + "msiName": "[concat('ray-', parameters('clusterId'), '-msi')]", + "roleAssignmentName": "[concat('ray-', parameters('clusterId'), '-ra')]", + "nsgName": "[concat('ray-', parameters('clusterId'), '-nsg')]", "nsg": "[resourceId('Microsoft.Network/networkSecurityGroups', variables('nsgName'))]", - "vnetName": "[concat('ray-vnet-', parameters('uniqueId'))]", - "subnetName": "[concat('ray-subnet-', parameters('uniqueId'))]" + "vnetName": "[concat('ray-', parameters('clusterId'), '-vnet')]", + "subnetName": "[concat('ray-', parameters('clusterId'), '-subnet')]" }, "resources": [ { diff --git a/python/ray/autoscaler/_private/_azure/config.py b/python/ray/autoscaler/_private/_azure/config.py index 3eb1386804fe8..488dbc143ee34 100644 --- a/python/ray/autoscaler/_private/_azure/config.py +++ b/python/ray/autoscaler/_private/_azure/config.py @@ -64,7 +64,7 @@ def _configure_resource_group(config): if "tags" in config["provider"]: params["tags"] = config["provider"]["tags"] - logger.info("Creating/Updating Resource Group: %s", resource_group) + logger.info("Creating/Updating resource group: %s", resource_group) rg_create_or_update = get_azure_sdk_function( client=resource_client.resource_groups, function_name="create_or_update" ) @@ -76,17 +76,19 @@ def _configure_resource_group(config): with open(template_path, "r") as template_fp: template = json.load(template_fp) + logger.info("Using cluster name: %s", config["cluster_name"]) + # set unique id for resources in this cluster unique_id = config["provider"].get("unique_id") if unique_id is None: hasher = sha256() hasher.update(config["provider"]["resource_group"].encode("utf-8")) - hasher.update(config["cluster_name"].encode("utf-8")) unique_id = hasher.hexdigest()[:UNIQUE_ID_LEN] else: unique_id = str(unique_id) config["provider"]["unique_id"] = unique_id logger.info("Using unique id: %s", unique_id) + cluster_id = "{}-{}".format(config["cluster_name"], unique_id) subnet_mask = config["provider"].get("subnet_mask") if subnet_mask is None: @@ -101,7 +103,7 @@ def _configure_resource_group(config): "template": template, "parameters": { "subnet": {"value": subnet_mask}, - "uniqueId": {"value": unique_id}, + "clusterId": {"value": cluster_id}, }, } } diff --git a/python/ray/autoscaler/_private/_azure/node_provider.py b/python/ray/autoscaler/_private/_azure/node_provider.py index edc8efc5db668..97b58827f8902 100644 --- a/python/ray/autoscaler/_private/_azure/node_provider.py +++ b/python/ray/autoscaler/_private/_azure/node_provider.py @@ -2,6 +2,7 @@ import logging from pathlib import Path from threading import RLock +from uuid import uuid4 from azure.identity import DefaultAzureCredential from azure.mgmt.compute import ComputeManagementClient @@ -23,6 +24,7 @@ ) VM_NAME_MAX_LEN = 64 +UNIQUE_ID_LEN = 4 logger = logging.getLogger(__name__) azure_logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy") @@ -221,10 +223,11 @@ def _create_node(self, node_config, tags, count): config_tags.update(tags) config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name - name_tag = config_tags.get(TAG_RAY_NODE_NAME, "node") - vm_name = "{name}-{id}".format( - name=name_tag, id=self.provider_config["unique_id"] - ) + vm_name = "{node}-{unique_id}-{vm_id}".format( + node=config_tags.get(TAG_RAY_NODE_NAME, "node"), + unique_id=self.provider_config["unique_id"], + vm_id=uuid4().hex[:UNIQUE_ID_LEN], + )[:VM_NAME_MAX_LEN] use_internal_ips = self.provider_config.get("use_internal_ips", False) template_params = node_config["azure_arm_parameters"].copy() @@ -252,7 +255,7 @@ def _create_node(self, node_config, tags, count): ) create_or_update( resource_group_name=resource_group, - deployment_name="ray-vm-{}".format(name_tag), + deployment_name=vm_name, parameters=parameters, ).wait() @@ -311,7 +314,7 @@ def terminate_node(self, node_id): delete = get_azure_sdk_function( client=self.compute_client.virtual_machines, function_name="delete" ) - delete(resource_group_name=resource_group, vm_name=node_id).wait() + delete(resource_group_name=resource_group, vm_name=node_id) except Exception as e: logger.warning("Failed to delete VM: {}".format(e))