diff --git a/eng/common/scripts/stress-testing/deploy-stress-tests.ps1 b/eng/common/scripts/stress-testing/deploy-stress-tests.ps1 index 4181e933e25..bbf1d1d4253 100644 --- a/eng/common/scripts/stress-testing/deploy-stress-tests.ps1 +++ b/eng/common/scripts/stress-testing/deploy-stress-tests.ps1 @@ -10,11 +10,7 @@ param( [switch]$PushImages, [string]$ClusterGroup, [string]$DeployId, - - [Parameter(ParameterSetName = 'DoLogin', Mandatory = $true)] [switch]$Login, - - [Parameter(ParameterSetName = 'DoLogin')] [string]$Subscription, # Default to true in Azure Pipelines environments diff --git a/eng/common/scripts/stress-testing/stress-test-deployment-lib.ps1 b/eng/common/scripts/stress-testing/stress-test-deployment-lib.ps1 index 77df8230532..bafbf77a945 100644 --- a/eng/common/scripts/stress-testing/stress-test-deployment-lib.ps1 +++ b/eng/common/scripts/stress-testing/stress-test-deployment-lib.ps1 @@ -117,12 +117,11 @@ function DeployStressTests( } $clusterGroup = 'rg-stress-cluster-prod' $subscription = 'Azure SDK Test Resources' + } elseif (!$clusterGroup -or !$subscription) { + throw "clusterGroup and subscription parameters must be specified when deploying to an environment that is not pg or prod." } if ($login) { - if (!$clusterGroup -or !$subscription) { - throw "clusterGroup and subscription parameters must be specified when logging into an environment that is not pg or prod." - } Login -subscription $subscription -clusterGroup $clusterGroup -pushImages:$pushImages } @@ -160,7 +159,9 @@ function DeployStressTests( -environment $environment ` -repositoryBase $repository ` -pushImages:$pushImages ` - -login:$login + -login:$login ` + -clusterGroup $clusterGroup ` + -subscription $subscription } if ($FailedCommands.Count -lt $pkgs.Count) { @@ -185,7 +186,9 @@ function DeployStressPackage( [string]$environment, [string]$repositoryBase, [switch]$pushImages, - [switch]$login + [switch]$login, + [string]$clusterGroup, + [string]$subscription ) { $registry = RunOrExitOnFailure az acr list -g $clusterGroup --subscription $subscription -o json $registryName = ($registry | ConvertFrom-Json).name diff --git a/tools/stress-cluster/chaos/README.md b/tools/stress-cluster/chaos/README.md index c91bc4c6ea6..c73547f395e 100644 --- a/tools/stress-cluster/chaos/README.md +++ b/tools/stress-cluster/chaos/README.md @@ -484,15 +484,14 @@ The `stress-test-addons` helm library will handle a scenarios matrix automatical ### Node Size Requirements -The stress test cluster is deployed with several node SKUs (see [agentPoolProfiles declaration and +The stress test cluster may be deployed with several node SKUs (see [agentPoolProfiles declaration and variables](https://github.com/Azure/azure-sdk-tools/blob/main/tools/stress-cluster/cluster/azure/cluster/cluster.bicep)), with tests defaulting to the SKU labeled 'default'. By adding the `nodeSelector` field to the job spec, you can override which nodes the test container will be provisioned to. For support adding a custom or dedicated node SKU, reach out to the EngSys team. Available common SKUs in stress test clusters: -- 'default' - Standard\_D2\_v3 -- 'highMem' - Standard\_D4ds\_v4 +- 'default' - Standard\_D4ds\_v4 To deploy a stress test to a custom node (see also [example](https://github.com/Azure/azure-sdk-tools/blob/main/tools/stress-cluster/chaos/examples/network-stress-example/templates/testjob.yaml)): @@ -500,7 +499,7 @@ To deploy a stress test to a custom node (see also ``` spec: nodeSelector: - sku: 'highMem' + sku: '' containers: < container spec ... > ``` diff --git a/tools/stress-cluster/cluster/README.md b/tools/stress-cluster/cluster/README.md index ce82ca84baa..a51fbadb832 100644 --- a/tools/stress-cluster/cluster/README.md +++ b/tools/stress-cluster/cluster/README.md @@ -55,9 +55,13 @@ Cluster buildout and deployment involves three main steps which are automated in 1. Provision static resources (service principal, role assignments, static keyvault). 1. Provision cluster resources (`main.bicep` entrypoint, standard ARM subscription deployment). + - NOTE: if the nodepool configuration for the AKS cluster needs to be updated, it cannot be done + alongside a deployment to the cluster itself. In order to update the nodepool configuration only, pass + the `-UpdateNodes` parameter to the provision script. 1. Provision stress infrastructures resources into the Azure Kubernetes Service cluster via helm (`./kubernetes/stress-infrastructure` helm chart). + ## Dev Cluster First, update the `./azure/parameters/dev.json` parameters file with the values marked `// add me`, then run: diff --git a/tools/stress-cluster/cluster/azure/cluster/acr.bicep b/tools/stress-cluster/cluster/azure/cluster/acr.bicep index 19509e01269..227ab67e17a 100644 --- a/tools/stress-cluster/cluster/azure/cluster/acr.bicep +++ b/tools/stress-cluster/cluster/azure/cluster/acr.bicep @@ -19,19 +19,19 @@ resource registry 'Microsoft.ContainerRegistry/registries@2019-12-01-preview' = // Add AcrPush and AcrPull roles to access groups resource acrPushRole 'Microsoft.Authorization/roleAssignments@2020-04-01-preview' = [for objectId in objectIds: { - name: '${guid('azureContainerRegistryPushRole', objectId, resourceGroup().id)}' + name: guid('azureContainerRegistryPushRole', objectId, resourceGroup().id) scope: registry properties: { - roleDefinitionId: '${subscriptionResourceId('Microsoft.Authorization/roleDefinitions', '8311e382-0749-4cb8-b61a-304f252e45ec')}' + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', '8311e382-0749-4cb8-b61a-304f252e45ec') principalId: objectId } }] resource acrPullRole 'Microsoft.Authorization/roleAssignments@2020-04-01-preview' = [for objectId in objectIds: { - name: '${guid('azureContainerRegistryPullRole', objectId, resourceGroup().id)}' + name: guid('azureContainerRegistryPullRole', objectId, resourceGroup().id) scope: registry properties: { - roleDefinitionId: '${subscriptionResourceId('Microsoft.Authorization/roleDefinitions', '7f951dda-4ed3-4680-a7ca-43fe172d538d')}' + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', '7f951dda-4ed3-4680-a7ca-43fe172d538d') principalId: objectId } }] diff --git a/tools/stress-cluster/cluster/azure/cluster/cluster.bicep b/tools/stress-cluster/cluster/azure/cluster/cluster.bicep index 123038d9ebc..bd6b3d8f4c1 100644 --- a/tools/stress-cluster/cluster/azure/cluster/cluster.bicep +++ b/tools/stress-cluster/cluster/azure/cluster/cluster.bicep @@ -4,53 +4,54 @@ param groupSuffix string param dnsPrefix string = 's1' param clusterName string param location string = resourceGroup().location -param enableHighMemAgentPool bool = false +// AKS does not allow agentPool updates via existing managed cluster resources +param updateNodes bool = false // monitoring parameters param workspaceId string -var kubernetesVersion = '1.24.3' +var kubernetesVersion = '1.25.4' var nodeResourceGroup = 'rg-nodes-${dnsPrefix}-${clusterName}-${groupSuffix}' -var defaultAgentPool = { - name: 'default' - count: 3 - minCount: 3 - maxCount: 9 +var systemAgentPool = { + name: 'system' + count: 1 + minCount: 1 + maxCount: 4 mode: 'System' - vmSize: 'Standard_D2_v3' + vmSize: 'Standard_D4ds_v4' type: 'VirtualMachineScaleSets' osType: 'Linux' enableAutoScaling: true enableEncryptionAtHost: true nodeLabels: { - 'sku': 'default' + sku: 'system' } } -var highMemAgentPool = { - name: 'highmemory' - count: 1 - minCount: 1 - maxCount: 3 - mode: 'System' +var defaultAgentPool = { + name: 'default' + count: 3 + minCount: 5 + maxCount: 24 + mode: 'User' vmSize: 'Standard_D4ds_v4' type: 'VirtualMachineScaleSets' osType: 'Linux' + osDiskType: 'Ephemeral' enableAutoScaling: true enableEncryptionAtHost: true nodeLabels: { - 'sku': 'highMem' + sku: 'default' } } -var agentPools = concat([ - defaultAgentPool - ], enableHighMemAgentPool ? [ - highMemAgentPool - ] : []) +var agentPools = [ + systemAgentPool + defaultAgentPool +] -resource cluster 'Microsoft.ContainerService/managedClusters@2020-09-01' = { +resource newCluster 'Microsoft.ContainerService/managedClusters@2022-09-02-preview' = if (!updateNodes) { name: clusterName location: location tags: tags @@ -83,14 +84,39 @@ resource cluster 'Microsoft.ContainerService/managedClusters@2020-09-01' = { } } +resource existingCluster 'Microsoft.ContainerService/managedClusters@2022-09-02-preview' existing = if (updateNodes) { + name: clusterName +} + +// Workaround for duplicate variable names when conditionals are in use +// See https://github.com/Azure/bicep/issues/1410 +var cluster = updateNodes ? existingCluster : newCluster + +resource pools 'Microsoft.ContainerService/managedClusters/agentPools@2022-09-02-preview' = [for pool in agentPools: if (updateNodes) { + parent: existingCluster + name: pool.name + properties: { + count: pool.count + minCount: pool.minCount + maxCount: pool.maxCount + mode: pool.mode + vmSize: pool.vmSize + type: pool.type + osType: pool.osType + enableAutoScaling: pool.enableAutoScaling + // enableEncryptionAtHost: pool.enableEncryptionAtHost + nodeLabels: pool.nodeLabels + } +}] + // Add Monitoring Metrics Publisher role to omsagent identity. Required to publish metrics data to // cluster resource container insights. // https://docs.microsoft.com/azure/azure-monitor/containers/container-insights-update-metrics -resource metricsPublisher 'Microsoft.Authorization/roleAssignments@2020-04-01-preview' = { - name: '${guid('monitoringMetricsPublisherRole', resourceGroup().id)}' - scope: cluster +resource metricsPublisher 'Microsoft.Authorization/roleAssignments@2020-04-01-preview' = if (!updateNodes) { + name: guid('monitoringMetricsPublisherRole', resourceGroup().id) + scope: newCluster properties: { - roleDefinitionId: '${subscriptionResourceId('Microsoft.Authorization/roleDefinitions', '3913510d-42f4-4e42-8a64-420c390055eb')}' + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', '3913510d-42f4-4e42-8a64-420c390055eb') // NOTE: using objectId over clientId seems to handle cross-region propagation delays better for newly created identities principalId: cluster.properties.addonProfiles.omsagent.identity.objectId } @@ -99,4 +125,4 @@ resource metricsPublisher 'Microsoft.Authorization/roleAssignments@2020-04-01-pr output secretProviderObjectId string = cluster.properties.addonProfiles.azureKeyvaultSecretsProvider.identity.objectId output secretProviderClientId string = cluster.properties.addonProfiles.azureKeyvaultSecretsProvider.identity.clientId output kubeletIdentityObjectId string = cluster.properties.identityProfile.kubeletidentity.objectId -output clusterName string = cluster.name +output clusterName string = clusterName diff --git a/tools/stress-cluster/cluster/azure/main.bicep b/tools/stress-cluster/cluster/azure/main.bicep index b3e256dba66..878569287c9 100644 --- a/tools/stress-cluster/cluster/azure/main.bicep +++ b/tools/stress-cluster/cluster/azure/main.bicep @@ -4,11 +4,12 @@ param subscriptionId string = '' param groupSuffix string param clusterName string param clusterLocation string = 'westus3' -param staticTestSecretsKeyvaultName string -param staticTestSecretsKeyvaultGroup string +param staticTestKeyvaultName string +param staticTestKeyvaultGroup string param monitoringLocation string = 'centralus' param tags object -param enableHighMemAgentPool bool = false +// AKS does not allow agentPool updates via existing managed cluster resources +param updateNodes bool = false // Azure Developer Platform Team Group // https://ms.portal.azure.com/#blade/Microsoft_AAD_IAM/GroupDetailsMenuBlade/Overview/groupId/56709ad9-8962-418a-ad0d-4b25fa962bae @@ -52,6 +53,7 @@ module test_dashboard 'monitoring/stress-test-workbook.bicep' = { scope: group params: { workbookDisplayName: 'Azure SDK Stress Testing - ${groupSuffix}' + location: clusterLocation logAnalyticsResource: logWorkspace.outputs.id } } @@ -61,6 +63,7 @@ module status_dashboard 'monitoring/stress-status-workbook.bicep' = { scope: group params: { workbookDisplayName: 'Stress Status - ${groupSuffix}' + location: clusterLocation logAnalyticsResource: logWorkspace.outputs.id } } @@ -69,10 +72,11 @@ module cluster 'cluster/cluster.bicep' = { name: 'cluster' scope: group params: { + updateNodes: updateNodes + location: clusterLocation clusterName: clusterName tags: tags groupSuffix: groupSuffix - enableHighMemAgentPool: enableHighMemAgentPool workspaceId: logWorkspace.outputs.id } } @@ -88,13 +92,13 @@ module containerRegistry 'cluster/acr.bicep' = { } module storage 'cluster/storage.bicep' = { - name: 'storage' - scope: group - params: { - storageName: 'stressdebug${resourceSuffix}' - fileShareName: 'stressfiles${resourceSuffix}' - location: clusterLocation - } + name: 'storage' + scope: group + params: { + storageName: 'stressdebug${resourceSuffix}' + fileShareName: 'stressfiles${resourceSuffix}' + location: clusterLocation + } } var appInsightsInstrumentationKeySecretName = 'appInsightsInstrumentationKey-${resourceSuffix}' @@ -109,9 +113,9 @@ var appInsightsConnectionStringSecretValue = 'APPLICATIONINSIGHTS_CONNECTION_STR // See https://docs.microsoft.com/azure/aks/azure-files-volume#create-a-kubernetes-secret // See https://docs.microsoft.com/azure/aks/azure-files-csi var debugStorageKeySecretName = 'debugStorageKey-${resourceSuffix}' -var debugStorageKeySecretValue = '${storage.outputs.key}' +var debugStorageKeySecretValue = storage.outputs.key var debugStorageAccountSecretName = 'debugStorageAccount-${resourceSuffix}' -var debugStorageAccountSecretValue = '${storage.outputs.name}' +var debugStorageAccountSecretValue = storage.outputs.name module keyvault 'cluster/keyvault.bicep' = { name: 'keyvault' @@ -146,15 +150,15 @@ module keyvault 'cluster/keyvault.bicep' = { module accessPolicy 'cluster/static-vault-access-policy.bicep' = { name: 'accessPolicy' - scope: resourceGroup(staticTestSecretsKeyvaultGroup) + scope: resourceGroup(staticTestKeyvaultGroup) params: { - vaultName: staticTestSecretsKeyvaultName + vaultName: staticTestKeyvaultName tenantId: subscription().tenantId objectId: cluster.outputs.secretProviderObjectId } } -output STATIC_TEST_SECRETS_KEYVAULT string = staticTestSecretsKeyvaultName +output STATIC_TEST_SECRETS_KEYVAULT string = staticTestKeyvaultName output CLUSTER_TEST_SECRETS_KEYVAULT string = keyvault.outputs.keyvaultName output SECRET_PROVIDER_CLIENT_ID string = cluster.outputs.secretProviderClientId output CLUSTER_NAME string = cluster.outputs.clusterName diff --git a/tools/stress-cluster/cluster/azure/monitoring/stress-status-workbook.bicep b/tools/stress-cluster/cluster/azure/monitoring/stress-status-workbook.bicep index 4bce7d4b2f0..ab082304b8c 100644 --- a/tools/stress-cluster/cluster/azure/monitoring/stress-status-workbook.bicep +++ b/tools/stress-cluster/cluster/azure/monitoring/stress-status-workbook.bicep @@ -1,4 +1,5 @@ param logAnalyticsResource string +param location string = resourceGroup().location @description('The friendly name for the workbook that is used in the Gallery or Saved List. This name must be unique within a resource group.') param workbookDisplayName string @@ -233,7 +234,7 @@ var workbookContent = { resource workbookId_resource 'microsoft.insights/workbooks@2021-03-08' = { name: workbookId - location: resourceGroup().location + location: location kind: 'shared' properties: { displayName: workbookDisplayName diff --git a/tools/stress-cluster/cluster/azure/monitoring/stress-test-workbook.bicep b/tools/stress-cluster/cluster/azure/monitoring/stress-test-workbook.bicep index 34fa06e406d..b36b25ec0a3 100644 --- a/tools/stress-cluster/cluster/azure/monitoring/stress-test-workbook.bicep +++ b/tools/stress-cluster/cluster/azure/monitoring/stress-test-workbook.bicep @@ -1,4 +1,5 @@ param logAnalyticsResource string +param location string = resourceGroup().location @description('The friendly name for the workbook that is used in the Gallery or Saved List. This name must be unique within a resource group.') param workbookDisplayName string @@ -308,7 +309,7 @@ var workbookContent = { resource workbookId_resource 'microsoft.insights/workbooks@2021-03-08' = { name: workbookId - location: resourceGroup().location + location: location kind: 'shared' properties: { displayName: workbookDisplayName diff --git a/tools/stress-cluster/cluster/azure/parameters/dev.json b/tools/stress-cluster/cluster/azure/parameters/dev.json index 621340039fd..23af8e5592f 100644 --- a/tools/stress-cluster/cluster/azure/parameters/dev.json +++ b/tools/stress-cluster/cluster/azure/parameters/dev.json @@ -14,10 +14,10 @@ "clusterLocation": { "value": "westus2" }, - "staticTestSecretsKeyvaultName": { + "staticTestKeyvaultName": { "value": // add me, e.g. stress-secrets- }, - "staticTestSecretsKeyvaultGroup": { + "staticTestKeyvaultGroup": { "value": // add me, e.g. rg-stress-secrets- }, "tags": { diff --git a/tools/stress-cluster/cluster/azure/parameters/pg.json b/tools/stress-cluster/cluster/azure/parameters/pg.json index 8ba5c52d4a6..e1ce19e6e4d 100644 --- a/tools/stress-cluster/cluster/azure/parameters/pg.json +++ b/tools/stress-cluster/cluster/azure/parameters/pg.json @@ -14,15 +14,12 @@ "clusterLocation": { "value": "westus3" }, - "staticTestSecretsKeyvaultName": { + "staticTestKeyvaultName": { "value": "stress-secrets-pg" }, - "staticTestSecretsKeyvaultGroup": { + "staticTestKeyvaultGroup": { "value": "rg-stress-secrets-pg" }, - "enableHighMemAgentPool": { - "value": true - }, "tags": { "value": { "environment": "pg", diff --git a/tools/stress-cluster/cluster/azure/parameters/prod.json b/tools/stress-cluster/cluster/azure/parameters/prod.json index f39490aed31..6800e78c30f 100644 --- a/tools/stress-cluster/cluster/azure/parameters/prod.json +++ b/tools/stress-cluster/cluster/azure/parameters/prod.json @@ -17,10 +17,10 @@ "monitoringLocation": { "value": "centralus" }, - "staticTestSecretsKeyvaultName": { + "staticTestKeyvaultName": { "value": "stress-secrets-prod" }, - "staticTestSecretsKeyvaultGroup": { + "staticTestKeyvaultGroup": { "value": "rg-stress-secrets-prod" }, "tags": { diff --git a/tools/stress-cluster/cluster/kubernetes/stress-infrastructure/templates/stresswatcher.yaml b/tools/stress-cluster/cluster/kubernetes/stress-infrastructure/templates/stresswatcher.yaml index 89bad7cdaaf..dae909fde90 100644 --- a/tools/stress-cluster/cluster/kubernetes/stress-infrastructure/templates/stresswatcher.yaml +++ b/tools/stress-cluster/cluster/kubernetes/stress-infrastructure/templates/stresswatcher.yaml @@ -18,7 +18,7 @@ spec: app: stress-watcher spec: nodeSelector: - sku: 'default' + sku: 'system' initContainers: # Init container template for injecting secrets # (e.g. app insights instrumentation key, azure client credentials) diff --git a/tools/stress-cluster/cluster/provision.ps1 b/tools/stress-cluster/cluster/provision.ps1 index 2660d362472..f773d8ac260 100644 --- a/tools/stress-cluster/cluster/provision.ps1 +++ b/tools/stress-cluster/cluster/provision.ps1 @@ -3,6 +3,8 @@ param ( [string]$Environment = 'dev', [string]$Namespace = 'stress-infra', [switch]$Development = $false, + # If provisioning an existing cluster and updating nodes, it must be done exclusively + [switch]$UpdateNodes = $false, [Parameter(ParameterSetName = 'Provisioner', Mandatory = $true)] [ValidateNotNullOrEmpty()] @@ -88,19 +90,19 @@ function DeployStaticResources([hashtable]$params) $formattedTags = $formattedTags -join ' ' RunOrExitOnFailure az group create ` - -n $params.staticTestSecretsKeyvaultGroup ` + -n $params.staticTestKeyvaultGroup ` -l $params.clusterLocation ` --subscription $params.subscriptionId ` --tags $formattedTags $kv = Run az keyvault show ` - -n $params.staticTestSecretsKeyvaultName ` - -g $params.staticTestSecretsKeyvaultGroup ` + -n $params.staticTestKeyvaultName ` + -g $params.staticTestKeyvaultGroup ` --subscription $params.subscriptionId if (!$kv) { RunOrExitOnFailure az keyvault create ` - -n $params.staticTestSecretsKeyvaultName ` - -g $params.staticTestSecretsKeyvaultGroup ` + -n $params.staticTestKeyvaultName ` + -g $params.staticTestKeyvaultGroup ` --subscription $params.subscriptionId } @@ -125,7 +127,7 @@ function DeployStaticResources([hashtable]$params) --scopes "/subscriptions/$($params.subscriptionId)" $spInfo = $sp | ConvertFrom-Json # Force check to see if the service principal was succesfully created and propagated - $oid = (RunOrExitOnFailure az ad sp show -o json --id $spInfo.appId | ConvertFrom-Json).objectId + $oid = (RunOrExitOnFailure az ad sp show -o json --id $spInfo.appId | ConvertFrom-Json).id $credentials = @{ AZURE_CLIENT_ID = $spInfo.appId @@ -141,7 +143,7 @@ function DeployStaticResources([hashtable]$params) $envFile = Join-Path ([System.IO.Path]::GetTempPath()) "/static.env" $dotenv = $credentials.GetEnumerator() | ForEach-Object { "$($_.Key)=$($_.Value)`n" } (-join $dotenv) | Out-File $envFile - Run az keyvault secret set --vault-name $params.staticTestSecretsKeyvaultName --file $envFile -n $STATIC_TEST_DOTENV_NAME + Run az keyvault secret set --vault-name $params.staticTestKeyvaultName --file $envFile -n $STATIC_TEST_DOTENV_NAME if (Test-Path $envFile) { Remove-Item -Force $envFile } @@ -207,7 +209,8 @@ function DeployClusterResources([hashtable]$params) -l $params.clusterLocation ` -f $PSScriptRoot/azure/main.bicep ` --parameters $PSScriptRoot/azure/parameters/$Environment.json ` - --parameters groupName=$STRESS_CLUSTER_RESOURCE_GROUP + --parameters groupName=$STRESS_CLUSTER_RESOURCE_GROUP ` + --parameters updateNodes=$UpdateNodes SetEnvOutputs $params