From dd1c8ca2a8a561f93b50bf2eaad08f32a3fe8aa3 Mon Sep 17 00:00:00 2001 From: Ben Broderick Phillips Date: Fri, 22 Oct 2021 18:08:10 -0400 Subject: [PATCH] Fully automate stress cluster buildout and add support for azure file share mounting (#2106) - Fully automate cluster buildout. Add azure file share mount to stress tests. - Moving the test/ad-hoc cluster back to the playground subscription - Upgrading kubernetes cluster version to 1.21.x to pull in support for the azure csi file driver - Adding high memory agent nodes to the base deployment - Enabling node autoscaler in the base deployment - Publish stress watcher image in CI. Run docker build on PR - Using common image location across stress clusters to simplify buildout+deployment - Add stress test debug file share usage example Resolves #1903 --- .../stress-testing/deploy-stress-tests.ps1 | 4 +- eng/containers/ci.yml | 21 +- tools/stress-cluster/chaos/README.md | 29 ++- .../network-stress-example/Chart.lock | 6 +- .../network-stress-example/Chart.yaml | 2 +- .../stress-debug-share-example/Chart.lock | 6 + .../stress-debug-share-example/Chart.yaml | 14 ++ .../templates/debug-share-job.yaml | 23 +++ .../stress-deployment-example/Chart.lock | 6 +- .../stress-deployment-example/Chart.yaml | 2 +- .../stress-test-resources.json | 4 +- tools/stress-cluster/cluster/README.md | 194 +++++++++--------- .../cluster/azure/cluster/cluster.bicep | 60 ++++-- .../cluster/azure/cluster/storage.bicep | 21 ++ tools/stress-cluster/cluster/azure/main.bicep | 43 +++- .../cluster/azure/parameters/dev.json | 12 +- .../cluster/azure/parameters/prod.json | 7 +- .../cluster/azure/parameters/test.json | 15 +- .../templates/stresswatcher.yaml | 2 +- .../stress-infrastructure/values.yaml | 1 - .../kubernetes/stress-test-addons/Chart.yaml | 2 +- .../kubernetes/stress-test-addons/index.yaml | 11 +- .../templates/_container_env.tpl | 14 ++ .../templates/_debug_file_volumes.tpl | 7 + .../templates/_deploy_configmap.tpl | 2 +- 
.../templates/_deploy_volumes.tpl | 4 +- .../templates/_init_deploy.tpl | 4 +- .../templates/_init_env.tpl | 5 +- .../templates/_stress_test.tpl | 17 ++ .../stress-test-cluster-secret-provider.yaml | 16 ++ .../kubernetes/stress-test-addons/values.yaml | 32 ++- tools/stress-cluster/cluster/provision.ps1 | 146 +++++++++++++ 32 files changed, 569 insertions(+), 163 deletions(-) create mode 100644 tools/stress-cluster/chaos/examples/stress-debug-share-example/Chart.lock create mode 100644 tools/stress-cluster/chaos/examples/stress-debug-share-example/Chart.yaml create mode 100644 tools/stress-cluster/chaos/examples/stress-debug-share-example/templates/debug-share-job.yaml create mode 100644 tools/stress-cluster/cluster/azure/cluster/storage.bicep create mode 100644 tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_debug_file_volumes.tpl create mode 100644 tools/stress-cluster/cluster/provision.ps1 diff --git a/eng/common/scripts/stress-testing/deploy-stress-tests.ps1 b/eng/common/scripts/stress-testing/deploy-stress-tests.ps1 index 76eff6a31e1..02e91cc843a 100644 --- a/eng/common/scripts/stress-testing/deploy-stress-tests.ps1 +++ b/eng/common/scripts/stress-testing/deploy-stress-tests.ps1 @@ -76,9 +76,9 @@ function DeployStressTests( [string]$environment = 'test', [string]$repository = 'images', [boolean]$pushImages = $false, - [string]$clusterGroup = 'rg-stress-test-cluster-', + [string]$clusterGroup = 'rg-stress-cluster-test', [string]$deployId = 'local', - [string]$subscription = 'Azure SDK Test Resources' + [string]$subscription = 'Azure SDK Developer Playground' ) { if ($PSCmdlet.ParameterSetName -eq 'DoLogin') { Login $subscription $clusterGroup $pushImages diff --git a/eng/containers/ci.yml b/eng/containers/ci.yml index 73f50d2b920..5b699191c7c 100644 --- a/eng/containers/ci.yml +++ b/eng/containers/ci.yml @@ -22,6 +22,12 @@ parameters: dockerFile: 'tools/test-proxy/docker/dockerfile-win' stableTags: - 'latest' + - name: stress_watcher + 
pool: 'ubuntu-20.04' + dockerRepo: 'stress/watcher' + dockerFile: 'tools/stress-cluster/services/Stress.Watcher/Dockerfile' + stableTags: + - 'latest' trigger: branches: @@ -32,8 +38,18 @@ trigger: - eng/containers/ - tools/test-proxy/docker/ - tools/keyvault-mock-attestation/Dockerfile + - tools/stress-cluster/services/Stress.Watcher/Dockerfile -pr: none +pr: + branches: + include: + - main + paths: + include: + - eng/containers/ + - tools/test-proxy/docker/ + - tools/keyvault-mock-attestation/Dockerfile + - tools/stress-cluster/services/Stress.Watcher/Dockerfile variables: - name: containerRegistry @@ -64,6 +80,7 @@ jobs: - task: Docker@2 displayName: Push ${{ config.name }}:$(imageTag) + condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest')) inputs: containerRegistry: $(containerRegistry) repository: ${{ config.dockerRepo }} @@ -81,6 +98,8 @@ jobs: - task: Docker@2 displayName: Push ${{ config.name }}:${{ stableTag }} + condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest')) + inputs: containerRegistry: $(containerRegistry) repository: ${{ config.dockerRepo }} diff --git a/tools/stress-cluster/chaos/README.md b/tools/stress-cluster/chaos/README.md index c3fd82e5b9d..b1b82d6a902 100644 --- a/tools/stress-cluster/chaos/README.md +++ b/tools/stress-cluster/chaos/README.md @@ -10,6 +10,7 @@ The chaos environment is an AKS cluster (Azure Kubernetes Service) with several * [Creating a Stress Test](#creating-a-stress-test) * [Layout](#layout) * [Stress Test Secrets](#stress-test-secrets) + * [Stress Test File Share](#stress-test-file-share) * [Stress Test Azure Resources](#stress-test-azure-resources) * [Helm Chart Dependencies](#helm-chart-dependencies) * [Job Manifest](#job-manifest) @@ -41,12 +42,14 @@ You will need the following tools to create and run tests: ## Access -To access the cluster, run the following: +To access the cluster, run the following. 
These commands are unnecessary for stress test deployment but can be useful +for verifying permissions and directly interacting with containers via the kubernetes command line tool `kubectl`. For +running the build and deployment script, see [Deploying a Stress Test](#deploying-a-stress-test). ``` az login # Download the kubeconfig for the cluster -az aks get-credentials --subscription "Azure SDK Test Resources" -g rg-stress-test-cluster- -n stress-test +az aks get-credentials --subscription "Azure SDK Developer Playground" -g rg-stress-cluster-test -n stress-test ``` You should now be able to access the cluster. To verify, you should see a list of namespaces when running the command: @@ -198,6 +201,28 @@ APPINSIGHTS_INSTRUMENTATIONKEY= RESOURCE_GROUP= ``` +### Stress Test File Share + +Stress tests are encouraged to use app insights logs and metrics as much as possible for diagnostics. However, there +are times when larger files (such as verbose logs, heap dumps, packet captures, etc.) need to be persisted for +a duration longer than the lifespan of the test itself. + +All stress tests have an azure file share automatically mounted into the container by default. The path to this share +is available via the environment variable `$DEBUG_SHARE` and is global to all tests in the cluster. +The `$DEBUG_SHARE` path includes the namespace and pod name of the test in order to avoid path overlaps with other +tests. The `$DEBUG_SHARE_ROOT` path is also available, which points to the root of the file share, but this directory +should only be used in special circumstances and with caution. + +NOTE: The share directory path MUST be created by the test before using it. + +After writing debug files to the share, the files can be viewed by navigating to the [file share +portal](https://aka.ms/azsdk/stress/share), +selecting the `namespace/` directory, and clicking the download link for any files in that directory. 
+ +See +[stress-debug-share-example](https://github.com/Azure/azure-sdk-tools/tree/main/tools/stress-cluster/chaos/examples/stress-debug-share-example) +for example usage. + ### Stress Test Azure Resources Stress test resources can either be defined as azure bicep files, or an ARM template directly, provided there is diff --git a/tools/stress-cluster/chaos/examples/network-stress-example/Chart.lock b/tools/stress-cluster/chaos/examples/network-stress-example/Chart.lock index 19f736aa462..f807d482b51 100644 --- a/tools/stress-cluster/chaos/examples/network-stress-example/Chart.lock +++ b/tools/stress-cluster/chaos/examples/network-stress-example/Chart.lock @@ -1,6 +1,6 @@ dependencies: - name: stress-test-addons repository: https://stresstestcharts.blob.core.windows.net/helm/ - version: 0.1.6 -digest: sha256:b97697ef5f303eec43e9a94fca8e312d20b8aed71318250499344aeca9880d31 -generated: "2021-08-16T12:57:01.466377-04:00" + version: 0.1.9 +digest: sha256:2a32027871497958af15562a675bad47f4e29523cb18a91ce17b5078eaf9bbdf +generated: "2021-10-15T13:37:14.6487529-04:00" diff --git a/tools/stress-cluster/chaos/examples/network-stress-example/Chart.yaml b/tools/stress-cluster/chaos/examples/network-stress-example/Chart.yaml index 230a4616630..3fd648acf96 100644 --- a/tools/stress-cluster/chaos/examples/network-stress-example/Chart.yaml +++ b/tools/stress-cluster/chaos/examples/network-stress-example/Chart.yaml @@ -10,5 +10,5 @@ annotations: dependencies: - name: stress-test-addons - version: 0.1.7 + version: 0.1.9 repository: https://stresstestcharts.blob.core.windows.net/helm/ diff --git a/tools/stress-cluster/chaos/examples/stress-debug-share-example/Chart.lock b/tools/stress-cluster/chaos/examples/stress-debug-share-example/Chart.lock new file mode 100644 index 00000000000..8d01e1211f8 --- /dev/null +++ b/tools/stress-cluster/chaos/examples/stress-debug-share-example/Chart.lock @@ -0,0 +1,6 @@ +dependencies: +- name: stress-test-addons + repository: 
https://stresstestcharts.blob.core.windows.net/helm/ + version: 0.1.9 +digest: sha256:2a32027871497958af15562a675bad47f4e29523cb18a91ce17b5078eaf9bbdf +generated: "2021-10-15T13:23:41.8857818-04:00" diff --git a/tools/stress-cluster/chaos/examples/stress-debug-share-example/Chart.yaml b/tools/stress-cluster/chaos/examples/stress-debug-share-example/Chart.yaml new file mode 100644 index 00000000000..467c5defefc --- /dev/null +++ b/tools/stress-cluster/chaos/examples/stress-debug-share-example/Chart.yaml @@ -0,0 +1,14 @@ +apiVersion: v2 +name: debug-share-example +description: An example stress test chart that uses a file share for debugging (e.g. for large log files, heap dumps) +version: 0.1.1 +appVersion: v0.1 +annotations: + stressTest: 'true' # enable auto-discovery of this test via `find-all-stress-packages.ps1` + example: 'true' # enable auto-discovery filtering `find-all-stress-packages.ps1 -filters @{example='true'}` + namespace: 'examples' + +dependencies: +- name: stress-test-addons + version: 0.1.9 + repository: https://stresstestcharts.blob.core.windows.net/helm/ diff --git a/tools/stress-cluster/chaos/examples/stress-debug-share-example/templates/debug-share-job.yaml b/tools/stress-cluster/chaos/examples/stress-debug-share-example/templates/debug-share-job.yaml new file mode 100644 index 00000000000..d85ad4de79a --- /dev/null +++ b/tools/stress-cluster/chaos/examples/stress-debug-share-example/templates/debug-share-job.yaml @@ -0,0 +1,23 @@ +{{- include "stress-test-addons.env-job-template.from-pod" (list . 
"stress.deploy-example") -}} +{{- define "stress.deploy-example" -}} +metadata: + labels: + testName: "debug-share-example" +spec: + containers: + - name: debug-share-example + image: busybox + command: ['sh', '-c'] + args: + - | + set -ex; + mkdir -p $DEBUG_SHARE; + cd $DEBUG_SHARE; + pwd; + ls -R $DEBUG_SHARE_ROOT; + echo "debug share example success" > success; + cat success; + # The file share is mounted by default at the path $DEBUG_SHARE + # when including the container-env template + {{- include "stress-test-addons.container-env" . | nindent 6 }} +{{- end -}} diff --git a/tools/stress-cluster/chaos/examples/stress-deployment-example/Chart.lock b/tools/stress-cluster/chaos/examples/stress-deployment-example/Chart.lock index 437b3f52997..b95077bb56a 100644 --- a/tools/stress-cluster/chaos/examples/stress-deployment-example/Chart.lock +++ b/tools/stress-cluster/chaos/examples/stress-deployment-example/Chart.lock @@ -1,6 +1,6 @@ dependencies: - name: stress-test-addons repository: https://stresstestcharts.blob.core.windows.net/helm/ - version: 0.1.6 -digest: sha256:b97697ef5f303eec43e9a94fca8e312d20b8aed71318250499344aeca9880d31 -generated: "2021-08-13T17:24:51.4285458-04:00" + version: 0.1.9 +digest: sha256:2a32027871497958af15562a675bad47f4e29523cb18a91ce17b5078eaf9bbdf +generated: "2021-10-18T17:44:55.9281601-04:00" diff --git a/tools/stress-cluster/chaos/examples/stress-deployment-example/Chart.yaml b/tools/stress-cluster/chaos/examples/stress-deployment-example/Chart.yaml index 09d5be72dc5..1576e2cb4d8 100644 --- a/tools/stress-cluster/chaos/examples/stress-deployment-example/Chart.yaml +++ b/tools/stress-cluster/chaos/examples/stress-deployment-example/Chart.yaml @@ -10,5 +10,5 @@ annotations: dependencies: - name: stress-test-addons - version: 0.1.7 + version: 0.1.9 repository: https://stresstestcharts.blob.core.windows.net/helm/ diff --git a/tools/stress-cluster/chaos/examples/stress-deployment-example/stress-test-resources.json 
b/tools/stress-cluster/chaos/examples/stress-deployment-example/stress-test-resources.json index 39a7199b56a..0a923a1f510 100644 --- a/tools/stress-cluster/chaos/examples/stress-deployment-example/stress-test-resources.json +++ b/tools/stress-cluster/chaos/examples/stress-deployment-example/stress-test-resources.json @@ -4,8 +4,8 @@ "metadata": { "_generator": { "name": "bicep", - "version": "0.4.63.48766", - "templateHash": "13987799099034517242" + "version": "0.4.613.9944", + "templateHash": "9940417978769654920" } }, "parameters": { diff --git a/tools/stress-cluster/cluster/README.md b/tools/stress-cluster/cluster/README.md index c957217ddbd..d01dd35bf9e 100644 --- a/tools/stress-cluster/cluster/README.md +++ b/tools/stress-cluster/cluster/README.md @@ -1,160 +1,160 @@ -This directory contains [Azure Bicep](https://docs.microsoft.com/en-us/azure/azure-resource-manager/bicep/overview) +Table of Contents +* [Layout](#layout) +* [Dependencies](#dependencies) +* [Deploying Cluster(s)](#deploying-clusters) + * [Dev Cluster](#dev-cluster) + * [Test Cluster](#test-cluster) + * [Prod Cluster](#prod-cluster) + * [Local Cluster](#local-cluster) +* [Development](#development) + * [Bicep templates](#bicep-templates) + * [Helm templates](#helm-templates) + + +# Layout + +This directory contains all configuration used for stress test cluster buildout (azure and kubernetes buildout), as well +as a set of common stress test config boilerplate (helm library). + +The `./azure` directory contains [Azure Bicep](https://docs.microsoft.com/en-us/azure/azure-resource-manager/bicep/overview) files for deploying Azure resources (mainly [AKS clusters](https://azure.microsoft.com/en-us/services/kubernetes-service/) to support stress testing (for dev/test and/or production). Azure Bicep comes pre-installed with the Azure CLI, and is a DSL for generating ARM templates. 
+The `./kubernetes/stress-infrastructure` directory contains a helm chart for deploying the core services +that must be installed into any stress cluster: chaos-mesh (for chaos) and stress-watcher (for event handling like chaos +resource start and resource group cleanup). + +The `./kubernetes/stress-test-addons` directory contains a [library chart](https://helm.sh/docs/topics/library_charts/) +for use by stress test packages. This common set of config boilerplate simplifies stress test authoring, and makes it +easier to make and roll out config changes to tests across repos by using helm chart dependency versioning. + + # Dependencies +- [Powershell Core](https://docs.microsoft.com/en-us/powershell/scripting/install/installing-powershell-core-on-linux?view=powershell-7.1#ubuntu-2004) - [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli) - - If using app insights, install the az extension: `az extension add --name application-insights` -- [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) (if accessing clusters) -- [helm](https://helm.sh) (if installing stress infrastructure) +- [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) +- [helm](https://helm.sh) - [kind](https://github.com/kubernetes-sigs/kind/releases) (if testing locally) - [Docker](https://docs.docker.com/get-docker/) (if deploying/testing locally) -# Cluster Deployment Quick Start -## Deploying a Dev Cluster +# Deploying Cluster(s) -First, update the `./azure/parameters/dev.json` parameters file with the values marked `// add me`, then: +The cluster-specific configurations can be found at `./azure/parameters/.json`. 
-``` -az deployment sub create -o json -n -l westus -f ./azure/main.bicep --parameters ./azure/parameters/dev.json - -# wait until resource group and AKS cluster are deployed -az aks get-credentials stress-azuresdk -g rg-stress-test-cluster- -``` +Almost all stress test infrastructure is local to the cluster resource group, including storage accounts, keyvaults, +log workspaces and the AKS cluster. There is also a set of static resources, including a subscription service principal +and a keyvault containing the credential configuration. These are shared across clusters located in the same subscription +and are provisioned independently of the bicep templates. -## Deploying a Local Cluster +Cluster buildout and deployment involves three main steps which are automated in `./provision.ps1`: -NOTE: Chaos-Mesh may not work on all local deployments (e.g. Docker Desktop on Windows via WSL). -It may be easier to test services, manifests and containers locally with KIND, and test chaos -in an Azure AKS cluster (shared or personal). +1. Provision static resources (service principal, role assignments, static keyvault). +1. Provision cluster resources (`main.bicep` entrypoint, standard ARM subscription deployment). +1. Provision stress infrastructure resources into the Azure Kubernetes Service cluster via helm + (`./kubernetes/stress-infrastructure` helm chart). -``` -# Ensure docker is running -kind create cluster -``` +## Dev Cluster -## Deploying Stress Infrastructure into Cluster +First, update the `./azure/parameters/dev.json` parameters file with the values marked `// add me`, then run: ``` -helm repo add chaos-mesh https://charts.chaos-mesh.org -helm dependency update ./kubernetes/stress-infrastructure -helm install stress-infra -n stress-infra --create-namespace ./kubernetes/stress-infrastructure +./provision.ps1 -env dev ``` +To deploy stress test packages to the dev environment +(e.g. 
the [examples](https://github.com/Azure/azure-sdk-tools/tree/main/tools/stress-cluster/chaos/examples)), pass in `-Environment dev` (see below). +The provision script will update the `./kubernetes/stress-test-addons/values.yaml` file with all the relevant +resource values from the newly provisioned dev environment that are required by the stress test common configuration. -# Development - -Examples detailing the Azure Bicep DSL can be found [here](https://github.com/Azure/bicep/tree/main/docs/examples). +Avoid checking in the updated dev values; they are for local use only. -Bicep also has a [VSCode extension](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-bicep). - -To validate file changes/compilation: - -``` -az bicep build -f ./azure/main.bicep ``` - -To deploy and access resources: - +# -Login only needs to be run once or if the azure container registry credentials have expired (~24 hours) +/eng/common/scripts/stress-testing/deploy-stress-tests.ps1 -Login -Environment dev ``` -# Edit ./azure/parameters/dev.json, replacing // add me values -# Add -c to dry run changes with a chance to confirm -az deployment sub create -o json -n -l westus -f ./azure/main.bicep --parameters ./azure/parameters/dev.json -# Copy the relevant outputs from the deployment to ./kubernetes/environments/ -# for deploying stress tests later on -az deployment sub show -o json -n --query properties.outputs +## Test Cluster -az aks list -g rg-stress-test-cluster- -az aks get-credentials stress-test -g rg-stress-test-cluster- +The test cluster is the main ad-hoc cluster made available to SDK developers and partners. Changes to this cluster +should be made carefully and announced in advance in order not to disrupt people's work. 
-# Verify cluster access -kubectl get pods - -# Install stress infrastructure components -helm repo add chaos-mesh https://charts.chaos-mesh.org -helm dependency update ./kubernetes/stress-infrastructure -helm install stress-infra -n stress-infra --create-namespace ./kubernetes/stress-infrastructure -kubectl get pods --namespace stress-infra +``` +./provision.ps1 -env test ``` -To access the chaos-mesh dashboard, run the below command then navigate to `localhost:2333` in the browser: +## Prod Cluster + +The "prod" cluster is the main cluster used for auto-deployment of checked-in stress tests via the StressTestRelease pipeline. +Currently, new instances of all stress tests across the language repositories are deployed on a weekly cadence. +Changes to the prod cluster should ideally be made around the stress test deployment cycle so as to avoid disruption +of test metrics. ``` -kubectl port-forward -n stress-infra svc/chaos-dashboard 2333:2333 +./provision.ps1 -env prod ``` -To remove AKS cluster stress testing resources: +## Local Cluster -``` -helm uninstall stress-infra --namespace stress-infra -``` +For quick testing of various kubernetes configurations, it can be faster and cheaper to use a local cluster. +Not all components of stress testing work in local clusters, however. If testing these components is necessary, the +recommended action is to spin up a dev cluster. -To remove Azure resources: +NOTE: Chaos-Mesh may not work on all local deployments (e.g. Docker Desktop on Windows via WSL). +It may be easier to test services, manifests and containers locally with KIND, and test chaos +in an Azure AKS cluster (shared or personal). 
``` -az group delete -az keyvault purge -n +# Ensure docker is running +kind create cluster ``` -# Building out the Main/Prod Testing Cluster - -If not already done, enable the relevant preview features in the subscription and CLI: -- [AKS-AzureKeyVaultSecretsProvider](https://docs.microsoft.com/en-us/azure/aks/csi-secrets-store-driver#register-the-aks-azurekeyvaultsecretsprovider-preview-feature) -## Initializing static identities - -The "official" stress testing clusters rely on a separately created keyvault containing secrets with subscription credentials for stress test resource deployments. -The identities/credentials in these keyvaults can't be created via ARM/Bicep, and should be managed independently of the individual environments. +# Development -To initialize these resources, if they don't exist: +## Bicep templates -``` -az group create rg-StressTestSecrets -az keyvault create -n StressTestSecrets -g rg-StressTestSecrets -az ad sp create-for-rbac -n 'stress-test-provisioner' --role Contributor --scopes '/subscriptions/' -``` - -Create an env file with the service principal values created above: +Examples detailing the Azure Bicep DSL can be found [here](https://github.com/Azure/bicep/tree/main/docs/examples). -``` -AZURE_CLIENT_ID= -AZURE_CLIENT_SECRET= -AZURE_TENANT_ID= -``` +Bicep also has a [VSCode extension](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-bicep). -Upload it to the static keyvault: +To validate file changes/compilation: ``` -az keyvault secret set --vault-name StressTestSecrets -f ./ -n public +az bicep build -f ./azure/main.bicep ``` -## Building Out Stress Test Cluster Resources +## Helm templates -Various environment configurations are located in `./azure/parameters/.json` to be configured when deploying. 
+When making changes to `stress-test-addons`, it is easiest to validate them by building one of the +[example projects](https://github.com/Azure/azure-sdk-tools/tree/main/tools/stress-cluster/chaos/examples). -Deploy the cluster and related components (app insights, container registry, keyvault, access policies, etc.) +First, update the `dependencies` section of the example's `Chart.yaml` file to point to your local changes on disk: ``` -az deployment sub create -o json -n stress-test-deploy -l westus -f ./azure/main.bicep --parameters ./azure/parameters/test.json +dependencies: +- name: stress-test-addons + version: + repository: https://stresstestcharts.blob.core.windows.net/helm/ + repository: file:////tools/stress-cluster/cluster/kubernetes/stress-test-addons ``` -Gain access to the cluster and install the stress infrastructure components: +Then you can test out the template changes by running, in the example stress test package directory: ``` -az aks get-credentials stress-test -g rg-stress-test-cluster- - -helm repo add chaos-mesh https://charts.chaos-mesh.org -helm dependency update ./kubernetes/stress-infrastructure -helm install stress-infra -n stress-infra --create-namespace ./kubernetes/stress-infrastructure +helm template testrelease . ``` -Update the values in `./kubernetes/stress-test-addons/values.yaml` to match the deployment outputs and check in the changes. +If there are any issues, the helm command will print any errors. If there are no errors, the rendered yaml +may still be an invalid kubernetes manifest, so the example stress test should also be deployed to validate +the full set of changes: ``` -az deployment sub show -o json -n --query properties.outputs +# -Login only needs to be run once or if the azure container registry credentials have expired (~24 hours) +/eng/common/scripts/stress-testing/deploy-stress-tests.ps1 -Login ``` + +For more helm debugging info, see [here](https://helm.sh/docs/chart_template_guide/debugging/). 
diff --git a/tools/stress-cluster/cluster/azure/cluster/cluster.bicep b/tools/stress-cluster/cluster/azure/cluster/cluster.bicep index 82808295128..54d6a0824c5 100644 --- a/tools/stress-cluster/cluster/azure/cluster/cluster.bicep +++ b/tools/stress-cluster/cluster/azure/cluster/cluster.bicep @@ -4,20 +4,52 @@ param groupSuffix string param dnsPrefix string = 's1' param clusterName string param location string = resourceGroup().location -param agentVMSize string = 'Standard_D2_v3' - -@minValue(1) -@maxValue(50) -@description('The number of nodes for the cluster.') -param agentCount int = 3 +param enableHighMemAgentPool bool = false // monitoring parameters param enableMonitoring bool = false param workspaceId string -var kubernetesVersion = '1.20.5' +var kubernetesVersion = '1.21.2' var nodeResourceGroup = 'rg-nodes-${dnsPrefix}-${clusterName}-${groupSuffix}' -var agentPoolName = 'agentpool01' + +var defaultAgentPool = { + name: 'default' + count: 3 + minCount: 3 + maxCount: 9 + mode: 'System' + vmSize: 'Standard_D2_v3' + type: 'VirtualMachineScaleSets' + osType: 'Linux' + enableAutoScaling: true + enableEncryptionAtHost: true + nodeLabels: { + 'sku': 'default' + } +} + +var highMemAgentPool = { + name: 'highmemory' + count: 1 + minCount: 1 + maxCount: 3 + mode: 'System' + vmSize: 'Standard_D4ds_v4' + type: 'VirtualMachineScaleSets' + osType: 'Linux' + enableAutoScaling: true + enableEncryptionAtHost: true + nodeLabels: { + 'sku': 'highMem' + } +} + +var agentPools = concat([ + defaultAgentPool + ], enableHighMemAgentPool ? 
[ + highMemAgentPool + ] : []) resource cluster 'Microsoft.ContainerService/managedClusters@2020-09-01' = { name: clusterName @@ -41,17 +73,7 @@ resource cluster 'Microsoft.ContainerService/managedClusters@2020-09-01' = { kubernetesVersion: kubernetesVersion enableRBAC: true dnsPrefix: dnsPrefix - agentPoolProfiles: [ - { - name: agentPoolName - count: agentCount - mode: 'System' - vmSize: agentVMSize - type: 'VirtualMachineScaleSets' - osType: 'Linux' - enableAutoScaling: false - } - ] + agentPoolProfiles: agentPools servicePrincipalProfile: { clientId: 'msi' } diff --git a/tools/stress-cluster/cluster/azure/cluster/storage.bicep b/tools/stress-cluster/cluster/azure/cluster/storage.bicep new file mode 100644 index 00000000000..afcc36ed8ae --- /dev/null +++ b/tools/stress-cluster/cluster/azure/cluster/storage.bicep @@ -0,0 +1,21 @@ +param location string = resourceGroup().location +param storageName string +param fileShareName string + +resource storage 'Microsoft.Storage/storageAccounts@2019-06-01' = { + name: storageName + location: location + kind: 'StorageV2' + sku: { + name: 'Standard_LRS' + } +} + +resource fileshare 'Microsoft.Storage/storageAccounts/fileServices/shares@2021-04-01' = { + name: '${storage.name}/default/${fileShareName}' + properties: { } +} + +output name string = storage.name +output key string = storage.listKeys().keys[0].value +output fileShareName string = fileShareName diff --git a/tools/stress-cluster/cluster/azure/main.bicep b/tools/stress-cluster/cluster/azure/main.bicep index b1beb6a4309..594c7b66a4f 100644 --- a/tools/stress-cluster/cluster/azure/main.bicep +++ b/tools/stress-cluster/cluster/azure/main.bicep @@ -1,5 +1,6 @@ targetScope = 'subscription' +param subscriptionId string = '' param groupSuffix string param clusterName string param clusterLocation string = 'westus2' @@ -8,6 +9,8 @@ param staticTestSecretsKeyvaultGroup string param monitoringLocation string = 'centralus' param tags object param enableMonitoring bool = false 
+param enableHighMemAgentPool bool = false +param enableDebugStorage bool = false // Azure Developer Platform Team Group // https://ms.portal.azure.com/#blade/Microsoft_AAD_IAM/GroupDetailsMenuBlade/Overview/groupId/56709ad9-8962-418a-ad0d-4b25fa962bae @@ -15,8 +18,10 @@ param accessGroups array = [ '56709ad9-8962-418a-ad0d-4b25fa962bae' ] +var groupName = 'rg-stress-cluster-${groupSuffix}' + resource group 'Microsoft.Resources/resourceGroups@2020-10-01' = { - name: 'rg-stress-test-cluster-${groupSuffix}' + name: groupName location: clusterLocation tags: tags } @@ -52,6 +57,7 @@ module cluster 'cluster/cluster.bicep' = { tags: tags groupSuffix: groupSuffix enableMonitoring: enableMonitoring + enableHighMemAgentPool: enableHighMemAgentPool workspaceId: enableMonitoring ? logWorkspace.outputs.id : '' } } @@ -60,15 +66,34 @@ module containerRegistry 'cluster/acr.bicep' = { name: 'containerRegistry' scope: group params: { - registryName: '${replace(clusterName, '-', '')}registry' + registryName: '${replace(clusterName, '-', '')}${resourceSuffix}' location: clusterLocation objectIds: concat(accessGroups, array(cluster.outputs.kubeletIdentityObjectId)) } } +module storage 'cluster/storage.bicep' = if (enableDebugStorage) { + name: 'storage' + scope: group + params: { + storageName: 'stressdebug${resourceSuffix}' + fileShareName: 'stressfiles${resourceSuffix}' + location: clusterLocation + } +} + var appInsightsInstrumentationKeySecretName = 'appInsightsInstrumentationKey-${resourceSuffix}' +// Value is in dotenv format as it will be appended to stress test container dotenv files var appInsightsInstrumentationKeySecretValue = 'APPINSIGHTS_INSTRUMENTATIONKEY=${appInsights.outputs.instrumentationKey}\n' +// Storage account information used for kubernetes fileshare volume mounting via the azure files csi driver +// See https://docs.microsoft.com/en-us/azure/aks/azure-files-volume#create-a-kubernetes-secret +// See https://docs.microsoft.com/en-us/azure/aks/azure-files-csi 
+var debugStorageKeySecretName = 'debugStorageKey-${resourceSuffix}' +var debugStorageKeySecretValue = '${storage.outputs.key}' +var debugStorageAccountSecretName = 'debugStorageAccount-${resourceSuffix}' +var debugStorageAccountSecretValue = '${storage.outputs.name}' + module keyvault 'cluster/keyvault.bicep' = if (enableMonitoring) { name: 'keyvault' scope: group @@ -83,6 +108,14 @@ module keyvault 'cluster/keyvault.bicep' = if (enableMonitoring) { secretName: appInsightsInstrumentationKeySecretName secretValue: appInsightsInstrumentationKeySecretValue } + { + secretName: debugStorageKeySecretName + secretValue: debugStorageKeySecretValue + } + { + secretName: debugStorageAccountSecretName + secretValue: debugStorageAccountSecretValue + } ] } } @@ -99,10 +132,14 @@ module accessPolicy 'cluster/static-vault-access-policy.bicep' = { } output STATIC_TEST_SECRETS_KEYVAULT string = staticTestSecretsKeyvaultName -output CLUSTER_KEYVAULT string = keyvault.outputs.keyvaultName +output CLUSTER_TEST_SECRETS_KEYVAULT string = keyvault.outputs.keyvaultName output SECRET_PROVIDER_CLIENT_ID string = cluster.outputs.secretProviderClientId output CLUSTER_NAME string = cluster.outputs.clusterName output CONTAINER_REGISTRY_NAME string = containerRegistry.outputs.containerRegistryName output APPINSIGHTS_KEY_SECRET_NAME string = appInsightsInstrumentationKeySecretName +output DEBUG_STORAGE_KEY_SECRET_NAME string = debugStorageKeySecretName +output DEBUG_STORAGE_ACCOUNT_SECRET_NAME string = debugStorageAccountSecretName +output DEBUG_FILESHARE_NAME string = storage.outputs.fileShareName output RESOURCE_GROUP string = group.name +output SUBSCRIPTION_ID string = subscriptionId output TENANT_ID string = subscription().tenantId diff --git a/tools/stress-cluster/cluster/azure/parameters/dev.json b/tools/stress-cluster/cluster/azure/parameters/dev.json index 1de6b9a6863..7f3f79a2801 100644 --- a/tools/stress-cluster/cluster/azure/parameters/dev.json +++ 
b/tools/stress-cluster/cluster/azure/parameters/dev.json @@ -2,7 +2,10 @@ "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", "contentVersion": "1.0.0.0", "parameters": { - "groupName": { + "subscriptionId": { + "value": // add me + }, + "groupSuffix": { "value": // add me }, "clusterName": { @@ -12,10 +15,13 @@ "value": "westus2" }, "staticTestSecretsKeyvaultName": { - "value": "StressTestSecrets" + "value": "stress-secrets-dev" }, "staticTestSecretsKeyvaultGroup": { - "value": "rg-StressTestSecrets" + "value": "rg-stress-secrets-dev" + }, + "enableDebugStorage": { + "value": true }, "tags": { "value": { diff --git a/tools/stress-cluster/cluster/azure/parameters/prod.json b/tools/stress-cluster/cluster/azure/parameters/prod.json index 711a854de79..2cbdc63b579 100644 --- a/tools/stress-cluster/cluster/azure/parameters/prod.json +++ b/tools/stress-cluster/cluster/azure/parameters/prod.json @@ -2,6 +2,9 @@ "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", "contentVersion": "1.0.0.0", "parameters": { + "subscriptionId": { + "value": "2cd617ea-1866-46b1-90e3-fffb087ebf9b" + }, "groupSuffix": { "value": "prod" }, @@ -15,10 +18,10 @@ "value": "centralus" }, "staticTestSecretsKeyvaultName": { - "value": "StressTestSecrets" + "value": "stress-secrets-prod" }, "staticTestSecretsKeyvaultGroup": { - "value": "rg-StressTestSecrets" + "value": "rg-stress-secrets-prod" }, "enableMonitoring": { "value": true diff --git a/tools/stress-cluster/cluster/azure/parameters/test.json b/tools/stress-cluster/cluster/azure/parameters/test.json index 46025581ec3..4a13646fe20 100644 --- a/tools/stress-cluster/cluster/azure/parameters/test.json +++ b/tools/stress-cluster/cluster/azure/parameters/test.json @@ -2,8 +2,11 @@ "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", "contentVersion": "1.0.0.0", "parameters": { + "subscriptionId": { + "value": 
"faa080af-c1d8-40ad-9cce-e1a450ca5b57" + }, "groupSuffix": { - "value": "" + "value": "test" }, "clusterName": { "value": "stress-test" @@ -12,14 +15,20 @@ "value": "westus2" }, "staticTestSecretsKeyvaultName": { - "value": "StressTestSecrets" + "value": "stress-secrets-test" }, "staticTestSecretsKeyvaultGroup": { - "value": "rg-StressTestSecrets" + "value": "rg-stress-secrets-test" }, "enableMonitoring": { "value": true }, + "enableDebugStorage": { + "value": true + }, + "enableHighMemAgentPool": { + "value": true + }, "tags": { "value": { "environment": "Test", diff --git a/tools/stress-cluster/cluster/kubernetes/stress-infrastructure/templates/stresswatcher.yaml b/tools/stress-cluster/cluster/kubernetes/stress-infrastructure/templates/stresswatcher.yaml index ad835284e09..45898c9b300 100644 --- a/tools/stress-cluster/cluster/kubernetes/stress-infrastructure/templates/stresswatcher.yaml +++ b/tools/stress-cluster/cluster/kubernetes/stress-infrastructure/templates/stresswatcher.yaml @@ -18,7 +18,7 @@ spec: spec: containers: - name: stresswatcher - image: stress{{ .Values.repository }}registry.azurecr.io/services/stresswatcher:{{ .Values.tag }} + image: azsdkengsys.azurecr.io/stress/watcher:{{ .Values.tag }} command: ["dotnet", "Stress.Watcher.dll"] --- apiVersion: rbac.authorization.k8s.io/v1 diff --git a/tools/stress-cluster/cluster/kubernetes/stress-infrastructure/values.yaml b/tools/stress-cluster/cluster/kubernetes/stress-infrastructure/values.yaml index 4a0be5627f4..c555c215043 100644 --- a/tools/stress-cluster/cluster/kubernetes/stress-infrastructure/values.yaml +++ b/tools/stress-cluster/cluster/kubernetes/stress-infrastructure/values.yaml @@ -1,4 +1,3 @@ -repository: test tag: latest chaos-mesh: diff --git a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/Chart.yaml b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/Chart.yaml index 2d916242a84..2ec9399926f 100644 --- 
a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/Chart.yaml +++ b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/Chart.yaml @@ -2,5 +2,5 @@ apiVersion: v2 name: stress-test-addons description: Baseline resources and templates for stress testing clusters -version: 0.1.7 +version: 0.1.9 appVersion: v0.1 diff --git a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/index.yaml b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/index.yaml index 0495a74cbf1..6385f59a6c7 100644 --- a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/index.yaml +++ b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/index.yaml @@ -1,6 +1,15 @@ apiVersion: v1 entries: stress-test-addons: + - apiVersion: v2 + appVersion: v0.1 + created: "2021-10-15T13:47:55.8699185-04:00" + description: Baseline resources and templates for stress testing clusters + digest: c1b35be1b87f1ec5d62c69fc8d19186098108eb5aa00d40fc5f48a3b0091af97 + name: stress-test-addons + urls: + - https://stresstestcharts.blob.core.windows.net/helm/stress-test-addons-0.1.9.tgz + version: 0.1.9 - apiVersion: v2 appVersion: v0.1 created: "2021-09-27T15:06:23.276703-04:00" @@ -55,4 +64,4 @@ entries: urls: - https://stresstestcharts.blob.core.windows.net/helm/stress-test-addons-0.1.2.tgz version: 0.1.2 -generated: "2021-09-27T15:06:23.2761458-04:00" +generated: "2021-10-15T13:47:55.8693591-04:00" diff --git a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_container_env.tpl b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_container_env.tpl index 5354973fb2e..66646fb1c9d 100644 --- a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_container_env.tpl +++ b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_container_env.tpl @@ -2,7 +2,21 @@ env: - name: ENV_FILE value: /mnt/outputs/.env + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: 
+ fieldRef: + fieldPath: metadata.namespace + - name: DEBUG_SHARE + value: /mnt/share/$(POD_NAMESPACE)/$(POD_NAME)/ + - name: DEBUG_SHARE_ROOT + value: /mnt/share/ volumeMounts: - name: test-env-{{ lower .Scenario }}-{{ .Release.Name }}-{{ .Release.Revision }} mountPath: /mnt/outputs + - name: debug-file-share-{{ .Release.Name }} + mountPath: /mnt/share {{- end -}} diff --git a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_debug_file_volumes.tpl b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_debug_file_volumes.tpl new file mode 100644 index 00000000000..4ab7629bbf2 --- /dev/null +++ b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_debug_file_volumes.tpl @@ -0,0 +1,7 @@ +{{ define "stress-test-addons.debug-file-volumes" }} +- name: debug-file-share-{{ .Release.Name }} + azureFile: + secretName: debugstorageaccountconfig + shareName: {{ get .Values.debugFileShareName .Values.env }} + readOnly: false +{{ end }} diff --git a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_deploy_configmap.tpl b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_deploy_configmap.tpl index 61a93746ef2..013fefe9f53 100644 --- a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_deploy_configmap.tpl +++ b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_deploy_configmap.tpl @@ -2,7 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: "{{ .Release.Name }}-test-resources" + name: "{{ .Release.Name }}-{{ .Release.Revision }}-test-resources" namespace: {{ .Release.Namespace }} data: template: | diff --git a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_deploy_volumes.tpl b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_deploy_volumes.tpl index 5e8b70ab274..f4c9e9d36d3 100644 --- a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_deploy_volumes.tpl +++ 
b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_deploy_volumes.tpl @@ -1,7 +1,7 @@ {{ define "stress-test-addons.deploy-volumes" }} -- name: {{ .Release.Name }}-test-resources +- name: {{ .Release.Name }}-{{ .Release.Revision }}-test-resources configMap: - name: "{{ .Release.Name }}-test-resources" + name: "{{ .Release.Name }}-{{ .Release.Revision }}-test-resources" items: - key: template path: test-resources.json diff --git a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_init_deploy.tpl b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_init_deploy.tpl index d6da18d550a..dd797fb70a9 100644 --- a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_init_deploy.tpl +++ b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_init_deploy.tpl @@ -1,5 +1,5 @@ {{ define "stress-test-addons.init-deploy" }} -- name: azure-deployer +- name: init-azure-deployer image: mcr.microsoft.com/azure-cli command: ['bash', '-c'] args: @@ -27,7 +27,7 @@ - name: ENV_FILE value: /mnt/outputs/.env volumeMounts: - - name: "{{ .Release.Name }}-test-resources" + - name: "{{ .Release.Name }}-{{ .Release.Revision }}-test-resources" mountPath: /mnt/testresources - name: test-env-{{ lower .Scenario }}-{{ .Release.Name }}-{{ .Release.Revision }} mountPath: /mnt/outputs diff --git a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_init_env.tpl b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_init_env.tpl index c1c83ed515a..4f7e2f40816 100644 --- a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_init_env.tpl +++ b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_init_env.tpl @@ -1,9 +1,10 @@ {{ define "stress-test-addons.init-env" }} -- name: test-env-initializer +- name: init-test-env image: k8s.gcr.io/e2e-test-images/busybox:1.29 command: ['sh', '-c'] args: - # Merge all mounted keyvault secrets into env file + 
# Merge all mounted keyvault secrets into env file. + # Secret values are expected to be in format <key>=<value> - 'cat /mnt/secrets/static/* /mnt/secrets/cluster/* > $ENV_FILE' env: - name: ENV_FILE diff --git a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_stress_test.tpl b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_stress_test.tpl index 8f85715398f..9723800de6f 100644 --- a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_stress_test.tpl +++ b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/_stress_test.tpl @@ -27,6 +27,8 @@ spec: {{- include "stress-test-addons.env-volumes" . | nindent 8 }} # Volume template for mounting ARM templates {{- include "stress-test-addons.deploy-volumes" . | nindent 8 }} + # Volume template for mounting azure file share for debugging + {{- include "stress-test-addons.debug-file-volumes" . | nindent 8 }} initContainers: # Init container template for injecting secrets # (e.g. app insights instrumentation key, azure client credentials) @@ -45,6 +47,13 @@ spec: {{- /* Copy scenario name into top level key of global context */}} {{ $instance := deepCopy $global | merge (dict "Scenario" . ) -}} {{- $jobOverride := fromYaml (include "stress-test-addons.job-wrapper.tpl" (list $instance $podDefinition)) -}} +{{- /* + The .Values context here corresponds to the parent chart that includes this library as a dependency, + meaning there will be a .Values.stress-test-addons key that contains the values specific to this library. + Given that we are calling into library templates, replace the values context with only the nested + context for this sub-chart.
+*/ -}} +{{ $_ := set $instance "Values" (index $instance "Values" "stress-test-addons") -}} {{- $tpl := fromYaml (include "stress-test-addons.deploy-job-template.tpl" $instance) -}} {{- toYaml (merge $jobOverride $tpl) -}} {{- end }} @@ -71,6 +80,7 @@ spec: volumes: # Volume template for mounting secrets {{- include "stress-test-addons.env-volumes" . | nindent 8 }} + {{- include "stress-test-addons.debug-file-volumes" . | nindent 8 }} initContainers: # Init container template for injecting secrets # (e.g. app insights instrumentation key, azure client credentials) @@ -86,6 +96,13 @@ spec: {{- /* Copy scenario name into top level key of global context */}} {{ $instance := deepCopy $global | merge (dict "Scenario" . ) -}} {{- $jobOverride := fromYaml (include "stress-test-addons.job-wrapper.tpl" (list $instance $podDefinition)) -}} +{{- /* + The .Values context here corresponds to the parent chart that includes this library as a dependency, + meaning there will be a .Values.stress-test-addons key that contains the values specific to this library. + Given that we are calling into library templates, replace the values context with only the nested + context for this sub-chart. 
+*/ -}} +{{ $_ := set $instance "Values" (index $instance "Values" "stress-test-addons") -}} {{- $tpl := fromYaml (include "stress-test-addons.env-job-template.tpl" $instance) -}} {{- toYaml (merge $jobOverride $tpl) -}} {{- end }} diff --git a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/stress-test-cluster-secret-provider.yaml b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/stress-test-cluster-secret-provider.yaml index 3e61093879d..61f984034c5 100644 --- a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/stress-test-cluster-secret-provider.yaml +++ b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/templates/stress-test-cluster-secret-provider.yaml @@ -11,6 +11,16 @@ spec: data: - objectName: {{ get .Values.appInsightsKeySecretName .Values.env }} key: value + # Storage account information used for kubernetes fileshare volume mounting via the azure files csi driver + # See https://docs.microsoft.com/en-us/azure/aks/azure-files-volume#create-a-kubernetes-secret + # See https://docs.microsoft.com/en-us/azure/aks/azure-files-csi + - secretName: debugstorageaccountconfig + type: Opaque + data: + - objectName: {{ get .Values.debugStorageKeySecretName .Values.env }} + key: azurestorageaccountkey + - objectName: {{ get .Values.debugStorageAccountSecretName .Values.env }} + key: azurestorageaccountname parameters: useVMManagedIdentity: "true" userAssignedIdentityID: {{ get .Values.secretProviderIdentity .Values.env }} # az vmss identity show ... 
@@ -20,4 +30,10 @@ spec: - | objectName: {{ get .Values.appInsightsKeySecretName .Values.env }} objectType: secret + - | + objectName: {{ get .Values.debugStorageKeySecretName .Values.env }} + objectType: secret + - | + objectName: {{ get .Values.debugStorageAccountSecretName .Values.env }} + objectType: secret tenantId: {{ get .Values.tenantId .Values.env }} diff --git a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/values.yaml b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/values.yaml index 27069a7ce4e..5e15fec269c 100644 --- a/tools/stress-cluster/cluster/kubernetes/stress-test-addons/values.yaml +++ b/tools/stress-cluster/cluster/kubernetes/stress-test-addons/values.yaml @@ -1,21 +1,32 @@ env: test - appInsightsKeySecretName: - test: appInsightsInstrumentationKey-uj7jqs4ukw2gi + test: appInsightsInstrumentationKey-tbiruti6oi24k prod: appInsightsInstrumentationKey-dqojlttkovp2c - dev: 'not-specified' + dev: "" +debugStorageKeySecretName: + test: debugStorageKey-tbiruti6oi24k + prod: "" + dev: "" +debugStorageAccountSecretName: + test: debugStorageAccount-tbiruti6oi24k + prod: "" + dev: "" +debugFileShareName: + test: stressfilestbiruti6oi24k + prod: "" + dev: "" staticTestSecretsKeyvaultName: - test: StressTestSecrets - prod: StressTestSecrets - dev: 'not-specified' + test: stress-secrets-test + prod: StressSecretsProd + dev: "" clusterTestSecretsKeyvaultName: - test: stress-kv-uj7jqs4ukw2gi + test: stress-kv-tbiruti6oi24k prod: stress-kv-dqojlttkovp2c - dev: 'not-specified' + dev: "" secretProviderIdentity: - test: bc7712b9-1622-4b7f-9943-604c73cda131 + test: 9eca3e6f-842f-495f-b106-4f3331406e79 prod: ea706f92-1d9a-4611-9cde-8305aa3d9e98 - dev: 'not-specified' + dev: "" subscription: test: public prod: public @@ -24,3 +35,4 @@ tenantId: test: 72f988bf-86f1-41af-91ab-2d7cd011db47 prod: 72f988bf-86f1-41af-91ab-2d7cd011db47 dev: 72f988bf-86f1-41af-91ab-2d7cd011db47 + diff --git a/tools/stress-cluster/cluster/provision.ps1 
b/tools/stress-cluster/cluster/provision.ps1
new file mode 100644
index 00000000000..7c00965317b
--- /dev/null
+++ b/tools/stress-cluster/cluster/provision.ps1
@@ -0,0 +1,213 @@
+# Provisions a stress cluster environment end to end:
+#   1. static resources (secrets keyvault and provisioner service principal)
+#   2. cluster resources declared in azure/main.bicep
+#   3. stress infrastructure helm charts
+#
+# Usage:    ./provision.ps1 -env <name>  (reads azure/parameters/<name>.json)
+# Requires: az, helm, kubectl and the ConvertFrom-Yaml/ConvertTo-Yaml cmdlets
+#           (provided by the powershell-yaml module).
+param (
+    [string]$env = 'test'
+)
+
+# Echoes and runs a command line: the first argument is the command, the rest
+# are its arguments. A non-zero exit code is reported but is not fatal, so
+# callers can probe for resources that may legitimately be missing.
+# The full command line is echoed, so never pass secret values through here.
+function Run()
+{
+    Write-Host "`n==> $args`n" -ForegroundColor Green
+    $command, $arguments = $args
+    & $command $arguments
+    if ($LASTEXITCODE) {
+        Write-Error "Command '$args' failed with code: $LASTEXITCODE" -ErrorAction 'Continue'
+    }
+}
+
+# Same as Run, but terminates the script with the command's exit code on failure.
+function RunOrExitOnFailure()
+{
+    Run @args
+    if ($LASTEXITCODE) {
+        exit $LASTEXITCODE
+    }
+}
+
+# Fails fast when the yaml cmdlets used by UpdateOutputs are unavailable, so the
+# gap is found before the azure deployment runs rather than after it.
+function EnsureYamlCmdlets() {
+    if (!(Get-Command ConvertFrom-Yaml -ErrorAction SilentlyContinue)) {
+        Import-Module powershell-yaml -ErrorAction SilentlyContinue
+    }
+    if (!(Get-Command ConvertFrom-Yaml -ErrorAction SilentlyContinue)) {
+        Write-Error "ConvertFrom-Yaml not found. Install it with: Install-Module powershell-yaml" -ErrorAction 'Continue'
+        exit 1
+    }
+}
+
+# Creates the static secrets resource group and keyvault (if missing), creates
+# the provisioner service principal and stores its credentials, in dotenv
+# format, as the 'public' secret of the static keyvault.
+function DeployStaticResources([hashtable]$params) {
+    Write-Host "Deploying static resources"
+
+    RunOrExitOnFailure az group create `
+        -n $params.staticTestSecretsKeyvaultGroup `
+        -l $params.clusterLocation `
+        --subscription $params.subscriptionId
+    # Probe only: no output here means the vault still has to be created.
+    $kv = Run az keyvault show `
+        -n $params.staticTestSecretsKeyvaultName `
+        -g $params.staticTestSecretsKeyvaultGroup `
+        --subscription $params.subscriptionId
+    if (!$kv) {
+        RunOrExitOnFailure az keyvault create `
+            -n $params.staticTestSecretsKeyvaultName `
+            -g $params.staticTestSecretsKeyvaultGroup `
+            --subscription $params.subscriptionId
+    }
+
+    $sp = RunOrExitOnFailure az ad sp create-for-rbac `
+        -o json `
+        -n "stress-provisioner-$env" `
+        --role Contributor `
+        --scopes "/subscriptions/$($params.subscriptionId)"
+    $spInfo = $sp | ConvertFrom-Json
+    $spDetails = RunOrExitOnFailure az ad sp show -o json --id $spInfo.appId | ConvertFrom-Json
+    # Graph based azure cli versions report the object id as 'id' instead of 'objectId'.
+    $oid = if ($spDetails.objectId) { $spDetails.objectId } else { $spDetails.id }
+
+    $credentials = @{
+        AZURE_CLIENT_ID = $spInfo.appId
+        AZURE_CLIENT_SECRET = $spInfo.password
+        AZURE_CLIENT_OID = $oid
+        AZURE_TENANT_ID = $spInfo.tenant
+        AZURE_SUBSCRIPTION_ID = $params.subscriptionId
+    }
+
+    $dotenv = $credentials.GetEnumerator() | ForEach-Object { "$($_.Key)=$($_.Value)" }
+    $secret = $dotenv -join "`n"
+
+    # The payload contains the service principal password. Hand it to az through
+    # a temp file (not --value) and discard az's response (-o none) so the
+    # password is neither echoed by Run nor printed with the command output.
+    $secretFile = New-TemporaryFile
+    try {
+        Set-Content -Path $secretFile.FullName -Value $secret -NoNewline
+        Run az keyvault secret set `
+            -o none `
+            --vault-name $params.staticTestSecretsKeyvaultName `
+            --file $secretFile.FullName `
+            --encoding utf-8 `
+            -n public
+        $exitCode = $LASTEXITCODE
+    } finally {
+        Remove-Item -Path $secretFile.FullName -Force -ErrorAction SilentlyContinue
+    }
+    if ($exitCode) {
+        exit $exitCode
+    }
+}
+
+# Copies the outputs of the subscription deployment into the stress-test-addons
+# values.yaml entries for the current environment.
+function UpdateOutputs([hashtable]$params) {
+    $outputs = RunOrExitOnFailure az deployment sub show `
+        -o json `
+        -n stress-deploy-$env `
+        --query properties.outputs `
+        --subscription $params.subscriptionId | ConvertFrom-Json
+    if (!$outputs) {
+        # Never write empty values over the checked in configuration.
+        Write-Error "Deployment stress-deploy-$env returned no outputs" -ErrorAction 'Continue'
+        exit 1
+    }
+
+    $valuesFile = "$PSScriptRoot/kubernetes/stress-test-addons/values.yaml"
+    $values = ConvertFrom-Yaml -Ordered (Get-Content -Raw $valuesFile)
+
+    $values.appInsightsKeySecretName.$env = $outputs.APPINSIGHTS_KEY_SECRET_NAME.value
+    $values.debugStorageKeySecretName.$env = $outputs.DEBUG_STORAGE_KEY_SECRET_NAME.value
+    $values.debugStorageAccountSecretName.$env = $outputs.DEBUG_STORAGE_ACCOUNT_SECRET_NAME.value
+    $values.debugFileShareName.$env = $outputs.DEBUG_FILESHARE_NAME.value
+    $values.staticTestSecretsKeyvaultName.$env = $outputs.STATIC_TEST_SECRETS_KEYVAULT.value
+    $values.clusterTestSecretsKeyvaultName.$env = $outputs.CLUSTER_TEST_SECRETS_KEYVAULT.value
+    $values.secretProviderIdentity.$env = $outputs.SECRET_PROVIDER_CLIENT_ID.value
+    $values.tenantId.$env = $outputs.TENANT_ID.value
+
+    $values | ConvertTo-Yaml | Out-File $valuesFile
+
+    Write-Host "$valuesFile has been updated and must be checked in."
+}
+
+# Deploys azure/main.bicep at subscription scope, refreshes values.yaml from the
+# deployment outputs, imports the cluster credentials and installs the stress
+# infrastructure charts.
+function DeployClusterResources([hashtable]$params) {
+    Write-Host "Deploying stress cluster resources"
+    RunOrExitOnFailure az deployment sub create `
+        -o json `
+        --subscription $params.subscriptionId `
+        -n stress-deploy-$env `
+        -l $params.clusterLocation `
+        -f $PSScriptRoot/azure/main.bicep `
+        --parameters $PSScriptRoot/azure/parameters/$env.json
+
+    UpdateOutputs $params
+
+    Write-Host "Importing cluster credentials"
+    RunOrExitOnFailure az aks get-credentials `
+        -n $params.clusterName `
+        -g rg-stress-cluster-$($params.groupSuffix) `
+        --overwrite `
+        --subscription $params.subscriptionId
+
+    Write-Host "Installing stress infrastructure charts"
+    RunOrExitOnFailure helm repo add chaos-mesh https://charts.chaos-mesh.org
+    RunOrExitOnFailure helm dependency update $PSScriptRoot/kubernetes/stress-infrastructure
+    # Render the namespace first, then apply it, so that a failure of either
+    # kubectl invocation stops the script (create-if-missing without erroring).
+    $namespaceYaml = RunOrExitOnFailure kubectl create namespace stress-infra --dry-run=client -o yaml
+    $namespaceYaml | kubectl apply -f -
+    if ($LASTEXITCODE) {
+        exit $LASTEXITCODE
+    }
+    RunOrExitOnFailure helm upgrade --install stress-infra `
+        -n stress-infra `
+        $PSScriptRoot/kubernetes/stress-infrastructure
+}
+
+# Reads azure/parameters/<env>.json and flattens it to a name => value hashtable.
+function LoadEnvParams() {
+    $params = (Get-Content $PSScriptRoot/azure/parameters/$env.json | ConvertFrom-Json -AsHashtable).parameters
+
+    if (!$params) {
+        Write-Error "Error loading parameters file at $PSScriptRoot/azure/parameters/$env.json"
+        exit 1
+    }
+
+    $paramHash = @{}
+    foreach ($p in $params.GetEnumerator()) {
+        $paramHash[$p.Key] = $p.Value.value
+    }
+
+    return $paramHash
+}
+
+# Entry point: checks prerequisites, loads the parameters, then deploys everything.
+function main() {
+    # . (Join-Path $PSScriptRoot "../Helpers" PSModule-Helpers.ps1)
+    # Install-ModuleIfNotInstalled "powershell-yaml" "0.4.1" | Import-Module
+    EnsureYamlCmdlets
+
+    $params = LoadEnvParams
+
+    DeployStaticResources $params
+    DeployClusterResources $params
+}
+
+# Don't call functions when the script is being dot sourced
+if ($MyInvocation.InvocationName -ne ".") {
+    $ErrorActionPreference = 'Stop'
+    main
+}