From f6e7ccdea7e751afc18f486bb0d99dbca03fe742 Mon Sep 17 00:00:00 2001 From: Albert Cheng <38804567+ckairen@users.noreply.github.com> Date: Tue, 28 Mar 2023 09:41:55 -0700 Subject: [PATCH] [stress] stress test rerun failed jobs feature (#5726) closes #5361 --- .../stress-testing/deploy-stress-tests.ps1 | 2 + .../stress-test-deployment-lib.ps1 | 97 ++++++++++++++++--- .../network-stress-example/Dockerfile | 1 + .../Dockerfile | 1 + 4 files changed, 90 insertions(+), 11 deletions(-) diff --git a/eng/common/scripts/stress-testing/deploy-stress-tests.ps1 b/eng/common/scripts/stress-testing/deploy-stress-tests.ps1 index bbf1d1d4253..bc028f26aa1 100644 --- a/eng/common/scripts/stress-testing/deploy-stress-tests.ps1 +++ b/eng/common/scripts/stress-testing/deploy-stress-tests.ps1 @@ -25,6 +25,8 @@ param( # Renders chart templates locally without deployment [Parameter(Mandatory=$False)][switch]$Template, + [Parameter(Mandatory=$False)][switch]$RetryFailedTests, + # Matrix generation parameters [Parameter(Mandatory=$False)][string]$MatrixFileName, [Parameter(Mandatory=$False)][string]$MatrixSelection, diff --git a/eng/common/scripts/stress-testing/stress-test-deployment-lib.ps1 b/eng/common/scripts/stress-testing/stress-test-deployment-lib.ps1 index bafbf77a945..a7d45ae66f2 100644 --- a/eng/common/scripts/stress-testing/stress-test-deployment-lib.ps1 +++ b/eng/common/scripts/stress-testing/stress-test-deployment-lib.ps1 @@ -98,6 +98,7 @@ function DeployStressTests( })] [System.IO.FileInfo]$LocalAddonsPath, [Parameter(Mandatory=$False)][switch]$Template, + [Parameter(Mandatory=$False)][switch]$RetryFailedTests, [Parameter(Mandatory=$False)][string]$MatrixFileName, [Parameter(Mandatory=$False)][string]$MatrixSelection = "sparse", [Parameter(Mandatory=$False)][string]$MatrixDisplayNameFilter, @@ -215,11 +216,16 @@ function DeployStressPackage( if ($LASTEXITCODE) {exit $LASTEXITCODE} $dockerBuildConfigs = @() - - $genValFile = Join-Path $pkg.Directory "generatedValues.yaml" - $genVal = Get-Content $genValFile -Raw | ConvertFrom-Yaml -Ordered - if (Test-Path $genValFile) { - $scenarios = $genVal.Scenarios + + $generatedHelmValuesFilePath = Join-Path $pkg.Directory "generatedValues.yaml" + $generatedHelmValues = Get-Content $generatedHelmValuesFilePath -Raw | ConvertFrom-Yaml -Ordered + $releaseName = $pkg.ReleaseName + if ($RetryFailedTests) { + $releaseName, $generatedHelmValues = generateRetryTestsHelmValues $pkg $releaseName $generatedHelmValues + } + + if (Test-Path $generatedHelmValuesFilePath) { + $scenarios = $generatedHelmValues.Scenarios foreach ($scenario in $scenarios) { if ("image" -in $scenario.keys) { $dockerFilePath = Join-Path $pkg.Directory $scenario.image @@ -286,7 +292,7 @@ function DeployStressPackage( } } } - $genVal.scenarios = @( foreach ($scenario in $genVal.scenarios) { + $generatedHelmValues.scenarios = @( foreach ($scenario in $generatedHelmValues.scenarios) { $dockerPath = if ("image" -notin $scenario) { $dockerFilePath } else { @@ -298,15 +304,15 @@ function DeployStressPackage( $scenario } ) - $genVal | ConvertTo-Yaml | Out-File -FilePath $genValFile + $generatedHelmValues | ConvertTo-Yaml | Out-File -FilePath $generatedHelmValuesFilePath } - Write-Host "Installing or upgrading stress test $($pkg.ReleaseName) from $($pkg.Directory)" + Write-Host "Installing or upgrading stress test $releaseName from $($pkg.Directory)" $generatedConfigPath = Join-Path $pkg.Directory generatedValues.yaml $subCommand = $Template ? "template" : "upgrade" $installFlag = $Template ? "" : "--install" - $helmCommandArg = "helm", $subCommand, $pkg.ReleaseName, $pkg.Directory, "-n", $pkg.Namespace, $installFlag, "--set", "stress-test-addons.env=$environment", "--values", $generatedConfigPath + $helmCommandArg = "helm", $subCommand, $releaseName, $pkg.Directory, "-n", $pkg.Namespace, $installFlag, "--set", "stress-test-addons.env=$environment", "--values", $generatedConfigPath $result = (Run @helmCommandArg) 2>&1 | Write-Host @@ -322,7 +328,7 @@ function DeployStressPackage( # Issues like 'UPGRADE FAILED: another operation (install/upgrade/rollback) is in progress' # can be the result of cancelled `upgrade` operations (e.g. ctrl-c). # See https://github.com/helm/helm/issues/4558 - Write-Warning "The issue may be fixable by first running 'helm rollback -n $($pkg.Namespace) $($pkg.ReleaseName)'" + Write-Warning "The issue may be fixable by first running 'helm rollback -n $($pkg.Namespace) $releaseName'" return } } @@ -333,7 +339,7 @@ function DeployStressPackage( if(!$Template) { $helmReleaseConfig = RunOrExitOnFailure kubectl get secrets ` -n $pkg.Namespace ` - -l "status=deployed,name=$($pkg.ReleaseName)" ` + -l "status=deployed,name=$releaseName" ` -o jsonpath='{.items[0].metadata.name}' Run kubectl label secret -n $pkg.Namespace --overwrite $helmReleaseConfig deployId=$deployId } @@ -375,3 +381,72 @@ function CheckDependencies() } } + +function generateRetryTestsHelmValues ($pkg, $releaseName, $generatedHelmValues) { + $podOutput = RunOrExitOnFailure kubectl get pods -n $pkg.namespace -o json + $pods = $podOutput | ConvertFrom-Json + + # Get all jobs within this helm release + + $helmStatusOutput = RunOrExitOnFailure helm status -n $pkg.Namespace $pkg.ReleaseName --show-resources + # -----Example output----- + # NAME: + # LAST DEPLOYED: Mon Jan 01 12:12:12 2020 + # NAMESPACE: + # STATUS: deployed + # REVISION: 10 + # RESOURCES: + # ==> v1alpha1/Schedule + # NAME AGE + # 5h5m + # 5h5m + + # ==> v1/SecretProviderClass + # 7d4h + + # ==> v1/Job + # NAME COMPLETIONS DURATION AGE + # 0/1 5h5m 5h5m + # 0/1 5h5m 5h5m + $discoveredJob = $False + $jobs = @() + foreach ($line in $helmStatusOutput) { + if ($discoveredJob -and $line -match "==>") {break} + if ($discoveredJob) { + $jobs += ($line -split '\s+')[0] | Where-Object {($_ -ne "NAME") -and ($_)} + } + if ($line -match "==> v1/Job") { + $discoveredJob = $True + } + } + + $failedJobsScenario = @() + $revision = 0 + foreach ($job in $jobs) { + $jobRevision = [int]$job.split('-')[-1] + if ($jobRevision -gt $revision) { + $revision = $jobRevision + } + + $jobOutput = RunOrExitOnFailure kubectl describe jobs -n $pkg.Namespace $job + $podPhase = $jobOutput | Select-String "0 Failed" + if ([System.String]::IsNullOrEmpty($podPhase)) { + $failedJobsScenario += $job.split("-$($pkg.ReleaseName)")[0] + } + } + + $releaseName = "$($pkg.ReleaseName)-$revision-retry" + + $retryTestsHelmVal = @{"scenarios"=@()} + foreach ($failedScenario in $failedJobsScenario) { + $failedScenarioObject = $generatedHelmValues.scenarios | Where {$_.Scenario -eq $failedScenario} + $retryTestsHelmVal.scenarios += $failedScenarioObject + } + + if (!$retryTestsHelmVal.scenarios.length) { + Write-Host "There are no failed pods to retry." + return + } + $generatedHelmValues = $retryTestsHelmVal + return $releaseName, $generatedHelmValues +} diff --git a/tools/stress-cluster/chaos/examples/network-stress-example/Dockerfile b/tools/stress-cluster/chaos/examples/network-stress-example/Dockerfile index d6a56a0d80a..deff62bedaf 100644 --- a/tools/stress-cluster/chaos/examples/network-stress-example/Dockerfile +++ b/tools/stress-cluster/chaos/examples/network-stress-example/Dockerfile @@ -2,6 +2,7 @@ FROM mcr.microsoft.com/cbl-mariner/base/core:2.0 # Included packages: https://github.com/microsoft/CBL-Mariner/blob/1.0/SPECS/core-packages/core-packages.spec ADD ./poll.sh /poll.sh +RUN tdnf -y install wget RUN chmod +x /poll.sh CMD bash /poll.sh diff --git a/tools/stress-cluster/chaos/examples/network-stress-scenarios-example/Dockerfile b/tools/stress-cluster/chaos/examples/network-stress-scenarios-example/Dockerfile index d6a56a0d80a..deff62bedaf 100644 --- a/tools/stress-cluster/chaos/examples/network-stress-scenarios-example/Dockerfile +++ b/tools/stress-cluster/chaos/examples/network-stress-scenarios-example/Dockerfile @@ -2,6 +2,7 @@ FROM mcr.microsoft.com/cbl-mariner/base/core:2.0 # Included packages: https://github.com/microsoft/CBL-Mariner/blob/1.0/SPECS/core-packages/core-packages.spec ADD ./poll.sh /poll.sh +RUN tdnf -y install wget RUN chmod +x /poll.sh CMD bash /poll.sh