From 622b4f9aae9c5e2635ccdce85a529cd6a011f7a8 Mon Sep 17 00:00:00 2001 From: Eddie Zaneski Date: Mon, 4 Mar 2024 11:08:01 -0700 Subject: [PATCH] feat: add configurable backoff and retries for Zarf operations Signed-off-by: Eddie Zaneski Co-authored-by: Wayne Starr --- src/cmd/common/viper.go | 4 +- src/cmd/dev.go | 6 ++ src/cmd/initialize.go | 3 +- src/cmd/package.go | 6 +- src/config/config.go | 7 +- src/config/lang/english.go | 1 + src/internal/packager/helm/chart.go | 96 +++++++++++++--------------- src/internal/packager/helm/common.go | 11 ++-- src/pkg/packager/create_stages.go | 4 +- src/pkg/packager/deploy.go | 12 ++-- src/pkg/utils/helpers/misc.go | 13 ++-- src/types/runtime.go | 1 + 12 files changed, 92 insertions(+), 72 deletions(-) diff --git a/src/cmd/common/viper.go b/src/cmd/common/viper.go index 34d6d0888b..b852800611 100644 --- a/src/cmd/common/viper.go +++ b/src/cmd/common/viper.go @@ -84,6 +84,7 @@ const ( VPkgDeploySget = "package.deploy.sget" VPkgDeploySkipWebhooks = "package.deploy.skip_webhooks" VPkgDeployTimeout = "package.deploy.timeout" + VPkgRetries = "package.deploy.retries" // Package publish config keys @@ -184,7 +185,8 @@ func setDefaults() { // Package defaults that are non-zero values v.SetDefault(VPkgOCIConcurrency, 3) + v.SetDefault(VPkgRetries, config.ZarfDefaultRetries) // Deploy opts that are non-zero values - v.SetDefault(VPkgDeployTimeout, config.ZarfDefaultHelmTimeout) + v.SetDefault(VPkgDeployTimeout, config.ZarfDefaultTimeout) } diff --git a/src/cmd/dev.go b/src/cmd/dev.go index bccac8b781..cc53358e5b 100644 --- a/src/cmd/dev.go +++ b/src/cmd/dev.go @@ -282,6 +282,12 @@ func bindDevDeployFlags(v *viper.Viper) { devDeployFlags.StringToStringVar(&pkgConfig.PkgOpts.SetVariables, "deploy-set", v.GetStringMapString(common.VPkgDeploySet), lang.CmdPackageDeployFlagSet) + // Always require adopt-existing-resources flag (no viper) + devDeployFlags.BoolVar(&pkgConfig.DeployOpts.AdoptExistingResources, "adopt-existing-resources", false, lang.CmdPackageDeployFlagAdoptExistingResources) + devDeployFlags.BoolVar(&pkgConfig.DeployOpts.SkipWebhooks, "skip-webhooks", v.GetBool(common.VPkgDeploySkipWebhooks), lang.CmdPackageDeployFlagSkipWebhooks) + devDeployFlags.DurationVar(&pkgConfig.DeployOpts.Timeout, "timeout", v.GetDuration(common.VPkgDeployTimeout), lang.CmdPackageDeployFlagTimeout) + + devDeployFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries) devDeployFlags.StringVar(&pkgConfig.PkgOpts.OptionalComponents, "components", v.GetString(common.VPkgDeployComponents), lang.CmdPackageDeployFlagComponents) devDeployFlags.BoolVar(&pkgConfig.CreateOpts.NoYOLO, "no-yolo", v.GetBool(common.VDevDeployNoYolo), lang.CmdDevDeployFlagNoYolo) diff --git a/src/cmd/initialize.go b/src/cmd/initialize.go index 906f484720..af0498f2fc 100644 --- a/src/cmd/initialize.go +++ b/src/cmd/initialize.go @@ -215,11 +215,10 @@ func init() { // Flags that control how a deployment proceeds // Always require adopt-existing-resources flag (no viper) initCmd.Flags().BoolVar(&pkgConfig.DeployOpts.AdoptExistingResources, "adopt-existing-resources", false, lang.CmdPackageDeployFlagAdoptExistingResources) - initCmd.Flags().BoolVar(&pkgConfig.DeployOpts.SkipWebhooks, "skip-webhooks", v.GetBool(common.VPkgDeploySkipWebhooks), lang.CmdPackageDeployFlagSkipWebhooks) - initCmd.Flags().DurationVar(&pkgConfig.DeployOpts.Timeout, "timeout", v.GetDuration(common.VPkgDeployTimeout), lang.CmdPackageDeployFlagTimeout) + initCmd.Flags().IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries) initCmd.Flags().StringVarP(&pkgConfig.PkgOpts.PublicKeyPath, "key", "k", v.GetString(common.VPkgPublicKey), lang.CmdPackageFlagFlagPublicKey) initCmd.Flags().SortFlags = true diff --git a/src/cmd/package.go b/src/cmd/package.go index 3e29b74b64..ba0951f1b8 100644 --- a/src/cmd/package.go +++ b/src/cmd/package.go @@ -364,6 +364,8 @@ func bindCreateFlags(v *viper.Viper) { createFlags.StringVarP(&pkgConfig.CreateOpts.SigningKeyPath, "key", "k", v.GetString(common.VPkgCreateSigningKey), lang.CmdPackageCreateFlagDeprecatedKey) createFlags.StringVar(&pkgConfig.CreateOpts.SigningKeyPassword, "key-pass", v.GetString(common.VPkgCreateSigningKeyPassword), lang.CmdPackageCreateFlagDeprecatedKeyPassword) + createFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries) + createFlags.MarkHidden("output-directory") createFlags.MarkHidden("key") createFlags.MarkHidden("key-pass") @@ -377,11 +379,10 @@ func bindDeployFlags(v *viper.Viper) { // Always require adopt-existing-resources flag (no viper) deployFlags.BoolVar(&pkgConfig.DeployOpts.AdoptExistingResources, "adopt-existing-resources", false, lang.CmdPackageDeployFlagAdoptExistingResources) - deployFlags.BoolVar(&pkgConfig.DeployOpts.SkipWebhooks, "skip-webhooks", v.GetBool(common.VPkgDeploySkipWebhooks), lang.CmdPackageDeployFlagSkipWebhooks) - deployFlags.DurationVar(&pkgConfig.DeployOpts.Timeout, "timeout", v.GetDuration(common.VPkgDeployTimeout), lang.CmdPackageDeployFlagTimeout) + deployFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries) deployFlags.StringToStringVar(&pkgConfig.PkgOpts.SetVariables, "set", v.GetStringMapString(common.VPkgDeploySet), lang.CmdPackageDeployFlagSet) deployFlags.StringVar(&pkgConfig.PkgOpts.OptionalComponents, "components", v.GetString(common.VPkgDeployComponents), lang.CmdPackageDeployFlagComponents) deployFlags.StringVar(&pkgConfig.PkgOpts.Shasum, "shasum", v.GetString(common.VPkgDeployShasum), lang.CmdPackageDeployFlagShasum) @@ -403,6 +404,7 @@ func bindMirrorFlags(v *viper.Viper) { mirrorFlags.BoolVar(&pkgConfig.MirrorOpts.NoImgChecksum, "no-img-checksum", false, lang.CmdPackageMirrorFlagNoChecksum) + mirrorFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries) mirrorFlags.StringVar(&pkgConfig.PkgOpts.OptionalComponents, "components", v.GetString(common.VPkgDeployComponents), lang.CmdPackageMirrorFlagComponents) // Flags for using an external Git server diff --git a/src/config/config.go b/src/config/config.go index df4deb9030..309e21bcda 100644 --- a/src/config/config.go +++ b/src/config/config.go @@ -99,8 +99,11 @@ var ( operationStartTime = time.Now().Unix() dataInjectionMarker = ".zarf-injection-%d" - ZarfDefaultCachePath = filepath.Join("~", ".zarf-cache") - ZarfDefaultHelmTimeout = 15 * time.Minute + ZarfDefaultCachePath = filepath.Join("~", ".zarf-cache") + + // Default Time Vars + ZarfDefaultTimeout = 15 * time.Minute + ZarfDefaultRetries = 3 ) // GetArch returns the arch based on a priority list with options for overriding. diff --git a/src/config/lang/english.go b/src/config/lang/english.go index 14e3b19887..e86da293a9 100644 --- a/src/config/lang/english.go +++ b/src/config/lang/english.go @@ -232,6 +232,7 @@ $ zarf init --artifact-push-password={PASSWORD} --artifact-push-username={USERNA CmdPackageShort = "Zarf package commands for creating, deploying, and inspecting packages" CmdPackageFlagConcurrency = "Number of concurrent layer operations to perform when interacting with a remote package." CmdPackageFlagFlagPublicKey = "Path to public key file for validating signed packages" + CmdPackageFlagRetries = "Number of retries to perform for Zarf deploy operations like git/image pushes or Helm installs" CmdPackageCreateShort = "Creates a Zarf package from a given directory or the current directory" CmdPackageCreateLong = "Builds an archive of resources and dependencies defined by the 'zarf.yaml' in the specified directory.\n" + diff --git a/src/internal/packager/helm/chart.go b/src/internal/packager/helm/chart.go index a52c51967f..7b82e8b39c 100644 --- a/src/internal/packager/helm/chart.go +++ b/src/internal/packager/helm/chart.go @@ -5,9 +5,12 @@ package helm import ( + "errors" "fmt" "time" + "github.com/defenseunicorns/zarf/src/pkg/utils/helpers" + "github.com/Masterminds/semver/v3" "github.com/defenseunicorns/zarf/src/config" "github.com/defenseunicorns/zarf/src/types" @@ -24,9 +27,6 @@ import ( "helm.sh/helm/v3/pkg/storage/driver" ) -// Set the default number of Helm install/upgrade attempts to 3 -const defaultHelmAttempts = 3 - // InstallOrUpgradeChart performs a helm install of the given chart. func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) { fromMessage := h.chart.URL @@ -63,49 +63,14 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) { return nil, "", fmt.Errorf("unable to create helm renderer: %w", err) } - attempt := 0 - for { - attempt++ - - histClient := action.NewHistory(h.actionConfig) + histClient := action.NewHistory(h.actionConfig) + tryHelm := func() error { + var err error releases, histErr := histClient.Run(h.chart.ReleaseName) - if attempt > 3 { - previouslyDeployedVersion := 0 - - // Check for previous releases that successfully deployed - for _, release := range releases { - if release.Info.Status == "deployed" { - previouslyDeployedVersion = release.Version - } - } - - // On total failure try to rollback (if there was a previously deployed version) or uninstall. - if previouslyDeployedVersion > 0 { - spinner.Updatef("Performing chart rollback") - - err = h.rollbackChart(h.chart.ReleaseName, previouslyDeployedVersion) - if err != nil { - return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts and unable to rollback: %w", defaultHelmAttempts, err) - } - - return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts", defaultHelmAttempts) - } - - spinner.Updatef("Performing chart uninstall") - _, err = h.uninstallChart(h.chart.ReleaseName) - if err != nil { - return nil, "", fmt.Errorf("unable to install chart after %d attempts and unable to uninstall: %w", defaultHelmAttempts, err) - } - - return nil, "", fmt.Errorf("unable to install chart after %d attempts", defaultHelmAttempts) - } - - spinner.Updatef("Attempt %d of %d to install chart", attempt, defaultHelmAttempts) - spinner.Updatef("Checking for existing helm deployment") - if histErr == driver.ErrReleaseNotFound { + if errors.Is(histErr, driver.ErrReleaseNotFound) { // No prior release, try to install it. spinner.Updatef("Attempting chart installation") @@ -119,19 +84,50 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) { output, err = h.upgradeChart(lastRelease, postRender) } else { // 😭 things aren't working - return nil, "", fmt.Errorf("unable to verify the chart installation status: %w", histErr) + return fmt.Errorf("unable to verify the chart installation status: %w", histErr) } if err != nil { - message.Warnf("Unable to complete helm chart install/upgrade, waiting 10 seconds and trying again: %s", err.Error()) + return fmt.Errorf("unable to complete the helm chart install/upgrade: %w", err) + } - // Simply wait for dust to settle and try again. - time.Sleep(10 * time.Second) - } else { - message.Debug(output.Info.Description) - spinner.Success() - break + message.Debug(output.Info.Description) + spinner.Success() + return nil + } + + err = helpers.Retry(tryHelm, h.retries, 5*time.Second, message.Warnf) + if err != nil { + // Try to rollback any deployed releases + releases, _ := histClient.Run(h.chart.ReleaseName) + previouslyDeployedVersion := 0 + + // Check for previous releases that successfully deployed + for _, release := range releases { + if release.Info.Status == "deployed" { + previouslyDeployedVersion = release.Version + } } + + // On total failure try to rollback (if there was a previously deployed version) or uninstall. + if previouslyDeployedVersion > 0 { + spinner.Updatef("Performing chart rollback") + + err = h.rollbackChart(h.chart.ReleaseName, previouslyDeployedVersion) + if err != nil { + return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts and unable to rollback: %w", h.retries, err) + } + + return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts", h.retries) + } + + spinner.Updatef("Performing chart uninstall") + _, err = h.uninstallChart(h.chart.ReleaseName) + if err != nil { + return nil, "", fmt.Errorf("unable to install chart after %d attempts and unable to uninstall: %w", h.retries, err) + } + + return nil, "", fmt.Errorf("unable to install chart after %d attempts", h.retries) } // return any collected connect strings for zarf connect. diff --git a/src/internal/packager/helm/common.go b/src/internal/packager/helm/common.go index 9b2dd729a2..cde9bd6992 100644 --- a/src/internal/packager/helm/common.go +++ b/src/internal/packager/helm/common.go @@ -33,6 +33,7 @@ type Helm struct { component types.ZarfComponent cluster *cluster.Cluster timeout time.Duration + retries int kubeVersion string @@ -52,7 +53,7 @@ func New(chart types.ZarfChart, chartPath string, valuesPath string, mods ...Mod chart: chart, chartPath: chartPath, valuesPath: valuesPath, - timeout: config.ZarfDefaultHelmTimeout, + timeout: config.ZarfDefaultTimeout, } for _, mod := range mods { @@ -67,7 +68,8 @@ func NewClusterOnly(cfg *types.PackagerConfig, cluster *cluster.Cluster) *Helm { return &Helm{ cfg: cfg, cluster: cluster, - timeout: config.ZarfDefaultHelmTimeout, + timeout: config.ZarfDefaultTimeout, + retries: config.ZarfDefaultRetries, } } @@ -118,7 +120,7 @@ func NewFromZarfManifest(manifest types.ZarfManifest, manifestPath, packageName, NoWait: manifest.NoWait, }, chartOverride: tmpChart, - timeout: config.ZarfDefaultHelmTimeout, + timeout: config.ZarfDefaultTimeout, } for _, mod := range mods { @@ -131,13 +133,14 @@ func NewFromZarfManifest(manifest types.ZarfManifest, manifestPath, packageName, } // WithDeployInfo adds the necessary information to deploy a given chart -func WithDeployInfo(component types.ZarfComponent, cfg *types.PackagerConfig, cluster *cluster.Cluster, valuesOverrides map[string]any, timeout time.Duration) Modifier { +func WithDeployInfo(component types.ZarfComponent, cfg *types.PackagerConfig, cluster *cluster.Cluster, valuesOverrides map[string]any, timeout time.Duration, retries int) Modifier { return func(h *Helm) { h.component = component h.cfg = cfg h.cluster = cluster h.valuesOverrides = valuesOverrides h.timeout = timeout + h.retries = retries } } diff --git a/src/pkg/packager/create_stages.go b/src/pkg/packager/create_stages.go index bf4dffedfe..707dcb9a9e 100644 --- a/src/pkg/packager/create_stages.go +++ b/src/pkg/packager/create_stages.go @@ -161,8 +161,8 @@ func (p *Packager) assemble() error { return err } - if err := helpers.Retry(doPull, 3, 5*time.Second, message.Warnf); err != nil { - return fmt.Errorf("unable to pull images after 3 attempts: %w", err) + if err := helpers.Retry(doPull, p.cfg.PkgOpts.Retries, 5*time.Second, message.Warnf); err != nil { + return fmt.Errorf("unable to pull images after %d attempts: %w", p.cfg.PkgOpts.Retries, err) } for _, imgInfo := range pulled { diff --git a/src/pkg/packager/deploy.go b/src/pkg/packager/deploy.go index 585b50de14..2d27174cdd 100644 --- a/src/pkg/packager/deploy.go +++ b/src/pkg/packager/deploy.go @@ -470,7 +470,7 @@ func (p *Packager) pushImagesToRegistry(componentImages []string, noImgChecksum return helpers.Retry(func() error { return imgConfig.PushToZarfRegistry() - }, 3, 5*time.Second, message.Warnf) + }, p.cfg.PkgOpts.Retries, 5*time.Second, message.Warnf) } // Push all of the components git repos to the configured git server. @@ -511,8 +511,8 @@ func (p *Packager) pushReposToRepository(reposPath string, repos []string) error return gitClient.PushRepo(repoURL, reposPath) } - // Try repo push up to 3 times - if err := helpers.Retry(tryPush, 3, 5*time.Second, message.Warnf); err != nil { + // Try repo push up to retry limit + if err := helpers.Retry(tryPush, p.cfg.PkgOpts.Retries, 5*time.Second, message.Warnf); err != nil { return fmt.Errorf("unable to push repo %s to the Git Server: %w", repoURL, err) } } @@ -549,7 +549,8 @@ func (p *Packager) installChartAndManifests(componentPaths *layout.ComponentPath p.cfg, p.cluster, valuesOverrides, - p.cfg.DeployOpts.Timeout), + p.cfg.DeployOpts.Timeout, + p.cfg.PkgOpts.Retries), ) addedConnectStrings, installedChartName, err := helmCfg.InstallOrUpgradeChart() @@ -596,7 +597,8 @@ func (p *Packager) installChartAndManifests(componentPaths *layout.ComponentPath p.cfg, p.cluster, nil, - p.cfg.DeployOpts.Timeout), + p.cfg.DeployOpts.Timeout, + p.cfg.PkgOpts.Retries), ) if err != nil { return installedCharts, err diff --git a/src/pkg/utils/helpers/misc.go b/src/pkg/utils/helpers/misc.go index 9b6fefbbd8..f0129c710f 100644 --- a/src/pkg/utils/helpers/misc.go +++ b/src/pkg/utils/helpers/misc.go @@ -6,22 +6,27 @@ package helpers import ( "fmt" + "math" "reflect" "regexp" "time" ) -// Retry will retry a function until it succeeds or the timeout is reached, timeout == retries * delay. -func Retry(fn func() error, retries int, delay time.Duration, logger func(format string, args ...any)) (err error) { +// Retry will retry a function until it succeeds or the timeout is reached. timeout == 2^attempt * delay. +func Retry(fn func() error, retries int, delay time.Duration, logger func(format string, args ...any)) error { + var err error for r := 0; r < retries; r++ { err = fn() if err == nil { break } - logger("Retrying (%d/%d): %s", r+1, retries, err.Error()) + pow := math.Pow(2, float64(r)) + backoff := delay * time.Duration(pow) - time.Sleep(delay) + logger("Retrying (%d/%d) in %s: %s", r+1, retries, backoff, err.Error()) + + time.Sleep(backoff) } return err diff --git a/src/types/runtime.go b/src/types/runtime.go index 3878bb0aef..becf8ec56c 100644 --- a/src/types/runtime.go +++ b/src/types/runtime.go @@ -43,6 +43,7 @@ type ZarfPackageOptions struct { SGetKeyPath string `json:"sGetKeyPath" jsonschema:"description=Location where the public key component of a cosign key-pair can be found"` SetVariables map[string]string `json:"setVariables" jsonschema:"description=Key-Value map of variable names and their corresponding values that will be used to template manifests and files in the Zarf package"` PublicKeyPath string `json:"publicKeyPath" jsonschema:"description=Location where the public key component of a cosign key-pair can be found"` + Retries int `json:"retries" jsonschema:"description=The number of retries to perform for Zarf deploy operations like image pushes or Helm installs"` } // ZarfInspectOptions tracks the user-defined preferences during a package inspection.