Skip to content

Commit

Permalink
Merge pull request #61 from seaneagan/transient-error-recovery
Browse files Browse the repository at this point in the history
More graceful failure recovery
  • Loading branch information
hiddeco authored Sep 1, 2020
2 parents 0cccc5e + 0d64e8d commit 3413c76
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 98 deletions.
62 changes: 40 additions & 22 deletions api/v2alpha1/helmrelease_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -563,10 +563,18 @@ func (in Uninstall) GetTimeout(defaultTimeout metav1.Duration) metav1.Duration {

// HelmReleaseStatus defines the observed state of HelmRelease
type HelmReleaseStatus struct {
// ObservedGeneration is the last reconciled generation.
// ObservedGeneration is the last observed generation.
// +optional
ObservedGeneration int64 `json:"observedGeneration,omitempty"`

// ObservedStateReconciled represents whether the observed state has been successfully reconciled.
// +optional
ObservedStateReconciled bool `json:"observedStateReconciled,omitempty"`

// LastObservedTime is the last time at which the HelmRelease was observed.
// +optional
LastObservedTime metav1.Time `json:"lastObservedTime,omitempty"`

// Conditions holds the conditions for the HelmRelease.
// +optional
Conditions []Condition `json:"conditions,omitempty"`
Expand All @@ -592,15 +600,18 @@ type HelmReleaseStatus struct {
// +optional
HelmChart string `json:"helmChart,omitempty"`

// Failures is the reconciliation failure count.
// Failures is the reconciliation failure count against the latest observed state.
// It is reset after a successful reconciliation.
// +optional
Failures int64 `json:"failures,omitempty"`

// InstallFailures is the install failure count.
// InstallFailures is the install failure count against the latest observed state.
// It is reset after a successful reconciliation.
// +optional
InstallFailures int64 `json:"installFailures,omitempty"`

// UpgradeFailures is the upgrade failure count.
// UpgradeFailures is the upgrade failure count against the latest observed state.
// It is reset after a successful reconciliation.
// +optional
UpgradeFailures int64 `json:"upgradeFailures,omitempty"`
}
Expand All @@ -617,27 +628,13 @@ func (in HelmReleaseStatus) GetHelmChart() (string, string) {
// HelmReleaseProgressing registers progress toward reconciling the given
// HelmRelease. It resets all failure counts, marks the observed state as not
// yet reconciled, discards every existing condition, and sets the
// ReadyCondition to ConditionUnknown with ProgressingReason.
//
// hr is received by value and the modified copy is returned.
func HelmReleaseProgressing(hr HelmRelease) HelmRelease {
	// resetFailureCounts zeroes Failures, InstallFailures and UpgradeFailures,
	// so the counters are not additionally cleared inline here.
	resetFailureCounts(&hr)
	hr.Status.ObservedStateReconciled = false
	// Start again from an empty (non-nil) condition list before recording the
	// progressing condition.
	hr.Status.Conditions = []Condition{}
	SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionUnknown, ProgressingReason, "reconciliation in progress")
	return hr
}

// SetHelmReleaseCondition sets the given condition with the given status, reason and message
// on the HelmRelease.
//
// The current conditions are first passed through filterOutCondition together
// with the condition type (presumably dropping any existing entry of that
// type; confirm against filterOutCondition), after which a new Condition is
// appended whose LastTransitionTime is the time of this call.
func SetHelmReleaseCondition(hr *HelmRelease, condition string, status corev1.ConditionStatus, reason, message string) {
hr.Status.Conditions = filterOutCondition(hr.Status.Conditions, condition)
hr.Status.Conditions = append(hr.Status.Conditions, Condition{
Type: condition,
Status: status,
// Stamped on every call, including when status is unchanged from a previous call.
LastTransitionTime: metav1.Now(),
Reason: reason,
Message: message,
})
}

// HelmReleaseNotReady registers a failed release attempt of the given HelmRelease.
func HelmReleaseNotReady(hr HelmRelease, reason, message string) HelmRelease {
SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionFalse, reason, message)
Expand All @@ -646,9 +643,11 @@ func HelmReleaseNotReady(hr HelmRelease, reason, message string) HelmRelease {
}

// HelmReleaseReady registers a successful release attempt of the given HelmRelease.
func HelmReleaseReady(hr HelmRelease, reason, message string) HelmRelease {
SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionTrue, reason, message)
func HelmReleaseReady(hr HelmRelease) HelmRelease {
resetFailureCounts(&hr)
hr.Status.ObservedStateReconciled = true
hr.Status.LastAppliedRevision = hr.Status.LastAttemptedRevision
SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionTrue, ReconciliationSucceededReason, "release reconciliation succeeded")
return hr
}

Expand All @@ -665,6 +664,25 @@ func HelmReleaseAttempted(hr HelmRelease, revision string, releaseRevision int,
return hr, changed
}

// resetFailureCounts zeroes the reconciliation, install and upgrade failure
// counters held in the status of the given HelmRelease.
func resetFailureCounts(hr *HelmRelease) {
	hr.Status.Failures, hr.Status.InstallFailures, hr.Status.UpgradeFailures = 0, 0, 0
}

// SetHelmReleaseCondition records a condition of the given type on the
// HelmRelease, carrying the supplied status, reason and message.
//
// The existing conditions are run through filterOutCondition for the given
// type (presumably removing a previous entry of that type; confirm against
// filterOutCondition), and a fresh Condition stamped with the current time as
// LastTransitionTime is appended to the result.
func SetHelmReleaseCondition(hr *HelmRelease, condition string, status corev1.ConditionStatus, reason, message string) {
	remaining := filterOutCondition(hr.Status.Conditions, condition)
	updated := Condition{
		Type:               condition,
		Status:             status,
		LastTransitionTime: metav1.Now(),
		Reason:             reason,
		Message:            message,
	}
	hr.Status.Conditions = append(remaining, updated)
}

const (
// ReconcileAtAnnotation is the annotation used for triggering a
// reconciliation outside of the defined schedule.
Expand Down
1 change: 1 addition & 0 deletions api/v2alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 16 additions & 4 deletions config/crd/bases/helm.toolkit.fluxcd.io_helmreleases.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -384,15 +384,17 @@ spec:
type: object
type: array
failures:
description: Failures is the reconciliation failure count.
description: Failures is the reconciliation failure count against
the latest observed state. It is reset after a successful reconciliation.
format: int64
type: integer
helmChart:
description: HelmChart is the namespaced name of the HelmChart resource
created by the controller for the HelmRelease.
type: string
installFailures:
description: InstallFailures is the install failure count.
description: InstallFailures is the install failure count against
the latest observed state. It is reset after a successful reconciliation.
format: int64
type: integer
lastAppliedRevision:
Expand All @@ -407,16 +409,26 @@ spec:
description: LastAttemptedValuesChecksum is the SHA1 checksum of the
values of the last reconciliation attempt.
type: string
lastObservedTime:
description: LastObservedTime is the last time at which the HelmRelease
was observed.
format: date-time
type: string
lastReleaseRevision:
description: LastReleaseRevision is the revision of the last successful
Helm release.
type: integer
observedGeneration:
description: ObservedGeneration is the last reconciled generation.
description: ObservedGeneration is the last observed generation.
format: int64
type: integer
observedStateReconciled:
description: ObservedStateReconciled represents whether the observed
state has been successfully reconciled.
type: boolean
upgradeFailures:
description: UpgradeFailures is the upgrade failure count.
description: UpgradeFailures is the upgrade failure count against
the latest observed state. It is reset after a successful reconciliation.
format: int64
type: integer
type: object
Expand Down
131 changes: 63 additions & 68 deletions controllers/helmrelease_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,27 +125,38 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)
return ctrl.Result{}, nil
}

if hr.Spec.Suspend {
msg := "HelmRelease is suspended, skipping reconciliation"
hr = v2.HelmReleaseNotReady(hr, v2.SuspendedReason, msg)
if err := r.Status().Update(ctx, &hr); err != nil {
log.Error(err, "unable to update status")
return ctrl.Result{Requeue: true}, err
}
log.Info(msg)
return ctrl.Result{}, nil
hr, result, err := r.reconcile(ctx, log, hr)

// Update status after reconciliation.
if updateStatusErr := r.updateStatus(ctx, &hr); updateStatusErr != nil {
log.Error(updateStatusErr, "unable to update status after reconciliation")
return ctrl.Result{Requeue: true}, updateStatusErr
}

// Observe the HelmRelease generation.
hasNewGeneration := hr.Status.ObservedGeneration != hr.Generation
if hasNewGeneration {
// Log reconciliation duration
log.Info(fmt.Sprintf("reconcilation finished in %s, next run in %s",
time.Now().Sub(start).String(),
hr.Spec.Interval.Duration.String(),
))

return result, err
}

func (r *HelmReleaseReconciler) reconcile(ctx context.Context, log logr.Logger, hr v2.HelmRelease) (v2.HelmRelease, ctrl.Result, error) {
// Observe HelmRelease generation.
if hr.Status.ObservedGeneration != hr.Generation {
hr.Status.ObservedGeneration = hr.Generation
hr = v2.HelmReleaseProgressing(hr)
if updateStatusErr := r.updateStatus(ctx, &hr); updateStatusErr != nil {
log.Error(updateStatusErr, "unable to update status after generation update")
return hr, ctrl.Result{Requeue: true}, updateStatusErr
}
}

if err := r.Status().Update(ctx, &hr); err != nil {
log.Error(err, "unable to update status")
return ctrl.Result{Requeue: true}, err
if hr.Spec.Suspend {
msg := "HelmRelease is suspended, skipping reconciliation"
log.Info(msg)
return v2.HelmReleaseNotReady(hr, v2.SuspendedReason, msg), ctrl.Result{}, nil
}

// Reconcile chart based on the HelmChartTemplate
Expand All @@ -159,25 +170,15 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)
msg = "HelmChart is not ready"
r.event(hr, hr.Status.LastAttemptedRevision, recorder.EventSeverityInfo, msg)
}
hr = v2.HelmReleaseNotReady(hr, v2.ArtifactFailedReason, msg)
if err := r.Status().Update(ctx, &hr); err != nil {
log.Error(err, "unable to update status")
return ctrl.Result{Requeue: true}, err
}
return ctrl.Result{}, reconcileErr
return v2.HelmReleaseNotReady(hr, v2.ArtifactFailedReason, msg), ctrl.Result{}, reconcileErr
}

// Check chart artifact readiness
if hc.GetArtifact() == nil {
msg := "HelmChart is not ready"
hr = v2.HelmReleaseNotReady(hr, v2.ArtifactFailedReason, msg)
r.event(hr, hr.Status.LastAttemptedRevision, recorder.EventSeverityInfo, msg)
log.Info(msg)
if err := r.Status().Update(ctx, &hr); err != nil {
log.Error(err, "unable to update status")
return ctrl.Result{Requeue: true}, err
}
return ctrl.Result{}, nil
return v2.HelmReleaseNotReady(hr, v2.ArtifactFailedReason, msg), ctrl.Result{}, nil
}

// Check dependencies
Expand All @@ -187,47 +188,26 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)
r.event(hr, hc.GetArtifact().Revision, recorder.EventSeverityInfo, msg)
log.Info(msg)

hr = v2.HelmReleaseNotReady(hr, v2.DependencyNotReadyReason, err.Error())
if err := r.Status().Update(ctx, &hr); err != nil {
log.Error(err, "unable to update status")
return ctrl.Result{Requeue: true}, err
}
// Exponential backoff would cause execution to be prolonged too much,
// instead we requeue on a fixed interval.
return ctrl.Result{RequeueAfter: r.requeueDependency}, nil
return v2.HelmReleaseNotReady(hr, v2.DependencyNotReadyReason, err.Error()), ctrl.Result{RequeueAfter: r.requeueDependency}, nil
}
log.Info("all dependencies are ready, proceeding with release")
}

// Compose values
values, err := r.composeValues(ctx, hr)
if err != nil {
hr = v2.HelmReleaseNotReady(hr, v2.InitFailedReason, err.Error())
r.event(hr, hr.Status.LastAttemptedRevision, recorder.EventSeverityError, err.Error())
if err := r.Status().Update(ctx, &hr); err != nil {
log.Error(err, "unable to update status")
return ctrl.Result{Requeue: true}, err
}
return ctrl.Result{}, nil
return v2.HelmReleaseNotReady(hr, v2.InitFailedReason, err.Error()), ctrl.Result{}, nil
}

reconciledHr, reconcileErr := r.release(log, *hr.DeepCopy(), hc, values, hasNewGeneration)
reconciledHr, reconcileErr := r.release(ctx, log, *hr.DeepCopy(), hc, values)
if reconcileErr != nil {
r.event(hr, hc.GetArtifact().Revision, recorder.EventSeverityError, fmt.Sprintf("reconciliation failed: %s", reconcileErr.Error()))
}

if err := r.Status().Update(ctx, &reconciledHr); err != nil {
log.Error(err, "unable to update status after reconciliation")
return ctrl.Result{Requeue: true}, err
}

// Log reconciliation duration
log.Info(fmt.Sprintf("reconcilation finished in %s, next run in %s",
time.Now().Sub(start).String(),
hr.Spec.Interval.Duration.String(),
))

return ctrl.Result{RequeueAfter: hr.Spec.Interval.Duration}, reconcileErr
return reconciledHr, ctrl.Result{RequeueAfter: hr.Spec.Interval.Duration}, reconcileErr
}

type HelmReleaseReconcilerOptions struct {
Expand Down Expand Up @@ -295,7 +275,7 @@ func (r *HelmReleaseReconciler) reconcileChart(ctx context.Context, hr *v2.HelmR
return &helmChart, true, nil
}

func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, source sourcev1.Source, values chartutil.Values, hasNewGeneration bool) (v2.HelmRelease, error) {
func (r *HelmReleaseReconciler) release(ctx context.Context, log logr.Logger, hr v2.HelmRelease, source sourcev1.Source, values chartutil.Values) (v2.HelmRelease, error) {
// Acquire lock
unlock, err := lock(fmt.Sprintf("%s-%s", hr.GetName(), hr.GetNamespace()))
if err != nil {
Expand Down Expand Up @@ -342,20 +322,29 @@ func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, sour
hr, hasNewState := v2.HelmReleaseAttempted(hr, revision, releaseRevision, valuesChecksum)
if hasNewState {
hr = v2.HelmReleaseProgressing(hr)
if updateStatusErr := r.updateStatus(ctx, &hr); updateStatusErr != nil {
log.Error(updateStatusErr, "unable to update status after state update")
return hr, updateStatusErr
}
}

// Determine release deployment action.
var deployAction v2.DeploymentAction
switch {
// Install if there is none.
// Install if there is no release.
case rel == nil:
deployAction = hr.Spec.GetInstall()
// Upgrade if there is a new generation, new state, or this is an upgrade retry.
case hasNewGeneration || hasNewState || hr.Spec.GetUpgrade().GetRemediation().GetFailureCount(hr) > 0:
deployAction = hr.Spec.GetUpgrade()
// Otherwise no action needed.
// Fail if the release was due to a failed install (which was not uninstalled).
// The uninstall may have failed, or was not needed due to retries being exhausted
// and remediateLastFailure being false.
case hr.Spec.GetInstall().GetRemediation().GetFailureCount(hr) > 0:
return hr, fmt.Errorf("last install failed but was not uninstalled")
// Skip and mark ready if the observed state was already reconciled.
case hr.Status.ObservedStateReconciled:
return v2.HelmReleaseReady(hr), nil
// Otherwise upgrade.
default:
return hr, nil
deployAction = hr.Spec.GetUpgrade()
}

// Check if retries exhausted.
Expand Down Expand Up @@ -405,17 +394,18 @@ func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, sour
err = uninstallConditionErr
}
}
}
}

// Determine release revision after deployment/remediation.
rel, observeLastReleaseErr = observeLastRelease(cfg, hr)
if observeLastReleaseErr != nil {
err = &ConditionError{
Reason: v2.GetLastReleaseFailedReason,
Err: errors.New("failed to get last release revision after deployment/remediation"),
// Determine release after remediation.
rel, observeLastReleaseErr = observeLastRelease(cfg, hr)
if observeLastReleaseErr != nil {
err = &ConditionError{
Reason: v2.GetLastReleaseFailedReason,
Err: errors.New("failed to get last release revision after remediation"),
}
}
}
}

hr.Status.LastReleaseRevision = getReleaseRevision(rel)

if err != nil {
Expand All @@ -426,7 +416,12 @@ func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, sour
}
return v2.HelmReleaseNotReady(hr, reason, err.Error()), err
}
return v2.HelmReleaseReady(hr, v2.ReconciliationSucceededReason, "release reconciliation succeeded"), nil
return v2.HelmReleaseReady(hr), nil
}

// updateStatus stamps the status of the given HelmRelease with the current
// time as LastObservedTime and then submits it via r.Status().Update,
// returning whatever error that call yields.
func (r *HelmReleaseReconciler) updateStatus(ctx context.Context, hr *v2.HelmRelease) error {
	observedAt := v1.Now()
	hr.Status.LastObservedTime = observedAt

	statusWriter := r.Status()
	return statusWriter.Update(ctx, hr)
}

func (r *HelmReleaseReconciler) checkDependencies(hr v2.HelmRelease) error {
Expand Down
Loading

0 comments on commit 3413c76

Please sign in to comment.