Skip to content

Commit

Permalink
More graceful failure recovery
Browse files Browse the repository at this point in the history
- Ensure upgrade actually occurs if known state was not reached
  for any reason (other than install failure).
- After transient failures not tied to new state application, ensure
  spurious upgrades do not occur and ready state is again reached,
  by remembering that the known state was already successfully applied.
- Reset failure counts after success so they're not stale.
- Only look up the post-deployment release revision on remediation,
  since otherwise we already have it.
- Push status update after finding new state so user can observe.
  • Loading branch information
seaneagan committed Aug 31, 2020
1 parent 0cccc5e commit be9e22f
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 46 deletions.
56 changes: 35 additions & 21 deletions api/v2alpha1/helmrelease_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,10 @@ type HelmReleaseStatus struct {
// +optional
Conditions []Condition `json:"conditions,omitempty"`

// KnownStateApplied represents whether the known state has been successfully applied.
// +optional
KnownStateApplied bool `json:"knownStateApplied,omitempty"`

// LastAppliedRevision is the revision of the last successfully applied source.
// +optional
LastAppliedRevision string `json:"lastAppliedRevision,omitempty"`
Expand All @@ -592,15 +596,18 @@ type HelmReleaseStatus struct {
// +optional
HelmChart string `json:"helmChart,omitempty"`

// Failures is the reconciliation failure count.
// Failures is the reconciliation failure count against the known state.
// It is reset after a successful reconciliation.
// +optional
Failures int64 `json:"failures,omitempty"`

// InstallFailures is the install failure count.
// InstallFailures is the install failure count against the known state.
// It is reset after a successful reconciliation.
// +optional
InstallFailures int64 `json:"installFailures,omitempty"`

// UpgradeFailures is the upgrade failure count.
// UpgradeFailures is the upgrade failure count against the known state.
// It is reset after a successful reconciliation.
// +optional
UpgradeFailures int64 `json:"upgradeFailures,omitempty"`
}
Expand All @@ -617,27 +624,13 @@ func (in HelmReleaseStatus) GetHelmChart() (string, string) {
// HelmReleaseProgressing registers progress toward reconciling the given HelmRelease.
// It zeroes the Failures, InstallFailures and UpgradeFailures counters, sets
// KnownStateApplied to false, discards all existing conditions, and then sets the
// ReadyCondition to ConditionUnknown for ProgressingReason.
// The HelmRelease is taken and returned by value, so callers must use the returned copy.
func HelmReleaseProgressing(hr HelmRelease) HelmRelease {
hr.Status.Failures = 0
hr.Status.InstallFailures = 0
hr.Status.UpgradeFailures = 0
// resetFailureCounts zeroes the same three counters as the assignments above.
// NOTE(review): the inline assignments look like the pre-change side of a diff hunk — confirm.
resetFailureCounts(&hr)
// A new attempt is starting, so the known state is not (yet) recorded as applied.
hr.Status.KnownStateApplied = false
// Start from an empty condition list; the Ready condition is the only one set below.
hr.Status.Conditions = []Condition{}
SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionUnknown, ProgressingReason, "reconciliation in progress")
return hr
}

// SetHelmReleaseCondition records a condition of the given type on the
// HelmRelease, carrying the given status, reason and message.
// The current conditions are first passed through filterOutCondition for that
// type, and the new condition is appended with the current time as its
// LastTransitionTime.
func SetHelmReleaseCondition(hr *HelmRelease, condition string, status corev1.ConditionStatus, reason, message string) {
	remaining := filterOutCondition(hr.Status.Conditions, condition)
	updated := Condition{
		Type:               condition,
		Status:             status,
		LastTransitionTime: metav1.Now(),
		Reason:             reason,
		Message:            message,
	}
	hr.Status.Conditions = append(remaining, updated)
}

// HelmReleaseNotReady registers a failed release attempt of the given HelmRelease.
func HelmReleaseNotReady(hr HelmRelease, reason, message string) HelmRelease {
SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionFalse, reason, message)
Expand All @@ -646,9 +639,11 @@ func HelmReleaseNotReady(hr HelmRelease, reason, message string) HelmRelease {
}

// HelmReleaseReady registers a successful release attempt of the given HelmRelease.
// Two signatures appear below.
// NOTE(review): these look like the before/after sides of a diff hunk — confirm which is current.
// The first takes a caller-supplied reason and message for the Ready condition.
func HelmReleaseReady(hr HelmRelease, reason, message string) HelmRelease {
SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionTrue, reason, message)
// The second takes only the HelmRelease: it resets the failure counts, sets
// KnownStateApplied to true, copies LastAttemptedRevision into LastAppliedRevision,
// and sets the ReadyCondition to ConditionTrue with ReconciliationSucceededReason.
// The HelmRelease is taken and returned by value, so callers must use the returned copy.
func HelmReleaseReady(hr HelmRelease) HelmRelease {
resetFailureCounts(&hr)
hr.Status.KnownStateApplied = true
// The revision that was just attempted is now the one recorded as applied.
hr.Status.LastAppliedRevision = hr.Status.LastAttemptedRevision
SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionTrue, ReconciliationSucceededReason, "release reconciliation succeeded")
return hr
}

Expand All @@ -665,6 +660,25 @@ func HelmReleaseAttempted(hr HelmRelease, revision string, releaseRevision int,
return hr, changed
}

// resetFailureCounts zeroes the Failures, InstallFailures and UpgradeFailures
// counters in the status of the given HelmRelease, modifying it in place.
func resetFailureCounts(hr *HelmRelease) {
	status := &hr.Status
	status.Failures, status.InstallFailures, status.UpgradeFailures = 0, 0, 0
}

// SetHelmReleaseCondition stores a condition of the given type, with the given
// status, reason and message, on the HelmRelease.
// The existing conditions are run through filterOutCondition for that type
// before the new condition, stamped with the current time, is appended.
func SetHelmReleaseCondition(hr *HelmRelease, condition string, status corev1.ConditionStatus, reason, message string) {
	hr.Status.Conditions = append(
		filterOutCondition(hr.Status.Conditions, condition),
		Condition{
			Type:               condition,
			Status:             status,
			LastTransitionTime: metav1.Now(),
			Reason:             reason,
			Message:            message,
		},
	)
}

const (
// ReconcileAtAnnotation is the annotation used for triggering a
// reconciliation outside of the defined schedule.
Expand Down
13 changes: 10 additions & 3 deletions config/crd/bases/helm.toolkit.fluxcd.io_helmreleases.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -384,17 +384,23 @@ spec:
type: object
type: array
failures:
description: Failures is the reconciliation failure count.
description: Failures is the reconciliation failure count against
the known state. It is reset after a successful reconciliation.
format: int64
type: integer
helmChart:
description: HelmChart is the namespaced name of the HelmChart resource
created by the controller for the HelmRelease.
type: string
installFailures:
description: InstallFailures is the install failure count.
description: InstallFailures is the install failure count against
the known state. It is reset after a successful reconciliation.
format: int64
type: integer
knownStateApplied:
description: KnownStateApplied represents whether the known state
has been successfully applied.
type: boolean
lastAppliedRevision:
description: LastAppliedRevision is the revision of the last successfully
applied source.
Expand All @@ -416,7 +422,8 @@ spec:
format: int64
type: integer
upgradeFailures:
description: UpgradeFailures is the upgrade failure count.
description: UpgradeFailures is the upgrade failure count against
the known state. It is reset after a successful reconciliation.
format: int64
type: integer
type: object
Expand Down
47 changes: 28 additions & 19 deletions controllers/helmrelease_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,7 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)
}

// Observe the HelmRelease generation.
hasNewGeneration := hr.Status.ObservedGeneration != hr.Generation
if hasNewGeneration {
if hr.Status.ObservedGeneration != hr.Generation {
hr.Status.ObservedGeneration = hr.Generation
hr = v2.HelmReleaseProgressing(hr)
}
Expand Down Expand Up @@ -211,7 +210,7 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)
return ctrl.Result{}, nil
}

reconciledHr, reconcileErr := r.release(log, *hr.DeepCopy(), hc, values, hasNewGeneration)
reconciledHr, reconcileErr := r.release(ctx, log, *hr.DeepCopy(), hc, values)
if reconcileErr != nil {
r.event(hr, hc.GetArtifact().Revision, recorder.EventSeverityError, fmt.Sprintf("reconciliation failed: %s", reconcileErr.Error()))
}
Expand Down Expand Up @@ -295,7 +294,7 @@ func (r *HelmReleaseReconciler) reconcileChart(ctx context.Context, hr *v2.HelmR
return &helmChart, true, nil
}

func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, source sourcev1.Source, values chartutil.Values, hasNewGeneration bool) (v2.HelmRelease, error) {
func (r *HelmReleaseReconciler) release(ctx context.Context, log logr.Logger, hr v2.HelmRelease, source sourcev1.Source, values chartutil.Values) (v2.HelmRelease, error) {
// Acquire lock
unlock, err := lock(fmt.Sprintf("%s-%s", hr.GetName(), hr.GetNamespace()))
if err != nil {
Expand Down Expand Up @@ -342,20 +341,29 @@ func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, sour
hr, hasNewState := v2.HelmReleaseAttempted(hr, revision, releaseRevision, valuesChecksum)
if hasNewState {
hr = v2.HelmReleaseProgressing(hr)
if err := r.Status().Update(ctx, &hr); err != nil {
log.Error(err, "unable to update status")
return hr, err
}
}

// Determine release deployment action.
var deployAction v2.DeploymentAction
switch {
// Install if there is none.
// Install if there is no release.
case rel == nil:
deployAction = hr.Spec.GetInstall()
// Upgrade if there is a new generation, new state, or this is an upgrade retry.
case hasNewGeneration || hasNewState || hr.Spec.GetUpgrade().GetRemediation().GetFailureCount(hr) > 0:
deployAction = hr.Spec.GetUpgrade()
// Otherwise no action needed.
// Fail if the release was due to a failed install (which was not uninstalled).
// The uninstall may have failed, or was not needed due to retries being exhausted
// and remediateLastFailure being false.
case hr.Spec.GetInstall().GetRemediation().GetFailureCount(hr) > 0:
return hr, fmt.Errorf("last install failed but was not uninstalled")
// Skip and mark ready if the known state was already applied.
case hr.Status.KnownStateApplied:
return v2.HelmReleaseReady(hr), nil
// Otherwise upgrade.
default:
return hr, nil
deployAction = hr.Spec.GetUpgrade()
}

// Check if retries exhausted.
Expand Down Expand Up @@ -405,17 +413,18 @@ func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, sour
err = uninstallConditionErr
}
}
}
}

// Determine release revision after deployment/remediation.
rel, observeLastReleaseErr = observeLastRelease(cfg, hr)
if observeLastReleaseErr != nil {
err = &ConditionError{
Reason: v2.GetLastReleaseFailedReason,
Err: errors.New("failed to get last release revision after deployment/remediation"),
// Determine release after remediation.
rel, observeLastReleaseErr = observeLastRelease(cfg, hr)
if observeLastReleaseErr != nil {
err = &ConditionError{
Reason: v2.GetLastReleaseFailedReason,
Err: errors.New("failed to get last release revision after remediation"),
}
}
}
}

hr.Status.LastReleaseRevision = getReleaseRevision(rel)

if err != nil {
Expand All @@ -426,7 +435,7 @@ func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, sour
}
return v2.HelmReleaseNotReady(hr, reason, err.Error()), err
}
return v2.HelmReleaseReady(hr, v2.ReconciliationSucceededReason, "release reconciliation succeeded"), nil
return v2.HelmReleaseReady(hr), nil
}

func (r *HelmReleaseReconciler) checkDependencies(hr v2.HelmRelease) error {
Expand Down
21 changes: 18 additions & 3 deletions docs/api/helmrelease.md
Original file line number Diff line number Diff line change
Expand Up @@ -789,6 +789,18 @@ int64
</tr>
<tr>
<td>
<code>knownStateApplied</code><br>
<em>
bool
</em>
</td>
<td>
<em>(Optional)</em>
<p>KnownStateApplied represents whether the known state has been successfully applied.</p>
</td>
</tr>
<tr>
<td>
<code>lastAppliedRevision</code><br>
<em>
string
Expand Down Expand Up @@ -857,7 +869,8 @@ int64
</td>
<td>
<em>(Optional)</em>
<p>Failures is the reconciliation failure count.</p>
<p>Failures is the reconciliation failure count against the known state.
It is reset after a successful reconciliation.</p>
</td>
</tr>
<tr>
Expand All @@ -869,7 +882,8 @@ int64
</td>
<td>
<em>(Optional)</em>
<p>InstallFailures is the install failure count.</p>
<p>InstallFailures is the install failure count against the known state.
It is reset after a successful reconciliation.</p>
</td>
</tr>
<tr>
Expand All @@ -881,7 +895,8 @@ int64
</td>
<td>
<em>(Optional)</em>
<p>UpgradeFailures is the upgrade failure count.</p>
<p>UpgradeFailures is the upgrade failure count against the known state.
It is reset after a successful reconciliation.</p>
</td>
</tr>
</tbody>
Expand Down
1 change: 1 addition & 0 deletions docs/spec/v2alpha1/helmreleases.md
Original file line number Diff line number Diff line change
Expand Up @@ -671,6 +671,7 @@ status:
reason: ReconciliationSucceeded
status: "True"
type: Ready
knownStateApplied: true
lastAppliedRevision: 4.0.6
lastAttemptedRevision: 4.0.6
lastReleaseRevision: 1
Expand Down

0 comments on commit be9e22f

Please sign in to comment.