Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More graceful failure recovery #61

Merged
merged 4 commits into from
Sep 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 40 additions & 22 deletions api/v2alpha1/helmrelease_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -563,10 +563,18 @@ func (in Uninstall) GetTimeout(defaultTimeout metav1.Duration) metav1.Duration {

// HelmReleaseStatus defines the observed state of HelmRelease
type HelmReleaseStatus struct {
// ObservedGeneration is the last reconciled generation.
// ObservedGeneration is the last observed generation.
// +optional
ObservedGeneration int64 `json:"observedGeneration,omitempty"`

// ObservedStateReconciled represents whether the observed state has been successfully reconciled.
// +optional
ObservedStateReconciled bool `json:"observedStateReconciled,omitempty"`
stefanprodan marked this conversation as resolved.
Show resolved Hide resolved

// LastObservedTime is the last time at which the HelmRelease was observed.
// +optional
LastObservedTime metav1.Time `json:"lastObservedTime,omitempty"`

// Conditions holds the conditions for the HelmRelease.
// +optional
Conditions []Condition `json:"conditions,omitempty"`
Expand All @@ -592,15 +600,18 @@ type HelmReleaseStatus struct {
// +optional
HelmChart string `json:"helmChart,omitempty"`

// Failures is the reconciliation failure count.
// Failures is the reconciliation failure count against the latest observed state.
// It is reset after a successful reconciliation.
// +optional
Failures int64 `json:"failures,omitempty"`

// InstallFailures is the install failure count.
// InstallFailures is the install failure count against the latest observed state.
// It is reset after a successful reconciliation.
// +optional
InstallFailures int64 `json:"installFailures,omitempty"`

// UpgradeFailures is the upgrade failure count.
// UpgradeFailures is the upgrade failure count against the latest observed state.
// It is reset after a successful reconciliation.
// +optional
UpgradeFailures int64 `json:"upgradeFailures,omitempty"`
}
Expand All @@ -617,27 +628,13 @@ func (in HelmReleaseStatus) GetHelmChart() (string, string) {
// HelmReleaseProgressing resets any failures and registers progress toward reconciling the given HelmRelease
// by setting the ReadyCondition to ConditionUnknown for ProgressingReason.
func HelmReleaseProgressing(hr HelmRelease) HelmRelease {
hr.Status.Failures = 0
hr.Status.InstallFailures = 0
hr.Status.UpgradeFailures = 0
resetFailureCounts(&hr)
hr.Status.ObservedStateReconciled = false
hr.Status.Conditions = []Condition{}
SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionUnknown, ProgressingReason, "reconciliation in progress")
return hr
}

// SetHelmReleaseCondition records a condition of the given type on the
// HelmRelease, carrying the supplied status, reason and message.
//
// The current conditions are first passed through filterOutCondition for the
// given type; a fresh Condition whose LastTransitionTime is the current time
// is then appended to the result.
func SetHelmReleaseCondition(hr *HelmRelease, condition string, status corev1.ConditionStatus, reason, message string) {
	remaining := filterOutCondition(hr.Status.Conditions, condition)
	updated := Condition{
		Type:               condition,
		Status:             status,
		LastTransitionTime: metav1.Now(),
		Reason:             reason,
		Message:            message,
	}
	hr.Status.Conditions = append(remaining, updated)
}

// HelmReleaseNotReady registers a failed release attempt of the given HelmRelease.
func HelmReleaseNotReady(hr HelmRelease, reason, message string) HelmRelease {
SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionFalse, reason, message)
Expand All @@ -646,9 +643,11 @@ func HelmReleaseNotReady(hr HelmRelease, reason, message string) HelmRelease {
}

// HelmReleaseReady registers a successful release attempt of the given HelmRelease.
func HelmReleaseReady(hr HelmRelease, reason, message string) HelmRelease {
SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionTrue, reason, message)
func HelmReleaseReady(hr HelmRelease) HelmRelease {
resetFailureCounts(&hr)
hr.Status.ObservedStateReconciled = true
hr.Status.LastAppliedRevision = hr.Status.LastAttemptedRevision
SetHelmReleaseCondition(&hr, ReadyCondition, corev1.ConditionTrue, ReconciliationSucceededReason, "release reconciliation succeeded")
return hr
}

Expand All @@ -665,6 +664,25 @@ func HelmReleaseAttempted(hr HelmRelease, revision string, releaseRevision int,
return hr, changed
}

// resetFailureCounts zeroes the Failures, InstallFailures and UpgradeFailures
// counters in the status of the given HelmRelease.
func resetFailureCounts(hr *HelmRelease) {
	status := &hr.Status
	status.Failures, status.InstallFailures, status.UpgradeFailures = 0, 0, 0
}

// SetHelmReleaseCondition records a condition of the given type on the
// HelmRelease, carrying the supplied status, reason and message.
//
// The current conditions are first passed through filterOutCondition for the
// given type; a fresh Condition whose LastTransitionTime is the current time
// is then appended to the result.
func SetHelmReleaseCondition(hr *HelmRelease, condition string, status corev1.ConditionStatus, reason, message string) {
	remaining := filterOutCondition(hr.Status.Conditions, condition)
	updated := Condition{
		Type:               condition,
		Status:             status,
		LastTransitionTime: metav1.Now(),
		Reason:             reason,
		Message:            message,
	}
	hr.Status.Conditions = append(remaining, updated)
}

const (
// ReconcileAtAnnotation is the annotation used for triggering a
// reconciliation outside of the defined schedule.
Expand Down
1 change: 1 addition & 0 deletions api/v2alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 16 additions & 4 deletions config/crd/bases/helm.toolkit.fluxcd.io_helmreleases.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -384,15 +384,17 @@ spec:
type: object
type: array
failures:
description: Failures is the reconciliation failure count.
description: Failures is the reconciliation failure count against
the latest observed state. It is reset after a successful reconciliation.
format: int64
type: integer
helmChart:
description: HelmChart is the namespaced name of the HelmChart resource
created by the controller for the HelmRelease.
type: string
installFailures:
description: InstallFailures is the install failure count.
description: InstallFailures is the install failure count against
the latest observed state. It is reset after a successful reconciliation.
format: int64
type: integer
lastAppliedRevision:
Expand All @@ -407,16 +409,26 @@ spec:
description: LastAttemptedValuesChecksum is the SHA1 checksum of the
values of the last reconciliation attempt.
type: string
lastObservedTime:
description: LastObservedTime is the last time at which the HelmRelease
was observed.
format: date-time
type: string
lastReleaseRevision:
description: LastReleaseRevision is the revision of the last successful
Helm release.
type: integer
observedGeneration:
description: ObservedGeneration is the last reconciled generation.
description: ObservedGeneration is the last observed generation.
format: int64
type: integer
observedStateReconciled:
description: ObservedStateReconciled represents whether the observed
state has been successfully reconciled.
type: boolean
upgradeFailures:
description: UpgradeFailures is the upgrade failure count.
description: UpgradeFailures is the upgrade failure count against
the latest observed state. It is reset after a successful reconciliation.
format: int64
type: integer
type: object
Expand Down
131 changes: 63 additions & 68 deletions controllers/helmrelease_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,27 +125,38 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)
return ctrl.Result{}, nil
}

if hr.Spec.Suspend {
msg := "HelmRelease is suspended, skipping reconciliation"
hr = v2.HelmReleaseNotReady(hr, v2.SuspendedReason, msg)
if err := r.Status().Update(ctx, &hr); err != nil {
log.Error(err, "unable to update status")
return ctrl.Result{Requeue: true}, err
}
log.Info(msg)
return ctrl.Result{}, nil
hr, result, err := r.reconcile(ctx, log, hr)

// Update status after reconciliation.
if updateStatusErr := r.updateStatus(ctx, &hr); updateStatusErr != nil {
log.Error(updateStatusErr, "unable to update status after reconciliation")
return ctrl.Result{Requeue: true}, updateStatusErr
}

// Observe the HelmRelease generation.
hasNewGeneration := hr.Status.ObservedGeneration != hr.Generation
if hasNewGeneration {
// Log reconciliation duration
log.Info(fmt.Sprintf("reconcilation finished in %s, next run in %s",
time.Now().Sub(start).String(),
hr.Spec.Interval.Duration.String(),
))

return result, err
}

func (r *HelmReleaseReconciler) reconcile(ctx context.Context, log logr.Logger, hr v2.HelmRelease) (v2.HelmRelease, ctrl.Result, error) {
// Observe HelmRelease generation.
if hr.Status.ObservedGeneration != hr.Generation {
hr.Status.ObservedGeneration = hr.Generation
hr = v2.HelmReleaseProgressing(hr)
if updateStatusErr := r.updateStatus(ctx, &hr); updateStatusErr != nil {
log.Error(updateStatusErr, "unable to update status after generation update")
return hr, ctrl.Result{Requeue: true}, updateStatusErr
}
}

if err := r.Status().Update(ctx, &hr); err != nil {
log.Error(err, "unable to update status")
return ctrl.Result{Requeue: true}, err
if hr.Spec.Suspend {
msg := "HelmRelease is suspended, skipping reconciliation"
log.Info(msg)
return v2.HelmReleaseNotReady(hr, v2.SuspendedReason, msg), ctrl.Result{}, nil
}

// Reconcile chart based on the HelmChartTemplate
Expand All @@ -159,25 +170,15 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)
msg = "HelmChart is not ready"
r.event(hr, hr.Status.LastAttemptedRevision, recorder.EventSeverityInfo, msg)
}
hr = v2.HelmReleaseNotReady(hr, v2.ArtifactFailedReason, msg)
if err := r.Status().Update(ctx, &hr); err != nil {
log.Error(err, "unable to update status")
return ctrl.Result{Requeue: true}, err
}
return ctrl.Result{}, reconcileErr
return v2.HelmReleaseNotReady(hr, v2.ArtifactFailedReason, msg), ctrl.Result{}, reconcileErr
}

// Check chart artifact readiness
if hc.GetArtifact() == nil {
msg := "HelmChart is not ready"
hr = v2.HelmReleaseNotReady(hr, v2.ArtifactFailedReason, msg)
r.event(hr, hr.Status.LastAttemptedRevision, recorder.EventSeverityInfo, msg)
log.Info(msg)
if err := r.Status().Update(ctx, &hr); err != nil {
log.Error(err, "unable to update status")
return ctrl.Result{Requeue: true}, err
}
return ctrl.Result{}, nil
return v2.HelmReleaseNotReady(hr, v2.ArtifactFailedReason, msg), ctrl.Result{}, nil
}

// Check dependencies
Expand All @@ -187,47 +188,26 @@ func (r *HelmReleaseReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error)
r.event(hr, hc.GetArtifact().Revision, recorder.EventSeverityInfo, msg)
log.Info(msg)

hr = v2.HelmReleaseNotReady(hr, v2.DependencyNotReadyReason, err.Error())
if err := r.Status().Update(ctx, &hr); err != nil {
log.Error(err, "unable to update status")
return ctrl.Result{Requeue: true}, err
}
// Exponential backoff would cause execution to be prolonged too much,
// instead we requeue on a fixed interval.
return ctrl.Result{RequeueAfter: r.requeueDependency}, nil
return v2.HelmReleaseNotReady(hr, v2.DependencyNotReadyReason, err.Error()), ctrl.Result{RequeueAfter: r.requeueDependency}, nil
}
log.Info("all dependencies are ready, proceeding with release")
}

// Compose values
values, err := r.composeValues(ctx, hr)
if err != nil {
hr = v2.HelmReleaseNotReady(hr, v2.InitFailedReason, err.Error())
r.event(hr, hr.Status.LastAttemptedRevision, recorder.EventSeverityError, err.Error())
if err := r.Status().Update(ctx, &hr); err != nil {
log.Error(err, "unable to update status")
return ctrl.Result{Requeue: true}, err
}
return ctrl.Result{}, nil
return v2.HelmReleaseNotReady(hr, v2.InitFailedReason, err.Error()), ctrl.Result{}, nil
}

reconciledHr, reconcileErr := r.release(log, *hr.DeepCopy(), hc, values, hasNewGeneration)
reconciledHr, reconcileErr := r.release(ctx, log, *hr.DeepCopy(), hc, values)
if reconcileErr != nil {
r.event(hr, hc.GetArtifact().Revision, recorder.EventSeverityError, fmt.Sprintf("reconciliation failed: %s", reconcileErr.Error()))
}

if err := r.Status().Update(ctx, &reconciledHr); err != nil {
log.Error(err, "unable to update status after reconciliation")
return ctrl.Result{Requeue: true}, err
}

// Log reconciliation duration
log.Info(fmt.Sprintf("reconcilation finished in %s, next run in %s",
time.Now().Sub(start).String(),
hr.Spec.Interval.Duration.String(),
))

return ctrl.Result{RequeueAfter: hr.Spec.Interval.Duration}, reconcileErr
return reconciledHr, ctrl.Result{RequeueAfter: hr.Spec.Interval.Duration}, reconcileErr
}

type HelmReleaseReconcilerOptions struct {
Expand Down Expand Up @@ -295,7 +275,7 @@ func (r *HelmReleaseReconciler) reconcileChart(ctx context.Context, hr *v2.HelmR
return &helmChart, true, nil
}

func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, source sourcev1.Source, values chartutil.Values, hasNewGeneration bool) (v2.HelmRelease, error) {
func (r *HelmReleaseReconciler) release(ctx context.Context, log logr.Logger, hr v2.HelmRelease, source sourcev1.Source, values chartutil.Values) (v2.HelmRelease, error) {
// Acquire lock
unlock, err := lock(fmt.Sprintf("%s-%s", hr.GetName(), hr.GetNamespace()))
if err != nil {
Expand Down Expand Up @@ -342,20 +322,29 @@ func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, sour
hr, hasNewState := v2.HelmReleaseAttempted(hr, revision, releaseRevision, valuesChecksum)
if hasNewState {
hr = v2.HelmReleaseProgressing(hr)
if updateStatusErr := r.updateStatus(ctx, &hr); updateStatusErr != nil {
log.Error(updateStatusErr, "unable to update status after state update")
return hr, updateStatusErr
}
}

// Determine release deployment action.
var deployAction v2.DeploymentAction
switch {
// Install if there is none.
// Install if there is no release.
case rel == nil:
deployAction = hr.Spec.GetInstall()
// Upgrade if there is a new generation, new state, or this is an upgrade retry.
case hasNewGeneration || hasNewState || hr.Spec.GetUpgrade().GetRemediation().GetFailureCount(hr) > 0:
deployAction = hr.Spec.GetUpgrade()
// Otherwise no action needed.
// Fail if the release was due to a failed install (which was not uninstalled).
// The uninstall may have failed, or was not needed due to retries being exhausted
// and remediateLastFailure being false.
case hr.Spec.GetInstall().GetRemediation().GetFailureCount(hr) > 0:
return hr, fmt.Errorf("last install failed but was not uninstalled")
// Skip and mark ready if the observed state was already reconciled.
case hr.Status.ObservedStateReconciled:
return v2.HelmReleaseReady(hr), nil
// Otherwise upgrade.
default:
return hr, nil
deployAction = hr.Spec.GetUpgrade()
}

// Check if retries exhausted.
Expand Down Expand Up @@ -405,17 +394,18 @@ func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, sour
err = uninstallConditionErr
}
}
}
}

// Determine release revision after deployment/remediation.
rel, observeLastReleaseErr = observeLastRelease(cfg, hr)
if observeLastReleaseErr != nil {
err = &ConditionError{
Reason: v2.GetLastReleaseFailedReason,
Err: errors.New("failed to get last release revision after deployment/remediation"),
// Determine release after remediation.
rel, observeLastReleaseErr = observeLastRelease(cfg, hr)
if observeLastReleaseErr != nil {
err = &ConditionError{
Reason: v2.GetLastReleaseFailedReason,
Err: errors.New("failed to get last release revision after remediation"),
}
}
}
}

hr.Status.LastReleaseRevision = getReleaseRevision(rel)

if err != nil {
Expand All @@ -426,7 +416,12 @@ func (r *HelmReleaseReconciler) release(log logr.Logger, hr v2.HelmRelease, sour
}
return v2.HelmReleaseNotReady(hr, reason, err.Error()), err
}
return v2.HelmReleaseReady(hr, v2.ReconciliationSucceededReason, "release reconciliation succeeded"), nil
return v2.HelmReleaseReady(hr), nil
}

// updateStatus stamps Status.LastObservedTime on the given HelmRelease with
// the current time and then submits it through r.Status().Update, returning
// whatever error that call yields.
func (r *HelmReleaseReconciler) updateStatus(ctx context.Context, hr *v2.HelmRelease) error {
	// Record the observation time before the status is sent out so that it
	// is part of the same update.
	hr.Status.LastObservedTime = v1.Now()

	statusWriter := r.Status()
	return statusWriter.Update(ctx, hr)
}

func (r *HelmReleaseReconciler) checkDependencies(hr v2.HelmRelease) error {
Expand Down
Loading