oadp-1.3: OADP-4265 Mark InProgress backup/restore as failed upon requeuing (#315)

* Mark InProgress backup/restore as failed upon requeuing

Signed-off-by: Tiger Kaovilai <[email protected]>

remove uuid, return err to requeue instead of requeue: true

Signed-off-by: Tiger Kaovilai <[email protected]>

* cleanup to minimize diff from upstream

Signed-off-by: Scott Seago <[email protected]>

* error message update

Signed-off-by: Scott Seago <[email protected]>

* requeue on finalize status update.

Unlike the InProgress transition, there's no need to fail here,
since the Finalize steps can be repeated.

* Only run patch once for all backup finalizer return scenarios

Signed-off-by: Scott Seago <[email protected]>

---------

Signed-off-by: Tiger Kaovilai <[email protected]>
Signed-off-by: Scott Seago <[email protected]>
Co-authored-by: Scott Seago <[email protected]>
kaovilai and sseago authored Jun 18, 2024
1 parent e32cf00 commit 4b5cf07
Showing 4 changed files with 55 additions and 12 deletions.
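The commit message's "return err to requeue instead of requeue: true" refers to controller-runtime's two retry styles. A minimal sketch for context (exampleReconciler and doWork are illustrative stand-ins, not code from this diff):

package controller

import (
	"context"

	ctrl "sigs.k8s.io/controller-runtime"
)

type exampleReconciler struct{}

// doWork is a hypothetical stand-in for the body of a reconcile pass.
func (r *exampleReconciler) doWork(ctx context.Context, req ctrl.Request) error { return nil }

func (r *exampleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	if err := r.doWork(ctx, req); err != nil {
		// Returning the error requeues the request with backoff and records
		// the failure in the controller's logs and metrics.
		return ctrl.Result{}, err
	}
	// The style this commit moves away from: retry without recording an error.
	// return ctrl.Result{Requeue: true}, nil
	return ctrl.Result{}, nil
}

Returning the error keeps the failure visible, which matters here because the patch that records a terminal phase is itself what may have failed.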
1 change: 1 addition & 0 deletions changelogs/unreleased/7863-kaovilai
@@ -0,0 +1 @@
Requeue backup/restore in their respective controllers to unstick an InProgress status without requiring a velero pod restart.
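The entry above covers two recovery paths that the diffs below treat differently: a backup or restore found InProgress on requeue is failed outright, because the interrupted work cannot safely be repeated, while a failure to patch status during finalize is simply requeued. A schematic contrast, with hypothetical closures standing in for the patch calls:

package controller

import (
	ctrl "sigs.k8s.io/controller-runtime"
)

// onStaleInProgress: the previous reconcile died mid-operation, so the object
// is marked Failed. markFailed is a hypothetical closure over the patch call.
func onStaleInProgress(markFailed func() error) (ctrl.Result, error) {
	if err := markFailed(); err != nil {
		return ctrl.Result{}, err // patch failed; requeue until the phase is recorded
	}
	return ctrl.Result{}, nil // terminal phase recorded; do not requeue
}

// onFinalizePatchFailure: finalize steps can be repeated safely, so a failed
// status patch returns the error and lets the workqueue retry the whole pass.
func onFinalizePatchFailure(patchStatus func() error) (ctrl.Result, error) {
	if err := patchStatus(); err != nil {
		return ctrl.Result{}, err
	}
	return ctrl.Result{}, nil
}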
17 changes: 17 additions & 0 deletions pkg/controller/backup_controller.go
@@ -233,6 +233,20 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
switch original.Status.Phase {
case "", velerov1api.BackupPhaseNew:
// only process new backups
case velerov1api.BackupPhaseInProgress:
// a backup still InProgress at the start of a reconcile means the previous
// reconcile did not complete it; mark it as Failed rather than processing
// it again or leaving it stuck InProgress
log.Debug("Backup has in progress status from prior reconcile, marking it as failed")
failedCopy := original.DeepCopy()
failedCopy.Status.Phase = velerov1api.BackupPhaseFailed
failedCopy.Status.FailureReason = "Backup from previous reconcile still in progress. The API Server may have been down."
if err := kubeutil.PatchResource(original, failedCopy, b.kbClient); err != nil {
// return the error so the status can be re-processed; it's currently still not completed or failed
return ctrl.Result{}, err
}
// the patch marking it as Failed succeeded; do not requeue
return ctrl.Result{}, nil
default:
b.logger.WithFields(logrus.Fields{
"backup": kubeutil.NamespaceAndName(original),
@@ -309,7 +323,10 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
log.Info("Updating backup's final status")
if err := kubeutil.PatchResource(original, request.Backup, b.kbClient); err != nil {
log.WithError(err).Error("error updating backup's final status")
// return the error so the status can be re-processed; it's currently still not completed or failed
return ctrl.Result{}, err
}

return ctrl.Result{}, nil
}
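
For context, kubeutil.PatchResource is Velero's helper for submitting the phase change. A hedged sketch of its shape, inferred from the call sites above rather than copied from pkg/util/kube:

package kube

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/client"
)

// PatchResource sends only the fields that differ between original and
// updated (here status.phase and status.failureReason) to the API server,
// assuming the helper wraps a client-side merge patch.
func PatchResource(original, updated client.Object, kbClient client.Client) error {
	return kbClient.Patch(context.Background(), updated, client.MergeFrom(original))
}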

28 changes: 21 additions & 7 deletions pkg/controller/backup_finalizer_controller.go
@@ -106,15 +106,21 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
}

original := backup.DeepCopy()
patched := false
patchErr := false
defer func() {
switch backup.Status.Phase {
case velerov1api.BackupPhaseCompleted, velerov1api.BackupPhasePartiallyFailed, velerov1api.BackupPhaseFailed, velerov1api.BackupPhaseFailedValidation:
r.backupTracker.Delete(backup.Namespace, backup.Name)
}
// Always attempt to Patch the backup object and status after each reconciliation.
if err := r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil {
log.WithError(err).Error("Error updating backup")
return
if !patched {
if err := r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil {
log.WithError(err).Error("Error updating backup")
return
}
}
if !patchErr {
switch backup.Status.Phase {
case velerov1api.BackupPhaseCompleted, velerov1api.BackupPhasePartiallyFailed, velerov1api.BackupPhaseFailed, velerov1api.BackupPhaseFailedValidation:
r.backupTracker.Delete(backup.Namespace, backup.Name)
}
}
}()

@@ -207,6 +213,14 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
return ctrl.Result{}, errors.Wrap(err, "error uploading backup final contents")
}
}
// Even though the deferred call above also patches, we patch explicitly here
// so that a failed update returns an error and requeues. Prior return
// statements already requeue.
patched = true
if err = r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil {
log.WithError(err).Error("Error updating backup")
patchErr = true
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
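
Condensed, the finalizer's new patch-once flow is sketched below (the patched flag and client calls follow the diff; the backupTracker handling and the finalize work itself are elided, and the helper signature is assumed):

package controller

import (
	"context"

	"github.com/sirupsen/logrus"
	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
	ctrl "sigs.k8s.io/controller-runtime"
	kbclient "sigs.k8s.io/controller-runtime/pkg/client"
)

// finalizeSketch shows why the defer now checks a patched flag: the deferred
// patch is a safety net for early returns, while the explicit patch on the
// success path can fail the reconcile and trigger a requeue.
func finalizeSketch(ctx context.Context, c kbclient.Client, log logrus.FieldLogger,
	backup, original *velerov1api.Backup) (ctrl.Result, error) {
	patched := false
	defer func() {
		if patched {
			return // the explicit patch below already ran
		}
		if err := c.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil {
			log.WithError(err).Error("Error updating backup")
		}
	}()

	// ... finalize work; early error returns rely on the deferred patch ...

	patched = true // the deferred patch must not run a second time
	if err := c.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil {
		return ctrl.Result{}, err // patch failed on the success path: requeue
	}
	return ctrl.Result{}, nil
}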

21 changes: 16 additions & 5 deletions pkg/controller/restore_controller.go
@@ -209,17 +209,28 @@ func (r *restoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)
switch restore.Status.Phase {
case "", api.RestorePhaseNew:
// only process new restores
case api.RestorePhaseInProgress:
// a restore still InProgress at the start of a reconcile means the previous
// reconcile did not complete it; mark it as Failed rather than processing
// it again or leaving it stuck InProgress
log.Debug("Restore has in progress status from prior reconcile, marking it as failed")
failedCopy := restore.DeepCopy()
failedCopy.Status.Phase = api.RestorePhaseFailed
failedCopy.Status.FailureReason = "Restore from previous reconcile still in progress. The API Server may have been down."
if err := kubeutil.PatchResource(restore, failedCopy, r.kbClient); err != nil {
// return the error so the status can be re-processed; it's currently still not completed or failed
return ctrl.Result{}, err
}
// the patch marking it as Failed succeeded; do not requeue
return ctrl.Result{}, nil
default:
r.logger.WithFields(logrus.Fields{
"restore": kubeutil.NamespaceAndName(restore),
"phase": restore.Status.Phase,
}).Debug("Restore is not handled")
return ctrl.Result{}, nil
}

// store a copy of the original restore for creating patch
original := restore.DeepCopy()

// Validate the restore and fetch the backup
info, resourceModifiers := r.validateAndComplete(restore)

@@ -272,8 +283,8 @@ func (r *restoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

if err = kubeutil.PatchResource(original, restore, r.kbClient); err != nil {
log.WithError(errors.WithStack(err)).Info("Error updating restore's final status")
// No need to re-enqueue here, because restore's already set to InProgress before.
// Controller only handle New restore.
// return the error so the status can be re-processed; it's currently still not completed or failed
return ctrl.Result{}, err
}

return ctrl.Result{}, nil
