From 4b5cf0787f9d9601e106b138f667402040e72405 Mon Sep 17 00:00:00 2001 From: Tiger Kaovilai Date: Tue, 18 Jun 2024 09:58:19 -0400 Subject: [PATCH] oadp-1.3: OADP-4265 Mark InProgress backup/restore as failed upon requeuing (#315) * Mark InProgress backup/restore as failed upon requeuing Signed-off-by: Tiger Kaovilai remove uuid, return err to requeue instead of requeue: true Signed-off-by: Tiger Kaovilai * cleanup to minimize diff from upstream Signed-off-by: Scott Seago * error message update Signed-off-by: Scott Seago * requeue on finalize status update. Unlike the InProgress transition, there's no need to fail here, since the Finalize steps can be repeated. * Only run patch once for all backup finalizer return scenarios Signed-off-by: Scott Seago --------- Signed-off-by: Tiger Kaovilai Signed-off-by: Scott Seago Co-authored-by: Scott Seago --- changelogs/unreleased/7863-kaovilai | 1 + pkg/controller/backup_controller.go | 17 +++++++++++ pkg/controller/backup_finalizer_controller.go | 28 ++++++++++++++----- pkg/controller/restore_controller.go | 21 ++++++++++---- 4 files changed, 55 insertions(+), 12 deletions(-) create mode 100644 changelogs/unreleased/7863-kaovilai diff --git a/changelogs/unreleased/7863-kaovilai b/changelogs/unreleased/7863-kaovilai new file mode 100644 index 0000000000..af78704fbc --- /dev/null +++ b/changelogs/unreleased/7863-kaovilai @@ -0,0 +1 @@ +Requeue backup/restore in their respective controllers to unstuck InProgress status without requiring velero pod restart. \ No newline at end of file diff --git a/pkg/controller/backup_controller.go b/pkg/controller/backup_controller.go index 862ddb3c6e..b3be1d0a33 100644 --- a/pkg/controller/backup_controller.go +++ b/pkg/controller/backup_controller.go @@ -233,6 +233,20 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr switch original.Status.Phase { case "", velerov1api.BackupPhaseNew: // only process new backups + case velerov1api.BackupPhaseInProgress: + // if backup is in progress, we should not process it again + // we want to mark it as failed to avoid it being stuck in progress + // if so, mark it as failed, last loop did not successfully complete the backup + log.Debug("Backup has in progress status from prior reconcile, marking it as failed") + failedCopy := original.DeepCopy() + failedCopy.Status.Phase = velerov1api.BackupPhaseFailed + failedCopy.Status.FailureReason = "Backup from previous reconcile still in progress. The API Server may have been down." + if err := kubeutil.PatchResource(original, failedCopy, b.kbClient); err != nil { + // return the error so the status can be re-processed; it's currently still not completed or failed + return ctrl.Result{}, err + } + // patch to mark it as failed succeeded, do not requeue + return ctrl.Result{}, nil default: b.logger.WithFields(logrus.Fields{ "backup": kubeutil.NamespaceAndName(original), @@ -309,7 +323,10 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr log.Info("Updating backup's final status") if err := kubeutil.PatchResource(original, request.Backup, b.kbClient); err != nil { log.WithError(err).Error("error updating backup's final status") + // return the error so the status can be re-processed; it's currently still not completed or failed + return ctrl.Result{}, err } + return ctrl.Result{}, nil } diff --git a/pkg/controller/backup_finalizer_controller.go b/pkg/controller/backup_finalizer_controller.go index a27aa45215..1bacf5480c 100644 --- a/pkg/controller/backup_finalizer_controller.go +++ b/pkg/controller/backup_finalizer_controller.go @@ -106,15 +106,21 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ } original := backup.DeepCopy() + patched := false + patchErr := false defer func() { - switch backup.Status.Phase { - case velerov1api.BackupPhaseCompleted, velerov1api.BackupPhasePartiallyFailed, velerov1api.BackupPhaseFailed, velerov1api.BackupPhaseFailedValidation: - r.backupTracker.Delete(backup.Namespace, backup.Name) - } // Always attempt to Patch the backup object and status after each reconciliation. - if err := r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil { - log.WithError(err).Error("Error updating backup") - return + if !patched { + if err := r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil { + log.WithError(err).Error("Error updating backup") + return + } + } + if !patchErr { + switch backup.Status.Phase { + case velerov1api.BackupPhaseCompleted, velerov1api.BackupPhasePartiallyFailed, velerov1api.BackupPhaseFailed, velerov1api.BackupPhaseFailedValidation: + r.backupTracker.Delete(backup.Namespace, backup.Name) + } } }() @@ -207,6 +213,14 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, errors.Wrap(err, "error uploading backup final contents") } } + // Even though we already have patch in a defer call, we call it explicitly here so that + // if update fails, we can requeue. Prior return statements already requeue. + patched = true + if err = r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil { + log.WithError(err).Error("Error updating backup") + patchErr = true + return ctrl.Result{}, err + } return ctrl.Result{}, nil } diff --git a/pkg/controller/restore_controller.go b/pkg/controller/restore_controller.go index f6b9b39d96..d58c21cb3a 100644 --- a/pkg/controller/restore_controller.go +++ b/pkg/controller/restore_controller.go @@ -209,6 +209,20 @@ func (r *restoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct switch restore.Status.Phase { case "", api.RestorePhaseNew: // only process new restores + case api.RestorePhaseInProgress: + // if restore is in progress, we should not process it again + // we want to mark it as failed to avoid it being stuck in progress + // if so, mark it as failed, last loop did not successfully complete the restore + log.Debug("Restore has in progress status from prior reconcile, marking it as failed") + failedCopy := restore.DeepCopy() + failedCopy.Status.Phase = api.RestorePhaseFailed + failedCopy.Status.FailureReason = "Restore from previous reconcile still in progress. The API Server may have been down." + if err := kubeutil.PatchResource(restore, failedCopy, r.kbClient); err != nil { + // return the error so the status can be re-processed; it's currently still not completed or failed + return ctrl.Result{}, err + } + // patch to mark it as failed succeeded, do not requeue + return ctrl.Result{}, nil default: r.logger.WithFields(logrus.Fields{ "restore": kubeutil.NamespaceAndName(restore), @@ -216,10 +230,7 @@ func (r *restoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct }).Debug("Restore is not handled") return ctrl.Result{}, nil } - - // store a copy of the original restore for creating patch original := restore.DeepCopy() - // Validate the restore and fetch the backup info, resourceModifiers := r.validateAndComplete(restore) @@ -272,8 +283,8 @@ func (r *restoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct if err = kubeutil.PatchResource(original, restore, r.kbClient); err != nil { log.WithError(errors.WithStack(err)).Info("Error updating restore's final status") - // No need to re-enqueue here, because restore's already set to InProgress before. - // Controller only handle New restore. + // return the error so the status can be re-processed; it's currently still not completed or failed + return ctrl.Result{}, err } return ctrl.Result{}, nil