oadp-1.3: OADP-4265 Mark InProgress backup/restore as failed upon requeuing (openshift#315)

* Mark InProgress backup/restore as failed upon requeuing

Signed-off-by: Tiger Kaovilai <[email protected]>

remove uuid, return err to requeue instead of requeue: true

Signed-off-by: Tiger Kaovilai <[email protected]>

* cleanup to minimize diff from upstream

Signed-off-by: Scott Seago <[email protected]>

* error message update

Signed-off-by: Scott Seago <[email protected]>

* requeue on finalize status update.

Unlike the InProgress transition, there's no need to fail here,
since the Finalize steps can be repeated (see the sketch following
the backup_controller.go diff below).

* Only run patch once for all backup finalizer return scenarios

Signed-off-by: Scott Seago <[email protected]>

---------

Signed-off-by: Tiger Kaovilai <[email protected]>
Signed-off-by: Scott Seago <[email protected]>
Co-authored-by: Scott Seago <[email protected]>
kaovilai and sseago authored Jun 18, 2024
1 parent e32cf00 commit 4b5cf07
Showing 4 changed files with 55 additions and 12 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/7863-kaovilai
@@ -0,0 +1 @@
Requeue backup/restore in their respective controllers to unstick InProgress status without requiring a velero pod restart.
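
For context on the "return err to requeue instead of requeue: true" note in the commit message: in controller-runtime, returning ctrl.Result{Requeue: true} and returning a non-nil error both requeue the request, but the error path is additionally logged and retried with exponential backoff. A minimal sketch under that assumption (the reconciler type and patchStatus helper here are hypothetical, not code from this commit):

package controller

import (
	"context"

	ctrl "sigs.k8s.io/controller-runtime"
)

// reconciler and patchStatus are hypothetical stand-ins for this sketch.
type reconciler struct{}

func (r *reconciler) patchStatus(ctx context.Context) error { return nil }

// Reconcile illustrates why the commit returns the error rather than
// ctrl.Result{Requeue: true}: both cause a requeue, but a returned error
// is also logged and retried with exponential backoff.
func (r *reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	if err := r.patchStatus(ctx); err != nil {
		return ctrl.Result{}, err // requeue with backoff
	}
	return ctrl.Result{}, nil // done; no requeue
}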
17 changes: 17 additions & 0 deletions pkg/controller/backup_controller.go
@@ -233,6 +233,20 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
switch original.Status.Phase {
case "", velerov1api.BackupPhaseNew:
// only process new backups
case velerov1api.BackupPhaseInProgress:
// if the backup is already InProgress, a prior reconcile was interrupted
// before it could complete; backups cannot be resumed, so mark it as
// failed instead of leaving it stuck in progress
log.Debug("Backup has in progress status from prior reconcile, marking it as failed")
failedCopy := original.DeepCopy()
failedCopy.Status.Phase = velerov1api.BackupPhaseFailed
failedCopy.Status.FailureReason = "Backup from previous reconcile still in progress. The API Server may have been down."
if err := kubeutil.PatchResource(original, failedCopy, b.kbClient); err != nil {
// return the error so the status can be re-processed; it's currently still not completed or failed
return ctrl.Result{}, err
}
// the patch marking it as failed succeeded; do not requeue
return ctrl.Result{}, nil
default:
b.logger.WithFields(logrus.Fields{
"backup": kubeutil.NamespaceAndName(original),
@@ -309,7 +323,10 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
log.Info("Updating backup's final status")
if err := kubeutil.PatchResource(original, request.Backup, b.kbClient); err != nil {
log.WithError(err).Error("error updating backup's final status")
// return the error so the status can be re-processed; it's currently still not completed or failed
return ctrl.Result{}, err
}

return ctrl.Result{}, nil
}

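Together with the finalizer change below, the hunks above follow one rule from the commit message: an InProgress backup cannot be resumed, so it is failed terminally and not requeued, while Finalize steps can be repeated, so their failures are returned as errors and requeued. A minimal sketch of that rule (handlePhase is a hypothetical helper, not code from this commit):

package controller

import (
	ctrl "sigs.k8s.io/controller-runtime"

	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
)

// handlePhase is a hypothetical helper condensing the rule the commit
// applies across both controllers.
func handlePhase(backup *velerov1api.Backup, statusUpdateErr error) (ctrl.Result, error) {
	switch backup.Status.Phase {
	case velerov1api.BackupPhaseInProgress:
		// A prior reconcile died mid-backup; the work cannot be resumed,
		// so the backup is failed terminally and not requeued.
		backup.Status.Phase = velerov1api.BackupPhaseFailed
		return ctrl.Result{}, nil
	case velerov1api.BackupPhaseFinalizing:
		// Finalize steps can safely run again, so a failed status update
		// is returned as an error, which requeues the request.
		if statusUpdateErr != nil {
			return ctrl.Result{}, statusUpdateErr
		}
	}
	return ctrl.Result{}, nil
}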
28 changes: 21 additions & 7 deletions pkg/controller/backup_finalizer_controller.go
@@ -106,15 +106,21 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
}

original := backup.DeepCopy()
patched := false
patchErr := false
defer func() {
switch backup.Status.Phase {
case velerov1api.BackupPhaseCompleted, velerov1api.BackupPhasePartiallyFailed, velerov1api.BackupPhaseFailed, velerov1api.BackupPhaseFailedValidation:
r.backupTracker.Delete(backup.Namespace, backup.Name)
}
// Always attempt to Patch the backup object and status after each reconciliation.
if err := r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil {
log.WithError(err).Error("Error updating backup")
return
if !patched {
if err := r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil {
log.WithError(err).Error("Error updating backup")
return
}
}
if !patchErr {
switch backup.Status.Phase {
case velerov1api.BackupPhaseCompleted, velerov1api.BackupPhasePartiallyFailed, velerov1api.BackupPhaseFailed, velerov1api.BackupPhaseFailedValidation:
r.backupTracker.Delete(backup.Namespace, backup.Name)
}
}
}()

@@ -207,6 +213,14 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
return ctrl.Result{}, errors.Wrap(err, "error uploading backup final contents")
}
}
// Even though the deferred function above also patches, we patch explicitly
// here so that a failed update can be returned as an error and requeued.
// All prior return statements already requeue on error.
patched = true
if err = r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil {
log.WithError(err).Error("Error updating backup")
patchErr = true
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}

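A design note on the two flags introduced above: patched suppresses the now-redundant deferred patch once the explicit one has run, and patchErr keeps the backup in the tracker when its final status could not be stored, so the requeued reconcile can still find it. A condensed, self-contained sketch of the pattern (patch and untrack are hypothetical stand-ins for the client patch and backupTracker):

package controller

import (
	"context"
	"fmt"
)

// finalize sketches the patched/patchErr flag pattern: the deferred patch is
// a safety net for early returns, the explicit patch lets a failure surface
// as a returned error (and therefore a requeue), and tracker cleanup is
// skipped while the terminal status is still unstored.
func finalize(ctx context.Context, patch func(context.Context) error, untrack func()) error {
	patched := false
	patchErr := false
	defer func() {
		if !patched {
			if err := patch(ctx); err != nil {
				fmt.Println("error updating backup:", err)
				return
			}
		}
		if !patchErr {
			untrack() // the real code also requires a terminal phase here
		}
	}()

	// ... finalize steps; every early return is still patched by the defer ...

	patched = true
	if err := patch(ctx); err != nil {
		patchErr = true
		return err // controller-runtime requeues when an error is returned
	}
	return nil
}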
21 changes: 16 additions & 5 deletions pkg/controller/restore_controller.go
@@ -209,17 +209,28 @@ func (r *restoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
switch restore.Status.Phase {
case "", api.RestorePhaseNew:
// only process new restores
case api.RestorePhaseInProgress:
// if the restore is already InProgress, a prior reconcile was interrupted
// before it could complete; restores cannot be resumed, so mark it as
// failed instead of leaving it stuck in progress
log.Debug("Restore has in progress status from prior reconcile, marking it as failed")
failedCopy := restore.DeepCopy()
failedCopy.Status.Phase = api.RestorePhaseFailed
failedCopy.Status.FailureReason = "Restore from previous reconcile still in progress. The API Server may have been down."
if err := kubeutil.PatchResource(restore, failedCopy, r.kbClient); err != nil {
// return the error so the status can be re-processed; it's currently still not completed or failed
return ctrl.Result{}, err
}
// the patch marking it as failed succeeded; do not requeue
return ctrl.Result{}, nil
default:
r.logger.WithFields(logrus.Fields{
"restore": kubeutil.NamespaceAndName(restore),
"phase": restore.Status.Phase,
}).Debug("Restore is not handled")
return ctrl.Result{}, nil
}

// store a copy of the original restore for creating the patch
original := restore.DeepCopy()

// Validate the restore and fetch the backup
info, resourceModifiers := r.validateAndComplete(restore)

@@ -272,8 +283,8 @@ func (r *restoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct

if err = kubeutil.PatchResource(original, restore, r.kbClient); err != nil {
log.WithError(errors.WithStack(err)).Info("Error updating restore's final status")
// No need to re-enqueue here, because restore's already set to InProgress before.
// Controller only handle New restore.
// return the error so the status can be re-processed; it's currently still not completed or failed
return ctrl.Result{}, err
}

return ctrl.Result{}, nil
