Skip to content

Commit

Permalink
release failed targets with retry
Browse files Browse the repository at this point in the history
  • Loading branch information
ColdsteelRail committed Dec 18, 2024
1 parent d6e4b41 commit b039e92
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 11 deletions.
4 changes: 4 additions & 0 deletions pkg/controllers/operationjob/operationjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,10 @@ func (r *ReconcileOperationJob) Reconcile(ctx context.Context, req reconcile.Req
return reconcile.Result{}, err
}

if err := r.ensureFailedTargetsReleased(ctx, instance, candidates); err != nil {
return reconcile.Result{}, err
}

Check warning on line 143 in pkg/controllers/operationjob/operationjob_controller.go

View check run for this annotation

Codecov / codecov/patch

pkg/controllers/operationjob/operationjob_controller.go#L142-L143

Added lines #L142 - L143 were not covered by tests

err = r.doReconcile(ctx, instance, candidates)
return requeueResult(requeueAfter), err
}
Expand Down
31 changes: 20 additions & 11 deletions pkg/controllers/operationjob/operationjob_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,11 +245,10 @@ func (r *ReconcileOperationJob) getTargetsOpsStatus(
// ensureActiveDeadlineAndTTL calculate time to ActiveDeadlineSeconds and TTLSecondsAfterFinished and release targets
func (r *ReconcileOperationJob) ensureActiveDeadlineAndTTL(ctx context.Context, operationJob *appsv1alpha1.OperationJob, candidates []*OpsCandidate, logger logr.Logger) (bool, *time.Duration, error) {
if operationJob.Spec.ActiveDeadlineSeconds != nil {
var allowReleaseCandidates []*OpsCandidate
for i := range candidates {
candidate := candidates[i]
// just skip if target operation already released, or not started
if IsCandidateOpsReleased(candidate) || candidate.OpsStatus.StartTime == nil {
// just skip if target not started
if candidate.OpsStatus.StartTime == nil {
continue
}
leftTime := time.Duration(*operationJob.Spec.ActiveDeadlineSeconds)*time.Second - time.Since(candidate.OpsStatus.StartTime.Time)
Expand All @@ -258,17 +257,10 @@ func (r *ReconcileOperationJob) ensureActiveDeadlineAndTTL(ctx context.Context,
} else {
logger.Info("should end but still processing")
r.Recorder.Eventf(operationJob, corev1.EventTypeNormal, "Timeout", "Try to fail OperationJob for timeout...")
// mark operationjob and targets failed and release targets
// mark target failed if timeout
MarkCandidateFailed(candidate)
allowReleaseCandidates = append(allowReleaseCandidates, candidate)
}
}
if len(allowReleaseCandidates) > 0 {
releaseErr := r.releaseTargets(ctx, operationJob, allowReleaseCandidates, false)
operationJob.Status = r.calculateStatus(operationJob, candidates)
updateErr := r.updateStatus(ctx, operationJob)
return false, nil, controllerutils.AggregateErrors([]error{releaseErr, updateErr})
}
}

if operationJob.Spec.TTLSecondsAfterFinished != nil {
Expand All @@ -287,6 +279,23 @@ func (r *ReconcileOperationJob) ensureActiveDeadlineAndTTL(ctx context.Context,
return false, nil, nil
}

// ensureFailedTargetsReleased select failed but unreleased targets and call releaseTargets
func (r *ReconcileOperationJob) ensureFailedTargetsReleased(ctx context.Context, operationJob *appsv1alpha1.OperationJob, candidates []*OpsCandidate) error {
var allowReleaseCandidates []*OpsCandidate
for i := range candidates {
if IsCandidateOpsFailed(candidates[i]) && !IsCandidateOpsReleased(candidates[i]) {
allowReleaseCandidates = append(allowReleaseCandidates, candidates[i])
}
}
if len(allowReleaseCandidates) > 0 {
releaseErr := r.releaseTargets(ctx, operationJob, allowReleaseCandidates, false)
operationJob.Status = r.calculateStatus(operationJob, candidates)
updateErr := r.updateStatus(ctx, operationJob)
return controllerutils.AggregateErrors([]error{releaseErr, updateErr})
}
return nil
}

// releaseTargets try to release the targets from operation when the operationJob is deleted
func (r *ReconcileOperationJob) releaseTargets(ctx context.Context, operationJob *appsv1alpha1.OperationJob, candidates []*OpsCandidate, needUpdateStatus bool) error {
actionHandler, enablePodOpsLifecycle, err := r.getActionHandler(operationJob)
Expand Down
7 changes: 7 additions & 0 deletions pkg/controllers/operationjob/opscore/candidate.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,13 @@ func IsCandidateOpsReleased(candidate *OpsCandidate) bool {
return false
}

func IsCandidateOpsFailed(candidate *OpsCandidate) bool {
if candidate.OpsStatus == nil || candidate.OpsStatus.Progress == "" {
return false
}
return candidate.OpsStatus.Progress == appsv1alpha1.OperationProgressFailed
}

func IsCandidateServiceAvailable(candidate *OpsCandidate) bool {
if candidate.Pod == nil || candidate.Pod.Labels == nil {
return false
Expand Down

0 comments on commit b039e92

Please sign in to comment.