From 208737113e48a2f93bcfa717973cd970a892a5a6 Mon Sep 17 00:00:00 2001 From: Xinru Zhang Date: Thu, 8 Dec 2022 11:44:36 -0500 Subject: [PATCH] TEP-0121: Reconciler Implementation Prior to this commit, `retries` logic for TaskRun is handled by Tekton `PipelineRun` reconciler. This commit delegates the retries implementation for `TaskRun` to the `TaskRun` reconciler. The major change is we stopped relying on `len(retriesStatus)` to decide if a target `TaskRun` failed or not. Instead, we use status `ConditionSuceeded` to gate the completion of a `TaskRun`. Even a `TaskRun` failed on one execution, as long as it has remaining retries, the TaskRun won't be stored in etcd with its status set as `Failed`. Instead, the status will be: ```yaml Type: Succeeded Status: Unknown Reason: ToBeRetried ``` --- docs/taskruns.md | 21 +- pkg/pod/status.go | 10 + pkg/pod/status_test.go | 45 +++ pkg/reconciler/pipelinerun/pipelinerun.go | 31 +- .../pipelinerun/pipelinerun_test.go | 124 ++----- .../resources/pipelinerunresolution.go | 48 +-- .../resources/pipelinerunresolution_test.go | 161 +++------ .../pipelinerun/resources/pipelinerunstate.go | 24 +- .../resources/pipelinerunstate_test.go | 209 ++++++----- pkg/reconciler/taskrun/taskrun.go | 20 +- pkg/reconciler/taskrun/taskrun_test.go | 338 ++++++++++++++++++ 11 files changed, 648 insertions(+), 383 deletions(-) diff --git a/docs/taskruns.md b/docs/taskruns.md index fda6d28e11b..e76175710d4 100644 --- a/docs/taskruns.md +++ b/docs/taskruns.md @@ -26,6 +26,7 @@ weight: 300 - [Specifying `Sidecars`](#specifying-sidecars) - [Overriding `Task` `Steps` and `Sidecars`](#overriding-task-steps-and-sidecars) - [Specifying `LimitRange` values](#specifying-limitrange-values) + - [Specifying `Retries`](#specifying-retries) - [Configuring the failure timeout](#configuring-the-failure-timeout) - [Specifying `ServiceAccount` credentials](#specifying-serviceaccount-credentials) - [Monitoring execution status](#monitoring-execution-status) @@ -698,11 +699,25 @@ object(s), if present. Any `Request` or `Limit` specified by the user (on `Task` For more information, see the [`LimitRange` support in Pipeline](./compute-resources.md#limitrange-support). +### Specifying `Retries` +You can use the `retries` field to set how many times you want to retry on a failed TaskRun. +All TaskRun failures are retriable except for `Cancellation`. + +For a retriable `TaskRun`, when an error occurs: +- The error status is archived in `status.RetriesStatus` +- The `Succeeded` condition in `status` is updated: +``` +Type: Succeeded +Status: Unknown +Reason: ToBeRetried +``` +- `status.StartTime` and `status.PodName` are unset to trigger another retry attempt. + ### Configuring the failure timeout -You can use the `timeout` field to set the `TaskRun's` desired timeout value. If you do not specify this -value for the `TaskRun`, the global default timeout value applies. If you set the timeout to 0, the `TaskRun` will -have no timeout and will run until it completes successfully or fails from an error. +You can use the `timeout` field to set the `TaskRun's` desired timeout value for **each retry attempt**. If you do +not specify this value, the global default timeout value applies (the same, to `each retry attempt`). If you set the timeout to 0, +the `TaskRun` will have no timeout and will run until it completes successfully or fails from an error. The global default timeout is set to 60 minutes when you first install Tekton. You can set a different global default timeout value using the `default-timeout-minutes` field in diff --git a/pkg/pod/status.go b/pkg/pod/status.go index 8646c3b2395..ca29686ddf0 100644 --- a/pkg/pod/status.go +++ b/pkg/pod/status.go @@ -368,6 +368,16 @@ func DidTaskRunFail(pod *corev1.Pod) bool { return f } +// IsPodArchived indicates if a pod is archived in the retriesStatus. +func IsPodArchived(pod *corev1.Pod, trs *v1beta1.TaskRunStatus) bool { + for _, retryStatus := range trs.RetriesStatus { + if retryStatus.PodName == pod.GetName() { + return true + } + } + return false +} + func areStepsComplete(pod *corev1.Pod) bool { stepsComplete := len(pod.Status.ContainerStatuses) > 0 && pod.Status.Phase == corev1.PodRunning for _, s := range pod.Status.ContainerStatuses { diff --git a/pkg/pod/status_test.go b/pkg/pod/status_test.go index ade28904617..55eb43a86d0 100644 --- a/pkg/pod/status_test.go +++ b/pkg/pod/status_test.go @@ -1634,6 +1634,51 @@ func TestMarkStatusSuccess(t *testing.T) { } } +func TestIsPodArchived(t *testing.T) { + for _, tc := range []struct { + name string + podName string + retriesStatus []v1beta1.TaskRunStatus + want bool + }{{ + name: "Pod is not in the empty retriesStatus", + podName: "pod", + retriesStatus: []v1beta1.TaskRunStatus{}, + want: false, + }, { + name: "Pod is not in the non-empty retriesStatus", + podName: "pod-retry1", + retriesStatus: []v1beta1.TaskRunStatus{{ + TaskRunStatusFields: v1beta1.TaskRunStatusFields{ + PodName: "pod", + }, + }}, + want: false, + }, { + name: "Pod is in the retriesStatus", + podName: "pod", + retriesStatus: []v1beta1.TaskRunStatus{{ + TaskRunStatusFields: v1beta1.TaskRunStatusFields{ + PodName: "pod", + }}, + }, + want: true, + }} { + t.Run(tc.name, func(t *testing.T) { + trs := v1beta1.TaskRunStatus{ + TaskRunStatusFields: v1beta1.TaskRunStatusFields{ + PodName: "pod", + RetriesStatus: tc.retriesStatus, + }, + } + got := IsPodArchived(&corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: tc.podName}}, &trs) + if tc.want != got { + t.Errorf("got: %v, want: %v", got, tc.want) + } + }) + } +} + func statusRunning() duckv1.Status { var trs v1beta1.TaskRunStatus markStatusRunning(&trs, v1beta1.TaskRunReasonRunning.String(), "Not all Steps in the Task have finished executing") diff --git a/pkg/reconciler/pipelinerun/pipelinerun.go b/pkg/reconciler/pipelinerun/pipelinerun.go index 971948d3f22..92d8751dfde 100644 --- a/pkg/reconciler/pipelinerun/pipelinerun.go +++ b/pkg/reconciler/pipelinerun/pipelinerun.go @@ -888,26 +888,10 @@ func (c *Reconciler) createTaskRuns(ctx context.Context, rpt *resources.Resolved func (c *Reconciler) createTaskRun(ctx context.Context, taskRunName string, params []v1beta1.Param, rpt *resources.ResolvedPipelineTask, pr *v1beta1.PipelineRun, storageBasePath string) (*v1beta1.TaskRun, error) { logger := logging.FromContext(ctx) - tr, _ := c.taskRunLister.TaskRuns(pr.Namespace).Get(taskRunName) - if tr != nil { - // retry should happen only when the taskrun has failed - if !tr.Status.GetCondition(apis.ConditionSucceeded).IsFalse() { - return tr, nil - } - // Don't modify the lister cache's copy. - tr = tr.DeepCopy() - // is a retry - addRetryHistory(tr) - clearStatus(tr) - tr.Status.MarkResourceOngoing("", "") - logger.Infof("Updating taskrun %s with cleared status and retry history (length: %d).", tr.GetName(), len(tr.Status.RetriesStatus)) - return c.PipelineClientSet.TektonV1beta1().TaskRuns(pr.Namespace).UpdateStatus(ctx, tr, metav1.UpdateOptions{}) - } - rpt.PipelineTask = resources.ApplyPipelineTaskContexts(rpt.PipelineTask) taskRunSpec := pr.GetTaskRunSpec(rpt.PipelineTask.Name) params = append(params, rpt.PipelineTask.Params...) - tr = &v1beta1.TaskRun{ + tr := &v1beta1.TaskRun{ ObjectMeta: metav1.ObjectMeta{ Name: taskRunName, Namespace: pr.Namespace, @@ -916,6 +900,7 @@ func (c *Reconciler) createTaskRun(ctx context.Context, taskRunName string, para Annotations: combineTaskRunAndTaskSpecAnnotations(pr, rpt.PipelineTask), }, Spec: v1beta1.TaskRunSpec{ + Retries: rpt.PipelineTask.Retries, Params: params, ServiceAccountName: taskRunSpec.TaskServiceAccountName, PodTemplate: taskRunSpec.TaskPodTemplate, @@ -1169,18 +1154,6 @@ func combinedSubPath(workspaceSubPath string, pipelineTaskSubPath string) string return filepath.Join(workspaceSubPath, pipelineTaskSubPath) } -func addRetryHistory(tr *v1beta1.TaskRun) { - newStatus := *tr.Status.DeepCopy() - newStatus.RetriesStatus = nil - tr.Status.RetriesStatus = append(tr.Status.RetriesStatus, newStatus) -} - -func clearStatus(tr *v1beta1.TaskRun) { - tr.Status.StartTime = nil - tr.Status.CompletionTime = nil - tr.Status.PodName = "" -} - func getTaskrunAnnotations(pr *v1beta1.PipelineRun) map[string]string { // Propagate annotations from PipelineRun to TaskRun. annotations := make(map[string]string, len(pr.ObjectMeta.Annotations)+1) diff --git a/pkg/reconciler/pipelinerun/pipelinerun_test.go b/pkg/reconciler/pipelinerun/pipelinerun_test.go index b7e760c285a..434a8823ae7 100644 --- a/pkg/reconciler/pipelinerun/pipelinerun_test.go +++ b/pkg/reconciler/pipelinerun/pipelinerun_test.go @@ -494,6 +494,7 @@ spec: value: test-pipeline - name: contextRetriesParam value: "5" + retries: 5 resources: inputs: - name: workspace @@ -3527,103 +3528,6 @@ spec: } } -// TestReconcileWithTimeoutAndRetry runs "Reconcile" against pipelines with -// retries and timeout settings, and status that represents different number of -// retries already performed. It verifies the reconciled status and events -// generated -func TestReconcileWithTimeoutAndRetry(t *testing.T) { - for _, tc := range []struct { - name string - retries int - conditionSucceeded corev1.ConditionStatus - wantEvents []string - }{{ - name: "One try has to be done", - retries: 1, - conditionSucceeded: corev1.ConditionFalse, - wantEvents: []string{ - "Warning Failed PipelineRun \"test-pipeline-retry-run-with-timeout\" failed to finish within", - }, - }, { - name: "No more retries are needed", - retries: 2, - conditionSucceeded: corev1.ConditionUnknown, - wantEvents: []string{ - "Warning Failed PipelineRun \"test-pipeline-retry-run-with-timeout\" failed to finish within", - }, - }} { - t.Run(tc.name, func(t *testing.T) { - ps := []*v1beta1.Pipeline{parse.MustParseV1beta1Pipeline(t, fmt.Sprintf(` -metadata: - name: test-pipeline-retry - namespace: foo -spec: - tasks: - - name: hello-world-1 - retries: %d - taskRef: - name: hello-world -`, tc.retries))} - prs := []*v1beta1.PipelineRun{parse.MustParseV1beta1PipelineRun(t, ` -metadata: - name: test-pipeline-retry-run-with-timeout - namespace: foo -spec: - pipelineRef: - name: test-pipeline-retry - serviceAccountName: test-sa - timeout: 12h0m0s -status: - startTime: "2021-12-31T00:00:00Z" -`)} - - ts := []*v1beta1.Task{ - simpleHelloWorldTask, - } - trs := []*v1beta1.TaskRun{parse.MustParseV1beta1TaskRun(t, ` -metadata: - name: hello-world-1 - namespace: foo -status: - conditions: - - status: "False" - type: Succeeded - podName: my-pod-name - retriesStatus: - - conditions: - - status: "False" - type: Succeeded -`)} - - prtrs := &v1beta1.PipelineRunTaskRunStatus{ - PipelineTaskName: "hello-world-1", - Status: &trs[0].Status, - } - prs[0].Status.TaskRuns = make(map[string]*v1beta1.PipelineRunTaskRunStatus) - prs[0].Status.TaskRuns["hello-world-1"] = prtrs - - d := test.Data{ - PipelineRuns: prs, - Pipelines: ps, - Tasks: ts, - TaskRuns: trs, - } - prt := newPipelineRunTest(d, t) - defer prt.Cancel() - - reconciledRun, _ := prt.reconcileRun("foo", "test-pipeline-retry-run-with-timeout", []string{}, false) - - if len(reconciledRun.Status.TaskRuns["hello-world-1"].Status.RetriesStatus) != tc.retries { - t.Fatalf(" %d retries expected but got %d ", tc.retries, len(reconciledRun.Status.TaskRuns["hello-world-1"].Status.RetriesStatus)) - } - - if status := reconciledRun.Status.TaskRuns["hello-world-1"].Status.GetCondition(apis.ConditionSucceeded).Status; status != tc.conditionSucceeded { - t.Fatalf("Succeeded expected to be %s but is %s", tc.conditionSucceeded, status) - } - }) - } -} - // TestReconcileAndPropagateCustomPipelineTaskRunSpec tests that custom PipelineTaskRunSpec declared // in PipelineRun is propagated to created TaskRuns func TestReconcileAndPropagateCustomPipelineTaskRunSpec(t *testing.T) { @@ -9673,6 +9577,7 @@ spec: "pr", "p", "platforms-and-browsers", false), ` spec: + retries: 1 params: - name: platform value: linux @@ -9687,12 +9592,17 @@ status: conditions: - type: Succeeded status: "False" + retriesStatus: + - conditions: + - status: "False" + type: Succeeded `), mustParseTaskRunWithObjectMeta(t, taskRunObjectMeta("pr-platforms-and-browsers-1", "foo", "pr", "p", "platforms-and-browsers", false), ` spec: + retries: 1 params: - name: platform value: mac @@ -9807,6 +9717,7 @@ status: "pr", "p", "platforms-and-browsers", false), ` spec: + retries: 1 params: - name: platform value: linux @@ -9820,7 +9731,7 @@ spec: status: conditions: - type: Succeeded - status: "Unknown" + status: "False" retriesStatus: - conditions: - status: "False" @@ -9831,6 +9742,7 @@ status: "pr", "p", "platforms-and-browsers", false), ` spec: + retries: 1 params: - name: platform value: mac @@ -9855,6 +9767,7 @@ status: "pr", "p", "platforms-and-browsers", false), ` spec: + retries: 1 params: - name: platform value: linux @@ -9868,13 +9781,18 @@ spec: status: conditions: - type: Succeeded - status: "False" + status: "Unknown" + retriesStatus: + - conditions: + - status: "False" + type: Succeeded `), mustParseTaskRunWithObjectMeta(t, taskRunObjectMeta("pr-platforms-and-browsers-1", "foo", "pr", "p", "platforms-and-browsers", false), ` spec: + retries: 1 params: - name: platform value: mac @@ -9888,7 +9806,11 @@ spec: status: conditions: - type: Succeeded - status: "False" + status: "Unknown" + retriesStatus: + - conditions: + - status: "False" + type: Succeeded `), }, prs: []*v1beta1.PipelineRun{ @@ -9989,6 +9911,7 @@ status: "pr", "p", "platforms-and-browsers", false), ` spec: + retries: 1 params: - name: platform value: linux @@ -10013,6 +9936,7 @@ status: "pr", "p", "platforms-and-browsers", false), ` spec: + retries: 1 params: - name: platform value: mac diff --git a/pkg/reconciler/pipelinerun/resources/pipelinerunresolution.go b/pkg/reconciler/pipelinerun/resources/pipelinerunresolution.go index 8be3199a5e4..434b1e95f3e 100644 --- a/pkg/reconciler/pipelinerun/resources/pipelinerunresolution.go +++ b/pkg/reconciler/pipelinerun/resources/pipelinerunresolution.go @@ -157,7 +157,6 @@ func (t ResolvedPipelineTask) isFailure() bool { } var c *apis.Condition var isDone bool - switch { case t.IsCustomTask() && t.IsMatrixed(): if len(t.RunObjects) == 0 { @@ -167,7 +166,7 @@ func (t ResolvedPipelineTask) isFailure() bool { atLeastOneFailed := false for _, run := range t.RunObjects { isDone = isDone && run.IsDone() - runFailed := run.GetStatusCondition().GetCondition(apis.ConditionSucceeded).IsFalse() && !t.hasRemainingRetries() + runFailed := run.GetStatusCondition().GetCondition(apis.ConditionSucceeded).IsFalse() && !t.isRunRetriable() atLeastOneFailed = atLeastOneFailed || runFailed } return atLeastOneFailed && isDone @@ -177,6 +176,7 @@ func (t ResolvedPipelineTask) isFailure() bool { } c = t.RunObject.GetStatusCondition().GetCondition(apis.ConditionSucceeded) isDone = t.RunObject.IsDone() + return isDone && c.IsFalse() && !t.isRunRetriable() case t.IsMatrixed(): if len(t.TaskRuns) == 0 { return false @@ -185,7 +185,7 @@ func (t ResolvedPipelineTask) isFailure() bool { atLeastOneFailed := false for _, taskRun := range t.TaskRuns { isDone = isDone && taskRun.IsDone() - taskRunFailed := taskRun.Status.GetCondition(apis.ConditionSucceeded).IsFalse() && !t.hasRemainingRetries() + taskRunFailed := taskRun.Status.GetCondition(apis.ConditionSucceeded).IsFalse() atLeastOneFailed = atLeastOneFailed || taskRunFailed } return atLeastOneFailed && isDone @@ -195,14 +195,15 @@ func (t ResolvedPipelineTask) isFailure() bool { } c = t.TaskRun.Status.GetCondition(apis.ConditionSucceeded) isDone = t.TaskRun.IsDone() + return isDone && c.IsFalse() } - return isDone && c.IsFalse() && !t.hasRemainingRetries() } -// hasRemainingRetries returns true only when the number of retries already attempted -// is less than the number of retries allowed. -func (t ResolvedPipelineTask) hasRemainingRetries() bool { - var retriesDone int +// isRunRetriable returns true only when the number of retries already attempted +// is less than the number of retries allowed for v1alpha1.Run. +// +// This should be removed once v1alpha1.Run is fully deprecated. +func (t ResolvedPipelineTask) isRunRetriable() bool { switch { case t.IsCustomTask() && t.IsMatrixed(): if len(t.RunObjects) == 0 { @@ -210,36 +211,19 @@ func (t ResolvedPipelineTask) hasRemainingRetries() bool { } // has remaining retries when any Run has a remaining retry for _, run := range t.RunObjects { - retriesDone = run.GetRetryCount() - if retriesDone < t.PipelineTask.Retries { - return true + if _, ok := run.(*v1alpha1.Run); ok { + if run.GetRetryCount() < t.PipelineTask.Retries { + return true + } } } return false case t.IsCustomTask(): - if t.RunObject == nil { - return true - } - retriesDone = t.RunObject.GetRetryCount() - case t.IsMatrixed(): - if len(t.TaskRuns) == 0 { - return true + if fmt.Sprintf("%T", t.RunObject) == "*v1alpha1.Run" { + return t.RunObject.GetRetryCount() < t.PipelineTask.Retries } - // has remaining retries when any TaskRun has a remaining retry - for _, taskRun := range t.TaskRuns { - retriesDone = len(taskRun.Status.RetriesStatus) - if retriesDone < t.PipelineTask.Retries { - return true - } - } - return false - default: - if t.TaskRun == nil { - return true - } - retriesDone = len(t.TaskRun.Status.RetriesStatus) } - return retriesDone < t.PipelineTask.Retries + return false } // isCancelledForTimeOut returns true only if the run is cancelled due to PipelineRun-controlled timeout diff --git a/pkg/reconciler/pipelinerun/resources/pipelinerunresolution_test.go b/pkg/reconciler/pipelinerun/resources/pipelinerunresolution_test.go index 49184ed1735..5d608a36e93 100644 --- a/pkg/reconciler/pipelinerun/resources/pipelinerunresolution_test.go +++ b/pkg/reconciler/pipelinerun/resources/pipelinerunresolution_test.go @@ -464,22 +464,6 @@ var finalScheduledState = PipelineRunState{{ }, }} -var retryableFinalState = PipelineRunState{{ - PipelineTask: &pts[0], - TaskRunName: "pipelinerun-mytask1", - TaskRun: makeSucceeded(trs[0]), - ResolvedTaskResources: &resources.ResolvedTaskResources{ - TaskSpec: &task.Spec, - }, -}, { - PipelineTask: &pts[3], - TaskRunName: "pipelinerun-mytask4", - TaskRun: makeFailed(trs[2]), - ResolvedTaskResources: &resources.ResolvedTaskResources{ - TaskSpec: &task.Spec, - }, -}} - var allFinishedState = PipelineRunState{{ PipelineTask: &pts[0], TaskRunName: "pipelinerun-mytask1", @@ -1289,13 +1273,6 @@ func TestIsFailure(t *testing.T) { RunObject: makeRunFailed(runs[0]), }, want: true, - }, { - name: "taskrun failed: retries remaining", - rpt: ResolvedPipelineTask{ - PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, - TaskRun: makeFailed(trs[0]), - }, - want: false, }, { name: "run failed: retries remaining", rpt: ResolvedPipelineTask{ @@ -1303,16 +1280,16 @@ func TestIsFailure(t *testing.T) { CustomTask: true, RunObject: makeRunFailed(runs[0]), }, - want: false, + want: true, }, { - name: "taskrun failed: no retries remaining", + name: "taskrun failed - Retried", rpt: ResolvedPipelineTask{ PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, TaskRun: withRetries(makeFailed(trs[0])), }, want: true, }, { - name: "run failed: no retries remaining", + name: "run failed - Retried", rpt: ResolvedPipelineTask{ PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, CustomTask: true, @@ -1380,14 +1357,14 @@ func TestIsFailure(t *testing.T) { }, want: true, }, { - name: "taskrun cancelled: no retries remaining", + name: "taskrun cancelled: retried", rpt: ResolvedPipelineTask{ PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, TaskRun: withCancelled(withRetries(makeFailed(trs[0]))), }, want: true, }, { - name: "run cancelled: no retries remaining", + name: "run cancelled: retried", rpt: ResolvedPipelineTask{ PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, RunObject: withRunCancelled(withRunRetries(makeRunFailed(runs[0]))), @@ -1490,12 +1467,12 @@ func TestIsFailure(t *testing.T) { }, want: false, }, { - name: "matrixed taskruns failed: retries remaining", + name: "matrixed taskruns failed: retried", rpt: ResolvedPipelineTask{ PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), - TaskRuns: []*v1beta1.TaskRun{makeFailed(trs[0]), makeFailed(trs[1])}, + TaskRuns: []*v1beta1.TaskRun{withRetries(makeFailed(trs[0])), withRetries(makeFailed(trs[1]))}, }, - want: false, + want: true, }, { name: "matrixed runs failed: retries remaining", rpt: ResolvedPipelineTask{ @@ -1503,14 +1480,7 @@ func TestIsFailure(t *testing.T) { PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), RunObjects: []v1beta1.RunObject{makeRunFailed(runs[0]), makeRunFailed(runs[1])}, }, - want: false, - }, { - name: "matrixed taskruns failed: one taskrun with retries remaining", - rpt: ResolvedPipelineTask{ - PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), - TaskRuns: []*v1beta1.TaskRun{makeFailed(trs[0]), withRetries(makeFailed(trs[1]))}, - }, - want: false, + want: true, }, { name: "matrixed runs failed: one run with retries remaining", rpt: ResolvedPipelineTask{ @@ -1518,16 +1488,9 @@ func TestIsFailure(t *testing.T) { PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), RunObjects: []v1beta1.RunObject{makeRunFailed(runs[0]), withRunRetries(makeRunFailed(runs[1]))}, }, - want: false, - }, { - name: "matrixed taskruns failed: no retries remaining", - rpt: ResolvedPipelineTask{ - PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), - TaskRuns: []*v1beta1.TaskRun{withRetries(makeFailed(trs[0])), withRetries(makeFailed(trs[1]))}, - }, want: true, }, { - name: "matrixed runs failed: no retries remaining", + name: "matrixed runs failed: retried", rpt: ResolvedPipelineTask{ CustomTask: true, PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), @@ -1625,14 +1588,14 @@ func TestIsFailure(t *testing.T) { }, want: false, }, { - name: "matrixed taskruns cancelled: no retries remaining", + name: "matrixed taskruns cancelled: retried", rpt: ResolvedPipelineTask{ PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), TaskRuns: []*v1beta1.TaskRun{withCancelled(withRetries(makeFailed(trs[0]))), withCancelled(withRetries(makeFailed(trs[1])))}, }, want: true, }, { - name: "matrixed runs cancelled: no retries remaining", + name: "matrixed runs cancelled: retried", rpt: ResolvedPipelineTask{ CustomTask: true, PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), @@ -1640,14 +1603,14 @@ func TestIsFailure(t *testing.T) { }, want: true, }, { - name: "one matrixed taskrun cancelled: no retries remaining, one matrixed taskrun running", + name: "one matrixed taskrun cancelled: retried, one matrixed taskrun running", rpt: ResolvedPipelineTask{ PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), TaskRuns: []*v1beta1.TaskRun{withCancelled(withRetries(makeFailed(trs[0]))), makeStarted(trs[1])}, }, want: false, }, { - name: "one matrixed run cancelled: no retries remaining, one matrixed run running", + name: "one matrixed run cancelled: retried, one matrixed run running", rpt: ResolvedPipelineTask{ CustomTask: true, PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), @@ -3922,10 +3885,10 @@ func TestIsSuccessful(t *testing.T) { }, want: false, }, { - name: "taskrun failed: retries remaining", + name: "taskrun failed: retried", rpt: ResolvedPipelineTask{ PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, - TaskRun: makeFailed(trs[0]), + TaskRun: withRetries(makeFailed(trs[0])), }, want: false, }, { @@ -3937,14 +3900,7 @@ func TestIsSuccessful(t *testing.T) { }, want: false, }, { - name: "taskrun failed: no retries remaining", - rpt: ResolvedPipelineTask{ - PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, - TaskRun: withRetries(makeFailed(trs[0])), - }, - want: false, - }, { - name: "run failed: no retries remaining", + name: "run failed: retried", rpt: ResolvedPipelineTask{ PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, CustomTask: true, @@ -3997,14 +3953,14 @@ func TestIsSuccessful(t *testing.T) { }, want: false, }, { - name: "taskrun cancelled: no retries remaining", + name: "taskrun cancelled: retried", rpt: ResolvedPipelineTask{ PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, TaskRun: withCancelled(withRetries(makeFailed(trs[0]))), }, want: false, }, { - name: "run cancelled: no retries remaining", + name: "run cancelled: retried", rpt: ResolvedPipelineTask{ PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, RunObject: withRunCancelled(withRunRetries(makeRunFailed(runs[0]))), @@ -4115,10 +4071,10 @@ func TestIsSuccessful(t *testing.T) { }, want: false, }, { - name: "matrixed taskruns failed: retries remaining", + name: "matrixed taskruns failed: retried", rpt: ResolvedPipelineTask{ PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), - TaskRuns: []*v1beta1.TaskRun{makeFailed(trs[0]), makeFailed(trs[1])}, + TaskRuns: []*v1beta1.TaskRun{withRetries(makeFailed(trs[0])), withRetries(makeFailed(trs[1]))}, }, want: false, }, { @@ -4129,13 +4085,6 @@ func TestIsSuccessful(t *testing.T) { RunObjects: []v1beta1.RunObject{makeRunFailed(runs[0]), makeRunFailed(runs[1])}, }, want: false, - }, { - name: "matrixed taskruns failed: one taskrun with retries remaining", - rpt: ResolvedPipelineTask{ - PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), - TaskRuns: []*v1beta1.TaskRun{makeFailed(trs[0]), withRetries(makeFailed(trs[1]))}, - }, - want: false, }, { name: "matrixed runs failed: one run with retries remaining", rpt: ResolvedPipelineTask{ @@ -4145,14 +4094,7 @@ func TestIsSuccessful(t *testing.T) { }, want: false, }, { - name: "matrixed taskruns failed: no retries remaining", - rpt: ResolvedPipelineTask{ - PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), - TaskRuns: []*v1beta1.TaskRun{withRetries(makeFailed(trs[0])), withRetries(makeFailed(trs[1]))}, - }, - want: false, - }, { - name: "matrixed runs failed: no retries remaining", + name: "matrixed runs failed: retried", rpt: ResolvedPipelineTask{ CustomTask: true, PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), @@ -4250,14 +4192,14 @@ func TestIsSuccessful(t *testing.T) { }, want: false, }, { - name: "matrixed taskruns cancelled: no retries remaining", + name: "matrixed taskruns cancelled: retried", rpt: ResolvedPipelineTask{ PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), TaskRuns: []*v1beta1.TaskRun{withCancelled(withRetries(makeFailed(trs[0]))), withCancelled(withRetries(makeFailed(trs[1])))}, }, want: false, }, { - name: "matrixed runs cancelled: no retries remaining", + name: "matrixed runs cancelled: retried", rpt: ResolvedPipelineTask{ CustomTask: true, PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), @@ -4265,14 +4207,14 @@ func TestIsSuccessful(t *testing.T) { }, want: false, }, { - name: "one matrixed taskrun cancelled: no retries remaining, one matrixed taskrun running", + name: "one matrixed taskrun cancelled: retried, one matrixed taskrun running", rpt: ResolvedPipelineTask{ PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), TaskRuns: []*v1beta1.TaskRun{withCancelled(withRetries(makeFailed(trs[0]))), makeStarted(trs[1])}, }, want: false, }, { - name: "one matrixed run cancelled: no retries remaining, one matrixed run running", + name: "one matrixed run cancelled: retried, one matrixed run running", rpt: ResolvedPipelineTask{ CustomTask: true, PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), @@ -4352,12 +4294,12 @@ func TestIsRunning(t *testing.T) { }, want: false, }, { - name: "taskrun failed: retries remaining", + name: "taskrun failed: retried", rpt: ResolvedPipelineTask{ PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, - TaskRun: makeFailed(trs[0]), + TaskRun: withRetries(makeFailed(trs[0])), }, - want: true, + want: false, }, { name: "run failed: retries remaining", rpt: ResolvedPipelineTask{ @@ -4365,16 +4307,9 @@ func TestIsRunning(t *testing.T) { CustomTask: true, RunObject: makeRunFailed(runs[0]), }, - want: true, - }, { - name: "taskrun failed: no retries remaining", - rpt: ResolvedPipelineTask{ - PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, - TaskRun: withRetries(makeFailed(trs[0])), - }, want: false, }, { - name: "run failed: no retries remaining", + name: "run failed: retried", rpt: ResolvedPipelineTask{ PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, CustomTask: true, @@ -4427,14 +4362,14 @@ func TestIsRunning(t *testing.T) { }, want: false, }, { - name: "taskrun cancelled: no retries remaining", + name: "taskrun cancelled: retried", rpt: ResolvedPipelineTask{ PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, TaskRun: withCancelled(withRetries(makeFailed(trs[0]))), }, want: false, }, { - name: "run cancelled: no retries remaining", + name: "run cancelled: retried", rpt: ResolvedPipelineTask{ PipelineTask: &v1beta1.PipelineTask{Name: "task", Retries: 1}, RunObject: withRunCancelled(withRunRetries(makeRunFailed(runs[0]))), @@ -4530,12 +4465,12 @@ func TestIsRunning(t *testing.T) { }, want: true, }, { - name: "matrixed taskruns failed: retries remaining", + name: "matrixed taskruns failed: retried", rpt: ResolvedPipelineTask{ PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), - TaskRuns: []*v1beta1.TaskRun{makeFailed(trs[0]), makeFailed(trs[1])}, + TaskRuns: []*v1beta1.TaskRun{withRetries(makeFailed(trs[0])), withRetries(makeFailed(trs[1]))}, }, - want: true, + want: false, }, { name: "matrixed runs failed: retries remaining", rpt: ResolvedPipelineTask{ @@ -4543,14 +4478,7 @@ func TestIsRunning(t *testing.T) { PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), RunObjects: []v1beta1.RunObject{makeRunFailed(runs[0]), makeRunFailed(runs[1])}, }, - want: true, - }, { - name: "matrixed taskruns failed: one taskrun with retries remaining", - rpt: ResolvedPipelineTask{ - PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), - TaskRuns: []*v1beta1.TaskRun{makeFailed(trs[0]), withRetries(makeFailed(trs[1]))}, - }, - want: true, + want: false, }, { name: "matrixed runs failed: one run with retries remaining", rpt: ResolvedPipelineTask{ @@ -4558,16 +4486,9 @@ func TestIsRunning(t *testing.T) { PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), RunObjects: []v1beta1.RunObject{makeRunFailed(runs[0]), withRunRetries(makeRunFailed(runs[1]))}, }, - want: true, - }, { - name: "matrixed taskruns failed: no retries remaining", - rpt: ResolvedPipelineTask{ - PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), - TaskRuns: []*v1beta1.TaskRun{withRetries(makeFailed(trs[0])), withRetries(makeFailed(trs[1]))}, - }, want: false, }, { - name: "matrixed runs failed: no retries remaining", + name: "matrixed runs failed: retried", rpt: ResolvedPipelineTask{ CustomTask: true, PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), @@ -4665,14 +4586,14 @@ func TestIsRunning(t *testing.T) { }, want: true, }, { - name: "matrixed taskruns cancelled: no retries remaining", + name: "matrixed taskruns cancelled: retried", rpt: ResolvedPipelineTask{ PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), TaskRuns: []*v1beta1.TaskRun{withCancelled(withRetries(makeFailed(trs[0]))), withCancelled(withRetries(makeFailed(trs[1])))}, }, want: false, }, { - name: "matrixed runs cancelled: no retries remaining", + name: "matrixed runs cancelled: retried", rpt: ResolvedPipelineTask{ CustomTask: true, PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), @@ -4680,14 +4601,14 @@ func TestIsRunning(t *testing.T) { }, want: false, }, { - name: "one matrixed taskrun cancelled: no retries remaining, one matrixed taskrun running", + name: "one matrixed taskrun cancelled: retried, one matrixed taskrun running", rpt: ResolvedPipelineTask{ PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), TaskRuns: []*v1beta1.TaskRun{withCancelled(withRetries(makeFailed(trs[0]))), makeStarted(trs[1])}, }, want: true, }, { - name: "one matrixed run cancelled: no retries remaining, one matrixed run running", + name: "one matrixed run cancelled: retried, one matrixed run running", rpt: ResolvedPipelineTask{ CustomTask: true, PipelineTask: withPipelineTaskRetries(*matrixedPipelineTask, 1), diff --git a/pkg/reconciler/pipelinerun/resources/pipelinerunstate.go b/pkg/reconciler/pipelinerun/resources/pipelinerunstate.go index 45473b4d2b9..c04dcf3dbb4 100644 --- a/pkg/reconciler/pipelinerun/resources/pipelinerunstate.go +++ b/pkg/reconciler/pipelinerun/resources/pipelinerunstate.go @@ -346,30 +346,22 @@ func (state PipelineRunState) getNextTasks(candidateTasks sets.String) []*Resolv } } } - tasks = append(tasks, state.getRetryableTasks(candidateTasks)...) + tasks = append(tasks, state.getRetriableRuns(candidateTasks)...) return tasks } -// getRetryableTasks returns a list of pipelinetasks which should be executed next when the pipelinerun is stopping, +// getRetriableRuns returns a list of pipelinetasks which should be executed next when the pipelinerun is stopping, // i.e. a list of failed pipelinetasks from candidateTasks which haven't exhausted their retries. Note that if a // pipelinetask is cancelled, the retries are not exhausted - they are not retryable. -func (state PipelineRunState) getRetryableTasks(candidateTasks sets.String) []*ResolvedPipelineTask { +// +// Note: for now, this function only detects if v1alpha1.Run is retriable. For v1beta1.CustomRun and TaskRuns, +// the retries implementation is hidden from PipelineRun reconciler. See TEP-0121 for details. +func (state PipelineRunState) getRetriableRuns(candidateTasks sets.String) []*ResolvedPipelineTask { var tasks []*ResolvedPipelineTask for _, t := range state { if _, ok := candidateTasks[t.PipelineTask.Name]; ok { var status *apis.Condition switch { - case t.TaskRun != nil: - status = t.TaskRun.Status.GetCondition(apis.ConditionSucceeded) - case len(t.TaskRuns) != 0: - isDone := true - for _, taskRun := range t.TaskRuns { - isDone = isDone && taskRun.IsDone() - c := taskRun.Status.GetCondition(apis.ConditionSucceeded) - if c.IsFalse() { - status = c - } - } case t.RunObject != nil: status = t.RunObject.GetStatusCondition().GetCondition(apis.ConditionSucceeded) case len(t.RunObjects) != 0: @@ -382,7 +374,7 @@ func (state PipelineRunState) getRetryableTasks(candidateTasks sets.String) []*R } } } - if status.IsFalse() && !t.isCancelled() && t.hasRemainingRetries() { + if status.IsFalse() && !t.isCancelled() && t.isRunRetriable() { tasks = append(tasks, t) } } @@ -449,7 +441,7 @@ func (facts *PipelineRunFacts) DAGExecutionQueue() (PipelineRunState, error) { } else { // when pipeline run is stopping normally or gracefully, do not schedule any new tasks and only // wait for all running tasks to complete (including exhausting retries) and report their status - tasks = facts.State.getRetryableTasks(candidateTasks) + tasks = facts.State.getRetriableRuns(candidateTasks) } return tasks, nil } diff --git a/pkg/reconciler/pipelinerun/resources/pipelinerunstate_test.go b/pkg/reconciler/pipelinerun/resources/pipelinerunstate_test.go index b5c8ff783d6..b1bce4c76cb 100644 --- a/pkg/reconciler/pipelinerun/resources/pipelinerunstate_test.go +++ b/pkg/reconciler/pipelinerun/resources/pipelinerunstate_test.go @@ -94,7 +94,7 @@ func TestPipelineRunFacts_CheckDAGTasksDoneDone(t *testing.T) { var taskExpectedState = PipelineRunState{{ PipelineTask: &pts[4], // 2 retries needed TaskRunName: "pipelinerun-mytask1", - TaskRun: withRetries(makeFailed(trs[0])), + TaskRun: withRetries(makeStarted(trs[0])), ResolvedTaskResources: &resources.ResolvedTaskResources{ TaskSpec: &task.Spec, }, @@ -130,15 +130,6 @@ func TestPipelineRunFacts_CheckDAGTasksDoneDone(t *testing.T) { RunObject: makeRunFailed(runs[0]), }} - var taskFailedWithRetries = PipelineRunState{{ - PipelineTask: &pts[4], // 2 retries needed - TaskRunName: "pipelinerun-mytask1", - TaskRun: makeFailed(trs[0]), - ResolvedTaskResources: &resources.ResolvedTaskResources{ - TaskSpec: &task.Spec, - }, - }} - var taskCancelledFailedWithRetries = PipelineRunState{{ PipelineTask: &pts[4], // 2 retries needed TaskRunName: "pipelinerun-mytask1", @@ -203,11 +194,6 @@ func TestPipelineRunFacts_CheckDAGTasksDoneDone(t *testing.T) { state: runFailedState, expected: true, ptExpected: []bool{true}, - }, { - name: "run-failed-with-retries", - state: taskFailedWithRetries, - expected: false, - ptExpected: []bool{false}, }, { name: "run-cancelled-failed-with-retries", state: taskCancelledFailedWithRetries, @@ -571,6 +557,24 @@ func TestGetNextTasks(t *testing.T) { } func TestGetNextTaskWithRetries(t *testing.T) { + var failedRun = parse.MustParseRun(t, ` +metadata: + name: custom-task + namespace: namespace +spec: + retries: 1 + params: + - name: param1 + value: value1 + ref: + apiVersion: example.dev/v0 + kind: Example +status: + conditions: + - type: Succeeded + status: "False" + reason: Timedout +`) var taskCancelledByStatusState = PipelineRunState{{ PipelineTask: &pts[4], // 2 retries needed TaskRunName: "pipelinerun-mytask1", @@ -616,15 +620,6 @@ func TestGetNextTaskWithRetries(t *testing.T) { }, }} - var taskExpectedState = PipelineRunState{{ - PipelineTask: &pts[4], // 2 retries needed - TaskRunName: "pipelinerun-mytask1", - TaskRun: withRetries(makeFailed(trs[0])), - ResolvedTaskResources: &resources.ResolvedTaskResources{ - TaskSpec: &task.Spec, - }, - }} - var runCancelledByStatusState = PipelineRunState{{ PipelineTask: &pts[4], // 2 retries needed RunObjectName: "pipelinerun-mytask1", @@ -678,7 +673,7 @@ func TestGetNextTaskWithRetries(t *testing.T) { var runExpectedState = PipelineRunState{{ PipelineTask: &pts[4], // 2 retries needed RunObjectName: "pipelinerun-mytask1", - RunObject: withRunRetries(makeRunFailed(runs[0])), + RunObject: failedRun, CustomTask: true, ResolvedTaskResources: &resources.ResolvedTaskResources{ TaskSpec: &task.Spec, @@ -730,15 +725,6 @@ func TestGetNextTaskWithRetries(t *testing.T) { }, }} - var taskExpectedStateMatrix = PipelineRunState{{ - PipelineTask: &pts[20], // 2 retries needed - TaskRunNames: []string{"pipelinerun-mytask1"}, - TaskRuns: []*v1beta1.TaskRun{withRetries(makeFailed(trs[0]))}, - ResolvedTaskResources: &resources.ResolvedTaskResources{ - TaskSpec: &task.Spec, - }, - }} - var runCancelledByStatusStateMatrix = PipelineRunState{{ PipelineTask: &pts[20], // 2 retries needed RunObjectNames: []string{"pipelinerun-mytask1"}, @@ -792,7 +778,7 @@ func TestGetNextTaskWithRetries(t *testing.T) { var runExpectedStateMatrix = PipelineRunState{{ PipelineTask: &pts[20], // 2 retries needed RunObjectNames: []string{"pipelinerun-mytask1"}, - RunObjects: []v1beta1.RunObject{withRunRetries(makeRunFailed(runs[0]))}, + RunObjects: []v1beta1.RunObject{failedRun}, CustomTask: true, ResolvedTaskResources: &resources.ResolvedTaskResources{ TaskSpec: &task.Spec, @@ -829,11 +815,6 @@ func TestGetNextTaskWithRetries(t *testing.T) { state: taskRetriedState, candidates: sets.NewString("mytask5"), expectedNext: []*ResolvedPipelineTask{}, - }, { - name: "tasks-retried-one-candidates", - state: taskExpectedState, - candidates: sets.NewString("mytask5"), - expectedNext: []*ResolvedPipelineTask{taskExpectedState[0]}, }, { name: "runs-cancelled-no-candidates", state: runCancelledByStatusState, @@ -889,11 +870,6 @@ func TestGetNextTaskWithRetries(t *testing.T) { state: taskRetriedStateMatrix, candidates: sets.NewString("mytask21"), expectedNext: []*ResolvedPipelineTask{}, - }, { - name: "tasks-retried-one-candidate-matrix", - state: taskExpectedStateMatrix, - candidates: sets.NewString("mytask21"), - expectedNext: []*ResolvedPipelineTask{taskExpectedStateMatrix[0]}, }, { name: "runs-cancelled-no-candidates-matrix", state: runCancelledByStatusStateMatrix, @@ -1011,7 +987,25 @@ func TestDAGExecutionQueue(t *testing.T) { TaskSpec: &task.Spec, }, } - failedRun := ResolvedPipelineTask{ + failedRun := parse.MustParseRun(t, ` +metadata: + name: task + namespace: namespace +spec: + retries: 1 + params: + - name: param1 + value: value1 + ref: + apiVersion: example.dev/v0 + kind: Example +status: + conditions: + - type: Succeeded + status: "False" + reason: Timedout +`) + failedCustomRun := ResolvedPipelineTask{ PipelineTask: &v1beta1.PipelineTask{ Name: "failedrun", TaskRef: &v1beta1.TaskRef{Name: "task"}, @@ -1020,18 +1014,6 @@ func TestDAGExecutionQueue(t *testing.T) { RunObject: makeRunFailed(runs[0]), CustomTask: true, } - failedTaskWithRetries := ResolvedPipelineTask{ - PipelineTask: &v1beta1.PipelineTask{ - Name: "failedtaskwithretries", - TaskRef: &v1beta1.TaskRef{Name: "task"}, - Retries: 1, - }, - TaskRunName: "failedtaskwithretries", - TaskRun: makeFailed(trs[0]), - ResolvedTaskResources: &resources.ResolvedTaskResources{ - TaskSpec: &task.Spec, - }, - } failedRunWithRetries := ResolvedPipelineTask{ PipelineTask: &v1beta1.PipelineTask{ Name: "failedrunwithretries", @@ -1039,7 +1021,7 @@ func TestDAGExecutionQueue(t *testing.T) { Retries: 1, }, RunObjectName: "failedrunwithretries", - RunObject: makeRunFailed(runs[0]), + RunObject: failedRun, CustomTask: true, } tcs := []struct { @@ -1053,7 +1035,7 @@ func TestDAGExecutionQueue(t *testing.T) { state: PipelineRunState{ &createdTask, &createdRun, &runningTask, &runningRun, &successfulTask, &successfulRun, - &failedTaskWithRetries, &failedRunWithRetries, + &failedRunWithRetries, }, }, { name: "gracefully cancelled", @@ -1061,7 +1043,7 @@ func TestDAGExecutionQueue(t *testing.T) { state: PipelineRunState{ &createdTask, &createdRun, &runningTask, &runningRun, &successfulTask, &successfulRun, - &failedTaskWithRetries, &failedRunWithRetries, + &failedRunWithRetries, }, }, { name: "gracefully stopped", @@ -1074,32 +1056,33 @@ func TestDAGExecutionQueue(t *testing.T) { specStatus: v1beta1.PipelineRunSpecStatusStoppedRunFinally, state: PipelineRunState{ &createdTask, &createdRun, &runningTask, &runningRun, &successfulTask, &successfulRun, - &failedTask, &failedRun, &failedTaskWithRetries, &failedRunWithRetries, + &failedTask, &failedCustomRun, &failedRunWithRetries, }, - want: PipelineRunState{&failedTaskWithRetries, &failedRunWithRetries}, + want: PipelineRunState{&failedRunWithRetries}, }, { name: "running", state: PipelineRunState{ &createdTask, &createdRun, &runningTask, &runningRun, - &failedTaskWithRetries, &failedRunWithRetries, &successfulTask, &successfulRun, + &successfulTask, &successfulRun, + &failedRunWithRetries, }, - want: PipelineRunState{&createdTask, &createdRun, &failedTaskWithRetries, &failedRunWithRetries}, + want: PipelineRunState{&createdTask, &createdRun, &failedRunWithRetries}, }, { name: "stopped", state: PipelineRunState{ &createdTask, &createdRun, &runningTask, &runningRun, - &successfulTask, &successfulRun, &failedTask, &failedRun, + &successfulTask, &successfulRun, &failedTask, &failedCustomRun, }, }, { name: "stopped with retryable tasks", state: PipelineRunState{ - &createdTask, &createdRun, &runningTask, &runningRun, &successfulTask, &successfulRun, - &failedTask, &failedRun, &failedTaskWithRetries, &failedRunWithRetries, + &createdTask, &createdRun, &runningTask, &runningRun, + &successfulTask, &successfulRun, &failedCustomRun, &failedRunWithRetries, }, - want: PipelineRunState{&failedTaskWithRetries, &failedRunWithRetries}, + want: PipelineRunState{&failedRunWithRetries}, }, { name: "all tasks finished", - state: PipelineRunState{&successfulTask, &successfulRun, &failedTask, &failedRun}, + state: PipelineRunState{&successfulTask, &successfulRun, &failedTask, &failedCustomRun}, }} for _, tc := range tcs { t.Run(tc.name, func(t *testing.T) { @@ -1613,17 +1596,6 @@ func TestPipelineRunState_GetFinalTasksAndNames(t *testing.T) { expectedFinalTasks: PipelineRunState{}, expectedFinalNames: sets.NewString(pts[1].Name), expectedTaskNames: sets.NewString(pts[0].Name), - }, { - // tasks: [ mytask1] - // finally: [mytask4] - name: "07 - DAG tasks succeeded, return retryable final tasks", - desc: "DAG task (mytask1) finished successfully - retry failed final tasks (mytask4)", - state: retryableFinalState, - DAGTasks: []v1beta1.PipelineTask{pts[0]}, - finalTasks: []v1beta1.PipelineTask{pts[3]}, - expectedFinalTasks: PipelineRunState{retryableFinalState[1]}, - expectedFinalNames: sets.NewString(pts[3].Name), - expectedTaskNames: sets.NewString(pts[0].Name), }} for _, tc := range tcs { dagGraph, err := dag.Build(v1beta1.PipelineTaskList(tc.DAGTasks), v1beta1.PipelineTaskList(tc.DAGTasks).Deps()) @@ -3528,3 +3500,78 @@ func customRunWithName(name string) *v1beta1.CustomRun { }, } } + +func TestGetRetriableTasks(t *testing.T) { + failedRun := parse.MustParseRun(t, ` +metadata: + name: custom-task + namespace: namespace +spec: + retries: 1 + params: + - name: param1 + value: value1 + ref: + apiVersion: example.dev/v0 + kind: Example +status: + conditions: + - type: Succeeded + status: "False" + reason: Timedout +`) + failedCustomRunPT := ResolvedPipelineTask{ + PipelineTask: &v1beta1.PipelineTask{ + Name: "failedrun", + TaskRef: &v1beta1.TaskRef{Name: "task"}, + }, + RunObjectName: "failedrun", + RunObject: makeRunFailed(runs[0]), + CustomTask: true, + } + failedTaskRunPTWithRetries := ResolvedPipelineTask{ + PipelineTask: &v1beta1.PipelineTask{ + Name: "failedtask", + TaskRef: &v1beta1.TaskRef{Name: "task"}, + Retries: 1, + }, + TaskRunName: "failedtask", + TaskRun: makeFailed(trs[0]), + ResolvedTaskResources: &resources.ResolvedTaskResources{ + TaskSpec: &task.Spec, + }, + } + failedRunPTWithRetries := ResolvedPipelineTask{ + PipelineTask: &v1beta1.PipelineTask{ + Name: "failedrunwithretries", + TaskRef: &v1beta1.TaskRef{Name: "task"}, + Retries: 1, + }, + RunObjectName: "failedrunwithretries", + RunObject: failedRun, + CustomTask: true, + } + for _, tc := range []struct { + name string + state PipelineRunState + candidates sets.String + want []*ResolvedPipelineTask + }{{ + name: "Failed Custom Task with Retries", + state: PipelineRunState{&failedRunPTWithRetries}, + candidates: sets.String{}.Insert(failedCustomRunPT.RunObjectName).Insert(failedRunPTWithRetries.RunObjectName), + want: []*ResolvedPipelineTask{&failedRunPTWithRetries}, + }, { + name: "Failed Custom Task and TaskRuns with Retries", + state: PipelineRunState{&failedRunPTWithRetries}, + candidates: sets.String{}.Insert(failedRunPTWithRetries.RunObjectName).Insert(failedTaskRunPTWithRetries.TaskRunName), + want: []*ResolvedPipelineTask{&failedRunPTWithRetries}, + }} { + t.Run(tc.name, func(t *testing.T) { + queue := tc.state.getRetriableRuns(tc.candidates) + if d := cmp.Diff(tc.want, queue); d != "" { + t.Errorf("Didn't get expected execution queue: %s", diff.PrintWantGot(d)) + } + }) + } +} diff --git a/pkg/reconciler/taskrun/taskrun.go b/pkg/reconciler/taskrun/taskrun.go index 506b934e11b..3bcaeb9f180 100644 --- a/pkg/reconciler/taskrun/taskrun.go +++ b/pkg/reconciler/taskrun/taskrun.go @@ -304,7 +304,9 @@ func (c *Reconciler) finishReconcileUpdateEmitEvents(ctx context.Context, tr *v1 logger := logging.FromContext(ctx) afterCondition := tr.Status.GetCondition(apis.ConditionSucceeded) - + if afterCondition.IsFalse() && !tr.IsCancelled() && tr.IsRetriable() { + retryTaskRun(tr, afterCondition.Message) + } // Send k8s events and cloud events (when configured) events.Emit(ctx, beforeCondition, afterCondition, tr) @@ -465,6 +467,7 @@ func (c *Reconciler) prepare(ctx context.Context, tr *v1beta1.TaskRun) (*v1beta1 // error but it does not sync updates back to etcd. It does not emit events. // `reconcile` consumes spec and resources returned by `prepare` func (c *Reconciler) reconcile(ctx context.Context, tr *v1beta1.TaskRun, rtr *resources.ResolvedTaskResources) error { + logger := logging.FromContext(ctx) recorder := controller.GetEventRecorder(ctx) var err error @@ -496,7 +499,7 @@ func (c *Reconciler) reconcile(ctx context.Context, tr *v1beta1.TaskRun, rtr *re } for index := range pos { po := pos[index] - if metav1.IsControlledBy(po, tr) && !podconvert.DidTaskRunFail(po) { + if metav1.IsControlledBy(po, tr) && !podconvert.DidTaskRunFail(po) && !podconvert.IsPodArchived(po, &tr.Status) { pod = po } } @@ -987,3 +990,16 @@ func isResourceQuotaConflictError(err error) bool { } return k8ErrStatus.Details != nil && k8ErrStatus.Details.Kind == "resourcequotas" } + +// retryTaskRun archives taskRun.Status to taskRun.Status.RetriesStatus, and set +// taskRun status to Unknown with Reason v1beta1.TaskRunReasonToBeRetried. +func retryTaskRun(tr *v1beta1.TaskRun, message string) { + newStatus := tr.Status.DeepCopy() + newStatus.RetriesStatus = nil + tr.Status.RetriesStatus = append(tr.Status.RetriesStatus, *newStatus) + tr.Status.StartTime = nil + tr.Status.CompletionTime = nil + tr.Status.PodName = "" + taskRunCondSet := apis.NewBatchConditionSet() + taskRunCondSet.Manage(&tr.Status).MarkUnknown(apis.ConditionSucceeded, v1beta1.TaskRunReasonToBeRetried.String(), message) +} diff --git a/pkg/reconciler/taskrun/taskrun_test.go b/pkg/reconciler/taskrun/taskrun_test.go index 0c622c095d0..710dd187e78 100644 --- a/pkg/reconciler/taskrun/taskrun_test.go +++ b/pkg/reconciler/taskrun/taskrun_test.go @@ -104,6 +104,11 @@ var ( }, cmp.Comparer(func(name1, name2 string) bool { return name1[:len(name1)-5] == name2[:len(name2)-5] })) + ignoreCompletionTime = cmpopts.IgnoreFields(v1beta1.TaskRunStatusFields{}, "CompletionTime") + ignoreObjectMeta = cmpopts.IgnoreFields(metav1.ObjectMeta{}, "Labels", "ResourceVersion", "Annotations") + ignoreStatusTaskSpec = cmpopts.IgnoreFields(v1beta1.TaskRunStatusFields{}, "TaskSpec") + ignoreTaskRunStatusFields = cmpopts.IgnoreFields(v1beta1.TaskRunStatusFields{}, "Steps", "Sidecars") + resourceQuantityCmp = cmp.Comparer(func(x, y resource.Quantity) bool { return x.Cmp(y) == 0 }) @@ -1710,6 +1715,339 @@ spec: } } +func TestReconcileRetry(t *testing.T) { + var ( + toBeCanceledTaskRun = parse.MustParseV1beta1TaskRun(t, ` +metadata: + name: test-taskrun-run-retry-canceled + namespace: foo +spec: + retries: 1 + status: TaskRunCancelled + taskRef: + name: test-task +status: + startTime: "2021-12-31T23:59:59Z" + conditions: + - reason: Running + status: Unknown + type: Succeeded + message: "TaskRun \"test-taskrun-run-retry-canceled\" was cancelled. " +`) + canceledTaskRun = parse.MustParseV1beta1TaskRun(t, ` +metadata: + name: test-taskrun-run-retry-canceled + namespace: foo +spec: + retries: 1 + status: TaskRunCancelled + taskRef: + name: test-task +status: + startTime: "2021-12-31T23:59:59Z" + completionTime: "2022-01-01T00:00:00Z" + conditions: + - reason: TaskRunCancelled + status: "False" + type: Succeeded + message: "TaskRun \"test-taskrun-run-retry-canceled\" was cancelled. " +`) + toBeTimedOutTaskRun = parse.MustParseV1beta1TaskRun(t, ` +metadata: + name: test-taskrun-run-retry-timedout + namespace: foo +spec: + retries: 1 + timeout: "10s" + taskRef: + name: test-task +status: + startTime: "2021-12-31T00:00:00Z" + conditions: + - reason: Running + status: Unknown + type: Succeeded + message: TaskRun "test-taskrun-run-retry-timedout" failed to finish within "10s" +`) + timedOutTaskRun = parse.MustParseV1beta1TaskRun(t, ` +metadata: + name: test-taskrun-run-retry-timedout + namespace: foo +spec: + retries: 1 + timeout: "10s" + taskRef: + name: test-task +status: + conditions: + - reason: ToBeRetried + status: Unknown + type: Succeeded + message: TaskRun "test-taskrun-run-retry-timedout" failed to finish within "10s" + retriesStatus: + - conditions: + - reason: "TaskRunTimeout" + status: "False" + type: Succeeded + message: TaskRun "test-taskrun-run-retry-timedout" failed to finish within "10s" + startTime: "2021-12-31T00:00:00Z" + completionTime: "2022-01-01T00:00:00Z" + `) + toFailOnPodFailureTaskRun = parse.MustParseV1beta1TaskRun(t, ` +metadata: + name: test-taskrun-run-retry-pod-failure + namespace: foo +spec: + retries: 1 + taskRef: + name: test-task +status: + startTime: "2021-12-31T23:59:59Z" + podName: test-taskrun-run-retry-pod-failure-pod + steps: + - container: step-unamed-0 + name: unamed-0 + waiting: + reason: "ImagePullBackOff" +`) + failedOnPodFailureTaskRun = parse.MustParseV1beta1TaskRun(t, ` +metadata: + name: test-taskrun-run-retry-pod-failure + namespace: foo +spec: + retries: 1 + taskRef: + name: test-task +status: + conditions: + - reason: ToBeRetried + status: Unknown + type: Succeeded + message: "The step \"unamed-0\" in TaskRun \"test-taskrun-run-retry-pod-failure\" failed to pull the image \"\". The pod errored with the message: \".\"" + steps: + - container: step-unamed-0 + name: unamed-0 + terminated: + exitCode: 1 + finishedAt: "2022-01-01T00:00:00Z" + reason: "TaskRunImagePullFailed" + retriesStatus: + - conditions: + - reason: "TaskRunImagePullFailed" + status: "False" + type: Succeeded + message: "The step \"unamed-0\" in TaskRun \"test-taskrun-run-retry-pod-failure\" failed to pull the image \"\". The pod errored with the message: \".\"" + startTime: "2021-12-31T23:59:59Z" + completionTime: "2022-01-01T00:00:00Z" + podName: test-taskrun-run-retry-pod-failure-pod + steps: + - container: step-unamed-0 + name: unamed-0 + terminated: + exitCode: 1 + finishedAt: "2022-01-01T00:00:00Z" + reason: "TaskRunImagePullFailed" +`) + failedPod = &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "test-taskrun-run-retry-pod-failure-pod"}, + Status: corev1.PodStatus{Conditions: []corev1.PodCondition{{ + Type: corev1.PodReady, + Status: "False", + Reason: "PodFailed", + }}}, + } + toFailOnPrepareTaskRun = parse.MustParseV1beta1TaskRun(t, ` +metadata: + name: test-taskrun-run-retry-prepare-failure + namespace: foo +spec: + retries: 1 + taskRef: + name: test-task +status: + startTime: "2021-12-31T23:59:59Z" + conditions: + - reason: Running + status: Unknown + type: Succeeded + lastTransitionTime: "2022-01-01T00:00:00Z" +`) + failedOnPrepareTaskRun = parse.MustParseV1beta1TaskRun(t, ` +metadata: + name: test-taskrun-run-retry-prepare-failure + namespace: foo +spec: + retries: 1 + taskRef: + name: test-task +status: + conditions: + - reason: ToBeRetried + status: Unknown + type: Succeeded + message: "error when listing tasks for taskRun test-taskrun-run-retry-prepare-failure: failed to get task: tasks.tekton.dev \"test-task\" not found" + retriesStatus: + - conditions: + - reason: TaskRunResolutionFailed + status: "False" + type: "Succeeded" + message: "error when listing tasks for taskRun test-taskrun-run-retry-prepare-failure: failed to get task: tasks.tekton.dev \"test-task\" not found" + startTime: "2021-12-31T23:59:59Z" + completionTime: "2022-01-01T00:00:00Z" +`) + prepareError = fmt.Errorf("1 error occurred:\n\t* error when listing tasks for taskRun test-taskrun-run-retry-prepare-failure: failed to get task: tasks.tekton.dev \"test-task\" not found") + toFailOnReconcileFailureTaskRun = parse.MustParseV1beta1TaskRun(t, ` +metadata: + name: test-taskrun-results-type-mismatched + namespace: foo +spec: + retries: 1 + taskRef: + name: test-results-task +status: + startTime: "2021-12-31T23:59:59Z" + taskResults: + - name: aResult + type: string + value: aResultValue +`) + failedOnReconcileFailureTaskRun = parse.MustParseV1beta1TaskRun(t, ` +metadata: + name: test-taskrun-results-type-mismatched + namespace: foo +spec: + retries: 1 + taskRef: + name: test-results-task +status: + taskResults: + - name: aResult + type: string + value: aResultValue + conditions: + - reason: ToBeRetried + status: Unknown + type: Succeeded + message: "missmatched Types for these results, map[aResult:[array]]" + sideCars: + retriesStatus: + - conditions: + - reason: TaskRunValidationFailed + status: "False" + type: "Succeeded" + message: "missmatched Types for these results, map[aResult:[array]]" + startTime: "2021-12-31T23:59:59Z" + completionTime: "2022-01-01T00:00:00Z" + podName: "test-taskrun-results-type-mismatched-pod" + taskResults: + - name: aResult + type: string + value: aResultValue +`) + reconciliatonError = fmt.Errorf("1 error occurred:\n\t* missmatched Types for these results, map[aResult:[array]]") + ) + + for _, tc := range []struct { + name string + testData test.Data + task *v1beta1.Task + tr *v1beta1.TaskRun + pod *corev1.Pod + wantTr *v1beta1.TaskRun + wantReconcileError error + wantCompletionTime bool + }{{ + name: "No Retry on Cancellation", + testData: test.Data{ + TaskRuns: []*v1beta1.TaskRun{toBeCanceledTaskRun}, + Tasks: []*v1beta1.Task{simpleTask}, + }, + tr: toBeCanceledTaskRun, + wantTr: canceledTaskRun, + wantCompletionTime: true, + }, { + name: "Retry on TimedOut", + testData: test.Data{ + TaskRuns: []*v1beta1.TaskRun{toBeTimedOutTaskRun}, + Tasks: []*v1beta1.Task{simpleTask}, + }, + tr: toBeTimedOutTaskRun, + wantTr: timedOutTaskRun, + }, { + name: "Retry on TaskRun Pod Failure", + testData: test.Data{ + TaskRuns: []*v1beta1.TaskRun{toFailOnPodFailureTaskRun}, + Tasks: []*v1beta1.Task{simpleTask}, + Pods: []*corev1.Pod{failedPod}, + }, + tr: toFailOnPodFailureTaskRun, + wantTr: failedOnPodFailureTaskRun, + }, { + name: "Retry on TaskRun Prepare Failure", + testData: test.Data{ + TaskRuns: []*v1beta1.TaskRun{toFailOnPrepareTaskRun}, + }, + tr: toFailOnPrepareTaskRun, + wantTr: failedOnPrepareTaskRun, + wantReconcileError: prepareError, + }, { + name: "Retry on TaskRun Reconciliation Failure", + testData: test.Data{ + TaskRuns: []*v1beta1.TaskRun{toFailOnReconcileFailureTaskRun}, + Tasks: []*v1beta1.Task{resultsTask}, + ConfigMaps: []*corev1.ConfigMap{{ + ObjectMeta: metav1.ObjectMeta{Namespace: system.Namespace(), Name: config.GetFeatureFlagsConfigName()}, + Data: map[string]string{ + "enable-api-fields": config.AlphaAPIFields, + }, + }}, + }, + tr: toFailOnReconcileFailureTaskRun, + wantTr: failedOnReconcileFailureTaskRun, + wantReconcileError: reconciliatonError, + }} { + t.Run(tc.name, func(t *testing.T) { + testAssets, cancel := getTaskRunController(t, tc.testData) + defer cancel() + c := testAssets.Controller + clients := testAssets.Clients + createServiceAccount(t, testAssets, "default", tc.tr.Namespace) + + err := c.Reconciler.Reconcile(testAssets.Ctx, getRunName(tc.tr)) + if ok, _ := controller.IsRequeueKey(err); err == nil || !ok { + // No error; or error equals each other + if !(err == nil && tc.wantReconcileError == nil) && + !(err != nil && tc.wantReconcileError != nil && strings.TrimSuffix(err.Error(), "\n\n") == tc.wantReconcileError.Error()) { + t.Errorf("Reconcile(): %v, want %v", err, tc.wantReconcileError) + } + } + + reconciledTaskRun, err := clients.Pipeline.TektonV1beta1().TaskRuns("foo").Get(testAssets.Ctx, tc.tr.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("got %v; want nil", err) + } + + ignoreFields := []cmp.Option{ + ignoreLastTransitionTime, + ignoreCompletionTime, + ignoreObjectMeta, + ignoreStatusTaskSpec, + ignoreTaskRunStatusFields, + } + if d := cmp.Diff(reconciledTaskRun, tc.wantTr, ignoreFields...); d != "" { + t.Errorf("Didn't get expected TaskRun: %v", diff.PrintWantGot(d)) + } + + if !tc.wantCompletionTime && reconciledTaskRun.Status.CompletionTime != nil { + t.Error("Expect completion time to be nil") + } + if tc.wantCompletionTime && reconciledTaskRun.Status.CompletionTime == nil { + t.Error("Didn't expect completion time to be nil") + } + }) + } +} + func TestReconcileGetTaskError(t *testing.T) { tr := parse.MustParseV1beta1TaskRun(t, ` metadata: