Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TEP-0121: Retries - Reconciler Implementation #5844

Merged
merged 1 commit into from
Dec 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions docs/taskruns.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ weight: 300
- [Specifying `Sidecars`](#specifying-sidecars)
- [Overriding `Task` `Steps` and `Sidecars`](#overriding-task-steps-and-sidecars)
- [Specifying `LimitRange` values](#specifying-limitrange-values)
- [Specifying `Retries`](#specifying-retries)
- [Configuring the failure timeout](#configuring-the-failure-timeout)
- [Specifying `ServiceAccount` credentials](#specifying-serviceaccount-credentials)
- [Monitoring execution status](#monitoring-execution-status)
Expand Down Expand Up @@ -698,11 +699,25 @@ object(s), if present. Any `Request` or `Limit` specified by the user (on `Task`

For more information, see the [`LimitRange` support in Pipeline](./compute-resources.md#limitrange-support).

### Specifying `Retries`
You can use the `retries` field to set how many times you want to retry on a failed TaskRun.
All TaskRun failures are retriable except for `Cancellation`.

For a retriable `TaskRun`, when an error occurs:
- The error status is archived in `status.RetriesStatus`
- The `Succeeded` condition in `status` is updated:
```
Type: Succeeded
Status: Unknown
Reason: ToBeRetried
```
- `status.StartTime` and `status.PodName` are unset to trigger another retry attempt.

### Configuring the failure timeout

You can use the `timeout` field to set the `TaskRun's` desired timeout value. If you do not specify this
value for the `TaskRun`, the global default timeout value applies. If you set the timeout to 0, the `TaskRun` will
have no timeout and will run until it completes successfully or fails from an error.
You can use the `timeout` field to set the `TaskRun's` desired timeout value for **each retry attempt**. If you do
not specify this value, the global default timeout value applies (the same, to `each retry attempt`). If you set the timeout to 0,
the `TaskRun` will have no timeout and will run until it completes successfully or fails from an error.

The global default timeout is set to 60 minutes when you first install Tekton. You can set
a different global default timeout value using the `default-timeout-minutes` field in
Expand Down
10 changes: 10 additions & 0 deletions pkg/pod/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,16 @@ func DidTaskRunFail(pod *corev1.Pod) bool {
return f
}

// IsPodArchived indicates if a pod is archived in the retriesStatus.
func IsPodArchived(pod *corev1.Pod, trs *v1beta1.TaskRunStatus) bool {
for _, retryStatus := range trs.RetriesStatus {
if retryStatus.PodName == pod.GetName() {
return true
}
}
return false
}

func areStepsComplete(pod *corev1.Pod) bool {
stepsComplete := len(pod.Status.ContainerStatuses) > 0 && pod.Status.Phase == corev1.PodRunning
for _, s := range pod.Status.ContainerStatuses {
Expand Down
45 changes: 45 additions & 0 deletions pkg/pod/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1634,6 +1634,51 @@ func TestMarkStatusSuccess(t *testing.T) {
}
}

func TestIsPodArchived(t *testing.T) {
for _, tc := range []struct {
name string
podName string
retriesStatus []v1beta1.TaskRunStatus
want bool
}{{
name: "Pod is not in the empty retriesStatus",
podName: "pod",
retriesStatus: []v1beta1.TaskRunStatus{},
want: false,
}, {
name: "Pod is not in the non-empty retriesStatus",
podName: "pod-retry1",
retriesStatus: []v1beta1.TaskRunStatus{{
TaskRunStatusFields: v1beta1.TaskRunStatusFields{
PodName: "pod",
},
}},
want: false,
}, {
name: "Pod is in the retriesStatus",
podName: "pod",
retriesStatus: []v1beta1.TaskRunStatus{{
TaskRunStatusFields: v1beta1.TaskRunStatusFields{
PodName: "pod",
}},
},
want: true,
}} {
t.Run(tc.name, func(t *testing.T) {
trs := v1beta1.TaskRunStatus{
TaskRunStatusFields: v1beta1.TaskRunStatusFields{
PodName: "pod",
RetriesStatus: tc.retriesStatus,
},
}
got := IsPodArchived(&corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: tc.podName}}, &trs)
if tc.want != got {
t.Errorf("got: %v, want: %v", got, tc.want)
}
})
}
}

func statusRunning() duckv1.Status {
var trs v1beta1.TaskRunStatus
markStatusRunning(&trs, v1beta1.TaskRunReasonRunning.String(), "Not all Steps in the Task have finished executing")
Expand Down
31 changes: 2 additions & 29 deletions pkg/reconciler/pipelinerun/pipelinerun.go
Original file line number Diff line number Diff line change
Expand Up @@ -888,26 +888,10 @@ func (c *Reconciler) createTaskRuns(ctx context.Context, rpt *resources.Resolved
func (c *Reconciler) createTaskRun(ctx context.Context, taskRunName string, params []v1beta1.Param, rpt *resources.ResolvedPipelineTask, pr *v1beta1.PipelineRun, storageBasePath string) (*v1beta1.TaskRun, error) {
logger := logging.FromContext(ctx)

tr, _ := c.taskRunLister.TaskRuns(pr.Namespace).Get(taskRunName)
XinruZhang marked this conversation as resolved.
Show resolved Hide resolved
if tr != nil {
// retry should happen only when the taskrun has failed
if !tr.Status.GetCondition(apis.ConditionSucceeded).IsFalse() {
return tr, nil
}
// Don't modify the lister cache's copy.
tr = tr.DeepCopy()
// is a retry
addRetryHistory(tr)
clearStatus(tr)
tr.Status.MarkResourceOngoing("", "")
logger.Infof("Updating taskrun %s with cleared status and retry history (length: %d).", tr.GetName(), len(tr.Status.RetriesStatus))
return c.PipelineClientSet.TektonV1beta1().TaskRuns(pr.Namespace).UpdateStatus(ctx, tr, metav1.UpdateOptions{})
}

rpt.PipelineTask = resources.ApplyPipelineTaskContexts(rpt.PipelineTask)
taskRunSpec := pr.GetTaskRunSpec(rpt.PipelineTask.Name)
params = append(params, rpt.PipelineTask.Params...)
tr = &v1beta1.TaskRun{
tr := &v1beta1.TaskRun{
ObjectMeta: metav1.ObjectMeta{
Name: taskRunName,
Namespace: pr.Namespace,
Expand All @@ -916,6 +900,7 @@ func (c *Reconciler) createTaskRun(ctx context.Context, taskRunName string, para
Annotations: combineTaskRunAndTaskSpecAnnotations(pr, rpt.PipelineTask),
},
Spec: v1beta1.TaskRunSpec{
Retries: rpt.PipelineTask.Retries,
afrittoli marked this conversation as resolved.
Show resolved Hide resolved
Params: params,
ServiceAccountName: taskRunSpec.TaskServiceAccountName,
PodTemplate: taskRunSpec.TaskPodTemplate,
Expand Down Expand Up @@ -1169,18 +1154,6 @@ func combinedSubPath(workspaceSubPath string, pipelineTaskSubPath string) string
return filepath.Join(workspaceSubPath, pipelineTaskSubPath)
}

func addRetryHistory(tr *v1beta1.TaskRun) {
newStatus := *tr.Status.DeepCopy()
newStatus.RetriesStatus = nil
tr.Status.RetriesStatus = append(tr.Status.RetriesStatus, newStatus)
}

func clearStatus(tr *v1beta1.TaskRun) {
tr.Status.StartTime = nil
tr.Status.CompletionTime = nil
tr.Status.PodName = ""
}

func getTaskrunAnnotations(pr *v1beta1.PipelineRun) map[string]string {
// Propagate annotations from PipelineRun to TaskRun.
annotations := make(map[string]string, len(pr.ObjectMeta.Annotations)+1)
Expand Down
124 changes: 24 additions & 100 deletions pkg/reconciler/pipelinerun/pipelinerun_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,7 @@ spec:
value: test-pipeline
- name: contextRetriesParam
value: "5"
retries: 5
resources:
inputs:
- name: workspace
Expand Down Expand Up @@ -3527,103 +3528,6 @@ spec:
}
}

// TestReconcileWithTimeoutAndRetry runs "Reconcile" against pipelines with
// retries and timeout settings, and status that represents different number of
// retries already performed. It verifies the reconciled status and events
// generated
func TestReconcileWithTimeoutAndRetry(t *testing.T) {
jerop marked this conversation as resolved.
Show resolved Hide resolved
for _, tc := range []struct {
name string
retries int
conditionSucceeded corev1.ConditionStatus
wantEvents []string
}{{
name: "One try has to be done",
retries: 1,
conditionSucceeded: corev1.ConditionFalse,
wantEvents: []string{
"Warning Failed PipelineRun \"test-pipeline-retry-run-with-timeout\" failed to finish within",
},
}, {
name: "No more retries are needed",
retries: 2,
conditionSucceeded: corev1.ConditionUnknown,
wantEvents: []string{
"Warning Failed PipelineRun \"test-pipeline-retry-run-with-timeout\" failed to finish within",
},
}} {
t.Run(tc.name, func(t *testing.T) {
ps := []*v1beta1.Pipeline{parse.MustParseV1beta1Pipeline(t, fmt.Sprintf(`
metadata:
name: test-pipeline-retry
namespace: foo
spec:
tasks:
- name: hello-world-1
retries: %d
taskRef:
name: hello-world
`, tc.retries))}
prs := []*v1beta1.PipelineRun{parse.MustParseV1beta1PipelineRun(t, `
metadata:
name: test-pipeline-retry-run-with-timeout
namespace: foo
spec:
pipelineRef:
name: test-pipeline-retry
serviceAccountName: test-sa
timeout: 12h0m0s
status:
startTime: "2021-12-31T00:00:00Z"
`)}

ts := []*v1beta1.Task{
simpleHelloWorldTask,
}
trs := []*v1beta1.TaskRun{parse.MustParseV1beta1TaskRun(t, `
metadata:
name: hello-world-1
namespace: foo
status:
conditions:
- status: "False"
type: Succeeded
podName: my-pod-name
retriesStatus:
- conditions:
- status: "False"
type: Succeeded
`)}

prtrs := &v1beta1.PipelineRunTaskRunStatus{
PipelineTaskName: "hello-world-1",
Status: &trs[0].Status,
}
prs[0].Status.TaskRuns = make(map[string]*v1beta1.PipelineRunTaskRunStatus)
prs[0].Status.TaskRuns["hello-world-1"] = prtrs

d := test.Data{
PipelineRuns: prs,
Pipelines: ps,
Tasks: ts,
TaskRuns: trs,
}
prt := newPipelineRunTest(d, t)
defer prt.Cancel()

reconciledRun, _ := prt.reconcileRun("foo", "test-pipeline-retry-run-with-timeout", []string{}, false)

if len(reconciledRun.Status.TaskRuns["hello-world-1"].Status.RetriesStatus) != tc.retries {
t.Fatalf(" %d retries expected but got %d ", tc.retries, len(reconciledRun.Status.TaskRuns["hello-world-1"].Status.RetriesStatus))
}

if status := reconciledRun.Status.TaskRuns["hello-world-1"].Status.GetCondition(apis.ConditionSucceeded).Status; status != tc.conditionSucceeded {
t.Fatalf("Succeeded expected to be %s but is %s", tc.conditionSucceeded, status)
}
})
}
}

// TestReconcileAndPropagateCustomPipelineTaskRunSpec tests that custom PipelineTaskRunSpec declared
// in PipelineRun is propagated to created TaskRuns
func TestReconcileAndPropagateCustomPipelineTaskRunSpec(t *testing.T) {
Expand Down Expand Up @@ -9673,6 +9577,7 @@ spec:
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
XinruZhang marked this conversation as resolved.
Show resolved Hide resolved
params:
- name: platform
value: linux
Expand All @@ -9687,12 +9592,17 @@ status:
conditions:
- type: Succeeded
status: "False"
retriesStatus:
- conditions:
- status: "False"
type: Succeeded
`),
mustParseTaskRunWithObjectMeta(t,
taskRunObjectMeta("pr-platforms-and-browsers-1", "foo",
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: mac
Expand Down Expand Up @@ -9807,6 +9717,7 @@ status:
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: linux
Expand All @@ -9820,7 +9731,7 @@ spec:
status:
conditions:
- type: Succeeded
status: "Unknown"
status: "False"
retriesStatus:
- conditions:
- status: "False"
Expand All @@ -9831,6 +9742,7 @@ status:
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: mac
Expand All @@ -9855,6 +9767,7 @@ status:
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: linux
Expand All @@ -9868,13 +9781,18 @@ spec:
status:
conditions:
- type: Succeeded
status: "False"
status: "Unknown"
retriesStatus:
- conditions:
- status: "False"
type: Succeeded
`),
mustParseTaskRunWithObjectMeta(t,
taskRunObjectMeta("pr-platforms-and-browsers-1", "foo",
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: mac
Expand All @@ -9888,7 +9806,11 @@ spec:
status:
conditions:
- type: Succeeded
status: "False"
status: "Unknown"
retriesStatus:
- conditions:
- status: "False"
type: Succeeded
`),
},
prs: []*v1beta1.PipelineRun{
Expand Down Expand Up @@ -9989,6 +9911,7 @@ status:
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: linux
Expand All @@ -10013,6 +9936,7 @@ status:
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: mac
Expand Down
Loading