Skip to content

Commit

Permalink
TEP-0121: Reconciler Implementation
Browse files Browse the repository at this point in the history
Prior to this commit, `retries` logic for TaskRun is handled by
Tekton `PipelineRun` reconciler. This commit delegates the retries
implementation for `TaskRun` to the `TaskRun` reconciler.

The major change is we stopped relying on `len(retriesStatus)` to
decide if a target `TaskRun` failed or not. Instead, we use status
`ConditionSuceeded` to gate the completion of a `TaskRun`. Even a
`TaskRun` failed on one execution, as long as it has remaining
retries, the TaskRun won't be stored in etcd with its status set
as `Failed`. Instead, the status will be:

```yaml
Type: Succeeded
Status: Unknown
Reason: ToBeRetried
```
  • Loading branch information
XinruZhang committed Dec 20, 2022
1 parent 17b705c commit 2087371
Show file tree
Hide file tree
Showing 11 changed files with 648 additions and 383 deletions.
21 changes: 18 additions & 3 deletions docs/taskruns.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ weight: 300
- [Specifying `Sidecars`](#specifying-sidecars)
- [Overriding `Task` `Steps` and `Sidecars`](#overriding-task-steps-and-sidecars)
- [Specifying `LimitRange` values](#specifying-limitrange-values)
- [Specifying `Retries`](#specifying-retries)
- [Configuring the failure timeout](#configuring-the-failure-timeout)
- [Specifying `ServiceAccount` credentials](#specifying-serviceaccount-credentials)
- [Monitoring execution status](#monitoring-execution-status)
Expand Down Expand Up @@ -698,11 +699,25 @@ object(s), if present. Any `Request` or `Limit` specified by the user (on `Task`

For more information, see the [`LimitRange` support in Pipeline](./compute-resources.md#limitrange-support).

### Specifying `Retries`
You can use the `retries` field to set how many times you want to retry on a failed TaskRun.
All TaskRun failures are retriable except for `Cancellation`.

For a retriable `TaskRun`, when an error occurs:
- The error status is archived in `status.RetriesStatus`
- The `Succeeded` condition in `status` is updated:
```
Type: Succeeded
Status: Unknown
Reason: ToBeRetried
```
- `status.StartTime` and `status.PodName` are unset to trigger another retry attempt.

### Configuring the failure timeout

You can use the `timeout` field to set the `TaskRun's` desired timeout value. If you do not specify this
value for the `TaskRun`, the global default timeout value applies. If you set the timeout to 0, the `TaskRun` will
have no timeout and will run until it completes successfully or fails from an error.
You can use the `timeout` field to set the `TaskRun's` desired timeout value for **each retry attempt**. If you do
not specify this value, the global default timeout value applies (the same, to `each retry attempt`). If you set the timeout to 0,
the `TaskRun` will have no timeout and will run until it completes successfully or fails from an error.

The global default timeout is set to 60 minutes when you first install Tekton. You can set
a different global default timeout value using the `default-timeout-minutes` field in
Expand Down
10 changes: 10 additions & 0 deletions pkg/pod/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,16 @@ func DidTaskRunFail(pod *corev1.Pod) bool {
return f
}

// IsPodArchived indicates if a pod is archived in the retriesStatus.
func IsPodArchived(pod *corev1.Pod, trs *v1beta1.TaskRunStatus) bool {
for _, retryStatus := range trs.RetriesStatus {
if retryStatus.PodName == pod.GetName() {
return true
}
}
return false
}

func areStepsComplete(pod *corev1.Pod) bool {
stepsComplete := len(pod.Status.ContainerStatuses) > 0 && pod.Status.Phase == corev1.PodRunning
for _, s := range pod.Status.ContainerStatuses {
Expand Down
45 changes: 45 additions & 0 deletions pkg/pod/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1634,6 +1634,51 @@ func TestMarkStatusSuccess(t *testing.T) {
}
}

func TestIsPodArchived(t *testing.T) {
for _, tc := range []struct {
name string
podName string
retriesStatus []v1beta1.TaskRunStatus
want bool
}{{
name: "Pod is not in the empty retriesStatus",
podName: "pod",
retriesStatus: []v1beta1.TaskRunStatus{},
want: false,
}, {
name: "Pod is not in the non-empty retriesStatus",
podName: "pod-retry1",
retriesStatus: []v1beta1.TaskRunStatus{{
TaskRunStatusFields: v1beta1.TaskRunStatusFields{
PodName: "pod",
},
}},
want: false,
}, {
name: "Pod is in the retriesStatus",
podName: "pod",
retriesStatus: []v1beta1.TaskRunStatus{{
TaskRunStatusFields: v1beta1.TaskRunStatusFields{
PodName: "pod",
}},
},
want: true,
}} {
t.Run(tc.name, func(t *testing.T) {
trs := v1beta1.TaskRunStatus{
TaskRunStatusFields: v1beta1.TaskRunStatusFields{
PodName: "pod",
RetriesStatus: tc.retriesStatus,
},
}
got := IsPodArchived(&corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: tc.podName}}, &trs)
if tc.want != got {
t.Errorf("got: %v, want: %v", got, tc.want)
}
})
}
}

func statusRunning() duckv1.Status {
var trs v1beta1.TaskRunStatus
markStatusRunning(&trs, v1beta1.TaskRunReasonRunning.String(), "Not all Steps in the Task have finished executing")
Expand Down
31 changes: 2 additions & 29 deletions pkg/reconciler/pipelinerun/pipelinerun.go
Original file line number Diff line number Diff line change
Expand Up @@ -888,26 +888,10 @@ func (c *Reconciler) createTaskRuns(ctx context.Context, rpt *resources.Resolved
func (c *Reconciler) createTaskRun(ctx context.Context, taskRunName string, params []v1beta1.Param, rpt *resources.ResolvedPipelineTask, pr *v1beta1.PipelineRun, storageBasePath string) (*v1beta1.TaskRun, error) {
logger := logging.FromContext(ctx)

tr, _ := c.taskRunLister.TaskRuns(pr.Namespace).Get(taskRunName)
if tr != nil {
// retry should happen only when the taskrun has failed
if !tr.Status.GetCondition(apis.ConditionSucceeded).IsFalse() {
return tr, nil
}
// Don't modify the lister cache's copy.
tr = tr.DeepCopy()
// is a retry
addRetryHistory(tr)
clearStatus(tr)
tr.Status.MarkResourceOngoing("", "")
logger.Infof("Updating taskrun %s with cleared status and retry history (length: %d).", tr.GetName(), len(tr.Status.RetriesStatus))
return c.PipelineClientSet.TektonV1beta1().TaskRuns(pr.Namespace).UpdateStatus(ctx, tr, metav1.UpdateOptions{})
}

rpt.PipelineTask = resources.ApplyPipelineTaskContexts(rpt.PipelineTask)
taskRunSpec := pr.GetTaskRunSpec(rpt.PipelineTask.Name)
params = append(params, rpt.PipelineTask.Params...)
tr = &v1beta1.TaskRun{
tr := &v1beta1.TaskRun{
ObjectMeta: metav1.ObjectMeta{
Name: taskRunName,
Namespace: pr.Namespace,
Expand All @@ -916,6 +900,7 @@ func (c *Reconciler) createTaskRun(ctx context.Context, taskRunName string, para
Annotations: combineTaskRunAndTaskSpecAnnotations(pr, rpt.PipelineTask),
},
Spec: v1beta1.TaskRunSpec{
Retries: rpt.PipelineTask.Retries,
Params: params,
ServiceAccountName: taskRunSpec.TaskServiceAccountName,
PodTemplate: taskRunSpec.TaskPodTemplate,
Expand Down Expand Up @@ -1169,18 +1154,6 @@ func combinedSubPath(workspaceSubPath string, pipelineTaskSubPath string) string
return filepath.Join(workspaceSubPath, pipelineTaskSubPath)
}

func addRetryHistory(tr *v1beta1.TaskRun) {
newStatus := *tr.Status.DeepCopy()
newStatus.RetriesStatus = nil
tr.Status.RetriesStatus = append(tr.Status.RetriesStatus, newStatus)
}

func clearStatus(tr *v1beta1.TaskRun) {
tr.Status.StartTime = nil
tr.Status.CompletionTime = nil
tr.Status.PodName = ""
}

func getTaskrunAnnotations(pr *v1beta1.PipelineRun) map[string]string {
// Propagate annotations from PipelineRun to TaskRun.
annotations := make(map[string]string, len(pr.ObjectMeta.Annotations)+1)
Expand Down
124 changes: 24 additions & 100 deletions pkg/reconciler/pipelinerun/pipelinerun_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,7 @@ spec:
value: test-pipeline
- name: contextRetriesParam
value: "5"
retries: 5
resources:
inputs:
- name: workspace
Expand Down Expand Up @@ -3527,103 +3528,6 @@ spec:
}
}

// TestReconcileWithTimeoutAndRetry runs "Reconcile" against pipelines with
// retries and timeout settings, and status that represents different number of
// retries already performed. It verifies the reconciled status and events
// generated
func TestReconcileWithTimeoutAndRetry(t *testing.T) {
for _, tc := range []struct {
name string
retries int
conditionSucceeded corev1.ConditionStatus
wantEvents []string
}{{
name: "One try has to be done",
retries: 1,
conditionSucceeded: corev1.ConditionFalse,
wantEvents: []string{
"Warning Failed PipelineRun \"test-pipeline-retry-run-with-timeout\" failed to finish within",
},
}, {
name: "No more retries are needed",
retries: 2,
conditionSucceeded: corev1.ConditionUnknown,
wantEvents: []string{
"Warning Failed PipelineRun \"test-pipeline-retry-run-with-timeout\" failed to finish within",
},
}} {
t.Run(tc.name, func(t *testing.T) {
ps := []*v1beta1.Pipeline{parse.MustParseV1beta1Pipeline(t, fmt.Sprintf(`
metadata:
name: test-pipeline-retry
namespace: foo
spec:
tasks:
- name: hello-world-1
retries: %d
taskRef:
name: hello-world
`, tc.retries))}
prs := []*v1beta1.PipelineRun{parse.MustParseV1beta1PipelineRun(t, `
metadata:
name: test-pipeline-retry-run-with-timeout
namespace: foo
spec:
pipelineRef:
name: test-pipeline-retry
serviceAccountName: test-sa
timeout: 12h0m0s
status:
startTime: "2021-12-31T00:00:00Z"
`)}

ts := []*v1beta1.Task{
simpleHelloWorldTask,
}
trs := []*v1beta1.TaskRun{parse.MustParseV1beta1TaskRun(t, `
metadata:
name: hello-world-1
namespace: foo
status:
conditions:
- status: "False"
type: Succeeded
podName: my-pod-name
retriesStatus:
- conditions:
- status: "False"
type: Succeeded
`)}

prtrs := &v1beta1.PipelineRunTaskRunStatus{
PipelineTaskName: "hello-world-1",
Status: &trs[0].Status,
}
prs[0].Status.TaskRuns = make(map[string]*v1beta1.PipelineRunTaskRunStatus)
prs[0].Status.TaskRuns["hello-world-1"] = prtrs

d := test.Data{
PipelineRuns: prs,
Pipelines: ps,
Tasks: ts,
TaskRuns: trs,
}
prt := newPipelineRunTest(d, t)
defer prt.Cancel()

reconciledRun, _ := prt.reconcileRun("foo", "test-pipeline-retry-run-with-timeout", []string{}, false)

if len(reconciledRun.Status.TaskRuns["hello-world-1"].Status.RetriesStatus) != tc.retries {
t.Fatalf(" %d retries expected but got %d ", tc.retries, len(reconciledRun.Status.TaskRuns["hello-world-1"].Status.RetriesStatus))
}

if status := reconciledRun.Status.TaskRuns["hello-world-1"].Status.GetCondition(apis.ConditionSucceeded).Status; status != tc.conditionSucceeded {
t.Fatalf("Succeeded expected to be %s but is %s", tc.conditionSucceeded, status)
}
})
}
}

// TestReconcileAndPropagateCustomPipelineTaskRunSpec tests that custom PipelineTaskRunSpec declared
// in PipelineRun is propagated to created TaskRuns
func TestReconcileAndPropagateCustomPipelineTaskRunSpec(t *testing.T) {
Expand Down Expand Up @@ -9673,6 +9577,7 @@ spec:
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: linux
Expand All @@ -9687,12 +9592,17 @@ status:
conditions:
- type: Succeeded
status: "False"
retriesStatus:
- conditions:
- status: "False"
type: Succeeded
`),
mustParseTaskRunWithObjectMeta(t,
taskRunObjectMeta("pr-platforms-and-browsers-1", "foo",
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: mac
Expand Down Expand Up @@ -9807,6 +9717,7 @@ status:
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: linux
Expand All @@ -9820,7 +9731,7 @@ spec:
status:
conditions:
- type: Succeeded
status: "Unknown"
status: "False"
retriesStatus:
- conditions:
- status: "False"
Expand All @@ -9831,6 +9742,7 @@ status:
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: mac
Expand All @@ -9855,6 +9767,7 @@ status:
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: linux
Expand All @@ -9868,13 +9781,18 @@ spec:
status:
conditions:
- type: Succeeded
status: "False"
status: "Unknown"
retriesStatus:
- conditions:
- status: "False"
type: Succeeded
`),
mustParseTaskRunWithObjectMeta(t,
taskRunObjectMeta("pr-platforms-and-browsers-1", "foo",
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: mac
Expand All @@ -9888,7 +9806,11 @@ spec:
status:
conditions:
- type: Succeeded
status: "False"
status: "Unknown"
retriesStatus:
- conditions:
- status: "False"
type: Succeeded
`),
},
prs: []*v1beta1.PipelineRun{
Expand Down Expand Up @@ -9989,6 +9911,7 @@ status:
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: linux
Expand All @@ -10013,6 +9936,7 @@ status:
"pr", "p", "platforms-and-browsers", false),
`
spec:
retries: 1
params:
- name: platform
value: mac
Expand Down
Loading

0 comments on commit 2087371

Please sign in to comment.