Skip to content

Commit

Permalink
Fix Taskrun Failure for Preempted Pod of Taskrun
Browse files Browse the repository at this point in the history
This fixes the Taskrun failure for Preempted Pod of Taskrun. In CreatePod, we stop deep copying task spec since v46.0 which causes failure when we preempt pod. Killing the pod twice or thrice should fail the Taskrun because we overwrite information about task spec in the Taskrun status and when the pod is recreated, validation will fail. Like volume mount for workspace validation will fail in the case of PVC.
  • Loading branch information
khrm authored and tekton-robot committed Aug 1, 2023
1 parent 82a405a commit d65f3a8
Show file tree
Hide file tree
Showing 2 changed files with 178 additions and 0 deletions.
8 changes: 8 additions & 0 deletions pkg/reconciler/taskrun/taskrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -709,6 +709,14 @@ func (c *Reconciler) createPod(ctx context.Context, ts *v1.TaskSpec, tr *v1.Task
defer span.End()
logger := logging.FromContext(ctx)

// We don't want to mutate tr.Status.TaskSpec inside
// the createPod function. It's possible that pod will
// be killed before running and when rescheduling it
// will cause bugs. As this function could be called
// multiple times, we copy tr.Status.TaskSpec to help
// in scheduling Pod.
ts = ts.DeepCopy()

// By this time, params and workspaces should be propagated down so we can
// validate that all parameter variables and workspaces used in the TaskSpec are declared by the Task.
if validateErr := v1.ValidateUsageOfDeclaredParameters(ctx, ts.Steps, ts.Params); validateErr != nil {
Expand Down
170 changes: 170 additions & 0 deletions test/premption_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
//go:build e2e
// +build e2e

/*
Copyright 2023 The Tekton Authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package test

import (
"context"
"fmt"
"testing"
"time"

"github.com/tektoncd/pipeline/test/parse"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"knative.dev/pkg/apis"
knativetest "knative.dev/pkg/test"
"knative.dev/pkg/test/helpers"
)

// TestTaskRunPremption tests that Taskrun can run again
// after its pod has been prempted before completion.
func TestTaskRunPremption(t *testing.T) {
ctx := context.Background()
ctx, cancel := context.WithCancel(ctx)
defer cancel()
c, namespace := setup(ctx, t)
knativetest.CleanupOnInterrupt(func() { tearDown(ctx, t, c, namespace) }, t.Logf)
defer tearDown(ctx, t, c, namespace)

taskName := helpers.ObjectNameForTest(t)
if _, err := c.V1TaskClient.Create(ctx, parse.MustParseV1Task(t, fmt.Sprintf(`
metadata:
name: %s
spec:
params:
- name: MESSAGE
type: string
default: "Hello World"
stepTemplate:
computeResources:
limits:
cpu: 500
memory: 5000Gi
requests:
cpu: 500
memory: 5000Gi
steps:
- name: echo
image: ubuntu
script: |
#!/usr/bin/env bash
echo "Good Morning!" > $(workspaces.task-ws.path)
args:
- "$(params.MESSAGE)"
workspaces:
- name: task-ws
`, taskName)), metav1.CreateOptions{}); err != nil {
t.Fatalf("Failed to create task %q: %v", taskName, err)
}

pipelineRunName := helpers.ObjectNameForTest(t)
if _, err := c.V1PipelineRunClient.Create(ctx, parse.MustParseV1PipelineRun(t, fmt.Sprintf(`
metadata:
name: %s
spec:
pipelineSpec:
params:
- name: MORNING_GREETINGS
type: string
default: Good Morning!
tasks:
- name: test-pod-premption
workspaces:
- name: task-ws
workspace: ws
taskRef:
name: %s
params:
- name: MESSAGE
value: $(params.MORNING_GREETINGS)
workspaces:
- name: ws
params:
- name: MORNING_GREETINGS
value: Good Morning, Bob!
workspaces:
- name: ws
volumeClaimTemplate:
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 32Mi
`, pipelineRunName, taskName)), metav1.CreateOptions{}); err != nil {
t.Fatalf("Failed to create PipelineRun %q: %v", pipelineRunName, err)
}

t.Logf("Waiting for Pipelinerun %s in namespace %s to be started", pipelineRunName, namespace)
if err := WaitForPipelineRunState(ctx, c, pipelineRunName, timeout, Running(pipelineRunName), "PipelineRunRunning", v1Version); err != nil {
t.Fatalf("Error waiting for PipelineRun %s to be running: %s", pipelineRunName, err)
}

taskrunList, err := c.V1TaskRunClient.List(ctx, metav1.ListOptions{LabelSelector: "tekton.dev/pipelineRun=" + pipelineRunName})
if err != nil {
t.Fatalf("Error listing TaskRuns for PipelineRun %s: %s", pipelineRunName, err)
}

tr := &taskrunList.Items[0]

t.Logf("Waiting for TaskRun from PipelineRun %s in namespace %s to be in ExceededNodeResources", pipelineRunName, namespace)
err = WaitForTaskRunState(ctx, c, tr.Name, exceededNodeResources(tr.Name), "TaskRunRunning", v1Version)
if err != nil {
t.Errorf("Error waiting for TaskRun %s to be ExceededNodeResources: %v", tr.Name, err)
}

tr, err = c.V1TaskRunClient.Get(ctx, tr.Name, metav1.GetOptions{})
if err != nil {
t.Errorf("Error Getting TaskRun %s: %v", tr.Name, err)
}

t.Logf("Deleting Pod %s for TaskRun %s in namespace %s", tr.Status.PodName, tr.Name, namespace)
deletePolicy := metav1.DeletePropagationForeground
err = c.KubeClient.CoreV1().Pods(namespace).Delete(ctx, tr.Status.PodName, metav1.DeleteOptions{
PropagationPolicy: &deletePolicy,
})
if err != nil {
t.Errorf("Error Deleting Pod: %s of TaskRun %s: %v", tr.Status.PodName, tr.Name, err)
}

// Wait for pod deletion event to be handled by TaskrunReconciler
time.Sleep(5 * time.Second)
// Get the status of the Taskrun again.
err = WaitForTaskRunState(ctx, c, tr.Name, exceededNodeResources(tr.Name), "TaskRunRunning", v1Version)
if err != nil {
t.Errorf("Error waiting for TaskRun %s to be ExceededNodeResources: %v", tr.Name, err)
}
}

// exceededNodeResources provides a poll condition function that checks if the ConditionAccessor
// resource is currently exceededNodeResources.
func exceededNodeResources(name string) ConditionAccessorFn {
return func(ca apis.ConditionAccessor) (bool, error) {
c := ca.GetCondition(apis.ConditionSucceeded)
if c != nil {
if c.Status == corev1.ConditionTrue || c.Status == corev1.ConditionFalse {
return true, fmt.Errorf(`%q already finished`, name)
} else if c.Status == corev1.ConditionUnknown && (c.Reason == "ExceededNodeResources") {
return true, nil
}
}
return false, nil
}
}

0 comments on commit d65f3a8

Please sign in to comment.