Skip to content

Commit

Permalink
Fix race condition in PVC deletion
Browse files Browse the repository at this point in the history
fixes [#7138][#7138]. In `coschedule:pipelineruns` mode, the `pvcs` created by `VolumeClaimTemplate` should be automatically deleted when the owning `pipelinerun` is completed.
To delete a `pvc` used by a pod (pipelinerun), the [kubernetes.io/pvc-protection][finalizer] finalizer must be removed.

Prior to this commit, the Tekton reconciler attempted to remove the `kubernetes.io/pvc-protection` finalizer first and then deletes the created pvcs. This results
in race condition since `PVCProtectionController` tries to add back the finalizer if the `pvc` is in `bounded` status . If the add back action happens before the PVC deletion call is completed,
the `PVC` will be left in `terminating` status.

This commit changes the reconciler to delete the `PVC` first and then removes the `finalizer` to fix the race condition.

This commit removes `TestPurgeFinalizerAndDeletePVCForWorkspace` UT, since the finalizer behavior cannot be mocked when a PVC is deleted.
This commit also adds integration test to cover this scenario.

/kind bug

[#7138]: #7138
[finalizer]: https://kubernetes.io/docs/concepts/storage/persistent-volumes/#storage-object-in-use-protection
  • Loading branch information
QuanZhang-William authored and tekton-robot committed Oct 3, 2023
1 parent cbabe7f commit 5a68859
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 15 deletions.
25 changes: 22 additions & 3 deletions pkg/reconciler/pipelinerun/affinity_assistant_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,7 @@ func TestThatAffinityAssistantNameIsNoLongerThan53(t *testing.T) {
t.Errorf("affinity assistant name can not be longer than 53 chars")
}
}

func TestCleanupAffinityAssistants_Success(t *testing.T) {
workspaces := []v1.WorkspaceBinding{
{
Expand Down Expand Up @@ -866,6 +867,20 @@ func TestCleanupAffinityAssistants_Success(t *testing.T) {
_, c, _ := seedTestData(data)
ctx := cfgtesting.SetFeatureFlags(context.Background(), t, tc.cfgMap)

// mocks `kubernetes.io/pvc-protection` finalizer behavior by adding DeletionTimestamp when deleting pvcs with the finalizer
// see details in: https://kubernetes.io/docs/concepts/overview/working-with-objects/finalizers/#how-finalizers-work
if tc.aaBehavior == aa.AffinityAssistantPerPipelineRun {
for _, pvc := range pvcs {
pvc.DeletionTimestamp = &metav1.Time{
Time: metav1.Now().Time,
}
c.KubeClientSet.CoreV1().(*fake.FakeCoreV1).PrependReactor("delete", "persistentvolumeclaims",
func(action testing2.Action) (handled bool, ret runtime.Object, err error) {
return true, pvc, nil
})
}
}

// call clean up
err := c.cleanupAffinityAssistantsAndPVCs(ctx, pr)
if err != nil {
Expand All @@ -889,9 +904,13 @@ func TestCleanupAffinityAssistants_Success(t *testing.T) {
t.Errorf("unexpected err when getting PersistentVolumeClaims, err: %v", err)
}
} else {
_, err = c.KubeClientSet.CoreV1().PersistentVolumeClaims(pr.Namespace).Get(ctx, expectedPVCName, metav1.GetOptions{})
if !apierrors.IsNotFound(err) {
t.Errorf("failed to clean up PersistentVolumeClaim")
pvc, err := c.KubeClientSet.CoreV1().PersistentVolumeClaims(pr.Namespace).Get(ctx, expectedPVCName, metav1.GetOptions{})
if err != nil {
t.Fatalf("unexpect err when getting PersistentVolumeClaims, err: %v", err)
}
// validate the finalizer is removed, which mocks the pvc is deleted properly
if len(pvc.Finalizers) > 0 {
t.Errorf("pvc %s kubernetes.io/pvc-protection finalizer is not removed properly", pvc.Name)
}
}
}
Expand Down
14 changes: 7 additions & 7 deletions pkg/reconciler/volumeclaim/pvchandler.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func (c *defaultPVCHandler) CreatePVCFromVolumeClaimTemplate(ctx context.Context
return nil
}

// PurgeFinalizerAndDeletePVCForWorkspace purges the `kubernetes.io/pvc-protection` finalizer protection of the given pvc and then deletes it.
// PurgeFinalizerAndDeletePVCForWorkspace deletes pvcs and then purges the `kubernetes.io/pvc-protection` finalizer protection.
// Purging the `kubernetes.io/pvc-protection` finalizer allows the pvc to be deleted even when it is referenced by a taskrun pod.
// See mode details in https://kubernetes.io/docs/concepts/storage/persistent-volumes/#storage-object-in-use-protection.
func (c *defaultPVCHandler) PurgeFinalizerAndDeletePVCForWorkspace(ctx context.Context, pvcName, namespace string) error {
Expand Down Expand Up @@ -113,18 +113,18 @@ func (c *defaultPVCHandler) PurgeFinalizerAndDeletePVCForWorkspace(ctx context.C
return fmt.Errorf("failed to marshal jsonpatch: %w", err)
}

// patch the existing PVC to update the finalizers
_, err = c.clientset.CoreV1().PersistentVolumeClaims(namespace).Patch(ctx, pvcName, types.JSONPatchType, removeFinalizerBytes, metav1.PatchOptions{})
if err != nil {
return fmt.Errorf("failed to patch the PVC %s: %w", pvcName, err)
}

// delete PVC
err = c.clientset.CoreV1().PersistentVolumeClaims(namespace).Delete(ctx, pvcName, metav1.DeleteOptions{})
if err != nil {
return fmt.Errorf("failed to delete the PVC %s: %w", pvcName, err)
}

// remove finalizer
_, err = c.clientset.CoreV1().PersistentVolumeClaims(namespace).Patch(ctx, pvcName, types.JSONPatchType, removeFinalizerBytes, metav1.PatchOptions{})
if err != nil {
return fmt.Errorf("failed to patch the PVC %s: %w", pvcName, err)
}

return nil
}

Expand Down
25 changes: 20 additions & 5 deletions pkg/reconciler/volumeclaim/pvchandler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ import (
"go.uber.org/zap"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
fakek8s "k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/kubernetes/typed/core/v1/fake"
client_go_testing "k8s.io/client-go/testing"
)

Expand Down Expand Up @@ -263,15 +263,30 @@ func TestPurgeFinalizerAndDeletePVCForWorkspace(t *testing.T) {
t.Fatalf("unexpected error when creating PVC: %v", err)
}

// mocks `kubernetes.io/pvc-protection` finalizer behavior by adding DeletionTimestamp when deleting pvcs with the finalizer
// see details in: https://kubernetes.io/docs/concepts/overview/working-with-objects/finalizers/#how-finalizers-work
pvc.DeletionTimestamp = &metav1.Time{
Time: metav1.Now().Time,
}
kubeClientSet.CoreV1().(*fake.FakeCoreV1).PrependReactor("delete", "persistentvolumeclaims",
func(action client_go_testing.Action) (handled bool, ret runtime.Object, err error) {
return true, pvc, nil
})

// call PurgeFinalizerAndDeletePVCForWorkspace to delete pvc
// note that the pvcs are not actually deleted in the unit test due to the mock limitation of fakek8s.NewSimpleClientset();
// full pvc lifecycle is tested in TestAffinityAssistant_PerPipelineRun integration test
pvcHandler := defaultPVCHandler{kubeClientSet, zap.NewExample().Sugar()}
if err := pvcHandler.PurgeFinalizerAndDeletePVCForWorkspace(ctx, pvcName, namespace); err != nil {
t.Fatalf("unexpected error when calling PurgeFinalizerAndDeletePVCForWorkspace: %v", err)
}

// validate pvc is deleted
_, err := kubeClientSet.CoreV1().PersistentVolumeClaims(namespace).Get(ctx, pvcName, metav1.GetOptions{})
if !apierrors.IsNotFound(err) {
t.Errorf("expected a NotFound response of PersistentVolumeClaims, got: %v", err)
// validate the `kubernetes.io/pvc-protection` finalizer is removed
pvc, err := kubeClientSet.CoreV1().PersistentVolumeClaims(namespace).Get(ctx, pvcName, metav1.GetOptions{})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(pvc.Finalizers) > 0 {
t.Errorf("pvc %s kubernetes.io/pvc-protection finalizer is not removed properly", pvcName)
}
}
6 changes: 6 additions & 0 deletions test/affinity_assistant_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,12 @@ spec:
t.Errorf("Error waiting for PipelineRun to finish: %s", err)
}

// wait and check PVCs are deleted
t.Logf("Waiting for PVC in namespace %s to delete", namespace)
if err := WaitForPVCIsDeleted(ctx, c, timeout, prName, namespace, "PVCDeleted"); err != nil {
t.Errorf("Error waiting for PVC to be deleted: %s", err)
}

// validate PipelineRun pods sharing the same PVC are scheduled to the same node
podNames := []string{fmt.Sprintf("%v-foo-pod", prName), fmt.Sprintf("%v-bar-pod", prName), fmt.Sprintf("%v-double-ws-pod", prName), fmt.Sprintf("%v-no-ws-pod", prName)}
validatePodAffinity(t, ctx, podNames, namespace, c.KubeClient)
Expand Down
25 changes: 25 additions & 0 deletions test/wait.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,31 @@ func WaitForPodState(ctx context.Context, c *clients, name string, namespace str
})
}

// WaitForPVCIsDeleted polls the number of the PVC in the namespace from client every
// interval until all the PVCs in the namespace are deleted. It returns an
// error or timeout. desc will be used to name the metric that is emitted to
// track how long it took to delete all the PVCs in the namespace.
func WaitForPVCIsDeleted(ctx context.Context, c *clients, polltimeout time.Duration, name, namespace, desc string) error {
metricName := fmt.Sprintf("WaitForPVCIsDeleted/%s/%s", name, desc)
_, span := trace.StartSpan(context.Background(), metricName)
defer span.End()

ctx, cancel := context.WithTimeout(ctx, polltimeout)
defer cancel()

return pollImmediateWithContext(ctx, func() (bool, error) {
pvcList, err := c.KubeClient.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{})
if err != nil {
return true, err
}
if len(pvcList.Items) > 0 {
return false, nil
}

return true, nil
})
}

// WaitForPipelineRunState polls the status of the PipelineRun called name from client every
// interval until inState returns `true` indicating it is done, returns an
// error or timeout. desc will be used to name the metric that is emitted to
Expand Down

0 comments on commit 5a68859

Please sign in to comment.