diff --git a/pkg/driver/driver.go b/pkg/driver/driver.go index a3510beb7..6800384ad 100644 --- a/pkg/driver/driver.go +++ b/pkg/driver/driver.go @@ -20,6 +20,7 @@ import ( "context" "net" "strings" + "time" "github.com/container-storage-interface/spec/lib/go/csi" "google.golang.org/grpc" @@ -129,10 +130,9 @@ func (d *Driver) Run() error { // Remove taint from node to indicate driver startup success // This is done at the last possible moment to prevent race conditions or false positive removals - err = removeNotReadyTaint(cloud.DefaultKubernetesAPIClient) - if err != nil { - klog.ErrorS(err, "Unexpected failure when attempting to remove node taint(s)") - } + go tryRemoveNotReadyTaintUntilSucceed(time.Second, func() error { + return removeNotReadyTaint(cloud.DefaultKubernetesAPIClient) + }) klog.Infof("Listening for connections on address: %#v", listener.Addr()) return d.srv.Serve(listener) diff --git a/pkg/driver/node.go b/pkg/driver/node.go index d080d0d47..b730bff0d 100644 --- a/pkg/driver/node.go +++ b/pkg/driver/node.go @@ -25,6 +25,7 @@ import ( "path/filepath" "strconv" "strings" + "time" "github.com/container-storage-interface/spec/lib/go/csi" "github.com/kubernetes-sigs/aws-efs-csi-driver/pkg/cloud" @@ -464,7 +465,7 @@ type JSONPatch struct { Value interface{} `json:"value"` } -// removeNotReadyTaint removes the taint ebs.csi.aws.com/agent-not-ready from the local node +// removeNotReadyTaint removes the taint efs.csi.aws.com/agent-not-ready from the local node // This taint can be optionally applied by users to prevent startup race conditions such as // https://github.com/kubernetes/kubernetes/issues/95911 func removeNotReadyTaint(k8sClient cloud.KubernetesAPIClient) error { @@ -524,3 +525,16 @@ func removeNotReadyTaint(k8sClient cloud.KubernetesAPIClient) error { klog.InfoS("Removed taint(s) from local node", "node", nodeName) return nil } + +// tryRemoveNotReadyTaintUntilSucceed keeps retrying until taint removal succeeds, because a single attempt may fail; this makes sure the taint is eventually removed
+func tryRemoveNotReadyTaintUntilSucceed(interval time.Duration, removeFn func() error) {
+	if interval <= 0 {
+		interval = time.Second // time.Sleep returns at once for d <= 0, which would busy-loop
+	}
+	// Retry without an attempt limit; the driver starts this in its own goroutine,
+	// so blocking here does not delay serving.
+	for err := removeFn(); err != nil; err = removeFn() {
+		klog.ErrorS(err, "Unexpected failure when attempting to remove node taint(s)")
+		time.Sleep(interval)
+	}
+}
diff --git a/pkg/driver/node_test.go b/pkg/driver/node_test.go
index 4e578fce9..f1b01cb73 100644
--- a/pkg/driver/node_test.go
+++ b/pkg/driver/node_test.go
@@ -18,6 +18,7 @@ package driver
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"os"
 	"reflect"
@@ -1012,3 +1013,34 @@ func getNodeMock(mockCtl *gomock.Controller, nodeName string, returnNode *corev1
 
 	return mockClient, mockNode
 }
+
+// TestTryRemoveNotReadyTaintUntilSucceed checks that the retry helper calls
+// removeFn until it first returns nil and then stops.
+func TestTryRemoveNotReadyTaintUntilSucceed(t *testing.T) {
+	testCases := []struct {
+		name      string
+		failures  int // number of leading calls that return an error
+		wantCalls int
+	}{
+		{name: "succeeds after two failures", failures: 2, wantCalls: 3},
+		{name: "succeeds on first attempt", failures: 0, wantCalls: 1},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			calls := 0
+			// A millisecond interval keeps the retries real without slowing the suite.
+			tryRemoveNotReadyTaintUntilSucceed(time.Millisecond, func() error {
+				calls++
+				if calls <= tc.failures {
+					return errors.New("test")
+				}
+				return nil
+			})
+
+			if calls != tc.wantCalls {
+				t.Fatalf("removeFn called %d times, want %d", calls, tc.wantCalls)
+			}
+		})
+	}
+}