From 6ef167bd3095b003d95671fd5e4214ff5238f680 Mon Sep 17 00:00:00 2001
From: ran
Date: Sat, 16 Mar 2024 01:33:41 +0800
Subject: [PATCH 1/2] impl compensation mechanism for driver.removeNotReadyTaint()

---
 pkg/driver/driver.go |  6 ++----
 pkg/driver/node.go   | 16 +++++++++++++++-
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/pkg/driver/driver.go b/pkg/driver/driver.go
index a3510beb7..f949b56c4 100644
--- a/pkg/driver/driver.go
+++ b/pkg/driver/driver.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"net"
 	"strings"
+	"time"
 
 	"github.com/container-storage-interface/spec/lib/go/csi"
 	"google.golang.org/grpc"
@@ -129,10 +130,7 @@ func (d *Driver) Run() error {
 
 	// Remove taint from node to indicate driver startup success
 	// This is done at the last possible moment to prevent race conditions or false positive removals
-	err = removeNotReadyTaint(cloud.DefaultKubernetesAPIClient)
-	if err != nil {
-		klog.ErrorS(err, "Unexpected failure when attempting to remove node taint(s)")
-	}
+	go tryRemoveNotReadyTaintUntilSucceed(cloud.DefaultKubernetesAPIClient, time.Second)
 
 	klog.Infof("Listening for connections on address: %#v", listener.Addr())
 	return d.srv.Serve(listener)
diff --git a/pkg/driver/node.go b/pkg/driver/node.go
index 48e9e0113..d3711907c 100644
--- a/pkg/driver/node.go
+++ b/pkg/driver/node.go
@@ -25,6 +25,7 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
+	"time"
 
 	"github.com/container-storage-interface/spec/lib/go/csi"
 	"github.com/kubernetes-sigs/aws-efs-csi-driver/pkg/cloud"
@@ -452,7 +453,7 @@ type JSONPatch struct {
 	Value interface{} `json:"value"`
 }
 
-// removeNotReadyTaint removes the taint ebs.csi.aws.com/agent-not-ready from the local node
+// removeNotReadyTaint removes the taint efs.csi.aws.com/agent-not-ready from the local node
 // This taint can be optionally applied by users to prevent startup race conditions such as
 // https://github.com/kubernetes/kubernetes/issues/95911
 func removeNotReadyTaint(k8sClient cloud.KubernetesAPIClient) error {
@@ -512,3 +513,16 @@ func removeNotReadyTaint(k8sClient cloud.KubernetesAPIClient) error {
 	klog.InfoS("Removed taint(s) from local node", "node", nodeName)
 	return nil
 }
+
+// tryRemoveNotReadyTaintUntilSucceed retries the taint removal every interval until it succeeds, because a single attempt may fail and the taint must eventually be removed
+func tryRemoveNotReadyTaintUntilSucceed(k8sClient cloud.KubernetesAPIClient, interval time.Duration) {
+	for {
+		err := removeNotReadyTaint(k8sClient)
+		if err == nil {
+			return
+		}
+
+		klog.ErrorS(err, "Unexpected failure when attempting to remove node taint(s)")
+		time.Sleep(interval)
+	}
+}

From 4e270c4ed723f50335fa44a928498c8c2446644b Mon Sep 17 00:00:00 2001
From: ran
Date: Sat, 16 Mar 2024 01:51:15 +0800
Subject: [PATCH 2/2] add test

---
 pkg/driver/driver.go    |  4 +++-
 pkg/driver/node.go      |  4 ++--
 pkg/driver/node_test.go | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/pkg/driver/driver.go b/pkg/driver/driver.go
index f949b56c4..6800384ad 100644
--- a/pkg/driver/driver.go
+++ b/pkg/driver/driver.go
@@ -130,7 +130,9 @@ func (d *Driver) Run() error {
 
 	// Remove taint from node to indicate driver startup success
 	// This is done at the last possible moment to prevent race conditions or false positive removals
-	go tryRemoveNotReadyTaintUntilSucceed(cloud.DefaultKubernetesAPIClient, time.Second)
+	go tryRemoveNotReadyTaintUntilSucceed(time.Second, func() error {
+		return removeNotReadyTaint(cloud.DefaultKubernetesAPIClient)
+	})
 
 	klog.Infof("Listening for connections on address: %#v", listener.Addr())
 	return d.srv.Serve(listener)
diff --git a/pkg/driver/node.go b/pkg/driver/node.go
index d3711907c..6b621e891 100644
--- a/pkg/driver/node.go
+++ b/pkg/driver/node.go
@@ -515,9 +515,9 @@ func removeNotReadyTaint(k8sClient cloud.KubernetesAPIClient) error {
 }
 
 // tryRemoveNotReadyTaintUntilSucceed retries the taint removal every interval until it succeeds, because a single attempt may fail and the taint must eventually be removed
-func tryRemoveNotReadyTaintUntilSucceed(k8sClient cloud.KubernetesAPIClient, interval time.Duration) {
+func tryRemoveNotReadyTaintUntilSucceed(interval time.Duration, removeFn func() error) {
 	for {
-		err := removeNotReadyTaint(k8sClient)
+		err := removeFn()
 		if err == nil {
 			return
 		}
diff --git a/pkg/driver/node_test.go b/pkg/driver/node_test.go
index 25b8dc9e6..34a92175d 100644
--- a/pkg/driver/node_test.go
+++ b/pkg/driver/node_test.go
@@ -18,6 +18,7 @@ package driver
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"os"
 	"reflect"
@@ -988,3 +989,35 @@ func getNodeMock(mockCtl *gomock.Controller, nodeName string, returnNode *corev1
 
 	return mockClient, mockNode
 }
+
+// TestTryRemoveNotReadyTaintUntilSucceed checks that the retry loop stops after the first successful attempt.
+func TestTryRemoveNotReadyTaintUntilSucceed(t *testing.T) {
+	// Two failures followed by a success: expect exactly three attempts.
+	{
+		i := 0
+		tryRemoveNotReadyTaintUntilSucceed(time.Millisecond, func() error {
+			i++
+			if i < 3 {
+				return errors.New("test")
+			}
+
+			return nil
+		})
+
+		if i != 3 {
+			t.Fatalf("expected 3 attempts, got %d", i)
+		}
+	}
+	// Immediate success: expect a single attempt and no retry.
+	{
+		i := 0
+		tryRemoveNotReadyTaintUntilSucceed(time.Millisecond, func() error {
+			i++
+			return nil
+		})
+
+		if i != 1 {
+			t.Fatalf("expected 1 attempt, got %d", i)
+		}
+	}
+}