From 05e2ccb5127ef745183312b0c8089a1949853e8d Mon Sep 17 00:00:00 2001 From: changluyi <47097611+changluyi@users.noreply.github.com> Date: Thu, 20 Jun 2024 16:11:08 +0800 Subject: [PATCH] fix kube-ovn-cni crash for newly added nodes, due to old legacy event in deleteNodeQueue (#4194) Signed-off-by: clyi --- pkg/controller/controller.go | 16 +++++++++------- pkg/controller/node.go | 15 +++++++++++++++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 3aaeaccd172..9ffda1dc996 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -67,6 +67,7 @@ type Controller struct { addOrUpdatePodQueue workqueue.RateLimitingInterface deletePodQueue workqueue.RateLimitingInterface deletingPodObjMap *sync.Map + deletingNodeObjMap *sync.Map updatePodSecurityQueue workqueue.RateLimitingInterface podKeyMutex keymutex.KeyMutex @@ -298,13 +299,14 @@ func Run(ctx context.Context, config *Configuration) { numKeyLocks = config.WorkerNum * 2 } controller := &Controller{ - config: config, - vpcs: &sync.Map{}, - podSubnetMap: &sync.Map{}, - deletingPodObjMap: &sync.Map{}, - ovnLegacyClient: ovs.NewLegacyClient(config.OvnTimeout), - ipam: ovnipam.NewIPAM(), - namedPort: NewNamedPort(), + config: config, + vpcs: &sync.Map{}, + podSubnetMap: &sync.Map{}, + deletingPodObjMap: &sync.Map{}, + deletingNodeObjMap: &sync.Map{}, + ovnLegacyClient: ovs.NewLegacyClient(config.OvnTimeout), + ipam: ovnipam.NewIPAM(), + namedPort: NewNamedPort(), vpcsLister: vpcInformer.Lister(), vpcSynced: vpcInformer.Informer().HasSynced, diff --git a/pkg/controller/node.go b/pkg/controller/node.go index 3ba58da5f4d..54fa1635ed8 100644 --- a/pkg/controller/node.go +++ b/pkg/controller/node.go @@ -75,6 +75,9 @@ func (c *Controller) enqueueDeleteNode(obj interface{}) { return } klog.V(3).Infof("enqueue delete node %s", key) + + n := obj.(*v1.Node) + c.deletingNodeObjMap.Store(key, n) c.deleteNodeQueue.Add(key) } @@ -173,6 
+176,7 @@ func (c *Controller) processNextDeleteNodeWorkItem() bool { return fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error()) } c.deleteNodeQueue.Forget(obj) + c.deletingNodeObjMap.Delete(key) return nil }(obj) if err != nil { @@ -472,6 +476,17 @@ func (c *Controller) handleDeleteNode(key string) error { defer func() { _ = c.nodeKeyMutex.UnlockKey(key) }() klog.Infof("handle delete node %s", key) + nodeObj, ok := c.deletingNodeObjMap.Load(key) + if !ok { + return nil + } + node := nodeObj.(*v1.Node) + n, _ := c.nodesLister.Get(key) + if n != nil && n.UID != node.UID { + klog.Warningf("Node %s is adding, skip the node delete handler, but it may leave some gc resources behind", key) + return nil + } + portName := util.NodeLspName(key) klog.Infof("delete logical switch port %s", portName) if err := c.OVNNbClient.DeleteLogicalSwitchPort(portName); err != nil {