Skip to content

Commit

Permalink
skip leaked ENIs cleanup on unmanaged nodes
Browse files Browse the repository at this point in the history
  • Loading branch information
sushrk committed Nov 22, 2024
1 parent 79ae371 commit 942935d
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 18 deletions.
18 changes: 9 additions & 9 deletions pkg/node/manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,28 +123,28 @@ func NewNodeManager(logger logr.Logger, resourceManager resource.ResourceManager
}

func (m *manager) CheckNodeForLeakedENIs(nodeName string) {
managedNode, found := m.GetNode(nodeName)
if !found {
m.Log.Info("Node manager couldn't find the node for reconciliation cleanup", "NodeName", nodeName)
cachedNode, found := m.GetNode(nodeName)
if !found || !cachedNode.IsManaged() {
m.Log.V(1).Info("node not found or not managed by controller, skip reconciliation", "nodeName", nodeName)
return
}

// Only start a goroutine when need to
if time.Now().After(managedNode.GetNextReconciliationTime()) {
if time.Now().After(cachedNode.GetNextReconciliationTime()) {
go func() {
if resourceProvider, found := m.resourceManager.GetResourceProvider(config.ResourceNamePodENI); found {
foundLeakedENI := resourceProvider.ReconcileNode(nodeName)
if foundLeakedENI {
managedNode.SetReconciliationInterval(node.NodeInitialCleanupInterval)
cachedNode.SetReconciliationInterval(node.NodeInitialCleanupInterval)
} else {
interval := wait.Jitter(managedNode.GetReconciliationInterval(), 5)
interval := wait.Jitter(cachedNode.GetReconciliationInterval(), 5)
if interval > node.MaxNodeReconciliationInterval {
interval = node.MaxNodeReconciliationInterval
}
managedNode.SetReconciliationInterval(interval)
cachedNode.SetReconciliationInterval(interval)
}
managedNode.SetNextReconciliationTime(time.Now().Add(managedNode.GetReconciliationInterval()))
m.Log.Info("reconciled cleanup node for leaking branch interfaces", "NodeName", nodeName, "NextInterval", managedNode.GetReconciliationInterval(), "NextReconciliationTime", managedNode.GetNextReconciliationTime())
cachedNode.SetNextReconciliationTime(time.Now().Add(cachedNode.GetReconciliationInterval()))
m.Log.Info("reconciled node to cleanup leaked branch ENIs", "NodeName", nodeName, "NextInterval", cachedNode.GetReconciliationInterval(), "NextReconciliationTime", cachedNode.GetNextReconciliationTime())
} else {
// no SGP provider enabled
return
Expand Down
7 changes: 2 additions & 5 deletions pkg/provider/branch/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -276,20 +276,17 @@ func (b *branchENIProvider) ReconcileNode(nodeName string) bool {
log := b.log.WithValues("node", nodeName)
if !isPresent {
// return true to set the node next clean up asap since we don't know why trunk is missing
log.Info("no trunk ENI is pointing to the given node", "nodeName", nodeName)
log.V(1).Info("trunk ENI not found, requeue node", "nodeName", nodeName)
return true
}
podList, err := b.apiWrapper.PodAPI.ListPods(nodeName)
if err != nil {
// return true to set the node next cleanup asap since the LIST call may fail for other reasons
// we should assume that there are leaked resources need to be cleaned up
log.Error(err, "failed fo list pod")
log.Error(err, "failed to list pods, requeue node", "nodeName", nodeName)
return true
}
foundLeakedENI := trunkENI.Reconcile(podList.Items)

log.Info("completed reconcile node cleanup on branch ENIs", "nodeName", nodeName)

return foundLeakedENI
}

Expand Down
6 changes: 2 additions & 4 deletions pkg/provider/branch/trunk/trunk.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,9 +366,7 @@ func (t *trunkENI) Reconcile(pods []v1.Pod) bool {
t.deleteQueue = append(t.deleteQueue, eni)
}
delete(t.uidToBranchENIMap, uid)

t.log.Info("trunk controller found leaked branch ENI. the controller pushed leaked ENI to delete queue and deleted pod that doesn't exist anymore", "pod uid", uid,
"eni", branchENIs)
t.log.Info("leaked eni pushed to delete queue, deleted non-existing pod", "pod uid", uid, "eni", branchENIs)
}
}

Expand Down Expand Up @@ -503,7 +501,7 @@ func (t *trunkENI) PushBranchENIsToCoolDownQueue(UID string) {
branchENIs, isPresent := t.uidToBranchENIMap[UID]
if !isPresent {
t.log.Info("couldn't find Branch ENI in cache, it could have been released if pod"+
"succeeded/failed before being deleted", "UID", UID, "BranchENIs", branchENIs)
"succeeded/failed before being deleted", "UID", UID)
trunkENIOperationsErrCount.WithLabelValues("get_branch_from_cache").Inc()
return
}
Expand Down

0 comments on commit 942935d

Please sign in to comment.