Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

skip node reconciliation to delete leaked ENIs on un-managed nodes #495

Merged
merged 1 commit into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions pkg/node/manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,28 +123,28 @@ func NewNodeManager(logger logr.Logger, resourceManager resource.ResourceManager
}

func (m *manager) CheckNodeForLeakedENIs(nodeName string) {
managedNode, found := m.GetNode(nodeName)
if !found {
m.Log.Info("Node manager couldn't find the node for reconciliation cleanup", "NodeName", nodeName)
cachedNode, found := m.GetNode(nodeName)
if !found || !cachedNode.IsManaged() {
m.Log.V(1).Info("node not found or not managed by controller, skip reconciliation", "nodeName", nodeName)
return
}

// Only start a goroutine when need to
if time.Now().After(managedNode.GetNextReconciliationTime()) {
if time.Now().After(cachedNode.GetNextReconciliationTime()) {
go func() {
if resourceProvider, found := m.resourceManager.GetResourceProvider(config.ResourceNamePodENI); found {
foundLeakedENI := resourceProvider.ReconcileNode(nodeName)
if foundLeakedENI {
managedNode.SetReconciliationInterval(node.NodeInitialCleanupInterval)
cachedNode.SetReconciliationInterval(node.NodeInitialCleanupInterval)
} else {
interval := wait.Jitter(managedNode.GetReconciliationInterval(), 5)
interval := wait.Jitter(cachedNode.GetReconciliationInterval(), 5)
if interval > node.MaxNodeReconciliationInterval {
interval = node.MaxNodeReconciliationInterval
}
managedNode.SetReconciliationInterval(interval)
cachedNode.SetReconciliationInterval(interval)
}
managedNode.SetNextReconciliationTime(time.Now().Add(managedNode.GetReconciliationInterval()))
m.Log.Info("reconciled cleanup node for leaking branch interfaces", "NodeName", nodeName, "NextInterval", managedNode.GetReconciliationInterval(), "NextReconciliationTime", managedNode.GetNextReconciliationTime())
cachedNode.SetNextReconciliationTime(time.Now().Add(cachedNode.GetReconciliationInterval()))
m.Log.Info("reconciled node to cleanup leaked branch ENIs", "NodeName", nodeName, "NextInterval", cachedNode.GetReconciliationInterval(), "NextReconciliationTime", cachedNode.GetNextReconciliationTime())
} else {
// no SGP provider enabled
return
Expand Down
7 changes: 2 additions & 5 deletions pkg/provider/branch/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -276,20 +276,17 @@ func (b *branchENIProvider) ReconcileNode(nodeName string) bool {
log := b.log.WithValues("node", nodeName)
if !isPresent {
// return true to set the node next clean up asap since we don't know why trunk is missing
log.Info("no trunk ENI is pointing to the given node", "nodeName", nodeName)
log.V(1).Info("trunk ENI not found, requeue node", "nodeName", nodeName)
return true
}
podList, err := b.apiWrapper.PodAPI.ListPods(nodeName)
if err != nil {
// return true to set the node next cleanup asap since the LIST call may fail for other reasons
// we should assume that there are leaked resources need to be cleaned up
log.Error(err, "failed fo list pod")
log.Error(err, "failed to list pods, requeue node", "nodeName", nodeName)
return true
}
foundLeakedENI := trunkENI.Reconcile(podList.Items)

log.Info("completed reconcile node cleanup on branch ENIs", "nodeName", nodeName)

return foundLeakedENI
}

Expand Down
6 changes: 2 additions & 4 deletions pkg/provider/branch/trunk/trunk.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,9 +366,7 @@ func (t *trunkENI) Reconcile(pods []v1.Pod) bool {
t.deleteQueue = append(t.deleteQueue, eni)
}
delete(t.uidToBranchENIMap, uid)

t.log.Info("trunk controller found leaked branch ENI. the controller pushed leaked ENI to delete queue and deleted pod that doesn't exist anymore", "pod uid", uid,
"eni", branchENIs)
t.log.Info("leaked eni pushed to delete queue, deleted non-existing pod", "pod uid", uid, "eni", branchENIs)
}
}

Expand Down Expand Up @@ -503,7 +501,7 @@ func (t *trunkENI) PushBranchENIsToCoolDownQueue(UID string) {
branchENIs, isPresent := t.uidToBranchENIMap[UID]
if !isPresent {
t.log.Info("couldn't find Branch ENI in cache, it could have been released if pod"+
"succeeded/failed before being deleted", "UID", UID, "BranchENIs", branchENIs)
"succeeded/failed before being deleted", "UID", UID)
trunkENIOperationsErrCount.WithLabelValues("get_branch_from_cache").Inc()
return
}
Expand Down
Loading