Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[backport] fix: [Linux] [NPM] handle #2977 for netpols without cidr #3006

Merged
merged 1 commit into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
vendor/

# Binaries
out/*
output/*
Expand Down
85 changes: 60 additions & 25 deletions npm/pkg/dataplane/dataplane.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"errors"
"fmt"
"strings"
"sync"
"time"

"github.com/Azure/azure-container-networking/common"
Expand All @@ -18,12 +19,12 @@ import (
const (
reconcileDuration = time.Duration(5 * time.Minute)

contextBackground = "BACKGROUND"
contextApplyDP = "APPLY-DP"
contextAddNetPol = "ADD-NETPOL"
contextAddNetPolBootup = "BOOTUP-ADD-NETPOL"
contextAddNetPolCIDRPrecaution = "ADD-NETPOL-CIDR-PRECAUTION"
contextDelNetPol = "DEL-NETPOL"
contextBackground = "BACKGROUND"
contextApplyDP = "APPLY-DP"
contextAddNetPol = "ADD-NETPOL"
contextAddNetPolBootup = "BOOTUP-ADD-NETPOL"
contextAddNetPolPrecaution = "ADD-NETPOL-PRECAUTION"
contextDelNetPol = "DEL-NETPOL"
)

var (
Expand All @@ -48,6 +49,11 @@ type Config struct {
*policies.PolicyManagerCfg
}

type removePolicyInfo struct {
sync.Mutex
previousRemovePolicyIPSetsFailed bool
}

type DataPlane struct {
*Config
applyInBackground bool
Expand All @@ -64,7 +70,10 @@ type DataPlane struct {
endpointQuery *endpointQuery
applyInfo *applyInfo
netPolQueue *netPolQueue
stopChannel <-chan struct{}
// removePolicyInfo tracks when a policy was removed yet had ApplyIPSet failures.
// This field is only relevant for Linux.
removePolicyInfo removePolicyInfo
stopChannel <-chan struct{}
}

func NewDataPlane(nodeName string, ioShim *common.IOShim, cfg *Config, stopChannel <-chan struct{}) (*DataPlane, error) {
Expand Down Expand Up @@ -335,6 +344,9 @@ func (dp *DataPlane) applyDataPlaneNow(context string) error {
}
klog.Infof("[DataPlane] [ApplyDataPlane] [%s] finished applying ipsets", context)

// see comment in RemovePolicy() for why this is here
dp.setRemovePolicyFailure(false)

if dp.applyInBackground {
dp.applyInfo.Lock()
dp.applyInfo.numBatches = 0
Expand Down Expand Up @@ -472,26 +484,17 @@ func (dp *DataPlane) addPolicies(netPols []*policies.NPMNetworkPolicy) error {
}
}

if !util.IsWindowsDP() {
for _, netPol := range netPols {
if !(netPol.HasCIDRRules() && dp.ipsetMgr.PreviousApplyFailed()) {
continue
}

if inBootupPhase {
// this should never happen because bootup phase is for windows, but just in case, we don't want to applyDataplaneNow() or else there will be a deadlock on dp.applyInfo
msg := fmt.Sprintf("[DataPlane] [%s] at risk of improperly applying a CIDR policy which is removed then readded", contextAddNetPolCIDRPrecaution)
klog.Warning(msg)
metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, msg)
break
}

if dp.hadRemovePolicyFailure() {
if inBootupPhase {
// this should never happen because bootup phase is for windows, but just in case, we don't want to applyDataplaneNow() or else there will be a deadlock on dp.applyInfo
msg := fmt.Sprintf("[DataPlane] [%s] at risk of improperly applying a policy which is removed then readded", contextAddNetPolPrecaution)
klog.Warning(msg)
metrics.SendErrorLogAndMetric(util.DaemonDataplaneID, msg)
} else {
// prevent #2977
if err := dp.applyDataPlaneNow(contextAddNetPolCIDRPrecaution); err != nil {
if err := dp.applyDataPlaneNow(contextAddNetPolPrecaution); err != nil {
return err // nolint:wrapcheck // unnecessary to wrap error since the provided context is included in the error
}

break
}
}

Expand Down Expand Up @@ -531,6 +534,9 @@ func (dp *DataPlane) addPolicies(netPols []*policies.NPMNetworkPolicy) error {
}
klog.Infof("[DataPlane] [%s] finished applying ipsets", contextAddNetPolBootup)

// see comment in RemovePolicy() for why this is here
dp.setRemovePolicyFailure(false)

dp.applyInfo.numBatches = 0
}

Expand Down Expand Up @@ -627,7 +633,16 @@ func (dp *DataPlane) RemovePolicy(policyKey string) error {
return err
}

return dp.applyDataPlaneNow(contextApplyDP)
if err := dp.applyDataPlaneNow(contextDelNetPol); err != nil {
// Failed to apply IPSets while removing this policy.
// Consider this removepolicy call a failure until apply IPSets is successful.
// Related to #2977
klog.Info("[DataPlane] remove policy has failed to apply ipsets. setting remove policy failure")
dp.setRemovePolicyFailure(true)
return err // nolint:wrapcheck // unnecessary to wrap error since the provided context is included in the error
}

return nil
}

// UpdatePolicy takes in updated policy object, calculates the delta and applies changes
Expand Down Expand Up @@ -749,3 +764,23 @@ func (dp *DataPlane) deleteIPSetsAndReferences(sets []*ipsets.TranslatedIPSet, n
}
return nil
}

func (dp *DataPlane) setRemovePolicyFailure(failed bool) {
if util.IsWindowsDP() {
return
}

dp.removePolicyInfo.Lock()
defer dp.removePolicyInfo.Unlock()
dp.removePolicyInfo.previousRemovePolicyIPSetsFailed = failed
}

func (dp *DataPlane) hadRemovePolicyFailure() bool {
if util.IsWindowsDP() {
return false
}

dp.removePolicyInfo.Lock()
defer dp.removePolicyInfo.Unlock()
return dp.removePolicyInfo.previousRemovePolicyIPSetsFailed
}
7 changes: 0 additions & 7 deletions npm/pkg/dataplane/ipsets/ipsetmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,13 +79,6 @@ func NewIPSetManager(iMgrCfg *IPSetManagerCfg, ioShim *common.IOShim) *IPSetMana
}
}

// PreviousApplyFailed is only relevant for Linux right now since Windows doesn't track failures
func (iMgr *IPSetManager) PreviousApplyFailed() bool {
iMgr.Lock()
defer iMgr.Unlock()
return iMgr.consecutiveApplyFailures > 0
}

/*
Reconcile removes empty/unreferenced sets from the cache.
For ApplyAllIPSets mode, those sets are added to the toDeleteCache.
Expand Down
Loading