Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: paginate DescribeNetworkInterfaces with deep filters #375

Merged
merged 3 commits into from
Feb 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ func main() {
var healthCheckTimeout int
var enableWindowsPrefixDelegation bool
var region string
var vpcID string

flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080",
"The address the metric endpoint binds to.")
Expand Down Expand Up @@ -141,6 +142,7 @@ func main() {
flag.BoolVar(&enableWindowsPrefixDelegation, "enable-windows-prefix-delegation", false,
"Enable the feature flag for Windows prefix delegation")
flag.StringVar(&region, "aws-region", "", "The aws region of the k8s cluster")
flag.StringVar(&vpcID, "vpc-id", "", "The vpc-id where EKS cluster is deployed")

flag.Parse()

Expand Down Expand Up @@ -183,6 +185,11 @@ func main() {
os.Exit(1)
}

if vpcID == "" {
sushrk marked this conversation as resolved.
Show resolved Hide resolved
setupLog.Error(fmt.Errorf("vpc-id is a required parameter"), "unable to start the controller")
os.Exit(1)
}

// Profiler disabled by default, to enable set the enableProfiling argument
if enableProfiling {
// To use the profiler - https://golang.org/pkg/net/http/pprof/
Expand Down Expand Up @@ -336,6 +343,7 @@ func main() {
EC2Wrapper: ec2Wrapper,
ClusterName: clusterName,
Log: ctrl.Log.WithName("eni cleaner"),
VPCID: vpcID,
}).SetupWithManager(ctx, mgr, healthzHandler); err != nil {
setupLog.Error(err, "unable to start eni cleaner")
os.Exit(1)
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

116 changes: 64 additions & 52 deletions pkg/aws/ec2/api/eni_cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ type ENICleaner struct {
EC2Wrapper EC2Wrapper
ClusterName string
Log logr.Logger
VPCID string

availableENIs map[string]struct{}
shutdown bool
Expand All @@ -42,16 +43,22 @@ type ENICleaner struct {
}

var (
vpcCniLeakedENICleanupCnt = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "vpc_cni_created_leaked_eni_cleanup_count",
Help: "The number of leaked ENIs created by VPC-CNI that is cleaned up by the controller",
vpccniAvailableENICnt = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "vpc_cni_created_available_eni_count",
Help: "The number of available ENIs created by VPC-CNI that controller will try to delete in each cleanup cycle",
},
)
vpcrcLeakedENICleanupCnt = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "vpc_rc_created_leaked_eni_cleanup_count",
Help: "The number of leaked ENIs created by VPC-RC that is cleaned up by the controller",
vpcrcAvailableENICnt = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "vpc_rc_created_available_eni_count",
Help: "The number of available ENIs created by VPC-RC that controller will try to delete in each cleanup cycle",
},
)
leakedENICnt = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "leaked_eni_count",
Help: "The number of available ENIs that failed to be deleted by the controller in each cleanup cycle",
},
)
)
Expand Down Expand Up @@ -101,6 +108,9 @@ func (e *ENICleaner) Start(ctx context.Context) error {
// interval between cycle 1 and 2 and hence can be safely deleted. And we can also conclude that Interface 1 was
// created but not attached at the the time when 1st cycle ran and hence it should not be deleted.
func (e *ENICleaner) cleanUpAvailableENIs() {
vpcrcAvailableCount := 0
vpccniAvailableCount := 0
leakedENICount := 0
describeNetworkInterfaceIp := &ec2.DescribeNetworkInterfacesInput{
Filters: []*ec2.Filter{
{
Expand All @@ -116,63 +126,65 @@ func (e *ENICleaner) cleanUpAvailableENIs() {
Values: aws.StringSlice([]string{config.NetworkInterfaceOwnerTagValue,
config.NetworkInterfaceOwnerVPCCNITagValue}),
},
{
Name: aws.String("vpc-id"),
Values: []*string{aws.String(e.VPCID)},
},
},
}

availableENIs := make(map[string]struct{})

for {
describeNetworkInterfaceOp, err := e.EC2Wrapper.DescribeNetworkInterfaces(describeNetworkInterfaceIp)
if err != nil {
e.Log.Error(err, "failed to describe network interfaces, will retry")
return
}

for _, networkInterface := range describeNetworkInterfaceOp.NetworkInterfaces {
if _, exists := e.availableENIs[*networkInterface.NetworkInterfaceId]; exists {
// Increment promethues metrics for number of leaked ENIs cleaned up
if tagIdx := slices.IndexFunc(networkInterface.TagSet, func(tag *ec2.Tag) bool {
return *tag.Key == config.NetworkInterfaceOwnerTagKey
}); tagIdx != -1 {
switch *networkInterface.TagSet[tagIdx].Value {
case config.NetworkInterfaceOwnerTagValue:
vpcrcLeakedENICleanupCnt.Inc()
case config.NetworkInterfaceOwnerVPCCNITagValue:
vpcCniLeakedENICleanupCnt.Inc()
default:
// We will not hit this case as we only filter for above two tag values, adding it for any future use cases
e.Log.Info("found available ENI not created by VPC-CNI/VPC-RC")
}
}
networkInterfaces, err := e.EC2Wrapper.DescribeNetworkInterfacesPages(describeNetworkInterfaceIp)
if err != nil {
e.Log.Error(err, "failed to describe network interfaces, cleanup will be retried in next cycle")
return
}

// The ENI in available state has been sitting for at least the eni clean up interval and it should
// be removed
_, err := e.EC2Wrapper.DeleteNetworkInterface(&ec2.DeleteNetworkInterfaceInput{
NetworkInterfaceId: networkInterface.NetworkInterfaceId,
})
if err != nil {
// Log and continue, if the ENI is still present it will be cleaned up in next 2 cycles
e.Log.Error(err, "failed to delete the dangling network interface",
"id", *networkInterface.NetworkInterfaceId)
for _, networkInterface := range networkInterfaces {
if _, exists := e.availableENIs[*networkInterface.NetworkInterfaceId]; exists {
// Increment promethues metrics for number of leaked ENIs cleaned up
if tagIdx := slices.IndexFunc(networkInterface.TagSet, func(tag *ec2.Tag) bool {
return *tag.Key == config.NetworkInterfaceOwnerTagKey
}); tagIdx != -1 {
switch *networkInterface.TagSet[tagIdx].Value {
case config.NetworkInterfaceOwnerTagValue:
vpcrcAvailableCount += 1
case config.NetworkInterfaceOwnerVPCCNITagValue:
vpccniAvailableCount += 1
default:
// We should not hit this case as we only filter for relevant tag values, log error and continue if unexpected ENIs found
e.Log.Error(fmt.Errorf("found available ENI not created by VPC-CNI/VPC-RC"), "eniID", *networkInterface.NetworkInterfaceId)
continue
}
e.Log.Info("deleted dangling ENI successfully",
"eni id", networkInterface.NetworkInterfaceId)
} else {
// Seeing the ENI for the first time, add it to the new list of available network interfaces
availableENIs[*networkInterface.NetworkInterfaceId] = struct{}{}
e.Log.V(1).Info("adding eni to to the map of available ENIs, will be removed if present in "+
"next run too", "id", *networkInterface.NetworkInterfaceId)
}
}

if describeNetworkInterfaceOp.NextToken == nil {
break
// The ENI in available state has been sitting for at least the eni clean up interval and it should
// be removed
_, err := e.EC2Wrapper.DeleteNetworkInterface(&ec2.DeleteNetworkInterfaceInput{
NetworkInterfaceId: networkInterface.NetworkInterfaceId,
})
if err != nil {
leakedENICount += 1
// Log and continue, if the ENI is still present it will be cleaned up in next 2 cycles
e.Log.Error(err, "failed to delete the dangling network interface",
"id", *networkInterface.NetworkInterfaceId)
continue
}
e.Log.Info("deleted dangling ENI successfully",
"eni id", networkInterface.NetworkInterfaceId)
} else {
// Seeing the ENI for the first time, add it to the new list of available network interfaces
availableENIs[*networkInterface.NetworkInterfaceId] = struct{}{}
e.Log.V(1).Info("adding eni to to the map of available ENIs, will be removed if present in "+
"next run too", "id", *networkInterface.NetworkInterfaceId)
}

describeNetworkInterfaceIp.NextToken = describeNetworkInterfaceOp.NextToken
}

// Update leaked ENI metrics
vpcrcAvailableENICnt.Set(float64(vpcrcAvailableCount))
vpccniAvailableENICnt.Set(float64(vpccniAvailableCount))
leakedENICnt.Set(float64(leakedENICount))
// Set the available ENIs to the list of ENIs seen in the current cycle
e.availableENIs = availableENIs
}
27 changes: 15 additions & 12 deletions pkg/aws/ec2/api/eni_cleanup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ var (
mockNetworkInterfaceId2 = "eni-000000000000001"
mockNetworkInterfaceId3 = "eni-000000000000002"

mockVPCID = "vpc-0000000000000000"

mockDescribeNetworkInterfaceIp = &ec2.DescribeNetworkInterfacesInput{
Filters: []*ec2.Filter{
{
Expand All @@ -52,19 +54,19 @@ var (
Values: aws.StringSlice([]string{config.NetworkInterfaceOwnerTagValue,
config.NetworkInterfaceOwnerVPCCNITagValue}),
},
{
Name: aws.String("vpc-id"),
Values: []*string{aws.String(mockVPCID)},
},
},
}
mockDescribeInterfaceOpWith1And2 = &ec2.DescribeNetworkInterfacesOutput{
NetworkInterfaces: []*ec2.NetworkInterface{
{NetworkInterfaceId: &mockNetworkInterfaceId1},
{NetworkInterfaceId: &mockNetworkInterfaceId2},
},
mockDescribeInterfaceOpWith1And2 = []*ec2.NetworkInterface{
{NetworkInterfaceId: &mockNetworkInterfaceId1},
{NetworkInterfaceId: &mockNetworkInterfaceId2},
}
mockDescribeInterfaceOpWith1And3 = &ec2.DescribeNetworkInterfacesOutput{
NetworkInterfaces: []*ec2.NetworkInterface{
{NetworkInterfaceId: &mockNetworkInterfaceId1},
{NetworkInterfaceId: &mockNetworkInterfaceId3},
},
mockDescribeInterfaceOpWith1And3 = []*ec2.NetworkInterface{
{NetworkInterfaceId: &mockNetworkInterfaceId1},
{NetworkInterfaceId: &mockNetworkInterfaceId3},
}
)

Expand All @@ -74,6 +76,7 @@ func getMockENICleaner(ctrl *gomock.Controller) (*ENICleaner, *mock_api.MockEC2W
EC2Wrapper: mockEC2Wrapper,
availableENIs: map[string]struct{}{},
Log: zap.New(zap.UseDevMode(true)),
VPCID: mockVPCID,
clusterNameTagKey: mockClusterNameTagKey,
ctx: context.Background(),
}, mockEC2Wrapper
Expand All @@ -85,10 +88,10 @@ func TestENICleaner_cleanUpAvailableENIs(t *testing.T) {

gomock.InOrder(
// Return network interface 1 and 2 in first cycle
mockWrapper.EXPECT().DescribeNetworkInterfaces(mockDescribeNetworkInterfaceIp).
mockWrapper.EXPECT().DescribeNetworkInterfacesPages(mockDescribeNetworkInterfaceIp).
Return(mockDescribeInterfaceOpWith1And2, nil),
// Return network interface 1 and 3 in the second cycle
mockWrapper.EXPECT().DescribeNetworkInterfaces(mockDescribeNetworkInterfaceIp).
mockWrapper.EXPECT().DescribeNetworkInterfacesPages(mockDescribeNetworkInterfaceIp).
Return(mockDescribeInterfaceOpWith1And3, nil),
// Expect to delete the network interface 1
mockWrapper.EXPECT().DeleteNetworkInterface(
Expand Down
49 changes: 13 additions & 36 deletions pkg/aws/ec2/api/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ type EC2APIHelper interface {
ipResourceCount *config.IPResourceCount, interfaceType *string) (*ec2.NetworkInterface, error)
DeleteNetworkInterface(interfaceId *string) error
GetSubnet(subnetId *string) (*ec2.Subnet, error)
GetBranchNetworkInterface(trunkID *string) ([]*ec2.NetworkInterface, error)
GetBranchNetworkInterface(trunkID *string, subnetID *string) ([]*ec2.NetworkInterface, error)
GetInstanceNetworkInterface(instanceId *string) ([]*ec2.InstanceNetworkInterface, error)
DescribeNetworkInterfaces(nwInterfaceIds []*string) ([]*ec2.NetworkInterface, error)
DescribeTrunkInterfaceAssociation(trunkInterfaceId *string) ([]*ec2.TrunkInterfaceAssociation, error)
Expand Down Expand Up @@ -562,43 +562,20 @@ func (h *ec2APIHelper) UnassignIPv4Resources(eniID string, resourceType config.R
return err
}

func (h *ec2APIHelper) GetBranchNetworkInterface(trunkID *string) ([]*ec2.NetworkInterface, error) {
filters := []*ec2.Filter{{
Name: aws.String("tag:" + config.TrunkENIIDTag),
Values: []*string{trunkID},
}}

describeNetworkInterfacesInput := &ec2.DescribeNetworkInterfacesInput{Filters: filters}
var nwInterfaces []*ec2.NetworkInterface
for {
describeNetworkInterfaceOutput, err := h.ec2Wrapper.DescribeNetworkInterfaces(describeNetworkInterfacesInput)
if err != nil {
return nil, err
}

if describeNetworkInterfaceOutput == nil || describeNetworkInterfaceOutput.NetworkInterfaces == nil ||
len(describeNetworkInterfaceOutput.NetworkInterfaces) == 0 {
// No more interface associated with the trunk, return the result
break
}

// One or more interface associated with the trunk, return the result
for _, nwInterface := range describeNetworkInterfaceOutput.NetworkInterfaces {
// Only attach the required details to avoid consuming extra memory
nwInterfaces = append(nwInterfaces, &ec2.NetworkInterface{
NetworkInterfaceId: nwInterface.NetworkInterfaceId,
TagSet: nwInterface.TagSet,
})
}

if describeNetworkInterfaceOutput.NextToken == nil {
break
}

describeNetworkInterfacesInput.NextToken = describeNetworkInterfaceOutput.NextToken
func (h *ec2APIHelper) GetBranchNetworkInterface(trunkID *string, subnetID *string) ([]*ec2.NetworkInterface, error) {
filters := []*ec2.Filter{
{
Name: aws.String("tag:" + config.TrunkENIIDTag),
Values: []*string{trunkID},
},
{
Name: aws.String("subnet-id"),
Values: []*string{subnetID},
},
}

return nwInterfaces, nil
describeNetworkInterfacesInput := &ec2.DescribeNetworkInterfacesInput{Filters: filters}
return h.ec2Wrapper.DescribeNetworkInterfacesPages(describeNetworkInterfacesInput)
}

// DetachAndDeleteNetworkInterface detaches the network interface first and then deletes it
Expand Down
Loading
Loading