From dc0da750e159edc8aa459352b20f8c7dea001af6 Mon Sep 17 00:00:00 2001 From: Antonin Bas Date: Mon, 23 Sep 2024 13:28:54 -0700 Subject: [PATCH 1/4] Handle ExternalIPPool range changes in Egress controller The validation webhook for ExternalIPPools prevents modifications to IP address ranges. However, it is theoretically possible for an ExternalIPPool to be deleted, then recreated immediately with different IP address ranges, and for the Egress controller to only process the Egress resources referencing the pool once, *after* the CREATE event has been handled by the ExternalIPPool controller. This is because the ExternalIPPool controller is in charge of notifying the Egress controller through a callback (event handler) mechanism, and that multiple events for the same ExternalIPPool name can be merged in the workqueue. With this change, we try to ensure the same "observable" behavior for the Egress controller, regardless of whether the DELETE and CREATE events have been merged or not. The stale Egress IP should be removed, and a new Egress IP should be allocated from the new IP ranges (regardless of whether or not the initial Egress IP was specified by the user). Prior to this change, the change of IP address ranges would be silently ignored by the Egress controller. Signed-off-by: Antonin Bas --- pkg/controller/egress/controller.go | 31 ++++++--- pkg/controller/egress/controller_test.go | 70 +++++++++++++++++++++ pkg/controller/externalippool/controller.go | 4 ++ 3 files changed, 96 insertions(+), 9 deletions(-) diff --git a/pkg/controller/egress/controller.go b/pkg/controller/egress/controller.go index a6e6d814890..96acc1831a1 100644 --- a/pkg/controller/egress/controller.go +++ b/pkg/controller/egress/controller.go @@ -249,14 +249,24 @@ func (c *EgressController) setIPAllocation(egressName string, ip net.IP, poolNam func (c *EgressController) syncEgressIP(egress *egressv1beta1.Egress) (net.IP, *egressv1beta1.Egress, error) { prevIP, prevIPPool, exists := c.getIPAllocation(egress.Name) if exists { - // The EgressIP and the ExternalIPPool don't change, do nothing. - if prevIP.String() == egress.Spec.EgressIP && prevIPPool == egress.Spec.ExternalIPPool && c.externalIPAllocator.IPPoolExists(egress.Spec.ExternalIPPool) { - return prevIP, egress, nil + // The EgressIP and the ExternalIPPool haven't changed. + if prevIP.String() == egress.Spec.EgressIP && prevIPPool == egress.Spec.ExternalIPPool { + // If the EgressIP is still valid for the ExternalIPPool, nothing needs to be done. + if c.externalIPAllocator.IPPoolHasIP(prevIPPool, prevIP) { + return prevIP, egress, nil + } + // If the ExternalIPPool exists, but the IP is not in range, reclaim the IP from the Egress API. + if c.externalIPAllocator.IPPoolExists(egress.Spec.ExternalIPPool) { + klog.InfoS("Allocated EgressIP is no longer part of ExternalIPPool, releasing it", "egress", klog.KObj(egress), "ip", egress.Spec.EgressIP, "pool", egress.Spec.ExternalIPPool) + if updatedEgress, err := c.updateEgressIP(egress, ""); err != nil { + return nil, egress, err + } else { + egress = updatedEgress + } + } } // Either EgressIP or ExternalIPPool changes, release the previous one first. - if err := c.releaseEgressIP(egress.Name, prevIP, prevIPPool); err != nil { - return nil, egress, err - } + c.releaseEgressIP(egress.Name, prevIP, prevIPPool) } // Skip allocating EgressIP if ExternalIPPool is not specified and return whatever user specifies. @@ -326,20 +336,23 @@ func (c *EgressController) updateEgressIP(egress *egressv1beta1.Egress, ip strin } // releaseEgressIP removes the Egress's ipAllocation in the cache and releases the IP to the pool. -func (c *EgressController) releaseEgressIP(egressName string, egressIP net.IP, poolName string) error { +func (c *EgressController) releaseEgressIP(egressName string, egressIP net.IP, poolName string) { if err := c.externalIPAllocator.ReleaseIP(poolName, egressIP); err != nil { if err == externalippool.ErrExternalIPPoolNotFound { // Ignore the error since the external IP Pool could be deleted. klog.InfoS("Failed to release EgressIP because IP Pool does not exist", "egress", egressName, "ip", egressIP, "pool", poolName) } else { + // It is possible for the external IP Pool to have been deleted and + // recreated immediately with a different range, which would trigger this + // case. Transient errors in ReleaseIP are not possible, so there is no + // point in retrying. We should still delete our own state by calling + // deleteIPAllocation. klog.ErrorS(err, "Failed to release IP", "ip", egressIP, "pool", poolName) - return err } } else { klog.InfoS("Released EgressIP", "egress", egressName, "ip", egressIP, "pool", poolName) } c.deleteIPAllocation(egressName) - return nil } func (c *EgressController) syncEgress(key string) error { diff --git a/pkg/controller/egress/controller_test.go b/pkg/controller/egress/controller_test.go index 6c382280e63..322f1d8688e 100644 --- a/pkg/controller/egress/controller_test.go +++ b/pkg/controller/egress/controller_test.go @@ -546,6 +546,76 @@ func TestUpdateEgress(t *testing.T) { checkExternalIPPoolUsed(t, controller, eipFoo2.Name, 0) } +// TestRecreateExternalIPPoolWithNewRange tests the case where an ExternalIPPool is deleted, then +// immediately recreated with a different IP range. Specifically we test the scenario where +// syncEgress / syncEgressIP are called only once because the DELETE and CREATE events are merged in +// the workqueue. Ideally, the behavior observed by the user should be the same irrespective of +// whether the events are merged or not. +// Note that in an actual cluster, it is very unlikely that both events would be merged. +func TestRecreateExternalIPPoolWithNewRange(t *testing.T) { + stopCh := make(chan struct{}) + defer close(stopCh) + + eipFoo1 := newExternalIPPool("pool1", "1.1.1.0/24", "", "") + egress := &v1beta1.Egress{ + ObjectMeta: metav1.ObjectMeta{Name: "egressA", UID: "uidA"}, + Spec: v1beta1.EgressSpec{ + AppliedTo: v1beta1.AppliedTo{ + PodSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "foo"}, + }, + }, + EgressIP: "", + ExternalIPPool: eipFoo1.Name, + }, + } + + eventCh := make(chan string, 1) + waitForEvent := func(timeout time.Duration) error { + select { + case <-time.After(timeout): + return fmt.Errorf("timeout while waiting for IPPool event") + case <-eventCh: + return nil + } + } + + controller := newController(nil, []runtime.Object{eipFoo1, egress}) + // Register our own event handler to be able to determine when changes have been processed + // by the ExternalIPPool controller. + controller.externalIPAllocator.AddEventHandler(func(poolName string) { + eventCh <- poolName + }) + // A call to RestoreIPAllocations is required for every registered event handler. + controller.externalIPAllocator.RestoreIPAllocations(nil) + controller.informerFactory.Start(stopCh) + controller.crdInformerFactory.Start(stopCh) + controller.informerFactory.WaitForCacheSync(stopCh) + controller.crdInformerFactory.WaitForCacheSync(stopCh) + go controller.externalIPAllocator.Run(stopCh) + require.True(t, cache.WaitForCacheSync(stopCh, controller.externalIPAllocator.HasSynced)) + controller.restoreIPAllocations([]*v1beta1.Egress{egress}) + + getEgressIP, _, err := controller.syncEgressIP(egress) + require.NoError(t, err) + assert.Equal(t, net.ParseIP("1.1.1.1"), getEgressIP) + + // Delete and recreate the ExternalIPPool immediately with a diffenre IP range. We do not + // call syncEgressIP in-between, so the Egress controller doesn't have a chance to process + // both changes independently. + controller.crdClient.CrdV1beta1().ExternalIPPools().Delete(context.TODO(), eipFoo1.Name, metav1.DeleteOptions{}) + require.NoError(t, waitForEvent(1*time.Second)) + + eipFoo1 = newExternalIPPool("pool1", "1.1.2.0/24", "", "") + controller.crdClient.CrdV1beta1().ExternalIPPools().Create(context.TODO(), eipFoo1, metav1.CreateOptions{}) + require.NoError(t, waitForEvent(1*time.Second)) + + egress.Spec.EgressIP = getEgressIP.String() + getEgressIP, _, err = controller.syncEgressIP(egress) + require.NoError(t, err) + assert.Equal(t, net.ParseIP("1.1.2.1"), getEgressIP) +} + func TestSyncEgressIP(t *testing.T) { tests := []struct { name string diff --git a/pkg/controller/externalippool/controller.go b/pkg/controller/externalippool/controller.go index 2964fd53c23..feadec1761f 100644 --- a/pkg/controller/externalippool/controller.go +++ b/pkg/controller/externalippool/controller.go @@ -88,6 +88,10 @@ type ExternalIPAllocator interface { // UpdateIPAllocation marks the IP in the specified ExternalIPPool as occupied. UpdateIPAllocation(externalIPPool string, ip net.IP) error // ReleaseIP releases the IP to the IP pool. + // It returns ErrExternalIPPoolNotFound if the externalIPPool does not exist. + // Any other error indicates that the IP was not allocated, or is not currently allocated. + // In case of an error, there is no reason to try again with the same arguments, as + // transient errors are not possible. ReleaseIP(externalIPPool string, ip net.IP) error // HasSynced indicates ExternalIPAllocator has finished syncing all ExternalIPPool resources. HasSynced() bool From fc164200a9bfcf03c83770e7d50370b44ad903ed Mon Sep 17 00:00:00 2001 From: Antonin Bas Date: Tue, 24 Sep 2024 13:19:27 -0700 Subject: [PATCH 2/4] Address review comment Signed-off-by: Antonin Bas --- pkg/controller/egress/controller_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controller/egress/controller_test.go b/pkg/controller/egress/controller_test.go index 322f1d8688e..59a43b26c71 100644 --- a/pkg/controller/egress/controller_test.go +++ b/pkg/controller/egress/controller_test.go @@ -596,7 +596,7 @@ func TestRecreateExternalIPPoolWithNewRange(t *testing.T) { require.True(t, cache.WaitForCacheSync(stopCh, controller.externalIPAllocator.HasSynced)) controller.restoreIPAllocations([]*v1beta1.Egress{egress}) - getEgressIP, _, err := controller.syncEgressIP(egress) + getEgressIP, egress, err := controller.syncEgressIP(egress) require.NoError(t, err) assert.Equal(t, net.ParseIP("1.1.1.1"), getEgressIP) From 4bd69395bc620ef3e14a93241c91de333aa13172 Mon Sep 17 00:00:00 2001 From: Antonin Bas Date: Tue, 24 Sep 2024 14:51:32 -0700 Subject: [PATCH 3/4] Fix bug and improve test Signed-off-by: Antonin Bas --- pkg/controller/egress/controller.go | 15 ++++++------- pkg/controller/egress/controller_test.go | 27 ++++++------------------ 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/pkg/controller/egress/controller.go b/pkg/controller/egress/controller.go index 96acc1831a1..ea91873ddee 100644 --- a/pkg/controller/egress/controller.go +++ b/pkg/controller/egress/controller.go @@ -255,14 +255,13 @@ func (c *EgressController) syncEgressIP(egress *egressv1beta1.Egress) (net.IP, * if c.externalIPAllocator.IPPoolHasIP(prevIPPool, prevIP) { return prevIP, egress, nil } - // If the ExternalIPPool exists, but the IP is not in range, reclaim the IP from the Egress API. - if c.externalIPAllocator.IPPoolExists(egress.Spec.ExternalIPPool) { - klog.InfoS("Allocated EgressIP is no longer part of ExternalIPPool, releasing it", "egress", klog.KObj(egress), "ip", egress.Spec.EgressIP, "pool", egress.Spec.ExternalIPPool) - if updatedEgress, err := c.updateEgressIP(egress, ""); err != nil { - return nil, egress, err - } else { - egress = updatedEgress - } + // The ExternalIPPool may no longer exist, or the IP is not in range. + // Reclaim the IP from the Egress API. + klog.InfoS("Allocated EgressIP is no longer part of ExternalIPPool, releasing it", "egress", klog.KObj(egress), "ip", egress.Spec.EgressIP, "pool", egress.Spec.ExternalIPPool) + if updatedEgress, err := c.updateEgressIP(egress, ""); err != nil { + return nil, egress, err + } else { + egress = updatedEgress } } // Either EgressIP or ExternalIPPool changes, release the previous one first. diff --git a/pkg/controller/egress/controller_test.go b/pkg/controller/egress/controller_test.go index 59a43b26c71..ebee50e4bd4 100644 --- a/pkg/controller/egress/controller_test.go +++ b/pkg/controller/egress/controller_test.go @@ -570,24 +570,7 @@ func TestRecreateExternalIPPoolWithNewRange(t *testing.T) { }, } - eventCh := make(chan string, 1) - waitForEvent := func(timeout time.Duration) error { - select { - case <-time.After(timeout): - return fmt.Errorf("timeout while waiting for IPPool event") - case <-eventCh: - return nil - } - } - controller := newController(nil, []runtime.Object{eipFoo1, egress}) - // Register our own event handler to be able to determine when changes have been processed - // by the ExternalIPPool controller. - controller.externalIPAllocator.AddEventHandler(func(poolName string) { - eventCh <- poolName - }) - // A call to RestoreIPAllocations is required for every registered event handler. - controller.externalIPAllocator.RestoreIPAllocations(nil) controller.informerFactory.Start(stopCh) controller.crdInformerFactory.Start(stopCh) controller.informerFactory.WaitForCacheSync(stopCh) @@ -596,6 +579,7 @@ func TestRecreateExternalIPPoolWithNewRange(t *testing.T) { require.True(t, cache.WaitForCacheSync(stopCh, controller.externalIPAllocator.HasSynced)) controller.restoreIPAllocations([]*v1beta1.Egress{egress}) + require.True(t, controller.externalIPAllocator.IPPoolExists(eipFoo1.Name)) getEgressIP, egress, err := controller.syncEgressIP(egress) require.NoError(t, err) assert.Equal(t, net.ParseIP("1.1.1.1"), getEgressIP) @@ -604,13 +588,16 @@ func TestRecreateExternalIPPoolWithNewRange(t *testing.T) { // call syncEgressIP in-between, so the Egress controller doesn't have a chance to process // both changes independently. controller.crdClient.CrdV1beta1().ExternalIPPools().Delete(context.TODO(), eipFoo1.Name, metav1.DeleteOptions{}) - require.NoError(t, waitForEvent(1*time.Second)) + require.EventuallyWithT(t, func(t *assert.CollectT) { + assert.False(t, controller.externalIPAllocator.IPPoolExists(eipFoo1.Name)) + }, 1*time.Second, 10*time.Millisecond) eipFoo1 = newExternalIPPool("pool1", "1.1.2.0/24", "", "") controller.crdClient.CrdV1beta1().ExternalIPPools().Create(context.TODO(), eipFoo1, metav1.CreateOptions{}) - require.NoError(t, waitForEvent(1*time.Second)) + require.EventuallyWithT(t, func(t *assert.CollectT) { + assert.True(t, controller.externalIPAllocator.IPPoolExists(eipFoo1.Name)) + }, 1*time.Second, 10*time.Millisecond) - egress.Spec.EgressIP = getEgressIP.String() getEgressIP, _, err = controller.syncEgressIP(egress) require.NoError(t, err) assert.Equal(t, net.ParseIP("1.1.2.1"), getEgressIP) From d325b51ebecaf7f7ebe2913024aed51d499bf5b0 Mon Sep 17 00:00:00 2001 From: Antonin Bas Date: Wed, 25 Sep 2024 09:58:41 -0700 Subject: [PATCH 4/4] Fix typo Signed-off-by: Antonin Bas --- pkg/controller/egress/controller_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controller/egress/controller_test.go b/pkg/controller/egress/controller_test.go index ebee50e4bd4..c110b5d4a0a 100644 --- a/pkg/controller/egress/controller_test.go +++ b/pkg/controller/egress/controller_test.go @@ -584,7 +584,7 @@ func TestRecreateExternalIPPoolWithNewRange(t *testing.T) { require.NoError(t, err) assert.Equal(t, net.ParseIP("1.1.1.1"), getEgressIP) - // Delete and recreate the ExternalIPPool immediately with a diffenre IP range. We do not + // Delete and recreate the ExternalIPPool immediately with a different IP range. We do not // call syncEgressIP in-between, so the Egress controller doesn't have a chance to process // both changes independently. controller.crdClient.CrdV1beta1().ExternalIPPools().Delete(context.TODO(), eipFoo1.Name, metav1.DeleteOptions{})