Skip to content

Commit

Permalink
Retry on delete backend failure (oracle#330) (oracle#333)
Browse files Browse the repository at this point in the history
* Retry create/delete of backend on failure and ignore reconcile if lb is not active
  • Loading branch information
shyamradhakrishnan committed Mar 22, 2024
1 parent a21605b commit 8713435
Show file tree
Hide file tree
Showing 6 changed files with 270 additions and 238 deletions.
3 changes: 3 additions & 0 deletions cloud/scope/load_balancer_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ func (s *ClusterScope) ReconcileApiServerLB(ctx context.Context) error {
return err
}
if lb != nil {
if lb.LifecycleState != loadbalancer.LoadBalancerLifecycleStateActive {
return errors.New(fmt.Sprintf("load balancer is in %s state. Waiting for ACTIVE state.", lb.LifecycleState))
}
lbIP, err := s.getLoadbalancerIp(*lb)
if err != nil {
return err
Expand Down
94 changes: 64 additions & 30 deletions cloud/scope/load_balancer_reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,12 @@ func TestLBReconciliation(t *testing.T) {
})).
Return(loadbalancer.GetLoadBalancerResponse{
LoadBalancer: loadbalancer.LoadBalancer{
Id: common.String("lb-id"),
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
Id: common.String("lb-id"),
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
IpAddresses: []loadbalancer.IpAddress{
{
IpAddress: common.String("2.2.2.2"),
Expand All @@ -123,11 +124,12 @@ func TestLBReconciliation(t *testing.T) {
})).
Return(loadbalancer.GetLoadBalancerResponse{
LoadBalancer: loadbalancer.LoadBalancer{
Id: common.String("lb-id"),
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
Id: common.String("lb-id"),
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
},
}, nil)
},
Expand All @@ -143,11 +145,12 @@ func TestLBReconciliation(t *testing.T) {
})).
Return(loadbalancer.GetLoadBalancerResponse{
LoadBalancer: loadbalancer.LoadBalancer{
Id: common.String("lb-id"),
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
Id: common.String("lb-id"),
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
IpAddresses: []loadbalancer.IpAddress{
{
IpAddress: common.String("2.2.2.2"),
Expand Down Expand Up @@ -189,11 +192,12 @@ func TestLBReconciliation(t *testing.T) {
})).
Return(loadbalancer.GetLoadBalancerResponse{
LoadBalancer: loadbalancer.LoadBalancer{
Id: common.String("lb-id"),
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
Id: common.String("lb-id"),
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
IpAddresses: []loadbalancer.IpAddress{
{
IpAddress: common.String("2.2.2.2"),
Expand Down Expand Up @@ -537,11 +541,12 @@ func TestLBReconciliation(t *testing.T) {
})).
Return(loadbalancer.GetLoadBalancerResponse{
LoadBalancer: loadbalancer.LoadBalancer{
Id: common.String("lb-id"),
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
Id: common.String("lb-id"),
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
IpAddresses: []loadbalancer.IpAddress{
{
IpAddress: common.String("2.2.2.2"),
Expand Down Expand Up @@ -570,6 +575,34 @@ func TestLBReconciliation(t *testing.T) {
}, nil)
},
},
{
name: "lb not active",
errorExpected: true,
errorSubStringMatch: true,
matchError: errors.New(fmt.Sprintf("load balancer is in %s state. Waiting for ACTIVE state.", loadbalancer.LoadBalancerLifecycleStateCreating)),
testSpecificSetup: func(clusterScope *ClusterScope, lbClient *mock_lb.MockLoadBalancerClient) {
ociClusterAccessor.OCICluster.Spec.NetworkSpec.APIServerLB.LoadBalancerId = common.String("lb-id")
lbClient.EXPECT().GetLoadBalancer(gomock.Any(), gomock.Eq(loadbalancer.GetLoadBalancerRequest{
LoadBalancerId: common.String("lb-id"),
})).
Return(loadbalancer.GetLoadBalancerResponse{
LoadBalancer: loadbalancer.LoadBalancer{
Id: common.String("lb-id"),
LifecycleState: loadbalancer.LoadBalancerLifecycleStateCreating,
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
IpAddresses: []loadbalancer.IpAddress{
{
IpAddress: common.String("2.2.2.2"),
IsPublic: common.Bool(true),
},
},
},
}, nil)
},
},
{
name: "lb update request failed",
errorExpected: true,
Expand All @@ -582,11 +615,12 @@ func TestLBReconciliation(t *testing.T) {
})).
Return(loadbalancer.GetLoadBalancerResponse{
LoadBalancer: loadbalancer.LoadBalancer{
Id: common.String("lb-id"),
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
Id: common.String("lb-id"),
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
IpAddresses: []loadbalancer.IpAddress{
{
IpAddress: common.String("2.2.2.2"),
Expand Down
196 changes: 89 additions & 107 deletions cloud/scope/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -520,33 +520,29 @@ func (m *MachineScope) ReconcileCreateInstanceOnLB(ctx context.Context) error {
backendName := instanceIp + ":" + strconv.Itoa(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port))
if !m.containsLBBackend(backendSet, backendName) {
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
workRequest := m.OCIMachine.Status.CreateBackendWorkRequestId
logger.Info("Checking work request status for create backend")
if workRequest != "" {
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, &workRequest)
if err != nil {
return err
}
} else {
resp, err := m.LoadBalancerClient.CreateBackend(ctx, loadbalancer.CreateBackendRequest{
LoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
CreateBackendDetails: loadbalancer.CreateBackendDetails{
IpAddress: common.String(instanceIp),
Port: common.Int(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port)),
},
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
})
if err != nil {
return err
}
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Add instance to LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for LB work request to be complete")
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
// we always try to create the backend if it exists during a reconcile loop and wait for the work request
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
// work request, the create backend call may throw an error which is ok, as reconcile will go into
// an exponential backoff
resp, err := m.LoadBalancerClient.CreateBackend(ctx, loadbalancer.CreateBackendRequest{
LoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
CreateBackendDetails: loadbalancer.CreateBackendDetails{
IpAddress: common.String(instanceIp),
Port: common.Int(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port)),
},
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
})
if err != nil {
return err
}
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Add instance to LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for LB work request to be complete")
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
}

Expand All @@ -561,37 +557,32 @@ func (m *MachineScope) ReconcileCreateInstanceOnLB(ctx context.Context) error {
if !m.containsNLBBackend(backendSet, m.Name()) {
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
logger.Info("Checking work request status for create backend")
workRequest := m.OCIMachine.Status.CreateBackendWorkRequestId
if workRequest != "" {
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, &workRequest)
if err != nil {
return err
}
} else {
resp, err := m.NetworkLoadBalancerClient.CreateBackend(ctx, networkloadbalancer.CreateBackendRequest{
NetworkLoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
CreateBackendDetails: networkloadbalancer.CreateBackendDetails{
IpAddress: common.String(instanceIp),
Port: common.Int(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port)),
Name: common.String(m.Name()),
},
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
})
if err != nil {
return err
}
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Add instance to NLB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for NLB work request to be complete")
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
logger.Info("NLB Backend addition work request is complete")
// we always try to create the backend if it exists during a reconcile loop and wait for the work request
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
// work request, the create backend call may throw an error which is ok, as reconcile will go into
// an exponential backoff
resp, err := m.NetworkLoadBalancerClient.CreateBackend(ctx, networkloadbalancer.CreateBackendRequest{
NetworkLoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
CreateBackendDetails: networkloadbalancer.CreateBackendDetails{
IpAddress: common.String(instanceIp),
Port: common.Int(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port)),
Name: common.String(m.Name()),
},
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
})
if err != nil {
return err
}
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Add instance to NLB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for NLB work request to be complete")
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
logger.Info("NLB Backend addition work request is complete")
}

}
return nil
}
Expand Down Expand Up @@ -628,36 +619,31 @@ func (m *MachineScope) ReconcileDeleteInstanceOnLB(ctx context.Context) error {
backendName := instanceIp + ":" + strconv.Itoa(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port))
if m.containsLBBackend(backendSet, backendName) {
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
workRequest := m.OCIMachine.Status.DeleteBackendWorkRequestId
if workRequest != "" {
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, &workRequest)
if err != nil {
return err
}
} else {
// in OCI CLI, the colon in the backend name is replaced by %3A
// replace the colon in the backend name by %3A to avoid the error in PCA
escapedBackendName := url.QueryEscape(backendName)

resp, err := m.LoadBalancerClient.DeleteBackend(ctx, loadbalancer.DeleteBackendRequest{
LoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
BackendName: common.String(escapedBackendName),
})
if err != nil {
logger.Error(err, "Delete instance from LB backend-set failed",
"backendSetName", *backendSet.Name,
"backendName", escapedBackendName,
)
return err
}
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for LB work request to be complete")
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
// in OCI CLI, the colon in the backend name is replaced by %3A
// replace the colon in the backend name by %3A to avoid the error in PCA
escapedBackendName := url.QueryEscape(backendName)
// we always try to delete the backend if it exists during a reconcile loop and wait for the work request
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
// work request, the delete backend call may throw an error which is ok, as reconcile will go into
// an exponential backoff
resp, err := m.LoadBalancerClient.DeleteBackend(ctx, loadbalancer.DeleteBackendRequest{
LoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
BackendName: common.String(escapedBackendName),
})
if err != nil {
logger.Error(err, "Delete instance from LB backend-set failed",
"backendSetName", *backendSet.Name,
"backendName", escapedBackendName,
)
return err
}
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for LB work request to be complete")
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
logger.Info("LB Backend addition work request is complete")
}
Expand All @@ -671,28 +657,24 @@ func (m *MachineScope) ReconcileDeleteInstanceOnLB(ctx context.Context) error {
backendSet := lb.BackendSets[APIServerLBBackendSetName]
if m.containsNLBBackend(backendSet, m.Name()) {
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
workRequest := m.OCIMachine.Status.DeleteBackendWorkRequestId
if workRequest != "" {
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, &workRequest)
if err != nil {
return err
}
} else {
resp, err := m.NetworkLoadBalancerClient.DeleteBackend(ctx, networkloadbalancer.DeleteBackendRequest{
NetworkLoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
BackendName: common.String(m.Name()),
})
if err != nil {
return err
}
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for LB work request to be complete")
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
// we always try to delete the backend if it exists during a reconcile loop and wait for the work request
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
// work request, the delete backend call may throw an error which is ok, as reconcile will go into
// an exponential backoff
resp, err := m.NetworkLoadBalancerClient.DeleteBackend(ctx, networkloadbalancer.DeleteBackendRequest{
NetworkLoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
BackendName: common.String(m.Name()),
})
if err != nil {
return err
}
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for LB work request to be complete")
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
}
}
Expand Down
Loading

0 comments on commit 8713435

Please sign in to comment.