Skip to content

Commit

Permalink
Merge pull request #6584 from vbhargav875/delay_retries
Browse files Browse the repository at this point in the history
Delay force refresh by DefaultInterval when OCI GetNodePool call returns 404
  • Loading branch information
k8s-ci-robot authored Mar 6, 2024
2 parents c58e3fd + 2bf403e commit 06fa717
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 16 deletions.
33 changes: 21 additions & 12 deletions cluster-autoscaler/cloudprovider/oci/nodepools/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,24 +40,33 @@ func (c *nodePoolCache) nodePools() map[string]*oke.NodePool {
return result
}

func (c *nodePoolCache) rebuild(staticNodePools map[string]NodePool) error {
func (c *nodePoolCache) rebuild(staticNodePools map[string]NodePool, maxGetNodepoolRetries int) (httpStatusCode int, err error) {
klog.Infof("rebuilding cache")
var resp oke.GetNodePoolResponse
var statusCode int
for id := range staticNodePools {
// prevent us from getting a node pool at the same time that we're performing delete actions on the node pool.
c.mu.Lock()
resp, err := c.okeClient.GetNodePool(context.Background(), oke.GetNodePoolRequest{
NodePoolId: common.String(id),
})
c.mu.Unlock()

for i := 1; i <= maxGetNodepoolRetries; i++ {
// prevent us from getting a node pool at the same time that we're performing delete actions on the node pool.
c.mu.Lock()
resp, err = c.okeClient.GetNodePool(context.Background(), oke.GetNodePoolRequest{
NodePoolId: common.String(id),
})
c.mu.Unlock()
httpResp := resp.HTTPResponse()
statusCode = httpResp.StatusCode
if err != nil {
klog.Errorf("Failed to fetch the nodepool : %v. Retries available : %v", id, maxGetNodepoolRetries-i)
} else {
break
}
}
if err != nil {
return err
klog.Errorf("Failed to fetch the nodepool : %v", id)
return statusCode, err
}

c.set(&resp.NodePool)
}

return nil
return statusCode, nil
}

// removeInstance tries to remove the instance from the node pool.
Expand Down
13 changes: 9 additions & 4 deletions cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ import (
)

const (
	// maxAddTaintRetries is a retry limit; NOTE(review): presumably bounds the
	// attempts made when adding taints to nodes — its use is not visible here.
	maxAddTaintRetries = 5
	// maxGetNodepoolRetries is the number of GetNodePool attempts per node pool
	// that forceRefresh passes to nodePoolCache.rebuild.
	maxGetNodepoolRetries = 3
)

var (
Expand Down Expand Up @@ -249,11 +250,15 @@ func (m *ociManagerImpl) TaintToPreventFurtherSchedulingOnRestart(nodes []*apiv1
}

func (m *ociManagerImpl) forceRefresh() error {
err := m.nodePoolCache.rebuild(m.staticNodePools)
httpStatusCode, err := m.nodePoolCache.rebuild(m.staticNodePools, maxGetNodepoolRetries)
if err != nil {
if httpStatusCode == 404 {
m.lastRefresh = time.Now()
klog.Errorf("Failed to fetch the nodepools. Retrying after %v", m.lastRefresh.Add(m.cfg.Global.RefreshInterval))
return err
}
return err
}

m.lastRefresh = time.Now()
klog.Infof("Refreshed NodePool list, next refresh after %v", m.lastRefresh.Add(m.cfg.Global.RefreshInterval))
return nil
Expand Down Expand Up @@ -441,7 +446,7 @@ func (m *ociManagerImpl) GetNodePoolForInstance(instance ocicommon.OciRef) (Node

np, found := m.staticNodePools[instance.NodePoolID]
if !found {
klog.Infof("did not find node pool for reference: %+v", instance)
klog.V(4).Infof("did not find node pool for reference: %+v", instance)
return nil, errInstanceNodePoolNotFound
}

Expand Down

0 comments on commit 06fa717

Please sign in to comment.