Skip to content

Commit

Permalink
daemon/pinnedimagesets: retry if local node is not found by informer
Browse files Browse the repository at this point in the history
Signed-off-by: Sam Batschelet <[email protected]>
  • Loading branch information
hexfusion committed Apr 25, 2024
1 parent 6ce48d6 commit fa504b8
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 7 deletions.
37 changes: 31 additions & 6 deletions pkg/daemon/pinned_image_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ var (
errInsufficientStorage = errors.New("storage available is less than minimum required")
errFailedToPullImage = errors.New("failed to pull image")
errNotFound = errors.New("not found")
errRequeueAfterTimeout = errors.New("requeue: prefetching images incomplete after timeout")
)

// PinnedImageSetManager manages the prefetching of images.
Expand Down Expand Up @@ -193,7 +194,7 @@ func NewPinnedImageSetManager(

func (p *PinnedImageSetManager) sync(key string) error {
klog.V(4).Infof("Syncing MachineConfigPool %q", key)
node, err := p.nodeLister.Get(p.nodeName)
node, err := p.getNodeWithRetry(p.nodeName)
if err != nil {
return fmt.Errorf("failed to get node %q: %w", p.nodeName, err)
}
Expand All @@ -216,7 +217,7 @@ func (p *PinnedImageSetManager) sync(key string) error {

if err := p.syncMachineConfigPools(ctx, pools); err != nil {
if errors.Is(err, context.DeadlineExceeded) {
ctxErr := fmt.Errorf("requeue: prefetching images incomplete after: %v", p.prefetchTimeout)
ctxErr := fmt.Errorf("%w: %v", errRequeueAfterTimeout, p.prefetchTimeout)
if err := p.updateStatusError(pools, ctxErr); err != nil {
klog.Errorf("failed to update status: %v", err)
}
Expand Down Expand Up @@ -377,6 +378,9 @@ func (p *PinnedImageSetManager) scheduleWork(ctx context.Context, prefetchCh cha
}
scheduledImages := 0
for _, imageRef := range prefetchImages {
if monitor.Drain() {
continue
}
select {
case <-ctx.Done():
return ctx.Err()
Expand Down Expand Up @@ -669,7 +673,7 @@ func getImageSetMetadata(imageSetLister mcfglistersv1alpha1.PinnedImageSetLister

// getWorkerCount returns the number of workers to use for prefetching images.
func (p *PinnedImageSetManager) getWorkerCount() (int, error) {
node, err := p.nodeLister.Get(p.nodeName)
node, err := p.getNodeWithRetry(p.nodeName)
if err != nil {
return 0, fmt.Errorf("failed to get node %q: %w", p.nodeName, err)
}
Expand Down Expand Up @@ -788,7 +792,7 @@ func (p *PinnedImageSetManager) addPinnedImageSet(obj interface{}) {
return
}

node, err := p.nodeLister.Get(p.nodeName)
node, err := p.getNodeWithRetry(p.nodeName)
if err != nil {
klog.Errorf("failed to get node %q: %v", p.nodeName, err)
return
Expand Down Expand Up @@ -827,7 +831,7 @@ func (p *PinnedImageSetManager) deletePinnedImageSet(obj interface{}) {
}
}

node, err := p.nodeLister.Get(p.nodeName)
node, err := p.getNodeWithRetry(p.nodeName)
if err != nil {
klog.Errorf("failed to get node %q: %v", p.nodeName, err)
return
Expand All @@ -851,6 +855,27 @@ func (p *PinnedImageSetManager) deletePinnedImageSet(obj interface{}) {
}
}

// getNodeWithRetry looks up nodeName through the node lister, repeating the
// lookup under p.backoff for as long as the lister answers NotFound. The local
// node can be missing from the informer cache briefly during startup, so a
// NotFound result is logged as a warning and retried rather than treated as fatal.
//
// Any other lister error is handed back to the backoff helper as a hard failure.
// The returned node is whatever the lister produced on the final attempt, and
// the returned error is the one reported by wait.ExponentialBackoff.
func (p *PinnedImageSetManager) getNodeWithRetry(nodeName string) (*corev1.Node, error) {
	var found *corev1.Node

	lookup := func() (bool, error) {
		n, getErr := p.nodeLister.Get(nodeName)
		// Record the result of every attempt, successful or not, so the caller
		// sees exactly what the last lister call produced.
		found = n

		switch {
		case getErr == nil:
			return true, nil
		case apierrors.IsNotFound(getErr):
			// Tolerate the informer not knowing about the node yet: warn and try again.
			klog.Warningf("Node %q not found, retrying", nodeName)
			return false, nil
		default:
			return false, getErr
		}
	}

	err := wait.ExponentialBackoff(p.backoff, lookup)
	return found, err
}

func (p *PinnedImageSetManager) updatePinnedImageSet(oldObj, newObj interface{}) {
oldImageSet := oldObj.(*mcfgv1alpha1.PinnedImageSet)
newImageSet := newObj.(*mcfgv1alpha1.PinnedImageSet)
Expand All @@ -860,7 +885,7 @@ func (p *PinnedImageSetManager) updatePinnedImageSet(oldObj, newObj interface{})
return
}

node, err := p.nodeLister.Get(p.nodeName)
node, err := p.getNodeWithRetry(p.nodeName)
if err != nil {
klog.Errorf("failed to get node %q: %v", p.nodeName, err)
return
Expand Down
2 changes: 1 addition & 1 deletion pkg/daemon/pinned_image_set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ func TestPrefetchImageSets(t *testing.T) {
nodeListerSynced: nodeInformer.Informer().HasSynced,
prefetchCh: make(chan prefetch, defaultPrefetchWorkers*2),
backoff: wait.Backoff{
Steps: 1,
Steps: maxRetries,
Duration: 10 * time.Millisecond,
Factor: retryFactor,
Cap: 10 * time.Millisecond,
Expand Down

0 comments on commit fa504b8

Please sign in to comment.