From e0862f67b5c64c1d3ba5e5e88de03cca752c452f Mon Sep 17 00:00:00 2001 From: Dylan Murray Date: Tue, 9 Jun 2020 13:21:27 -0400 Subject: [PATCH] Add retry logic for image copying (#63) * Add retry logic for image copying * Remove unnecessary break * Change to 5 retries * Use proper format for prints * Return empty byte array * Add todo (cherry picked from commit b7c9bbd9ae102fb9c56504eac4be29fe85559015) --- velero-plugins/migimagestream/backup.go | 6 ++--- velero-plugins/migimagestream/restore.go | 6 ++--- velero-plugins/migimagestream/shared.go | 33 +++++++++++++++++++----- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/velero-plugins/migimagestream/backup.go b/velero-plugins/migimagestream/backup.go index 877325f..31c859e 100644 --- a/velero-plugins/migimagestream/backup.go +++ b/velero-plugins/migimagestream/backup.go @@ -79,7 +79,7 @@ func (p *BackupPlugin) Execute(item runtime.Unstructured, backup *v1.Backup) (ru p.Log.Info(fmt.Sprintf("[is-backup] copying from: %s", srcPath)) p.Log.Info(fmt.Sprintf("[is-backup] copying to: %s", destPath)) - imgManifest, err := copyImageBackup(srcPath, destPath) + imgManifest, err := copyImageBackup(p.Log, srcPath, destPath) if err != nil { p.Log.Info(fmt.Sprintf("[is-backup] Error copying image: %v", err)) return nil, nil, err @@ -126,7 +126,7 @@ func findStatusTag(tags []imagev1API.NamedTagEventList, name string) *imagev1API return nil } -func copyImageBackup(src, dest string) ([]byte, error) { +func copyImageBackup(log logrus.FieldLogger, src, dest string) ([]byte, error) { sourceCtx, err := internalRegistrySystemContext() if err != nil { return []byte{}, err @@ -135,5 +135,5 @@ func copyImageBackup(src, dest string) ([]byte, error) { if err != nil { return []byte{}, err } - return copyImage(src, dest, sourceCtx, destinationCtx) + return copyImage(log, src, dest, sourceCtx, destinationCtx) } diff --git a/velero-plugins/migimagestream/restore.go b/velero-plugins/migimagestream/restore.go index 93de1b7..e88aa96 100644 --- a/velero-plugins/migimagestream/restore.go +++ b/velero-plugins/migimagestream/restore.go @@ -91,7 +91,7 @@ func (p *RestorePlugin) Execute(input *velero.RestoreItemActionExecuteInput) (*v p.Log.Info(fmt.Sprintf("[is-restore] copying from: %s", srcPath)) p.Log.Info(fmt.Sprintf("[is-restore] copying to: %s", destPath)) - manifest, err := copyImageRestore(srcPath, destPath) + manifest, err := copyImageRestore(p.Log, srcPath, destPath) if err != nil { p.Log.Info(fmt.Sprintf("[is-restore] Error copying image: %v", err)) return nil, err @@ -108,7 +108,7 @@ func (p *RestorePlugin) Execute(input *velero.RestoreItemActionExecuteInput) (*v return velero.NewRestoreItemActionExecuteOutput(input.Item).WithoutRestore(), nil } -func copyImageRestore(src, dest string) ([]byte, error) { +func copyImageRestore(log logrus.FieldLogger, src, dest string) ([]byte, error) { sourceCtx, err := migrationRegistrySystemContext() if err != nil { return []byte{}, err @@ -117,5 +117,5 @@ func copyImageRestore(src, dest string) ([]byte, error) { if err != nil { return []byte{}, err } - return copyImage(src, dest, sourceCtx, destinationCtx) + return copyImage(log, src, dest, sourceCtx, destinationCtx) } diff --git a/velero-plugins/migimagestream/shared.go b/velero-plugins/migimagestream/shared.go index a2d0b1a..99f0b33 100644 --- a/velero-plugins/migimagestream/shared.go +++ b/velero-plugins/migimagestream/shared.go @@ -4,17 +4,20 @@ import ( "context" "errors" "fmt" + "strings" + "time" "github.com/containers/image/v5/copy" "github.com/containers/image/v5/signature" "github.com/containers/image/v5/transports/alltransports" "github.com/containers/image/v5/types" imagev1API "github.com/openshift/api/image/v1" + "github.com/sirupsen/logrus" "k8s.io/client-go/rest" ) -func copyImage(src, dest string, sourceCtx, destinationCtx *types.SystemContext) ([]byte, error) { +func copyImage(log logrus.FieldLogger, src, dest string, sourceCtx, destinationCtx *types.SystemContext) ([]byte, error) { policyContext, err := getPolicyContext() if err != nil { return []byte{}, fmt.Errorf("Error loading trust policy: %v", err) @@ -29,11 +32,29 @@ func copyImage(src, dest string, sourceCtx, destinationCtx *types.SystemContext) if err != nil { return []byte{}, fmt.Errorf("Invalid destination name %s: %v", dest, err) } - manifest, err := copy.Image(context.Background(), policyContext, destRef, srcRef, ©.Options{ - SourceCtx: sourceCtx, - DestinationCtx: destinationCtx, - }) - return manifest, err + + // Let's retry the image copy up to 10 times + // Each retry will wait 5 seconds longer + // Let's log a warning if we encounter `blob unknown to registry` + // TODO: Change this to only retry on specific errors from image copy + retryWait := 5 + log.Info(fmt.Sprintf("copying image: %s; will attempt up to 5 times...", src)) + for i := 0; i < 4; i++ { + manifest, err := copy.Image(context.Background(), policyContext, destRef, srcRef, ©.Options{ + SourceCtx: sourceCtx, + DestinationCtx: destinationCtx, + }) + if err == nil { + return manifest, nil + } + if strings.Contains(err.Error(), "blob unknown to registry") { + log.Warn(fmt.Sprintf("encountered `blob unknown to registry error` for image %s", src)) + } + log.Info(fmt.Sprintf("attempt #%v failed, waiting %vs and then retrying", i, retryWait)) + time.Sleep(time.Duration(retryWait) * time.Second) + retryWait += 5 + } + return []byte{}, err } func getPolicyContext() (*signature.PolicyContext, error) {