From 62c1122f17baaa854b8d83bef1015fea4c490550 Mon Sep 17 00:00:00 2001 From: Dylan Murray Date: Mon, 8 Jun 2020 16:25:11 -0400 Subject: [PATCH 1/6] Add retry logic for image copying --- velero-plugins/migimagestream/backup.go | 6 ++--- velero-plugins/migimagestream/restore.go | 6 ++--- velero-plugins/migimagestream/shared.go | 33 +++++++++++++++++++----- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/velero-plugins/migimagestream/backup.go b/velero-plugins/migimagestream/backup.go index 877325f..31c859e 100644 --- a/velero-plugins/migimagestream/backup.go +++ b/velero-plugins/migimagestream/backup.go @@ -79,7 +79,7 @@ func (p *BackupPlugin) Execute(item runtime.Unstructured, backup *v1.Backup) (ru p.Log.Info(fmt.Sprintf("[is-backup] copying from: %s", srcPath)) p.Log.Info(fmt.Sprintf("[is-backup] copying to: %s", destPath)) - imgManifest, err := copyImageBackup(srcPath, destPath) + imgManifest, err := copyImageBackup(p.Log, srcPath, destPath) if err != nil { p.Log.Info(fmt.Sprintf("[is-backup] Error copying image: %v", err)) return nil, nil, err @@ -126,7 +126,7 @@ func findStatusTag(tags []imagev1API.NamedTagEventList, name string) *imagev1API return nil } -func copyImageBackup(src, dest string) ([]byte, error) { +func copyImageBackup(log logrus.FieldLogger, src, dest string) ([]byte, error) { sourceCtx, err := internalRegistrySystemContext() if err != nil { return []byte{}, err @@ -135,5 +135,5 @@ func copyImageBackup(src, dest string) ([]byte, error) { if err != nil { return []byte{}, err } - return copyImage(src, dest, sourceCtx, destinationCtx) + return copyImage(log, src, dest, sourceCtx, destinationCtx) } diff --git a/velero-plugins/migimagestream/restore.go b/velero-plugins/migimagestream/restore.go index 93de1b7..e88aa96 100644 --- a/velero-plugins/migimagestream/restore.go +++ b/velero-plugins/migimagestream/restore.go @@ -91,7 +91,7 @@ func (p *RestorePlugin) Execute(input *velero.RestoreItemActionExecuteInput) (*v p.Log.Info(fmt.Sprintf("[is-restore] copying from: %s", srcPath)) p.Log.Info(fmt.Sprintf("[is-restore] copying to: %s", destPath)) - manifest, err := copyImageRestore(srcPath, destPath) + manifest, err := copyImageRestore(p.Log, srcPath, destPath) if err != nil { p.Log.Info(fmt.Sprintf("[is-restore] Error copying image: %v", err)) return nil, err @@ -108,7 +108,7 @@ func (p *RestorePlugin) Execute(input *velero.RestoreItemActionExecuteInput) (*v return velero.NewRestoreItemActionExecuteOutput(input.Item).WithoutRestore(), nil } -func copyImageRestore(src, dest string) ([]byte, error) { +func copyImageRestore(log logrus.FieldLogger, src, dest string) ([]byte, error) { sourceCtx, err := migrationRegistrySystemContext() if err != nil { return []byte{}, err @@ -117,5 +117,5 @@ func copyImageRestore(src, dest string) ([]byte, error) { if err != nil { return []byte{}, err } - return copyImage(src, dest, sourceCtx, destinationCtx) + return copyImage(log, src, dest, sourceCtx, destinationCtx) } diff --git a/velero-plugins/migimagestream/shared.go b/velero-plugins/migimagestream/shared.go index a2d0b1a..c46db79 100644 --- a/velero-plugins/migimagestream/shared.go +++ b/velero-plugins/migimagestream/shared.go @@ -4,17 +4,20 @@ import ( "context" "errors" "fmt" + "strings" + "time" "github.com/containers/image/v5/copy" "github.com/containers/image/v5/signature" "github.com/containers/image/v5/transports/alltransports" "github.com/containers/image/v5/types" imagev1API "github.com/openshift/api/image/v1" + "github.com/sirupsen/logrus" "k8s.io/client-go/rest" ) -func copyImage(src, dest string, sourceCtx, destinationCtx *types.SystemContext) ([]byte, error) { +func copyImage(log logrus.FieldLogger, src, dest string, sourceCtx, destinationCtx *types.SystemContext) ([]byte, error) { policyContext, err := getPolicyContext() if err != nil { return []byte{}, fmt.Errorf("Error loading trust policy: %v", err) @@ -29,11 +32,29 @@ func copyImage(src, dest string, sourceCtx, destinationCtx *types.SystemContext) if err != nil { return []byte{}, fmt.Errorf("Invalid destination name %s: %v", dest, err) } - manifest, err := copy.Image(context.Background(), policyContext, destRef, srcRef, ©.Options{ - SourceCtx: sourceCtx, - DestinationCtx: destinationCtx, - }) - return manifest, err + + // Let's retry the image copy up to 10 times + // Each retry will wait 5 seconds longer + // Let's log a warning if we encounter `blob unknown to registry` + retryWait := 5 + log.Info(fmt.Sprintf("copying image: %s; will attempt up to 10 times...", src)) + for i := 0; i < 9; i++ { + manifest, err := copy.Image(context.Background(), policyContext, destRef, srcRef, ©.Options{ + SourceCtx: sourceCtx, + DestinationCtx: destinationCtx, + }) + if err == nil { + return manifest, err + break + } + if strings.Contains(err.Error(), "blob unknown to registry") { + log.Warn(fmt.Sprintf("encountered `blob unknown to registry error` for image %s", src)) + } + log.Info(fmt.Sprintf("attempt #%s failed, waiting %ss and then retrying", i, retryWait)) + time.Sleep(time.Duration(retryWait) * time.Second) + retryWait += 5 + } + return nil, err } func getPolicyContext() (*signature.PolicyContext, error) { From 0f69df680d27f1df42828a1bbb786c91f452abee Mon Sep 17 00:00:00 2001 From: Dylan Murray Date: Mon, 8 Jun 2020 16:41:42 -0400 Subject: [PATCH 2/6] Remove unnecessary break --- velero-plugins/migimagestream/shared.go | 1 - 1 file changed, 1 deletion(-) diff --git a/velero-plugins/migimagestream/shared.go b/velero-plugins/migimagestream/shared.go index c46db79..3b31c07 100644 --- a/velero-plugins/migimagestream/shared.go +++ b/velero-plugins/migimagestream/shared.go @@ -45,7 +45,6 @@ func copyImage(log logrus.FieldLogger, src, dest string, sourceCtx, destinationC }) if err == nil { return manifest, err - break } if strings.Contains(err.Error(), "blob unknown to registry") { log.Warn(fmt.Sprintf("encountered `blob unknown to registry error` for image %s", src)) From e7853794fb13f1177eace68eea47fa034b66a0b5 Mon Sep 17 00:00:00 2001 From: Dylan Murray Date: Mon, 8 Jun 2020 16:44:58 -0400 Subject: [PATCH 3/6] Change to 5 retries --- velero-plugins/migimagestream/shared.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/velero-plugins/migimagestream/shared.go b/velero-plugins/migimagestream/shared.go index 3b31c07..008d51c 100644 --- a/velero-plugins/migimagestream/shared.go +++ b/velero-plugins/migimagestream/shared.go @@ -37,8 +37,8 @@ func copyImage(log logrus.FieldLogger, src, dest string, sourceCtx, destinationC // Each retry will wait 5 seconds longer // Let's log a warning if we encounter `blob unknown to registry` retryWait := 5 - log.Info(fmt.Sprintf("copying image: %s; will attempt up to 10 times...", src)) - for i := 0; i < 9; i++ { + log.Info(fmt.Sprintf("copying image: %s; will attempt up to 5 times...", src)) + for i := 0; i < 4; i++ { manifest, err := copy.Image(context.Background(), policyContext, destRef, srcRef, ©.Options{ SourceCtx: sourceCtx, DestinationCtx: destinationCtx, From 0724a687fc58b5aa1e729e3a90c2fc932e16e35f Mon Sep 17 00:00:00 2001 From: Dylan Murray Date: Mon, 8 Jun 2020 16:49:52 -0400 Subject: [PATCH 4/6] Use proper format for prints --- velero-plugins/migimagestream/shared.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/velero-plugins/migimagestream/shared.go b/velero-plugins/migimagestream/shared.go index 008d51c..9fda414 100644 --- a/velero-plugins/migimagestream/shared.go +++ b/velero-plugins/migimagestream/shared.go @@ -49,7 +49,7 @@ func copyImage(log logrus.FieldLogger, src, dest string, sourceCtx, destinationC if strings.Contains(err.Error(), "blob unknown to registry") { log.Warn(fmt.Sprintf("encountered `blob unknown to registry error` for image %s", src)) } - log.Info(fmt.Sprintf("attempt #%s failed, waiting %ss and then retrying", i, retryWait)) + log.Info(fmt.Sprintf("attempt #%v failed, waiting %vs and then retrying", i, retryWait)) time.Sleep(time.Duration(retryWait) * time.Second) retryWait += 5 } From f820021d6fe3a38e7d5df9415fd24ae0679642a3 Mon Sep 17 00:00:00 2001 From: Dylan Murray Date: Mon, 8 Jun 2020 16:53:11 -0400 Subject: [PATCH 5/6] Return empty byte array --- velero-plugins/migimagestream/shared.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/velero-plugins/migimagestream/shared.go b/velero-plugins/migimagestream/shared.go index 9fda414..a46318f 100644 --- a/velero-plugins/migimagestream/shared.go +++ b/velero-plugins/migimagestream/shared.go @@ -53,7 +53,7 @@ func copyImage(log logrus.FieldLogger, src, dest string, sourceCtx, destinationC time.Sleep(time.Duration(retryWait) * time.Second) retryWait += 5 } - return nil, err + return []byte{}, err } func getPolicyContext() (*signature.PolicyContext, error) { From c986cfda2f168e32bc842e2e5110d96069d1e00c Mon Sep 17 00:00:00 2001 From: Dylan Murray Date: Mon, 8 Jun 2020 22:42:34 -0400 Subject: [PATCH 6/6] Add todo --- velero-plugins/migimagestream/shared.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/velero-plugins/migimagestream/shared.go b/velero-plugins/migimagestream/shared.go index a46318f..99f0b33 100644 --- a/velero-plugins/migimagestream/shared.go +++ b/velero-plugins/migimagestream/shared.go @@ -36,6 +36,7 @@ func copyImage(log logrus.FieldLogger, src, dest string, sourceCtx, destinationC // Let's retry the image copy up to 10 times // Each retry will wait 5 seconds longer // Let's log a warning if we encounter `blob unknown to registry` + // TODO: Change this to only retry on specific errors from image copy retryWait := 5 log.Info(fmt.Sprintf("copying image: %s; will attempt up to 5 times...", src)) for i := 0; i < 4; i++ { @@ -44,7 +45,7 @@ func copyImage(log logrus.FieldLogger, src, dest string, sourceCtx, destinationC DestinationCtx: destinationCtx, }) if err == nil { - return manifest, err + return manifest, nil } if strings.Contains(err.Error(), "blob unknown to registry") { log.Warn(fmt.Sprintf("encountered `blob unknown to registry error` for image %s", src))