From 74dbf38793f116f6344c9222207ebd50674b10ed Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Thu, 5 Jul 2018 16:49:47 -0400 Subject: [PATCH] Add restore failed phase and metrics Signed-off-by: Nolan Brubaker --- pkg/apis/ark/v1/restore.go | 7 +++ pkg/controller/restore_controller.go | 66 ++++++++++++----------- pkg/controller/restore_controller_test.go | 50 ++++++++++++----- pkg/metrics/metrics.go | 32 +++++++---- pkg/util/test/comparisons.go | 2 + 5 files changed, 102 insertions(+), 55 deletions(-) diff --git a/pkg/apis/ark/v1/restore.go b/pkg/apis/ark/v1/restore.go index f8101ea3dc..ac230fca59 100644 --- a/pkg/apis/ark/v1/restore.go +++ b/pkg/apis/ark/v1/restore.go @@ -85,6 +85,10 @@ const ( // RestorePhaseCompleted means the restore has finished executing. // Any relevant warnings or errors will be captured in the Status. RestorePhaseCompleted RestorePhase = "Completed" + + // RestorePhaseFailed means the restore was unable to execute. + // The failing error is recorded in status.FailureReason. + RestorePhaseFailed RestorePhase = "Failed" ) // RestoreStatus captures the current status of an Ark restore @@ -103,6 +107,9 @@ type RestoreStatus struct { // Errors is a count of all error messages that were generated during // execution of the restore. The actual errors are stored in object storage. Errors int `json:"errors"` + + // FailureReason is an error that caused the entire restore to fail. + FailureReason string `json:"failureReason"` } // RestoreResult is a collection of messages that were generated diff --git a/pkg/controller/restore_controller.go b/pkg/controller/restore_controller.go index 898a6f327e..6eae4ee55e 100644 --- a/pkg/controller/restore_controller.go +++ b/pkg/controller/restore_controller.go @@ -259,28 +259,17 @@ func (c *restoreController) processRestore(key string) error { // don't modify items in the cache restore = restore.DeepCopy() - excludedResources := sets.NewString(restore.Spec.ExcludedResources...) - for _, nonrestorable := range nonRestorableResources { - if !excludedResources.Has(nonrestorable) { - restore.Spec.ExcludedResources = append(restore.Spec.ExcludedResources, nonrestorable) - } - } - - backup, fetchErr := c.fetchBackup(c.bucket, restore.Spec.BackupName) - backupScheduleName := "" - if backup != nil { - backupScheduleName = backup.GetLabels()["ark-schedule"] - } - // Register attempts before we do validation so we can get better tracking - c.metrics.RegisterRestoreAttempt(backupScheduleName) - // validation - if restore.Status.ValidationErrors = c.completeAndValidate(restore, fetchErr); len(restore.Status.ValidationErrors) > 0 { + if restore.Status.ValidationErrors = c.completeAndValidate(restore); len(restore.Status.ValidationErrors) > 0 { restore.Status.Phase = api.RestorePhaseFailedValidation } else { restore.Status.Phase = api.RestorePhaseInProgress } + backupScheduleName := restore.Spec.ScheduleName + // Register attempts after validation so we don't have to fetch the backup multiple times + c.metrics.RegisterRestoreAttempt(backupScheduleName) + // update status updatedRestore, err := patchRestore(original, restore, c.restoreClient) if err != nil { @@ -296,7 +285,7 @@ func (c *restoreController) processRestore(key string) error { } logContext.Debug("Running restore") // execution & upload of restore - restoreWarnings, restoreErrors := c.runRestore(restore, c.bucket, backup) + restoreWarnings, restoreErrors, restoreFailure := c.runRestore(restore, c.bucket) restore.Status.Warnings = len(restoreWarnings.Ark) + len(restoreWarnings.Cluster) for _, w := range restoreWarnings.Namespaces { @@ -307,15 +296,19 @@ func (c *restoreController) processRestore(key string) error { for _, e := range restoreErrors.Namespaces { restore.Status.Errors += len(e) } - if restore.Status.Errors > 0 { - c.metrics.RegisterRestoreIncomplete(backupScheduleName) + + if restoreFailure != nil { + logContext.Debug("restore failed") + restore.Status.Phase = api.RestorePhaseFailed + restore.Status.FailureReason = restoreFailure.Error() + c.metrics.RegisterRestoreFailed(backupScheduleName) } else { + logContext.Debug("restore completed") + // We got through the restore process without failing validation or restore execution + restore.Status.Phase = api.RestorePhaseCompleted c.metrics.RegisterRestoreSuccess(backupScheduleName) } - logContext.Debug("restore completed") - restore.Status.Phase = api.RestorePhaseCompleted - logContext.Debug("Updating Restore final status") if _, err = patchRestore(original, restore, c.restoreClient); err != nil { logContext.WithError(errors.WithStack(err)).Info("Error updating Restore final status") @@ -324,7 +317,7 @@ func (c *restoreController) processRestore(key string) error { return nil } -func (c *restoreController) completeAndValidate(restore *api.Restore, fetchErr error) []string { +func (c *restoreController) completeAndValidate(restore *api.Restore) []string { // add non-restorable resources to restore's excluded resources excludedResources := sets.NewString(restore.Spec.ExcludedResources...) for _, nonrestorable := range nonRestorableResources { @@ -334,12 +327,6 @@ func (c *restoreController) completeAndValidate(restore *api.Restore, fetchErr e } var validationErrors []string - if restore.Spec.BackupName == "" { - validationErrors = append(validationErrors, "BackupName must be non-empty and correspond to the name of a backup in object storage.") - } else if fetchErr != nil { - validationErrors = append(validationErrors, fmt.Sprintf("Error retrieving backup: %v", fetchErr)) - } - // validate that included resources don't contain any non-restorable resources includedResources := sets.NewString(restore.Spec.IncludedResources...) for _, nonRestorableResource := range nonRestorableResources { @@ -390,11 +377,19 @@ func (c *restoreController) completeAndValidate(restore *api.Restore, fetchErr e } } - // validate that we can fetch the source backup - if _, err := c.fetchBackup(c.bucket, restore.Spec.BackupName); err != nil { + var ( + backup *api.Backup + err error + ) + if backup, err = c.fetchBackup(c.bucket, restore.Spec.BackupName); err != nil { return append(validationErrors, fmt.Sprintf("Error retrieving backup: %v", err)) } + // Fill in the ScheduleName so it's easier to consume for metrics. + if restore.Spec.ScheduleName == "" { + restore.Spec.ScheduleName = backup.GetLabels()["ark-schedule"] + } + return validationErrors } @@ -462,7 +457,7 @@ func (c *restoreController) fetchBackup(bucket, name string) (*api.Backup, error return backup, nil } -func (c *restoreController) runRestore(restore *api.Restore, bucket string, backup *api.Backup) (restoreWarnings, restoreErrors api.RestoreResult) { +func (c *restoreController) runRestore(restore *api.Restore, bucket string) (restoreWarnings, restoreErrors api.RestoreResult, restoreFailure error) { logContext := c.logger.WithFields( logrus.Fields{ "restore": kubeutil.NamespaceAndName(restore), @@ -482,6 +477,7 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back if err != nil { logContext.WithError(err).Error("Error downloading backup") restoreErrors.Ark = append(restoreErrors.Ark, err.Error()) + restoreFailure = err return } tempFiles = append(tempFiles, backupFile) @@ -490,6 +486,7 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back if err != nil { logContext.WithError(errors.WithStack(err)).Error("Error creating log temp file") restoreErrors.Ark = append(restoreErrors.Ark, err.Error()) + restoreFailure = err return } tempFiles = append(tempFiles, logFile) @@ -498,6 +495,7 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back if err != nil { logContext.WithError(errors.WithStack(err)).Error("Error creating results temp file") restoreErrors.Ark = append(restoreErrors.Ark, err.Error()) + restoreFailure = err return } tempFiles = append(tempFiles, resultsFile) @@ -506,10 +504,12 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back for _, file := range tempFiles { if err := file.Close(); err != nil { logContext.WithError(errors.WithStack(err)).WithField("file", file.Name()).Error("Error closing file") + restoreFailure = err } if err := os.Remove(file.Name()); err != nil { logContext.WithError(errors.WithStack(err)).WithField("file", file.Name()).Error("Error removing file") + restoreFailure = err } } }() @@ -521,6 +521,8 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back } defer c.pluginManager.CloseRestoreItemActions(restore.Name) + // Any return statement above this line means a total restore failure + // Some failures after this line *may* be a total restore failure logContext.Info("starting restore") restoreWarnings, restoreErrors = c.restorer.Restore(restore, backup, backupFile, logFile, actions) logContext.Info("restore completed") diff --git a/pkg/controller/restore_controller_test.go b/pkg/controller/restore_controller_test.go index 8674741414..b1b5aa53cc 100644 --- a/pkg/controller/restore_controller_test.go +++ b/pkg/controller/restore_controller_test.go @@ -120,19 +120,21 @@ func TestFetchBackup(t *testing.T) { func TestProcessRestore(t *testing.T) { tests := []struct { - name string - restoreKey string - restore *api.Restore - backup *api.Backup - restorerError error - allowRestoreSnapshots bool - expectedErr bool - expectedPhase string - expectedValidationErrors []string - expectedRestoreErrors int - expectedRestorerCall *api.Restore - backupServiceGetBackupError error - uploadLogError error + name string + restoreKey string + restore *api.Restore + backup *api.Backup + restorerError error + allowRestoreSnapshots bool + expectedErr bool + expectedPhase string + expectedValidationErrors []string + expectedRestoreErrors int + expectedRestorerCall *api.Restore + backupServiceGetBackupError error + uploadLogError error + backupServiceDownloadBackupError error + expectedFinalPhase string }{ { name: "invalid key returns error", @@ -300,6 +302,14 @@ func TestProcessRestore(t *testing.T) { "Invalid included/excluded resource lists: excludes list cannot contain an item in the includes list: restores.ark.heptio.com", }, }, + { + name: "backup download error results in failed restore", + restore: NewRestore("foo", "bar", "backup-1", "ns-1", "", api.RestorePhaseNew).Restore, + expectedPhase: string(api.RestorePhaseInProgress), + expectedFinalPhase: string(api.RestorePhaseFailed), + backupServiceDownloadBackupError: errors.New("Couldn't download backup"), + backup: arktest.NewTestBackup().WithName("backup-1").Backup, + }, } for _, test := range tests { @@ -403,6 +413,10 @@ func TestProcessRestore(t *testing.T) { backupSvc.On("GetBackup", "bucket", mock.Anything).Return(nil, test.backupServiceGetBackupError) } + if test.backupServiceDownloadBackupError != nil { + backupSvc.On("DownloadBackup", "bucket", test.restore.Spec.BackupName).Return(nil, test.backupServiceDownloadBackupError) + } + if test.restore != nil { pluginManager.On("GetRestoreItemActions", test.restore.Name).Return(nil, nil) pluginManager.On("CloseRestoreItemActions", test.restore.Name).Return(nil) @@ -477,6 +491,15 @@ func TestProcessRestore(t *testing.T) { Errors: test.expectedRestoreErrors, }, } + // Override our default expectations if the case requires it + if test.expectedFinalPhase != "" { + expected = Patch{ + Status: StatusPatch{ + Phase: api.RestorePhaseCompleted, + Errors: test.expectedRestoreErrors, + }, + } + } arktest.ValidatePatch(t, actions[1], expected, decode) @@ -507,6 +530,7 @@ func TestCompleteAndValidateWhenScheduleNameSpecified(t *testing.T) { false, logger, nil, + nil, ).(*restoreController) restore := &api.Restore{ diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index a95dc059cd..1a4c6c7e62 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -38,7 +38,7 @@ const ( restoreAttemptTotal = "restore_attempt_total" restoreValidationFailedTotal = "restore_validation_failed_total" restoreSuccessTotal = "restore_success_total" - restoreIncompleteTotal = "restore_incomplete_total" + restoreFailedTotal = "restore_failed_total" scheduleLabel = "schedule" backupNameLabel = "backupName" @@ -117,11 +117,11 @@ func NewServerMetrics() *ServerMetrics { }, []string{scheduleLabel}, ), - restoreIncompleteTotal: prometheus.NewCounterVec( + restoreFailedTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metricNamespace, - Name: restoreIncompleteTotal, - Help: "Total number of incomplete restores", + Name: restoreFailedTotal, + Help: "Total number of failed restores", }, []string{scheduleLabel}, ), @@ -129,7 +129,7 @@ func NewServerMetrics() *ServerMetrics { prometheus.CounterOpts{ Namespace: metricNamespace, Name: restoreValidationFailedTotal, - Help: "Total number of failed restore validations", + Help: "Total number of failed restores failing validations", }, []string{scheduleLabel}, ), @@ -154,6 +154,18 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) { if c, ok := m.metrics[backupFailureCount].(*prometheus.CounterVec); ok { c.WithLabelValues(scheduleName).Set(0) } + if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(scheduleName).Set(0) + } + if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(scheduleName).Set(0) + } + if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(scheduleName).Set(0) + } + if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(scheduleName).Set(0) + } } // SetBackupTarballSizeBytesGauge records the size, in bytes, of a backup tarball. @@ -204,21 +216,21 @@ func (m *ServerMetrics) RegisterRestoreAttempt(backupSchedule string) { } } -// RegisterRestoreSuccess records a successful completion of a restore. +// RegisterRestoreSuccess records a successful (maybe partial) completion of a restore. func (m *ServerMetrics) RegisterRestoreSuccess(backupSchedule string) { if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok { c.WithLabelValues(backupSchedule).Inc() } } -// RegisterRestoreIncomplete records a restore that finished with errors. -func (m *ServerMetrics) RegisterRestoreIncomplete(backupSchedule string) { - if c, ok := m.metrics[restoreIncompleteTotal].(*prometheus.CounterVec); ok { +// RegisterRestoreFailed records a restore that failed. +func (m *ServerMetrics) RegisterRestoreFailed(backupSchedule string) { + if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok { c.WithLabelValues(backupSchedule).Inc() } } -// RegisterRestoreValidationFailed records a failed restore. +// RegisterRestoreValidationFailed records a restore that failed validation. func (m *ServerMetrics) RegisterRestoreValidationFailed(backupSchedule string) { if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok { c.WithLabelValues(backupSchedule).Inc() diff --git a/pkg/util/test/comparisons.go b/pkg/util/test/comparisons.go index 68b7a2edcb..5e15e0b091 100644 --- a/pkg/util/test/comparisons.go +++ b/pkg/util/test/comparisons.go @@ -103,6 +103,8 @@ func AssertDeepEqual(t *testing.T, expected, actual interface{}) bool { } if !equality.Semantic.DeepEqual(expected, actual) { + fmt.Printf("expected = %+v\n", expected) + fmt.Printf("actual = %+v\n", actual) return assert.Fail(t, fmt.Sprintf("Objects not equal")) }