Skip to content

Commit

Permalink
Add restore failed phase and metrics
Browse files Browse the repository at this point in the history
Signed-off-by: Nolan Brubaker <[email protected]>
  • Loading branch information
Nolan Brubaker committed Jul 27, 2018
1 parent 7cebfe2 commit 74dbf38
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 55 deletions.
7 changes: 7 additions & 0 deletions pkg/apis/ark/v1/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ const (
// RestorePhaseCompleted means the restore has finished executing.
// Any relevant warnings or errors will be captured in the Status.
RestorePhaseCompleted RestorePhase = "Completed"

// RestorePhaseFailed means the restore was unable to execute.
// The failing error is recorded in status.FailureReason.
RestorePhaseFailed RestorePhase = "Failed"
)

// RestoreStatus captures the current status of an Ark restore
Expand All @@ -103,6 +107,9 @@ type RestoreStatus struct {
// Errors is a count of all error messages that were generated during
// execution of the restore. The actual errors are stored in object storage.
Errors int `json:"errors"`

// FailureReason is an error that caused the entire restore to fail.
FailureReason string `json:"failureReason"`
}

// RestoreResult is a collection of messages that were generated
Expand Down
66 changes: 34 additions & 32 deletions pkg/controller/restore_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,28 +259,17 @@ func (c *restoreController) processRestore(key string) error {
// don't modify items in the cache
restore = restore.DeepCopy()

excludedResources := sets.NewString(restore.Spec.ExcludedResources...)
for _, nonrestorable := range nonRestorableResources {
if !excludedResources.Has(nonrestorable) {
restore.Spec.ExcludedResources = append(restore.Spec.ExcludedResources, nonrestorable)
}
}

backup, fetchErr := c.fetchBackup(c.bucket, restore.Spec.BackupName)
backupScheduleName := ""
if backup != nil {
backupScheduleName = backup.GetLabels()["ark-schedule"]
}
// Register attempts before we do validation so we can get better tracking
c.metrics.RegisterRestoreAttempt(backupScheduleName)

// validation
if restore.Status.ValidationErrors = c.completeAndValidate(restore, fetchErr); len(restore.Status.ValidationErrors) > 0 {
if restore.Status.ValidationErrors = c.completeAndValidate(restore); len(restore.Status.ValidationErrors) > 0 {
restore.Status.Phase = api.RestorePhaseFailedValidation
} else {
restore.Status.Phase = api.RestorePhaseInProgress
}

backupScheduleName := restore.Spec.ScheduleName
// Register attempts after validation so we don't have to fetch the backup multiple times
c.metrics.RegisterRestoreAttempt(backupScheduleName)

// update status
updatedRestore, err := patchRestore(original, restore, c.restoreClient)
if err != nil {
Expand All @@ -296,7 +285,7 @@ func (c *restoreController) processRestore(key string) error {
}
logContext.Debug("Running restore")
// execution & upload of restore
restoreWarnings, restoreErrors := c.runRestore(restore, c.bucket, backup)
restoreWarnings, restoreErrors, restoreFailure := c.runRestore(restore, c.bucket)

restore.Status.Warnings = len(restoreWarnings.Ark) + len(restoreWarnings.Cluster)
for _, w := range restoreWarnings.Namespaces {
Expand All @@ -307,15 +296,19 @@ func (c *restoreController) processRestore(key string) error {
for _, e := range restoreErrors.Namespaces {
restore.Status.Errors += len(e)
}
if restore.Status.Errors > 0 {
c.metrics.RegisterRestoreIncomplete(backupScheduleName)

if restoreFailure != nil {
logContext.Debug("restore failed")
restore.Status.Phase = api.RestorePhaseFailed
restore.Status.FailureReason = restoreFailure.Error()
c.metrics.RegisterRestoreFailed(backupScheduleName)
} else {
logContext.Debug("restore completed")
// We got through the restore process without failing validation or restore execution
restore.Status.Phase = api.RestorePhaseCompleted
c.metrics.RegisterRestoreSuccess(backupScheduleName)
}

logContext.Debug("restore completed")
restore.Status.Phase = api.RestorePhaseCompleted

logContext.Debug("Updating Restore final status")
if _, err = patchRestore(original, restore, c.restoreClient); err != nil {
logContext.WithError(errors.WithStack(err)).Info("Error updating Restore final status")
Expand All @@ -324,7 +317,7 @@ func (c *restoreController) processRestore(key string) error {
return nil
}

func (c *restoreController) completeAndValidate(restore *api.Restore, fetchErr error) []string {
func (c *restoreController) completeAndValidate(restore *api.Restore) []string {
// add non-restorable resources to restore's excluded resources
excludedResources := sets.NewString(restore.Spec.ExcludedResources...)
for _, nonrestorable := range nonRestorableResources {
Expand All @@ -334,12 +327,6 @@ func (c *restoreController) completeAndValidate(restore *api.Restore, fetchErr e
}
var validationErrors []string

if restore.Spec.BackupName == "" {
validationErrors = append(validationErrors, "BackupName must be non-empty and correspond to the name of a backup in object storage.")
} else if fetchErr != nil {
validationErrors = append(validationErrors, fmt.Sprintf("Error retrieving backup: %v", fetchErr))
}

// validate that included resources don't contain any non-restorable resources
includedResources := sets.NewString(restore.Spec.IncludedResources...)
for _, nonRestorableResource := range nonRestorableResources {
Expand Down Expand Up @@ -390,11 +377,19 @@ func (c *restoreController) completeAndValidate(restore *api.Restore, fetchErr e
}
}

// validate that we can fetch the source backup
if _, err := c.fetchBackup(c.bucket, restore.Spec.BackupName); err != nil {
var (
backup *api.Backup
err error
)
if backup, err = c.fetchBackup(c.bucket, restore.Spec.BackupName); err != nil {
return append(validationErrors, fmt.Sprintf("Error retrieving backup: %v", err))
}

// Fill in the ScheduleName so it's easier to consume for metrics.
if restore.Spec.ScheduleName == "" {
restore.Spec.ScheduleName = backup.GetLabels()["ark-schedule"]
}

return validationErrors
}

Expand Down Expand Up @@ -462,7 +457,7 @@ func (c *restoreController) fetchBackup(bucket, name string) (*api.Backup, error
return backup, nil
}

func (c *restoreController) runRestore(restore *api.Restore, bucket string, backup *api.Backup) (restoreWarnings, restoreErrors api.RestoreResult) {
func (c *restoreController) runRestore(restore *api.Restore, bucket string) (restoreWarnings, restoreErrors api.RestoreResult, restoreFailure error) {
logContext := c.logger.WithFields(
logrus.Fields{
"restore": kubeutil.NamespaceAndName(restore),
Expand All @@ -482,6 +477,7 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
if err != nil {
logContext.WithError(err).Error("Error downloading backup")
restoreErrors.Ark = append(restoreErrors.Ark, err.Error())
restoreFailure = err
return
}
tempFiles = append(tempFiles, backupFile)
Expand All @@ -490,6 +486,7 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
if err != nil {
logContext.WithError(errors.WithStack(err)).Error("Error creating log temp file")
restoreErrors.Ark = append(restoreErrors.Ark, err.Error())
restoreFailure = err
return
}
tempFiles = append(tempFiles, logFile)
Expand All @@ -498,6 +495,7 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
if err != nil {
logContext.WithError(errors.WithStack(err)).Error("Error creating results temp file")
restoreErrors.Ark = append(restoreErrors.Ark, err.Error())
restoreFailure = err
return
}
tempFiles = append(tempFiles, resultsFile)
Expand All @@ -506,10 +504,12 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
for _, file := range tempFiles {
if err := file.Close(); err != nil {
logContext.WithError(errors.WithStack(err)).WithField("file", file.Name()).Error("Error closing file")
restoreFailure = err
}

if err := os.Remove(file.Name()); err != nil {
logContext.WithError(errors.WithStack(err)).WithField("file", file.Name()).Error("Error removing file")
restoreFailure = err
}
}
}()
Expand All @@ -521,6 +521,8 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
}
defer c.pluginManager.CloseRestoreItemActions(restore.Name)

// Any return statement above this line means a total restore failure
// Some failures after this line *may* be a total restore failure
logContext.Info("starting restore")
restoreWarnings, restoreErrors = c.restorer.Restore(restore, backup, backupFile, logFile, actions)
logContext.Info("restore completed")
Expand Down
50 changes: 37 additions & 13 deletions pkg/controller/restore_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,19 +120,21 @@ func TestFetchBackup(t *testing.T) {

func TestProcessRestore(t *testing.T) {
tests := []struct {
name string
restoreKey string
restore *api.Restore
backup *api.Backup
restorerError error
allowRestoreSnapshots bool
expectedErr bool
expectedPhase string
expectedValidationErrors []string
expectedRestoreErrors int
expectedRestorerCall *api.Restore
backupServiceGetBackupError error
uploadLogError error
name string
restoreKey string
restore *api.Restore
backup *api.Backup
restorerError error
allowRestoreSnapshots bool
expectedErr bool
expectedPhase string
expectedValidationErrors []string
expectedRestoreErrors int
expectedRestorerCall *api.Restore
backupServiceGetBackupError error
uploadLogError error
backupServiceDownloadBackupError error
expectedFinalPhase string
}{
{
name: "invalid key returns error",
Expand Down Expand Up @@ -300,6 +302,14 @@ func TestProcessRestore(t *testing.T) {
"Invalid included/excluded resource lists: excludes list cannot contain an item in the includes list: restores.ark.heptio.com",
},
},
{
name: "backup download error results in failed restore",
restore: NewRestore("foo", "bar", "backup-1", "ns-1", "", api.RestorePhaseNew).Restore,
expectedPhase: string(api.RestorePhaseInProgress),
expectedFinalPhase: string(api.RestorePhaseFailed),
backupServiceDownloadBackupError: errors.New("Couldn't download backup"),
backup: arktest.NewTestBackup().WithName("backup-1").Backup,
},
}

for _, test := range tests {
Expand Down Expand Up @@ -403,6 +413,10 @@ func TestProcessRestore(t *testing.T) {
backupSvc.On("GetBackup", "bucket", mock.Anything).Return(nil, test.backupServiceGetBackupError)
}

if test.backupServiceDownloadBackupError != nil {
backupSvc.On("DownloadBackup", "bucket", test.restore.Spec.BackupName).Return(nil, test.backupServiceDownloadBackupError)
}

if test.restore != nil {
pluginManager.On("GetRestoreItemActions", test.restore.Name).Return(nil, nil)
pluginManager.On("CloseRestoreItemActions", test.restore.Name).Return(nil)
Expand Down Expand Up @@ -477,6 +491,15 @@ func TestProcessRestore(t *testing.T) {
Errors: test.expectedRestoreErrors,
},
}
// Override our default expectations if the case requires it
if test.expectedFinalPhase != "" {
expected = Patch{
Status: StatusPatch{
Phase: api.RestorePhaseCompleted,
Errors: test.expectedRestoreErrors,
},
}
}

arktest.ValidatePatch(t, actions[1], expected, decode)

Expand Down Expand Up @@ -507,6 +530,7 @@ func TestCompleteAndValidateWhenScheduleNameSpecified(t *testing.T) {
false,
logger,
nil,
nil,
).(*restoreController)

restore := &api.Restore{
Expand Down
32 changes: 22 additions & 10 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ const (
restoreAttemptTotal = "restore_attempt_total"
restoreValidationFailedTotal = "restore_validation_failed_total"
restoreSuccessTotal = "restore_success_total"
restoreIncompleteTotal = "restore_incomplete_total"
restoreFailedTotal = "restore_failed_total"

scheduleLabel = "schedule"
backupNameLabel = "backupName"
Expand Down Expand Up @@ -117,19 +117,19 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel},
),
restoreIncompleteTotal: prometheus.NewCounterVec(
restoreFailedTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: restoreIncompleteTotal,
Help: "Total number of incomplete restores",
Name: restoreFailedTotal,
Help: "Total number of failed restores",
},
[]string{scheduleLabel},
),
restoreValidationFailedTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: restoreValidationFailedTotal,
Help: "Total number of failed restore validations",
Help: "Total number of failed restores failing validations",
},
[]string{scheduleLabel},
),
Expand All @@ -154,6 +154,18 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
if c, ok := m.metrics[backupFailureCount].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
}

// SetBackupTarballSizeBytesGauge records the size, in bytes, of a backup tarball.
Expand Down Expand Up @@ -204,21 +216,21 @@ func (m *ServerMetrics) RegisterRestoreAttempt(backupSchedule string) {
}
}

// RegisterRestoreSuccess records a successful completion of a restore.
// RegisterRestoreSuccess records a successful (maybe partial) completion of a restore.
func (m *ServerMetrics) RegisterRestoreSuccess(backupSchedule string) {
if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(backupSchedule).Inc()
}
}

// RegisterRestoreIncomplete records a restore that finished with errors.
func (m *ServerMetrics) RegisterRestoreIncomplete(backupSchedule string) {
if c, ok := m.metrics[restoreIncompleteTotal].(*prometheus.CounterVec); ok {
// RegisterRestoreFailed records a restore that failed.
func (m *ServerMetrics) RegisterRestoreFailed(backupSchedule string) {
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(backupSchedule).Inc()
}
}

// RegisterRestoreValidationFailed records a failed restore.
// RegisterRestoreValidationFailed records a restore that failed validation.
func (m *ServerMetrics) RegisterRestoreValidationFailed(backupSchedule string) {
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(backupSchedule).Inc()
Expand Down
2 changes: 2 additions & 0 deletions pkg/util/test/comparisons.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ func AssertDeepEqual(t *testing.T, expected, actual interface{}) bool {
}

if !equality.Semantic.DeepEqual(expected, actual) {
fmt.Printf("expected = %+v\n", expected)
fmt.Printf("actual = %+v\n", actual)
return assert.Fail(t, fmt.Sprintf("Objects not equal"))
}

Expand Down

0 comments on commit 74dbf38

Please sign in to comment.