diff --git a/pkg/cmd/server/server.go b/pkg/cmd/server/server.go index 59e9e0f9a7..a6b4ffeb52 100644 --- a/pkg/cmd/server/server.go +++ b/pkg/cmd/server/server.go @@ -725,6 +725,7 @@ func (s *server) runControllers(config *api.Config) error { s.snapshotService != nil, s.logger, s.pluginManager, + s.metrics, ) wg.Add(1) go func() { diff --git a/pkg/controller/restore_controller.go b/pkg/controller/restore_controller.go index 7af63bf814..898a6f327e 100644 --- a/pkg/controller/restore_controller.go +++ b/pkg/controller/restore_controller.go @@ -45,6 +45,7 @@ import ( arkv1client "github.com/heptio/ark/pkg/generated/clientset/versioned/typed/ark/v1" informers "github.com/heptio/ark/pkg/generated/informers/externalversions/ark/v1" listers "github.com/heptio/ark/pkg/generated/listers/ark/v1" + "github.com/heptio/ark/pkg/metrics" "github.com/heptio/ark/pkg/plugin" "github.com/heptio/ark/pkg/restore" "github.com/heptio/ark/pkg/util/boolptr" @@ -84,6 +85,7 @@ type restoreController struct { queue workqueue.RateLimitingInterface logger logrus.FieldLogger pluginManager plugin.Manager + metrics *metrics.ServerMetrics } func NewRestoreController( @@ -98,6 +100,7 @@ func NewRestoreController( pvProviderExists bool, logger logrus.FieldLogger, pluginManager plugin.Manager, + metrics *metrics.ServerMetrics, ) Interface { c := &restoreController{ namespace: namespace, @@ -114,6 +117,7 @@ func NewRestoreController( queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "restore"), logger: logger, pluginManager: pluginManager, + metrics: metrics, } c.syncHandler = c.processRestore @@ -255,8 +259,23 @@ func (c *restoreController) processRestore(key string) error { // don't modify items in the cache restore = restore.DeepCopy() - // complete & validate restore - if restore.Status.ValidationErrors = c.completeAndValidate(restore); len(restore.Status.ValidationErrors) > 0 { + excludedResources := sets.NewString(restore.Spec.ExcludedResources...) 
+ for _, nonrestorable := range nonRestorableResources { + if !excludedResources.Has(nonrestorable) { + restore.Spec.ExcludedResources = append(restore.Spec.ExcludedResources, nonrestorable) + } + } + + backup, fetchErr := c.fetchBackup(c.bucket, restore.Spec.BackupName) + backupScheduleName := "" + if backup != nil { + backupScheduleName = backup.GetLabels()["ark-schedule"] + } + // Register attempts before we do validation so we can get better tracking + c.metrics.RegisterRestoreAttempt(backupScheduleName) + + // validation + if restore.Status.ValidationErrors = c.completeAndValidate(restore, fetchErr); len(restore.Status.ValidationErrors) > 0 { restore.Status.Phase = api.RestorePhaseFailedValidation } else { restore.Status.Phase = api.RestorePhaseInProgress @@ -272,12 +291,12 @@ func (c *restoreController) processRestore(key string) error { restore = updatedRestore.DeepCopy() if restore.Status.Phase == api.RestorePhaseFailedValidation { + c.metrics.RegisterRestoreValidationFailed(backupScheduleName) return nil } - logContext.Debug("Running restore") // execution & upload of restore - restoreWarnings, restoreErrors := c.runRestore(restore, c.bucket) + restoreWarnings, restoreErrors := c.runRestore(restore, c.bucket, backup) restore.Status.Warnings = len(restoreWarnings.Ark) + len(restoreWarnings.Cluster) for _, w := range restoreWarnings.Namespaces { @@ -288,6 +307,11 @@ func (c *restoreController) processRestore(key string) error { for _, e := range restoreErrors.Namespaces { restore.Status.Errors += len(e) } + if restore.Status.Errors > 0 { + c.metrics.RegisterRestoreIncomplete(backupScheduleName) + } else { + c.metrics.RegisterRestoreSuccess(backupScheduleName) + } logContext.Debug("restore completed") restore.Status.Phase = api.RestorePhaseCompleted @@ -300,7 +324,7 @@ func (c *restoreController) processRestore(key string) error { return nil } -func (c *restoreController) completeAndValidate(restore *api.Restore) []string { +func (c *restoreController) 
completeAndValidate(restore *api.Restore, fetchErr error) []string { // add non-restorable resources to restore's excluded resources excludedResources := sets.NewString(restore.Spec.ExcludedResources...) for _, nonrestorable := range nonRestorableResources { @@ -308,9 +332,14 @@ func (c *restoreController) completeAndValidate(restore *api.Restore) []string { restore.Spec.ExcludedResources = append(restore.Spec.ExcludedResources, nonrestorable) } } - var validationErrors []string + if restore.Spec.BackupName == "" { + validationErrors = append(validationErrors, "BackupName must be non-empty and correspond to the name of a backup in object storage.") + } else if fetchErr != nil { + validationErrors = append(validationErrors, fmt.Sprintf("Error retrieving backup: %v", fetchErr)) + } + // validate that included resources don't contain any non-restorable resources includedResources := sets.NewString(restore.Spec.IncludedResources...) for _, nonRestorableResource := range nonRestorableResources { @@ -433,7 +462,7 @@ func (c *restoreController) fetchBackup(bucket, name string) (*api.Backup, error return backup, nil } -func (c *restoreController) runRestore(restore *api.Restore, bucket string) (restoreWarnings, restoreErrors api.RestoreResult) { +func (c *restoreController) runRestore(restore *api.Restore, bucket string, backup *api.Backup) (restoreWarnings, restoreErrors api.RestoreResult) { logContext := c.logger.WithFields( logrus.Fields{ "restore": kubeutil.NamespaceAndName(restore), diff --git a/pkg/controller/restore_controller_test.go b/pkg/controller/restore_controller_test.go index a9a15c5047..8674741414 100644 --- a/pkg/controller/restore_controller_test.go +++ b/pkg/controller/restore_controller_test.go @@ -37,6 +37,7 @@ import ( api "github.com/heptio/ark/pkg/apis/ark/v1" "github.com/heptio/ark/pkg/generated/clientset/versioned/fake" informers "github.com/heptio/ark/pkg/generated/informers/externalversions" + "github.com/heptio/ark/pkg/metrics" 
"github.com/heptio/ark/pkg/restore" "github.com/heptio/ark/pkg/util/collections" arktest "github.com/heptio/ark/pkg/util/test" @@ -95,6 +96,7 @@ func TestFetchBackup(t *testing.T) { false, logger, pluginManager, + metrics.NewServerMetrics(), ).(*restoreController) for _, itm := range test.informerBackups { @@ -326,6 +328,7 @@ func TestProcessRestore(t *testing.T) { test.allowRestoreSnapshots, logger, pluginManager, + metrics.NewServerMetrics(), ).(*restoreController) if test.restore != nil { @@ -410,7 +413,6 @@ func TestProcessRestore(t *testing.T) { restorer.AssertExpectations(t) assert.Equal(t, test.expectedErr, err != nil, "got error %v", err) - actions := client.Actions() if test.expectedPhase == "" { diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 75115c5643..a95dc059cd 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -30,12 +30,18 @@ type ServerMetrics struct { const ( metricNamespace = "ark" backupTarballSizeBytesGauge = "backup_tarball_size_bytes" - backupAttemptCount = "backup_attempt_total" - backupSuccessCount = "backup_success_total" - backupFailureCount = "backup_failure_total" - backupDurationSeconds = "backup_duration_seconds" - - scheduleLabel = "schedule" + // TODO: Rename the Count variables to match their strings + backupAttemptCount = "backup_attempt_total" + backupSuccessCount = "backup_success_total" + backupFailureCount = "backup_failure_total" + backupDurationSeconds = "backup_duration_seconds" + restoreAttemptTotal = "restore_attempt_total" + restoreValidationFailedTotal = "restore_validation_failed_total" + restoreSuccessTotal = "restore_success_total" + restoreIncompleteTotal = "restore_incomplete_total" + + scheduleLabel = "schedule" + backupNameLabel = "backupName" secondsInMinute = 60.0 ) @@ -95,6 +101,38 @@ func NewServerMetrics() *ServerMetrics { }, []string{scheduleLabel}, ), + restoreAttemptTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Name: 
restoreAttemptTotal, + Help: "Total number of attempted restores", + }, + []string{scheduleLabel}, + ), + restoreSuccessTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Name: restoreSuccessTotal, + Help: "Total number of successful restores", + }, + []string{scheduleLabel}, + ), + restoreIncompleteTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Name: restoreIncompleteTotal, + Help: "Total number of incomplete restores", + }, + []string{scheduleLabel}, + ), + restoreValidationFailedTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Name: restoreValidationFailedTotal, + Help: "Total number of failed restore validations", + }, + []string{scheduleLabel}, + ), }, } } @@ -158,3 +196,31 @@ func (m *ServerMetrics) RegisterBackupDuration(backupSchedule string, seconds fl func toSeconds(d time.Duration) float64 { return float64(d / time.Second) } + +// RegisterRestoreAttempt records an attempt to restore a backup. +func (m *ServerMetrics) RegisterRestoreAttempt(backupSchedule string) { + if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(backupSchedule).Inc() + } +} + +// RegisterRestoreSuccess records a successful completion of a restore. +func (m *ServerMetrics) RegisterRestoreSuccess(backupSchedule string) { + if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(backupSchedule).Inc() + } +} + +// RegisterRestoreIncomplete records a restore that finished with errors. +func (m *ServerMetrics) RegisterRestoreIncomplete(backupSchedule string) { + if c, ok := m.metrics[restoreIncompleteTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(backupSchedule).Inc() + } +} + +// RegisterRestoreValidationFailed records a restore that failed validation. 
+func (m *ServerMetrics) RegisterRestoreValidationFailed(backupSchedule string) { + if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(backupSchedule).Inc() + } +}