Skip to content

Commit

Permalink
Add restore attempt and success/failure counters
Browse files Browse the repository at this point in the history
Signed-off-by: Nolan Brubaker <[email protected]>
  • Loading branch information
Nolan Brubaker committed Jul 25, 2018
1 parent 39c0300 commit 7cebfe2
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 14 deletions.
1 change: 1 addition & 0 deletions pkg/cmd/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,7 @@ func (s *server) runControllers(config *api.Config) error {
s.snapshotService != nil,
s.logger,
s.pluginManager,
s.metrics,
)
wg.Add(1)
go func() {
Expand Down
43 changes: 36 additions & 7 deletions pkg/controller/restore_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import (
arkv1client "github.com/heptio/ark/pkg/generated/clientset/versioned/typed/ark/v1"
informers "github.com/heptio/ark/pkg/generated/informers/externalversions/ark/v1"
listers "github.com/heptio/ark/pkg/generated/listers/ark/v1"
"github.com/heptio/ark/pkg/metrics"
"github.com/heptio/ark/pkg/plugin"
"github.com/heptio/ark/pkg/restore"
"github.com/heptio/ark/pkg/util/boolptr"
Expand Down Expand Up @@ -84,6 +85,7 @@ type restoreController struct {
queue workqueue.RateLimitingInterface
logger logrus.FieldLogger
pluginManager plugin.Manager
metrics *metrics.ServerMetrics
}

func NewRestoreController(
Expand All @@ -98,6 +100,7 @@ func NewRestoreController(
pvProviderExists bool,
logger logrus.FieldLogger,
pluginManager plugin.Manager,
metrics *metrics.ServerMetrics,
) Interface {
c := &restoreController{
namespace: namespace,
Expand All @@ -114,6 +117,7 @@ func NewRestoreController(
queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "restore"),
logger: logger,
pluginManager: pluginManager,
metrics: metrics,
}

c.syncHandler = c.processRestore
Expand Down Expand Up @@ -255,8 +259,23 @@ func (c *restoreController) processRestore(key string) error {
// don't modify items in the cache
restore = restore.DeepCopy()

// complete & validate restore
if restore.Status.ValidationErrors = c.completeAndValidate(restore); len(restore.Status.ValidationErrors) > 0 {
excludedResources := sets.NewString(restore.Spec.ExcludedResources...)
for _, nonrestorable := range nonRestorableResources {
if !excludedResources.Has(nonrestorable) {
restore.Spec.ExcludedResources = append(restore.Spec.ExcludedResources, nonrestorable)
}
}

backup, fetchErr := c.fetchBackup(c.bucket, restore.Spec.BackupName)
backupScheduleName := ""
if backup != nil {
backupScheduleName = backup.GetLabels()["ark-schedule"]
}
// Register attempts before we do validation so we can get better tracking
c.metrics.RegisterRestoreAttempt(backupScheduleName)

// validation
if restore.Status.ValidationErrors = c.completeAndValidate(restore, fetchErr); len(restore.Status.ValidationErrors) > 0 {
restore.Status.Phase = api.RestorePhaseFailedValidation
} else {
restore.Status.Phase = api.RestorePhaseInProgress
Expand All @@ -272,12 +291,12 @@ func (c *restoreController) processRestore(key string) error {
restore = updatedRestore.DeepCopy()

if restore.Status.Phase == api.RestorePhaseFailedValidation {
c.metrics.RegisterRestoreValidationFailed(backupScheduleName)
return nil
}

logContext.Debug("Running restore")
// execution & upload of restore
restoreWarnings, restoreErrors := c.runRestore(restore, c.bucket)
restoreWarnings, restoreErrors := c.runRestore(restore, c.bucket, backup)

restore.Status.Warnings = len(restoreWarnings.Ark) + len(restoreWarnings.Cluster)
for _, w := range restoreWarnings.Namespaces {
Expand All @@ -288,6 +307,11 @@ func (c *restoreController) processRestore(key string) error {
for _, e := range restoreErrors.Namespaces {
restore.Status.Errors += len(e)
}
if restore.Status.Errors > 0 {
c.metrics.RegisterRestoreIncomplete(backupScheduleName)
} else {
c.metrics.RegisterRestoreSuccess(backupScheduleName)
}

logContext.Debug("restore completed")
restore.Status.Phase = api.RestorePhaseCompleted
Expand All @@ -300,17 +324,22 @@ func (c *restoreController) processRestore(key string) error {
return nil
}

func (c *restoreController) completeAndValidate(restore *api.Restore) []string {
func (c *restoreController) completeAndValidate(restore *api.Restore, fetchErr error) []string {
// add non-restorable resources to restore's excluded resources
excludedResources := sets.NewString(restore.Spec.ExcludedResources...)
for _, nonrestorable := range nonRestorableResources {
if !excludedResources.Has(nonrestorable) {
restore.Spec.ExcludedResources = append(restore.Spec.ExcludedResources, nonrestorable)
}
}

var validationErrors []string

if restore.Spec.BackupName == "" {
validationErrors = append(validationErrors, "BackupName must be non-empty and correspond to the name of a backup in object storage.")
} else if fetchErr != nil {
validationErrors = append(validationErrors, fmt.Sprintf("Error retrieving backup: %v", fetchErr))
}

// validate that included resources don't contain any non-restorable resources
includedResources := sets.NewString(restore.Spec.IncludedResources...)
for _, nonRestorableResource := range nonRestorableResources {
Expand Down Expand Up @@ -433,7 +462,7 @@ func (c *restoreController) fetchBackup(bucket, name string) (*api.Backup, error
return backup, nil
}

func (c *restoreController) runRestore(restore *api.Restore, bucket string) (restoreWarnings, restoreErrors api.RestoreResult) {
func (c *restoreController) runRestore(restore *api.Restore, bucket string, backup *api.Backup) (restoreWarnings, restoreErrors api.RestoreResult) {
logContext := c.logger.WithFields(
logrus.Fields{
"restore": kubeutil.NamespaceAndName(restore),
Expand Down
4 changes: 3 additions & 1 deletion pkg/controller/restore_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
api "github.com/heptio/ark/pkg/apis/ark/v1"
"github.com/heptio/ark/pkg/generated/clientset/versioned/fake"
informers "github.com/heptio/ark/pkg/generated/informers/externalversions"
"github.com/heptio/ark/pkg/metrics"
"github.com/heptio/ark/pkg/restore"
"github.com/heptio/ark/pkg/util/collections"
arktest "github.com/heptio/ark/pkg/util/test"
Expand Down Expand Up @@ -95,6 +96,7 @@ func TestFetchBackup(t *testing.T) {
false,
logger,
pluginManager,
metrics.NewServerMetrics(),
).(*restoreController)

for _, itm := range test.informerBackups {
Expand Down Expand Up @@ -326,6 +328,7 @@ func TestProcessRestore(t *testing.T) {
test.allowRestoreSnapshots,
logger,
pluginManager,
metrics.NewServerMetrics(),
).(*restoreController)

if test.restore != nil {
Expand Down Expand Up @@ -410,7 +413,6 @@ func TestProcessRestore(t *testing.T) {
restorer.AssertExpectations(t)

assert.Equal(t, test.expectedErr, err != nil, "got error %v", err)

actions := client.Actions()

if test.expectedPhase == "" {
Expand Down
78 changes: 72 additions & 6 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,18 @@ type ServerMetrics struct {
const (
metricNamespace = "ark"
backupTarballSizeBytesGauge = "backup_tarball_size_bytes"
backupAttemptCount = "backup_attempt_total"
backupSuccessCount = "backup_success_total"
backupFailureCount = "backup_failure_total"
backupDurationSeconds = "backup_duration_seconds"

scheduleLabel = "schedule"
// TODO: Rename the Count variables to match their strings
backupAttemptCount = "backup_attempt_total"
backupSuccessCount = "backup_success_total"
backupFailureCount = "backup_failure_total"
backupDurationSeconds = "backup_duration_seconds"
restoreAttemptTotal = "restore_attempt_total"
restoreValidationFailedTotal = "restore_validation_failed_total"
restoreSuccessTotal = "restore_success_total"
restoreIncompleteTotal = "restore_incomplete_total"

scheduleLabel = "schedule"
backupNameLabel = "backupName"

secondsInMinute = 60.0
)
Expand Down Expand Up @@ -95,6 +101,38 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel},
),
restoreAttemptTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: restoreAttemptTotal,
Help: "Total number of attempted restores",
},
[]string{scheduleLabel},
),
restoreSuccessTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: restoreSuccessTotal,
Help: "Total number of successful restores",
},
[]string{scheduleLabel},
),
restoreIncompleteTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: restoreIncompleteTotal,
Help: "Total number of incomplete restores",
},
[]string{scheduleLabel},
),
restoreValidationFailedTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: restoreValidationFailedTotal,
Help: "Total number of failed restore validations",
},
[]string{scheduleLabel},
),
},
}
}
Expand Down Expand Up @@ -158,3 +196,31 @@ func (m *ServerMetrics) RegisterBackupDuration(backupSchedule string, seconds fl
// toSeconds converts d to a floating-point number of seconds, keeping the
// sub-second fraction.
//
// Dividing a Duration by time.Second before converting to float64 is integer
// division and silently drops the fraction (1.5s would be reported as 1), so
// the conversion is delegated to Duration.Seconds instead.
func toSeconds(d time.Duration) float64 {
	return d.Seconds()
}

// RegisterRestoreAttempt increments the restore_attempt_total counter for the
// given backup schedule label value. It is a no-op when the metric stored
// under that name is not a *prometheus.CounterVec.
func (m *ServerMetrics) RegisterRestoreAttempt(backupSchedule string) {
	counter, isCounterVec := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec)
	if !isCounterVec {
		return
	}
	counter.WithLabelValues(backupSchedule).Inc()
}

// RegisterRestoreSuccess increments the restore_success_total counter for the
// given backup schedule label value. It is a no-op when the metric stored
// under that name is not a *prometheus.CounterVec.
func (m *ServerMetrics) RegisterRestoreSuccess(backupSchedule string) {
	counter, isCounterVec := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec)
	if !isCounterVec {
		return
	}
	counter.WithLabelValues(backupSchedule).Inc()
}

// RegisterRestoreIncomplete increments the restore_incomplete_total counter
// for the given backup schedule label value; this counter tracks restores
// that finished with errors. It is a no-op when the metric stored under that
// name is not a *prometheus.CounterVec.
func (m *ServerMetrics) RegisterRestoreIncomplete(backupSchedule string) {
	counter, isCounterVec := m.metrics[restoreIncompleteTotal].(*prometheus.CounterVec)
	if !isCounterVec {
		return
	}
	counter.WithLabelValues(backupSchedule).Inc()
}

// RegisterRestoreValidationFailed increments the
// restore_validation_failed_total counter for the given backup schedule label
// value; this counter tracks restores that failed validation. It is a no-op
// when the metric stored under that name is not a *prometheus.CounterVec.
func (m *ServerMetrics) RegisterRestoreValidationFailed(backupSchedule string) {
	counter, isCounterVec := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec)
	if !isCounterVec {
		return
	}
	counter.WithLabelValues(backupSchedule).Inc()
}

0 comments on commit 7cebfe2

Please sign in to comment.