Skip to content

Commit

Permalink
Add metrics to track syncer staleness
Browse files Browse the repository at this point in the history
Added metrics to track the sync staleness of NEG syncers, where staleness is
defined as how long a syncer has been out of sync.
  • Loading branch information
sawsa307 committed Feb 1, 2023
1 parent e246ee3 commit aef71bf
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 0 deletions.
17 changes: 17 additions & 0 deletions pkg/neg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ const (
negOpLatencyKey = "neg_operation_duration_seconds"
negOpEndpointsKey = "neg_operation_endpoints"
lastSyncTimestampKey = "sync_timestamp"
syncerStalenessKey = "syncer_staleness"

resultSuccess = "success"
resultError = "error"
Expand Down Expand Up @@ -127,6 +128,17 @@ var (
Help: "The timestamp of the last execution of NEG controller sync loop.",
},
)

// SyncerStaleness tracks, for every syncer, how long it has been since the last sync occurred.
// Values are observed in seconds.
SyncerStaleness = prometheus.NewHistogram(
prometheus.HistogramOpts{
Subsystem: negControllerSubsystem,
Name: syncerStalenessKey,
Help: "The duration that NEG syncers have been stale/out of sync",
// Custom buckets: 13 exponential buckets starting at 1s and doubling each step -
// [1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s(~4min), 512s(~8min), 1024s(~17min), 2048s(~34min), 4096s(~68min), +Inf]
Buckets: prometheus.ExponentialBuckets(1, 2, 13),
},
)
)

var register sync.Once
Expand All @@ -139,6 +151,7 @@ func RegisterMetrics() {
prometheus.MustRegister(SyncerSyncLatency)
prometheus.MustRegister(LastSyncTimestamp)
prometheus.MustRegister(InitializationLatency)
prometheus.MustRegister(SyncerStaleness)
})
}

Expand Down Expand Up @@ -168,6 +181,10 @@ func PublishNegInitializationMetrics(latency time.Duration) {
InitializationLatency.Observe(latency.Seconds())
}

// PublishNegSyncerStalenessMetrics records the given syncer staleness, converted
// to seconds, as an observation on the SyncerStaleness histogram.
func PublishNegSyncerStalenessMetrics(syncerStaleness time.Duration) {
	stalenessSeconds := syncerStaleness.Seconds()
	SyncerStaleness.Observe(stalenessSeconds)
}

func getResult(err error) string {
if err != nil {
return resultError
Expand Down
8 changes: 8 additions & 0 deletions pkg/neg/syncers/transaction.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ type transactionSyncer struct {

logger klog.Logger

// lastSyncTimestamp tracks the timestamp of when the last sync operation happened
lastSyncTimestamp time.Time

// inError indicates if the syncer is in any of 4 error scenarios
// 1. Endpoint counts from EPS is different from calculated endpoint list
// 2. EndpointSlice has missing or invalid data
Expand Down Expand Up @@ -149,6 +152,7 @@ func NewTransactionSyncer(
enableEndpointSlices: enableEndpointSlices,
inError: false,
logger: logger,
lastSyncTimestamp: time.Now(),
}
// Syncer implements life cycle logic
syncer := newSyncer(negSyncerKey, serviceLister, recorder, ts, logger)
Expand Down Expand Up @@ -192,6 +196,10 @@ func (s *transactionSyncer) syncInternal() error {

s.updateStatus(err)
metrics.PublishNegSyncMetrics(string(s.NegSyncerKey.NegType), string(s.endpointsCalculator.Mode()), err, start)

currTime := time.Now()
metrics.PublishNegSyncerStalenessMetrics(currTime.Sub(s.lastSyncTimestamp))
s.lastSyncTimestamp = currTime
return err
}

Expand Down

0 comments on commit aef71bf

Please sign in to comment.