diff --git a/pkg/neg/metrics/metrics.go b/pkg/neg/metrics/metrics.go index b5fe7de76f..ee07e15f16 100644 --- a/pkg/neg/metrics/metrics.go +++ b/pkg/neg/metrics/metrics.go @@ -31,6 +31,7 @@ const ( negOpLatencyKey = "neg_operation_duration_seconds" negOpEndpointsKey = "neg_operation_endpoints" lastSyncTimestampKey = "sync_timestamp" + syncerStalenessKey = "syncer_staleness" resultSuccess = "success" resultError = "error" @@ -127,6 +128,17 @@ var ( Help: "The timestamp of the last execution of NEG controller sync loop.", }, ) + + // SyncerStaleness tracks, for every syncer, how long it has been since the last sync occurred + SyncerStaleness = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Subsystem: negControllerSubsystem, + Name: syncerStalenessKey, + Help: "The duration that NEG syncers have been stale/out of sync", + // custom buckets - [1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s(~4min), 512s(~8min), 1024s(~17min), 2048s(~34min), 4096s(~68min), +Inf] + Buckets: prometheus.ExponentialBuckets(1, 2, 13), + }, + ) ) var register sync.Once @@ -139,6 +151,7 @@ func RegisterMetrics() { prometheus.MustRegister(SyncerSyncLatency) prometheus.MustRegister(LastSyncTimestamp) prometheus.MustRegister(InitializationLatency) + prometheus.MustRegister(SyncerStaleness) }) } @@ -168,6 +181,10 @@ func PublishNegInitializationMetrics(latency time.Duration) { InitializationLatency.Observe(latency.Seconds()) } +func PublishNegSyncerStalenessMetrics(syncerStaleness time.Duration) { + SyncerStaleness.Observe(syncerStaleness.Seconds()) +} + func getResult(err error) string { if err != nil { return resultError diff --git a/pkg/neg/syncers/transaction.go b/pkg/neg/syncers/transaction.go index 2cc107acf5..c7355d0471 100644 --- a/pkg/neg/syncers/transaction.go +++ b/pkg/neg/syncers/transaction.go @@ -97,6 +97,9 @@ type transactionSyncer struct { logger klog.Logger + // lastSyncTimestamp tracks the timestamp of when the last sync operation happens + lastSyncTimestamp time.Time + // inError 
indicates if the syncer is in any of 4 error scenarios // 1. Endpoint counts from EPS is different from calculated endpoint list // 2. EndpontSlice has missing or invalid data @@ -149,6 +152,7 @@ func NewTransactionSyncer( enableEndpointSlices: enableEndpointSlices, inError: false, logger: logger, + lastSyncTimestamp: time.Now(), } // Syncer implements life cycle logic syncer := newSyncer(negSyncerKey, serviceLister, recorder, ts, logger) @@ -192,6 +196,10 @@ func (s *transactionSyncer) syncInternal() error { s.updateStatus(err) metrics.PublishNegSyncMetrics(string(s.NegSyncerKey.NegType), string(s.endpointsCalculator.Mode()), err, start) + + currTime := time.Now() + metrics.PublishNegSyncerStalenessMetrics(currTime.Sub(s.lastSyncTimestamp)) + s.lastSyncTimestamp = currTime return err }