From b5bac44c44f370228f0e1793e493d9472b3423e7 Mon Sep 17 00:00:00 2001 From: Aaditya Sondhi <20070511+aadityasondhi@users.noreply.github.com> Date: Thu, 20 Apr 2023 09:46:21 -0400 Subject: [PATCH] metric: use cumulative count instead of windowed count in tsdb Previously, we were writing the windowed count of a histogram into tsdb. This meant that on each tick, the count reset. This is useful for calculating averages and quantiles using windowed histograms, but makes little sense to record for `count`. This patch now uses the cumulative count in tsdb. This patch also adds a `-sum` field to maintain a record of the cumulative sum along with the cumulative counts. Fixes #98745 Release note (bug fix): Timeseries metric counts will now show cumulative counts for a histogram rather than the windowed count. A `-sum` timeseries is also exported to keep track of the cumulative sum of all samples in the histogram. --- pkg/ccl/sqlproxyccl/connector_test.go | 13 ++-- pkg/ccl/sqlproxyccl/proxy_handler_test.go | 28 ++++---- .../kvcoord/dist_sender_server_test.go | 3 +- .../kvclient/kvcoord/txn_coord_sender_test.go | 9 ++- pkg/kv/kvserver/scheduler_test.go | 3 +- pkg/server/status/recorder.go | 9 ++- pkg/server/status/recorder_test.go | 1 + pkg/util/asciitsdb/asciitsdb.go | 6 +- pkg/util/metric/aggmetric/histogram.go | 22 ++++-- pkg/util/metric/hdrhistogram.go | 27 ++++--- pkg/util/metric/metric.go | 71 +++++++++++-------- pkg/util/metric/metric_test.go | 9 ++- pkg/util/schedulerlatency/histogram.go | 24 +++++-- pkg/util/schedulerlatency/histogram_test.go | 6 +- .../scheduler_latency_test.go | 5 +- 15 files changed, 142 insertions(+), 94 deletions(-) diff --git a/pkg/ccl/sqlproxyccl/connector_test.go b/pkg/ccl/sqlproxyccl/connector_test.go index 6fa21c56afe3..c0e53ca1f201 100644 --- a/pkg/ccl/sqlproxyccl/connector_test.go +++ b/pkg/ccl/sqlproxyccl/connector_test.go @@ -432,7 +432,8 @@ func TestConnector_dialTenantCluster(t *testing.T) { // Assert existing calls. require.Equal(t, 1, dialSQLServerCount) require.Equal(t, 1, reportFailureFnCount) - require.Equal(t, c.DialTenantLatency.TotalCount(), int64(1)) + count, _ := c.DialTenantLatency.Total() + require.Equal(t, count, int64(1)) require.Equal(t, c.DialTenantRetries.Count(), int64(0)) // Invoke dial tenant with a failure to ReportFailure. Final error @@ -453,7 +454,8 @@ func TestConnector_dialTenantCluster(t *testing.T) { // Assert existing calls. require.Equal(t, 2, dialSQLServerCount) require.Equal(t, 2, reportFailureFnCount) - require.Equal(t, c.DialTenantLatency.TotalCount(), int64(2)) + count, _ = c.DialTenantLatency.Total() + require.Equal(t, count, int64(2)) require.Equal(t, c.DialTenantRetries.Count(), int64(0)) }) @@ -478,8 +480,8 @@ func TestConnector_dialTenantCluster(t *testing.T) { conn, err := c.dialTenantCluster(ctx, nil /* requester */) require.EqualError(t, err, "baz") require.Nil(t, conn) - - require.Equal(t, c.DialTenantLatency.TotalCount(), int64(1)) + count, _ := c.DialTenantLatency.Total() + require.Equal(t, count, int64(1)) require.Equal(t, c.DialTenantRetries.Count(), int64(0)) }) @@ -551,7 +553,8 @@ func TestConnector_dialTenantCluster(t *testing.T) { require.Equal(t, 3, addrLookupFnCount) require.Equal(t, 2, dialSQLServerCount) require.Equal(t, 1, reportFailureFnCount) - require.Equal(t, c.DialTenantLatency.TotalCount(), int64(1)) + count, _ := c.DialTenantLatency.Total() + require.Equal(t, count, int64(1)) require.Equal(t, c.DialTenantRetries.Count(), int64(2)) }) diff --git a/pkg/ccl/sqlproxyccl/proxy_handler_test.go b/pkg/ccl/sqlproxyccl/proxy_handler_test.go index 98a40775df10..63ce5558e953 100644 --- a/pkg/ccl/sqlproxyccl/proxy_handler_test.go +++ b/pkg/ccl/sqlproxyccl/proxy_handler_test.go @@ -470,7 +470,8 @@ func TestProxyAgainstSecureCRDB(t *testing.T) { }) } require.Equal(t, int64(4), s.metrics.SuccessfulConnCount.Count()) - require.Equal(t, int64(4), s.metrics.ConnectionLatency.TotalCount()) + count, _ := s.metrics.ConnectionLatency.Total() + require.Equal(t, int64(4), count) require.Equal(t, int64(2), s.metrics.AuthFailedCount.Count()) require.Equal(t, int64(1), s.metrics.RoutingErrCount.Count()) } @@ -611,7 +612,8 @@ func TestProxyTLSClose(t *testing.T) { _ = conn.Close(ctx) require.Equal(t, int64(1), s.metrics.SuccessfulConnCount.Count()) - require.Equal(t, int64(1), s.metrics.ConnectionLatency.TotalCount()) + count, _ := s.metrics.ConnectionLatency.Total() + require.Equal(t, int64(1), count) require.Equal(t, int64(0), s.metrics.AuthFailedCount.Count()) } @@ -718,7 +720,8 @@ func TestInsecureProxy(t *testing.T) { }) require.Equal(t, int64(1), s.metrics.AuthFailedCount.Count()) require.Equal(t, int64(1), s.metrics.SuccessfulConnCount.Count()) - require.Equal(t, int64(1), s.metrics.ConnectionLatency.TotalCount()) + count, _ := s.metrics.ConnectionLatency.Total() + require.Equal(t, int64(1), count) } func TestErroneousFrontend(t *testing.T) { @@ -827,7 +830,8 @@ func TestProxyRefuseConn(t *testing.T) { _ = te.TestConnectErr(ctx, t, url, codeProxyRefusedConnection, "too many attempts") require.Equal(t, int64(1), s.metrics.RefusedConnCount.Count()) require.Equal(t, int64(0), s.metrics.SuccessfulConnCount.Count()) - require.Equal(t, int64(0), s.metrics.ConnectionLatency.TotalCount()) + count, _ := s.metrics.ConnectionLatency.Total() + require.Equal(t, int64(0), count) require.Equal(t, int64(0), s.metrics.AuthFailedCount.Count()) } @@ -1643,10 +1647,10 @@ func TestConnectionMigration(t *testing.T) { proxy.metrics.ConnMigrationErrorRecoverableCount.Count() + proxy.metrics.ConnMigrationErrorFatalCount.Count() require.Equal(t, totalAttempts, proxy.metrics.ConnMigrationAttemptedCount.Count()) - require.Equal(t, totalAttempts, - proxy.metrics.ConnMigrationAttemptedLatency.TotalCount()) - require.Equal(t, totalAttempts, - proxy.metrics.ConnMigrationTransferResponseMessageSize.TotalCount()) + count, _ := proxy.metrics.ConnMigrationAttemptedLatency.Total() + require.Equal(t, totalAttempts, count) + count, _ = proxy.metrics.ConnMigrationTransferResponseMessageSize.Total() + require.Equal(t, totalAttempts, count) } transferConnWithRetries := func(t *testing.T, f *forwarder) error { @@ -1966,12 +1970,12 @@ func TestConnectionMigration(t *testing.T) { f.metrics.ConnMigrationErrorRecoverableCount.Count() + f.metrics.ConnMigrationErrorFatalCount.Count() require.Equal(t, totalAttempts, f.metrics.ConnMigrationAttemptedCount.Count()) - require.Equal(t, totalAttempts, - f.metrics.ConnMigrationAttemptedLatency.TotalCount()) + count, _ := f.metrics.ConnMigrationAttemptedLatency.Total() + require.Equal(t, totalAttempts, count) // Here, we get a transfer timeout in response, so the message size // should not be recorded. - require.Equal(t, totalAttempts-1, - f.metrics.ConnMigrationTransferResponseMessageSize.TotalCount()) + count, _ = f.metrics.ConnMigrationTransferResponseMessageSize.Total() + require.Equal(t, totalAttempts-1, count) }) // All connections should eventually be terminated. diff --git a/pkg/kv/kvclient/kvcoord/dist_sender_server_test.go b/pkg/kv/kvclient/kvcoord/dist_sender_server_test.go index da56e609947a..6ae3b89d8514 100644 --- a/pkg/kv/kvclient/kvcoord/dist_sender_server_test.go +++ b/pkg/kv/kvclient/kvcoord/dist_sender_server_test.go @@ -3613,7 +3613,8 @@ func TestTxnCoordSenderRetries(t *testing.T) { require.Equal(t, exp.expClientRefreshFailure, metrics.ClientRefreshFail.Count() != 0, "TxnMetrics.ClientRefreshFail") require.Equal(t, exp.expClientAutoRetryAfterRefresh, metrics.ClientRefreshAutoRetries.Count() != 0, "TxnMetrics.ClientRefreshAutoRetries") require.Equal(t, exp.expServerRefresh, metrics.ServerRefreshSuccess.Count() != 0, "TxnMetrics.ServerRefreshSuccess") - require.Equal(t, exp.expClientRestart, metrics.Restarts.TotalSum() != 0, "TxnMetrics.Restarts") + _, restartsSum := metrics.Restarts.Total() + require.Equal(t, exp.expClientRestart, restartsSum != 0, "TxnMetrics.Restarts") require.Equal(t, exp.expOnePhaseCommit, metrics.Commits1PC.Count() != 0, "TxnMetrics.Commits1PC") } for _, tc := range testCases { diff --git a/pkg/kv/kvclient/kvcoord/txn_coord_sender_test.go b/pkg/kv/kvclient/kvcoord/txn_coord_sender_test.go index 37896afbb121..0ded319bb857 100644 --- a/pkg/kv/kvclient/kvcoord/txn_coord_sender_test.go +++ b/pkg/kv/kvclient/kvcoord/txn_coord_sender_test.go @@ -1149,6 +1149,8 @@ func checkTxnMetrics( func checkTxnMetricsOnce( metrics kvcoord.TxnMetrics, name string, commits, commits1PC, aborts, restarts int64, ) error { + durationCounts, _ := metrics.Durations.Total() + restartsCounts, _ := metrics.Restarts.Total() testcases := []struct { name string a, e int64 @@ -1156,8 +1158,8 @@ func checkTxnMetricsOnce( {"commits", metrics.Commits.Count(), commits}, {"commits1PC", metrics.Commits1PC.Count(), commits1PC}, {"aborts", metrics.Aborts.Count(), aborts}, - {"durations", metrics.Durations.TotalCount(), commits + aborts}, - {"restarts", metrics.Restarts.TotalCount(), restarts}, + {"durations", durationCounts, commits + aborts}, + {"restarts", restartsCounts, restarts}, } for _, tc := range testcases { @@ -1374,7 +1376,8 @@ func TestTxnDurations(t *testing.T) { // introducing spurious errors or being overly lax. // // TODO(cdo): look into cause of variance. - if a, e := hist.TotalCount(), int64(puts); a != e { + count, _ := hist.Total() + if a, e := count, int64(puts); a != e { t.Fatalf("durations %d != expected %d", a, e) } diff --git a/pkg/kv/kvserver/scheduler_test.go b/pkg/kv/kvserver/scheduler_test.go index 76435b37439b..58ab11700fdc 100644 --- a/pkg/kv/kvserver/scheduler_test.go +++ b/pkg/kv/kvserver/scheduler_test.go @@ -266,7 +266,8 @@ func TestSchedulerLoop(t *testing.T) { return nil }) - require.Equal(t, int64(3), m.RaftSchedulerLatency.TotalCount()) + count, _ := m.RaftSchedulerLatency.Total() + require.Equal(t, int64(3), count) } // Verify that when we enqueue the same range multiple times for the same diff --git a/pkg/server/status/recorder.go b/pkg/server/status/recorder.go index 95e72522bb0d..e12718196a5b 100644 --- a/pkg/server/status/recorder.go +++ b/pkg/server/status/recorder.go @@ -609,9 +609,12 @@ type registryRecorder struct { func extractValue(name string, mtr interface{}, fn func(string, float64)) error { switch mtr := mtr.(type) { case metric.WindowedHistogram: - n := float64(mtr.TotalCountWindowed()) - fn(name+"-count", n) - avg := mtr.TotalSumWindowed() / n + // Use cumulative stats here + count, sum := mtr.Total() + fn(name+"-count", float64(count)) + fn(name+"-sum", sum) + // Use windowed stats for avg and quantiles + avg := mtr.MeanWindowed() if math.IsNaN(avg) || math.IsInf(avg, +1) || math.IsInf(avg, -1) { avg = 0 } diff --git a/pkg/server/status/recorder_test.go b/pkg/server/status/recorder_test.go index ff4852a51c9d..587b52c6a2a8 100644 --- a/pkg/server/status/recorder_test.go +++ b/pkg/server/status/recorder_test.go @@ -587,6 +587,7 @@ func TestMetricsRecorder(t *testing.T) { addExpected(reg.prefix, data.name+q.Suffix, reg.source, 100, 10, reg.isNode) } addExpected(reg.prefix, data.name+"-count", reg.source, 100, 1, reg.isNode) + addExpected(reg.prefix, data.name+"-sum", reg.source, 100, 9, reg.isNode) addExpected(reg.prefix, data.name+"-avg", reg.source, 100, 9, reg.isNode) default: t.Fatalf("unexpected: %+v", data) diff --git a/pkg/util/asciitsdb/asciitsdb.go b/pkg/util/asciitsdb/asciitsdb.go index a5d0918f3975..108ada2c0058 100644 --- a/pkg/util/asciitsdb/asciitsdb.go +++ b/pkg/util/asciitsdb/asciitsdb.go @@ -103,12 +103,12 @@ func (t *TSDB) Scrape(ctx context.Context) { switch mtr := val.(type) { case metric.WindowedHistogram: - n := float64(mtr.TotalCountWindowed()) if _, ok := t.mu.points[name]; !ok { return } - t.mu.points[name+"-count"] = append(t.mu.points[name+"-count"], n) - avg := mtr.TotalSumWindowed() / n + count, _ := mtr.Total() + t.mu.points[name+"-count"] = append(t.mu.points[name+"-count"], float64(count)) + avg := mtr.MeanWindowed() if math.IsNaN(avg) || math.IsInf(avg, +1) || math.IsInf(avg, -1) { avg = 0 } diff --git a/pkg/util/metric/aggmetric/histogram.go b/pkg/util/metric/aggmetric/histogram.go index 1c07cc1ef3d3..25103e54e37a 100644 --- a/pkg/util/metric/aggmetric/histogram.go +++ b/pkg/util/metric/aggmetric/histogram.go @@ -61,14 +61,24 @@ func (a *AggHistogram) GetMetadata() metric.Metadata { return a.h.GetMetadata() // Inspect is part of the metric.Iterable interface. func (a *AggHistogram) Inspect(f func(interface{})) { f(a) } -// TotalCountWindowed is part of the metric.WindowedHistogram interface -func (a *AggHistogram) TotalCountWindowed() int64 { - return a.h.TotalCountWindowed() +// TotalWindowed is part of the metric.WindowedHistogram interface +func (a *AggHistogram) TotalWindowed() (int64, float64) { + return a.h.TotalWindowed() } -// TotalSumWindowed is part of the metric.WindowedHistogram interface -func (a *AggHistogram) TotalSumWindowed() float64 { - return a.h.TotalSumWindowed() +// Total is part of the metric.WindowedHistogram interface +func (a *AggHistogram) Total() (int64, float64) { + return a.h.Total() +} + +// MeanWindowed is part of the metric.WindowedHistogram interface +func (a *AggHistogram) MeanWindowed() float64 { + return a.h.MeanWindowed() +} + +// Mean is part of the metric.WindowedHistogram interface +func (a *AggHistogram) Mean() float64 { + return a.h.Mean() } // ValueAtQuantileWindowed is part of the metric.WindowedHistogram interface diff --git a/pkg/util/metric/hdrhistogram.go b/pkg/util/metric/hdrhistogram.go index 6ff7eea884fc..b6b562403e2a 100644 --- a/pkg/util/metric/hdrhistogram.go +++ b/pkg/util/metric/hdrhistogram.go @@ -105,11 +105,12 @@ func (h *HdrHistogram) RecordValue(v int64) { } } -// TotalCount returns the (cumulative) number of samples. -func (h *HdrHistogram) TotalCount() int64 { +// Total returns the (cumulative) number of samples and sum of samples. +func (h *HdrHistogram) Total() (int64, float64) { h.mu.Lock() defer h.mu.Unlock() - return h.mu.cumulative.TotalCount() + totalSum := float64(h.mu.cumulative.TotalCount()) * h.mu.cumulative.Mean() + return h.mu.cumulative.TotalCount(), totalSum } // Min returns the minimum. @@ -168,14 +169,10 @@ func (h *HdrHistogram) ToPrometheusMetric() *prometheusgo.Metric { } } -// TotalCountWindowed implements the WindowedHistogram interface. -func (h *HdrHistogram) TotalCountWindowed() int64 { - return int64(h.ToPrometheusMetricWindowed().Histogram.GetSampleCount()) -} - -// TotalSumWindowed implements the WindowedHistogram interface. -func (h *HdrHistogram) TotalSumWindowed() float64 { - return h.ToPrometheusMetricWindowed().Histogram.GetSampleSum() +// TotalWindowed implements the WindowedHistogram interface. +func (h *HdrHistogram) TotalWindowed() (int64, float64) { + pHist := h.ToPrometheusMetricWindowed().Histogram + return int64(pHist.GetSampleCount()), pHist.GetSampleSum() } func (h *HdrHistogram) toPrometheusMetricWindowedLocked() *prometheusgo.Metric { @@ -240,7 +237,9 @@ func (h *HdrHistogram) Mean() float64 { return h.mu.cumulative.Mean() } -// TotalSum returns the (cumulative) sum of samples. -func (h *HdrHistogram) TotalSum() float64 { - return h.ToPrometheusMetric().Histogram.GetSampleSum() +func (h *HdrHistogram) MeanWindowed() float64 { + h.mu.Lock() + defer h.mu.Unlock() + + return h.mu.sliding.Current.Mean() } diff --git a/pkg/util/metric/metric.go b/pkg/util/metric/metric.go index ee00dc006b58..9674739c3109 100644 --- a/pkg/util/metric/metric.go +++ b/pkg/util/metric/metric.go @@ -89,10 +89,16 @@ type PrometheusIterable interface { // and values at specific quantiles from "windowed" histograms and record that // data directly. These windows could be arbitrary and overlapping. type WindowedHistogram interface { - // TotalCountWindowed returns the number of samples in the current window. - TotalCountWindowed() int64 - // TotalSumWindowed returns the number of samples in the current window. - TotalSumWindowed() float64 + // TotalWindowed returns the number of samples and their sum (respectively) + // in the current window. + TotalWindowed() (int64, float64) + // Total returns the number of samples and their sum (respectively) in the + // cumulative histogram. + Total() (int64, float64) + // MeanWindowed returns the average of the samples in the current window. + MeanWindowed() float64 + // Mean returns the average of the sample in teh cumulative histogram. + Mean() float64 // ValueAtQuantileWindowed takes a quantile value [0,100] and returns the // interpolated value at that quantile for the windowed histogram. ValueAtQuantileWindowed(q float64) float64 @@ -313,10 +319,7 @@ type IHistogram interface { WindowedHistogram RecordValue(n int64) - TotalCount() int64 - TotalSum() float64 - TotalCountWindowed() int64 - TotalSumWindowed() float64 + Total() (int64, float64) Mean() float64 } @@ -390,24 +393,16 @@ func (h *Histogram) Inspect(f func(interface{})) { f(h) } -// TotalCount returns the (cumulative) number of samples. -func (h *Histogram) TotalCount() int64 { - return int64(h.ToPrometheusMetric().Histogram.GetSampleCount()) -} - -// TotalCountWindowed implements the WindowedHistogram interface. -func (h *Histogram) TotalCountWindowed() int64 { - return int64(h.ToPrometheusMetricWindowed().Histogram.GetSampleCount()) +// Total returns the (cumulative) number of samples and the sum of all samples. +func (h *Histogram) Total() (int64, float64) { + pHist := h.ToPrometheusMetric().Histogram + return int64(pHist.GetSampleCount()), pHist.GetSampleSum() } -// TotalSum returns the (cumulative) sum of samples. -func (h *Histogram) TotalSum() float64 { - return h.ToPrometheusMetric().Histogram.GetSampleSum() -} - -// TotalSumWindowed implements the WindowedHistogram interface. -func (h *Histogram) TotalSumWindowed() float64 { - return h.ToPrometheusMetricWindowed().Histogram.GetSampleSum() +// TotalWindowed implements the WindowedHistogram interface. +func (h *Histogram) TotalWindowed() (int64, float64) { + pHist := h.ToPrometheusMetricWindowed().Histogram + return int64(pHist.GetSampleCount()), pHist.GetSampleSum() } // Mean returns the (cumulative) mean of samples. @@ -416,6 +411,12 @@ func (h *Histogram) Mean() float64 { return pm.Histogram.GetSampleSum() / float64(pm.Histogram.GetSampleCount()) } +// MeanWindowed implements the WindowedHistogram interface. +func (h *Histogram) MeanWindowed() float64 { + pHist := h.ToPrometheusMetricWindowed().Histogram + return pHist.GetSampleSum() / float64(pHist.GetSampleCount()) +} + // ValueAtQuantileWindowed implements the WindowedHistogram interface. // // https://github.com/prometheus/prometheus/blob/d9162189/promql/quantile.go#L75 @@ -591,18 +592,28 @@ func (mwh *ManualWindowHistogram) ToPrometheusMetric() *prometheusgo.Metric { return m } -// TotalCountWindowed implements the WindowedHistogram interface. -func (mwh *ManualWindowHistogram) TotalCountWindowed() int64 { +// TotalWindowed implements the WindowedHistogram interface. +func (mwh *ManualWindowHistogram) TotalWindowed() (int64, float64) { mwh.mu.RLock() defer mwh.mu.RUnlock() - return int64(mwh.mu.cur.GetSampleCount()) + return int64(mwh.mu.cur.GetSampleCount()), mwh.mu.cur.GetSampleSum() +} + +// Total implements the WindowedHistogram interface. +func (mwh *ManualWindowHistogram) Total() (int64, float64) { + h := mwh.ToPrometheusMetric().Histogram + return int64(h.GetSampleCount()), h.GetSampleSum() } -// TotalSumWindowed implements the WindowedHistogram interface. -func (mwh *ManualWindowHistogram) TotalSumWindowed() float64 { +func (mwh *ManualWindowHistogram) MeanWindowed() float64 { mwh.mu.RLock() defer mwh.mu.RUnlock() - return mwh.mu.cur.GetSampleSum() + return mwh.mu.cur.GetSampleSum() / float64(mwh.mu.cur.GetSampleCount()) +} + +func (mwh *ManualWindowHistogram) Mean() float64 { + h := mwh.ToPrometheusMetric().Histogram + return h.GetSampleSum() / float64(h.GetSampleCount()) } // ValueAtQuantileWindowed implements the WindowedHistogram interface. diff --git a/pkg/util/metric/metric_test.go b/pkg/util/metric/metric_test.go index 436d58547941..b69c87584e2d 100644 --- a/pkg/util/metric/metric_test.go +++ b/pkg/util/metric/metric_test.go @@ -280,15 +280,18 @@ func TestNewHistogramRotate(t *testing.T) { for i := 0; i < 4; i++ { // Windowed histogram is initially empty. h.Inspect(func(interface{}) {}) // triggers ticking - require.Zero(t, h.TotalSumWindowed()) + _, sum := h.TotalWindowed() + require.Zero(t, sum) // But cumulative histogram has history (if i > 0). - require.EqualValues(t, i, h.TotalCount()) + count, _ := h.Total() + require.EqualValues(t, i, count) // Add a measurement and verify it's there. { h.RecordValue(12345) f := float64(12345) - require.Equal(t, h.TotalSumWindowed(), f) + _, wSum := h.TotalWindowed() + require.Equal(t, wSum, f) } // Tick. This rotates the histogram. setNow(time.Duration(i+1) * 10 * time.Second) diff --git a/pkg/util/schedulerlatency/histogram.go b/pkg/util/schedulerlatency/histogram.go index 1a75a5c448fa..6cb7c35e575f 100644 --- a/pkg/util/schedulerlatency/histogram.go +++ b/pkg/util/schedulerlatency/histogram.go @@ -143,14 +143,15 @@ func (h *runtimeHistogram) GetMetadata() metric.Metadata { // Inspect is part of the Iterable interface. func (h *runtimeHistogram) Inspect(f func(interface{})) { f(h) } -// TotalCountWindowed implements the WindowedHistogram interface. -func (h *runtimeHistogram) TotalCountWindowed() int64 { - return int64(h.ToPrometheusMetric().Histogram.GetSampleCount()) +// TotalWindowed implements the WindowedHistogram interface. +func (h *runtimeHistogram) TotalWindowed() (int64, float64) { + return h.Total() } -// TotalSumWindowed implements the WindowedHistogram interface. -func (h *runtimeHistogram) TotalSumWindowed() float64 { - return h.ToPrometheusMetric().Histogram.GetSampleSum() +// Total implements the WindowedHistogram interface. +func (h *runtimeHistogram) Total() (int64, float64) { + pHist := h.ToPrometheusMetric().Histogram + return int64(pHist.GetSampleCount()), pHist.GetSampleSum() } // ValueAtQuantileWindowed implements the WindowedHistogram interface. @@ -158,6 +159,17 @@ func (h *runtimeHistogram) ValueAtQuantileWindowed(q float64) float64 { return metric.ValueAtQuantileWindowed(h.ToPrometheusMetric().Histogram, q) } +// MeanWindowed implements the WindowedHistogram interface. +func (h *runtimeHistogram) MeanWindowed() float64 { + return h.Mean() +} + +// Mean implements the WindowedHistogram interface. +func (h *runtimeHistogram) Mean() float64 { + pHist := h.ToPrometheusMetric().Histogram + return pHist.GetSampleSum() / float64(pHist.GetSampleCount()) +} + // reBucketExpAndTrim takes a list of bucket boundaries (lower bound inclusive) // and down samples the buckets to those a multiple of base apart. The end // result is a roughly exponential (in many cases, perfectly exponential) diff --git a/pkg/util/schedulerlatency/histogram_test.go b/pkg/util/schedulerlatency/histogram_test.go index d5c3fda9fadc..a4e99f30be32 100644 --- a/pkg/util/schedulerlatency/histogram_test.go +++ b/pkg/util/schedulerlatency/histogram_test.go @@ -97,10 +97,8 @@ func TestRuntimeHistogram(t *testing.T) { case "print": var buf strings.Builder - buf.WriteString(fmt.Sprintf("count=%d sum=%0.2f\n", - rh.TotalCountWindowed(), - rh.TotalSumWindowed(), - )) + count, sum := rh.Total() + buf.WriteString(fmt.Sprintf("count=%d sum=%0.2f\n", count, sum)) hist := rh.ToPrometheusMetric().GetHistogram() require.NotNil(t, hist) buf.WriteString("buckets:\n") diff --git a/pkg/util/schedulerlatency/scheduler_latency_test.go b/pkg/util/schedulerlatency/scheduler_latency_test.go index 2539e6f62adb..cb334cc03d85 100644 --- a/pkg/util/schedulerlatency/scheduler_latency_test.go +++ b/pkg/util/schedulerlatency/scheduler_latency_test.go @@ -87,13 +87,12 @@ func TestSchedulerLatencySampler(t *testing.T) { var err error reg.Each(func(name string, mtr interface{}) { wh := mtr.(metric.WindowedHistogram) - count := float64(wh.TotalCountWindowed()) - avg := wh.TotalSumWindowed() / count + avg := wh.MeanWindowed() if math.IsNaN(avg) || math.IsInf(avg, +1) || math.IsInf(avg, -1) { avg = 0 } - if wh.ValueAtQuantileWindowed(99) == 0 || count == 0 || avg == 0 { + if wh.ValueAtQuantileWindowed(99) == 0 || avg == 0 { err = fmt.Errorf("expected non-zero p99 scheduling latency metrics") } })