Skip to content

Commit

Permalink
metrics: refactor histogram bucket generation and testing
Browse files Browse the repository at this point in the history
This commit refactors histogram bucketing for legibility
and composability. It also introduces a data-driven test
for histogram bucket generation.

This refactor should make it easier to add additional
metric categories, distributions, and bucket types.

Part of #97144.

Release note: None
  • Loading branch information
ericharmeling authored and jmcarp committed Aug 31, 2023
1 parent fc3fe21 commit 81b8be5
Show file tree
Hide file tree
Showing 39 changed files with 726 additions and 662 deletions.
86 changes: 43 additions & 43 deletions pkg/ccl/changefeedccl/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -559,52 +559,52 @@ func newAggregateMetrics(histogramWindow time.Duration) *AggMetrics {
EmittedMessages: b.Counter(metaChangefeedEmittedMessages),
FilteredMessages: b.Counter(metaChangefeedFilteredMessages),
MessageSize: b.Histogram(metric.HistogramOptions{
Metadata: metaMessageSize,
Duration: histogramWindow,
MaxVal: 10 << 20, /* 10MB max message size */
SigFigs: 1,
Buckets: metric.DataSize16MBBuckets,
Metadata: metaMessageSize,
Duration: histogramWindow,
MaxVal: 10 << 20, /* 10MB max message size */
SigFigs: 1,
BucketConfig: metric.DataSize16MBBuckets,
}),
EmittedBytes: b.Counter(metaChangefeedEmittedBytes),
FlushedBytes: b.Counter(metaChangefeedFlushedBytes),
Flushes: b.Counter(metaChangefeedFlushes),
SizeBasedFlushes: b.Counter(metaSizeBasedFlushes),
ParallelIOQueueNanos: b.Histogram(metric.HistogramOptions{
Metadata: metaChangefeedParallelIOQueueNanos,
Duration: histogramWindow,
MaxVal: changefeedIOQueueMaxLatency.Nanoseconds(),
SigFigs: 2,
Buckets: metric.BatchProcessLatencyBuckets,
Metadata: metaChangefeedParallelIOQueueNanos,
Duration: histogramWindow,
MaxVal: changefeedIOQueueMaxLatency.Nanoseconds(),
SigFigs: 2,
BucketConfig: metric.BatchProcessLatencyBuckets,
}),
SinkIOInflight: b.Gauge(metaChangefeedSinkIOInflight),

BatchHistNanos: b.Histogram(metric.HistogramOptions{
Metadata: metaChangefeedBatchHistNanos,
Duration: histogramWindow,
MaxVal: changefeedBatchHistMaxLatency.Nanoseconds(),
SigFigs: 1,
Buckets: metric.BatchProcessLatencyBuckets,
Metadata: metaChangefeedBatchHistNanos,
Duration: histogramWindow,
MaxVal: changefeedBatchHistMaxLatency.Nanoseconds(),
SigFigs: 1,
BucketConfig: metric.BatchProcessLatencyBuckets,
}),
FlushHistNanos: b.Histogram(metric.HistogramOptions{
Metadata: metaChangefeedFlushHistNanos,
Duration: histogramWindow,
MaxVal: changefeedFlushHistMaxLatency.Nanoseconds(),
SigFigs: 2,
Buckets: metric.BatchProcessLatencyBuckets,
Metadata: metaChangefeedFlushHistNanos,
Duration: histogramWindow,
MaxVal: changefeedFlushHistMaxLatency.Nanoseconds(),
SigFigs: 2,
BucketConfig: metric.BatchProcessLatencyBuckets,
}),
CommitLatency: b.Histogram(metric.HistogramOptions{
Metadata: metaCommitLatency,
Duration: histogramWindow,
MaxVal: commitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
Buckets: metric.BatchProcessLatencyBuckets,
Metadata: metaCommitLatency,
Duration: histogramWindow,
MaxVal: commitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
BucketConfig: metric.BatchProcessLatencyBuckets,
}),
AdmitLatency: b.Histogram(metric.HistogramOptions{
Metadata: metaAdmitLatency,
Duration: histogramWindow,
MaxVal: admitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
Buckets: metric.BatchProcessLatencyBuckets,
Metadata: metaAdmitLatency,
Duration: histogramWindow,
MaxVal: admitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
BucketConfig: metric.BatchProcessLatencyBuckets,
}),
BackfillCount: b.Gauge(metaChangefeedBackfillCount),
BackfillPendingRanges: b.Gauge(metaChangefeedBackfillPendingRanges),
Expand Down Expand Up @@ -720,28 +720,28 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct {
Failures: metric.NewCounter(metaChangefeedFailures),
QueueTimeNanos: metric.NewCounter(metaEventQueueTime),
CheckpointHistNanos: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaChangefeedCheckpointHistNanos,
Duration: histogramWindow,
MaxVal: changefeedCheckpointHistMaxLatency.Nanoseconds(),
SigFigs: 2,
Buckets: metric.IOLatencyBuckets,
Metadata: metaChangefeedCheckpointHistNanos,
Duration: histogramWindow,
MaxVal: changefeedCheckpointHistMaxLatency.Nanoseconds(),
SigFigs: 2,
BucketConfig: metric.IOLatencyBuckets,
}),
FrontierUpdates: metric.NewCounter(metaChangefeedFrontierUpdates),
ThrottleMetrics: cdcutils.MakeMetrics(histogramWindow),
ReplanCount: metric.NewCounter(metaChangefeedReplanCount),
// The two metrics below were never implemented using the hdr histogram, so
// they set Mode to metric.HistogramModePrometheus.
ParallelConsumerFlushNanos: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaChangefeedEventConsumerFlushNanos,
Duration: histogramWindow,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
Metadata: metaChangefeedEventConsumerFlushNanos,
Duration: histogramWindow,
BucketConfig: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
}),
ParallelConsumerConsumeNanos: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaChangefeedEventConsumerConsumeNanos,
Duration: histogramWindow,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
Metadata: metaChangefeedEventConsumerConsumeNanos,
Duration: histogramWindow,
BucketConfig: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
}),
ParallelConsumerInFlightEvents: metric.NewGauge(metaChangefeedEventConsumerInFlightEvents),
}
Expand Down
24 changes: 12 additions & 12 deletions pkg/ccl/sqlproxyccl/connector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -381,10 +381,10 @@ func TestConnector_dialTenantCluster(t *testing.T) {
c := &connector{
TenantID: roachpb.MustMakeTenantID(42),
DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePrometheus,
Metadata: metaDialTenantLatency,
Duration: time.Millisecond,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
Metadata: metaDialTenantLatency,
Duration: time.Millisecond,
BucketConfig: metric.IOLatencyBuckets,
}),
DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
}
Expand Down Expand Up @@ -466,10 +466,10 @@ func TestConnector_dialTenantCluster(t *testing.T) {

c := &connector{
DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDialTenantLatency,
Duration: time.Millisecond,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDialTenantLatency,
Duration: time.Millisecond,
BucketConfig: metric.IOLatencyBuckets,
}),
DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
}
Expand Down Expand Up @@ -500,10 +500,10 @@ func TestConnector_dialTenantCluster(t *testing.T) {
c := &connector{
TenantID: roachpb.MustMakeTenantID(42),
DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDialTenantLatency,
Duration: time.Millisecond,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDialTenantLatency,
Duration: time.Millisecond,
BucketConfig: metric.IOLatencyBuckets,
}),
DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
}
Expand Down
34 changes: 17 additions & 17 deletions pkg/ccl/sqlproxyccl/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,19 +234,19 @@ func makeProxyMetrics() metrics {
RefusedConnCount: metric.NewCounter(metaRefusedConnCount),
SuccessfulConnCount: metric.NewCounter(metaSuccessfulConnCount),
ConnectionLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaConnMigrationAttemptedCount,
Duration: base.DefaultHistogramWindowInterval(),
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaConnMigrationAttemptedCount,
Duration: base.DefaultHistogramWindowInterval(),
BucketConfig: metric.IOLatencyBuckets,
}),
AuthFailedCount: metric.NewCounter(metaAuthFailedCount),
ExpiredClientConnCount: metric.NewCounter(metaExpiredClientConnCount),
// Connector metrics.
DialTenantLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDialTenantLatency,
Duration: base.DefaultHistogramWindowInterval(),
Buckets: metric.IOLatencyBuckets},
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDialTenantLatency,
Duration: base.DefaultHistogramWindowInterval(),
BucketConfig: metric.IOLatencyBuckets},
),
DialTenantRetries: metric.NewCounter(metaDialTenantRetries),
// Connection migration metrics.
Expand All @@ -255,17 +255,17 @@ func makeProxyMetrics() metrics {
ConnMigrationErrorRecoverableCount: metric.NewCounter(metaConnMigrationErrorRecoverableCount),
ConnMigrationAttemptedCount: metric.NewCounter(metaConnMigrationAttemptedCount),
ConnMigrationAttemptedLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaConnMigrationAttemptedLatency,
Duration: base.DefaultHistogramWindowInterval(),
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaConnMigrationAttemptedLatency,
Duration: base.DefaultHistogramWindowInterval(),
BucketConfig: metric.IOLatencyBuckets,
}),
ConnMigrationTransferResponseMessageSize: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaConnMigrationTransferResponseMessageSize,
Duration: base.DefaultHistogramWindowInterval(),
Buckets: metric.DataSize16MBBuckets,
MaxVal: maxExpectedTransferResponseMessageSize,
SigFigs: 1,
Metadata: metaConnMigrationTransferResponseMessageSize,
Duration: base.DefaultHistogramWindowInterval(),
BucketConfig: metric.DataSize16MBBuckets,
MaxVal: maxExpectedTransferResponseMessageSize,
SigFigs: 1,
}),
QueryCancelReceivedPGWire: metric.NewCounter(metaQueryCancelReceivedPGWire),
QueryCancelReceivedHTTP: metric.NewCounter(metaQueryCancelReceivedHTTP),
Expand Down
30 changes: 15 additions & 15 deletions pkg/ccl/streamingccl/streamingest/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,25 +163,25 @@ func MakeMetrics(histogramWindow time.Duration) metric.Struct {
ResolvedEvents: metric.NewCounter(metaReplicationResolvedEventsIngested),
JobProgressUpdates: metric.NewCounter(metaJobProgressUpdates),
FlushHistNanos: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaReplicationFlushHistNanos,
Duration: histogramWindow,
Buckets: metric.BatchProcessLatencyBuckets,
MaxVal: streamingFlushHistMaxLatency.Nanoseconds(),
SigFigs: 1,
Metadata: metaReplicationFlushHistNanos,
Duration: histogramWindow,
BucketConfig: metric.BatchProcessLatencyBuckets,
MaxVal: streamingFlushHistMaxLatency.Nanoseconds(),
SigFigs: 1,
}),
CommitLatency: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaReplicationCommitLatency,
Duration: histogramWindow,
Buckets: metric.BatchProcessLatencyBuckets,
MaxVal: streamingCommitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
Metadata: metaReplicationCommitLatency,
Duration: histogramWindow,
BucketConfig: metric.BatchProcessLatencyBuckets,
MaxVal: streamingCommitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
}),
AdmitLatency: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaReplicationAdmitLatency,
Duration: histogramWindow,
Buckets: metric.BatchProcessLatencyBuckets,
MaxVal: streamingAdmitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
Metadata: metaReplicationAdmitLatency,
Duration: histogramWindow,
BucketConfig: metric.BatchProcessLatencyBuckets,
MaxVal: streamingAdmitLatencyMaxValue.Nanoseconds(),
SigFigs: 1,
}),
RunningCount: metric.NewGauge(metaStreamsRunning),
EarliestDataCheckpointSpan: metric.NewGauge(metaEarliestDataCheckpointSpan),
Expand Down
10 changes: 5 additions & 5 deletions pkg/kv/bulk/bulk_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@ const log10int64times1000 = 19 * 1000
func MakeBulkMetrics(histogramWindow time.Duration) Metrics {
return Metrics{
MaxBytesHist: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaMemMaxBytes,
Duration: histogramWindow,
MaxVal: log10int64times1000,
SigFigs: 3,
Buckets: metric.MemoryUsage64MBBuckets,
Metadata: metaMemMaxBytes,
Duration: histogramWindow,
MaxVal: log10int64times1000,
SigFigs: 3,
BucketConfig: metric.MemoryUsage64MBBuckets,
}),
CurBytesCount: metric.NewGauge(metaMemCurBytes),
}
Expand Down
18 changes: 9 additions & 9 deletions pkg/kv/kvclient/kvcoord/txn_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -269,20 +269,20 @@ func MakeTxnMetrics(histogramWindow time.Duration) TxnMetrics {
RefreshMemoryLimitExceeded: metric.NewCounter(metaRefreshMemoryLimitExceeded),
RefreshAutoRetries: metric.NewCounter(metaRefreshAutoRetries),
Durations: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDurationsHistograms,
Duration: histogramWindow,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaDurationsHistograms,
Duration: histogramWindow,
BucketConfig: metric.IOLatencyBuckets,
}),
TxnsWithCondensedIntents: metric.NewCounter(metaTxnsWithCondensedIntentSpans),
TxnsWithCondensedIntentsGauge: metric.NewGauge(metaTxnsWithCondensedIntentSpansGauge),
TxnsRejectedByLockSpanBudget: metric.NewCounter(metaTxnsRejectedByLockSpanBudget),
Restarts: metric.NewHistogram(metric.HistogramOptions{
Metadata: metaRestartsHistogram,
Duration: histogramWindow,
MaxVal: 100,
SigFigs: 3,
Buckets: metric.Count1KBuckets,
Metadata: metaRestartsHistogram,
Duration: histogramWindow,
MaxVal: 100,
SigFigs: 3,
BucketConfig: metric.Count1KBuckets,
}),
RestartsWriteTooOld: telemetry.NewCounterWithMetric(metaRestartsWriteTooOld),
RestartsWriteTooOldMulti: telemetry.NewCounterWithMetric(metaRestartsWriteTooOldMulti),
Expand Down
16 changes: 8 additions & 8 deletions pkg/kv/kvprober/kvprober.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,18 +277,18 @@ func NewProber(opts Opts) *Prober {
ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts),
ReadProbeFailures: metric.NewCounter(metaReadProbeFailures),
ReadProbeLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaReadProbeLatency,
Duration: opts.HistogramWindowInterval,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaReadProbeLatency,
Duration: opts.HistogramWindowInterval,
BucketConfig: metric.IOLatencyBuckets,
}),
WriteProbeAttempts: metric.NewCounter(metaWriteProbeAttempts),
WriteProbeFailures: metric.NewCounter(metaWriteProbeFailures),
WriteProbeLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaWriteProbeLatency,
Duration: opts.HistogramWindowInterval,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaWriteProbeLatency,
Duration: opts.HistogramWindowInterval,
BucketConfig: metric.IOLatencyBuckets,
}),
WriteProbeQuarantineOldestDuration: metric.NewFunctionalGauge(
metaWriteProbeQuarantineOldestDuration,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,10 +183,10 @@ func newMetrics(c *Controller) *metrics {
)
m.WaitDuration[wc] = metric.NewHistogram(
metric.HistogramOptions{
Metadata: annotateMetricTemplateWithWorkClass(wc, waitDuration),
Duration: base.DefaultHistogramWindowInterval(),
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
Metadata: annotateMetricTemplateWithWorkClass(wc, waitDuration),
Duration: base.DefaultHistogramWindowInterval(),
BucketConfig: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
},
)
m.TotalStreamCount[wc] = metric.NewFunctionalGauge(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,10 @@ func NewMetrics(registry *metric.Registry) *Metrics {
)
m.WaitDuration[wc] = metric.NewHistogram(
metric.HistogramOptions{
Metadata: annotateMetricTemplateWithWorkClass(wc, waitDuration),
Duration: base.DefaultHistogramWindowInterval(),
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
Metadata: annotateMetricTemplateWithWorkClass(wc, waitDuration),
Duration: base.DefaultHistogramWindowInterval(),
BucketConfig: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
},
)
}
Expand Down
8 changes: 4 additions & 4 deletions pkg/kv/kvserver/liveness/liveness.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,10 +330,10 @@ func NewNodeLiveness(opts NodeLivenessOptions) *NodeLiveness {
HeartbeatFailures: telemetry.NewCounterWithMetric(metaHeartbeatFailures),
EpochIncrements: telemetry.NewCounterWithMetric(metaEpochIncrements),
HeartbeatLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaHeartbeatLatency,
Duration: opts.HistogramWindowInterval,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePreferHdrLatency,
Metadata: metaHeartbeatLatency,
Duration: opts.HistogramWindowInterval,
BucketConfig: metric.IOLatencyBuckets,
}),
}
nl.mu.nodes = make(map[roachpb.NodeID]Record)
Expand Down
8 changes: 4 additions & 4 deletions pkg/kv/kvserver/logstore/logstore_bench_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,10 @@ func runBenchmarkLogStore_StoreEntries(b *testing.B, bytes int64) {
Settings: st,
Metrics: Metrics{
RaftLogCommitLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePrometheus,
Metadata: metric.Metadata{},
Duration: 10 * time.Second,
Buckets: metric.IOLatencyBuckets,
Mode: metric.HistogramModePrometheus,
Metadata: metric.Metadata{},
Duration: 10 * time.Second,
BucketConfig: metric.IOLatencyBuckets,
}),
},
}
Expand Down
Loading

0 comments on commit 81b8be5

Please sign in to comment.