diff --git a/docs/blocky-grafana.json b/docs/blocky-grafana.json index ed7727658..b5b2af563 100644 --- a/docs/blocky-grafana.json +++ b/docs/blocky-grafana.json @@ -325,7 +325,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": false, - "expr": "blocky_build_info ", + "expr": "blocky_build_info", "format": "table", "instant": true, "interval": "", @@ -378,7 +378,7 @@ } ] }, - "unit": "ms" + "unit": "s" }, "overrides": [] }, @@ -413,7 +413,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "sum(increase(blocky_request_duration_ms_sum[$__range])) / sum(increase(blocky_request_duration_ms_count[$__range]))", + "expr": "(histogram_avg(rate(blocky_request_duration_seconds[$__rate_interval])))\nor\n(sum(rate(blocky_request_duration_seconds_sum[$__rate_interval])) / sum(rate(blocky_request_duration_seconds_count[$__rate_interval])))\nor\n(sum(rate(blocky_request_duration_ms_sum[$__rate_interval])) / sum(rate(blocky_request_duration_ms_count[$__rate_interval])) / 1000)", "format": "table", "instant": false, "interval": "", @@ -487,7 +487,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "sum(blocky_denylist_cache) / sum(up{job=~\"$job\"})", + "expr": "(sum(blocky_denylist_cache_entries) or sum(blocky_denylist_cache)) / sum(up{job=~\"$job\"})", "format": "table", "instant": false, "interval": "", @@ -642,7 +642,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "sum(increase(blocky_response_total{response_type=\"BLOCKED\"}[$__range])) / sum(increase(blocky_query_total[$__range])) ", + "expr": "sum(rate(blocky_response_total{response_type=\"BLOCKED\"}[$__rate_interval])) / sum(rate(blocky_query_total[$__rate_interval]))", "format": "table", "instant": false, "interval": "", @@ -717,9 +717,9 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "ceil(sum(increase(blocky_query_total[$__range]))) ", + "expr": "ceil(sum(increase(blocky_query_total[$__range])))", "format": "table", - "instant": false, + "instant": true, 
"interval": "", "legendFormat": "", "refId": "A" @@ -778,7 +778,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "sum(blocky_cache_entry_count)/ sum(up{job=~\"$job\"})", + "expr": "(sum(blocky_cache_entries) or sum(blocky_cache_entry_count)) / sum(up{job=~\"$job\"})", "format": "table", "instant": false, "interval": "", @@ -845,7 +845,7 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "sum(increase(blocky_cache_hit_count[$__range])) / (sum(increase(blocky_cache_hit_count[$__range])) + sum(increase(blocky_cache_miss_count[$__range])))", + "expr": "(\n sum(rate(blocky_cache_hits_total[$__rate_interval]))\n /\n (sum(rate(blocky_cache_hits_total[$__rate_interval])) + sum(rate(blocky_cache_misses_total[$__rate_interval])))\n)\nor\n(\n sum(rate(blocky_cache_hit_count[$__rate_interval]))\n /\n (sum(rate(blocky_cache_hit_count[$__rate_interval])) + sum(rate(blocky_cache_miss_count[$__rate_interval])))\n)", "format": "table", "instant": false, "interval": "", @@ -929,7 +929,7 @@ "exemplar": true, "expr": "sum(increase(blocky_error_total[$__range]))", "format": "table", - "instant": false, + "instant": true, "interval": "", "legendFormat": "", "refId": "A" @@ -988,8 +988,9 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "ceil(sum(increase(blocky_prefetch_count[$__range])))", + "expr": "ceil(sum(increase(blocky_prefetches_total[$__range])) or sum(increase(blocky_prefetch_count[$__range])))", "format": "table", + "instant": true, "interval": "", "legendFormat": "", "refId": "A" @@ -1052,8 +1053,9 @@ "uid": "${DS_PROMETHEUS}" }, "exemplar": true, - "expr": "sum(rate(blocky_prefetch_count[5m])) * 60", + "expr": "(sum(rate(blocky_prefetches_total[$__range])) or sum(rate(blocky_prefetch_count[$__range]))) * 60", "format": "table", + "instant": true, "interval": "", "legendFormat": "", "refId": "A" @@ -1978,4 +1980,4 @@ "uid": "JvOqE4gRk", "version": 1, "weekStart": "" -} \ No newline at end of file +} diff --git a/docs/prometheus_grafana.md 
b/docs/prometheus_grafana.md index 2f9f50b06..8df88ef52 100644 --- a/docs/prometheus_grafana.md +++ b/docs/prometheus_grafana.md @@ -10,17 +10,21 @@ Following metrics will be exported: | name | Description | | ------------------------------------------------ | -------------------------------------------------------- | -| blocky_denylist_cache / blocky_allowlist_cache | Number of entries in denylist/allowlist cache, partitioned by group | -| blocky_error_total | Number of total queries that ended in error for any reason | -| blocky_query_total | Number of total queries, partitioned by client and DNS request type (A, AAAA, PTR, etc) | -| blocky_request_duration_ms_bucket | Request duration histogram, partitioned by response type (Blocked, cached, etc) | -| blocky_response_total | Number of responses, partitioned by response type (Blocked, cached, etc), DNS response code, and reason | -| blocky_blocking_enabled | 1 if blocking is enabled, 0 otherwise | -| blocky_cache_entry_count | Number of entries in cache | -| blocky_cache_hit_count / blocky_cache_miss_count | Cache hit/miss counters | -| blocky_prefetch_count | Amount of prefetched DNS responses | -| blocky_prefetch_domain_name_cache_count | Amount of domain names being prefetched | -| blocky_failed_download_count | Number of failed list downloads | +| blocky_denylist_cache_entries | Gauge of entries in the denylist cache, partitioned by group | +| blocky_allowlist_cache_entries | Gauge of entries in the allowlist cache, partitioned by group | +| blocky_error_total | Counter of total queries that ended in error for any reason | +| blocky_query_total | Counter of total queries, partitioned by client and DNS request type (A, AAAA, PTR, etc) | +| blocky_request_duration_seconds | Histogram of request duration, partitioned by response type (Blocked, cached, etc) | +| blocky_response_total | Counter of responses, partitioned by response type (Blocked, cached, etc), DNS response code, and reason | +| 
blocky_blocking_enabled | Boolean 1 if blocking is enabled, 0 otherwise | +| blocky_cache_entries | Gauge of entries in cache | +| blocky_cache_hits_total | Counter of the number of cache hits | +| blocky_cache_misses_total | Counter of the number of cache misses | +| blocky_last_list_group_refresh_timestamp_seconds | Timestamp of last list refresh | +| blocky_prefetches_total | Counter of prefetched DNS responses | +| blocky_prefetch_hits_total | Counter of requests that hit the prefetch cache | +| blocky_prefetch_domain_name_cache_entries | Gauge of domain names being prefetched | +| blocky_failed_downloads_total | Counter of failed list downloads | ### Grafana dashboard diff --git a/e2e/metrics_test.go b/e2e/metrics_test.go index ce907bd58..5fcc1c0ae 100644 --- a/e2e/metrics_test.go +++ b/e2e/metrics_test.go @@ -82,9 +82,9 @@ var _ = Describe("Metrics functional tests", func() { BeforeEach(func(ctx context.Context) { Eventually(fetchBlockyMetrics).WithArguments(ctx, metricsURL). Should(ContainElements( - "blocky_cache_entry_count 0", - "blocky_cache_hit_count 0", - "blocky_cache_miss_count 0", + "blocky_cache_entries 0", + "blocky_cache_hits_total 0", + "blocky_cache_misses_total 0", )) }) @@ -101,9 +101,9 @@ var _ = Describe("Metrics functional tests", func() { Eventually(fetchBlockyMetrics).WithArguments(ctx, metricsURL). Should(ContainElements( - "blocky_cache_entry_count 1", - "blocky_cache_hit_count 0", - "blocky_cache_miss_count 1", + "blocky_cache_entries 1", + "blocky_cache_hits_total 0", + "blocky_cache_misses_total 1", )) }) @@ -117,9 +117,9 @@ var _ = Describe("Metrics functional tests", func() { Eventually(fetchBlockyMetrics).WithArguments(ctx, metricsURL). 
Should(ContainElements( - "blocky_cache_entry_count 1", - "blocky_cache_hit_count 1", - "blocky_cache_miss_count 1", + "blocky_cache_entries 1", + "blocky_cache_hits_total 1", + "blocky_cache_misses_total 1", )) }) }) @@ -129,8 +129,8 @@ var _ = Describe("Metrics functional tests", func() { It("Should expose list cache sizes per group as metrics", func(ctx context.Context) { Eventually(fetchBlockyMetrics).WithArguments(ctx, metricsURL). Should(ContainElements( - "blocky_denylist_cache{group=\"group1\"} 1", - "blocky_denylist_cache{group=\"group2\"} 3", + "blocky_denylist_cache_entries{group=\"group1\"} 1", + "blocky_denylist_cache_entries{group=\"group2\"} 3", )) }) }) diff --git a/metrics/metrics_event_publisher.go b/metrics/metrics_event_publisher.go index 567b3afb4..a0820f260 100644 --- a/metrics/metrics_event_publisher.go +++ b/metrics/metrics_event_publisher.go @@ -86,7 +86,7 @@ func enabledGauge() prometheus.Gauge { func denylistGauge() *prometheus.GaugeVec { denylistCnt := prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Name: "blocky_denylist_cache", + Name: "blocky_denylist_cache_entries", Help: "Number of entries in the denylist cache", }, []string{"group"}, ) @@ -97,7 +97,7 @@ func denylistGauge() *prometheus.GaugeVec { func allowlistGauge() *prometheus.GaugeVec { allowlistCnt := prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Name: "blocky_allowlist_cache", + Name: "blocky_allowlist_cache_entries", Help: "Number of entries in the allowlist cache", }, []string{"group"}, ) @@ -108,7 +108,7 @@ func allowlistGauge() *prometheus.GaugeVec { func lastListGroupRefresh() prometheus.Gauge { return prometheus.NewGauge( prometheus.GaugeOpts{ - Name: "blocky_last_list_group_refresh", + Name: "blocky_last_list_group_refresh_timestamp_seconds", Help: "Timestamp of last list refresh", }, ) @@ -162,7 +162,7 @@ func registerCachingEventListeners() { func failedDownloadCount() prometheus.Counter { return prometheus.NewCounter(prometheus.CounterOpts{ - Name: 
"blocky_failed_download_count", + Name: "blocky_failed_downloads_total", Help: "Failed download counter", }) } @@ -170,7 +170,7 @@ func failedDownloadCount() prometheus.Counter { func cacheHitCount() prometheus.Counter { return prometheus.NewCounter( prometheus.CounterOpts{ - Name: "blocky_cache_hit_count", + Name: "blocky_cache_hits_total", Help: "Cache hit counter", }, ) @@ -179,7 +179,7 @@ func cacheHitCount() prometheus.Counter { func cacheMissCount() prometheus.Counter { return prometheus.NewCounter( prometheus.CounterOpts{ - Name: "blocky_cache_miss_count", + Name: "blocky_cache_misses_total", Help: "Cache miss counter", }, ) @@ -188,7 +188,7 @@ func cacheMissCount() prometheus.Counter { func domainPrefetchCount() prometheus.Counter { return prometheus.NewCounter( prometheus.CounterOpts{ - Name: "blocky_prefetch_count", + Name: "blocky_prefetches_total", Help: "Prefetch counter", }, ) @@ -197,7 +197,7 @@ func domainPrefetchCount() prometheus.Counter { func domainPrefetchHitCount() prometheus.Counter { return prometheus.NewCounter( prometheus.CounterOpts{ - Name: "blocky_prefetch_hit_count", + Name: "blocky_prefetch_hits_total", Help: "Prefetch hit counter", }, ) @@ -206,7 +206,7 @@ func domainPrefetchHitCount() prometheus.Counter { func cacheEntryCount() prometheus.Gauge { return prometheus.NewGauge( prometheus.GaugeOpts{ - Name: "blocky_cache_entry_count", + Name: "blocky_cache_entries", Help: "Number of entries in cache", }, ) @@ -215,7 +215,7 @@ func cacheEntryCount() prometheus.Gauge { func prefetchDomainCacheCount() prometheus.Gauge { return prometheus.NewGauge( prometheus.GaugeOpts{ - Name: "blocky_prefetch_domain_name_cache_count", + Name: "blocky_prefetch_domain_name_cache_entries", Help: "Number of entries in domain cache", }, ) diff --git a/resolver/metrics_resolver.go b/resolver/metrics_resolver.go index 48b85d46b..46698d5e3 100644 --- a/resolver/metrics_resolver.go +++ b/resolver/metrics_resolver.go @@ -13,6 +13,10 @@ import ( 
"github.com/prometheus/client_golang/prometheus" ) +// nativeHistogramBucketFactor controls the resolution of native histograms. +// A factor of 1.05 gives slightly higher accuracy than the commonly recommended 1.1. +const nativeHistogramBucketFactor = 1.05 + // MetricsResolver resolver that records metrics about requests/response type MetricsResolver struct { configurable[*config.Metrics] @@ -35,14 +39,14 @@ func (r *MetricsResolver) Resolve(ctx context.Context, request *model.Request) ( "type": dns.TypeToString[request.Req.Question[0].Qtype], }).Inc() - reqDurationMs := float64(time.Since(request.RequestTS).Milliseconds()) + reqDuration := time.Since(request.RequestTS) responseType := "err" if response != nil { responseType = response.RType.String() } - r.durationHistogram.WithLabelValues(responseType).Observe(reqDurationMs) + r.durationHistogram.WithLabelValues(responseType).Observe(reqDuration.Seconds()) if err != nil { r.totalErrors.Inc() @@ -103,9 +107,10 @@ func totalErrorMetric() prometheus.Counter { func durationHistogram() *prometheus.HistogramVec { return prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Name: "blocky_request_duration_ms", - Help: "Request duration distribution", - Buckets: []float64{5, 10, 20, 30, 50, 75, 100, 200, 500, 1000, 2000}, + Name: "blocky_request_duration_seconds", + Help: "Request duration distribution", + Buckets: []float64{0.005, 0.01, 0.02, 0.03, 0.05, 0.075, 0.1, 0.2, 0.5, 1.0, 2.0}, + NativeHistogramBucketFactor: nativeHistogramBucketFactor, }, []string{"response_type"}, )