From a74f44126676baa95f5df04c5122e085bfe55da2 Mon Sep 17 00:00:00 2001 From: dmachard <5562930+dmachard@users.noreply.github.com> Date: Thu, 28 Dec 2023 10:21:25 +0100 Subject: [PATCH 01/12] add golang-lru --- go.mod | 1 + go.sum | 2 + loggers/prometheus.go | 123 ++++++++++++++++++++++-------------------- pkgconfig/loggers.go | 8 +++ 4 files changed, 77 insertions(+), 57 deletions(-) diff --git a/go.mod b/go.mod index b313bdfc..e6c57677 100644 --- a/go.mod +++ b/go.mod @@ -74,6 +74,7 @@ require ( github.com/hashicorp/go-sockaddr v1.0.2 // indirect github.com/hashicorp/go-uuid v1.0.3 // indirect github.com/hashicorp/golang-lru v0.6.0 // indirect + github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect github.com/hashicorp/memberlist v0.5.0 // indirect github.com/hashicorp/serf v0.10.1 // indirect github.com/imdario/mergo v0.3.13 // indirect diff --git a/go.sum b/go.sum index c400ad5b..235d23c3 100644 --- a/go.sum +++ b/go.sum @@ -660,6 +660,8 @@ github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.6.0 h1:uL2shRDx7RTrOrTCUZEGP/wJUFiUI8QT6E7z5o8jga4= github.com/hashicorp/golang-lru v0.6.0/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO+LraFDTW64= github.com/hashicorp/mdns v1.0.4/go.mod h1:mtBihi+LeNXGtG8L9dX59gAEa12BDtBQSp4v/YAJqrc= github.com/hashicorp/memberlist v0.5.0 h1:EtYPN8DpAURiapus508I4n9CzHs2W+8NZGbmmR/prTM= diff --git a/loggers/prometheus.go b/loggers/prometheus.go index 4d71a5b1..55ccfe6d 100644 --- a/loggers/prometheus.go +++ b/loggers/prometheus.go @@ -19,6 +19,7 @@ import ( "github.com/dmachard/go-dnscollector/transformers" "github.com/dmachard/go-logger" "github.com/dmachard/go-topmap" + "github.com/hashicorp/golang-lru/v2/expirable" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/collectors" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -96,14 +97,14 @@ type PrometheusCountersSet struct { prom *Prometheus // Counters - requesters map[string]int // Requests number made by a specific requestor - domains map[string]int // Requests number made to find out about a specific domain - nxdomains map[string]int // Requests number ended up in NXDOMAIN - sfdomains map[string]int // Requests number ended up in SERVFAIL - tlds map[string]int // Requests number for a specific TLD - etldplusone map[string]int // Requests number for a specific eTLD+1 - suspicious map[string]int // Requests number for a specific name that looked suspicious - evicted map[string]int // Requests number for a specific name that timed out + requesters *expirable.LRU[string, int] // Requests number made by a specific requestor + domains *expirable.LRU[string, int] // Requests number made to find out about a specific domain + nxdomains *expirable.LRU[string, int] // Requests number ended up in NXDOMAIN + sfdomains *expirable.LRU[string, int] // Requests number ended up in SERVFAIL + tlds *expirable.LRU[string, int] // Requests number for a specific 
TLD + etldplusone *expirable.LRU[string, int] // Requests number for a specific eTLD+1 + suspicious *expirable.LRU[string, int] // Requests number for a specific name that looked suspicious + evicted *expirable.LRU[string, int] // Requests number for a specific name that timed out epsCounters EpsCounters topRequesters *topmap.TopMap @@ -238,14 +239,14 @@ func newPrometheusCounterSet(p *Prometheus, labels prometheus.Labels) *Prometheu pcs := &PrometheusCountersSet{ prom: p, labels: labels, - requesters: make(map[string]int), - domains: make(map[string]int), - nxdomains: make(map[string]int), - sfdomains: make(map[string]int), - tlds: make(map[string]int), - etldplusone: make(map[string]int), - suspicious: make(map[string]int), - evicted: make(map[string]int), + requesters: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.RequestersCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.RequestersCacheTTL)), + domains: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), + nxdomains: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), + sfdomains: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), + tlds: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), + etldplusone: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), + suspicious: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), + evicted: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), epsCounters: EpsCounters{ TotalRcodes: make(map[string]float64), @@ -325,83 +326,91 @@ func (c *PrometheusCountersSet) Describe(ch chan<- *prometheus.Desc) { func (c *PrometheusCountersSet) Record(dm dnsutils.DNSMessage) { c.Lock() defer c.Unlock() - // count number of dns message per requester ip and top clients - if _, exists := c.requesters[dm.NetworkInfo.QueryIP]; !exists { - c.requesters[dm.NetworkInfo.QueryIP] = 1 + + count, exists := c.requesters.Get(dm.NetworkInfo.QueryIP) + if exists { + c.requesters.Add(dm.NetworkInfo.QueryIP, count+1) } else { - c.requesters[dm.NetworkInfo.QueryIP] += 1 + c.requesters.Add(dm.NetworkInfo.QueryIP, 1) } - c.topRequesters.Record(dm.NetworkInfo.QueryIP, c.requesters[dm.NetworkInfo.QueryIP]) + c.topRequesters.Record(dm.NetworkInfo.QueryIP, count+1) // top domains switch dm.DNS.Rcode { case dnsutils.DNSRcodeTimeout: - if _, exists := c.evicted[dm.DNS.Qname]; !exists { - c.evicted[dm.DNS.Qname] = 1 + count, exists := c.evicted.Get(dm.DNS.Qname) + if exists { + c.evicted.Add(dm.DNS.Qname, count+1) } else { - c.evicted[dm.DNS.Qname] += 1 + c.evicted.Add(dm.DNS.Qname, 1) } - c.topEvicted.Record(dm.DNS.Qname, c.evicted[dm.DNS.Qname]) + c.topEvicted.Record(dm.DNS.Qname, count+1) case dnsutils.DNSRcodeServFail: - if _, exists := c.sfdomains[dm.DNS.Qname]; !exists { - c.sfdomains[dm.DNS.Qname] = 1 + count, exists := c.sfdomains.Get(dm.DNS.Qname) + if exists { + c.sfdomains.Add(dm.DNS.Qname, 
count+1) } else { - c.sfdomains[dm.DNS.Qname] += 1 + c.sfdomains.Add(dm.DNS.Qname, 1) } - c.topSfDomains.Record(dm.DNS.Qname, c.sfdomains[dm.DNS.Qname]) + c.topSfDomains.Record(dm.DNS.Qname, count+1) case dnsutils.DNSRcodeNXDomain: - if _, exists := c.nxdomains[dm.DNS.Qname]; !exists { - c.nxdomains[dm.DNS.Qname] = 1 + count, exists := c.nxdomains.Get(dm.DNS.Qname) + if exists { + c.nxdomains.Add(dm.DNS.Qname, count+1) } else { - c.nxdomains[dm.DNS.Qname] += 1 + c.nxdomains.Add(dm.DNS.Qname, 1) } - c.topNxDomains.Record(dm.DNS.Qname, c.nxdomains[dm.DNS.Qname]) + c.topNxDomains.Record(dm.DNS.Qname, count+1) default: - if _, exists := c.domains[dm.DNS.Qname]; !exists { - c.domains[dm.DNS.Qname] = 1 + count, exists := c.domains.Get(dm.DNS.Qname) + if exists { + c.domains.Add(dm.DNS.Qname, count+1) } else { - c.domains[dm.DNS.Qname] += 1 + c.domains.Add(dm.DNS.Qname, 1) } - c.topDomains.Record(dm.DNS.Qname, c.domains[dm.DNS.Qname]) + c.topDomains.Record(dm.DNS.Qname, count+1) } // count and top tld if dm.PublicSuffix != nil { if dm.PublicSuffix.QnamePublicSuffix != "-" { - if _, exists := c.tlds[dm.PublicSuffix.QnamePublicSuffix]; !exists { - c.tlds[dm.PublicSuffix.QnamePublicSuffix] = 1 + count, exists := c.tlds.Get(dm.PublicSuffix.QnamePublicSuffix) + if exists { + c.tlds.Add(dm.PublicSuffix.QnamePublicSuffix, count+1) } else { - c.tlds[dm.PublicSuffix.QnamePublicSuffix] += 1 + c.tlds.Add(dm.PublicSuffix.QnamePublicSuffix, 1) } - c.topTlds.Record(dm.PublicSuffix.QnamePublicSuffix, c.tlds[dm.PublicSuffix.QnamePublicSuffix]) + c.topTlds.Record(dm.PublicSuffix.QnamePublicSuffix, count+1) } } // count TLD+1 if it is set if dm.PublicSuffix != nil { if dm.PublicSuffix.QnameEffectiveTLDPlusOne != "-" { - if _, exists := c.tlds[dm.PublicSuffix.QnameEffectiveTLDPlusOne]; !exists { - c.etldplusone[dm.PublicSuffix.QnameEffectiveTLDPlusOne] = 1 + count, exists := c.etldplusone.Get(dm.PublicSuffix.QnameEffectiveTLDPlusOne) + if exists { + c.etldplusone.Add(dm.PublicSuffix.QnameEffectiveTLDPlusOne, count+1) } else { - c.etldplusone[dm.PublicSuffix.QnameEffectiveTLDPlusOne] += 1 + c.etldplusone.Add(dm.PublicSuffix.QnameEffectiveTLDPlusOne, 1) } - c.topETLDPlusOne.Record(dm.PublicSuffix.QnameEffectiveTLDPlusOne, c.etldplusone[dm.PublicSuffix.QnameEffectiveTLDPlusOne]) + c.topETLDPlusOne.Record(dm.PublicSuffix.QnameEffectiveTLDPlusOne, count+1) } } // suspicious domains if dm.Suspicious != nil { if dm.Suspicious.Score > 0.0 { - if _, exists := c.suspicious[dm.DNS.Qname]; !exists { - c.suspicious[dm.DNS.Qname] = 1 + count, exists := c.suspicious.Get(dm.DNS.Qname) + if exists { + c.suspicious.Add(dm.DNS.Qname, count+1) } else { - c.suspicious[dm.DNS.Qname] += 1 + c.suspicious.Add(dm.DNS.Qname, 1) } - c.topSuspicious.Record(dm.DNS.Qname, c.domains[dm.DNS.Qname]) + c.topSuspicious.Record(dm.DNS.Qname, count+1) } } // compute histograms, no more enabled by default to avoid to hurt performance. 
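For reference, below is a minimal, self-contained sketch (not taken from this patch) of the read-modify-write counting pattern the Record() hunk above introduces; a later patch in this series collapses the exists-check into a plain Get/Add, which is the form used here. The size and TTL values are illustrative only; the logger reads the real values from the requesters-cache-size/-ttl and domains-cache-size/-ttl settings added in pkgconfig/loggers.go, and the import path shown is the upstream module path.

```go
// Minimal sketch of the expirable LRU counting pattern (illustrative values).
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/golang-lru/v2/expirable"
)

func main() {
	// At most 3 entries, each evicted roughly 2s after insertion.
	// A size of 0 would disable the size bound and leave only the TTL.
	cache := expirable.NewLRU[string, int](3, nil, 2*time.Second)

	// Same increment pattern as PrometheusCountersSet.Record():
	// Get on a missing key returns the zero value, so count+1 starts at 1.
	for _, qname := range []string{"a.test.", "b.test.", "a.test.", "c.test.", "d.test."} {
		count, _ := cache.Get(qname)
		cache.Add(qname, count+1)
	}

	// "a.test." was touched recently, so the size-based eviction triggered
	// by the 4th distinct key drops the least recently used entry instead.
	fmt.Println("tracked domains:", cache.Len()) // 3
	if n, ok := cache.Get("a.test."); ok {
		fmt.Println("a.test. seen", n, "times") // 2
	}

	// Once the TTL elapses, entries expire and Len() shrinks again; this is
	// what keeps the cardinality metrics bounded instead of growing forever.
	time.Sleep(3 * time.Second)
	fmt.Println("tracked domains after TTL:", cache.Len()) // 0
}
```

Because both the size and the TTL bound the cache, the per-stream cardinality values derived from Len() track recently observed domains and clients rather than an ever-growing total, which is also why later patches in this series convert these metrics from counters to gauges and rename them with an _lru suffix.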
@@ -487,38 +496,38 @@ func (c *PrometheusCountersSet) Collect(ch chan<- prometheus.Metric) { defer c.Unlock() // Update number of domains ch <- prometheus.MustNewConstMetric(c.prom.counterDomains, prometheus.CounterValue, - float64(len(c.domains)), + float64(c.domains.Len()), ) // Count NX domains ch <- prometheus.MustNewConstMetric(c.prom.counterDomainsNx, prometheus.CounterValue, - float64(len(c.nxdomains)), + float64(c.nxdomains.Len()), ) // Count SERVFAIL domains ch <- prometheus.MustNewConstMetric(c.prom.counterDomainsSf, prometheus.CounterValue, - float64(len(c.sfdomains)), + float64(c.sfdomains.Len()), ) // Requesters counter ch <- prometheus.MustNewConstMetric(c.prom.counterRequesters, prometheus.CounterValue, - float64(len(c.requesters)), + float64(c.requesters.Len()), ) // Count number of unique TLDs ch <- prometheus.MustNewConstMetric(c.prom.counterTlds, prometheus.CounterValue, - float64(len(c.tlds)), + float64(c.tlds.Len()), ) ch <- prometheus.MustNewConstMetric(c.prom.counterETldPlusOne, prometheus.CounterValue, - float64(len(c.etldplusone)), + float64(c.etldplusone.Len()), ) // Count number of unique suspicious names ch <- prometheus.MustNewConstMetric(c.prom.counterSuspicious, prometheus.CounterValue, - float64(len(c.suspicious)), + float64(c.suspicious.Len()), ) // Count number of unique unanswered (timedout) names ch <- prometheus.MustNewConstMetric(c.prom.counterEvicted, prometheus.CounterValue, - float64(len(c.evicted)), + float64(c.evicted.Len()), ) for _, r := range c.topDomains.Get() { ch <- prometheus.MustNewConstMetric(c.prom.gaugeTopDomains, prometheus.GaugeValue, diff --git a/pkgconfig/loggers.go b/pkgconfig/loggers.go index 333abb46..ba7c23cc 100644 --- a/pkgconfig/loggers.go +++ b/pkgconfig/loggers.go @@ -29,6 +29,10 @@ type ConfigLoggers struct { BasicAuthEnabled bool `yaml:"basic-auth-enable"` ChannelBufferSize int `yaml:"chan-buffer-size"` HistogramMetricsEnabled bool `yaml:"histogram-metrics-enabled"` + RequestersCacheTTL int `yaml:"requeters-cache-ttl"` + RequestersCacheSize int `yaml:"requeters-cache-size"` + DomainsCacheTTL int `yaml:"domains-cache-ttl"` + DomainsCacheSize int `yaml:"domains-cache-size"` } `yaml:"prometheus"` RestAPI struct { Enable bool `yaml:"enable"` @@ -327,6 +331,10 @@ func (c *ConfigLoggers) SetDefault() { c.Prometheus.BasicAuthEnabled = true c.Prometheus.ChannelBufferSize = 65535 c.Prometheus.HistogramMetricsEnabled = false + c.Prometheus.RequestersCacheTTL = 3600 + c.Prometheus.RequestersCacheSize = 250000 + c.Prometheus.DomainsCacheTTL = 3600 + c.Prometheus.DomainsCacheSize = 500000 c.RestAPI.Enable = false c.RestAPI.ListenIP = LocalhostIP From abfdbe0f3d63762f73a89ebea9200674cbbb9b63 Mon Sep 17 00:00:00 2001 From: dmachard <5562930+dmachard@users.noreply.github.com> Date: Thu, 28 Dec 2023 20:54:46 +0100 Subject: [PATCH 02/12] counter to gauge --- go.mod | 4 +- loggers/prometheus.go | 149 ++++++++++++++++-------------------------- pkgconfig/loggers.go | 4 +- 3 files changed, 62 insertions(+), 95 deletions(-) diff --git a/go.mod b/go.mod index e6c57677..f66d5d1b 100644 --- a/go.mod +++ b/go.mod @@ -19,6 +19,7 @@ require ( github.com/google/uuid v1.4.0 github.com/grafana/dskit v0.0.0-20230201083518-528d8a7d52f2 github.com/grafana/loki v1.6.2-0.20230503110102-9f809eda70ba + github.com/hashicorp/golang-lru/v2 v2.0.7 github.com/hpcloud/tail v1.0.0 github.com/influxdata/influxdb-client-go v1.4.0 github.com/klauspost/compress v1.17.4 @@ -74,7 +75,6 @@ require ( 
github.com/hashicorp/go-sockaddr v1.0.2 // indirect github.com/hashicorp/go-uuid v1.0.3 // indirect github.com/hashicorp/golang-lru v0.6.0 // indirect - github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect github.com/hashicorp/memberlist v0.5.0 // indirect github.com/hashicorp/serf v0.10.1 // indirect github.com/imdario/mergo v0.3.13 // indirect @@ -133,7 +133,7 @@ require ( github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839 // indirect - github.com/pkg/errors v0.9.1 // indirect + github.com/pkg/errors v0.9.1 github.com/prometheus/client_model v0.5.0 github.com/prometheus/common v0.45.0 github.com/prometheus/procfs v0.11.1 // indirect diff --git a/loggers/prometheus.go b/loggers/prometheus.go index 55ccfe6d..8e22f5ed 100644 --- a/loggers/prometheus.go +++ b/loggers/prometheus.go @@ -193,14 +193,14 @@ type Prometheus struct { gaugeTopSuspicious *prometheus.Desc gaugeTopEvicted *prometheus.Desc - counterDomains *prometheus.Desc - counterDomainsNx *prometheus.Desc - counterDomainsSf *prometheus.Desc - counterRequesters *prometheus.Desc - counterTlds *prometheus.Desc - counterETldPlusOne *prometheus.Desc - counterSuspicious *prometheus.Desc - counterEvicted *prometheus.Desc + gaugeDomains *prometheus.Desc + gaugeDomainsNx *prometheus.Desc + gaugeDomainsSf *prometheus.Desc + gaugeRequesters *prometheus.Desc + gaugeTlds *prometheus.Desc + gaugeETldPlusOne *prometheus.Desc + gaugeSuspicious *prometheus.Desc + gaugeEvicted *prometheus.Desc gaugeEps *prometheus.Desc gaugeEpsMax *prometheus.Desc @@ -264,7 +264,6 @@ func newPrometheusCounterSet(p *Prometheus, labels prometheus.Labels) *Prometheu topETLDPlusOne: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), topSuspicious: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), } - prometheus.WrapRegistererWith(labels, p.promRegistry).MustRegister(pcs) return pcs } @@ -289,14 +288,14 @@ func (c *PrometheusCountersSet) Describe(ch chan<- *prometheus.Desc) { ch <- c.prom.gaugeTopEvicted // Counter metrics - ch <- c.prom.counterDomains - ch <- c.prom.counterDomainsNx - ch <- c.prom.counterDomainsSf - ch <- c.prom.counterRequesters - ch <- c.prom.counterTlds - ch <- c.prom.counterETldPlusOne - ch <- c.prom.counterSuspicious - ch <- c.prom.counterEvicted + ch <- c.prom.gaugeDomains + ch <- c.prom.gaugeDomainsNx + ch <- c.prom.gaugeDomainsSf + ch <- c.prom.gaugeRequesters + ch <- c.prom.gaugeTlds + ch <- c.prom.gaugeETldPlusOne + ch <- c.prom.gaugeSuspicious + ch <- c.prom.gaugeEvicted ch <- c.prom.gaugeEps ch <- c.prom.gaugeEpsMax @@ -327,62 +326,38 @@ func (c *PrometheusCountersSet) Record(dm dnsutils.DNSMessage) { c.Lock() defer c.Unlock() - count, exists := c.requesters.Get(dm.NetworkInfo.QueryIP) - if exists { - c.requesters.Add(dm.NetworkInfo.QueryIP, count+1) - } else { - c.requesters.Add(dm.NetworkInfo.QueryIP, 1) - } + count, _ := c.requesters.Get(dm.NetworkInfo.QueryIP) + c.requesters.Add(dm.NetworkInfo.QueryIP, count+1) c.topRequesters.Record(dm.NetworkInfo.QueryIP, count+1) - // top domains + // // top domains switch dm.DNS.Rcode { case dnsutils.DNSRcodeTimeout: - count, exists := c.evicted.Get(dm.DNS.Qname) - if exists { - c.evicted.Add(dm.DNS.Qname, count+1) - } else { - c.evicted.Add(dm.DNS.Qname, 1) - } + count, _ := c.evicted.Get(dm.DNS.Qname) + c.evicted.Add(dm.DNS.Qname, count+1) 
c.topEvicted.Record(dm.DNS.Qname, count+1) case dnsutils.DNSRcodeServFail: - count, exists := c.sfdomains.Get(dm.DNS.Qname) - if exists { - c.sfdomains.Add(dm.DNS.Qname, count+1) - } else { - c.sfdomains.Add(dm.DNS.Qname, 1) - } + count, _ := c.sfdomains.Get(dm.DNS.Qname) + c.sfdomains.Add(dm.DNS.Qname, count+1) c.topSfDomains.Record(dm.DNS.Qname, count+1) case dnsutils.DNSRcodeNXDomain: - count, exists := c.nxdomains.Get(dm.DNS.Qname) - if exists { - c.nxdomains.Add(dm.DNS.Qname, count+1) - } else { - c.nxdomains.Add(dm.DNS.Qname, 1) - } + count, _ := c.nxdomains.Get(dm.DNS.Qname) + c.nxdomains.Add(dm.DNS.Qname, count+1) c.topNxDomains.Record(dm.DNS.Qname, count+1) default: - count, exists := c.domains.Get(dm.DNS.Qname) - if exists { - c.domains.Add(dm.DNS.Qname, count+1) - } else { - c.domains.Add(dm.DNS.Qname, 1) - } + count, _ := c.domains.Get(dm.DNS.Qname) + c.domains.Add(dm.DNS.Qname, count+1) c.topDomains.Record(dm.DNS.Qname, count+1) } // count and top tld if dm.PublicSuffix != nil { if dm.PublicSuffix.QnamePublicSuffix != "-" { - count, exists := c.tlds.Get(dm.PublicSuffix.QnamePublicSuffix) - if exists { - c.tlds.Add(dm.PublicSuffix.QnamePublicSuffix, count+1) - } else { - c.tlds.Add(dm.PublicSuffix.QnamePublicSuffix, 1) - } + count, _ := c.tlds.Get(dm.PublicSuffix.QnamePublicSuffix) + c.tlds.Add(dm.PublicSuffix.QnamePublicSuffix, count+1) c.topTlds.Record(dm.PublicSuffix.QnamePublicSuffix, count+1) } } @@ -390,12 +365,8 @@ func (c *PrometheusCountersSet) Record(dm dnsutils.DNSMessage) { // count TLD+1 if it is set if dm.PublicSuffix != nil { if dm.PublicSuffix.QnameEffectiveTLDPlusOne != "-" { - count, exists := c.etldplusone.Get(dm.PublicSuffix.QnameEffectiveTLDPlusOne) - if exists { - c.etldplusone.Add(dm.PublicSuffix.QnameEffectiveTLDPlusOne, count+1) - } else { - c.etldplusone.Add(dm.PublicSuffix.QnameEffectiveTLDPlusOne, 1) - } + count, _ := c.etldplusone.Get(dm.PublicSuffix.QnameEffectiveTLDPlusOne) + c.etldplusone.Add(dm.PublicSuffix.QnameEffectiveTLDPlusOne, count+1) c.topETLDPlusOne.Record(dm.PublicSuffix.QnameEffectiveTLDPlusOne, count+1) } } @@ -403,13 +374,8 @@ func (c *PrometheusCountersSet) Record(dm dnsutils.DNSMessage) { // suspicious domains if dm.Suspicious != nil { if dm.Suspicious.Score > 0.0 { - count, exists := c.suspicious.Get(dm.DNS.Qname) - if exists { - c.suspicious.Add(dm.DNS.Qname, count+1) - } else { - c.suspicious.Add(dm.DNS.Qname, 1) - } - + count, _ := c.suspicious.Get(dm.DNS.Qname) + c.suspicious.Add(dm.DNS.Qname, count+1) c.topSuspicious.Record(dm.DNS.Qname, count+1) } } @@ -495,38 +461,39 @@ func (c *PrometheusCountersSet) Collect(ch chan<- prometheus.Metric) { c.Lock() defer c.Unlock() // Update number of domains - ch <- prometheus.MustNewConstMetric(c.prom.counterDomains, prometheus.CounterValue, + ch <- prometheus.MustNewConstMetric(c.prom.gaugeDomains, prometheus.GaugeValue, + //float64(c.domains.Len()), float64(c.domains.Len()), ) // Count NX domains - ch <- prometheus.MustNewConstMetric(c.prom.counterDomainsNx, prometheus.CounterValue, + ch <- prometheus.MustNewConstMetric(c.prom.gaugeDomainsNx, prometheus.GaugeValue, float64(c.nxdomains.Len()), ) // Count SERVFAIL domains - ch <- prometheus.MustNewConstMetric(c.prom.counterDomainsSf, prometheus.CounterValue, + ch <- prometheus.MustNewConstMetric(c.prom.gaugeDomainsSf, prometheus.GaugeValue, float64(c.sfdomains.Len()), ) // Requesters counter - ch <- prometheus.MustNewConstMetric(c.prom.counterRequesters, prometheus.CounterValue, + ch <- 
prometheus.MustNewConstMetric(c.prom.gaugeRequesters, prometheus.GaugeValue, float64(c.requesters.Len()), ) // Count number of unique TLDs - ch <- prometheus.MustNewConstMetric(c.prom.counterTlds, prometheus.CounterValue, + ch <- prometheus.MustNewConstMetric(c.prom.gaugeTlds, prometheus.GaugeValue, float64(c.tlds.Len()), ) - ch <- prometheus.MustNewConstMetric(c.prom.counterETldPlusOne, prometheus.CounterValue, + ch <- prometheus.MustNewConstMetric(c.prom.gaugeETldPlusOne, prometheus.GaugeValue, float64(c.etldplusone.Len()), ) // Count number of unique suspicious names - ch <- prometheus.MustNewConstMetric(c.prom.counterSuspicious, prometheus.CounterValue, + ch <- prometheus.MustNewConstMetric(c.prom.gaugeSuspicious, prometheus.GaugeValue, float64(c.suspicious.Len()), ) // Count number of unique unanswered (timedout) names - ch <- prometheus.MustNewConstMetric(c.prom.counterEvicted, prometheus.CounterValue, + ch <- prometheus.MustNewConstMetric(c.prom.gaugeEvicted, prometheus.GaugeValue, float64(c.evicted.Len()), ) for _, r := range c.topDomains.Get() { @@ -895,51 +862,51 @@ func (c *Prometheus) InitProm() { ) // Counter metrics - c.counterDomains = prometheus.NewDesc( + c.gaugeDomains = prometheus.NewDesc( fmt.Sprintf("%s_domains_total", promPrefix), - "The total number of domains per stream identity", + "Number of domains per stream identity", nil, nil, ) - c.counterDomainsNx = prometheus.NewDesc( + c.gaugeDomainsNx = prometheus.NewDesc( fmt.Sprintf("%s_nxdomains_total", promPrefix), - "The total number of unknown domains per stream identity", + "Number of unknown domains per stream identity", nil, nil, ) - c.counterDomainsSf = prometheus.NewDesc( + c.gaugeDomainsSf = prometheus.NewDesc( fmt.Sprintf("%s_sfdomains_total", promPrefix), - "The total number of serverfail domains per stream identity", + "Number of serverfail domains per stream identity", nil, nil, ) - c.counterRequesters = prometheus.NewDesc( + c.gaugeRequesters = prometheus.NewDesc( fmt.Sprintf("%s_requesters_total", promPrefix), - "The total number of DNS clients per stream identity", + "Number of DNS clients per stream identity", nil, nil, ) - c.counterTlds = prometheus.NewDesc( + c.gaugeTlds = prometheus.NewDesc( fmt.Sprintf("%s_tlds_total", promPrefix), - "The total number of tld per stream identity", + "Number of tld per stream identity", nil, nil, ) - c.counterETldPlusOne = prometheus.NewDesc( + c.gaugeETldPlusOne = prometheus.NewDesc( fmt.Sprintf("%s_etldplusone_total", promPrefix), - "The total number of tld per stream identity", + "Number of tld per stream identity", nil, nil, ) - c.counterSuspicious = prometheus.NewDesc( + c.gaugeSuspicious = prometheus.NewDesc( fmt.Sprintf("%s_suspicious_total", promPrefix), - "The total number of suspicious domain per stream identity", + "Number of suspicious domain per stream identity", nil, nil, ) - c.counterEvicted = prometheus.NewDesc( + c.gaugeEvicted = prometheus.NewDesc( fmt.Sprintf("%s_unanswered_total", promPrefix), - "The total number of unanswered domains per stream identity", + "Number of unanswered domains per stream identity", nil, nil, ) diff --git a/pkgconfig/loggers.go b/pkgconfig/loggers.go index ba7c23cc..d20f9d9a 100644 --- a/pkgconfig/loggers.go +++ b/pkgconfig/loggers.go @@ -29,8 +29,8 @@ type ConfigLoggers struct { BasicAuthEnabled bool `yaml:"basic-auth-enable"` ChannelBufferSize int `yaml:"chan-buffer-size"` HistogramMetricsEnabled bool `yaml:"histogram-metrics-enabled"` - RequestersCacheTTL int `yaml:"requeters-cache-ttl"` - RequestersCacheSize int 
`yaml:"requeters-cache-size"` + RequestersCacheTTL int `yaml:"requesters-cache-ttl"` + RequestersCacheSize int `yaml:"requesters-cache-size"` DomainsCacheTTL int `yaml:"domains-cache-ttl"` DomainsCacheSize int `yaml:"domains-cache-size"` } `yaml:"prometheus"` From b91c002bc70b20a225805877225dbc5e4a7dd894 Mon Sep 17 00:00:00 2001 From: dmachard <5562930+dmachard@users.noreply.github.com> Date: Thu, 28 Dec 2023 20:59:16 +0100 Subject: [PATCH 03/12] make linter happy --- config.yml | 20 +++++++++++++++++++- loggers/prometheus.go | 1 - 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/config.yml b/config.yml index cdcbffa7..b48649ca 100644 --- a/config.yml +++ b/config.yml @@ -77,10 +77,28 @@ multiplexer: - name: console stdout: mode: text + - name: prom + prometheus: + listen-ip: 0.0.0.0 + listen-port: 8081 + basic-auth-login: adminx + basic-auth-pwd: changeme + basic-auth-enable: false + tls-support: false + tls-mutual: false + tls-min-version: 1.2 + cert-file: "" + key-file: "" + prometheus-prefix: "dnscollector" + top-n: 10 + chan-buffer-size: 65535 + histogram-metrics-enabled: false + domains-cache-size: 50000 + domains-cache-ttl: 3600 routes: - from: [ tap ] - to: [ console ] + to: [ prom ] ################################################ # list of supported collectors diff --git a/loggers/prometheus.go b/loggers/prometheus.go index 8e22f5ed..b43b8ece 100644 --- a/loggers/prometheus.go +++ b/loggers/prometheus.go @@ -462,7 +462,6 @@ func (c *PrometheusCountersSet) Collect(ch chan<- prometheus.Metric) { defer c.Unlock() // Update number of domains ch <- prometheus.MustNewConstMetric(c.prom.gaugeDomains, prometheus.GaugeValue, - //float64(c.domains.Len()), float64(c.domains.Len()), ) // Count NX domains From 4c84d6a1fa30ca16957cf475e2e2a59f0166e1ab Mon Sep 17 00:00:00 2001 From: dmachard <5562930+dmachard@users.noreply.github.com> Date: Thu, 28 Dec 2023 21:03:15 +0100 Subject: [PATCH 04/12] rename metrics --- loggers/prometheus.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/loggers/prometheus.go b/loggers/prometheus.go index b43b8ece..b4096d79 100644 --- a/loggers/prometheus.go +++ b/loggers/prometheus.go @@ -862,49 +862,49 @@ func (c *Prometheus) InitProm() { // Counter metrics c.gaugeDomains = prometheus.NewDesc( - fmt.Sprintf("%s_domains_total", promPrefix), + fmt.Sprintf("%s_domains", promPrefix), "Number of domains per stream identity", nil, nil, ) c.gaugeDomainsNx = prometheus.NewDesc( - fmt.Sprintf("%s_nxdomains_total", promPrefix), + fmt.Sprintf("%s_nxdomains", promPrefix), "Number of unknown domains per stream identity", nil, nil, ) c.gaugeDomainsSf = prometheus.NewDesc( - fmt.Sprintf("%s_sfdomains_total", promPrefix), + fmt.Sprintf("%s_sfdomains", promPrefix), "Number of serverfail domains per stream identity", nil, nil, ) c.gaugeRequesters = prometheus.NewDesc( - fmt.Sprintf("%s_requesters_total", promPrefix), + fmt.Sprintf("%s_requesters", promPrefix), "Number of DNS clients per stream identity", nil, nil, ) c.gaugeTlds = prometheus.NewDesc( - fmt.Sprintf("%s_tlds_total", promPrefix), + fmt.Sprintf("%s_tlds", promPrefix), "Number of tld per stream identity", nil, nil, ) c.gaugeETldPlusOne = prometheus.NewDesc( - fmt.Sprintf("%s_etldplusone_total", promPrefix), + fmt.Sprintf("%s_etldplusone", promPrefix), "Number of tld per stream identity", nil, nil, ) c.gaugeSuspicious = prometheus.NewDesc( - fmt.Sprintf("%s_suspicious_total", promPrefix), + fmt.Sprintf("%s_suspicious", promPrefix), "Number of suspicious domain per stream 
identity", nil, nil, ) c.gaugeEvicted = prometheus.NewDesc( - fmt.Sprintf("%s_unanswered_total", promPrefix), + fmt.Sprintf("%s_unanswered", promPrefix), "Number of unanswered domains per stream identity", nil, nil, ) From 6630ce74331501e015c670b4e5acf84283d2ce4b Mon Sep 17 00:00:00 2001 From: dmachard <5562930+dmachard@users.noreply.github.com> Date: Fri, 29 Dec 2023 10:09:21 +0100 Subject: [PATCH 05/12] rename metrics --- config.yml | 92 ++++++++++++++++++++++++++----------------- loggers/prometheus.go | 32 +++++++-------- 2 files changed, 72 insertions(+), 52 deletions(-) diff --git a/config.yml b/config.yml index b48649ca..ca249ff8 100644 --- a/config.yml +++ b/config.yml @@ -63,42 +63,62 @@ global: # create your dns collector, please refer bellow to see the list # of supported collectors, loggers and transformers -multiplexer: - collectors: - - name: tap - dnstap: - listen-ip: 0.0.0.0 - listen-port: 6000 - transforms: - normalize: - qname-lowercase: false - - loggers: - - name: console - stdout: - mode: text - - name: prom - prometheus: - listen-ip: 0.0.0.0 - listen-port: 8081 - basic-auth-login: adminx - basic-auth-pwd: changeme - basic-auth-enable: false - tls-support: false - tls-mutual: false - tls-min-version: 1.2 - cert-file: "" - key-file: "" - prometheus-prefix: "dnscollector" - top-n: 10 - chan-buffer-size: 65535 - histogram-metrics-enabled: false - domains-cache-size: 50000 - domains-cache-ttl: 3600 - - routes: - - from: [ tap ] - to: [ prom ] +# multiplexer: +# collectors: +# - name: tap +# dnstap: +# listen-ip: 0.0.0.0 +# listen-port: 6000 +# transforms: +# normalize: +# qname-lowercase: false + +# loggers: +# - name: console +# stdout: +# mode: text + +# routes: +# - from: [ tap ] +# to: [ console ] + +# EXPERIMENTAL: pipeline mode +pipelines: + - name: dnsdist-main + dnstap: + listen-ip: 0.0.0.0 + listen-port: 6000 + routes: [ goog ] + + - name: goog + dnsmessage: + matching: + include: + dns.qname: "^.*\\.google\\.com$" + drop-policy: "unmatched" # unmatched, matched or disabled + transforms: + atags: + tags: [ "google", "web" ] + routes: [ match-tag ] + + - name: match-tag + dnsmessage: + matching: + include: + atags.tags.*: google + drop-policy: "unmatched" + routes: [ outputfile, console ] + + - name: outputfile + logfile: + file-path: "/tmp/dnstap.log" + max-size: 1000 + max-files: 10 + mode: flat-json + + - name: console + stdout: + mode: text ################################################ # list of supported collectors diff --git a/loggers/prometheus.go b/loggers/prometheus.go index b4096d79..6d9f2e55 100644 --- a/loggers/prometheus.go +++ b/loggers/prometheus.go @@ -862,50 +862,50 @@ func (c *Prometheus) InitProm() { // Counter metrics c.gaugeDomains = prometheus.NewDesc( - fmt.Sprintf("%s_domains", promPrefix), - "Number of domains per stream identity", + fmt.Sprintf("%s_total_domains_lru", promPrefix), + "Total number of domains most recently observed per stream identity ", nil, nil, ) c.gaugeDomainsNx = prometheus.NewDesc( - fmt.Sprintf("%s_nxdomains", promPrefix), - "Number of unknown domains per stream identity", + fmt.Sprintf("%s_total_nxdomains_lru", promPrefix), + "Total number of unknown domains most recently observed per stream identity", nil, nil, ) c.gaugeDomainsSf = prometheus.NewDesc( - fmt.Sprintf("%s_sfdomains", promPrefix), - "Number of serverfail domains per stream identity", + fmt.Sprintf("%s_sfdomains_lru", promPrefix), + "Total number of serverfail domains most recently observed per stream identity", nil, nil, ) c.gaugeRequesters = 
prometheus.NewDesc( - fmt.Sprintf("%s_requesters", promPrefix), - "Number of DNS clients per stream identity", + fmt.Sprintf("%s_total_requesters_lru", promPrefix), + "Total number of DNS clients most recently observed per stream identity.", nil, nil, ) c.gaugeTlds = prometheus.NewDesc( - fmt.Sprintf("%s_tlds", promPrefix), - "Number of tld per stream identity", + fmt.Sprintf("%s_total_tlds_lru", promPrefix), + "Total number of tld most recently observed per stream identity", nil, nil, ) c.gaugeETldPlusOne = prometheus.NewDesc( - fmt.Sprintf("%s_etldplusone", promPrefix), - "Number of tld per stream identity", + fmt.Sprintf("%s_total_etldsplusone_lru", promPrefix), + "Total number of etlds+one most recently observed per stream identity", nil, nil, ) c.gaugeSuspicious = prometheus.NewDesc( - fmt.Sprintf("%s_suspicious", promPrefix), - "Number of suspicious domain per stream identity", + fmt.Sprintf("%s_total_suspicious_lru", promPrefix), + "Total number of suspicious domains most recently observed per stream identity", nil, nil, ) c.gaugeEvicted = prometheus.NewDesc( - fmt.Sprintf("%s_unanswered", promPrefix), - "Number of unanswered domains per stream identity", + fmt.Sprintf("%s_total_unanswered_lru", promPrefix), + "Total number of unanswered domains most recently observed per stream identity", nil, nil, ) From d06d60876125961151a6d3767b1fea80520a0822 Mon Sep 17 00:00:00 2001 From: dmachard <5562930+dmachard@users.noreply.github.com> Date: Fri, 29 Dec 2023 18:25:27 +0100 Subject: [PATCH 06/12] Update config --- config.yml | 108 ++++++++++++++++++++++++++--------------------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/config.yml b/config.yml index ca249ff8..efbee507 100644 --- a/config.yml +++ b/config.yml @@ -63,62 +63,62 @@ global: # create your dns collector, please refer bellow to see the list # of supported collectors, loggers and transformers -# multiplexer: -# collectors: -# - name: tap -# dnstap: -# listen-ip: 0.0.0.0 -# listen-port: 6000 -# transforms: -# normalize: -# qname-lowercase: false - -# loggers: -# - name: console -# stdout: -# mode: text - -# routes: -# - from: [ tap ] -# to: [ console ] +multiplexer: + collectors: + - name: tap + dnstap: + listen-ip: 0.0.0.0 + listen-port: 6000 + transforms: + normalize: + qname-lowercase: false + + loggers: + - name: console + stdout: + mode: text + + routes: + - from: [ tap ] + to: [ console ] # EXPERIMENTAL: pipeline mode -pipelines: - - name: dnsdist-main - dnstap: - listen-ip: 0.0.0.0 - listen-port: 6000 - routes: [ goog ] - - - name: goog - dnsmessage: - matching: - include: - dns.qname: "^.*\\.google\\.com$" - drop-policy: "unmatched" # unmatched, matched or disabled - transforms: - atags: - tags: [ "google", "web" ] - routes: [ match-tag ] - - - name: match-tag - dnsmessage: - matching: - include: - atags.tags.*: google - drop-policy: "unmatched" - routes: [ outputfile, console ] - - - name: outputfile - logfile: - file-path: "/tmp/dnstap.log" - max-size: 1000 - max-files: 10 - mode: flat-json - - - name: console - stdout: - mode: text +# pipelines: +# - name: dnsdist-main +# dnstap: +# listen-ip: 0.0.0.0 +# listen-port: 6000 +# routes: [ goog ] + +# - name: goog +# dnsmessage: +# matching: +# include: +# dns.qname: "^.*\\.google\\.com$" +# drop-policy: "unmatched" # unmatched, matched or disabled +# transforms: +# atags: +# tags: [ "google", "web" ] +# routes: [ match-tag ] + +# - name: match-tag +# dnsmessage: +# matching: +# include: +# atags.tags.*: google +# drop-policy: "unmatched" +# routes: [ 
outputfile, console ] + +# - name: outputfile +# logfile: +# file-path: "/tmp/dnstap.log" +# max-size: 1000 +# max-files: 10 +# mode: flat-json + +# - name: console +# stdout: +# mode: text ################################################ # list of supported collectors From 3f6e8b26dc10a7fb64e7e6054d43e125f81db03d Mon Sep 17 00:00:00 2001 From: dmachard <5562930+dmachard@users.noreply.github.com> Date: Sun, 31 Dec 2023 10:15:03 +0100 Subject: [PATCH 07/12] support stream_global as selector --- config.yml | 52 +++++++++++-------------------------------- loggers/prometheus.go | 15 ++++++++----- 2 files changed, 23 insertions(+), 44 deletions(-) diff --git a/config.yml b/config.yml index efbee507..becf2fe2 100644 --- a/config.yml +++ b/config.yml @@ -77,48 +77,20 @@ multiplexer: - name: console stdout: mode: text + - name: prom + prometheus: + listen-ip: 0.0.0.0 + listen-port: 8081 + basic-auth-enable: false + prometheus-labels: ["stream_global"] + requesters-cache-size: 50000 + requesters-cache-ttl: 3600 + domains-cache-size: 50000 + domains-cache-ttl: 3600 routes: - from: [ tap ] - to: [ console ] - -# EXPERIMENTAL: pipeline mode -# pipelines: -# - name: dnsdist-main -# dnstap: -# listen-ip: 0.0.0.0 -# listen-port: 6000 -# routes: [ goog ] - -# - name: goog -# dnsmessage: -# matching: -# include: -# dns.qname: "^.*\\.google\\.com$" -# drop-policy: "unmatched" # unmatched, matched or disabled -# transforms: -# atags: -# tags: [ "google", "web" ] -# routes: [ match-tag ] - -# - name: match-tag -# dnsmessage: -# matching: -# include: -# atags.tags.*: google -# drop-policy: "unmatched" -# routes: [ outputfile, console ] - -# - name: outputfile -# logfile: -# file-path: "/tmp/dnstap.log" -# max-size: 1000 -# max-files: 10 -# mode: flat-json - -# - name: console -# stdout: -# mode: text + to: [ console, prom ] ################################################ # list of supported collectors @@ -312,6 +284,8 @@ multiplexer: # chan-buffer-size: 65535 # # compute histogram for qnames length, latencies, queries and replies size repartition # histogram-metrics-enabled: false +# prometheus-labels: (list of strings) labels to add to metrics. Currently supported labels: stream_id, resolver, stream_global +# prometheus-labels: ["stream_id"] # # write captured dns traffic to text or binary files with rotation and compression support # logfile: diff --git a/loggers/prometheus.go b/loggers/prometheus.go index 6d9f2e55..95bc2766 100644 --- a/loggers/prometheus.go +++ b/loggers/prometheus.go @@ -35,8 +35,9 @@ Configuration may specifiy a list of lables to use for metrics. 
Any label in this catalogueSelectors can be specidied in config (prometheus-labels stanza) */ var catalogueSelectors map[string]func(*dnsutils.DNSMessage) string = map[string]func(*dnsutils.DNSMessage) string{ - "stream_id": GetStreamID, - "resolver": GetResolverIP, + "stream_id": GetStreamID, + "resolver": GetResolverIP, + "stream_global": GetStreamGlobal, } /* @@ -156,6 +157,10 @@ type PromCounterCatalogueContainer struct { /* Selectors */ +func GetStreamGlobal(dm *dnsutils.DNSMessage) string { + return "enabled" +} + func GetStreamID(dm *dnsutils.DNSMessage) string { return dm.DNSTap.Identity } @@ -717,9 +722,9 @@ func CreateSystemCatalogue(o *Prometheus) ([]string, *PromCounterCatalogueContai lbls := o.config.Loggers.Prometheus.LabelsList // Default configuration is label with stream_id, to keep us backward compatible - if len(lbls) == 0 { - lbls = []string{"stream_id"} - } + // if len(lbls) == 0 { + // lbls = []string{"stream_id"} + // } return lbls, NewPromCounterCatalogueContainer( o, lbls, From 8ad2f89c14017544f16711026e07b47932c89593 Mon Sep 17 00:00:00 2001 From: dmachard <5562930+dmachard@users.noreply.github.com> Date: Sun, 31 Dec 2023 10:20:11 +0100 Subject: [PATCH 08/12] set default value if no label provided --- loggers/prometheus.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loggers/prometheus.go b/loggers/prometheus.go index 95bc2766..cf92a1fd 100644 --- a/loggers/prometheus.go +++ b/loggers/prometheus.go @@ -722,9 +722,9 @@ func CreateSystemCatalogue(o *Prometheus) ([]string, *PromCounterCatalogueContai lbls := o.config.Loggers.Prometheus.LabelsList // Default configuration is label with stream_id, to keep us backward compatible - // if len(lbls) == 0 { - // lbls = []string{"stream_id"} - // } + if len(lbls) == 0 { + lbls = []string{"stream_id"} + } return lbls, NewPromCounterCatalogueContainer( o, lbls, From 578ccb896070f3434d7c40209d98eb670635a8a6 Mon Sep 17 00:00:00 2001 From: dmachard <5562930+dmachard@users.noreply.github.com> Date: Sun, 31 Dec 2023 10:52:32 +0100 Subject: [PATCH 09/12] Update docs --- config.yml | 12 +- docs/loggers/logger_prometheus.md | 33 ++- docs/metrics.txt | 333 +++++++++--------------------- 3 files changed, 130 insertions(+), 248 deletions(-) diff --git a/config.yml b/config.yml index becf2fe2..eaad0080 100644 --- a/config.yml +++ b/config.yml @@ -82,7 +82,7 @@ multiplexer: listen-ip: 0.0.0.0 listen-port: 8081 basic-auth-enable: false - prometheus-labels: ["stream_global"] + prometheus-labels: ["stream_id"] requesters-cache-size: 50000 requesters-cache-ttl: 3600 domains-cache-size: 50000 @@ -284,8 +284,16 @@ multiplexer: # chan-buffer-size: 65535 # # compute histogram for qnames length, latencies, queries and replies size repartition # histogram-metrics-enabled: false -# prometheus-labels: (list of strings) labels to add to metrics. Currently supported labels: stream_id, resolver, stream_global +# # prometheus-labels: (list of strings) labels to add to metrics. 
Currently supported labels: stream_id, resolver, stream_global # prometheus-labels: ["stream_id"] +# # LRU (least-recently-used) cache size for observed clients DNS +# requesters-cache-size: 250000 +# # maximum time (in seconds) before eviction from the LRU cache +# requesters-cache-ttl: 3600 +# # LRU (least-recently-used) cache size for observed domains +# domains-cache-size: 500000 +# # maximum time (in seconds) before eviction from the LRU cache +# domains-cache-ttl: 3600 # # write captured dns traffic to text or binary files with rotation and compression support # logfile: diff --git a/docs/loggers/logger_prometheus.md b/docs/loggers/logger_prometheus.md index 91a748d1..a791f59d 100644 --- a/docs/loggers/logger_prometheus.md +++ b/docs/loggers/logger_prometheus.md @@ -19,7 +19,11 @@ Options: - `top-n`: (string) default number of items on top - `chan-buffer-size`: (integer) channel buffer size used on incoming dns message, number of messages before to drop it. - `histogram-metrics-enabled`: (boolean) compute histogram for qnames length, latencies, queries and replies size repartition -- `prometheus-labels`: (list of strings) labels to add to metrics. Currently supported labels: `stream_id`, `resolver` +- `prometheus-labels`: (list of strings) labels to add to metrics. Currently supported labels: `stream_id` (default), `stream_global`, `resolver` +- `requesters-cache-size`: (integer) LRU (least-recently-used) cache size for observed clients DNS +- `requesters-cache-ttl`: (integer) maximum time (in seconds) before eviction from the LRU cache +- `domains-cache-size`: (integer) LRU (least-recently-used) cache size for observed domains +- `domains-cache-ttl`: (integer) maximum time (in seconds) before eviction from the LRU cache Default values: @@ -40,6 +44,10 @@ prometheus: chan-buffer-size: 65535 histogram-metrics-enabled: false prometheus-labels: ["stream_id"] + requesters-cache-size: 250000 + requesters-cache-ttl: 3600 + domains-cache-size: 500000 + domains-cache-ttl: 3600 ``` Scrape metric with curl: @@ -55,9 +63,10 @@ The full metrics can be found [here](./../metrics.txt). | Metric | Notes |-------------------------------------------------|------------------------------------ | dnscollector_build_info | Build info -| dnscollector_requesters_total | The total number of requesters per stream identity -| dnscollector_nxdomains_total | The total number of NX domains per stream identity -| dnscollector_domains_total | The total number of domains per stream identity +| dnscollector_total_requesters_lru | Total number of DNS clients most recently observed per stream identity. +| dnscollector_total_sfdomains_lru | Total number of serverfail domains most recently observed per stream identity +| dnscollector_total_nxdomains_lru | Total number of NX domains most recently observed per stream identity +| dnscollector_total_domains_lru | Total number of domains most recently observed per stream identity | dnscollector_dnsmessage_total | Counter of total of DNS messages | dnscollector_queries_total | Counter of total of queries | dnscollector_replies_total | Counter of total of replies @@ -77,15 +86,15 @@ The full metrics can be found [here](./../metrics.txt). 
| dnscollector_reassembled_total | Total of reassembled DNS messages (TCP level) | dnscollector_throughput_ops | Number of ops per second received, partitioned by stream | dnscollector_throughput_ops_max | Max number of ops per second observed, partitioned by stream -| dnscollector_tlds_total | The total number of tld per stream identity +| dnscollector_total_tlds_lru | Total number of tld most recently observed per stream identity | dnscollector_top_domains | Number of hit per domain topN, partitioned by stream and qname | dnscollector_top_nxdomains | Number of hit per nx domain topN, partitioned by stream and qname | dnscollector_top_sfdomains | Number of hit per servfail domain topN, partitioned by stream and qname | dnscollector_top_requesters | Number of hit per requester topN, partitioned by client IP | dnscollector_top_tlds | Number of hit per tld - topN | dnscollector_top_unanswered | Number of hit per unanswered domain - topN -| dnscollector_unanswered_total | The total number of unanswered domains per stream identity -| dnscollector_suspicious_total | The total number of unanswered domains per stream identity +| dnscollector_total_unanswered_lru | Total number of unanswered domains most recently observed per stream identity +| dnscollector_total_suspicious_lru | Total number of suspicious domains most recently observed per stream identity | dnscollector_qnames_size_bytes_bucket | Histogram of the size of the qname in bytes | dnscollector_queries_size_bytes_bucket | Histogram of the size of the queries in bytes. | dnscollector_replies_size_bytes_bucket | Histogram of the size of the replies in bytes. @@ -97,3 +106,13 @@ The following [build-in](https://grafana.com/grafana/dashboards/16630) dashboard
 [Grafana dashboard screenshot: dnscollector]
+ +# Merge streams for metrics computation + +If you want to compute metrics based on the merge of all streams instead of each one, use the following settings: + +```yaml +prometheus: + .... + prometheus-labels: ["stream_global"] +``` diff --git a/docs/metrics.txt b/docs/metrics.txt index e458b0ef..ebfc903c 100644 --- a/docs/metrics.txt +++ b/docs/metrics.txt @@ -1,277 +1,132 @@ # HELP dnscollector_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which dnscollector was built, and the goos and goarch for the build. # TYPE dnscollector_build_info gauge -dnscollector_build_info{branch="main",goarch="amd64",goos="linux",goversion="go1.21.3",revision="9fde998",tags="unknown",version="0.37.0-beta1"} 1 +dnscollector_build_info{branch="",goarch="amd64",goos="linux",goversion="go1.21.4",revision="unknown",tags="unknown",version=""} 1 # HELP dnscollector_bytes_total The total bytes received and sent # TYPE dnscollector_bytes_total counter -dnscollector_bytes_total{stream_id="dnsdist_pdns2"} 5.6543221e+07 -dnscollector_bytes_total{stream_id="dnsdist_pdns3"} 5.7528599e+07 -dnscollector_bytes_total{stream_id="dnsdist_pdns4"} 5.7536258e+07 -dnscollector_bytes_total{stream_id="dnsdist_pdns1"} 5.7530809e+07 +dnscollector_bytes_total{stream_id="dnsdist1"} 201 # HELP dnscollector_dnsmessages_total Counter of DNS messages per stream # TYPE dnscollector_dnsmessages_total counter -dnscollector_dnsmessages_total{stream_id="dnsdist_pdns2"} 550161 -dnscollector_dnsmessages_total{stream_id="dnsdist_pdns3"} 559725 -dnscollector_dnsmessages_total{stream_id="dnsdist_pdns4"} 559803 -dnscollector_dnsmessages_total{stream_id="dnsdist_pdns1"} 559749 -# HELP dnscollector_domains_total The total number of domains per stream identity -# TYPE dnscollector_domains_total counter -dnscollector_domains_total{stream_id="dnsdist_pdns2"} 99882 -dnscollector_domains_total{stream_id="dnsdist_pdns3"} 99882 -dnscollector_domains_total{stream_id="dnsdist_pdns4"} 99882 -dnscollector_domains_total{stream_id="dnsdist_pdns1"} 99882 +dnscollector_dnsmessages_total{stream_id="dnsdist1"} 2 # HELP dnscollector_flag_aa_total Number of packet with flag AA # TYPE dnscollector_flag_aa_total counter -dnscollector_flag_aa_total{stream_id="dnsdist_pdns2"} 0 -dnscollector_flag_aa_total{stream_id="dnsdist_pdns3"} 0 -dnscollector_flag_aa_total{stream_id="dnsdist_pdns4"} 0 -dnscollector_flag_aa_total{stream_id="dnsdist_pdns1"} 0 +dnscollector_flag_aa_total{stream_id="dnsdist1"} 0 # HELP dnscollector_flag_ad_total Number of packet with flag AD # TYPE dnscollector_flag_ad_total counter -dnscollector_flag_ad_total{stream_id="dnsdist_pdns2"} 0 -dnscollector_flag_ad_total{stream_id="dnsdist_pdns3"} 0 -dnscollector_flag_ad_total{stream_id="dnsdist_pdns4"} 0 -dnscollector_flag_ad_total{stream_id="dnsdist_pdns1"} 0 -# HELP dnscollector_fragmented_total Number of IP fragmented packets -# TYPE dnscollector_fragmented_total counter -dnscollector_fragmented_total{stream_id="dnsdist_pdns2"} 0 -dnscollector_fragmented_total{stream_id="dnsdist_pdns3"} 0 -dnscollector_fragmented_total{stream_id="dnsdist_pdns4"} 0 -dnscollector_fragmented_total{stream_id="dnsdist_pdns1"} 0 -# HELP dnscollector_malformed_total Number of malformed packets -# TYPE dnscollector_malformed_total counter -dnscollector_malformed_total{stream_id="dnsdist_pdns2"} 0 -dnscollector_malformed_total{stream_id="dnsdist_pdns3"} 0 -dnscollector_malformed_total{stream_id="dnsdist_pdns4"} 0 -dnscollector_malformed_total{stream_id="dnsdist_pdns1"} 0 
+dnscollector_flag_ad_total{stream_id="dnsdist1"} 1 # HELP dnscollector_flag_ra_total Number of packet with flag RA # TYPE dnscollector_flag_ra_total counter -dnscollector_flag_ra_total{stream_id="dnsdist_pdns2"} 0 -dnscollector_flag_ra_total{stream_id="dnsdist_pdns3"} 0 -dnscollector_flag_ra_total{stream_id="dnsdist_pdns4"} 0 -dnscollector_flag_ra_total{stream_id="dnsdist_pdns1"} 0 -# HELP dnscollector_reassembled_total Number of TCP reassembled packets -# TYPE dnscollector_reassembled_total counter -dnscollector_reassembled_total{stream_id="dnsdist_pdns2"} 0 -dnscollector_reassembled_total{stream_id="dnsdist_pdns3"} 0 -dnscollector_reassembled_total{stream_id="dnsdist_pdns4"} 0 -dnscollector_reassembled_total{stream_id="dnsdist_pdns1"} 0 +dnscollector_flag_ra_total{stream_id="dnsdist1"} 1 # HELP dnscollector_flag_tc_total Number of packet with flag TC # TYPE dnscollector_flag_tc_total counter -dnscollector_flag_tc_total{stream_id="dnsdist_pdns2"} 0 -dnscollector_flag_tc_total{stream_id="dnsdist_pdns3"} 0 -dnscollector_flag_tc_total{stream_id="dnsdist_pdns4"} 0 -dnscollector_flag_tc_total{stream_id="dnsdist_pdns1"} 0 +dnscollector_flag_tc_total{stream_id="dnsdist1"} 0 +# HELP dnscollector_fragmented_total Number of IP fragmented packets +# TYPE dnscollector_fragmented_total counter +dnscollector_fragmented_total{stream_id="dnsdist1"} 0 # HELP dnscollector_ipprotocol_total Counter of packets per IP protocol # TYPE dnscollector_ipprotocol_total counter -dnscollector_ipprotocol_total{net_transport="UDP",stream_id="dnsdist_pdns2"} 550161 -dnscollector_ipprotocol_total{net_transport="UDP",stream_id="dnsdist_pdns3"} 559725 -dnscollector_ipprotocol_total{net_transport="UDP",stream_id="dnsdist_pdns4"} 559803 -dnscollector_ipprotocol_total{net_transport="UDP",stream_id="dnsdist_pdns1"} 559749 +dnscollector_ipprotocol_total{net_transport="DOT",stream_id="dnsdist1"} 2 # HELP dnscollector_ipversion_total Counter of packets per IP version # TYPE dnscollector_ipversion_total counter -dnscollector_ipversion_total{net_family="IPv4",stream_id="dnsdist_pdns2"} 550161 -dnscollector_ipversion_total{net_family="IPv4",stream_id="dnsdist_pdns3"} 559725 -dnscollector_ipversion_total{net_family="IPv4",stream_id="dnsdist_pdns4"} 559803 -dnscollector_ipversion_total{net_family="IPv4",stream_id="dnsdist_pdns1"} 559749 -# HELP dnscollector_nxdomains_total The total number of unknown domains per stream identity -# TYPE dnscollector_nxdomains_total counter -dnscollector_nxdomains_total{stream_id="dnsdist_pdns2"} 0 -dnscollector_nxdomains_total{stream_id="dnsdist_pdns3"} 0 -dnscollector_nxdomains_total{stream_id="dnsdist_pdns4"} 0 -dnscollector_nxdomains_total{stream_id="dnsdist_pdns1"} 0 -# HELP dnscollector_qnames_size_bytes Size of the qname in bytes. 
-# TYPE dnscollector_qnames_size_bytes histogram -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns2",le="10"} 0 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns2",le="20"} 34627 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns2",le="40"} 130990 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns2",le="60"} 226286 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns2",le="100"} 415397 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns2",le="+Inf"} 567359 -dnscollector_qnames_size_bytes_sum{stream_id="dnsdist_pdns2"} 4.1859514e+07 -dnscollector_qnames_size_bytes_count{stream_id="dnsdist_pdns2"} 567359 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns3",le="10"} 0 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns3",le="20"} 35212 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns3",le="40"} 133159 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns3",le="60"} 229987 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns3",le="100"} 422254 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns3",le="+Inf"} 576767 -dnscollector_qnames_size_bytes_sum{stream_id="dnsdist_pdns3"} 4.2556437e+07 -dnscollector_qnames_size_bytes_count{stream_id="dnsdist_pdns3"} 576767 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns4",le="10"} 0 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns4",le="20"} 35225 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns4",le="40"} 133229 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns4",le="60"} 230106 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns4",le="100"} 422452 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns4",le="+Inf"} 577028 -dnscollector_qnames_size_bytes_sum{stream_id="dnsdist_pdns4"} 4.2574976e+07 -dnscollector_qnames_size_bytes_count{stream_id="dnsdist_pdns4"} 577028 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns1",le="10"} 0 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns1",le="20"} 35244 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns1",le="40"} 133275 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns1",le="60"} 230183 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns1",le="100"} 422555 -dnscollector_qnames_size_bytes_bucket{stream_id="dnsdist_pdns1",le="+Inf"} 577183 -dnscollector_qnames_size_bytes_sum{stream_id="dnsdist_pdns1"} 4.2586089e+07 -dnscollector_qnames_size_bytes_count{stream_id="dnsdist_pdns1"} 577183 +dnscollector_ipversion_total{net_family="IPv4",stream_id="dnsdist1"} 2 +# HELP dnscollector_malformed_total Number of malformed packets +# TYPE dnscollector_malformed_total counter +dnscollector_malformed_total{stream_id="dnsdist1"} 0 # HELP dnscollector_qtypes_total Counter of queries per qtypes # TYPE dnscollector_qtypes_total counter -dnscollector_qtypes_total{query_type="A",stream_id="dnsdist_pdns2"} 68593 -dnscollector_qtypes_total{query_type="A",stream_id="dnsdist_pdns3"} 69838 -dnscollector_qtypes_total{query_type="A",stream_id="dnsdist_pdns4"} 69843 -dnscollector_qtypes_total{query_type="A",stream_id="dnsdist_pdns1"} 69842 -dnscollector_qtypes_total{query_type="AAAA",stream_id="dnsdist_pdns2"} 137999 -dnscollector_qtypes_total{query_type="AAAA",stream_id="dnsdist_pdns3"} 140380 -dnscollector_qtypes_total{query_type="AAAA",stream_id="dnsdist_pdns4"} 140396 -dnscollector_qtypes_total{query_type="AAAA",stream_id="dnsdist_pdns1"} 140401 
-dnscollector_qtypes_total{query_type="CNAME",stream_id="dnsdist_pdns2"} 68772 -dnscollector_qtypes_total{query_type="CNAME",stream_id="dnsdist_pdns3"} 70005 -dnscollector_qtypes_total{query_type="CNAME",stream_id="dnsdist_pdns4"} 70014 -dnscollector_qtypes_total{query_type="CNAME",stream_id="dnsdist_pdns1"} 70006 -dnscollector_qtypes_total{query_type="MX",stream_id="dnsdist_pdns2"} 68702 -dnscollector_qtypes_total{query_type="MX",stream_id="dnsdist_pdns3"} 69852 -dnscollector_qtypes_total{query_type="MX",stream_id="dnsdist_pdns4"} 69862 -dnscollector_qtypes_total{query_type="MX",stream_id="dnsdist_pdns1"} 69832 -dnscollector_qtypes_total{query_type="NS",stream_id="dnsdist_pdns2"} 68197 -dnscollector_qtypes_total{query_type="NS",stream_id="dnsdist_pdns3"} 69381 -dnscollector_qtypes_total{query_type="NS",stream_id="dnsdist_pdns4"} 69395 -dnscollector_qtypes_total{query_type="NS",stream_id="dnsdist_pdns1"} 69392 -dnscollector_qtypes_total{query_type="SOA",stream_id="dnsdist_pdns2"} 69745 -dnscollector_qtypes_total{query_type="SOA",stream_id="dnsdist_pdns3"} 70927 -dnscollector_qtypes_total{query_type="SOA",stream_id="dnsdist_pdns4"} 70940 -dnscollector_qtypes_total{query_type="SOA",stream_id="dnsdist_pdns1"} 70901 -dnscollector_qtypes_total{query_type="TXT",stream_id="dnsdist_pdns2"} 68153 -dnscollector_qtypes_total{query_type="TXT",stream_id="dnsdist_pdns3"} 69342 -dnscollector_qtypes_total{query_type="TXT",stream_id="dnsdist_pdns4"} 69353 -dnscollector_qtypes_total{query_type="TXT",stream_id="dnsdist_pdns1"} 69375 -# HELP dnscollector_queries_size_bytes Size of the queries in bytes. -# TYPE dnscollector_queries_size_bytes histogram -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns2",le="50"} 38921 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns2",le="100"} 278538 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns2",le="250"} 567368 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns2",le="500"} 567368 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns2",le="+Inf"} 567368 -dnscollector_queries_size_bytes_sum{stream_id="dnsdist_pdns2"} 5.8313773e+07 -dnscollector_queries_size_bytes_count{stream_id="dnsdist_pdns2"} 567368 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns3",le="50"} 39573 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns3",le="100"} 283076 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns3",le="250"} 576775 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns3",le="500"} 576775 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns3",le="+Inf"} 576775 -dnscollector_queries_size_bytes_sum{stream_id="dnsdist_pdns3"} 5.9283485e+07 -dnscollector_queries_size_bytes_count{stream_id="dnsdist_pdns3"} 576775 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns4",le="50"} 39588 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns4",le="100"} 283216 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns4",le="250"} 577028 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns4",le="500"} 577028 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns4",le="+Inf"} 577028 -dnscollector_queries_size_bytes_sum{stream_id="dnsdist_pdns4"} 5.9308788e+07 -dnscollector_queries_size_bytes_count{stream_id="dnsdist_pdns4"} 577028 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns1",le="50"} 39608 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns1",le="100"} 283295 
-dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns1",le="250"} 577192 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns1",le="500"} 577192 -dnscollector_queries_size_bytes_bucket{stream_id="dnsdist_pdns1",le="+Inf"} 577192 -dnscollector_queries_size_bytes_sum{stream_id="dnsdist_pdns1"} 5.9325498e+07 -dnscollector_queries_size_bytes_count{stream_id="dnsdist_pdns1"} 577192 +dnscollector_qtypes_total{query_type="A",stream_id="dnsdist1"} 2 # HELP dnscollector_queries_total Counter of DNS queries per stream # TYPE dnscollector_queries_total counter -dnscollector_queries_total{stream_id="dnsdist_pdns2"} 550161 -dnscollector_queries_total{stream_id="dnsdist_pdns3"} 559725 -dnscollector_queries_total{stream_id="dnsdist_pdns4"} 559803 -dnscollector_queries_total{stream_id="dnsdist_pdns1"} 559749 +dnscollector_queries_total{stream_id="dnsdist1"} 1 # HELP dnscollector_rcodes_total Counter of replies per return codes # TYPE dnscollector_rcodes_total counter -dnscollector_rcodes_total{return_code="-",stream_id="dnsdist_pdns2"} 550161 -dnscollector_rcodes_total{return_code="-",stream_id="dnsdist_pdns3"} 559725 -dnscollector_rcodes_total{return_code="-",stream_id="dnsdist_pdns4"} 559803 -dnscollector_rcodes_total{return_code="-",stream_id="dnsdist_pdns1"} 559749 +dnscollector_rcodes_total{return_code="NOERROR",stream_id="dnsdist1"} 2 +# HELP dnscollector_reassembled_total Number of TCP reassembled packets +# TYPE dnscollector_reassembled_total counter +dnscollector_reassembled_total{stream_id="dnsdist1"} 0 # HELP dnscollector_received_bytes_total The total bytes received # TYPE dnscollector_received_bytes_total counter -dnscollector_received_bytes_total{stream_id="dnsdist_pdns2"} 5.6543221e+07 -dnscollector_received_bytes_total{stream_id="dnsdist_pdns3"} 5.7528599e+07 -dnscollector_received_bytes_total{stream_id="dnsdist_pdns4"} 5.7536258e+07 -dnscollector_received_bytes_total{stream_id="dnsdist_pdns1"} 5.7530809e+07 +dnscollector_received_bytes_total{stream_id="dnsdist1"} 128 # HELP dnscollector_replies_total Counter of DNS replies per stream # TYPE dnscollector_replies_total counter -dnscollector_replies_total{stream_id="dnsdist_pdns2"} 0 -dnscollector_replies_total{stream_id="dnsdist_pdns3"} 0 -dnscollector_replies_total{stream_id="dnsdist_pdns4"} 0 -dnscollector_replies_total{stream_id="dnsdist_pdns1"} 0 -# HELP dnscollector_requesters_total The total number of DNS clients per stream identity -# TYPE dnscollector_requesters_total counter -dnscollector_requesters_total{stream_id="dnsdist_pdns2"} 1 -dnscollector_requesters_total{stream_id="dnsdist_pdns3"} 1 -dnscollector_requesters_total{stream_id="dnsdist_pdns4"} 1 -dnscollector_requesters_total{stream_id="dnsdist_pdns1"} 1 +dnscollector_replies_total{stream_id="dnsdist1"} 1 # HELP dnscollector_sent_bytes_total The total bytes sent # TYPE dnscollector_sent_bytes_total counter -dnscollector_sent_bytes_total{stream_id="dnsdist_pdns2"} 0 -dnscollector_sent_bytes_total{stream_id="dnsdist_pdns3"} 0 -dnscollector_sent_bytes_total{stream_id="dnsdist_pdns4"} 0 -dnscollector_sent_bytes_total{stream_id="dnsdist_pdns1"} 0 -# HELP dnscollector_sfdomains_total The total number of serverfail domains per stream identity -# TYPE dnscollector_sfdomains_total counter -dnscollector_sfdomains_total{stream_id="dnsdist_pdns2"} 0 -dnscollector_sfdomains_total{stream_id="dnsdist_pdns3"} 0 -dnscollector_sfdomains_total{stream_id="dnsdist_pdns4"} 0 -dnscollector_sfdomains_total{stream_id="dnsdist_pdns1"} 0 
+dnscollector_sent_bytes_total{stream_id="dnsdist1"} 73 +# HELP dnscollector_sfdomains_lru Total number of serverfail domains most recently observed per stream identity +# TYPE dnscollector_sfdomains_lru gauge +dnscollector_sfdomains_lru{stream_id="dnsdist1"} 0 # HELP dnscollector_throughput_ops Number of ops per second received, partitioned by stream # TYPE dnscollector_throughput_ops gauge -dnscollector_throughput_ops{stream_id="dnsdist_pdns2"} 21731 -dnscollector_throughput_ops{stream_id="dnsdist_pdns3"} 21747 -dnscollector_throughput_ops{stream_id="dnsdist_pdns4"} 21704 -dnscollector_throughput_ops{stream_id="dnsdist_pdns1"} 21589 +dnscollector_throughput_ops{stream_id="dnsdist1"} 0 # HELP dnscollector_throughput_ops_max Max number of ops per second observed, partitioned by stream # TYPE dnscollector_throughput_ops_max gauge -dnscollector_throughput_ops_max{stream_id="dnsdist_pdns2"} 25554 -dnscollector_throughput_ops_max{stream_id="dnsdist_pdns3"} 25540 -dnscollector_throughput_ops_max{stream_id="dnsdist_pdns4"} 25351 -dnscollector_throughput_ops_max{stream_id="dnsdist_pdns1"} 25531 +dnscollector_throughput_ops_max{stream_id="dnsdist1"} 0 # HELP dnscollector_top_domains Number of hit per domain topN, partitioned by qname # TYPE dnscollector_top_domains gauge -dnscollector_top_domains{domain="35.test.com",stream_id="dnsdist_pdns2"} 17 -dnscollector_top_domains{domain="35.test.com",stream_id="dnsdist_pdns3"} 17 -dnscollector_top_domains{domain="am.test.com",stream_id="dnsdist_pdns2"} 17 -dnscollector_top_domains{domain="am.test.com",stream_id="dnsdist_pdns3"} 18 -dnscollector_top_domains{domain="am.test.com",stream_id="dnsdist_pdns4"} 18 -dnscollector_top_domains{domain="am.test.com",stream_id="dnsdist_pdns1"} 18 -dnscollector_top_domains{domain="fj.test.com",stream_id="dnsdist_pdns2"} 18 -dnscollector_top_domains{domain="fj.test.com",stream_id="dnsdist_pdns3"} 18 -dnscollector_top_domains{domain="fj.test.com",stream_id="dnsdist_pdns4"} 18 -dnscollector_top_domains{domain="fj.test.com",stream_id="dnsdist_pdns1"} 18 -dnscollector_top_domains{domain="fl.test.com",stream_id="dnsdist_pdns2"} 19 -dnscollector_top_domains{domain="fl.test.com",stream_id="dnsdist_pdns3"} 19 -dnscollector_top_domains{domain="fl.test.com",stream_id="dnsdist_pdns4"} 19 -dnscollector_top_domains{domain="fl.test.com",stream_id="dnsdist_pdns1"} 19 -dnscollector_top_domains{domain="ir.test.com",stream_id="dnsdist_pdns2"} 24 -dnscollector_top_domains{domain="ir.test.com",stream_id="dnsdist_pdns3"} 24 -dnscollector_top_domains{domain="ir.test.com",stream_id="dnsdist_pdns4"} 24 -dnscollector_top_domains{domain="ir.test.com",stream_id="dnsdist_pdns1"} 24 -dnscollector_top_domains{domain="ix.test.com",stream_id="dnsdist_pdns2"} 18 -dnscollector_top_domains{domain="ix.test.com",stream_id="dnsdist_pdns3"} 18 -dnscollector_top_domains{domain="ix.test.com",stream_id="dnsdist_pdns4"} 18 -dnscollector_top_domains{domain="ix.test.com",stream_id="dnsdist_pdns1"} 18 -dnscollector_top_domains{domain="m4.test.com",stream_id="dnsdist_pdns2"} 17 -dnscollector_top_domains{domain="m4.test.com",stream_id="dnsdist_pdns4"} 17 -dnscollector_top_domains{domain="m4.test.com",stream_id="dnsdist_pdns1"} 17 -dnscollector_top_domains{domain="pc.test.com",stream_id="dnsdist_pdns3"} 17 -dnscollector_top_domains{domain="pc.test.com",stream_id="dnsdist_pdns4"} 17 -dnscollector_top_domains{domain="pc.test.com",stream_id="dnsdist_pdns1"} 17 -dnscollector_top_domains{domain="qo.test.com",stream_id="dnsdist_pdns2"} 22 
-dnscollector_top_domains{domain="qo.test.com",stream_id="dnsdist_pdns3"} 22 -dnscollector_top_domains{domain="qo.test.com",stream_id="dnsdist_pdns4"} 22 -dnscollector_top_domains{domain="qo.test.com",stream_id="dnsdist_pdns1"} 22 -dnscollector_top_domains{domain="rc.test.com",stream_id="dnsdist_pdns2"} 17 -dnscollector_top_domains{domain="rc.test.com",stream_id="dnsdist_pdns3"} 17 -dnscollector_top_domains{domain="rc.test.com",stream_id="dnsdist_pdns4"} 17 -dnscollector_top_domains{domain="rc.test.com",stream_id="dnsdist_pdns1"} 17 -dnscollector_top_domains{domain="ws.test.com",stream_id="dnsdist_pdns2"} 20 -dnscollector_top_domains{domain="ws.test.com",stream_id="dnsdist_pdns3"} 20 -dnscollector_top_domains{domain="ws.test.com",stream_id="dnsdist_pdns4"} 20 -dnscollector_top_domains{domain="ws.test.com",stream_id="dnsdist_pdns1"} 20 +dnscollector_top_domains{domain="www.github.com",stream_id="dnsdist1"} 2 # HELP dnscollector_top_requesters Number of hit per requester topN, partitioned by client IP # TYPE dnscollector_top_requesters gauge -dnscollector_top_requesters{ip="172.17.0.3",stream_id="dnsdist_pdns2"} 550161 -dnscollector_top_requesters{ip="172.17.0.3",stream_id="dnsdist_pdns3"} 559725 -dnscollector_top_requesters{ip="172.17.0.3",stream_id="dnsdist_pdns4"} 559803 -dnscollector_top_requesters{ip="172.17.0.3",stream_id="dnsdist_pdns1"} 559749 \ No newline at end of file +dnscollector_top_requesters{ip="192.168.1.210",stream_id="dnsdist1"} 2 +# HELP dnscollector_total_domains_lru Total number of domains most recently observed per stream identity +# TYPE dnscollector_total_domains_lru gauge +dnscollector_total_domains_lru{stream_id="dnsdist1"} 1 +# HELP dnscollector_total_etldsplusone_lru Total number of etlds+one most recently observed per stream identity +# TYPE dnscollector_total_etldsplusone_lru gauge +dnscollector_total_etldsplusone_lru{stream_id="dnsdist1"} 0 +# HELP dnscollector_total_nxdomains_lru Total number of unknown domains most recently observed per stream identity +# TYPE dnscollector_total_nxdomains_lru gauge +dnscollector_total_nxdomains_lru{stream_id="dnsdist1"} 0 +# HELP dnscollector_total_requesters_lru Total number of DNS clients most recently observed per stream identity. +# TYPE dnscollector_total_requesters_lru gauge +dnscollector_total_requesters_lru{stream_id="dnsdist1"} 1 +# HELP dnscollector_total_suspicious_lru Total number of suspicious domains most recently observed per stream identity +# TYPE dnscollector_total_suspicious_lru gauge +dnscollector_total_suspicious_lru{stream_id="dnsdist1"} 0 +# HELP dnscollector_total_tlds_lru Total number of tld most recently observed per stream identity +# TYPE dnscollector_total_tlds_lru gauge +dnscollector_total_tlds_lru{stream_id="dnsdist1"} 0 +# HELP dnscollector_total_unanswered_lru Total number of unanswered domains most recently observed per stream identity +# TYPE dnscollector_total_unanswered_lru gauge +dnscollector_total_unanswered_lru{stream_id="dnsdist1"} 0 +# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. +# TYPE go_gc_duration_seconds summary +go_gc_duration_seconds{quantile="0"} 2.2399e-05 +go_gc_duration_seconds{quantile="0.25"} 2.2399e-05 +go_gc_duration_seconds{quantile="0.5"} 4.3674e-05 +go_gc_duration_seconds{quantile="0.75"} 5.4769e-05 +go_gc_duration_seconds{quantile="1"} 5.4769e-05 +go_gc_duration_seconds_sum 0.000120842 +go_gc_duration_seconds_count 3 +# HELP go_goroutines Number of goroutines that currently exist. 
+# TYPE go_goroutines gauge +go_goroutines 29 +# HELP go_info Information about the Go environment. +# TYPE go_info gauge +go_info{version="go1.21.4"} 1 +# HELP go_memstats_last_gc_time_seconds Number of seconds since 1970 of last garbage collection. +# TYPE go_memstats_last_gc_time_seconds gauge +go_memstats_last_gc_time_seconds 1.704015697147078e+09 +# HELP go_threads Number of OS threads created. +# TYPE go_threads gauge +go_threads 10 +# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. +# TYPE process_cpu_seconds_total counter +process_cpu_seconds_total 0.14 +# HELP process_max_fds Maximum number of open file descriptors. +# TYPE process_max_fds gauge +process_max_fds 1.048576e+06 +# HELP process_open_fds Number of open file descriptors. +# TYPE process_open_fds gauge +process_open_fds 17 +# HELP process_resident_memory_bytes Resident memory size in bytes. +# TYPE process_resident_memory_bytes gauge +process_resident_memory_bytes 3.9059456e+07 +# HELP process_start_time_seconds Start time of the process since unix epoch in seconds. +# TYPE process_start_time_seconds gauge +process_start_time_seconds 1.7040156967e+09 +# HELP process_virtual_memory_bytes Virtual memory size in bytes. +# TYPE process_virtual_memory_bytes gauge +process_virtual_memory_bytes 2.036846592e+09 +# HELP process_virtual_memory_max_bytes Maximum amount of virtual memory available in bytes. +# TYPE process_virtual_memory_max_bytes gauge +process_virtual_memory_max_bytes 1.8446744073709552e+19 \ No newline at end of file From 15706347d59bcfad4f4d7d5649d299769862cea6 Mon Sep 17 00:00:00 2001 From: dmachard <5562930+dmachard@users.noreply.github.com> Date: Sun, 31 Dec 2023 11:00:55 +0100 Subject: [PATCH 10/12] Update docs --- docs/loggers/logger_prometheus.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/loggers/logger_prometheus.md b/docs/loggers/logger_prometheus.md index a791f59d..9451d585 100644 --- a/docs/loggers/logger_prometheus.md +++ b/docs/loggers/logger_prometheus.md @@ -109,7 +109,7 @@ The following [build-in](https://grafana.com/grafana/dashboards/16630) dashboard # Merge streams for metrics computation -If you want to compute metrics based on the merge of all streams instead of each one, use the following settings: +Use the following setting to consolidate all streams into one for metric computations. 
 ```yaml
 prometheus:

From 7069277f8e7330a08525620c7843f14790b13050 Mon Sep 17 00:00:00 2001
From: dmachard <5562930+dmachard@users.noreply.github.com>
Date: Tue, 2 Jan 2024 13:34:19 +0100
Subject: [PATCH 11/12] update and fix tests

---
 config.yml                        |  29 ++++
 dnsutils/constant.go              |   1 +
 docs/loggers/logger_prometheus.md |  31 +++-
 loggers/prometheus.go             | 231 +++++++++++++++++-------------
 loggers/prometheus_test.go        |  73 +++++-----
 pkgconfig/loggers.go              |  68 ++++++---
 6 files changed, 274 insertions(+), 159 deletions(-)

diff --git a/config.yml b/config.yml
index eaad0080..c3cff510 100644
--- a/config.yml
+++ b/config.yml
@@ -87,6 +87,7 @@ multiplexer:
           requesters-cache-ttl: 3600
           domains-cache-size: 50000
           domains-cache-ttl: 3600
+          histogram-metrics-enabled: true

       routes:
         - from: [ tap ]
@@ -284,6 +285,18 @@ multiplexer:
 #     chan-buffer-size: 65535
 #     # compute histogram for qnames length, latencies, queries and replies size repartition
 #     histogram-metrics-enabled: false
+#     # compute requesters metrics - total and top requesters
+#     requesters-metrics-enabled: true
+#     # compute domains metrics - total and top domains
+#     domains-metrics-enabled: true
+#     # compute NOERROR domains metrics - total and top domains
+#     noerror-metrics-enabled: true
+#     # compute SERVFAIL domains metrics - total and top domains
+#     servfail-metrics-enabled: true
+#     # compute NXDOMAIN domains metrics - total and top domains
+#     nonexistent-metrics-enabled: true
+#     # compute TIMEOUT domains metrics - total and top domains
+#     timeout-metrics-enabled: true
 #     # prometheus-labels: (list of strings) labels to add to metrics. Currently supported labels: stream_id, resolver, stream_global
 #     prometheus-labels: ["stream_id"]
 #     # LRU (least-recently-used) cache size for observed clients DNS
@@ -294,6 +307,22 @@ multiplexer:
 #     domains-cache-size: 500000
 #     # maximum time (in seconds) before eviction from the LRU cache
 #     domains-cache-ttl: 3600
+#     # LRU (least-recently-used) cache size for observed NOERROR domains
+#     noerror-domains-cache-size: 500000
+#     # maximum time (in seconds) before eviction from the LRU cache
+#     noerror-domains-cache-ttl: 3600
+#     # LRU (least-recently-used) cache size for observed SERVFAIL domains
+#     servfail-domains-cache-size: 500000
+#     # maximum time (in seconds) before eviction from the LRU cache
+#     servfail-domains-cache-ttl: 3600
+#     # LRU (least-recently-used) cache size for observed NX domains
+#     nonexistent-domains-cache-size: 500000
+#     # maximum time (in seconds) before eviction from the LRU cache
+#     nonexistent-domains-cache-ttl: 3600
+#     # LRU (least-recently-used) cache size for observed other domains (suspicious, tlds, ...)
+#     default-domains-cache-size: 500000
+#     # maximum time (in seconds) before eviction from the LRU cache
+#     default-domains-cache-ttl: 3600

 #   # write captured dns traffic to text or binary files with rotation and compression support
 #   logfile:
diff --git a/dnsutils/constant.go b/dnsutils/constant.go
index 387f65fa..5e91c3a4 100644
--- a/dnsutils/constant.go
+++ b/dnsutils/constant.go
@@ -4,6 +4,7 @@ const (
 	ProtoDoT = "DOT"
 	ProtoDoH = "DOH"

+	DNSRcodeNoError  = "NOERROR"
 	DNSRcodeNXDomain = "NXDOMAIN"
 	DNSRcodeServFail = "SERVFAIL"
 	DNSRcodeTimeout  = "TIMEOUT"
diff --git a/docs/loggers/logger_prometheus.md b/docs/loggers/logger_prometheus.md
index 9451d585..b485fc5d 100644
--- a/docs/loggers/logger_prometheus.md
+++ b/docs/loggers/logger_prometheus.md
@@ -20,10 +20,16 @@ Options:

 - `chan-buffer-size`: (integer) channel buffer size used on incoming dns message, number of messages before to drop it.
 - `histogram-metrics-enabled`: (boolean) compute histogram for qnames length, latencies, queries and replies size repartition
 - `prometheus-labels`: (list of strings) labels to add to metrics. Currently supported labels: `stream_id` (default), `stream_global`, `resolver`
-- `requesters-cache-size`: (integer) LRU (least-recently-used) cache size for observed clients DNS
+- `requesters-cache-size`: (integer) LRU (least-recently-used) cache size for observed DNS clients per stream
 - `requesters-cache-ttl`: (integer) maximum time (in seconds) before eviction from the LRU cache
-- `domains-cache-size`: (integer) LRU (least-recently-used) cache size for observed domains
+- `domains-cache-size`: (integer) LRU (least-recently-used) cache size for observed domains per stream
 - `domains-cache-ttl`: (integer) maximum time (in seconds) before eviction from the LRU cache
+- `noerror-domains-cache-size`: (integer) LRU (least-recently-used) cache size for observed NOERROR domains per stream
+- `noerror-domains-cache-ttl`: (integer) maximum time (in seconds) before eviction from the LRU cache
+- `servfail-domains-cache-size`: (integer) LRU (least-recently-used) cache size for observed SERVFAIL domains per stream
+- `servfail-domains-cache-ttl`: (integer) maximum time (in seconds) before eviction from the LRU cache
+- `nonexistent-domains-cache-size`: (integer) LRU (least-recently-used) cache size for observed NX domains per stream
+- `nonexistent-domains-cache-ttl`: (integer) maximum time (in seconds) before eviction from the LRU cache

 Default values:
@@ -43,11 +49,25 @@ prometheus:
   top-n: 10
   chan-buffer-size: 65535
   histogram-metrics-enabled: false
+  requesters-metrics-enabled: true
+  domains-metrics-enabled: true
+  noerror-metrics-enabled: true
+  servfail-metrics-enabled: true
+  nonexistent-metrics-enabled: true
+  timeout-metrics-enabled: true
   prometheus-labels: ["stream_id"]
   requesters-cache-size: 250000
   requesters-cache-ttl: 3600
   domains-cache-size: 500000
   domains-cache-ttl: 3600
+  noerror-domains-cache-size: 100000
+  noerror-domains-cache-ttl: 3600
+  servfail-domains-cache-size: 10000
+  servfail-domains-cache-ttl: 3600
+  nonexistent-domains-cache-size: 10000
+  nonexistent-domains-cache-ttl: 3600
+  default-domains-cache-size: 1000
+  default-domains-cache-ttl: 3600
 ```

 Scrape metric with curl:
@@ -64,9 +84,10 @@ The full metrics can be found [here](./../metrics.txt).
 |-------------------------------------------------|------------------------------------
 | dnscollector_build_info                          | Build info
 | dnscollector_total_requesters_lru                | Total number of DNS clients most recently observed per stream identity.
-| dnscollector_total_sfdomains_lru                 | Total number of serverfail domains most recently observed per stream identity
-| dnscollector_total_nxdomains_lru                 | Total number of NX domains most recently observed per stream identity
-| dnscollector_total_domains_lru                   | Total number of domains most recently observed per stream identity
+| dnscollector_total_domains_lru                   | Total number of domains most recently observed per stream identity
+| dnscollector_total_noerror_domains_lru           | Total number of NOERROR domains most recently observed per stream identity
+| dnscollector_total_servfail_domains_lru          | Total number of SERVFAIL domains most recently observed per stream identity
+| dnscollector_total_nonexistent_domains_lru       | Total number of NX domains most recently observed per stream identity
 | dnscollector_dnsmessage_total                    | Counter of total of DNS messages
 | dnscollector_queries_total                       | Counter of total of queries
 | dnscollector_replies_total                       | Counter of total of replies
diff --git a/loggers/prometheus.go b/loggers/prometheus.go
index cf92a1fd..f4af353d 100644
--- a/loggers/prometheus.go
+++ b/loggers/prometheus.go
@@ -97,25 +97,27 @@ type PrometheusCountersCatalogue interface {
 type PrometheusCountersSet struct {
 	prom *Prometheus

-	// Counters
-	requesters  *expirable.LRU[string, int] // Requests number made by a specific requestor
-	domains     *expirable.LRU[string, int] // Requests number made to find out about a specific domain
-	nxdomains   *expirable.LRU[string, int] // Requests number ended up in NXDOMAIN
-	sfdomains   *expirable.LRU[string, int] // Requests number ended up in SERVFAIL
-	tlds        *expirable.LRU[string, int] // Requests number for a specific TLD
-	etldplusone *expirable.LRU[string, int] // Requests number for a specific eTLD+1
-	suspicious  *expirable.LRU[string, int] // Requests number for a specific name that looked suspicious
-	evicted     *expirable.LRU[string, int] // Requests number for a specific name that timed out
-
-	epsCounters    EpsCounters
-	topRequesters  *topmap.TopMap
-	topEvicted     *topmap.TopMap
-	topSfDomains   *topmap.TopMap
-	topDomains     *topmap.TopMap
-	topNxDomains   *topmap.TopMap
-	topTlds        *topmap.TopMap
-	topETLDPlusOne *topmap.TopMap
-	topSuspicious  *topmap.TopMap
+	// LRU cache counters per domain and IP
+	requesters   *expirable.LRU[string, int] // Requests number made by a specific requestor
+	allDomains   *expirable.LRU[string, int] // Requests number made to find out about a specific domain
+	validDomains *expirable.LRU[string, int] // Requests number ended up in NOERROR
+	nxDomains    *expirable.LRU[string, int] // Requests number ended up in NXDOMAIN
+	sfDomains    *expirable.LRU[string, int] // Requests number ended up in SERVFAIL
+	tlds         *expirable.LRU[string, int] // Requests number for a specific TLD
+	etldplusone  *expirable.LRU[string, int] // Requests number for a specific eTLD+1
+	suspicious   *expirable.LRU[string, int] // Requests number for a specific name that looked suspicious
+	evicted      *expirable.LRU[string, int] // Requests number for a specific name that timed out
+
+	epsCounters     EpsCounters
+	topRequesters   *topmap.TopMap
+	topAllDomains   *topmap.TopMap
+	topEvicted      *topmap.TopMap
+	topValidDomains *topmap.TopMap
+	topSfDomains    *topmap.TopMap
+	topNxDomains    *topmap.TopMap
+	topTlds         *topmap.TopMap
+	topETLDPlusOne  *topmap.TopMap
+	topSuspicious   *topmap.TopMap

 	labels prometheus.Labels // Do we really need to keep that map outside of registration?
sync.Mutex // Each PrometheusCountersSet locks independently @@ -190,6 +192,7 @@ type Prometheus struct { // All metrics use these descriptions when regestering gaugeTopDomains *prometheus.Desc + gaugeTopNoerrDomains *prometheus.Desc gaugeTopNxDomains *prometheus.Desc gaugeTopSfDomains *prometheus.Desc gaugeTopRequesters *prometheus.Desc @@ -198,14 +201,15 @@ type Prometheus struct { gaugeTopSuspicious *prometheus.Desc gaugeTopEvicted *prometheus.Desc - gaugeDomains *prometheus.Desc - gaugeDomainsNx *prometheus.Desc - gaugeDomainsSf *prometheus.Desc - gaugeRequesters *prometheus.Desc - gaugeTlds *prometheus.Desc - gaugeETldPlusOne *prometheus.Desc - gaugeSuspicious *prometheus.Desc - gaugeEvicted *prometheus.Desc + gaugeDomainsAll *prometheus.Desc + gaugeDomainsValid *prometheus.Desc + gaugeDomainsNx *prometheus.Desc + gaugeDomainsSf *prometheus.Desc + gaugeRequesters *prometheus.Desc + gaugeTlds *prometheus.Desc + gaugeETldPlusOne *prometheus.Desc + gaugeSuspicious *prometheus.Desc + gaugeEvicted *prometheus.Desc gaugeEps *prometheus.Desc gaugeEpsMax *prometheus.Desc @@ -242,16 +246,17 @@ type Prometheus struct { func newPrometheusCounterSet(p *Prometheus, labels prometheus.Labels) *PrometheusCountersSet { pcs := &PrometheusCountersSet{ - prom: p, - labels: labels, - requesters: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.RequestersCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.RequestersCacheTTL)), - domains: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), - nxdomains: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), - sfdomains: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), - tlds: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), - etldplusone: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), - suspicious: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), - evicted: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), + prom: p, + labels: labels, + requesters: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.RequestersCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.RequestersCacheTTL)), + allDomains: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DomainsCacheTTL)), + validDomains: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.NoErrorDomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.NoErrorDomainsCacheTTL)), + nxDomains: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.NXDomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.NXDomainsCacheTTL)), + sfDomains: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.ServfailDomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.ServfailDomainsCacheTTL)), + tlds: 
expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DefaultDomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DefaultDomainsCacheTTL)), + etldplusone: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DefaultDomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DefaultDomainsCacheTTL)), + suspicious: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DefaultDomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DefaultDomainsCacheTTL)), + evicted: expirable.NewLRU[string, int](p.config.Loggers.Prometheus.DefaultDomainsCacheSize, nil, time.Second*time.Duration(p.config.Loggers.Prometheus.DefaultDomainsCacheTTL)), epsCounters: EpsCounters{ TotalRcodes: make(map[string]float64), @@ -260,14 +265,15 @@ func newPrometheusCounterSet(p *Prometheus, labels prometheus.Labels) *Prometheu TotalIPProtocol: make(map[string]float64), }, - topRequesters: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), - topEvicted: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), - topSfDomains: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), - topDomains: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), - topNxDomains: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), - topTlds: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), - topETLDPlusOne: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), - topSuspicious: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), + topRequesters: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), + topEvicted: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), + topAllDomains: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), + topValidDomains: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), + topSfDomains: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), + topNxDomains: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), + topTlds: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), + topETLDPlusOne: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), + topSuspicious: topmap.NewTopMap(p.config.Loggers.Prometheus.TopN), } prometheus.WrapRegistererWith(labels, p.promRegistry).MustRegister(pcs) return pcs @@ -284,6 +290,7 @@ func (c *PrometheusCountersSet) Describe(ch chan<- *prometheus.Desc) { c.Lock() defer c.Unlock() ch <- c.prom.gaugeTopDomains + ch <- c.prom.gaugeTopNoerrDomains ch <- c.prom.gaugeTopNxDomains ch <- c.prom.gaugeTopSfDomains ch <- c.prom.gaugeTopRequesters @@ -293,7 +300,8 @@ func (c *PrometheusCountersSet) Describe(ch chan<- *prometheus.Desc) { ch <- c.prom.gaugeTopEvicted // Counter metrics - ch <- c.prom.gaugeDomains + ch <- c.prom.gaugeDomainsAll + ch <- c.prom.gaugeDomainsValid ch <- c.prom.gaugeDomainsNx ch <- c.prom.gaugeDomainsSf ch <- c.prom.gaugeRequesters @@ -331,59 +339,64 @@ func (c *PrometheusCountersSet) Record(dm dnsutils.DNSMessage) { c.Lock() defer c.Unlock() - count, _ := c.requesters.Get(dm.NetworkInfo.QueryIP) - c.requesters.Add(dm.NetworkInfo.QueryIP, count+1) - c.topRequesters.Record(dm.NetworkInfo.QueryIP, count+1) + // count all uniq requesters if enabled + if c.prom.config.Loggers.Prometheus.RequestersMetricsEnabled { + count, _ := c.requesters.Get(dm.NetworkInfo.QueryIP) + c.requesters.Add(dm.NetworkInfo.QueryIP, count+1) + c.topRequesters.Record(dm.NetworkInfo.QueryIP, count+1) + } + + // count all uniq domains if enabled + if c.prom.config.Loggers.Prometheus.DomainsMetricsEnabled { + count, _ := c.allDomains.Get(dm.DNS.Qname) + c.allDomains.Add(dm.DNS.Qname, count+1) + c.topAllDomains.Record(dm.DNS.Qname, count+1) + } - 
// // top domains - switch dm.DNS.Rcode { - case dnsutils.DNSRcodeTimeout: + // top domains + switch { + case dm.DNS.Rcode == dnsutils.DNSRcodeTimeout && c.prom.config.Loggers.Prometheus.TimeoutMetricsEnabled: count, _ := c.evicted.Get(dm.DNS.Qname) c.evicted.Add(dm.DNS.Qname, count+1) c.topEvicted.Record(dm.DNS.Qname, count+1) - case dnsutils.DNSRcodeServFail: - count, _ := c.sfdomains.Get(dm.DNS.Qname) - c.sfdomains.Add(dm.DNS.Qname, count+1) + case dm.DNS.Rcode == dnsutils.DNSRcodeServFail && c.prom.config.Loggers.Prometheus.ServfailMetricsEnabled: + count, _ := c.sfDomains.Get(dm.DNS.Qname) + c.sfDomains.Add(dm.DNS.Qname, count+1) c.topSfDomains.Record(dm.DNS.Qname, count+1) - case dnsutils.DNSRcodeNXDomain: - count, _ := c.nxdomains.Get(dm.DNS.Qname) - c.nxdomains.Add(dm.DNS.Qname, count+1) + case dm.DNS.Rcode == dnsutils.DNSRcodeNXDomain && c.prom.config.Loggers.Prometheus.NonExistentMetricsEnabled: + count, _ := c.nxDomains.Get(dm.DNS.Qname) + c.nxDomains.Add(dm.DNS.Qname, count+1) c.topNxDomains.Record(dm.DNS.Qname, count+1) - default: - count, _ := c.domains.Get(dm.DNS.Qname) - c.domains.Add(dm.DNS.Qname, count+1) - c.topDomains.Record(dm.DNS.Qname, count+1) + case dm.DNS.Rcode == dnsutils.DNSRcodeNoError && c.prom.config.Loggers.Prometheus.NoErrorMetricsEnabled: + count, _ := c.validDomains.Get(dm.DNS.Qname) + c.validDomains.Add(dm.DNS.Qname, count+1) + c.topValidDomains.Record(dm.DNS.Qname, count+1) } // count and top tld - if dm.PublicSuffix != nil { - if dm.PublicSuffix.QnamePublicSuffix != "-" { - count, _ := c.tlds.Get(dm.PublicSuffix.QnamePublicSuffix) - c.tlds.Add(dm.PublicSuffix.QnamePublicSuffix, count+1) - c.topTlds.Record(dm.PublicSuffix.QnamePublicSuffix, count+1) - } + if dm.PublicSuffix != nil && dm.PublicSuffix.QnamePublicSuffix != "-" { + count, _ := c.tlds.Get(dm.PublicSuffix.QnamePublicSuffix) + c.tlds.Add(dm.PublicSuffix.QnamePublicSuffix, count+1) + c.topTlds.Record(dm.PublicSuffix.QnamePublicSuffix, count+1) } // count TLD+1 if it is set - if dm.PublicSuffix != nil { - if dm.PublicSuffix.QnameEffectiveTLDPlusOne != "-" { - count, _ := c.etldplusone.Get(dm.PublicSuffix.QnameEffectiveTLDPlusOne) - c.etldplusone.Add(dm.PublicSuffix.QnameEffectiveTLDPlusOne, count+1) - c.topETLDPlusOne.Record(dm.PublicSuffix.QnameEffectiveTLDPlusOne, count+1) - } + if dm.PublicSuffix != nil && dm.PublicSuffix.QnameEffectiveTLDPlusOne != "-" { + count, _ := c.etldplusone.Get(dm.PublicSuffix.QnameEffectiveTLDPlusOne) + c.etldplusone.Add(dm.PublicSuffix.QnameEffectiveTLDPlusOne, count+1) + c.topETLDPlusOne.Record(dm.PublicSuffix.QnameEffectiveTLDPlusOne, count+1) } // suspicious domains - if dm.Suspicious != nil { - if dm.Suspicious.Score > 0.0 { - count, _ := c.suspicious.Get(dm.DNS.Qname) - c.suspicious.Add(dm.DNS.Qname, count+1) - c.topSuspicious.Record(dm.DNS.Qname, count+1) - } + if dm.Suspicious != nil && dm.Suspicious.Score > 0.0 { + count, _ := c.suspicious.Get(dm.DNS.Qname) + c.suspicious.Add(dm.DNS.Qname, count+1) + c.topSuspicious.Record(dm.DNS.Qname, count+1) } + // compute histograms, no more enabled by default to avoid to hurt performance. 
if c.prom.config.Loggers.Prometheus.HistogramMetricsEnabled { c.prom.histogramQnamesLength.With(c.labels).Observe(float64(len(dm.DNS.Qname))) @@ -465,17 +478,21 @@ func (c *PrometheusCountersSet) Record(dm dnsutils.DNSMessage) { func (c *PrometheusCountersSet) Collect(ch chan<- prometheus.Metric) { c.Lock() defer c.Unlock() - // Update number of domains - ch <- prometheus.MustNewConstMetric(c.prom.gaugeDomains, prometheus.GaugeValue, - float64(c.domains.Len()), + // Update number of all domains + ch <- prometheus.MustNewConstMetric(c.prom.gaugeDomainsAll, prometheus.GaugeValue, + float64(c.allDomains.Len()), + ) + // Update number of valid domains (noerror) + ch <- prometheus.MustNewConstMetric(c.prom.gaugeDomainsValid, prometheus.GaugeValue, + float64(c.validDomains.Len()), ) // Count NX domains ch <- prometheus.MustNewConstMetric(c.prom.gaugeDomainsNx, prometheus.GaugeValue, - float64(c.nxdomains.Len()), + float64(c.nxDomains.Len()), ) // Count SERVFAIL domains ch <- prometheus.MustNewConstMetric(c.prom.gaugeDomainsSf, prometheus.GaugeValue, - float64(c.sfdomains.Len()), + float64(c.sfDomains.Len()), ) // Requesters counter ch <- prometheus.MustNewConstMetric(c.prom.gaugeRequesters, prometheus.GaugeValue, @@ -500,11 +517,18 @@ func (c *PrometheusCountersSet) Collect(ch chan<- prometheus.Metric) { ch <- prometheus.MustNewConstMetric(c.prom.gaugeEvicted, prometheus.GaugeValue, float64(c.evicted.Len()), ) - for _, r := range c.topDomains.Get() { + + // Count for all top domains + for _, r := range c.topAllDomains.Get() { ch <- prometheus.MustNewConstMetric(c.prom.gaugeTopDomains, prometheus.GaugeValue, float64(r.Hit), strings.ToValidUTF8(r.Name, "�")) } + for _, r := range c.topValidDomains.Get() { + ch <- prometheus.MustNewConstMetric(c.prom.gaugeTopNoerrDomains, prometheus.GaugeValue, + float64(r.Hit), strings.ToValidUTF8(r.Name, "�")) + } + for _, r := range c.topNxDomains.Get() { ch <- prometheus.MustNewConstMetric(c.prom.gaugeTopNxDomains, prometheus.GaugeValue, float64(r.Hit), strings.ToValidUTF8(r.Name, "�")) @@ -811,14 +835,21 @@ func (c *Prometheus) InitProm() { "Number of hit per domain topN, partitioned by qname", []string{"domain"}, nil, ) + + c.gaugeTopNoerrDomains = prometheus.NewDesc( + fmt.Sprintf("%s_top_noerror_domains", promPrefix), + "Number of hit per domain topN, partitioned by qname", + []string{"domain"}, nil, + ) + c.gaugeTopNxDomains = prometheus.NewDesc( - fmt.Sprintf("%s_top_nxdomains", promPrefix), + fmt.Sprintf("%s_top_nonexistent_domains", promPrefix), "Number of hit per nx domain topN, partitioned by qname", []string{"domain"}, nil, ) c.gaugeTopSfDomains = prometheus.NewDesc( - fmt.Sprintf("%s_top_sfdomains", promPrefix), + fmt.Sprintf("%s_top_servfail_domains", promPrefix), "Number of hit per servfail domain topN, partitioned by stream and qname", []string{"domain"}, nil, ) @@ -836,7 +867,7 @@ func (c *Prometheus) InitProm() { ) // etldplusone_top_total c.gaugeTopETldsPlusOne = prometheus.NewDesc( - fmt.Sprintf("%s_top_etldplusone", promPrefix), + fmt.Sprintf("%s_top_etlds_plusone", promPrefix), "Number of hit per eTLD+1 - topN", []string{"suffix"}, nil, ) @@ -866,21 +897,27 @@ func (c *Prometheus) InitProm() { ) // Counter metrics - c.gaugeDomains = prometheus.NewDesc( + c.gaugeDomainsAll = prometheus.NewDesc( fmt.Sprintf("%s_total_domains_lru", promPrefix), - "Total number of domains most recently observed per stream identity ", + "Total number of uniq domains most recently observed per stream identity ", + nil, nil, + ) + + c.gaugeDomainsValid = 
prometheus.NewDesc( + fmt.Sprintf("%s_total_noerror_domains_lru", promPrefix), + "Total number of NOERROR domains most recently observed per stream identity ", nil, nil, ) c.gaugeDomainsNx = prometheus.NewDesc( - fmt.Sprintf("%s_total_nxdomains_lru", promPrefix), - "Total number of unknown domains most recently observed per stream identity", + fmt.Sprintf("%s_total_nonexistent_domains_lru", promPrefix), + "Total number of NX domains most recently observed per stream identity", nil, nil, ) c.gaugeDomainsSf = prometheus.NewDesc( - fmt.Sprintf("%s_sfdomains_lru", promPrefix), - "Total number of serverfail domains most recently observed per stream identity", + fmt.Sprintf("%s_total_servfail_domains_lru", promPrefix), + "Total number of SERVFAIL domains most recently observed per stream identity", nil, nil, ) @@ -897,8 +934,8 @@ func (c *Prometheus) InitProm() { ) c.gaugeETldPlusOne = prometheus.NewDesc( - fmt.Sprintf("%s_total_etldsplusone_lru", promPrefix), - "Total number of etlds+one most recently observed per stream identity", + fmt.Sprintf("%s_total_etlds_plusone_lru", promPrefix), + "Total number of etld+one most recently observed per stream identity", nil, nil, ) diff --git a/loggers/prometheus_test.go b/loggers/prometheus_test.go index 6eb15b5e..e44d69de 100644 --- a/loggers/prometheus_test.go +++ b/loggers/prometheus_test.go @@ -64,7 +64,7 @@ func TestPrometheus_GetMetrics(t *testing.T) { config.Loggers.Prometheus.HistogramMetricsEnabled = true // By default, prometheus uses 'stream_id' as the label - // t.Run("SingleLabelStreamID", getMetricsTestCase(config, map[string]string{"stream_id": "collector"})) + t.Run("SingleLabelStreamID", getMetricsTestCase(config, map[string]string{"stream_id": "collector"})) config.Loggers.Prometheus.LabelsList = []string{"resolver", "stream_id"} t.Run("TwoLabelsStreamIDResolver", getMetricsTestCase(config, map[string]string{"resolver": "4.3.2.1", "stream_id": "collector"})) @@ -100,10 +100,8 @@ func getMetricsTestCase(config *pkgconfig.Config, labels map[string]string) func nxRecord.NetworkInfo.Protocol = UDP nxRecord.NetworkInfo.Family = IPv4 nxRecord.DNS.Length = 123 + nxRecord.DNSTap.Latency = 0.05 - // nxRecord.PublicSuffix = &dnsutils.TransformPublicSuffix{ - // QnamePublicSuffix: "faketld1", - // } g.Record(nxRecord) sfRecord := dnsutils.GetFakeDNSMessage() @@ -112,6 +110,7 @@ func getMetricsTestCase(config *pkgconfig.Config, labels map[string]string) func sfRecord.NetworkInfo.Protocol = UDP sfRecord.NetworkInfo.Family = IPv4 sfRecord.DNS.Length = 123 + sfRecord.DNSTap.Latency = 0.05 g.Record(sfRecord) @@ -122,27 +121,30 @@ func getMetricsTestCase(config *pkgconfig.Config, labels map[string]string) func // call ComputeMetrics for the second time, to calculate per-second metrcis g.ComputeEventsPerSecond() mf := getMetrics(g, t) + ensureMetricValue(t, mf, "dnscollector_bytes_total", labels, 369) ensureMetricValue(t, mf, "dnscollector_received_bytes_total", labels, 123) ensureMetricValue(t, mf, "dnscollector_sent_bytes_total", labels, 246) ensureMetricValue(t, mf, "dnscollector_throughput_ops", labels, 2) - ensureMetricValue(t, mf, "dnscollector_tlds_total", labels, 1) - ensureMetricValue(t, mf, "dnscollector_requesters_total", labels, 1) - ensureMetricValue(t, mf, "dnscollector_domains_total", labels, 1) - ensureMetricValue(t, mf, "dnscollector_domains_domains_total", labels, 1) - ensureMetricValue(t, mf, "dnscollector_nxdomains_total", labels, 1) - ensureMetricValue(t, mf, "dnscollector_sfdomains_total", labels, 1) + ensureMetricValue(t, mf, 
"dnscollector_total_tlds_lru", labels, 1) + ensureMetricValue(t, mf, "dnscollector_total_requesters_lru", labels, 1) + ensureMetricValue(t, mf, "dnscollector_total_domains_lru", labels, 1) + ensureMetricValue(t, mf, "dnscollector_total_noerror_domains_lru", labels, 1) + ensureMetricValue(t, mf, "dnscollector_total_nonexistent_domains_lru", labels, 1) + ensureMetricValue(t, mf, "dnscollector_total_servfail_domains_lru", labels, 1) + ensureMetricValue(t, mf, "dnscollector_dnsmessages_total", labels, 3) ensureMetricValue(t, mf, "dnscollector_queries_total", labels, 1) ensureMetricValue(t, mf, "dnscollector_replies_total", labels, 2) ensureMetricValue(t, mf, "dnscollector_flag_aa_total", labels, 1) labels["domain"] = "dns.collector" - ensureMetricValue(t, mf, "dnscollector_top_domains", labels, 1) - ensureMetricValue(t, mf, "dnscollector_top_nxdomains", labels, 1) - ensureMetricValue(t, mf, "dnscollector_top_sfdomains", labels, 1) + ensureMetricValue(t, mf, "dnscollector_top_domains", labels, 3) + ensureMetricValue(t, mf, "dnscollector_top_noerror_domains", labels, 1) + ensureMetricValue(t, mf, "dnscollector_top_nonexistent_domains", labels, 1) + ensureMetricValue(t, mf, "dnscollector_top_servfail_domains", labels, 1) delete(labels, "domain") labels["query_type"] = "A" @@ -154,13 +156,9 @@ func getMetricsTestCase(config *pkgconfig.Config, labels map[string]string) func labels["net_family"] = "IPv4" ensureMetricValue(t, mf, "dnscollector_ipversion_total", labels, 3) delete(labels, "net_family") - ensureMetricValue(t, mf, "dnscollector_latencies_count", labels, 1) - labels["le"] = "0.001" - ensureMetricValue(t, mf, "dnscollector_latencies_bucket", labels, 0) - labels["le"] = "0.1" - ensureMetricValue(t, mf, "dnscollector_latencies_bucket", labels, 1) - labels["le"] = "+Inf" - ensureMetricValue(t, mf, "dnscollector_latencies_bucket", labels, 1) + + // check histogram + ensureMetricValue(t, mf, "dnscollector_latencies", labels, 3) } } @@ -187,10 +185,6 @@ func TestPrometheus_EPS_Counters(t *testing.T) { ensureMetricValue(t, mf, "dnscollector_throughput_ops", map[string]string{"stream_id": "collector"}, 2) ensureMetricValue(t, mf, "dnscollector_throughput_ops_max", map[string]string{"stream_id": "collector"}, 2) - // for _, tc := range tt_1 { - // validateEPSCaseHelper(t, config, tc) - // } - // During next 'second' we see only 1 event. 
EPS counter changes, EPS Max counter keeps it's value g.Record(noErrorRecord) g.ComputeEventsPerSecond() @@ -199,10 +193,6 @@ func TestPrometheus_EPS_Counters(t *testing.T) { ensureMetricValue(t, mf, "dnscollector_throughput_ops", map[string]string{"stream_id": "collector"}, 1) ensureMetricValue(t, mf, "dnscollector_throughput_ops_max", map[string]string{"stream_id": "collector"}, 2) - // for _, tc := range tt_2 { - // validateEPSCaseHelper(t, config, tc) - // } - } func TestPrometheus_BuildInfo(t *testing.T) { @@ -235,7 +225,7 @@ func TestPrometheus_ConfirmDifferentResolvers(t *testing.T) { ensureMetricValue(t, mf, "dnscollector_bytes_total", map[string]string{"resolver": "10.10.10.10"}, 999) } -func TestPrometheus_etldplusone(t *testing.T) { +func TestPrometheus_Etldplusone(t *testing.T) { config := pkgconfig.GetFakeConfig() config.Loggers.Prometheus.LabelsList = []string{"stream_id"} g := NewPrometheus(config, logger.New(false), "test") @@ -258,13 +248,14 @@ func TestPrometheus_etldplusone(t *testing.T) { g.Record(noErrorRecord) mf := getMetrics(g, t) - ensureMetricValue(t, mf, "dnscollector_etldplusone_total", map[string]string{"stream_id": "collector"}, 2) - ensureMetricValue(t, mf, "dnscollector_etldplusone_top", map[string]string{"stream_id": "collector", "suffix": "anotherdomain.co.uk"}, 1) + ensureMetricValue(t, mf, "dnscollector_total_etlds_plusone_lru", map[string]string{"stream_id": "collector"}, 2) + ensureMetricValue(t, mf, "dnscollector_top_etlds_plusone", map[string]string{"stream_id": "collector", "suffix": "anotherdomain.co.uk"}, 1) } func ensureMetricValue(t *testing.T, mf map[string]*dto.MetricFamily, name string, labels map[string]string, value float64) bool { m, found := mf[name] if !found { + t.Errorf("Not found metric %v", name) return false } // Match labels @@ -294,11 +285,16 @@ func ensureMetricValue(t *testing.T, mf map[string]*dto.MetricFamily, name strin if pv == value { return true } + case dto.MetricType_HISTOGRAM: + pv = float64(*metric.GetHistogram().SampleCount) + if pv == value { + return true + } } t.Errorf("Metric %v, expected=%v, got=%v", name, value, pv) } } - t.Errorf("Not found metric %v{%v}", name, labels) + t.Errorf("Not found metric with label %v{%v}", name, labels) return false } @@ -351,13 +347,16 @@ func TestPrometheus_QnameInvalidChars(t *testing.T) { g.Record(dmSf) mf := getMetrics(g, t) - if !ensureMetricValue(t, mf, "dnscollector_top_domains", map[string]string{"domain": qnameValidUTF8}, 1) { + if !ensureMetricValue(t, mf, "dnscollector_top_domains", map[string]string{"domain": qnameValidUTF8}, 3) { t.Errorf("Cannot validate dnscollector_top_domains!") } - if !ensureMetricValue(t, mf, "dnscollector_top_nxdomains", map[string]string{"domain": qnameValidUTF8}, 1) { - t.Errorf("Cannot validate dnscollector_top_nxdomains!") + if !ensureMetricValue(t, mf, "dnscollector_top_noerror_domains", map[string]string{"domain": qnameValidUTF8}, 1) { + t.Errorf("Cannot validate dnscollector_top_noerror_domains!") + } + if !ensureMetricValue(t, mf, "dnscollector_top_nonexistent_domains", map[string]string{"domain": qnameValidUTF8}, 1) { + t.Errorf("Cannot validate dnscollector_top_nonexistent_domains!") } - if !ensureMetricValue(t, mf, "dnscollector_top_sfdomains", map[string]string{"domain": qnameValidUTF8}, 1) { - t.Errorf("Cannot validate dnscollector_top_sfdomains!") + if !ensureMetricValue(t, mf, "dnscollector_top_servfail_domains", map[string]string{"domain": qnameValidUTF8}, 1) { + t.Errorf("Cannot validate dnscollector_top_servfail_domains!") } } 
diff --git a/pkgconfig/loggers.go b/pkgconfig/loggers.go index d20f9d9a..e96fd3ac 100644 --- a/pkgconfig/loggers.go +++ b/pkgconfig/loggers.go @@ -13,26 +13,40 @@ type ConfigLoggers struct { ChannelBufferSize int `yaml:"chan-buffer-size"` } `yaml:"stdout"` Prometheus struct { - Enable bool `yaml:"enable"` - ListenIP string `yaml:"listen-ip"` - ListenPort int `yaml:"listen-port"` - TLSSupport bool `yaml:"tls-support"` - TLSMutual bool `yaml:"tls-mutual"` - TLSMinVersion string `yaml:"tls-min-version"` - CertFile string `yaml:"cert-file"` - KeyFile string `yaml:"key-file"` - PromPrefix string `yaml:"prometheus-prefix"` - LabelsList []string `yaml:"prometheus-labels"` - TopN int `yaml:"top-n"` - BasicAuthLogin string `yaml:"basic-auth-login"` - BasicAuthPwd string `yaml:"basic-auth-pwd"` - BasicAuthEnabled bool `yaml:"basic-auth-enable"` - ChannelBufferSize int `yaml:"chan-buffer-size"` - HistogramMetricsEnabled bool `yaml:"histogram-metrics-enabled"` - RequestersCacheTTL int `yaml:"requesters-cache-ttl"` - RequestersCacheSize int `yaml:"requesters-cache-size"` - DomainsCacheTTL int `yaml:"domains-cache-ttl"` - DomainsCacheSize int `yaml:"domains-cache-size"` + Enable bool `yaml:"enable"` + ListenIP string `yaml:"listen-ip"` + ListenPort int `yaml:"listen-port"` + TLSSupport bool `yaml:"tls-support"` + TLSMutual bool `yaml:"tls-mutual"` + TLSMinVersion string `yaml:"tls-min-version"` + CertFile string `yaml:"cert-file"` + KeyFile string `yaml:"key-file"` + PromPrefix string `yaml:"prometheus-prefix"` + LabelsList []string `yaml:"prometheus-labels"` + TopN int `yaml:"top-n"` + BasicAuthLogin string `yaml:"basic-auth-login"` + BasicAuthPwd string `yaml:"basic-auth-pwd"` + BasicAuthEnabled bool `yaml:"basic-auth-enable"` + ChannelBufferSize int `yaml:"chan-buffer-size"` + RequestersMetricsEnabled bool `yaml:"requesters-metrics-enabled"` + DomainsMetricsEnabled bool `yaml:"domains-metrics-enabled"` + NoErrorMetricsEnabled bool `yaml:"noerror-metrics-enabled"` + ServfailMetricsEnabled bool `yaml:"servfail-metrics-enabled"` + NonExistentMetricsEnabled bool `yaml:"nonexistent-metrics-enabled"` + TimeoutMetricsEnabled bool `yaml:"timeout-metrics-enabled"` + HistogramMetricsEnabled bool `yaml:"histogram-metrics-enabled"` + RequestersCacheTTL int `yaml:"requesters-cache-ttl"` + RequestersCacheSize int `yaml:"requesters-cache-size"` + DomainsCacheTTL int `yaml:"domains-cache-ttl"` + DomainsCacheSize int `yaml:"domains-cache-size"` + NoErrorDomainsCacheTTL int `yaml:"noerror-domains-cache-ttl"` + NoErrorDomainsCacheSize int `yaml:"noerror-domains-cache-size"` + ServfailDomainsCacheTTL int `yaml:"servfail-domains-cache-ttl"` + ServfailDomainsCacheSize int `yaml:"servfail-domains-cache-size"` + NXDomainsCacheTTL int `yaml:"nonexistent-domains-cache-ttl"` + NXDomainsCacheSize int `yaml:"nonexistent-domains-cache-size"` + DefaultDomainsCacheTTL int `yaml:"default-domains-cache-ttl"` + DefaultDomainsCacheSize int `yaml:"default-domains-cache-size"` } `yaml:"prometheus"` RestAPI struct { Enable bool `yaml:"enable"` @@ -331,10 +345,24 @@ func (c *ConfigLoggers) SetDefault() { c.Prometheus.BasicAuthEnabled = true c.Prometheus.ChannelBufferSize = 65535 c.Prometheus.HistogramMetricsEnabled = false + c.Prometheus.RequestersMetricsEnabled = true + c.Prometheus.DomainsMetricsEnabled = true + c.Prometheus.NoErrorMetricsEnabled = true + c.Prometheus.ServfailMetricsEnabled = true + c.Prometheus.NonExistentMetricsEnabled = true c.Prometheus.RequestersCacheTTL = 3600 c.Prometheus.RequestersCacheSize = 250000 
c.Prometheus.DomainsCacheTTL = 3600 c.Prometheus.DomainsCacheSize = 500000 + c.Prometheus.DomainsCacheTTL = 3600 + c.Prometheus.NoErrorDomainsCacheSize = 100000 + c.Prometheus.NoErrorDomainsCacheTTL = 3600 + c.Prometheus.ServfailDomainsCacheSize = 10000 + c.Prometheus.ServfailDomainsCacheTTL = 3600 + c.Prometheus.NXDomainsCacheSize = 10000 + c.Prometheus.NXDomainsCacheTTL = 3600 + c.Prometheus.DefaultDomainsCacheSize = 1000 + c.Prometheus.DefaultDomainsCacheTTL = 3600 c.RestAPI.Enable = false c.RestAPI.ListenIP = LocalhostIP From 902bbcc08435d052833d13d4e86227dd0b0b4482 Mon Sep 17 00:00:00 2001 From: dmachard <5562930+dmachard@users.noreply.github.com> Date: Wed, 3 Jan 2024 18:19:34 +0100 Subject: [PATCH 12/12] revert config to default --- config.yml | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/config.yml b/config.yml index c3cff510..3a7eb913 100644 --- a/config.yml +++ b/config.yml @@ -77,21 +77,10 @@ multiplexer: - name: console stdout: mode: text - - name: prom - prometheus: - listen-ip: 0.0.0.0 - listen-port: 8081 - basic-auth-enable: false - prometheus-labels: ["stream_id"] - requesters-cache-size: 50000 - requesters-cache-ttl: 3600 - domains-cache-size: 50000 - domains-cache-ttl: 3600 - histogram-metrics-enabled: true routes: - from: [ tap ] - to: [ console, prom ] + to: [ console ] ################################################ # list of supported collectors
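The patches above replace the unbounded per-stream counter maps with TTL-bounded LRU caches from `github.com/hashicorp/golang-lru/v2/expirable`, which is why the `*_lru` metrics are exposed as gauges of recently observed entries rather than ever-growing totals. The following standalone Go sketch is illustrative only and is not part of the patch series; the qname, cache size, and TTL are made up, but the Get/Add counting pattern mirrors the one used in the logger's Record() method.

```go
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/golang-lru/v2/expirable"
)

func main() {
	// LRU bounded to 1000 entries with a deliberately short TTL so the
	// eviction behaviour is visible; the logger uses the configured
	// *-cache-size and *-cache-ttl values instead.
	domains := expirable.NewLRU[string, int](1000, nil, 2*time.Second)

	// Count two observations of the same qname: read the current hit count
	// (zero when the key is absent) and store count+1 back into the cache.
	for i := 0; i < 2; i++ {
		hits, _ := domains.Get("www.github.com")
		domains.Add("www.github.com", hits+1)
	}

	// Len() is what gauges such as dnscollector_total_domains_lru export.
	fmt.Println("cached domains:", domains.Len()) // 1

	// Once the TTL has elapsed the entry is evicted and no longer counted.
	time.Sleep(3 * time.Second)
	_, ok := domains.Get("www.github.com")
	fmt.Println("still cached after TTL:", ok) // false
}
```

Because entries that are not refreshed within the TTL are evicted, the exported `*_lru` gauges track only the names and clients seen recently, keeping memory usage bounded even on high-cardinality streams.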