From b0dbb790a1768ead663e33f961f2bbf4fba671b7 Mon Sep 17 00:00:00 2001 From: Denis Machard <5562930+dmachard@users.noreply.github.com> Date: Tue, 11 Jun 2024 22:12:16 +0200 Subject: [PATCH] add global telemetry (#724) * add telemetry * add test * add dashboard, * update config * update workers * update docs * fix usecase24 * Update README.md --- .github/workflows/testing-go.yml | 2 +- Dockerfile | 2 +- Makefile | 1 + README.md | 15 +- config.yml | 16 +- dnscollector.go | 30 +- docker-compose.yml | 1 + docs/_examples/use-case-24.yml | 4 +- docs/configuration.md | 26 +- docs/dashboards/grafana_exporter.json | 954 ++++++++++++++++++++++++++ docs/loggers/logger_devnull.md | 11 + docs/workers.md | 1 + pkgconfig/global.go | 16 + pkginit/pipelines.go | 37 +- pkginit/pipelines_test.go | 7 +- telemetry/prometheus.go | 252 +++++++ telemetry/prometheus_test.go | 50 ++ workers/clickhouse.go | 7 +- workers/devnull.go | 34 +- workers/dnsmessage.go | 11 +- workers/dnsprocessor.go | 9 +- workers/dnstapclient.go | 7 +- workers/dnstapserver.go | 13 +- workers/elasticsearch.go | 7 +- workers/falco.go | 7 +- workers/file_tail.go | 7 +- workers/fluentd.go | 7 +- workers/influxdb.go | 7 +- workers/kafkaproducer.go | 7 +- workers/logfile.go | 7 +- workers/lokiclient.go | 7 +- workers/powerdns.go | 9 +- workers/prometheus.go | 22 +- workers/redispub.go | 7 +- workers/restapi.go | 7 +- workers/scalyr.go | 7 +- workers/statsd.go | 7 +- workers/stdout.go | 8 +- workers/syslog.go | 7 +- workers/tcpclient.go | 7 +- workers/worker.go | 111 ++- 41 files changed, 1658 insertions(+), 96 deletions(-) create mode 100644 docs/dashboards/grafana_exporter.json create mode 100644 docs/loggers/logger_devnull.md create mode 100644 telemetry/prometheus.go create mode 100644 telemetry/prometheus_test.go diff --git a/.github/workflows/testing-go.yml b/.github/workflows/testing-go.yml index b3134c25..c9331893 100644 --- a/.github/workflows/testing-go.yml +++ b/.github/workflows/testing-go.yml @@ -141,7 +141,7 @@ jobs: - id: count_tests run: | - data=$(sudo go test -timeout 360s -v ./workers ./dnsutils ./transformers ./pkgconfig ./pkginit ././ 2>&1 | grep -c RUN) + data=$(sudo go test -timeout 360s -v ./workers ./dnsutils ./transformers ./pkgconfig ./pkginit ./telemetry ././ 2>&1 | grep -c RUN) echo "Count of Tests: $data" echo "data=$data" >> $GITHUB_OUTPUT diff --git a/Dockerfile b/Dockerfile index 81936f34..71562971 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,7 +20,7 @@ USER dnscollector COPY --from=builder /build/go-dnscollector /bin/go-dnscollector COPY --from=builder /build/docker-config.yml ./etc/dnscollector/config.yml -EXPOSE 6000/tcp 8080/tcp +EXPOSE 6000/tcp 8080/tcp 9165/tcp ENTRYPOINT ["/bin/go-dnscollector"] diff --git a/Makefile b/Makefile index 771d25e2..1ffafb69 100644 --- a/Makefile +++ b/Makefile @@ -74,6 +74,7 @@ tests: check-go @go test ./pkgconfig/ -race -cover -v @go test ./pkginit/ -race -cover -v @go test ./netutils/ -race -cover -v + @go test ./telemetry/ -race -cover -v @go test -timeout 90s ./transformers/ -race -cover -v @go test -timeout 180s ./workers/ -race -cover -v diff --git a/README.md b/README.md index 5fa8ca02..e1d74bb0 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@
- + - +
@@ -115,6 +115,17 @@ The [`_integration`](./docs/_integration) folder contains DNS-collector `configu - [Elasticsearch](./docs/_integration/elasticsearch/README.md) - [Kafka](./docs/_integration/kafka/README.md) +## Telemetry + +Performance metrics are available to evaluate the efficiency of your pipelines. These metrics allow you to track: +- The number of incoming and outgoing packets processed by each worker +- The number of packets matching the policies applied (forwarded, dropped) +- The number of "discarded" packets +- Memory consumption +- CPU consumption + +A [build-in](./docs/dashboards/grafana_exporter.json) dashboard is available for monitoring these metrics. + ## Performance Tuning may be necessary to deal with a large traffic loads. diff --git a/config.yml b/config.yml index f721ef5c..3f0b9403 100644 --- a/config.yml +++ b/config.yml @@ -10,6 +10,20 @@ global: text-format-delimiter: " " text-format-boundary: "\"" pid-file: "" + worker: + interval-monitor: 10 + telemetry: + enabled: true + web-path: "/metrics" + web-listen: ":9165" + prometheus-prefix: "dnscollector_exporter" + tls-support: false + tls-cert-file: "" + tls-key-file: "" + client-ca-file: "" + basic-auth-enable: false + basic-auth-login: admin + basic-auth-pwd: changeme ################################################ # Pipelining configuration @@ -27,7 +41,7 @@ pipelines: qname-lowercase: true routing-policy: forward: [ console ] - dropped: [] + dropped: [ ] - name: console stdout: diff --git a/dnscollector.go b/dnscollector.go index dbb0e4ea..40691467 100644 --- a/dnscollector.go +++ b/dnscollector.go @@ -1,6 +1,7 @@ package main import ( + "context" "fmt" "os" "os/signal" @@ -12,6 +13,7 @@ import ( "github.com/dmachard/go-dnscollector/pkgconfig" "github.com/dmachard/go-dnscollector/pkginit" + "github.com/dmachard/go-dnscollector/telemetry" "github.com/dmachard/go-dnscollector/workers" "github.com/dmachard/go-logger" "github.com/natefinch/lumberjack" @@ -150,6 +152,12 @@ func main() { InitLogger(logger, config) logger.Info("main - version=%s revision=%s", version.Version, version.Revision) + // // telemetry + if config.Global.Telemetry.Enabled { + logger.Info("main - telemetry enabled on local address: %s", config.Global.Telemetry.WebListen) + } + promServer, metrics, errTelemetry := telemetry.InitTelemetryServer(config, logger) + // init active collectors and loggers mapLoggers := make(map[string]workers.Worker) mapCollectors := make(map[string]workers.Worker) @@ -164,8 +172,8 @@ func main() { // or pipeline ? if pkginit.IsPipelinesEnabled(config) { - logger.Info("main - running in pipelines mode") - err := pkginit.InitPipelines(mapLoggers, mapCollectors, config, logger) + logger.Info("main - running in pipeline mode") + err := pkginit.InitPipelines(mapLoggers, mapCollectors, config, logger, metrics) if err != nil { logger.Error("main - %s", err.Error()) removePIDFile(config) @@ -183,6 +191,11 @@ func main() { go func() { for { select { + case err := <-errTelemetry: + logger.Error("main - unable to start telemetry: %v", err) + removePIDFile(config) + os.Exit(1) + case <-sigHUP: logger.Warning("main - SIGHUP received") @@ -205,6 +218,19 @@ func main() { case <-sigTerm: logger.Warning("main - exiting...") + + // gracefully shutdown the HTTP server + if config.Global.Telemetry.Enabled { + logger.Info("main - telemetry is stopping") + metrics.Stop() + + if err := promServer.Shutdown(context.Background()); err != nil { + logger.Error("main - telemetry error shutting down http server - %s", err.Error()) + } + + } + + // and stop all workers for _, c := range mapCollectors { c.Stop() } diff --git a/docker-compose.yml b/docker-compose.yml index cd2784b7..e388b967 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,4 +12,5 @@ services: ports: - "6000:6000/tcp" - "8080:8080/tcp" + - "9165:9165/tcp" restart: always \ No newline at end of file diff --git a/docs/_examples/use-case-24.yml b/docs/_examples/use-case-24.yml index 6bfdd1df..dca37544 100644 --- a/docs/_examples/use-case-24.yml +++ b/docs/_examples/use-case-24.yml @@ -31,7 +31,7 @@ pipelines: dns.qtype: "TXT" transforms: atags: - tags: [ "TAG:TXT-QUERIES" ] + add-tags: [ "TAG:TXT-QUERIES" ] routing-policy: forward: [ apple-txt, all-txt ] @@ -51,7 +51,7 @@ pipelines: dns.qname: "^*.apple.com$" transforms: atags: - tags: [ "TXT:apple" ] + add-tags: [ "TXT:apple" ] routing-policy: forward: [ outputfile-apple ] diff --git a/docs/configuration.md b/docs/configuration.md index 108cd408..748eb027 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -13,6 +13,7 @@ You can find the global settings below - [Custom text format](#custom-text-format) - [Server identity](#server-identity) - [Pid file](#pid-file) + - [Telemetry](#telemetry) ### Trace @@ -127,9 +128,30 @@ Output example: ### Pid file -Set path to create pid file. +Set path to create DNS-collector PID. +By default, this settings is empty. ```yaml global: pid-file: "/path/to/your/pidfile.pid" -``` \ No newline at end of file +``` + +### Telemetry + +Enable and configure telemetry + +```yaml +global: + telemetry: + enabled: true + web-path: "/metrics" + web-listen: ":9165" + prometheus-prefix: "dnscollector_exporter" + tls-support: false + tls-cert-file: "" + tls-key-file: "" + client-ca-file: "" + basic-auth-enable: false + basic-auth-login: admin + basic-auth-pwd: changeme +``` diff --git a/docs/dashboards/grafana_exporter.json b/docs/dashboards/grafana_exporter.json new file mode 100644 index 00000000..e9000c2e --- /dev/null +++ b/docs/dashboards/grafana_exporter.json @@ -0,0 +1,954 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "11.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 9, + "panels": [], + "title": "Go", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "go_goroutines{job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "dnscollector", + "range": true, + "refId": "A" + } + ], + "title": "Goroutines", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 1 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "go_memstats_sys_bytes{job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "dnscollector", + "range": true, + "refId": "A" + } + ], + "title": "Total Used Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 1 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(process_cpu_seconds_total{job=\"$job\"}[2m])", + "legendFormat": "dnscollector", + "range": true, + "refId": "A" + } + ], + "title": "Process cpu", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 7, + "panels": [], + "title": "Workers", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "tap" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 11 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "increase(dnscollector_exporter_worker_ingress_traffic_total{job=~\"$job\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{worker}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Worker - Ingress traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 11 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "increase(dnscollector_exporter_worker_egress_traffic_total{job=~\"$job\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{worker}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Worker - Egress traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 11 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "increase(dnscollector_exporter_worker_discarded_traffic_total{job=~\"$job\"}[$__rate_interval])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{worker}}", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Worker - Discarded", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 8, + "panels": [], + "title": "Policies", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 20 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "increase(dnscollector_exporter_policy_forwarded_total{job=~\"$job\"}[$__rate_interval])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{worker}}", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Policy - Forwarded", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 20 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "increase(dnscollector_exporter_policy_dropped_total{job=~\"$job\"}[$__rate_interval])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "{{worker}}", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Policy - Dropped", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "dnscollector-exporter", + "value": "dnscollector-exporter" + }, + "description": "", + "hide": 0, + "label": "job", + "name": "job", + "options": [ + { + "selected": true, + "text": "dnscollector-exporter", + "value": "dnscollector-exporter" + } + ], + "query": "dnscollector-exporter", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timeRangeUpdatedDuringEditOrView": false, + "timepicker": {}, + "timezone": "browser", + "title": "DNScollector - Exporter", + "uid": "bdo8oaa6fq7lse", + "version": 11, + "weekStart": "" +} \ No newline at end of file diff --git a/docs/loggers/logger_devnull.md b/docs/loggers/logger_devnull.md new file mode 100644 index 00000000..d2fb060d --- /dev/null +++ b/docs/loggers/logger_devnull.md @@ -0,0 +1,11 @@ +# Logger: DevNull + +Devnull plugin Logger + +Options: + +Default values: + +```yaml +devnull: +``` diff --git a/docs/workers.md b/docs/workers.md index 4e1af7fe..534342d5 100644 --- a/docs/workers.md +++ b/docs/workers.md @@ -28,3 +28,4 @@ A worker can act as a collector or a logger. | [Kafka Producer](loggers/logger_kafka.md) | Logger | Kafka DNS producer | | [Falco](loggers/logger_falco.md) | Logger | Falco plugin logger | | [ClickHouse](loggers/logger_clickhouse.md) | Logger | ClickHouse logger | +| [DevNull](loggers/logger_devnull.md) | Logger | For testing purpose | diff --git a/pkgconfig/global.go b/pkgconfig/global.go index 647596e2..f316bdb9 100644 --- a/pkgconfig/global.go +++ b/pkgconfig/global.go @@ -19,6 +19,22 @@ type ConfigGlobal struct { } `yaml:"trace"` ServerIdentity string `yaml:"server-identity" default:""` PidFile string `yaml:"pid-file" default:""` + Worker struct { + InternalMonitor int `yaml:"interval-monitor" default:"10"` + } `yaml:"worker"` + Telemetry struct { + Enabled bool `yaml:"enabled" default:"true"` + WebPath string `yaml:"web-path" default:"/metrics"` + WebListen string `yaml:"web-listen" default:":9165"` + PromPrefix string `yaml:"prometheus-prefix" default:"dnscollector_exporter"` + TLSSupport bool `yaml:"tls-support" default:"false"` + TLSCertFile string `yaml:"tls-cert-file" default:""` + TLSKeyFile string `yaml:"tls-key-file" default:""` + ClientCAFile string `yaml:"client-ca-file" default:""` + BasicAuthEnable bool `yaml:"basic-auth-enable" default:"false"` + BasicAuthLogin string `yaml:"basic-auth-login" default:"admin"` + BasicAuthPwd string `yaml:"basic-auth-pwd" default:"changeme"` + } `yaml:"telemetry"` } func (c *ConfigGlobal) SetDefault() { diff --git a/pkginit/pipelines.go b/pkginit/pipelines.go index cc7605aa..50e97e17 100644 --- a/pkginit/pipelines.go +++ b/pkginit/pipelines.go @@ -4,6 +4,7 @@ import ( "fmt" "github.com/dmachard/go-dnscollector/pkgconfig" + "github.com/dmachard/go-dnscollector/telemetry" "github.com/dmachard/go-dnscollector/workers" "github.com/dmachard/go-logger" "github.com/pkg/errors" @@ -124,91 +125,121 @@ func CreateRouting(stanza pkgconfig.ConfigPipelines, mapCollectors map[string]wo return nil } -func CreateStanza(stanzaName string, config *pkgconfig.Config, mapCollectors map[string]workers.Worker, mapLoggers map[string]workers.Worker, logger *logger.Logger) { +func CreateStanza(stanzaName string, config *pkgconfig.Config, mapCollectors map[string]workers.Worker, mapLoggers map[string]workers.Worker, logger *logger.Logger, metrics *telemetry.PrometheusCollector) { // register the logger if enabled if config.Loggers.RestAPI.Enable { mapLoggers[stanzaName] = workers.NewRestAPI(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.Prometheus.Enable { mapLoggers[stanzaName] = workers.NewPrometheus(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.Stdout.Enable { mapLoggers[stanzaName] = workers.NewStdOut(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.LogFile.Enable { mapLoggers[stanzaName] = workers.NewLogFile(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.DNSTap.Enable { mapLoggers[stanzaName] = workers.NewDnstapSender(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.TCPClient.Enable { mapLoggers[stanzaName] = workers.NewTCPClient(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.Syslog.Enable { mapLoggers[stanzaName] = workers.NewSyslog(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.Fluentd.Enable { mapLoggers[stanzaName] = workers.NewFluentdClient(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.InfluxDB.Enable { mapLoggers[stanzaName] = workers.NewInfluxDBClient(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.LokiClient.Enable { mapLoggers[stanzaName] = workers.NewLokiClient(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.Statsd.Enable { mapLoggers[stanzaName] = workers.NewStatsdClient(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.ElasticSearchClient.Enable { mapLoggers[stanzaName] = workers.NewElasticSearchClient(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.ScalyrClient.Enable { mapLoggers[stanzaName] = workers.NewScalyrClient(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.RedisPub.Enable { mapLoggers[stanzaName] = workers.NewRedisPub(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.KafkaProducer.Enable { mapLoggers[stanzaName] = workers.NewKafkaProducer(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.FalcoClient.Enable { mapLoggers[stanzaName] = workers.NewFalcoClient(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } if config.Loggers.ClickhouseClient.Enable { mapLoggers[stanzaName] = workers.NewClickhouseClient(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) + } + if config.Loggers.DevNull.Enable { + mapLoggers[stanzaName] = workers.NewDevNull(config, logger, stanzaName) + mapLoggers[stanzaName].SetMetrics(metrics) } // register the collector if enabled if config.Collectors.DNSMessage.Enable { mapCollectors[stanzaName] = workers.NewDNSMessage(nil, config, logger, stanzaName) + mapCollectors[stanzaName].SetMetrics(metrics) } if config.Collectors.Dnstap.Enable { mapCollectors[stanzaName] = workers.NewDnstapServer(nil, config, logger, stanzaName) + mapCollectors[stanzaName].SetMetrics(metrics) } if config.Collectors.DnstapProxifier.Enable { mapCollectors[stanzaName] = workers.NewDnstapProxifier(nil, config, logger, stanzaName) + mapCollectors[stanzaName].SetMetrics(metrics) } if config.Collectors.AfpacketLiveCapture.Enable { mapCollectors[stanzaName] = workers.NewAfpacketSniffer(nil, config, logger, stanzaName) + mapCollectors[stanzaName].SetMetrics(metrics) } if config.Collectors.XdpLiveCapture.Enable { mapCollectors[stanzaName] = workers.NewXDPSniffer(nil, config, logger, stanzaName) + mapCollectors[stanzaName].SetMetrics(metrics) } if config.Collectors.Tail.Enable { mapCollectors[stanzaName] = workers.NewTail(nil, config, logger, stanzaName) + mapCollectors[stanzaName].SetMetrics(metrics) } if config.Collectors.PowerDNS.Enable { mapCollectors[stanzaName] = workers.NewPdnsServer(nil, config, logger, stanzaName) + mapCollectors[stanzaName].SetMetrics(metrics) } if config.Collectors.FileIngestor.Enable { mapCollectors[stanzaName] = workers.NewFileIngestor(nil, config, logger, stanzaName) + mapCollectors[stanzaName].SetMetrics(metrics) } if config.Collectors.Tzsp.Enable { mapCollectors[stanzaName] = workers.NewTZSP(nil, config, logger, stanzaName) + mapCollectors[stanzaName].SetMetrics(metrics) } } -func InitPipelines(mapLoggers map[string]workers.Worker, mapCollectors map[string]workers.Worker, config *pkgconfig.Config, logger *logger.Logger) error { +func InitPipelines(mapLoggers map[string]workers.Worker, mapCollectors map[string]workers.Worker, config *pkgconfig.Config, logger *logger.Logger, telemetry *telemetry.PrometheusCollector) error { // check if the name of each stanza is uniq routesDefined := false for _, stanza := range config.Pipelines { @@ -241,7 +272,7 @@ func InitPipelines(mapLoggers map[string]workers.Worker, mapCollectors map[strin // read each stanza and init for _, stanza := range config.Pipelines { stanzaConfig := GetStanzaConfig(config, stanza) - CreateStanza(stanza.Name, stanzaConfig, mapCollectors, mapLoggers, logger) + CreateStanza(stanza.Name, stanzaConfig, mapCollectors, mapLoggers, logger, telemetry) } diff --git a/pkginit/pipelines_test.go b/pkginit/pipelines_test.go index 4615ac6b..36c8d975 100644 --- a/pkginit/pipelines_test.go +++ b/pkginit/pipelines_test.go @@ -5,6 +5,7 @@ import ( "testing" "github.com/dmachard/go-dnscollector/pkgconfig" + "github.com/dmachard/go-dnscollector/telemetry" "github.com/dmachard/go-dnscollector/workers" "github.com/dmachard/go-logger" ) @@ -76,7 +77,8 @@ func TestPipelines_NoRoutesDefined(t *testing.T) { mapLoggers := make(map[string]workers.Worker) mapCollectors := make(map[string]workers.Worker) - err := InitPipelines(mapLoggers, mapCollectors, config, logger.New(false)) + metrics := telemetry.NewPrometheusCollector(config) + err := InitPipelines(mapLoggers, mapCollectors, config, logger.New(false), metrics) if err == nil { t.Errorf("Want err, got nil") } else if err.Error() != "no routes are defined" { @@ -100,7 +102,8 @@ func TestPipelines_RoutingLoop(t *testing.T) { mapLoggers := make(map[string]workers.Worker) mapCollectors := make(map[string]workers.Worker) - err := InitPipelines(mapLoggers, mapCollectors, config, logger.New(false)) + metrics := telemetry.NewPrometheusCollector(config) + err := InitPipelines(mapLoggers, mapCollectors, config, logger.New(false), metrics) if err == nil { t.Errorf("Want err, got nil") } else if !strings.Contains(err.Error(), "routing error loop") { diff --git a/telemetry/prometheus.go b/telemetry/prometheus.go new file mode 100644 index 00000000..79945c82 --- /dev/null +++ b/telemetry/prometheus.go @@ -0,0 +1,252 @@ +package telemetry + +import ( + "crypto/tls" + "crypto/x509" + "fmt" + "net/http" + "os" + "regexp" + "sync" + "time" + + "github.com/dmachard/go-dnscollector/pkgconfig" + "github.com/dmachard/go-logger" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/prometheus/common/version" +) + +/* +OpenMetrics and the Prometheus exposition format require the metric name +to consist only of alphanumericals and "_", ":" and they must not start +with digits. +*/ +var metricNameRegex = regexp.MustCompile(`_*[^0-9A-Za-z_]+_*`) + +func SanitizeMetricName(metricName string) string { + return metricNameRegex.ReplaceAllString(metricName, "_") +} + +type WorkerStats struct { + Name string + TotalIngress int + TotalEgress int + TotalForwardedPolicy int + TotalDroppedPolicy int + TotalDiscarded int +} + +type PrometheusCollector struct { + sync.Mutex + config *pkgconfig.Config + metrics map[string]*prometheus.Desc + Record chan WorkerStats + data map[string]WorkerStats // To store the worker stats + stop chan struct{} // Channel to signal stopping + promPrefix string +} + +func NewPrometheusCollector(config *pkgconfig.Config) *PrometheusCollector { + t := &PrometheusCollector{ + config: config, + Record: make(chan WorkerStats), + data: make(map[string]WorkerStats), + stop: make(chan struct{}), + } + + t.promPrefix = SanitizeMetricName(config.Global.Telemetry.PromPrefix) + + t.metrics = map[string]*prometheus.Desc{ + "worker_ingress_total": prometheus.NewDesc( + fmt.Sprintf("%s_worker_ingress_traffic_total", t.promPrefix), + "Ingress traffic associated to each worker", []string{"worker"}, nil), + "worker_egress_total": prometheus.NewDesc( + fmt.Sprintf("%s_worker_egress_traffic_total", t.promPrefix), + "Egress traffic associated to each worker", []string{"worker"}, nil), + "worker_discarded_total": prometheus.NewDesc( + fmt.Sprintf("%s_worker_discarded_traffic_total", t.promPrefix), + "Discarded traffic associated to each worker", []string{"worker"}, nil), + "policy_forwarded_total": prometheus.NewDesc( + fmt.Sprintf("%s_policy_forwarded_total", t.promPrefix), + "Total number of forwarded policy", []string{"worker"}, nil), + "policy_dropped_total": prometheus.NewDesc( + fmt.Sprintf("%s_policy_dropped_total", t.promPrefix), + "Total number of dropped policy", []string{"worker"}, nil), + } + return t +} + +func (t *PrometheusCollector) UpdateStats() { + for { + select { + case ws := <-t.Record: + t.Lock() + if _, ok := t.data[ws.Name]; !ok { + t.data[ws.Name] = ws + } else { + updatedWs := t.data[ws.Name] + updatedWs.TotalForwardedPolicy += ws.TotalForwardedPolicy + updatedWs.TotalDroppedPolicy += ws.TotalDroppedPolicy + updatedWs.TotalIngress += ws.TotalIngress + updatedWs.TotalEgress += ws.TotalEgress + updatedWs.TotalDiscarded += ws.TotalDiscarded + t.data[ws.Name] = updatedWs + } + t.Unlock() + case <-t.stop: + // Received stop signal, exit the goroutine + return + } + } +} +func (t *PrometheusCollector) Collect(ch chan<- prometheus.Metric) { + t.Lock() + defer t.Unlock() + + // Collect the forwarded and dropped metrics for each worker + for _, ws := range t.data { + ch <- prometheus.MustNewConstMetric( + t.metrics["worker_discarded_total"], + prometheus.CounterValue, + float64(ws.TotalDiscarded), + ws.Name, + ) + ch <- prometheus.MustNewConstMetric( + t.metrics["worker_ingress_total"], + prometheus.CounterValue, + float64(ws.TotalIngress), + ws.Name, + ) + ch <- prometheus.MustNewConstMetric( + t.metrics["worker_egress_total"], + prometheus.CounterValue, + float64(ws.TotalEgress), + ws.Name, + ) + ch <- prometheus.MustNewConstMetric( + t.metrics["policy_forwarded_total"], + prometheus.CounterValue, + float64(ws.TotalForwardedPolicy), + ws.Name, + ) + ch <- prometheus.MustNewConstMetric( + t.metrics["policy_dropped_total"], + prometheus.CounterValue, + float64(ws.TotalDroppedPolicy), + ws.Name, + ) + } +} + +func (t *PrometheusCollector) Describe(ch chan<- *prometheus.Desc) { + for _, m := range t.metrics { + ch <- m + } +} + +func (t *PrometheusCollector) Stop() { + close(t.stop) // Signal the stop channel to stop the goroutine +} + +func InitTelemetryServer(config *pkgconfig.Config, logger *logger.Logger) (*http.Server, *PrometheusCollector, chan error) { + // channel for error + errChan := make(chan error) + + // Prometheus collectors + metrics := NewPrometheusCollector(config) + + // HTTP server + promServer := &http.Server{ + Addr: config.Global.Telemetry.WebListen, + ReadHeaderTimeout: 5 * time.Second, + } + + if config.Global.Telemetry.Enabled { + go func() { + // start metrics + go metrics.UpdateStats() + + // register metrics + prometheus.MustRegister(metrics) + prometheus.MustRegister(version.NewCollector(config.Global.Telemetry.PromPrefix)) + + // handle /metrics + http.Handle(config.Global.Telemetry.WebPath, promhttp.Handler()) + + // handle http error + http.HandleFunc("/", func(w http.ResponseWriter, _ *http.Request) { + _, err := w.Write([]byte(` +