From 757b776de39bf0fc0c6d1dd74e4a245d7a99023a Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Wed, 5 Jun 2024 02:19:46 +0200 Subject: [PATCH] fix(mixins): Align loki-writes mixins with loki-reads (#13022) Signed-off-by: QuentinBisson --- .../dashboards/loki-writes.json | 303 ++---------------- .../dashboards/loki-writes.json | 297 +++++++++++++++-- .../dashboards/dashboard-utils.libsonnet | 19 ++ .../dashboards/loki-reads.libsonnet | 37 +-- .../dashboards/loki-writes.libsonnet | 90 +++++- 5 files changed, 418 insertions(+), 328 deletions(-) diff --git a/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json b/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json index 7f1a296557e8..b78ffce9f2c7 100644 --- a/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json +++ b/production/loki-mixin-compiled-ssd/dashboards/loki-writes.json @@ -211,7 +211,7 @@ "sort": "none" } }, - "span": 6, + "span": 4, "stack": true, "targets": [ { @@ -260,7 +260,7 @@ "sort": "none" } }, - "span": 6, + "span": 4, "targets": [ { "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/(loki.*|enterprise-logs)-write\", route=~\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})) * 1e3", @@ -307,19 +307,7 @@ "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Write Path", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "datasource": "$datasource", "fieldConfig": { @@ -340,12 +328,13 @@ "mode": "absolute", "steps": [ ] }, - "unit": "short" + "unit": "ms" }, "overrides": [ ] }, "id": 3, "links": [ ], + "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -355,83 +344,20 @@ "sort": "none" } }, - "span": 6, + "span": 4, "targets": [ { - "expr": "sum (rate(loki_distributor_structured_metadata_bytes_received_total{cluster=~\"$cluster\",job=~\"($namespace)/(loki.*|enterprise-logs)-write\",}[$__rate_interval])) / sum(rate(loki_distributor_bytes_received_total{cluster=~\"$cluster\",job=~\"($namespace)/(loki.*|enterprise-logs)-write\",}[$__rate_interval]))", + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(loki.*|enterprise-logs)-write\", route=~\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"}[$__rate_interval])) by (le,pod)) * 1e3", "format": "time_series", - "legendFormat": "bytes", - "legendLink": null + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "__auto", + "refId": "A", + "step": 10 } ], - "title": "Per Total Received Bytes", + "title": "Per Pod Latency (p99)", "type": "timeseries" - }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "pointSize": 5, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - } - }, - "thresholds": { - "mode": "absolute", - "steps": [ ] - }, - "unit": "short" - }, - "overrides": [ ] - }, - "id": 4, - "links": [ ], - "options": { - "legend": { - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "span": 6, - "stack": true, - "targets": [ - { - "expr": "sum by (tenant) (rate(loki_distributor_structured_metadata_bytes_received_total{cluster=~\"$cluster\",job=~\"($namespace)/(loki.*|enterprise-logs)-write\",}[$__rate_interval])) / ignoring(tenant) group_left sum(rate(loki_distributor_structured_metadata_bytes_received_total{cluster=~\"$cluster\",job=~\"($namespace)/(loki.*|enterprise-logs)-write\",}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "{{tenant}}", - "legendLink": null - } - ], - "title": "Per Tenant", - "type": "timeseries", - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": 1, - "min": null, - "show": false - } - ] } ], "repeat": null, @@ -446,180 +372,30 @@ "height": "250px", "panels": [ { - "aliasColors": { - "1xx": "#EAB839", - "2xx": "#7EB26D", - "3xx": "#6ED0E0", - "4xx": "#EF843C", - "5xx": "#E24D42", - "OK": "#7EB26D", - "cancel": "#A9A9A9", - "error": "#E24D42", - "success": "#7EB26D" - }, "datasource": "$datasource", "fieldConfig": { "defaults": { "custom": { "drawStyle": "line", - "fillOpacity": 100, - "lineWidth": 0, + "fillOpacity": 10, + "lineWidth": 1, "pointSize": 5, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", - "mode": "normal" + "mode": "none" } }, - "min": 0, "thresholds": { "mode": "absolute", "steps": [ ] }, "unit": "short" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "1xx" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "2xx" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "3xx" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "4xx" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "5xx" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "OK" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "cancel" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#A9A9A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "error" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "success" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - } - ] + "overrides": [ ] }, - "fill": 10, - "id": 5, - "linewidth": 0, + "id": 4, "links": [ ], "options": { "legend": { @@ -631,16 +407,15 @@ } }, "span": 6, - "stack": true, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/(loki.*|enterprise-logs)-write\", operation=\"WRITE\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum (rate(loki_distributor_structured_metadata_bytes_received_total{cluster=~\"$cluster\",job=~\"($namespace)/(loki.*|enterprise-logs)-write\",}[$__rate_interval])) / sum(rate(loki_distributor_bytes_received_total{cluster=~\"$cluster\",job=~\"($namespace)/(loki.*|enterprise-logs)-write\",}[$__rate_interval]))", "format": "time_series", - "legendFormat": "{{status}}", - "refId": "A" + "legendFormat": "bytes", + "legendLink": null } ], - "title": "QPS", + "title": "Per Total Received Bytes", "type": "timeseries" }, { @@ -663,13 +438,12 @@ "mode": "absolute", "steps": [ ] }, - "unit": "ms" + "unit": "short" }, "overrides": [ ] }, - "id": 6, + "id": 5, "links": [ ], - "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -680,34 +454,23 @@ } }, "span": 6, + "stack": true, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=~\"$cluster\",job=~\"($namespace)/(loki.*|enterprise-logs)-write\", operation=\"WRITE\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=~\"$cluster\",job=~\"($namespace)/(loki.*|enterprise-logs)-write\", operation=\"WRITE\"}[$__rate_interval])) by (le)) * 1e3", - "format": "time_series", - "legendFormat": "50th Percentile", - "refId": "B" - }, - { - "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_sum{cluster=~\"$cluster\",job=~\"($namespace)/(loki.*|enterprise-logs)-write\", operation=\"WRITE\"}[$__rate_interval])) * 1e3 / sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/(loki.*|enterprise-logs)-write\", operation=\"WRITE\"}[$__rate_interval]))", + "expr": "sum by (tenant) (rate(loki_distributor_structured_metadata_bytes_received_total{cluster=~\"$cluster\",job=~\"($namespace)/(loki.*|enterprise-logs)-write\",}[$__rate_interval])) / ignoring(tenant) group_left sum(rate(loki_distributor_structured_metadata_bytes_received_total{cluster=~\"$cluster\",job=~\"($namespace)/(loki.*|enterprise-logs)-write\",}[$__rate_interval]))", "format": "time_series", - "legendFormat": "Average", - "refId": "C" + "legendFormat": "{{tenant}}", + "legendLink": null } ], - "title": "Latency", + "title": "Per Tenant", "type": "timeseries", "yaxes": [ { - "format": "ms", + "format": "short", "label": null, "logBase": 1, - "max": null, + "max": 1, "min": 0, "show": true }, @@ -715,7 +478,7 @@ "format": "short", "label": null, "logBase": 1, - "max": null, + "max": 1, "min": null, "show": false } @@ -726,7 +489,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "BoltDB Shipper", + "title": "Write Path - Structured Metadata", "titleSize": "h6" } ], diff --git a/production/loki-mixin-compiled/dashboards/loki-writes.json b/production/loki-mixin-compiled/dashboards/loki-writes.json index be1b3fde7cb7..a4ff98dfbf0b 100644 --- a/production/loki-mixin-compiled/dashboards/loki-writes.json +++ b/production/loki-mixin-compiled/dashboards/loki-writes.json @@ -211,7 +211,7 @@ "sort": "none" } }, - "span": 6, + "span": 4, "stack": true, "targets": [ { @@ -260,7 +260,7 @@ "sort": "none" } }, - "span": 6, + "span": 4, "targets": [ { "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/distributor\", route=~\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"})) * 1e3", @@ -307,6 +307,57 @@ "show": false } ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 3, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/distributor\", route=~\"api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle\"}[$__rate_interval])) by (le,pod)) * 1e3", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "__auto", + "refId": "A", + "step": 10 + } + ], + "title": "Per Pod Latency (p99)", + "type": "timeseries" } ], "repeat": null, @@ -344,7 +395,7 @@ }, "overrides": [ ] }, - "id": 3, + "id": 4, "links": [ ], "options": { "legend": { @@ -391,7 +442,7 @@ }, "overrides": [ ] }, - "id": 4, + "id": 5, "links": [ ], "options": { "legend": { @@ -618,7 +669,7 @@ ] }, "fill": 10, - "id": 5, + "id": 6, "linewidth": 0, "links": [ ], "options": { @@ -630,7 +681,7 @@ "sort": "none" } }, - "span": 6, + "span": 4, "stack": true, "targets": [ { @@ -667,7 +718,7 @@ }, "overrides": [ ] }, - "id": 6, + "id": 7, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -679,7 +730,7 @@ "sort": "none" } }, - "span": 6, + "span": 4, "targets": [ { "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=\"/logproto.Pusher/Push\"})) * 1e3", @@ -726,6 +777,57 @@ "show": false } ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 8, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (le,pod)) * 1e3", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "__auto", + "refId": "A", + "step": 10 + } + ], + "title": "Per Pod Latency (p99)", + "type": "timeseries" } ], "repeat": null, @@ -912,7 +1014,7 @@ ] }, "fill": 10, - "id": 7, + "id": 9, "linewidth": 0, "links": [ ], "options": { @@ -924,7 +1026,7 @@ "sort": "none" } }, - "span": 6, + "span": 4, "stack": true, "targets": [ { @@ -961,7 +1063,7 @@ }, "overrides": [ ] }, - "id": 8, + "id": 10, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -973,7 +1075,7 @@ "sort": "none" } }, - "span": 6, + "span": 4, "targets": [ { "expr": "histogram_quantile(0.99, sum by (le) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=\"/logproto.Pusher/Push\"})) * 1e3", @@ -1020,6 +1122,57 @@ "show": false } ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 11, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (le,pod)) * 1e3", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "__auto", + "refId": "A", + "step": 10 + } + ], + "title": "Per Pod Latency (p99)", + "type": "timeseries" } ], "repeat": null, @@ -1206,7 +1359,7 @@ ] }, "fill": 10, - "id": 9, + "id": 12, "linewidth": 0, "links": [ ], "options": { @@ -1218,7 +1371,7 @@ "sort": "none" } }, - "span": 6, + "span": 4, "stack": true, "targets": [ { @@ -1255,7 +1408,7 @@ }, "overrides": [ ] }, - "id": 10, + "id": 13, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -1267,7 +1420,7 @@ "sort": "none" } }, - "span": 6, + "span": 4, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(loki_index_request_duration_seconds_bucket{cluster=~\"$cluster\",job=~\"($namespace)/ingester.*\", operation=\"index_chunk\"}[$__rate_interval])) by (le)) * 1e3", @@ -1308,6 +1461,57 @@ "show": false } ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 14, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(loki_index_request_duration_seconds_bucket{cluster=~\"$cluster\",job=~\"($namespace)/ingester.*\", operation=\"index_chunk\"}[$__rate_interval])) by (le,pod)) * 1e3", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "__auto", + "refId": "A", + "step": 10 + } + ], + "title": "Per Pod Latency (p99)", + "type": "timeseries" } ], "repeat": null, @@ -1494,7 +1698,7 @@ ] }, "fill": 10, - "id": 11, + "id": 15, "linewidth": 0, "links": [ ], "options": { @@ -1506,7 +1710,7 @@ "sort": "none" } }, - "span": 6, + "span": 4, "stack": true, "targets": [ { @@ -1543,7 +1747,7 @@ }, "overrides": [ ] }, - "id": 12, + "id": 16, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -1555,7 +1759,7 @@ "sort": "none" } }, - "span": 6, + "span": 4, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=~\"$cluster\",job=~\"($namespace)/ingester\", operation=\"WRITE\"}[$__rate_interval])) by (le)) * 1e3", @@ -1596,13 +1800,64 @@ "show": false } ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 17, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=~\"$cluster\",job=~\"($namespace)/ingester\", operation=\"WRITE\"}[$__rate_interval])) by (le,pod)) * 1e3", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "__auto", + "refId": "A", + "step": 10 + } + ], + "title": "Per Pod Latency (p99)", + "type": "timeseries" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "BoltDB Shipper", + "title": "BoltDB Index", "titleSize": "h6" } ], diff --git a/production/loki-mixin/dashboards/dashboard-utils.libsonnet b/production/loki-mixin/dashboards/dashboard-utils.libsonnet index 099d6810d2e1..577f3235eaf5 100644 --- a/production/loki-mixin/dashboards/dashboard-utils.libsonnet +++ b/production/loki-mixin/dashboards/dashboard-utils.libsonnet @@ -328,4 +328,23 @@ local utils = import 'mixin-utils/utils.libsonnet'; containerDiskSpaceUtilizationPanel(title, containerName):: $.newQueryPanel(title, 'percentunit') + $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,%s})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher(), $.containerLabelMatcher(containerName)], '{{persistentvolumeclaim}}'), + + local latencyPanelWithExtraGrouping(metricName, selector, multiplier='1e3', extra_grouping='') = { + nullPointMode: 'null as zero', + targets: [ + { + expr: 'histogram_quantile(0.99, sum(rate(%s_bucket%s[$__rate_interval])) by (le,%s)) * %s' % [metricName, selector, extra_grouping, multiplier], + format: 'time_series', + intervalFactor: 2, + refId: 'A', + step: 10, + interval: '1m', + legendFormat: '__auto', + }, + ], + }, + + p99LatencyByPod(metric, selectorStr):: + $.newQueryPanel('Per Pod Latency (p99)', 'ms') + + latencyPanelWithExtraGrouping(metric, selectorStr, '1e3', 'pod'), } diff --git a/production/loki-mixin/dashboards/loki-reads.libsonnet b/production/loki-mixin/dashboards/loki-reads.libsonnet index 6d75993d1139..ea791408bb7a 100644 --- a/production/loki-mixin/dashboards/loki-reads.libsonnet +++ b/production/loki-mixin/dashboards/loki-reads.libsonnet @@ -56,25 +56,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; ] ), - local latencyPanelWithExtraGrouping(metricName, selector, multiplier='1e3', extra_grouping='') = { - nullPointMode: 'null as zero', - targets: [ - { - expr: 'histogram_quantile(0.99, sum(rate(%s_bucket%s[$__rate_interval])) by (le,%s)) * %s' % [metricName, selector, extra_grouping, multiplier], - format: 'time_series', - intervalFactor: 2, - refId: 'A', - step: 10, - interval: '1m', - legendFormat: '__auto', - }, - ], - }, - - local p99LatencyByPod(metric, selectorStr) = - $.newQueryPanel('Per Pod Latency (p99)', 'ms') + - latencyPanelWithExtraGrouping(metric, selectorStr, '1e3', 'pod'), - 'loki-reads.json': { local cfg = self, @@ -145,7 +126,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - p99LatencyByPod( + $.p99LatencyByPod( 'loki_request_duration_seconds', $.toPrometheusSelector( dashboards['loki-reads.json'].clusterMatchers + dashboards['loki-reads.json'].matchers.cortexgateway + [utils.selector.re('route', http_routes)] @@ -168,7 +149,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - p99LatencyByPod( + $.p99LatencyByPod( 'loki_request_duration_seconds', $.toPrometheusSelector( dashboards['loki-reads.json'].clusterMatchers + @@ -194,7 +175,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - p99LatencyByPod( + $.p99LatencyByPod( 'loki_request_duration_seconds', $.toPrometheusSelector( dashboards['loki-reads.json'].clusterMatchers + @@ -220,7 +201,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - p99LatencyByPod( + $.p99LatencyByPod( 'loki_request_duration_seconds', $.toPrometheusSelector( dashboards['loki-reads.json'].clusterMatchers + @@ -247,7 +228,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - p99LatencyByPod( + $.p99LatencyByPod( 'loki_request_duration_seconds', $.toPrometheusSelector( dashboards['loki-reads.json'].clusterMatchers + @@ -273,7 +254,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - p99LatencyByPod( + $.p99LatencyByPod( 'loki_request_duration_seconds', $.toPrometheusSelector( dashboards['loki-reads.json'].clusterMatchers + @@ -299,7 +280,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - p99LatencyByPod( + $.p99LatencyByPod( 'loki_request_duration_seconds', $.toPrometheusSelector( dashboards['loki-reads.json'].clusterMatchers + @@ -336,7 +317,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.latencyPanel('loki_index_request_duration_seconds', '{%s operation!="index_chunk"}' % dashboards['loki-reads.json'].querierSelector) ) .addPanel( - p99LatencyByPod( + $.p99LatencyByPod( 'loki_index_request_duration_seconds', '{%s operation!="index_chunk"}' % dashboards['loki-reads.json'].querierSelector ) @@ -354,7 +335,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.latencyPanel('loki_boltdb_shipper_request_duration_seconds', '{%s operation="Shipper.Query"}' % dashboards['loki-reads.json'].querierOrIndexGatewaySelector) ) .addPanel( - p99LatencyByPod( + $.p99LatencyByPod( 'loki_boltdb_shipper_request_duration_seconds', '{%s operation="Shipper.Query"}' % dashboards['loki-reads.json'].querierOrIndexGatewaySelector ) diff --git a/production/loki-mixin/dashboards/loki-writes.libsonnet b/production/loki-mixin/dashboards/loki-writes.libsonnet index 8cde24657090..dd704941b0ac 100644 --- a/production/loki-mixin/dashboards/loki-writes.libsonnet +++ b/production/loki-mixin/dashboards/loki-writes.libsonnet @@ -58,7 +58,19 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.newQueryPanel('Latency', 'ms') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.cortexgateway + [utils.selector.re('route', 'api_prom_push|loki_api_v1_push')], + dashboards['loki-writes.json'].clusterMatchers + + dashboards['loki-writes.json'].matchers.cortexgateway + + [utils.selector.re('route', 'api_prom_push|loki_api_v1_push')], + ) + ) + .addPanel( + $.p99LatencyByPod( + 'loki_request_duration_seconds', + $.toPrometheusSelector( + dashboards['loki-writes.json'].clusterMatchers + + dashboards['loki-writes.json'].matchers.cortexgateway + + [utils.selector.re('route', 'api_prom_push|loki_api_v1_push')], + ), ) ) ) @@ -72,13 +84,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.newQueryPanel('Latency', 'ms') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.distributor + [utils.selector.re('route', 'api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle')], + dashboards['loki-writes.json'].clusterMatchers + + dashboards['loki-writes.json'].matchers.distributor + + [utils.selector.re('route', 'api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle')], + ) + ) + .addPanel( + $.p99LatencyByPod( + 'loki_request_duration_seconds', + $.toPrometheusSelector( + dashboards['loki-writes.json'].clusterMatchers + + dashboards['loki-writes.json'].matchers.distributor + + [utils.selector.re('route', 'api_prom_push|loki_api_v1_push|/httpgrpc.HTTP/Handle')], + ), ) ) ) .addRowIf( $._config.tsdb, - $.row(if $._config.ssd.enabled then 'Write Path' else 'Distributor - Structured Metadata') + $.row((if $._config.ssd.enabled then 'Write Path' else 'Distributor') + ' - Structured Metadata') .addPanel( $.newQueryPanel('Per Total Received Bytes') + $.queryPanel('sum (rate(loki_distributor_structured_metadata_bytes_received_total{%s}[$__rate_interval])) / sum(rate(loki_distributor_bytes_received_total{%s}[$__rate_interval]))' % [dashboards['loki-writes.json'].distributorSelector, dashboards['loki-writes.json'].distributorSelector], 'bytes') @@ -105,7 +129,19 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.newQueryPanel('Latency', 'ms') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.ingester_zone + [utils.selector.eq('route', '/logproto.Pusher/Push')], + dashboards['loki-writes.json'].clusterMatchers + + dashboards['loki-writes.json'].matchers.ingester_zone + + [utils.selector.eq('route', '/logproto.Pusher/Push')], + ) + ) + .addPanel( + $.p99LatencyByPod( + 'loki_request_duration_seconds', + $.toPrometheusSelector( + dashboards['loki-writes.json'].clusterMatchers + + dashboards['loki-writes.json'].matchers.ingester_zone + + [utils.selector.eq('route', '/logproto.Pusher/Push')], + ), ) ) ) @@ -114,14 +150,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Ingester') .addPanel( $.newQueryPanel('QPS') + - $.newQpsPanel('loki_request_duration_seconds_count{%s route="/logproto.Pusher/Push"}' % dashboards['loki-writes.json'].ingesterSelector) + $.newQpsPanel('loki_request_duration_seconds_count{%s route="/logproto.Pusher/Push"}' % dashboards['loki-writes.json'].ingesterSelector) ) .addPanel( $.newQueryPanel('Latency', 'ms') + utils.latencyRecordingRulePanel( 'loki_request_duration_seconds', - dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.ingester + [utils.selector.eq('route', '/logproto.Pusher/Push')], + dashboards['loki-writes.json'].clusterMatchers + + dashboards['loki-writes.json'].matchers.ingester + + [utils.selector.eq('route', '/logproto.Pusher/Push')], + ) + ) + .addPanel( + $.p99LatencyByPod( + 'loki_request_duration_seconds', + $.toPrometheusSelector( + dashboards['loki-writes.json'].clusterMatchers + + dashboards['loki-writes.json'].matchers.ingester + + [utils.selector.eq('route', '/logproto.Pusher/Push')] + ), ) ) ) @@ -136,6 +183,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.newQueryPanel('Latency', 'ms') + $.latencyPanel('loki_index_request_duration_seconds', '{%s operation="index_chunk"}' % dashboards['loki-writes.json'].anyIngester) ) + .addPanel( + $.p99LatencyByPod( + 'loki_index_request_duration_seconds', + '{%s operation="index_chunk"}' % dashboards['loki-writes.json'].anyIngester, + ) + ) ) .addRowIf( showBigTable, @@ -148,12 +201,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.newQueryPanel('Latency', 'ms') + utils.latencyRecordingRulePanel( 'loki_bigtable_request_duration_seconds', - dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].clusterMatchers + dashboards['loki-writes.json'].matchers.ingester + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')] + dashboards['loki-writes.json'].clusterMatchers + + dashboards['loki-writes.json'].matchers.ingester + + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')], + ) + ) + .addPanel( + $.p99LatencyByPod( + 'loki_bigtable_request_duration_seconds', + $.toPrometheusSelector( + dashboards['loki-writes.json'].clusterMatchers + + dashboards['loki-writes.json'].matchers.ingester + + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')], + ), ) ) ) - .addRow( - $.row('BoltDB Shipper') + .addRowIf( + !$._config.ssd.enabled, + $.row('BoltDB Index') .addPanel( $.newQueryPanel('QPS') + $.newQpsPanel('loki_boltdb_shipper_request_duration_seconds_count{%s operation="WRITE"}' % dashboards['loki-writes.json'].ingesterSelector) @@ -162,6 +228,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.newQueryPanel('Latency', 'ms') + $.latencyPanel('loki_boltdb_shipper_request_duration_seconds', '{%s operation="WRITE"}' % dashboards['loki-writes.json'].ingesterSelector) ) + .addPanel( + $.p99LatencyByPod( + 'loki_boltdb_shipper_request_duration_seconds', + '{%s operation="WRITE"}' % dashboards['loki-writes.json'].ingesterSelector, + ) + ) ), }, }