diff --git a/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-chunks.json b/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-chunks.json
index b1304ffa..59dd5286 100644
--- a/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-chunks.json
+++ b/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-chunks.json
@@ -416,7 +416,7 @@
       "span": 6,
       "targets": [
         {
-          "expr": "sum(rate(loki_chunk_store_index_entries_per_chunk_sum{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m])) / sum(rate(loki_chunk_store_index_entries_per_chunk_count{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m]))",
+          "expr": "sum(rate(loki_chunk_store_index_entries_per_chunk_sum{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) / sum(rate(loki_chunk_store_index_entries_per_chunk_count{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval]))",
           "format": "time_series",
           "legendFormat": "Index Entries",
           "legendLink": null
@@ -981,19 +981,19 @@
       "span": 12,
       "targets": [
         {
-          "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[1m])) by (le))",
+          "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by (le))",
           "format": "time_series",
           "legendFormat": "p99",
           "legendLink": null
         },
         {
-          "expr": "histogram_quantile(0.90, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[1m])) by (le))",
+          "expr": "histogram_quantile(0.90, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by (le))",
           "format": "time_series",
           "legendFormat": "p90",
           "legendLink": null
         },
         {
-          "expr": "histogram_quantile(0.50, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[1m])) by (le))",
+          "expr": "histogram_quantile(0.50, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by (le))",
           "format": "time_series",
           "legendFormat": "p50",
           "legendLink": null
@@ -1052,19 +1052,19 @@
       "span": 12,
       "targets": [
         {
-          "expr": "histogram_quantile(0.5, sum(rate(loki_ingester_chunk_bounds_hours_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m])) by (le))",
+          "expr": "histogram_quantile(0.5, sum(rate(loki_ingester_chunk_bounds_hours_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by (le))",
           "format": "time_series",
           "legendFormat": "p50",
           "legendLink": null
         },
         {
-          "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_bounds_hours_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m])) by (le))",
+          "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_bounds_hours_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by (le))",
           "format": "time_series",
           "legendFormat": "p99",
           "legendLink": null
         },
         {
-          "expr": "sum(rate(loki_ingester_chunk_bounds_hours_sum{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m])) / sum(rate(loki_ingester_chunk_bounds_hours_count{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m]))",
+          "expr": "sum(rate(loki_ingester_chunk_bounds_hours_sum{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) / sum(rate(loki_ingester_chunk_bounds_hours_count{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval]))",
           "format": "time_series",
"legendFormat": "avg", "legendLink": null diff --git a/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-deletion.json b/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-deletion.json index 939f37e4..2db2b7cb 100644 --- a/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-deletion.json +++ b/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-deletion.json @@ -579,7 +579,7 @@ "span": 6, "targets": [ { - "expr": "sum(rate(loki_compactor_deleted_lines{cluster=~\"$cluster\",job=~\"$namespace/compactor\"}[$__rate_interval])) by (user)", + "expr": "sum(rate(loki_compactor_deleted_lines{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"compactor\"}[$__rate_interval])) by (user)", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null diff --git a/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-logs.json b/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-logs.json index 90691632..32b8e52a 100644 --- a/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-logs.json +++ b/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-logs.json @@ -114,6 +114,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -236,7 +241,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[$__rate_interval]))", "refId": "A" } ], @@ -287,6 +292,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "bytes" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -373,6 +383,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -408,7 +423,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[$__rate_interval]))", "refId": "A" } ], @@ -459,6 +474,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -494,7 +514,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[$__rate_interval]))", "refId": "A" } ], @@ -632,6 +652,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -667,7 +692,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by 
(level)", + "expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[$__rate_interval])) by (level)", "legendFormat": "{{level}}", "refId": "A" } @@ -719,6 +744,11 @@ "dashLength": 10, "dashes": false, "datasource": "$loki_datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -771,7 +801,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" [5m])) by (level)", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" | __error__=\"\" [$__auto])) by (level)", "intervalFactor": 3, "legendFormat": "{{level}}", "refId": "A" diff --git a/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-mixin-recording-rules.json b/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-mixin-recording-rules.json index 1234065b..f1f6c215 100644 --- a/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-mixin-recording-rules.json +++ b/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-mixin-recording-rules.json @@ -300,7 +300,8 @@ "value": 80 } ] - } + }, + "unit": "s" }, "overrides": [ ] }, diff --git a/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-operational.json b/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-operational.json index 7d25fc64..a54d3daa 100644 --- a/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-operational.json +++ b/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-operational.json @@ -87,7 +87,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\")\n)", + "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\")\n)", "legendFormat": "{{status}}", "refId": "A" } @@ -183,7 +183,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/distributor\", route=~\"api_prom_push|loki_api_v1_push\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", + "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/distributor\", route=~\"api_prom_push|loki_api_v1_push\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", 
\"([a-z]+)\"))", "legendFormat": "{{status}}", "refId": "A" } @@ -278,7 +278,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant))", + "expr": "topk(10, sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (tenant))", "legendFormat": "{{tenant}}", "refId": "A" } @@ -332,7 +332,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "MBs" }, "overrides": [ ] }, @@ -374,7 +375,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant)) / 1024 / 1024", + "expr": "topk(10, sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (tenant)) / 1024 / 1024", "legendFormat": "{{tenant}}", "refId": "A" } @@ -524,7 +525,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -630,7 +632,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -834,7 +837,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -1040,7 +1044,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -1148,7 +1153,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -1353,7 +1359,8 @@ "description": "", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -1602,7 +1609,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10,sum by (tenant, reason) (rate(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[1m])))", + "expr": "topk(10,sum by (tenant, reason) (rate(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])))", "interval": "", "legendFormat": "{{ tenant }} - {{ reason }}", "refId": "A" @@ -1727,7 +1734,7 @@ ], "targets": [ { - "expr": "topk(10, sum by (tenant, reason) (sum_over_time(increase(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[1m])[$__range:1m])))", + "expr": "topk(10, sum by (tenant, reason) (sum_over_time(increase(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])[$__range:$__rate_interval])))", "format": "table", "instant": true, "interval": "", @@ -1852,6 +1859,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -1985,7 +1997,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\"} | logfmt | level=\"error\"[$__auto]))", "refId": "A" } ], @@ -2153,6 +2165,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2189,7 +2206,7 @@ 
"steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2242,6 +2259,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2278,7 +2300,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2331,6 +2353,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2367,7 +2394,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2525,6 +2552,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2658,7 +2690,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\"} | logfmt | level=\"error\"[$__auto]))", "refId": "A" } ], @@ -2971,7 +3003,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum by (tenant) (rate(loki_ingester_streams_created_total{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m]) > 0))", + "expr": "topk(10, sum by (tenant) (rate(loki_ingester_streams_created_total{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[$__rate_interval]) > 0))", "interval": "", "legendFormat": "{{ tenant }}", "refId": "A" @@ -3081,13 +3113,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_ingester_chunks_flushed_total{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m]))", + "expr": "sum(rate(loki_ingester_chunks_flushed_total{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[$__rate_interval]))", "interval": "", "legendFormat": "Chunks", "refId": "A" }, { - "expr": "sum(increase(loki_chunk_store_deduped_chunks_total{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m]))/sum(increase(loki_ingester_chunks_flushed_total{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m])) < 1", + "expr": "sum(increase(loki_chunk_store_deduped_chunks_total{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[$__rate_interval]))/sum(increase(loki_ingester_chunks_flushed_total{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[$__rate_interval])) < 1", "interval": "", "legendFormat": "De-Dupe Ratio", "refId": "B" @@ -3165,7 +3197,7 @@ "reverseYBuckets": false, "targets": [ { - "expr": 
"sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m])) by (le)", + "expr": "sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[$__rate_interval])) by (le)", "format": "heatmap", "instant": false, "interval": "", @@ -3320,7 +3352,7 @@ "reverseYBuckets": false, "targets": [ { - "expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m]))", + "expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[$__rate_interval]))", "format": "heatmap", "instant": false, "interval": "", @@ -3465,6 +3497,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -3598,7 +3635,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\"} | logfmt | level=\"error\"[$__auto]))", "refId": "A" } ], @@ -3782,6 +3819,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -3821,19 +3863,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (method, name, le, container))", "intervalFactor": 1, "legendFormat": "{{container}}: .99-{{method}}-{{name}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.9, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (method, name, le, container))", "hide": false, "legendFormat": "{{container}}: .9-{{method}}-{{name}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.5, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (method, name, le, container))", "hide": false, "legendFormat": "{{container}}: .5-{{method}}-{{name}}", "refId": "C" @@ -3925,7 +3967,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_memcache_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, method, name, container)", + "expr": "sum(rate(loki_memcache_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, method, name, container)", "intervalFactor": 1, "legendFormat": "{{container}}: {{status_code}}-{{method}}-{{name}}", "refId": "A" @@ -3994,6 +4036,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 
1, "fillGradient": 0, "gridPos": { @@ -4033,19 +4080,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -4098,6 +4145,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4137,7 +4189,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, status_code, method)", + "expr": "sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, status_code, method)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -4206,6 +4258,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4243,17 +4300,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".9", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (operation, le))", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", 
namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (operation, le))", "refId": "C" } ], @@ -4304,6 +4361,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4341,20 +4403,20 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (operation, le))", "interval": "", "intervalFactor": 1, "legendFormat": "99%", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "90%", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "50%", "refId": "C" @@ -4407,6 +4469,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4444,20 +4511,20 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (operation, le))", "interval": "", "intervalFactor": 1, "legendFormat": "99%", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "90%", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, 
sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "50%", "refId": "C" @@ -4510,6 +4577,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4547,17 +4619,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".9", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (operation, le))", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (operation, le))", "refId": "C" } ], @@ -4608,6 +4680,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4645,7 +4722,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4698,6 +4775,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4735,7 +4817,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4788,6 +4870,11 @@ "dashLength": 10, "dashes": 
false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4825,7 +4912,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4878,6 +4965,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4915,7 +5007,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4984,6 +5076,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -5023,19 +5120,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5127,7 +5224,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_gcs_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_gcs_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -5231,7 +5328,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": 
"sum(rate(loki_dynamo_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -5317,7 +5414,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_consumed_capacity_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_consumed_capacity_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -5403,7 +5500,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_throttled_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_throttled_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -5489,7 +5586,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_dropped_requests_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_dropped_requests_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -5575,17 +5672,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": ".99", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": ".9", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": ".5", "refId": "C" } @@ -5637,6 +5734,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -5675,19 +5777,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5778,7 +5880,7 
@@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_dynamo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -5847,6 +5949,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -5885,19 +5992,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5988,7 +6095,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_s3_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_s3_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -6057,6 +6164,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -6095,19 +6207,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", 
namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6198,7 +6310,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_azure_blob_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_azure_blob_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -6267,6 +6379,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -6305,19 +6422,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6408,7 +6525,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" diff --git a/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-reads-resources.json b/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-reads-resources.json index 7141e403..ed3965e5 100644 --- a/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-reads-resources.json +++ b/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-reads-resources.json @@ -1364,7 +1364,7 @@ }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "datasource": "$datasource", @@ -1429,6 +1429,7 @@ } ] }, + "gridPos": { }, "id": 19, "links": [ ], "options": { @@ -1440,6 +1441,411 @@ "sort": "none" } }, + "targets": [ + { + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", 
container=~\"bloom-gateway\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + }, + { + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\", resource=\"cpu\"} > 0)", + "format": "time_series", + "legendFormat": "request", + "legendLink": null + }, + { + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"})", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "CPU", + "tooltip": { + "sort": 2 + }, + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "request" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FFC000", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E02F44", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { }, + "id": 20, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"})", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + }, + { + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\", resource=\"memory\"} > 0)", + "format": "time_series", + "legendFormat": "request", + "legendLink": null + }, + { + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"} > 0)", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "Memory (workingset)", + "tooltip": { + "sort": 2 + }, + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "bytes" + }, + "overrides": [ ] + }, + "gridPos": { }, + "id": 21, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\"})", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Memory (go heap inuse)", + "tooltip": { + "sort": 2 + }, + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + 
"fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "gridPos": { }, + "id": 22, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "format": "time_series", + "legendFormat": "{{pod}} - {{device}}", + "legendLink": null + } + ], + "title": "Disk Writes", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "gridPos": { }, + "id": 23, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "format": "time_series", + "legendFormat": "{{pod}} - {{device}}", + "legendLink": null + } + ], + "title": "Disk Reads", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "percentunit" + }, + "overrides": [ ] + }, + "gridPos": { }, + "id": 24, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} / kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{cluster=~\"$cluster\", namespace=~\"$namespace\",label_name=~\"bloom-gateway.*\"})", + "format": "time_series", + "legendFormat": "{{persistentvolumeclaim}}", + "legendLink": null + } + ], + "title": "Disk Space Utilization", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Bloom Gateway", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + 
"spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "request" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FFC000", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E02F44", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "id": 25, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, "span": 4, "targets": [ { @@ -1530,7 +1936,7 @@ } ] }, - "id": 20, + "id": 26, "links": [ ], "options": { "legend": { @@ -1592,7 +1998,7 @@ }, "overrides": [ ] }, - "id": 21, + "id": 27, "links": [ ], "options": { "legend": { @@ -1655,7 +2061,7 @@ "overrides": [ ] }, "gridPos": { }, - "id": 22, + "id": 28, "links": [ ], "options": { "legend": { @@ -1741,7 +2147,7 @@ ] }, "gridPos": { }, - "id": 23, + "id": 29, "links": [ ], "options": { "legend": { @@ -1842,7 +2248,7 @@ ] }, "gridPos": { }, - "id": 24, + "id": 30, "links": [ ], "options": { "legend": { @@ -1904,7 +2310,7 @@ "overrides": [ ] }, "gridPos": { }, - "id": 25, + "id": 31, "links": [ ], "options": { "legend": { diff --git a/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-reads.json b/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-reads.json index c8cda840..99692299 100644 --- a/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-reads.json +++ b/monitoring-mixins/loki-mixin/deploy/dashboards_out/loki-reads.json @@ -215,7 +215,7 @@ "stack": true, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -263,19 +263,19 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", + "expr": 
"histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}) by (route) ", "format": "time_series", "legendFormat": 
"{{ route }} Average", "refId": "C" @@ -341,7 +341,7 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval])) by (le,pod)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}[$__rate_interval])) by (le,pod)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -554,7 +554,7 @@ "stack": true, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -602,19 +602,19 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by 
(le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}) by (route) ", "format": "time_series", "legendFormat": "{{ route }} Average", "refId": "C" @@ -680,7 +680,7 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval])) by (le,pod)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\", 
route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}[$__rate_interval])) by (le,pod)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -893,7 +893,7 @@ "stack": true, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -941,19 +941,19 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", 
job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) ", "format": "time_series", "legendFormat": "{{ route }} Average", "refId": "C" @@ -1019,7 +1019,7 @@ "span": 4, "targets": [ { - 
"expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (le,pod)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval])) by (le,pod)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1232,7 +1232,7 @@ "stack": true, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -1280,19 +1280,19 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", 
route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by 
(route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) ", "format": "time_series", "legendFormat": "{{ route }} Average", "refId": "C" @@ -1358,7 +1358,7 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (le,pod)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval])) by (le,pod)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1569,6 +1569,684 @@ }, "span": 4, "stack": true, + "targets": [ + { + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/index-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "QPS", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": 
"line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 14, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", + "format": "time_series", + "legendFormat": "{{ route }} 99th percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", + "format": "time_series", + "legendFormat": "{{ route }} 50th percentile", + "refId": "B" + }, + { + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", 
route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) ", + "format": "time_series", + "legendFormat": "{{ route }} Average", + "refId": "C" + } + ], + "title": "Latency", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 15, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/index-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval])) by (le,pod)) * 1e3", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "__auto", + "refId": "A", + "step": 10 + } + ], + "title": "Per Pod Latency (p99)", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Index Gateway", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", + "error": "#E24D42", + "success": "#7EB26D" + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ + { + 
"matcher": { + "id": "byName", + "options": "1xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "2xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "3xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "4xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "5xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "OK" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cancel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#A9A9A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + } + ] + }, + "fill": 10, + "id": 16, + "linewidth": 0, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "stack": true, + "targets": [ + { + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "QPS", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 17, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": 
"histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", + "format": "time_series", + "legendFormat": "{{ route }} 99th percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", + "format": "time_series", + "legendFormat": "{{ route }} 50th percentile", + "refId": "B" + }, + { + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", 
route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) ", + "format": "time_series", + "legendFormat": "{{ route }} Average", + "refId": "C" + } + ], + "title": "Latency", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 18, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval])) by (le,pod)) * 1e3", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "__auto", + "refId": "A", + "step": 10 + } + ], + "title": "Per Pod Latency (p99)", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Bloom Gateway", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", + "error": "#E24D42", + "success": "#7EB26D" + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ + { + 
"matcher": { + "id": "byName", + "options": "1xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "2xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "3xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "4xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "5xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "OK" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cancel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#A9A9A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + } + ] + }, + "fill": 10, + "id": 19, + "linewidth": 0, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "stack": true, "targets": [ { "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_index_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/querier\", operation!=\"index_chunk\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", @@ -1604,7 +2282,7 @@ }, "overrides": [ ] }, - "id": 14, + "id": 20, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -1682,7 +2360,7 @@ }, "overrides": [ ] }, - "id": 15, + "id": 21, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -1714,7 +2392,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Index", + "title": "TSBD Index", "titleSize": "h6" }, { @@ -1894,7 +2572,7 @@ ] }, "fill": 10, - "id": 16, + "id": 22, "linewidth": 0, "links": [ ], "options": { @@ -1943,7 +2621,7 @@ }, "overrides": [ ] }, - "id": 17, + "id": 23, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -2021,7 +2699,7 @@ }, "overrides": [ ] }, - "id": 18, + "id": 24, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -2053,7 +2731,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "BoltDB Shipper", + "title": "BoltDB Index", "titleSize": "h6" } ], diff --git a/monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml b/monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml index af06880c..7c0825d8 100644 --- a/monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml +++ b/monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml @@ -3,8 +3,9 @@ groups: rules: - alert: LokiRequestErrors annotations: - message: | + description: | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% 
errors. + summary: Loki request error rate is high. expr: | 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) / @@ -15,16 +16,18 @@ groups: severity: critical - alert: LokiRequestPanics annotations: - message: | + description: | {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + summary: Loki requests are causing code panics. expr: | sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 labels: severity: critical - alert: LokiRequestLatency annotations: - message: | + description: | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + summary: Loki request latency is high. expr: | cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 for: 15m @@ -32,8 +35,9 @@ severity: critical - alert: LokiTooManyCompactorsRunning annotations: - message: | + description: | {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. + summary: Loki deployment is running more than one compactor. expr: | sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 for: 5m diff --git a/monitoring-mixins/loki-mixin/deploy/manifests/k8s-all-in-one.yaml b/monitoring-mixins/loki-mixin/deploy/manifests/k8s-all-in-one.yaml index d77ef2c9..5f60f007 100644 --- a/monitoring-mixins/loki-mixin/deploy/manifests/k8s-all-in-one.yaml +++ b/monitoring-mixins/loki-mixin/deploy/manifests/k8s-all-in-one.yaml @@ -419,7 +419,7 @@ data: "span": 6, "targets": [ { - "expr": "sum(rate(loki_chunk_store_index_entries_per_chunk_sum{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m])) / sum(rate(loki_chunk_store_index_entries_per_chunk_count{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m]))", + "expr": "sum(rate(loki_chunk_store_index_entries_per_chunk_sum{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) / sum(rate(loki_chunk_store_index_entries_per_chunk_count{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval]))", "format": "time_series", "legendFormat": "Index Entries", "legendLink": null @@ -984,19 +984,19 @@ data: "span": 12, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[1m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by (le))", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "histogram_quantile(0.90, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[1m])) by (le))", + "expr": "histogram_quantile(0.90, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by (le))", "format": "time_series", "legendFormat": "p90", "legendLink": null }, { - "expr": "histogram_quantile(0.50, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[1m])) by (le))", + "expr": "histogram_quantile(0.50, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by (le))", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -1055,19 +1055,19 @@ 
data: "span": 12, "targets": [ { - "expr": "histogram_quantile(0.5, sum(rate(loki_ingester_chunk_bounds_hours_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m])) by (le))", + "expr": "histogram_quantile(0.5, sum(rate(loki_ingester_chunk_bounds_hours_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by (le))", "format": "time_series", "legendFormat": "p50", "legendLink": null }, { - "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_bounds_hours_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_bounds_hours_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by (le))", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "sum(rate(loki_ingester_chunk_bounds_hours_sum{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m])) / sum(rate(loki_ingester_chunk_bounds_hours_count{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m]))", + "expr": "sum(rate(loki_ingester_chunk_bounds_hours_sum{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) / sum(rate(loki_ingester_chunk_bounds_hours_count{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval]))", "format": "time_series", "legendFormat": "avg", "legendLink": null @@ -1781,7 +1781,7 @@ data: "span": 6, "targets": [ { - "expr": "sum(rate(loki_compactor_deleted_lines{cluster=~\"$cluster\",job=~\"$namespace/compactor\"}[$__rate_interval])) by (user)", + "expr": "sum(rate(loki_compactor_deleted_lines{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"compactor\"}[$__rate_interval])) by (user)", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null @@ -2068,6 +2068,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2190,7 +2195,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[$__rate_interval]))", "refId": "A" } ], @@ -2241,6 +2246,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "bytes" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2327,6 +2337,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2362,7 +2377,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[$__rate_interval]))", "refId": "A" } ], @@ -2413,6 +2428,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2448,7 +2468,7 @@ data: "steppedLine": false, "targets": [ { - "expr": 
"sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[$__rate_interval]))", "refId": "A" } ], @@ -2586,6 +2606,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2621,7 +2646,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)", + "expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[$__rate_interval])) by (level)", "legendFormat": "{{level}}", "refId": "A" } @@ -2673,6 +2698,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$loki_datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2725,7 +2755,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" [5m])) by (level)", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" | __error__=\"\" [$__auto])) by (level)", "intervalFactor": 3, "legendFormat": "{{level}}", "refId": "A" @@ -3338,7 +3368,8 @@ data: "value": 80 } ] - } + }, + "unit": "s" }, "overrides": [ ] }, @@ -3860,7 +3891,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\")\n)", + "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\")\n)", "legendFormat": "{{status}}", "refId": "A" } @@ -3956,7 +3987,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/distributor\", route=~\"api_prom_push|loki_api_v1_push\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", + "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/distributor\", route=~\"api_prom_push|loki_api_v1_push\"}[$__rate_interval]),\n 
\"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", "legendFormat": "{{status}}", "refId": "A" } @@ -4051,7 +4082,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "topk(10, sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant))", + "expr": "topk(10, sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (tenant))", "legendFormat": "{{tenant}}", "refId": "A" } @@ -4105,7 +4136,8 @@ data: "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "MBs" }, "overrides": [ ] }, @@ -4147,7 +4179,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "topk(10, sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant)) / 1024 / 1024", + "expr": "topk(10, sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (tenant)) / 1024 / 1024", "legendFormat": "{{tenant}}", "refId": "A" } @@ -4297,7 +4329,8 @@ data: "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -4403,7 +4436,8 @@ data: "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -4607,7 +4641,8 @@ data: "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -4813,7 +4848,8 @@ data: "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -4921,7 +4957,8 @@ data: "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -5126,7 +5163,8 @@ data: "description": "", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -5375,7 +5413,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "topk(10,sum by (tenant, reason) (rate(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[1m])))", + "expr": "topk(10,sum by (tenant, reason) (rate(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])))", "interval": "", "legendFormat": "{{ tenant }} - {{ reason }}", "refId": "A" @@ -5500,7 +5538,7 @@ data: ], "targets": [ { - "expr": "topk(10, sum by (tenant, reason) (sum_over_time(increase(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[1m])[$__range:1m])))", + "expr": "topk(10, sum by (tenant, reason) (sum_over_time(increase(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])[$__range:$__rate_interval])))", "format": "table", "instant": true, "interval": "", @@ -5625,6 +5663,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -5758,7 +5801,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\"} | logfmt | level=\"error\"[$__auto]))", "refId": "A" } ], @@ -5926,6 +5969,11 @@ data: 
"dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -5962,7 +6010,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -6015,6 +6063,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -6051,7 +6104,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -6104,6 +6157,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -6140,7 +6198,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -6298,6 +6356,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -6431,7 +6494,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\"} | logfmt | level=\"error\"[$__auto]))", "refId": "A" } ], @@ -6744,7 +6807,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "topk(10, sum by (tenant) (rate(loki_ingester_streams_created_total{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m]) > 0))", + "expr": "topk(10, sum by (tenant) (rate(loki_ingester_streams_created_total{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[$__rate_interval]) > 0))", "interval": "", "legendFormat": "{{ tenant }}", "refId": "A" @@ -6854,13 +6917,13 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_ingester_chunks_flushed_total{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m]))", + "expr": "sum(rate(loki_ingester_chunks_flushed_total{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[$__rate_interval]))", "interval": "", "legendFormat": "Chunks", "refId": "A" }, { - "expr": "sum(increase(loki_chunk_store_deduped_chunks_total{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m]))/sum(increase(loki_ingester_chunks_flushed_total{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m])) < 1", + "expr": "sum(increase(loki_chunk_store_deduped_chunks_total{cluster=\"$cluster\", 
job=~\"($namespace)/ingester.*\"}[$__rate_interval]))/sum(increase(loki_ingester_chunks_flushed_total{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[$__rate_interval])) < 1", "interval": "", "legendFormat": "De-Dupe Ratio", "refId": "B" @@ -6938,7 +7001,7 @@ data: "reverseYBuckets": false, "targets": [ { - "expr": "sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m])) by (le)", + "expr": "sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[$__rate_interval])) by (le)", "format": "heatmap", "instant": false, "interval": "", @@ -7093,7 +7156,7 @@ data: "reverseYBuckets": false, "targets": [ { - "expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m]))", + "expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[$__rate_interval]))", "format": "heatmap", "instant": false, "interval": "", @@ -7238,6 +7301,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -7371,7 +7439,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\"} | logfmt | level=\"error\"[$__auto]))", "refId": "A" } ], @@ -7555,6 +7623,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -7594,19 +7667,19 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (method, name, le, container))", "intervalFactor": 1, "legendFormat": "{{container}}: .99-{{method}}-{{name}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.9, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (method, name, le, container))", "hide": false, "legendFormat": "{{container}}: .9-{{method}}-{{name}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.5, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (method, name, le, container))", "hide": false, "legendFormat": "{{container}}: .5-{{method}}-{{name}}", "refId": "C" @@ -7698,7 +7771,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_memcache_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, method, name, container)", + "expr": 
"sum(rate(loki_memcache_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, method, name, container)", "intervalFactor": 1, "legendFormat": "{{container}}: {{status_code}}-{{method}}-{{name}}", "refId": "A" @@ -7767,6 +7840,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -7806,19 +7884,19 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -7871,6 +7949,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -7910,7 +7993,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, status_code, method)", + "expr": "sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, status_code, method)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -7979,6 +8062,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -8016,17 +8104,17 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".9", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", 
namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (operation, le))", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (operation, le))", "refId": "C" } ], @@ -8077,6 +8165,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -8114,20 +8207,20 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (operation, le))", "interval": "", "intervalFactor": 1, "legendFormat": "99%", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "90%", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "50%", "refId": "C" @@ -8180,6 +8273,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -8217,20 +8315,20 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (operation, le))", "interval": "", "intervalFactor": 1, "legendFormat": "99%", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, 
sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "90%", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "50%", "refId": "C" @@ -8283,6 +8381,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -8320,17 +8423,17 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".9", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (operation, le))", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (operation, le))", "refId": "C" } ], @@ -8381,6 +8484,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -8418,7 +8526,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -8471,6 +8579,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ 
-8508,7 +8621,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -8561,6 +8674,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -8598,7 +8716,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -8651,6 +8769,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -8688,7 +8811,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -8757,6 +8880,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -8796,19 +8924,19 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -8900,7 +9028,7 @@ data: 
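On the `"unit"` additions scattered through these hunks: they are standard Grafana unit identifiers, filling in panels that previously rendered bare numbers. The ones introduced in this diff are `s` (seconds), `ops` (operations/sec), `Bps` (bytes/sec, SI), `binBps` (bytes/sec, IEC 1024-based), `MBs` (megabytes/sec), and `percentunit` (a 0-1 ratio rendered as a percentage). Each lands in the same place in the panel schema:

    "fieldConfig": {
      "defaults": {
        "custom": { },
        "unit": "s"
      },
      "overrides": [ ]
    }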
"steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_gcs_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_gcs_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -9004,7 +9132,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -9090,7 +9218,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_consumed_capacity_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_consumed_capacity_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -9176,7 +9304,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_throttled_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_throttled_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -9262,7 +9390,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_dropped_requests_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_dropped_requests_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -9348,17 +9476,17 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": ".99", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": ".9", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": ".5", "refId": "C" } @@ -9410,6 +9538,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -9448,19 +9581,19 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, 
sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -9551,7 +9684,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_dynamo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -9620,6 +9753,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -9658,19 +9796,19 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -9761,7 +9899,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_s3_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_s3_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -9830,6 +9968,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -9868,19 +10011,19 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", 
namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -9971,7 +10114,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_azure_blob_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_azure_blob_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -10040,6 +10183,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -10078,19 +10226,19 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -10181,7 +10329,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -11727,7 +11875,7 @@ data: }, { "collapse": false, - "height": "250px", + "collapsed": false, 
"panels": [ { "datasource": "$datasource", @@ -11792,6 +11940,7 @@ data: } ] }, + "gridPos": { }, "id": 19, "links": [ ], "options": { @@ -11803,22 +11952,21 @@ data: "sort": "none" } }, - "span": 4, "targets": [ { - "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\"}[$__rate_interval]))", + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"}[$__rate_interval]))", "format": "time_series", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\", resource=\"cpu\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\", resource=\"cpu\"} > 0)", "format": "time_series", "legendFormat": "request", "legendLink": null }, { - "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\"})", + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"})", "format": "time_series", "legendFormat": "limit", "legendLink": null @@ -11893,6 +12041,7 @@ data: } ] }, + "gridPos": { }, "id": 20, "links": [ ], "options": { @@ -11904,22 +12053,21 @@ data: "sort": "none" } }, - "span": 4, "targets": [ { - "expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\"})", + "expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"})", "format": "time_series", "legendFormat": "{{pod}}", "legendLink": null }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\", resource=\"memory\"} > 0)", + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\", resource=\"memory\"} > 0)", "format": "time_series", "legendFormat": "request", "legendLink": null }, { - "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\"} > 0)", + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"} > 0)", "format": "time_series", "legendFormat": "limit", "legendLink": null @@ -11955,6 +12103,7 @@ data: }, "overrides": [ ] }, + "gridPos": { }, "id": 21, "links": [ ], "options": { @@ -11966,10 +12115,9 @@ data: "sort": "none" } }, - "span": 4, "targets": [ { - "expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/ingester.+\"})", + "expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\"})", "format": "time_series", "legendFormat": "{{pod}}", "legendLink": null @@ -11980,40 +12128,28 @@ data: "sort": 2 }, "type": "timeseries" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Ingester", - "titleSize": "h6" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + }, { "datasource": 
"$datasource", "fieldConfig": { "defaults": { "custom": { "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, + "fillOpacity": 100, + "lineWidth": 0, "pointSize": 5, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" } }, "thresholds": { "mode": "absolute", "steps": [ ] }, - "unit": "short" + "unit": "Bps" }, "overrides": [ ] }, @@ -12031,13 +12167,13 @@ data: }, "targets": [ { - "expr": "sum by(pod) (loki_prometheus_rule_group_rules{cluster=~\"$cluster\", job=~\"($namespace)/ruler\"}) or sum by(pod) (cortex_prometheus_rule_group_rules{cluster=~\"$cluster\", job=~\"($namespace)/ruler\"})", + "expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", "format": "time_series", - "legendFormat": "{{pod}}", + "legendFormat": "{{pod}} - {{device}}", "legendLink": null } ], - "title": "Rules", + "title": "Disk Writes", "type": "timeseries" }, { @@ -12046,62 +12182,23 @@ data: "defaults": { "custom": { "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, + "fillOpacity": 100, + "lineWidth": 0, "pointSize": 5, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" } }, "thresholds": { "mode": "absolute", "steps": [ ] }, - "unit": "short" + "unit": "Bps" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "request" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FFC000", - "mode": "fixed" - } - }, - { - "id": "custom.fillOpacity", - "value": 0 - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "limit" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E02F44", - "mode": "fixed" - } - }, - { - "id": "custom.fillOpacity", - "value": 0 - } - ] - } - ] + "overrides": [ ] }, "gridPos": { }, "id": 23, @@ -12117,28 +12214,13 @@ data: }, "targets": [ { - "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"}[$__rate_interval]))", - "format": "time_series", - "legendFormat": "{{pod}}", - "legendLink": null - }, - { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\", resource=\"cpu\"} > 0)", - "format": "time_series", - "legendFormat": "request", - "legendLink": null - }, - { - "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"})", + "expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", "format": "time_series", - "legendFormat": "limit", + "legendFormat": "{{pod}} - {{device}}", "legendLink": null } ], - "title": "CPU", - "tooltip": { - "sort": 2 - }, + "title": "Disk Reads", "type": "timeseries" }, { @@ -12161,18 +12243,78 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "bytes" + "unit": "percentunit" 
}, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "request" - }, - "properties": [ - { - "id": "color", - "value": { + "overrides": [ ] + }, + "gridPos": { }, + "id": 24, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} / kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{cluster=~\"$cluster\", namespace=~\"$namespace\",label_name=~\"bloom-gateway.*\"})", + "format": "time_series", + "legendFormat": "{{persistentvolumeclaim}}", + "legendLink": null + } + ], + "title": "Disk Space Utilization", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Bloom Gateway", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "request" + }, + "properties": [ + { + "id": "color", + "value": { "fixedColor": "#FFC000", "mode": "fixed" } @@ -12204,9 +12346,1223 @@ data: } ] }, - "gridPos": { }, - "id": 24, + "id": 25, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + }, + { + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\", resource=\"cpu\"} > 0)", + "format": "time_series", + "legendFormat": "request", + "legendLink": null + }, + { + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\"})", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "CPU", + "tooltip": { + "sort": 2 + }, + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "request" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FFC000", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E02F44", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 
0 + } + ] + } + ] + }, + "id": 26, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\"})", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + }, + { + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\", resource=\"memory\"} > 0)", + "format": "time_series", + "legendFormat": "request", + "legendLink": null + }, + { + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\"} > 0)", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "Memory (workingset)", + "tooltip": { + "sort": 2 + }, + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "bytes" + }, + "overrides": [ ] + }, + "id": 27, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/ingester.+\"})", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Memory (go heap inuse)", + "tooltip": { + "sort": 2 + }, + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Ingester", + "titleSize": "h6" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "gridPos": { }, + "id": 28, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by(pod) (loki_prometheus_rule_group_rules{cluster=~\"$cluster\", job=~\"($namespace)/ruler\"}) or sum by(pod) (cortex_prometheus_rule_group_rules{cluster=~\"$cluster\", job=~\"($namespace)/ruler\"})", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Rules", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "request" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FFC000", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + }, + { + "matcher": { + "id": 
"byName", + "options": "limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E02F44", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { }, + "id": 29, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + }, + { + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\", resource=\"cpu\"} > 0)", + "format": "time_series", + "legendFormat": "request", + "legendLink": null + }, + { + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"})", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "CPU", + "tooltip": { + "sort": 2 + }, + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "request" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FFC000", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E02F44", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { }, + "id": 30, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"})", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + }, + { + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\", resource=\"memory\"} > 0)", + "format": "time_series", + "legendFormat": "request", + "legendLink": null + }, + { + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"} > 0)", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "Memory (workingset)", + "tooltip": { + "sort": 2 + }, + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "bytes" + }, + "overrides": [ ] + }, + "gridPos": { }, + "id": 31, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": 
"none" + } + }, + "targets": [ + { + "expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/ruler\"})", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Memory (go heap inuse)", + "tooltip": { + "sort": 2 + }, + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Ruler", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "loki" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ ], + "query": "label_values(loki_build_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ ], + "query": "label_values(loki_build_info{cluster=~\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Loki / Reads Resources", + "uid": "reads-resources", + "version": 0 + } +kind: ConfigMap +metadata: + annotations: + grafana_dashboard_folder: /dashboards/Loki Mixin + labels: + grafana_dashboard: "1" + name: loki-reads-resources.json + namespace: monitoring-system +--- +apiVersion: v1 +data: + loki-reads.json: |- + { + "annotations": { + "list": [ ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ + "loki" + ], + "targetBlank": false, + "title": "Loki Dashboards", + "type": "dashboards" + } + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", + "error": "#E24D42", + "success": "#7EB26D" + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "1xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + 
"matcher": { + "id": "byName", + "options": "2xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "3xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "4xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "5xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "OK" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cancel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#A9A9A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + } + ] + }, + "fill": 10, + "id": 1, + "linewidth": 0, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "stack": true, + "targets": [ + { + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "QPS", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 2, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", 
route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"})) * 1e3", + "format": "time_series", + "legendFormat": "{{ route }} 99th percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"})) * 1e3", + "format": "time_series", + "legendFormat": "{{ route }} 50th percentile", + "refId": "B" + }, + { + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}) by (route) ", + "format": "time_series", + "legendFormat": "{{ route }} Average", + "refId": "C" + } + ], + "title": "Latency", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 3, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", 
route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}[$__rate_interval])) by (le,pod)) * 1e3", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "__auto", + "refId": "A", + "step": 10 + } + ], + "title": "Per Pod Latency (p99)", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Frontend (query-frontend)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", + "error": "#E24D42", + "success": "#7EB26D" + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "1xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "2xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "3xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "4xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "5xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "OK" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cancel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#A9A9A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + } + ] + }, + "fill": 10, + "id": 4, + "linewidth": 0, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "stack": true, + "targets": [ + { + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/querier\", 
route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "QPS", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 5, "links": [ ], + "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -12216,31 +13572,47 @@ data: "sort": "none" } }, + "span": 4, "targets": [ { - "expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"})", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"})) * 1e3", "format": "time_series", - "legendFormat": "{{pod}}", - "legendLink": null + "legendFormat": "{{ route }} 99th percentile", + "refId": "A" }, { - "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\", resource=\"memory\"} > 0)", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"})) * 1e3", "format": "time_series", - "legendFormat": "request", - "legendLink": null + "legendFormat": "{{ route }} 50th percentile", + "refId": "B" }, { - "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"ruler\"} > 0)", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", 
route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}) by (route) ", "format": "time_series", - "legendFormat": "limit", - "legendLink": null + "legendFormat": "{{ route }} Average", + "refId": "C" } ], - "title": "Memory (workingset)", - "tooltip": { - "sort": 2 - }, - "type": "timeseries" + "title": "Latency", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] }, { "datasource": "$datasource", @@ -12262,13 +13634,13 @@ data: "mode": "absolute", "steps": [ ] }, - "unit": "bytes" + "unit": "ms" }, "overrides": [ ] }, - "gridPos": { }, - "id": 25, + "id": 6, "links": [ ], + "nullPointMode": "null as zero", "options": { "legend": { "showLegend": true @@ -12278,18 +13650,19 @@ data: "sort": "none" } }, + "span": 4, "targets": [ { - "expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/ruler\"})", + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}[$__rate_interval])) by (le,pod)) * 1e3", "format": "time_series", - "legendFormat": "{{pod}}", - "legendLink": null + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "__auto", + "refId": "A", + "step": 10 } ], - "title": "Memory (go heap inuse)", - "tooltip": { - "sort": 2 - }, + "title": "Per Pod Latency (p99)", "type": "timeseries" } ], @@ -12297,150 +13670,9 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ruler", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "loki" - ], - "templating": { - "list": [ - { - "current": { - "text": "default", - "value": "default" - }, - "hide": 0, - "label": "Data source", - "name": "datasource", - "options": [ ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ ], - "query": 
"label_values(loki_build_info, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ ], - "query": "label_values(loki_build_info{cluster=~\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "utc", - "title": "Loki / Reads Resources", - "uid": "reads-resources", - "version": 0 - } -kind: ConfigMap -metadata: - annotations: - grafana_dashboard_folder: /dashboards/Loki Mixin - labels: - grafana_dashboard: "1" - name: loki-reads-resources.json - namespace: monitoring-system ---- -apiVersion: v1 -data: - loki-reads.json: |- - { - "annotations": { - "list": [ ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - { - "asDropdown": true, - "icon": "external link", - "includeVars": true, - "keepTime": true, - "tags": [ - "loki" - ], - "targetBlank": false, - "title": "Loki Dashboards", - "type": "dashboards" - } - ], - "refresh": "10s", - "rows": [ + "title": "Querier", + "titleSize": "h6" + }, { "collapse": false, "height": "250px", @@ -12618,7 +13850,7 @@ data: ] }, "fill": 10, - "id": 1, + "id": 7, "linewidth": 0, "links": [ ], "options": { @@ -12634,7 +13866,7 @@ data: "stack": true, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -12667,7 +13899,7 @@ data: }, "overrides": [ ] }, - "id": 2, + "id": 8, 
"links": [ ], "nullPointMode": "null as zero", "options": { @@ -12682,19 +13914,19 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) ", + "expr": "1e3 * 
sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) ", "format": "time_series", "legendFormat": "{{ route }} Average", "refId": "C" @@ -12745,7 +13977,7 @@ data: }, "overrides": [ ] }, - "id": 3, + "id": 9, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -12760,7 +13992,7 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval])) by (le,pod)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval])) by (le,pod)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -12777,7 +14009,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Frontend (query-frontend)", + "title": "Ingester", "titleSize": "h6" }, { @@ -12957,7 +14189,7 @@ data: ] }, "fill": 10, - "id": 4, + "id": 10, "linewidth": 0, "links": [ ], "options": { @@ 
-12973,7 +14205,7 @@ data: "stack": true, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -13006,7 +14238,7 @@ data: }, "overrides": [ ] }, - "id": 5, + "id": 11, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -13021,19 +14253,19 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", + "expr": 
"histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) ", "format": "time_series", "legendFormat": "{{ route }} Average", "refId": "C" @@ -13084,7 +14316,7 @@ data: }, "overrides": [ ] }, - "id": 6, + "id": 12, "links": [ ], "nullPointMode": "null as zero", "options": 
{ @@ -13099,7 +14331,7 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval])) by (le,pod)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval])) by (le,pod)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13116,7 +14348,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Querier", + "title": "Ingester - Zone Aware", "titleSize": "h6" }, { @@ -13296,7 +14528,7 @@ data: ] }, "fill": 10, - "id": 7, + "id": 13, "linewidth": 0, "links": [ ], "options": { @@ -13312,7 +14544,7 @@ data: "stack": true, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/index-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -13345,7 +14577,7 @@ data: }, "overrides": [ ] }, - "id": 8, + "id": 14, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -13360,19 +14592,19 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) 
(cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", 
route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) ", "format": "time_series", "legendFormat": "{{ route }} Average", "refId": "C" @@ -13423,7 +14655,7 @@ data: }, "overrides": [ ] }, - "id": 9, + "id": 15, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -13438,7 +14670,7 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (le,pod)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/index-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval])) by (le,pod)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13455,7 +14687,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ingester", + "title": "Index Gateway", "titleSize": "h6" }, { @@ -13635,7 +14867,7 @@ data: ] }, "fill": 10, - "id": 10, + "id": 16, "linewidth": 0, "links": [ ], "options": { @@ -13651,7 +14883,7 @@ data: "stack": true, "targets": [ { - "expr": "sum by (status) (\n 
label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -13684,7 +14916,7 @@ data: }, "overrides": [ ] }, - "id": 11, + "id": 17, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -13699,19 +14931,19 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", 
route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) ", "format": "time_series", "legendFormat": "{{ route }} Average", "refId": "C" @@ -13762,7 +14994,7 @@ data: }, "overrides": [ ] }, - "id": 12, + "id": 18, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -13777,7 +15009,7 @@ data: "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", 
route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (le,pod)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval])) by (le,pod)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -13794,7 +15026,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Ingester - Zone Aware", + "title": "Bloom Gateway", "titleSize": "h6" }, { @@ -13974,7 +15206,7 @@ data: ] }, "fill": 10, - "id": 13, + "id": 19, "linewidth": 0, "links": [ ], "options": { @@ -14023,7 +15255,7 @@ data: }, "overrides": [ ] }, - "id": 14, + "id": 20, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -14101,7 +15333,7 @@ data: }, "overrides": [ ] }, - "id": 15, + "id": 21, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -14133,7 +15365,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Index", + "title": "TSBD Index", "titleSize": "h6" }, { @@ -14313,7 +15545,7 @@ data: ] }, "fill": 10, - "id": 16, + "id": 22, "linewidth": 0, "links": [ ], "options": { @@ -14362,7 +15594,7 @@ data: }, "overrides": [ ] }, - "id": 17, + "id": 23, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -14440,7 +15672,7 @@ data: }, "overrides": [ ] }, - "id": 18, + "id": 24, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -14472,7 +15704,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "BoltDB Shipper", + "title": "BoltDB Index", "titleSize": "h6" } ], @@ -18674,8 +19906,9 @@ spec: rules: - alert: LokiRequestErrors annotations: - message: | + description: | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + summary: Loki request error rate is high. expr: | 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) / @@ -18686,16 +19919,18 @@ spec: severity: critical - alert: LokiRequestPanics annotations: - message: | + description: | {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + summary: Loki requests are causing code panics. expr: | sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 labels: severity: critical - alert: LokiRequestLatency annotations: - message: | + description: | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + summary: Loki request error latency is high. 
expr: | cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 for: 15m @@ -18703,8 +19938,9 @@ spec: severity: critical - alert: LokiTooManyCompactorsRunning annotations: - message: | + description: | {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. + summary: Loki deployment is running more than one compactor. expr: | sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 for: 5m diff --git a/monitoring-mixins/loki-mixin/deploy/prometheus-alerts.yaml b/monitoring-mixins/loki-mixin/deploy/prometheus-alerts.yaml index 5f6576e7..7e99cfaf 100644 --- a/monitoring-mixins/loki-mixin/deploy/prometheus-alerts.yaml +++ b/monitoring-mixins/loki-mixin/deploy/prometheus-alerts.yaml @@ -10,8 +10,9 @@ spec: rules: - alert: LokiRequestErrors annotations: - message: | + description: | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + summary: Loki request error rate is high. expr: | 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) / @@ -22,16 +23,18 @@ spec: severity: critical - alert: LokiRequestPanics annotations: - message: | + description: | {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + summary: Loki requests are causing code panics. expr: | sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 labels: severity: critical - alert: LokiRequestLatency annotations: - message: | + description: | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + summary: Loki request error latency is high. expr: | cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 for: 15m @@ -39,8 +42,9 @@ spec: severity: critical - alert: LokiTooManyCompactorsRunning annotations: - message: | + description: | {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. + summary: Loki deployment is running more than one compactor. 
expr: | sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 for: 5m diff --git a/monitoring-mixins/loki-mixin/jsonnetfile.lock.json b/monitoring-mixins/loki-mixin/jsonnetfile.lock.json index f2675b4a..86296aed 100644 --- a/monitoring-mixins/loki-mixin/jsonnetfile.lock.json +++ b/monitoring-mixins/loki-mixin/jsonnetfile.lock.json @@ -18,8 +18,8 @@ "subdir": "grafana-builder" } }, - "version": "abf0830008f0a61f3a7f5782738b3569eb6b0203", - "sum": "+z5VY+bPBNqXcmNAV8xbJcbsRA+pro1R3IM7aIY8OlU=" + "version": "33d0c06eadc2558b339903771425c26b4e157919", + "sum": "5ku1Hd8UPnjmn8nWyaTFzMpT3Pa+VylBnmposMSVEuU=" }, { "source": { @@ -28,8 +28,8 @@ "subdir": "mixin-utils" } }, - "version": "abf0830008f0a61f3a7f5782738b3569eb6b0203", - "sum": "0jg7qc3N8FtMnnQbunYCGSNcjHr9Y1krZW9OSTmWcEQ=" + "version": "33d0c06eadc2558b339903771425c26b4e157919", + "sum": "A0f0G3aJEkdu5sqHXtizHDyU1jOSx6VuEXLRlI9Psp8=" }, { "source": { @@ -38,8 +38,8 @@ "subdir": "production/loki-mixin" } }, - "version": "82f6548183233da36a18019739c6223846f7baba", - "sum": "poponbHA5aiE9loWWsqmgd8zqGASayqJr3IxHwcjkdU=" + "version": "2d62fca05d6ec82196b46c956733c89439660754", + "sum": "hMxvpcgKvJqMAK9kooRu2RvD3ctX3mNk7/yij6RWiVs=" }, { "source": { @@ -48,8 +48,8 @@ "subdir": "operations/mimir-mixin" } }, - "version": "cd13a1b0509f877f3d84e3de5c884680563e6ab3", - "sum": "eNgijCGmvYx1X4yXMLBsdo7jo3HbDknbzSYHGt/I/MY=" + "version": "748a726fa073c7fa3ca1ea79d6ac1265656a9e01", + "sum": "Qx7y7DJqEtiHer20VwAULlh+67Ae2OGanw5pdABUvWg=" }, { "source": { @@ -58,7 +58,7 @@ "subdir": "jsonnet/kube-prometheus/lib" } }, - "version": "76f2e1ef95be0df752037baa040781c5219e1fb3", + "version": "407142589d5b38663e70f80c6c3b493e5b5e6578", "sum": "QKRgrgEZ3k9nLmLCrDBaeIGVqQZf+AvZTcnhdLk3TrA=" } ], diff --git a/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-chunks.json b/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-chunks.json index b1304ffa..59dd5286 100644 --- a/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-chunks.json +++ b/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-chunks.json @@ -416,7 +416,7 @@ "span": 6, "targets": [ { - "expr": "sum(rate(loki_chunk_store_index_entries_per_chunk_sum{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m])) / sum(rate(loki_chunk_store_index_entries_per_chunk_count{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m]))", + "expr": "sum(rate(loki_chunk_store_index_entries_per_chunk_sum{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) / sum(rate(loki_chunk_store_index_entries_per_chunk_count{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval]))", "format": "time_series", "legendFormat": "Index Entries", "legendLink": null @@ -981,19 +981,19 @@ "span": 12, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[1m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by (le))", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "histogram_quantile(0.90, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[1m])) by (le))", + "expr": "histogram_quantile(0.90, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by 
(le))", "format": "time_series", "legendFormat": "p90", "legendLink": null }, { - "expr": "histogram_quantile(0.50, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[1m])) by (le))", + "expr": "histogram_quantile(0.50, sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by (le))", "format": "time_series", "legendFormat": "p50", "legendLink": null @@ -1052,19 +1052,19 @@ "span": 12, "targets": [ { - "expr": "histogram_quantile(0.5, sum(rate(loki_ingester_chunk_bounds_hours_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m])) by (le))", + "expr": "histogram_quantile(0.5, sum(rate(loki_ingester_chunk_bounds_hours_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by (le))", "format": "time_series", "legendFormat": "p50", "legendLink": null }, { - "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_bounds_hours_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(loki_ingester_chunk_bounds_hours_bucket{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) by (le))", "format": "time_series", "legendFormat": "p99", "legendLink": null }, { - "expr": "sum(rate(loki_ingester_chunk_bounds_hours_sum{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m])) / sum(rate(loki_ingester_chunk_bounds_hours_count{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[5m]))", + "expr": "sum(rate(loki_ingester_chunk_bounds_hours_sum{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval])) / sum(rate(loki_ingester_chunk_bounds_hours_count{cluster=\"$cluster\", job=~\"$namespace/ingester.*\"}[$__rate_interval]))", "format": "time_series", "legendFormat": "avg", "legendLink": null diff --git a/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-deletion.json b/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-deletion.json index 939f37e4..2db2b7cb 100644 --- a/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-deletion.json +++ b/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-deletion.json @@ -579,7 +579,7 @@ "span": 6, "targets": [ { - "expr": "sum(rate(loki_compactor_deleted_lines{cluster=~\"$cluster\",job=~\"$namespace/compactor\"}[$__rate_interval])) by (user)", + "expr": "sum(rate(loki_compactor_deleted_lines{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"compactor\"}[$__rate_interval])) by (user)", "format": "time_series", "legendFormat": "{{user}}", "legendLink": null diff --git a/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-logs.json b/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-logs.json index 90691632..32b8e52a 100644 --- a/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-logs.json +++ b/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-logs.json @@ -114,6 +114,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -236,7 +241,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", 
pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[$__rate_interval]))", "refId": "A" } ], @@ -287,6 +292,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "bytes" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -373,6 +383,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -408,7 +423,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[$__rate_interval]))", "refId": "A" } ], @@ -459,6 +474,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -494,7 +514,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[$__rate_interval]))", "refId": "A" } ], @@ -632,6 +652,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -667,7 +692,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)", + "expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[$__rate_interval])) by (level)", "legendFormat": "{{level}}", "refId": "A" } @@ -719,6 +744,11 @@ "dashLength": 10, "dashes": false, "datasource": "$loki_datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -771,7 +801,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" [5m])) by (level)", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" | __error__=\"\" [$__auto])) by (level)", "intervalFactor": 3, "legendFormat": "{{level}}", "refId": "A" diff --git a/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-mixin-recording-rules.json b/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-mixin-recording-rules.json index 1234065b..f1f6c215 100644 --- a/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-mixin-recording-rules.json +++ b/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-mixin-recording-rules.json @@ -300,7 +300,8 @@ "value": 80 } ] - } + }, + "unit": "s" }, "overrides": [ ] }, diff --git a/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-operational.json 
b/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-operational.json index 7d25fc64..a54d3daa 100644 --- a/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-operational.json +++ b/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-operational.json @@ -87,7 +87,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\")\n)", + "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\")\n)", "legendFormat": "{{status}}", "refId": "A" } @@ -183,7 +183,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/distributor\", route=~\"api_prom_push|loki_api_v1_push\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", + "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/distributor\", route=~\"api_prom_push|loki_api_v1_push\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", "legendFormat": "{{status}}", "refId": "A" } @@ -278,7 +278,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant))", + "expr": "topk(10, sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (tenant))", "legendFormat": "{{tenant}}", "refId": "A" } @@ -332,7 +332,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "MBs" }, "overrides": [ ] }, @@ -374,7 +375,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant)) / 1024 / 1024", + "expr": "topk(10, sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (tenant)) / 1024 / 1024", "legendFormat": "{{tenant}}", "refId": "A" } @@ -524,7 +525,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -630,7 +632,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -834,7 +837,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -1040,7 +1044,8 @@ "datasource": "$datasource", "fieldConfig": { 
"defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -1148,7 +1153,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -1353,7 +1359,8 @@ "description": "", "fieldConfig": { "defaults": { - "custom": { } + "custom": { }, + "unit": "s" }, "overrides": [ ] }, @@ -1602,7 +1609,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10,sum by (tenant, reason) (rate(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[1m])))", + "expr": "topk(10,sum by (tenant, reason) (rate(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])))", "interval": "", "legendFormat": "{{ tenant }} - {{ reason }}", "refId": "A" @@ -1727,7 +1734,7 @@ ], "targets": [ { - "expr": "topk(10, sum by (tenant, reason) (sum_over_time(increase(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[1m])[$__range:1m])))", + "expr": "topk(10, sum by (tenant, reason) (sum_over_time(increase(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])[$__range:$__rate_interval])))", "format": "table", "instant": true, "interval": "", @@ -1852,6 +1859,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -1985,7 +1997,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\"} | logfmt | level=\"error\"[$__auto]))", "refId": "A" } ], @@ -2153,6 +2165,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2189,7 +2206,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2242,6 +2259,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2278,7 +2300,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2331,6 +2353,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2367,7 +2394,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2525,6 
+2552,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2658,7 +2690,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\"} | logfmt | level=\"error\"[$__auto]))", "refId": "A" } ], @@ -2971,7 +3003,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum by (tenant) (rate(loki_ingester_streams_created_total{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m]) > 0))", + "expr": "topk(10, sum by (tenant) (rate(loki_ingester_streams_created_total{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[$__rate_interval]) > 0))", "interval": "", "legendFormat": "{{ tenant }}", "refId": "A" @@ -3081,13 +3113,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_ingester_chunks_flushed_total{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m]))", + "expr": "sum(rate(loki_ingester_chunks_flushed_total{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[$__rate_interval]))", "interval": "", "legendFormat": "Chunks", "refId": "A" }, { - "expr": "sum(increase(loki_chunk_store_deduped_chunks_total{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m]))/sum(increase(loki_ingester_chunks_flushed_total{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m])) < 1", + "expr": "sum(increase(loki_chunk_store_deduped_chunks_total{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[$__rate_interval]))/sum(increase(loki_ingester_chunks_flushed_total{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[$__rate_interval])) < 1", "interval": "", "legendFormat": "De-Dupe Ratio", "refId": "B" @@ -3165,7 +3197,7 @@ "reverseYBuckets": false, "targets": [ { - "expr": "sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m])) by (le)", + "expr": "sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[$__rate_interval])) by (le)", "format": "heatmap", "instant": false, "interval": "", @@ -3320,7 +3352,7 @@ "reverseYBuckets": false, "targets": [ { - "expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m]))", + "expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[$__rate_interval]))", "format": "heatmap", "instant": false, "interval": "", @@ -3465,6 +3497,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -3598,7 +3635,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\"} | logfmt | level=\"error\"[$__auto]))", "refId": "A" } ], @@ -3782,6 +3819,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -3821,19 +3863,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, 
sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (method, name, le, container))", "intervalFactor": 1, "legendFormat": "{{container}}: .99-{{method}}-{{name}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.9, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (method, name, le, container))", "hide": false, "legendFormat": "{{container}}: .9-{{method}}-{{name}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.5, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (method, name, le, container))", "hide": false, "legendFormat": "{{container}}: .5-{{method}}-{{name}}", "refId": "C" @@ -3925,7 +3967,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_memcache_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, method, name, container)", + "expr": "sum(rate(loki_memcache_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, method, name, container)", "intervalFactor": 1, "legendFormat": "{{container}}: {{status_code}}-{{method}}-{{name}}", "refId": "A" @@ -3994,6 +4036,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4033,19 +4080,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -4098,6 +4145,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4137,7 +4189,7 @@ 
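A note on the two changes repeated throughout loki-operational.json: every hard-coded rate window ([1m] or [5m]) becomes Grafana's $__rate_interval variable, and the latency panels gain an explicit fieldConfig unit of "s". $__rate_interval resolves to at least four scrape intervals (Grafana computes it as roughly max($__interval + scrape interval, 4 * scrape interval)), so rate() and histogram_quantile() always have enough samples in the window, whereas a fixed [1m] can return empty results once the scrape interval approaches the window size. The LogQL error-rate panels switch to the Loki data source's $__auto variable instead, since $__rate_interval is Prometheus-specific. A representative before/after pair from the memcached latency panel above (the .9 and .5 quantile targets change identically):

  histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster="$cluster", namespace="$namespace"}[5m])) by (method, name, le, container))

  histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval])) by (method, name, le, container))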
"steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, status_code, method)", + "expr": "sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, status_code, method)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -4206,6 +4258,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4243,17 +4300,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".9", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (operation, le))", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (operation, le))", "refId": "C" } ], @@ -4304,6 +4361,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4341,20 +4403,20 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (operation, le))", "interval": "", "intervalFactor": 1, "legendFormat": "99%", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "90%", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", 
namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "50%", "refId": "C" @@ -4407,6 +4469,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4444,20 +4511,20 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (operation, le))", "interval": "", "intervalFactor": 1, "legendFormat": "99%", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "90%", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "50%", "refId": "C" @@ -4510,6 +4577,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4547,17 +4619,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".9", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (operation, 
le))", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (operation, le))", "refId": "C" } ], @@ -4608,6 +4680,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4645,7 +4722,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4698,6 +4775,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4735,7 +4817,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4788,6 +4870,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4825,7 +4912,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4878,6 +4965,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4915,7 +5007,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4984,6 +5076,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + 
"fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -5023,19 +5120,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5127,7 +5224,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_gcs_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_gcs_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -5231,7 +5328,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -5317,7 +5414,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_consumed_capacity_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_consumed_capacity_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -5403,7 +5500,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_throttled_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_throttled_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -5489,7 +5586,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_dropped_requests_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_dropped_requests_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -5575,17 +5672,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": ".99", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + 
"expr": "histogram_quantile(.9, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": ".9", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": ".5", "refId": "C" } @@ -5637,6 +5734,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -5675,19 +5777,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5778,7 +5880,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_dynamo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -5847,6 +5949,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -5885,19 +5992,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, 
sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5988,7 +6095,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_s3_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_s3_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -6057,6 +6164,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -6095,19 +6207,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6198,7 +6310,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_azure_blob_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_azure_blob_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -6267,6 +6379,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -6305,19 +6422,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, 
sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6408,7 +6525,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" diff --git a/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-reads-resources.json b/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-reads-resources.json index 7141e403..ed3965e5 100644 --- a/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-reads-resources.json +++ b/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-reads-resources.json @@ -1364,7 +1364,7 @@ }, { "collapse": false, - "height": "250px", + "collapsed": false, "panels": [ { "datasource": "$datasource", @@ -1429,6 +1429,7 @@ } ] }, + "gridPos": { }, "id": 19, "links": [ ], "options": { @@ -1440,6 +1441,411 @@ "sort": "none" } }, + "targets": [ + { + "expr": "sum by(pod) (rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"}[$__rate_interval]))", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + }, + { + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\", resource=\"cpu\"} > 0)", + "format": "time_series", + "legendFormat": "request", + "legendLink": null + }, + { + "expr": "min(container_spec_cpu_quota{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"} / container_spec_cpu_period{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"})", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "CPU", + "tooltip": { + "sort": 2 + }, + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "request" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FFC000", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "limit" + }, + "properties": [ + { + "id": 
"color", + "value": { + "fixedColor": "#E02F44", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { }, + "id": 20, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "max by(pod) (container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"})", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + }, + { + "expr": "min(kube_pod_container_resource_requests{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\", resource=\"memory\"} > 0)", + "format": "time_series", + "legendFormat": "request", + "legendLink": null + }, + { + "expr": "min(container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", container=~\"bloom-gateway\"} > 0)", + "format": "time_series", + "legendFormat": "limit", + "legendLink": null + } + ], + "title": "Memory (workingset)", + "tooltip": { + "sort": 2 + }, + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "bytes" + }, + "overrides": [ ] + }, + "gridPos": { }, + "id": 21, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by(pod) (go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\"})", + "format": "time_series", + "legendFormat": "{{pod}}", + "legendLink": null + } + ], + "title": "Memory (go heap inuse)", + "tooltip": { + "sort": 2 + }, + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "gridPos": { }, + "id": 22, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "format": "time_series", + "legendFormat": "{{pod}} - {{device}}", + "legendLink": null + } + ], + "title": "Disk Writes", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "Bps" + }, + "overrides": [ ] + }, + "gridPos": { }, + "id": 23, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": 
"single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n", + "format": "time_series", + "legendFormat": "{{pod}} - {{device}}", + "legendLink": null + } + ], + "title": "Disk Reads", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "percentunit" + }, + "overrides": [ ] + }, + "gridPos": { }, + "id": 24, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "expr": "max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"} / kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{cluster=~\"$cluster\", namespace=~\"$namespace\",label_name=~\"bloom-gateway.*\"})", + "format": "time_series", + "legendFormat": "{{persistentvolumeclaim}}", + "legendLink": null + } + ], + "title": "Disk Space Utilization", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Bloom Gateway", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "request" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FFC000", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E02F44", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "id": 25, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, "span": 4, "targets": [ { @@ -1530,7 +1936,7 @@ } ] }, - "id": 20, + "id": 26, "links": [ ], "options": { "legend": { @@ -1592,7 +1998,7 @@ }, "overrides": [ ] }, - "id": 21, + "id": 27, "links": [ ], "options": { "legend": { @@ -1655,7 +2061,7 @@ "overrides": [ ] }, "gridPos": { }, - "id": 22, + "id": 28, "links": [ ], "options": { "legend": { @@ -1741,7 +2147,7 @@ ] }, "gridPos": { }, - "id": 23, + "id": 29, "links": [ ], "options": { "legend": { @@ -1842,7 +2248,7 @@ ] }, "gridPos": { }, - "id": 24, + "id": 30, "links": [ ], "options": { "legend": { @@ -1904,7 +2310,7 @@ "overrides": [ ] }, "gridPos": { }, - "id": 25, + "id": 31, "links": [ ], "options": { "legend": { 
diff --git a/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-reads.json b/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-reads.json index c8cda840..99692299 100644 --- a/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-reads.json +++ b/monitoring-mixins/loki-mixin/microservices-mode/dashboards_out/loki-reads.json @@ -215,7 +215,7 @@ "stack": true, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -263,19 +263,19 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", 
route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}) by (route) ", "format": "time_series", "legendFormat": "{{ route }} Average", "refId": "C" @@ -341,7 +341,7 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval])) by (le,pod)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}[$__rate_interval])) by (le,pod)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -554,7 +554,7 @@ "stack": true, "targets": [ { - "expr": "sum by (status) (\n 
label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -602,19 +602,19 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", 
route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}) by (route) ", "format": "time_series", "legendFormat": "{{ route }} Average", "refId": "C" @@ -680,7 +680,7 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values\"}[$__rate_interval])) by (le,pod)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=~\"(api_prom_rules|api_prom_rules_namespace_groupname|api_v1_rules|loki_api_v1_delete|loki_api_v1_detected_labels|loki_api_v1_index_stats|loki_api_v1_index_volume|loki_api_v1_index_volume_range|loki_api_v1_label_name_values|loki_api_v1_label_values|loki_api_v1_labels|loki_api_v1_patterns|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_series|otlp_v1_logs|prometheus_api_v1_rules)\"}[$__rate_interval])) by (le,pod)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -893,7 +893,7 @@ "stack": true, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/ingester\", 
route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -941,19 +941,19 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 50th percentile", "refId": "B" 
}, { - "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) ", "format": "time_series", "legendFormat": "{{ route }} Average", "refId": "C" @@ -1019,7 +1019,7 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (le,pod)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester\", 
route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval])) by (le,pod)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1232,7 +1232,7 @@ "stack": true, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -1280,19 +1280,19 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", 
route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 99th percentile", "refId": "A" }, { - "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"})) * 1e3", + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", "format": "time_series", "legendFormat": "{{ route }} 50th percentile", "refId": "B" }, { - "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}) by (route) ", + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by 
(route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) ", "format": "time_series", "legendFormat": "{{ route }} Average", "refId": "C" @@ -1358,7 +1358,7 @@ "span": 4, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (le,pod)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester-zone.*\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval])) by (le,pod)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -1569,6 +1569,684 @@ }, "span": 4, "stack": true, + "targets": [ + { + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/index-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "QPS", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": 
"line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 14, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", + "format": "time_series", + "legendFormat": "{{ route }} 99th percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", + "format": "time_series", + "legendFormat": "{{ route }} 50th percentile", + "refId": "B" + }, + { + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", 
route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) ", + "format": "time_series", + "legendFormat": "{{ route }} Average", + "refId": "C" + } + ], + "title": "Latency", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 15, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/index-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval])) by (le,pod)) * 1e3", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "__auto", + "refId": "A", + "step": 10 + } + ], + "title": "Per Pod Latency (p99)", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Index Gateway", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", + "error": "#E24D42", + "success": "#7EB26D" + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ + { + 
"matcher": { + "id": "byName", + "options": "1xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "2xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "3xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "4xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "5xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "OK" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cancel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#A9A9A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + } + ] + }, + "fill": 10, + "id": 16, + "linewidth": 0, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "stack": true, + "targets": [ + { + "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "format": "time_series", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "QPS", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 17, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": 
"histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", + "format": "time_series", + "legendFormat": "{{ route }} 99th percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"})) * 1e3", + "format": "time_series", + "legendFormat": "{{ route }} 50th percentile", + "refId": "B" + }, + { + "expr": "1e3 * sum(cluster_job_route:loki_request_duration_seconds_sum:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) / sum(cluster_job_route:loki_request_duration_seconds_count:sum_rate{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", 
route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}) by (route) ", + "format": "time_series", + "legendFormat": "{{ route }} Average", + "refId": "C" + } + ], + "title": "Latency", + "type": "timeseries", + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "id": 18, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/bloom-gateway\", route=~\"(/base.Ruler/Rules|/indexgatewaypb.IndexGateway/GetChunkRef|/indexgatewaypb.IndexGateway/GetSeries|/indexgatewaypb.IndexGateway/GetShards|/indexgatewaypb.IndexGateway/GetStats|/indexgatewaypb.IndexGateway/GetVolume|/indexgatewaypb.IndexGateway/LabelNamesForMetricName|/indexgatewaypb.IndexGateway/LabelValuesForMetricName|/indexgatewaypb.IndexGateway/QueryIndex|/logproto.BloomGateway/FilterChunkRefs|/logproto.Pattern/Query|/logproto.Querier/GetChunkIDs|/logproto.Querier/GetDetectedLabels|/logproto.Querier/GetStats|/logproto.Querier/GetVolume|/logproto.Querier/Label|/logproto.Querier/Query|/logproto.Querier/QuerySample|/logproto.Querier/Series|/logproto.StreamData/GetStreamRates)\"}[$__rate_interval])) by (le,pod)) * 1e3", + "format": "time_series", + "interval": "1m", + "intervalFactor": 2, + "legendFormat": "__auto", + "refId": "A", + "step": 10 + } + ], + "title": "Per Pod Latency (p99)", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Bloom Gateway", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + "1xx": "#EAB839", + "2xx": "#7EB26D", + "3xx": "#6ED0E0", + "4xx": "#EF843C", + "5xx": "#E24D42", + "OK": "#7EB26D", + "cancel": "#A9A9A9", + "error": "#E24D42", + "success": "#7EB26D" + }, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ + { + 
"matcher": { + "id": "byName", + "options": "1xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "2xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "3xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "4xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "5xx" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "OK" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cancel" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#A9A9A9", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "error" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + } + ] + }, + "fill": 10, + "id": 19, + "linewidth": 0, + "links": [ ], + "options": { + "legend": { + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "span": 4, + "stack": true, "targets": [ { "expr": "sum by (status) (\n label_replace(label_replace(rate(loki_index_request_duration_seconds_count{cluster=~\"$cluster\",job=~\"($namespace)/querier\", operation!=\"index_chunk\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", @@ -1604,7 +2282,7 @@ }, "overrides": [ ] }, - "id": 14, + "id": 20, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -1682,7 +2360,7 @@ }, "overrides": [ ] }, - "id": 15, + "id": 21, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -1714,7 +2392,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Index", + "title": "TSBD Index", "titleSize": "h6" }, { @@ -1894,7 +2572,7 @@ ] }, "fill": 10, - "id": 16, + "id": 22, "linewidth": 0, "links": [ ], "options": { @@ -1943,7 +2621,7 @@ }, "overrides": [ ] }, - "id": 17, + "id": 23, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -2021,7 +2699,7 @@ }, "overrides": [ ] }, - "id": 18, + "id": 24, "links": [ ], "nullPointMode": "null as zero", "options": { @@ -2053,7 +2731,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "BoltDB Shipper", + "title": "BoltDB Index", "titleSize": "h6" } ], diff --git a/monitoring-mixins/loki-mixin/microservices-mode/loki-mixin-alerts.yaml b/monitoring-mixins/loki-mixin/microservices-mode/loki-mixin-alerts.yaml index af06880c..7c0825d8 100644 --- a/monitoring-mixins/loki-mixin/microservices-mode/loki-mixin-alerts.yaml +++ b/monitoring-mixins/loki-mixin/microservices-mode/loki-mixin-alerts.yaml @@ -3,8 +3,9 @@ groups: rules: - alert: LokiRequestErrors annotations: - message: | + description: | {{ $labels.job }} {{ $labels.route 
}} is experiencing {{ printf "%.2f" $value }}% errors. + summary: Loki request error rate is high. expr: | 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) / @@ -15,16 +16,18 @@ groups: severity: critical - alert: LokiRequestPanics annotations: - message: | + description: | {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + summary: Loki requests are causing code panics. expr: | sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 labels: severity: critical - alert: LokiRequestLatency annotations: - message: | + description: | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + summary: Loki request error latency is high. expr: | cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 for: 15m @@ -32,8 +35,9 @@ groups: severity: critical - alert: LokiTooManyCompactorsRunning annotations: - message: | + description: | {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. + summary: Loki deployment is running more than one compactor. expr: | sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 for: 5m diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet index cc43f483..f4a9745a 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet @@ -456,18 +456,20 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, ], + httpStatusColors:: { + '1xx': '#EAB839', + '2xx': '#7EB26D', + '3xx': '#6ED0E0', + '4xx': '#EF843C', + '5xx': '#E24D42', + OK: '#7EB26D', + success: '#7EB26D', + 'error': '#E24D42', + cancel: '#A9A9A9', + }, + qpsPanel(selector, statusLabelName='status_code'):: { - aliasColors: { - '1xx': '#EAB839', - '2xx': '#7EB26D', - '3xx': '#6ED0E0', - '4xx': '#EF843C', - '5xx': '#E24D42', - OK: '#7EB26D', - success: '#7EB26D', - 'error': '#E24D42', - cancel: '#A9A9A9', - }, + aliasColors: $.httpStatusColors, targets: [ { expr: @@ -484,6 +486,65 @@ local utils = import 'mixin-utils/utils.libsonnet'; ], } + $.stack, + // Assumes that the metricName is for a histogram (as opposed to qpsPanel above) + // Assumes that there is a dashboard variable named latency_metrics, values are -1 (native) or 1 (classic) + qpsPanelNativeHistogram(metricName, selector, statusLabelName='status_code'):: { + local sumByStatus(nativeClassicQuery) = { + local template = + ||| + sum by (status) ( + label_replace(label_replace(%(metricQuery)s, + "status", "${1}xx", "%(label)s", "([0-9]).."), + "status", "${1}", "%(label)s", "([a-zA-Z]+)")) + |||, + native: template % { metricQuery: nativeClassicQuery.native, label: statusLabelName }, + classic: template % { metricQuery: nativeClassicQuery.classic, label: statusLabelName }, + }, + fieldConfig+: { + defaults+: { + custom+: { + lineWidth: 0, + fillOpacity: 100, // Get solid fill. 
+ stacking: { + mode: 'normal', + group: 'A', + }, + }, + unit: 'reqps', + min: 0, + }, + overrides+: [{ + matcher: { + id: 'byName', + options: status, + }, + properties: [ + { + id: 'color', + value: { + mode: 'fixed', + fixedColor: $.httpStatusColors[status], + }, + }, + ], + } for status in std.objectFieldsAll($.httpStatusColors)], + }, + targets: [ + { + expr: utils.showClassicHistogramQuery(sumByStatus(utils.nativeClassicHistogramCountRate(metricName, selector))), + format: 'time_series', + legendFormat: '{{status}}', + refId: 'A_classic', + }, + { + expr: utils.showNativeHistogramQuery(sumByStatus(utils.nativeClassicHistogramCountRate(metricName, selector))), + format: 'time_series', + legendFormat: '{{status}}', + refId: 'A', + }, + ], + } + $.stack, + latencyPanel(metricName, selector, multiplier='1e3'):: { nullPointMode: 'null as zero', targets: [ @@ -509,6 +570,58 @@ yaxes: $.yaxes('ms'), }, + // Assumes that there is a dashboard variable named latency_metrics, values are -1 (native) or 1 (classic) + latencyPanelNativeHistogram(metricName, selector, multiplier='1e3'):: { + nullPointMode: 'null as zero', + fieldConfig+: { + defaults+: { + custom+: { + fillOpacity: 10, + }, + unit: 'ms', + }, + }, + targets: [ + { + expr: utils.showNativeHistogramQuery(utils.nativeClassicHistogramQuantile('0.99', metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: '99th percentile', + refId: 'A', + }, + { + expr: utils.showClassicHistogramQuery(utils.nativeClassicHistogramQuantile('0.99', metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: '99th percentile', + refId: 'A_classic', + }, + { + expr: utils.showNativeHistogramQuery(utils.nativeClassicHistogramQuantile('0.50', metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: '50th percentile', + refId: 'B', + }, + { + expr: utils.showClassicHistogramQuery(utils.nativeClassicHistogramQuantile('0.50', metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: '50th percentile', + refId: 'B_classic', + }, + { + expr: utils.showNativeHistogramQuery(utils.nativeClassicHistogramAverageRate(metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: 'Average', + refId: 'C', + }, + { + expr: utils.showClassicHistogramQuery(utils.nativeClassicHistogramAverageRate(metricName, selector, multiplier=multiplier)), + format: 'time_series', + legendFormat: 'Average', + refId: 'C_classic', + }, + ], + yaxes: $.yaxes('ms'), + }, + selector:: { eq(label, value):: { label: label, op: '=', value: value }, neq(label, value):: { label: label, op: '!=', value: value }, diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet index d669aa55..7d042989 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/jsonnet-libs/mixin-utils/utils.libsonnet @@ -3,84 +3,129 @@ local g = import 'grafana-builder/grafana.libsonnet'; { // The nativeClassicHistogramQuantile function is used to calculate histogram quantiles from native histograms or classic histograms. // Metric name should be provided without _bucket suffix.
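For orientation, each of these helpers returns a pair of PromQL strings under the keys classic and native. A minimal sketch of what the quantile helper renders with its default arguments (assuming the vendored import path; the metric and selector here are illustrative, not taken from this diff):

local utils = import 'mixin-utils/utils.libsonnet';
local q = utils.nativeClassicHistogramQuantile('0.99', 'loki_request_duration_seconds', 'cluster=~"$cluster"');
{
  // histogram_quantile(0.99, sum by (le) (rate(loki_request_duration_seconds_bucket{cluster=~"$cluster"}[$__rate_interval])))
  classic: q.classic,
  // histogram_quantile(0.99, sum (rate(loki_request_duration_seconds{cluster=~"$cluster"}[$__rate_interval])))
  native: q.native,
}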
- nativeClassicHistogramQuantile(percentile, metric, selector, sum_by=[], rate_interval='$__rate_interval', multiplier=''):: + // If from_recording is true, the function will assume :sum_rate metric suffix and no rate needed. + nativeClassicHistogramQuantile(percentile, metric, selector, sum_by=[], rate_interval='$__rate_interval', multiplier='', from_recording=false):: local classicSumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', ['le'] + sum_by) } else ' by (le) '; local nativeSumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(',', sum_by) } else ' '; local multiplierStr = if multiplier == '' then '' else ' * %s' % multiplier; + local rateOpen = if from_recording then '' else 'rate('; + local rateClose = if from_recording then '' else '[%s])' % rate_interval; { - classic: 'histogram_quantile(%(percentile)s, sum%(classicSumBy)s(rate(%(metric)s_bucket{%(selector)s}[%(rateInterval)s])))%(multiplierStr)s' % { + classic: 'histogram_quantile(%(percentile)s, sum%(classicSumBy)s(%(rateOpen)s%(metric)s_bucket%(suffix)s{%(selector)s}%(rateClose)s))%(multiplierStr)s' % { classicSumBy: classicSumBy, metric: metric, multiplierStr: multiplierStr, percentile: percentile, rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, selector: selector, + suffix: if from_recording then ':sum_rate' else '', }, - native: 'histogram_quantile(%(percentile)s, sum%(nativeSumBy)s(rate(%(metric)s{%(selector)s}[%(rateInterval)s])))%(multiplierStr)s' % { + native: 'histogram_quantile(%(percentile)s, sum%(nativeSumBy)s(%(rateOpen)s%(metric)s%(suffix)s{%(selector)s}%(rateClose)s))%(multiplierStr)s' % { metric: metric, multiplierStr: multiplierStr, nativeSumBy: nativeSumBy, percentile: percentile, rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, selector: selector, + suffix: if from_recording then ':sum_rate' else '', }, }, // The nativeClassicHistogramSumRate function is used to calculate the histogram sum of rate from native histograms or classic histograms. // Metric name should be provided without _sum suffix. - nativeClassicHistogramSumRate(metric, selector, rate_interval='$__rate_interval'):: + // If from_recording is true, the function will assume :sum_rate metric suffix and no rate needed. + nativeClassicHistogramSumRate(metric, selector, rate_interval='$__rate_interval', from_recording=false):: + local rateOpen = if from_recording then '' else 'rate('; + local rateClose = if from_recording then '' else '[%s])' % rate_interval; { - classic: 'rate(%(metric)s_sum{%(selector)s}[%(rateInterval)s])' % { + classic: '%(rateOpen)s%(metric)s_sum%(suffix)s{%(selector)s}%(rateClose)s' % { metric: metric, rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, selector: selector, + suffix: if from_recording then ':sum_rate' else '', }, - native: 'histogram_sum(rate(%(metric)s{%(selector)s}[%(rateInterval)s]))' % { + native: 'histogram_sum(%(rateOpen)s%(metric)s%(suffix)s{%(selector)s}%(rateClose)s)' % { metric: metric, rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, selector: selector, + suffix: if from_recording then ':sum_rate' else '', }, }, // The nativeClassicHistogramCountRate function is used to calculate the histogram count of rate from native histograms or classic histograms. // Metric name should be provided without _count suffix.
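To make the from_recording switch concrete, here is a sketch of the count helper pointed at a pre-aggregated recording rule (the rule name is illustrative, not taken from this diff): the rate() wrapper and range selector are dropped, and the :sum_rate suffix is appended instead.

local utils = import 'mixin-utils/utils.libsonnet';
local c = utils.nativeClassicHistogramCountRate('cluster_job:loki_request_duration_seconds', 'cluster=~"$cluster"', from_recording=true);
{
  // cluster_job:loki_request_duration_seconds_count:sum_rate{cluster=~"$cluster"}
  classic: c.classic,
  // histogram_count(cluster_job:loki_request_duration_seconds:sum_rate{cluster=~"$cluster"})
  native: c.native,
}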
- nativeClassicHistogramCountRate(metric, selector, rate_interval='$__rate_interval'):: + // If from_recording is true, the function will assume :sum_rate metric suffix and no rate needed. + nativeClassicHistogramCountRate(metric, selector, rate_interval='$__rate_interval', from_recording=false):: + local rateOpen = if from_recording then '' else 'rate('; + local rateClose = if from_recording then '' else '[%s])' % rate_interval; { - classic: 'rate(%(metric)s_count{%(selector)s}[%(rateInterval)s])' % { + classic: '%(rateOpen)s%(metric)s_count%(suffix)s{%(selector)s}%(rateClose)s' % { metric: metric, rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, selector: selector, + suffix: if from_recording then ':sum_rate' else '', }, - native: 'histogram_count(rate(%(metric)s{%(selector)s}[%(rateInterval)s]))' % { + native: 'histogram_count(%(rateOpen)s%(metric)s%(suffix)s{%(selector)s}%(rateClose)s)' % { metric: metric, rateInterval: rate_interval, + rateOpen: rateOpen, + rateClose: rateClose, selector: selector, + suffix: if from_recording then ':sum_rate' else '', }, }, // TODO(krajorama) Switch to histogram_avg function for native histograms later. - nativeClassicHistogramAverageRate(metric, selector, rate_interval='$__rate_interval', multiplier=''):: + // nativeClassicHistogramAverageRate function is used to calculate the histogram average rate from native histograms or classic histograms. + // If from_recording is true, the function will assume :sum_rate metric suffix and no rate needed. + nativeClassicHistogramAverageRate(metric, selector, rate_interval='$__rate_interval', multiplier='', from_recording=false):: local multiplierStr = if multiplier == '' then '' else '%s * ' % multiplier; { classic: ||| %(multiplier)ssum(%(sumMetricQuery)s) / sum(%(countMetricQuery)s) ||| % { - sumMetricQuery: $.nativeClassicHistogramSumRate(metric, selector, rate_interval).classic, - countMetricQuery: $.nativeClassicHistogramCountRate(metric, selector, rate_interval).classic, + sumMetricQuery: $.nativeClassicHistogramSumRate(metric, selector, rate_interval, from_recording).classic, + countMetricQuery: $.nativeClassicHistogramCountRate(metric, selector, rate_interval, from_recording).classic, multiplier: multiplierStr, }, native: ||| %(multiplier)ssum(%(sumMetricQuery)s) / sum(%(countMetricQuery)s) ||| % { - sumMetricQuery: $.nativeClassicHistogramSumRate(metric, selector, rate_interval).native, - countMetricQuery: $.nativeClassicHistogramCountRate(metric, selector, rate_interval).native, + sumMetricQuery: $.nativeClassicHistogramSumRate(metric, selector, rate_interval, from_recording).native, + countMetricQuery: $.nativeClassicHistogramCountRate(metric, selector, rate_interval, from_recording).native, multiplier: multiplierStr, }, }, + nativeClassicSumBy(query, sum_by=[], multiplier=''):: + local sumBy = if std.length(sum_by) > 0 then ' by (%(lbls)s) ' % { lbls: std.join(', ', sum_by) } else ' '; + local multiplierStr = if multiplier == '' then '' else ' * %s' % multiplier; + { + classic: 'sum%(sumBy)s(%(query)s)%(multiplierStr)s' % { + multiplierStr: multiplierStr, + query: query.classic, + sumBy: sumBy, + }, + native: 'sum%(sumBy)s(%(query)s)%(multiplierStr)s' % { + multiplierStr: multiplierStr, + query: query.native, + sumBy: sumBy, + }, + }, + // showClassicHistogramQuery wraps a query defined as map {classic: q, native: q}, and compares the classic query // to dashboard variable which should take -1 or +1 as values in order to hide or show the classic query. 
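The -1/+1 convention works because each query is compared against the dashboard variable multiplied by infinity: x < +Inf keeps every sample and x < -Inf keeps none, so exactly one of the two query flavours survives. A minimal sketch (showNativeHistogramQuery is assumed to mirror the classic definition below with the sign flipped; the queries are placeholders):

local utils = import 'mixin-utils/utils.libsonnet';
local q = { classic: 'classic_query', native: 'native_query' };
{
  // classic_query < ($latency_metrics * +Inf), i.e. hidden when $latency_metrics is -1
  classic: utils.showClassicHistogramQuery(q),
  // assumed: native_query < ($latency_metrics * -Inf), i.e. hidden when $latency_metrics is 1
  native: utils.showNativeHistogramQuery(q),
}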
showClassicHistogramQuery(query, dashboard_variable='latency_metrics'):: '%s < ($%s * +Inf)' % [query.classic, dashboard_variable], @@ -106,7 +151,7 @@ }, { record: '%(labels_underscore)s:%(metric)s:avg' % vars, - expr: 'sum(rate(%(metric)s_sum[1m])) by (%(labels_comma)s) / sum(rate(%(metric)s_count[%(interval)s])) by (%(labels_comma)s)' % vars, + expr: 'sum(rate(%(metric)s_sum[%(interval)s])) by (%(labels_comma)s) / sum(rate(%(metric)s_count[%(interval)s])) by (%(labels_comma)s)' % vars, }, { record: '%(labels_underscore)s:%(metric)s_bucket:sum_rate' % vars, @@ -205,12 +250,72 @@ noop(label):: { label: label, op: 'nop' }, }, - toPrometheusSelector(selector):: + // latencyRecordingRulePanelNativeHistogram - build a latency panel for a recording rule. + // - metric: the base metric name (middle part of recording rule name) + // - selectors: list of selectors which will be added to the first part of + // recording rule name, and to the query selector itself. + // - extra_selectors (optional): list of selectors which will be added to the + // query selector, but not to the beginning of the recording rule name. + // Useful for external labels. + // - multiplier (optional): assumes results are in seconds, will multiply + // by 1e3 to get ms. Can be turned off. + // - sum_by (optional): additional labels to use in the sum by clause, will also be used in the legend + latencyRecordingRulePanelNativeHistogram(metric, selectors, extra_selectors=[], multiplier='1e3', sum_by=[]):: + local labels = std.join('_', [matcher.label for matcher in selectors]); + local legend = std.join('', ['{{ %(lb)s }} ' % lb for lb in sum_by]); + local metricStr = '%(labels)s:%(metric)s' % { labels: labels, metric: metric }; + local selectorStr = $.toPrometheusSelectorNaked(selectors + extra_selectors); + { + nullPointMode: 'null as zero', + yaxes: g.yaxes('ms'), + targets: [ + { + expr: $.showClassicHistogramQuery($.nativeClassicHistogramQuantile('0.99', metricStr, selectorStr, sum_by=sum_by, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)s99th percentile' % legend, + refId: 'A_classic', + }, + { + expr: $.showNativeHistogramQuery($.nativeClassicHistogramQuantile('0.99', metricStr, selectorStr, sum_by=sum_by, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)s99th percentile' % legend, + refId: 'A_native', + }, + { + expr: $.showClassicHistogramQuery($.nativeClassicHistogramQuantile('0.50', metricStr, selectorStr, sum_by=sum_by, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)s50th percentile' % legend, + refId: 'B_classic', + }, + { + expr: $.showNativeHistogramQuery($.nativeClassicHistogramQuantile('0.50', metricStr, selectorStr, sum_by=sum_by, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)s50th percentile' % legend, + refId: 'B_native', + }, + { + expr: $.showClassicHistogramQuery($.nativeClassicHistogramAverageRate(metricStr, selectorStr, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)sAverage' % legend, + refId: 'C_classic', + }, + { + expr: $.showNativeHistogramQuery($.nativeClassicHistogramAverageRate(metricStr, selectorStr, multiplier=multiplier, from_recording=true)), + format: 'time_series', + legendFormat: '%(legend)sAverage' % legend, + refId: 'C_native', + },
 + ],
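// Example (illustrative, not part of this change): a call such as
//   utils.latencyRecordingRulePanelNativeHistogram('loki_request_duration_seconds',
//     [utils.selector.eq('cluster', '$cluster'), utils.selector.eq('job', 'loki/querier')], sum_by=['route'])
// joins the selector labels into 'cluster_job', so the classic p99 target reads the recording rule
// cluster_job:loki_request_duration_seconds_bucket:sum_rate{cluster="$cluster", job="loki/querier"},
// the same shape as the cluster_job_route:loki_request_duration_seconds_bucket:sum_rate series queried by the dashboards above.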
+ }, + + toPrometheusSelectorNaked(selector):: local pairs = [ '%(label)s%(op)s"%(value)s"' % matcher for matcher in std.filter(function(matcher) matcher.op != 'nop', selector) ]; - '{%s}' % std.join(', ', pairs), + '%s' % std.join(', ', pairs), + + toPrometheusSelector(selector):: '{%s}' % $.toPrometheusSelectorNaked(selector), // withRunbookURL - Add/Override the runbook_url annotations for all alerts inside a list of rule groups. // - url_format: a URL format for the runbook, the alert name will be substituted in the URL. diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/alerts.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/alerts.libsonnet index 0045cc19..5bff18e7 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/alerts.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/alerts.libsonnet @@ -17,7 +17,8 @@ severity: 'critical', }, annotations: { - message: ||| + summary: 'Loki request error rate is high.', + description: ||| {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. |||, }, @@ -31,7 +32,8 @@ severity: 'critical', }, annotations: { - message: ||| + summary: 'Loki requests are causing code panics.', + description: ||| {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase in panics. |||, }, @@ -46,7 +48,8 @@ severity: 'critical', }, annotations: { - message: ||| + summary: 'Loki request error latency is high.', + description: ||| {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. |||, }, @@ -54,16 +57,17 @@ { alert: 'LokiTooManyCompactorsRunning', expr: ||| - sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 - |||, + sum(loki_boltdb_shipper_compactor_running) by (namespace, %s) > 1 + ||| % $._config.per_cluster_label, 'for': '5m', labels: { severity: 'warning', }, annotations: { - message: ||| + summary: 'Loki deployment is running more than one compactor.', + description: std.strReplace(||| {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. - |||, + |||, 'cluster', $._config.per_cluster_label), }, }, ], diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/config.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/config.libsonnet index 1fa22f56..48e75865 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/config.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/config.libsonnet @@ -31,5 +31,10 @@ // The prefix used to match the write and read pods on SSD mode.
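Note on the mixin-utils hunks above: splitting toPrometheusSelectorNaked out of toPrometheusSelector lets one matcher list serve two purposes — the label names become the first segment of the recording-rule metric name, while the rendered matchers (without braces) can be embedded into any selector string. A standalone sketch in plain jsonnet of how the helper assembles these pieces; the matcher values are illustrative, not taken from this diff:

local selectors = [
  { label: 'cluster', op: '=', value: '$cluster' },
  { label: 'job', op: '=~', value: '$namespace/querier' },
];

// Mirrors std.join('_', ...) in the helper: label names form the
// recording-rule name prefix.
local labels = std.join('_', [m.label for m in selectors]);

// Mirrors toPrometheusSelectorNaked: matchers rendered without wrapping
// braces, so the string can be spliced into larger selectors.
local naked = std.join(', ', ['%(label)s%(op)s"%(value)s"' % m for m in selectors]);

{
  metricStr: '%s:%s' % [labels, 'loki_request_duration_seconds'],  // "cluster_job:loki_request_duration_seconds"
  selectorNaked: naked,                                            // 'cluster="$cluster", job=~"$namespace/querier"'
  selectorWrapped: '{%s}' % naked,                                 // what toPrometheusSelector returns
}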
pod_prefix_matcher: '(loki|enterprise-logs)', }, + + // Meta-monitoring related configuration + meta_monitoring: { + enabled: false, + }, }, } diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/dashboard-loki-logs.json b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/dashboard-loki-logs.json index bcb5737a..72b8565e 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/dashboard-loki-logs.json +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/dashboard-loki-logs.json @@ -116,6 +116,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -238,7 +243,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[$__rate_interval]))", "refId": "A" } ], @@ -289,6 +294,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "bytes" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -375,6 +385,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -410,7 +425,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[$__rate_interval]))", "refId": "A" } ], @@ -461,6 +476,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -496,7 +516,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[$__rate_interval]))", "refId": "A" } ], @@ -634,6 +654,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -669,7 +694,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)", + "expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[$__rate_interval])) by (level)", "legendFormat": "{{level}}", "refId": "A" } @@ -721,6 +746,11 @@ "dashLength": 10, "dashes": false, "datasource": "$loki_datasource", + "fieldConfig": { + "defaults": { + 
"unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -773,7 +803,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" [5m])) by (level)", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" | __error__=\"\" [$__auto])) by (level)", "intervalFactor": 3, "legendFormat": "{{level}}", "refId": "A" @@ -862,8 +892,8 @@ "10s", "30s", "1m", - "5m", - "15m", + "$__rate_interval", + "1$__rate_interval", "30m", "1h", "2h", diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/dashboard-loki-operational.json b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/dashboard-loki-operational.json index 2dd944c2..3f215c2e 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/dashboard-loki-operational.json +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/dashboard-loki-operational.json @@ -90,7 +90,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"$namespace/cortex-gw(-internal)?\", route=~\"api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\")\n)", + "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"$namespace/cortex-gw(-internal)?\", route=~\"api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\")\n)", "legendFormat": "{{status}}", "refId": "A" } @@ -185,7 +185,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"$namespace/cortex-gw(-internal)?\", route=~\"api_prom_push|loki_api_v1_push\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", + "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"$namespace/cortex-gw(-internal)?\", route=~\"api_prom_push|loki_api_v1_push\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", "legendFormat": "{{status}}", "refId": "A" } @@ -239,7 +239,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": {} + "custom": {}, + "unit": "ops" }, "overrides": [] }, @@ -374,7 +375,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant))", + "expr": "topk(10, sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", 
namespace=\"$namespace\"}[$__rate_interval])) by (tenant))", "legendFormat": "{{tenant}}", "refId": "A" } @@ -428,7 +429,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": {} + "custom": {}, + "unit": "MBs" }, "overrides": [] }, @@ -469,7 +471,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant)) / 1024 / 1024", + "expr": "topk(10, sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (tenant)) / 1024 / 1024", "legendFormat": "{{tenant}}", "refId": "A" } @@ -618,7 +620,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": {} + "custom": {}, + "unit": "s" }, "overrides": [] }, @@ -723,7 +726,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": {} + "custom": {}, + "unit": "s" }, "overrides": [] }, @@ -925,7 +929,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": {} + "custom": {}, + "unit": "s" }, "overrides": [] }, @@ -1129,7 +1134,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": {} + "custom": {}, + "unit": "s" }, "overrides": [] }, @@ -1236,7 +1242,8 @@ "datasource": "$datasource", "fieldConfig": { "defaults": { - "custom": {} + "custom": {}, + "unit": "s" }, "overrides": [] }, @@ -1439,7 +1446,8 @@ "description": "", "fieldConfig": { "defaults": { - "custom": {} + "custom": {}, + "unit": "s" }, "overrides": [] }, @@ -1685,7 +1693,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10,sum by (tenant, reason) (rate(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[1m])))", + "expr": "topk(10,sum by (tenant, reason) (rate(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])))", "interval": "", "legendFormat": "{{ tenant }} - {{ reason }}", "refId": "A" @@ -1809,7 +1817,7 @@ ], "targets": [ { - "expr": "topk(10, sum by (tenant, reason) (sum_over_time(increase(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[1m])[$__range:1m])))", + "expr": "topk(10, sum by (tenant, reason) (sum_over_time(increase(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[$__rate_interval])[$__range:$__rate_interval])))", "format": "table", "instant": true, "interval": "", @@ -1844,6 +1852,7 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fill": 1, "fillGradient": 0, "gridPos": { @@ -1932,6 +1941,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2063,7 +2077,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/distributor\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/distributor\"} | logfmt | level=\"error\"[$__auto]))", "refId": "A" } ], @@ -2229,6 +2243,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2264,7 +2283,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": 
"sum(rate(loki_distributor_ingester_append_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2317,6 +2336,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2352,7 +2376,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2405,6 +2429,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2440,7 +2469,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2596,6 +2625,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -2727,7 +2761,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\"} | logfmt | level=\"error\"[$__auto]))", "refId": "A" } ], @@ -3035,7 +3069,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum by (tenant) (rate(loki_ingester_streams_created_total{cluster=\"$cluster\",job=\"$namespace/ingester\"}[1m]) > 0))", + "expr": "topk(10, sum by (tenant) (rate(loki_ingester_streams_created_total{cluster=\"$cluster\",job=\"$namespace/ingester\"}[$__rate_interval]) > 0))", "interval": "", "legendFormat": "{{ tenant }}", "refId": "A" @@ -3143,13 +3177,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_ingester_chunks_flushed_total{cluster=\"$cluster\",job=\"$namespace/ingester\"}[1m]))", + "expr": "sum(rate(loki_ingester_chunks_flushed_total{cluster=\"$cluster\",job=\"$namespace/ingester\"}[$__rate_interval]))", "interval": "", "legendFormat": "Chunks", "refId": "A" }, { - "expr": "sum(increase(loki_chunk_store_deduped_chunks_total{cluster=\"$cluster\", job=\"$namespace/ingester\"}[1m]))/sum(increase(loki_ingester_chunks_flushed_total{cluster=\"$cluster\", job=\"$namespace/ingester\"}[1m])) < 1", + "expr": "sum(increase(loki_chunk_store_deduped_chunks_total{cluster=\"$cluster\", job=\"$namespace/ingester\"}[$__rate_interval]))/sum(increase(loki_ingester_chunks_flushed_total{cluster=\"$cluster\", job=\"$namespace/ingester\"}[$__rate_interval])) < 1", "interval": "", "legendFormat": "De-Dupe Ratio", "refId": "B" @@ -3226,7 +3260,7 @@ "reverseYBuckets": false, "targets": [ { - "expr": "sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\",job=\"$namespace/ingester\"}[1m])) by (le)", + "expr": "sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\",job=\"$namespace/ingester\"}[$__rate_interval])) by (le)", "format": "heatmap", 
"instant": false, "interval": "", @@ -3379,7 +3413,7 @@ "reverseYBuckets": false, "targets": [ { - "expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=\"$cluster\", job=\"$namespace/ingester\"}[1m]))", + "expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=\"$cluster\", job=\"$namespace/ingester\"}[$__rate_interval]))", "format": "heatmap", "instant": false, "interval": "", @@ -3522,6 +3556,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "binBps" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -3653,7 +3692,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\"} | logfmt | level=\"error\"[$__auto]))", "refId": "A" } ], @@ -3834,6 +3873,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -3872,19 +3916,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (method, name, le, container))", "intervalFactor": 1, "legendFormat": "{{container}}: .99-{{method}}-{{name}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.9, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (method, name, le, container))", "hide": false, "legendFormat": "{{container}}: .9-{{method}}-{{name}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.5, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (method, name, le, container))", "hide": false, "legendFormat": "{{container}}: .5-{{method}}-{{name}}", "refId": "C" @@ -3975,7 +4019,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_memcache_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, method, name, container)", + "expr": "sum(rate(loki_memcache_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, method, name, container)", "intervalFactor": 1, "legendFormat": "{{container}}: {{status_code}}-{{method}}-{{name}}", "refId": "A" @@ -4043,6 +4087,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4081,19 +4130,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, 
sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -4146,6 +4195,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4184,7 +4238,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, status_code, method)", + "expr": "sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, status_code, method)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -4252,6 +4306,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4288,17 +4347,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (operation, le))", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (operation, le))", "refId": "C" } ], @@ -4349,6 +4408,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill":
1, "fillGradient": 0, "gridPos": { @@ -4385,20 +4449,20 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (operation, le))", "interval": "", "intervalFactor": 1, "legendFormat": "99%", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "90%", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "50%", "refId": "C" @@ -4451,6 +4515,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4487,20 +4556,20 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (operation, le))", "interval": "", "intervalFactor": 1, "legendFormat": "99%", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "90%", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (operation, le))", "interval": "", "legendFormat": "50%", "refId": "C" @@ -4553,6 +4622,11 @@ 
"dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4589,17 +4663,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".9", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (operation, le))", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (operation, le))", "refId": "C" } ], @@ -4650,6 +4724,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4686,7 +4765,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4739,6 +4818,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4775,7 +4859,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4828,6 +4912,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4864,7 +4953,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", 
namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4917,6 +5006,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -4953,7 +5047,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[$__rate_interval])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -5021,6 +5115,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -5059,19 +5158,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5162,7 +5261,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_gcs_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_gcs_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -5264,7 +5363,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -5349,7 +5448,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_consumed_capacity_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": 
"sum(rate(loki_dynamo_consumed_capacity_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -5434,7 +5533,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_throttled_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_throttled_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -5519,7 +5618,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_dropped_requests_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_dropped_requests_total{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))", "refId": "A" } ], @@ -5604,17 +5703,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": ".99", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": ".9", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))", "legendFormat": ".5", "refId": "C" } @@ -5666,6 +5765,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -5703,19 +5807,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5805,7 +5909,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_dynamo_request_duration_seconds_count{cluster=\"$cluster\", 
namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -5873,6 +5977,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -5910,19 +6019,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6012,7 +6121,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_s3_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_s3_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -6080,6 +6189,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -6117,19 +6231,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", 
"refId": "C" @@ -6219,7 +6333,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_azure_blob_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_azure_blob_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -6287,6 +6401,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -6324,19 +6443,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_cassandra_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_cassandra_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_cassandra_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_cassandra_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_cassandra_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_cassandra_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6426,7 +6545,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_cassandra_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_cassandra_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -6494,6 +6613,11 @@ "dashLength": 10, "dashes": false, "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -6531,19 +6655,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": 
"histogram_quantile(.5, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6633,7 +6757,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -6697,9 +6821,9 @@ "refresh_intervals": [ "10s", "30s", - "1m", - "5m", - "15m", + "$__rate_interval", + "$__rate_interval", + "1$__rate_interval", "30m", "1h", "2h", diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/dashboard-recording-rules.json b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/dashboard-recording-rules.json index 2861e871..d94f131f 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/dashboard-recording-rules.json +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/dashboard-recording-rules.json @@ -273,6 +273,7 @@ "color": { "mode": "palette-classic" }, + "unit": "s", "custom": { "axisLabel": "", "axisPlacement": "auto", diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet index 6539a34d..94e07deb 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet @@ -24,8 +24,8 @@ local grafana = import 'grafonnet/grafana.libsonnet'; // This logic is inherited from mimir-mixin. dashboard.dashboard('Canary') // We can't make use of simplified template selectors from the loki dashboard utils until we port the cortex dashboard utils panel/grid functionality. - .addTemplate('cluster', 'loki_build_info', 'cluster') - .addTemplate('namespace', 'loki_build_info{cluster=~"$cluster"}', 'namespace') + .addTemplate('cluster', 'loki_build_info', $._config.per_cluster_label) + .addTemplate('namespace', 'loki_build_info{' + $._config.per_cluster_label + '=~"$cluster"}', 'namespace') + { // This dashboard uses the new grid system in order to place panels (using gridPos). // Because of this we can't use the mixin's addRow() and addPanel(). 
@@ -33,7 +33,7 @@ local grafana = import 'grafonnet/grafana.libsonnet'; rows: null, // ugly hack, copy pasta the tag/link // code from the loki-mixin - tags: ['loki'], + tags: $._config.tags, links: [ { asDropdown: true, @@ -49,60 +49,60 @@ local grafana = import 'grafonnet/grafana.libsonnet'; panels: [ // grid row 1 dashboard.panel('Canary Entries Total') + - dashboard.newStatPanel('sum(count(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}))', unit='short') + + dashboard.newStatPanel('sum(count(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster", namespace=~"$namespace"}))', unit='short') + { gridPos: { h: 4, w: 3, x: 0, y: 0 } }, dashboard.panel('Canary Logs Total') + - dashboard.newStatPanel('sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 3, y: 0 } }, dashboard.panel('Missing') + - dashboard.newStatPanel('sum(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 6, y: 0 } }, dashboard.panel('Spotcheck Missing') + - dashboard.newStatPanel('sum(increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 9, y: 0 } }, // grid row 2 dashboard.panel('Spotcheck Total') + - dashboard.newStatPanel('sum(increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 0, y: 4 } }, dashboard.panel('Metric Test Error %') + - dashboard.newStatPanel('((sum(loki_canary_metric_test_expected{cluster=~"$cluster",namespace=~"$namespace"}) - sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"}))/(sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"}))) * 100') + + dashboard.newStatPanel('((sum(loki_canary_metric_test_expected{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}) - sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}))/(sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}))) * 100') + { gridPos: { h: 4, w: 3, x: 3, y: 4 } }, dashboard.panel('Missing %') + - dashboard.newStatPanel('(sum(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range])))*100') + + dashboard.newStatPanel('(sum(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{' + 
$._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range])))*100') + { gridPos: { h: 4, w: 3, x: 6, y: 4 } }, dashboard.panel('Spotcheck Missing %') + - dashboard.newStatPanel('(sum(increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))) * 100') + + dashboard.newStatPanel('(sum(increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))) * 100') + { gridPos: { h: 4, w: 3, x: 9, y: 4 } }, // grid row 3 dashboard.panel('Metric Test Expected') + - dashboard.newStatPanel('sum(loki_canary_metric_test_expected{cluster=~"$cluster",namespace=~"$namespace"})', unit='short') + + dashboard.newStatPanel('sum(loki_canary_metric_test_expected{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"})', unit='short') + { gridPos: { h: 4, w: 3, x: 0, y: 8 } }, dashboard.panel('Metric Test Actual') + - dashboard.newStatPanel('sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"})', unit='short') + + dashboard.newStatPanel('sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"})', unit='short') + { gridPos: { h: 4, w: 3, x: 3, y: 8 } }, dashboard.panel('Websocket Missing') + - dashboard.newStatPanel('sum(increase(loki_canary_websocket_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_websocket_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 6, y: 8 } }, dashboard.panel('Websocket Missing %') + - dashboard.newStatPanel('(sum(increase(loki_canary_websocket_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range])))*100') + + dashboard.newStatPanel('(sum(increase(loki_canary_websocket_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range])))*100') + { gridPos: { h: 4, w: 3, x: 9, y: 8 } }, // end of grid dashboard.panel('Log Write to read Latency Percentiles') + dashboard.queryPanel([ - 'histogram_quantile(0.95, sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', - 'histogram_quantile(0.50, sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', + 'histogram_quantile(0.95, sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', + 'histogram_quantile(0.50, sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', ], ['p95', 'p50']) + { gridPos: { h: 6, w: 12, x: 12, y: 0 } }, @@ -115,7 +115,7 @@ local grafana = import 'grafonnet/grafana.libsonnet'; ).addTargets( [ grafana.prometheus.target( - 
'sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le)', + 'sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le)', legendFormat='{{le}}', format='heatmap', ), @@ -125,24 +125,24 @@ dashboard.panel('Spot Check Query') + dashboard.queryPanel([ - 'histogram_quantile(0.99, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', - 'histogram_quantile(0.50, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', + 'histogram_quantile(0.99, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', + 'histogram_quantile(0.50, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', ], ['p99', 'p50']) + { gridPos: { h: 6, w: 12, x: 0, y: 14 } }, dashboard.panel('Metric Test Query') + dashboard.queryPanel([ - 'histogram_quantile(0.99, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[15m])) by (le))', - 'histogram_quantile(0.50, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[15m])) by (le))', + 'histogram_quantile(0.99, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[15m])) by (le))', + 'histogram_quantile(0.50, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[15m])) by (le))', ], ['p99', 'p50'],) + { gridPos: { h: 6, w: 12, x: 12, y: 14 } }, dashboard.panel('Spot Check Missing %') + - dashboard.queryPanel('topk(20, (sum by (cluster, pod) (increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (cluster, pod) (increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) * 100)) > 0', '') + + dashboard.queryPanel('topk(20, (sum by (' + $._config.per_cluster_label + ', pod) (increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (' + $._config.per_cluster_label + ', pod) (increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) * 100)) > 0', '') + { gridPos: { h: 6, w: 12, x: 0, y: 20 } }, g.panel('Missing logs') + - g.queryPanel('topk(20,(sum by (cluster, pod)(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (cluster, pod)(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])))*100) > 0', 'Missing {{ cluster }} {{ pod }}') + + g.queryPanel('topk(20,(sum by (' + $._config.per_cluster_label + ', pod)(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (' + $._config.per_cluster_label + ',
pod)(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])))*100) > 0', 'Missing {{ ' + $._config.per_cluster_label + ' }} {{ pod }}') + { gridPos: { h: 6, w: 12, x: 12, y: 20 } }, ], diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-chunks.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-chunks.libsonnet index 87dfff7a..a048dadf 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-chunks.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-chunks.libsonnet @@ -6,7 +6,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; local dashboards = self, 'loki-chunks.json': { local cfg = self, - labelsSelector:: $._config.per_cluster_label + '="$cluster", job=~"$namespace/%s"' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester.*'), + labelsSelector:: $._config.per_cluster_label + '="$cluster", job=~"$namespace/%s"' % ( + if $._config.meta_monitoring.enabled + then '(ingester.*|%s-write|loki-single-binary)' % $._config.ssd.pod_prefix_matcher + else if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester.*' + ), } + $.dashboard('Loki / Chunks', uid='chunks') .addCluster() @@ -49,7 +53,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.newQueryPanel('Index Entries Per Chunk') + $.queryPanel( - 'sum(rate(loki_chunk_store_index_entries_per_chunk_sum{%s}[5m])) / sum(rate(loki_chunk_store_index_entries_per_chunk_count{%s}[5m]))' % [ + 'sum(rate(loki_chunk_store_index_entries_per_chunk_sum{%s}[$__rate_interval])) / sum(rate(loki_chunk_store_index_entries_per_chunk_count{%s}[$__rate_interval]))' % [ dashboards['loki-chunks.json'].labelsSelector, dashboards['loki-chunks.json'].labelsSelector, ], @@ -139,9 +143,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.newQueryPanel('Chunk Size Quantiles', 'bytes') + $.queryPanel( [ - 'histogram_quantile(0.99, sum(rate(loki_ingester_chunk_size_bytes_bucket{%s}[1m])) by (le))' % dashboards['loki-chunks.json'].labelsSelector, - 'histogram_quantile(0.90, sum(rate(loki_ingester_chunk_size_bytes_bucket{%s}[1m])) by (le))' % dashboards['loki-chunks.json'].labelsSelector, - 'histogram_quantile(0.50, sum(rate(loki_ingester_chunk_size_bytes_bucket{%s}[1m])) by (le))' % dashboards['loki-chunks.json'].labelsSelector, + 'histogram_quantile(0.99, sum(rate(loki_ingester_chunk_size_bytes_bucket{%s}[$__rate_interval])) by (le))' % dashboards['loki-chunks.json'].labelsSelector, + 'histogram_quantile(0.90, sum(rate(loki_ingester_chunk_size_bytes_bucket{%s}[$__rate_interval])) by (le))' % dashboards['loki-chunks.json'].labelsSelector, + 'histogram_quantile(0.50, sum(rate(loki_ingester_chunk_size_bytes_bucket{%s}[$__rate_interval])) by (le))' % dashboards['loki-chunks.json'].labelsSelector, ], [ 'p99', @@ -157,9 +161,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.newQueryPanel('Chunk Duration hours (end-start)') + $.queryPanel( [ - 'histogram_quantile(0.5, sum(rate(loki_ingester_chunk_bounds_hours_bucket{%s}[5m])) by (le))' % dashboards['loki-chunks.json'].labelsSelector, - 'histogram_quantile(0.99, sum(rate(loki_ingester_chunk_bounds_hours_bucket{%s}[5m])) by (le))' % dashboards['loki-chunks.json'].labelsSelector, - 
'sum(rate(loki_ingester_chunk_bounds_hours_sum{%s}[5m])) / sum(rate(loki_ingester_chunk_bounds_hours_count{%s}[5m]))' % [ + 'histogram_quantile(0.5, sum(rate(loki_ingester_chunk_bounds_hours_bucket{%s}[$__rate_interval])) by (le))' % dashboards['loki-chunks.json'].labelsSelector, + 'histogram_quantile(0.99, sum(rate(loki_ingester_chunk_bounds_hours_bucket{%s}[$__rate_interval])) by (le))' % dashboards['loki-chunks.json'].labelsSelector, + 'sum(rate(loki_ingester_chunk_bounds_hours_sum{%s}[$__rate_interval])) / sum(rate(loki_ingester_chunk_bounds_hours_count{%s}[$__rate_interval]))' % [ dashboards['loki-chunks.json'].labelsSelector, dashboards['loki-chunks.json'].labelsSelector, ], diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-deletion.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-deletion.libsonnet index 5fdbcc76..5b8ef5d5 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-deletion.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-deletion.libsonnet @@ -2,7 +2,9 @@ local g = import 'grafana-builder/grafana.libsonnet'; local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { - local compactor_matcher = if $._config.ssd.enabled then '%s-read' % $._config.ssd.pod_prefix_matcher else 'compactor', + local compactor_matcher = if $._config.meta_monitoring.enabled + then 'pod=~"(compactor|%s-backend.*|loki-single-binary)"' % $._config.ssd.pod_prefix_matcher + else if $._config.ssd.enabled then 'container="loki", pod=~"%s-backend.*"' % $._config.ssd.pod_prefix_matcher else 'container="compactor"', grafanaDashboards+:: { 'loki-deletion.json': @@ -61,15 +63,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.newQueryPanel('Lines Deleted / Sec') + - g.queryPanel('sum(rate(loki_compactor_deleted_lines{' + $._config.per_cluster_label + '=~"$cluster",job=~"$namespace/%s"}[$__rate_interval])) by (user)' % compactor_matcher, '{{user}}'), + g.queryPanel('sum(rate(loki_compactor_deleted_lines{' + $.namespaceMatcher() + ', ' + compactor_matcher + '}[$__rate_interval])) by (user)', '{{user}}'), ) ).addRow( g.row('List of deletion requests') .addPanel( - $.logPanel('In progress/finished', '{%s, container="compactor"} |~ "Started processing delete request|delete request for user marked as processed" | logfmt | line_format "{{.ts}} user={{.user}} delete_request_id={{.delete_request_id}} msg={{.msg}}" ' % $.namespaceMatcher()), + $.logPanel('In progress/finished', '{%s, %s} |~ "Started processing delete request|delete request for user marked as processed" | logfmt | line_format "{{.ts}} user={{.user}} delete_request_id={{.delete_request_id}} msg={{.msg}}" ' % [$.namespaceMatcher(), compactor_matcher]), ) .addPanel( - $.logPanel('Requests', '{%s, container="compactor"} |~ "delete request for user added" | logfmt | line_format "{{.ts}} user={{.user}} query=\'{{.query}}\'"' % $.namespaceMatcher()), + $.logPanel('Requests', '{%s, %s} |~ "delete request for user added" | logfmt | line_format "{{.ts}} user={{.user}} query=\'{{.query}}\'"' % [$.namespaceMatcher(), compactor_matcher]), ) ), }, diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-logs.libsonnet 
b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-logs.libsonnet index 9fd6eee5..b28d74e9 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-logs.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-logs.libsonnet @@ -48,7 +48,6 @@ local template = import 'grafonnet/template.libsonnet'; local cfg = self, showMultiCluster:: true, - clusterLabel:: $._config.per_cluster_label, } + lokiLogs + $.dashboard('Loki / Logs', uid='logs') @@ -61,8 +60,9 @@ local template = import 'grafonnet/template.libsonnet'; p { targets: [ e { - expr: if dashboards['loki-logs.json'].showMultiCluster then super.expr - else std.strReplace(super.expr, $._config.per_cluster_label + '="$cluster", ', ''), + expr: if dashboards['loki-logs.json'].showMultiCluster + then std.strReplace(super.expr, 'cluster="$cluster"', $._config.per_cluster_label + '="$cluster"') + else std.strReplace(super.expr, 'cluster="$cluster", ', ''), } for e in p.targets ], diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-operational.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-operational.libsonnet index e8f5d982..27152ff6 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-operational.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-operational.libsonnet @@ -11,7 +11,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; showAnnotations:: true, showLinks:: true, showMultiCluster:: true, - clusterLabel:: $._config.per_cluster_label, hiddenRows:: [ 'Cassandra', @@ -25,17 +24,31 @@ local utils = import 'mixin-utils/utils.libsonnet'; jobMatchers:: { cortexgateway: [utils.selector.re('job', '($namespace)/cortex-gw(-internal)?')], - distributor: [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'distributor'))], - ingester: [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester.*'))], - querier: [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-read' % $._config.ssd.pod_prefix_matcher else 'querier'))], - queryFrontend: [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-read' % $._config.ssd.pod_prefix_matcher else 'query-frontend'))], + distributor: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(distributor|%s-write|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'distributor'))], + ingester: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(ingester|%s-write|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester.*'))], + querier: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(querier|%s-read|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else 
[utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-read' % $._config.ssd.pod_prefix_matcher else 'querier'))], + queryFrontend: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(query-frontend|%s-read|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-read' % $._config.ssd.pod_prefix_matcher else 'query-frontend'))], }, podMatchers:: { cortexgateway: [utils.selector.re('pod', 'cortex-gw')], - distributor: [utils.selector.re('pod', '%s' % (if $._config.ssd.enabled then '%s-write.*' % $._config.ssd.pod_prefix_matcher else 'distributor.*'))], - ingester: [utils.selector.re('pod', '%s' % (if $._config.ssd.enabled then '%s-write.*' % $._config.ssd.pod_prefix_matcher else 'ingester.*'))], - querier: [utils.selector.re('pod', '%s' % (if $._config.ssd.enabled then '%s-read.*' % $._config.ssd.pod_prefix_matcher else 'querier.*'))], + distributor: if $._config.meta_monitoring.enabled + then [utils.selector.re('pod', '(distributor|%s-write|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('pod', '%s' % (if $._config.ssd.enabled then '%s-write.*' % $._config.ssd.pod_prefix_matcher else 'distributor.*'))], + ingester: if $._config.meta_monitoring.enabled + then [utils.selector.re('pod', '(ingester|%s-write|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('pod', '%s' % (if $._config.ssd.enabled then '%s-write.*' % $._config.ssd.pod_prefix_matcher else 'ingester.*'))], + querier: if $._config.meta_monitoring.enabled + then [utils.selector.re('pod', '(querier|%s-read|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('pod', '%s' % (if $._config.ssd.enabled then '%s-read.*' % $._config.ssd.pod_prefix_matcher else 'querier.*'))], }, } + lokiOperational + { @@ -62,7 +75,22 @@ local utils = import 'mixin-utils/utils.libsonnet'; local replaceClusterMatchers(expr) = if dashboards['loki-operational.json'].showMultiCluster - then expr + // Replace the recording rules cluster label with the per-cluster label + then std.strReplace( + // Replace the cluster label for equality matchers with the per-cluster label + std.strReplace( + // Replace the cluster label for regex matchers with the per-cluster label + std.strReplace( + expr, + 'cluster=~"$cluster"', + $._config.per_cluster_label + '=~"$cluster"' + ), + 'cluster="$cluster"', + $._config.per_cluster_label + '="$cluster"' + ), + 'cluster_', + $._config.per_cluster_label + '_' + ) else std.strReplace( std.strReplace( @@ -143,7 +171,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; local replaceAllMatchers(expr) = - replaceMatchers(replaceClusterMatchers(expr)), + replaceMatchers(expr), local selectDatasource(ds) = if ds == null || ds == '' then ds @@ -179,7 +207,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; datasource: selectDatasource(super.datasource), targets: if std.objectHas(p, 'targets') then [ e { - expr: removeInternalComponents(p.title, e.expr), + expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)), } for e in p.targets ] else [], @@ -188,7 +216,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; datasource: selectDatasource(super.datasource), targets: if std.objectHas(sp, 'targets') then [ e { - expr: removeInternalComponents(p.title, e.expr), + expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)), } for e in sp.targets ] else [], @@ 
-197,7 +225,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; datasource: selectDatasource(super.datasource), targets: if std.objectHas(ssp, 'targets') then [ e { - expr: removeInternalComponents(p.title, e.expr), + expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)), } for e in ssp.targets ] else [], diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-reads-resources.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-reads-resources.libsonnet index 83c1f0ea..21db04ea 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-reads-resources.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-reads-resources.libsonnet @@ -2,11 +2,19 @@ local grafana = import 'grafonnet/grafana.libsonnet'; local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { - local index_gateway_pod_matcher = if $._config.ssd.enabled then 'container="loki", pod=~"%s-read.*"' % $._config.ssd.pod_prefix_matcher else 'container="index-gateway"', - local index_gateway_job_matcher = if $._config.ssd.enabled then '%s-read' % $._config.ssd.pod_prefix_matcher else 'index-gateway', + local index_gateway_pod_matcher = if $._config.meta_monitoring.enabled + then 'container=~"loki|index-gateway", pod=~"(index-gateway.*|%s-read.*|loki-single-binary)"' % $._config.ssd.pod_prefix_matcher + else if $._config.ssd.enabled then 'container="loki", pod=~"%s-read.*"' % $._config.ssd.pod_prefix_matcher else 'container="index-gateway"', + local index_gateway_job_matcher = if $._config.meta_monitoring.enabled + then '(index-gateway.*|%s-read.*|loki-single-binary)' % $._config.ssd.pod_prefix_matcher + else if $._config.ssd.enabled then '%s-read' % $._config.ssd.pod_prefix_matcher else 'index-gateway', - local ingester_pod_matcher = if $._config.ssd.enabled then 'container="loki", pod=~"%s-write.*"' % $._config.ssd.pod_prefix_matcher else 'container="ingester"', - local ingester_job_matcher = if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester.+', + local ingester_pod_matcher = if $._config.meta_monitoring.enabled + then 'container=~"loki|ingester", pod=~"(ingester.*|%s-write.*|loki-single-binary)"' % $._config.ssd.pod_prefix_matcher + else if $._config.ssd.enabled then 'container="loki", pod=~"%s-write.*"' % $._config.ssd.pod_prefix_matcher else 'container="ingester"', + local ingester_job_matcher = if $._config.meta_monitoring.enabled + then '(ingester.+|%s-write|loki-single-binary)' % $._config.ssd.pod_prefix_matcher + else if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester.+', grafanaDashboards+:: { @@ -117,6 +125,38 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerDiskSpaceUtilizationPanel('Disk Space Utilization', index_gateway_job_matcher), ) ) + .addRowIf( + !$._config.ssd.enabled, + grafana.row.new('Bloom Gateway') + .addPanel( + $.containerCPUUsagePanel('CPU', 'bloom-gateway'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'bloom-gateway'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'bloom-gateway'), + ) + .addPanel( + $.newQueryPanel('Disk Writes', 'Bps') + + $.queryPanel( + 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % 
[$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('bloom-gateway')], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + + $.withStacking, + ) + .addPanel( + $.newQueryPanel('Disk Reads', 'Bps') + + $.queryPanel( + 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('bloom-gateway')], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + + $.withStacking, + ) + .addPanel( + $.containerDiskSpaceUtilizationPanel('Disk Space Utilization', 'bloom-gateway'), + ) + ) .addRow( $.row('Ingester') .addPanel( diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-reads.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-reads.libsonnet index 3da4e200..6d75993d 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-reads.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-reads.libsonnet @@ -5,8 +5,56 @@ local utils = import 'mixin-utils/utils.libsonnet'; local dashboards = self, local showBigTable = false, - local http_routes = 'loki_api_v1_series|api_prom_series|api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_labels|loki_api_v1_label_name_values', - local grpc_routes = '/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs', + // Available HTTP routes can be collected with the following instant query: + // count by (route) (loki_request_duration_seconds_count{route!~"/.*"}) + local http_routes = '(%s)' % std.join( + '|', [ + 'api_prom_rules', + 'api_prom_rules_namespace_groupname', + 'api_v1_rules', + 'loki_api_v1_delete', + 'loki_api_v1_detected_labels', + 'loki_api_v1_index_stats', + 'loki_api_v1_index_volume', + 'loki_api_v1_index_volume_range', + 'loki_api_v1_label_name_values', + 'loki_api_v1_label_values', + 'loki_api_v1_labels', + 'loki_api_v1_patterns', + 'loki_api_v1_query', + 'loki_api_v1_query_range', + 'loki_api_v1_series', + 'otlp_v1_logs', + 'prometheus_api_v1_rules', + ] + ), + + // Available GRPC routes can be collected with the following instant query: + // count by (route) (loki_request_duration_seconds_count{route=~"/.*"}) + local grpc_routes = '(%s)' % std.join( + '|', [ + '/base.Ruler/Rules', + '/indexgatewaypb.IndexGateway/GetChunkRef', + '/indexgatewaypb.IndexGateway/GetSeries', + '/indexgatewaypb.IndexGateway/GetShards', + '/indexgatewaypb.IndexGateway/GetStats', + '/indexgatewaypb.IndexGateway/GetVolume', + '/indexgatewaypb.IndexGateway/LabelNamesForMetricName', + '/indexgatewaypb.IndexGateway/LabelValuesForMetricName', + '/indexgatewaypb.IndexGateway/QueryIndex', + '/logproto.BloomGateway/FilterChunkRefs', + '/logproto.Pattern/Query', + '/logproto.Querier/GetChunkIDs', + '/logproto.Querier/GetDetectedLabels', + '/logproto.Querier/GetStats', + '/logproto.Querier/GetVolume', + '/logproto.Querier/Label', + '/logproto.Querier/Query', + '/logproto.Querier/QuerySample', + '/logproto.Querier/Series', + '/logproto.StreamData/GetStreamRates', + ] + ), local latencyPanelWithExtraGrouping(metricName, selector, multiplier='1e3', extra_grouping='') = { nullPointMode: 'null as zero', @@ -31,20 +79,35 @@ local utils = import 
'mixin-utils/utils.libsonnet'; local cfg = self, showMultiCluster:: true, - clusterLabel:: $._config.per_cluster_label, clusterMatchers:: if cfg.showMultiCluster then - [utils.selector.re(cfg.clusterLabel, '$cluster')] + [utils.selector.re($._config.per_cluster_label, '$cluster')] else [], matchers:: { cortexgateway: [utils.selector.re('job', '($namespace)/cortex-gw(-internal)?')], - queryFrontend: [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-read' % $._config.ssd.pod_prefix_matcher else 'query-frontend'))], - querier: [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'querier'))], - ingester: [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester'))], - ingesterZoneAware: [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester-zone.*'))], - querierOrIndexGateway: [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-read' % $._config.ssd.pod_prefix_matcher else '(querier|index-gateway)'))], + queryFrontend: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(query-frontend|%s-read|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-read' % $._config.ssd.pod_prefix_matcher else 'query-frontend'))], + querier: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(querier|%s-read|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-read' % $._config.ssd.pod_prefix_matcher else 'querier'))], + ingester: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(ingester|%s-write|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester'))], + ingesterZoneAware: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(ingester-zone-.*|%s-write|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester-zone.*'))], + querierOrIndexGateway: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(querier|index-gateway|%s-read|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-read' % $._config.ssd.pod_prefix_matcher else '(querier|index-gateway)'))], + indexGateway: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(index-gateway|%s-backend|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-backend' % $._config.ssd.pod_prefix_matcher else 'index-gateway'))], + bloomGateway: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(bloom-gateway|%s-backend|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-backend' % 
$._config.ssd.pod_prefix_matcher else 'bloom-gateway'))], }, local selector(matcherId) = @@ -59,6 +122,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ingesterSelector:: selector('ingester'), ingesterZoneSelector:: selector('ingesterZoneAware'), querierOrIndexGatewaySelector:: selector('querierOrIndexGateway'), + indexGatewaySelector:: selector('indexGateway'), + bloomGatewaySelector:: selector('bloomGateway'), } + $.dashboard('Loki / Reads', uid='reads') .addCluster() @@ -194,19 +259,53 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( !$._config.ssd.enabled, - $.row('Index') + $.row('Index Gateway') .addPanel( $.newQueryPanel('QPS') + - $.newQpsPanel('loki_index_request_duration_seconds_count{%s operation!="index_chunk"}' % dashboards['loki-reads.json'].querierSelector) + $.newQpsPanel('loki_request_duration_seconds_count{%s route=~"%s"}' % [dashboards['loki-reads.json'].indexGatewaySelector, grpc_routes]) ) .addPanel( $.newQueryPanel('Latency', 'ms') + - $.latencyPanel('loki_index_request_duration_seconds', '{%s operation!="index_chunk"}' % dashboards['loki-reads.json'].querierSelector) + utils.latencyRecordingRulePanel( + 'loki_request_duration_seconds', + dashboards['loki-reads.json'].clusterMatchers + dashboards['loki-reads.json'].matchers.indexGateway + [utils.selector.re('route', grpc_routes)], + sum_by=['route'] + ) ) .addPanel( p99LatencyByPod( - 'loki_index_request_duration_seconds', - '{%s operation!="index_chunk"}' % dashboards['loki-reads.json'].querierSelector + 'loki_request_duration_seconds', + $.toPrometheusSelector( + dashboards['loki-reads.json'].clusterMatchers + + dashboards['loki-reads.json'].matchers.indexGateway + + [utils.selector.re('route', grpc_routes)] + ), ) ) ) + .addRowIf( + !$._config.ssd.enabled, + $.row('Bloom Gateway') + .addPanel( + $.newQueryPanel('QPS') + + $.newQpsPanel('loki_request_duration_seconds_count{%s route=~"%s"}' % [dashboards['loki-reads.json'].bloomGatewaySelector, grpc_routes]) + ) + .addPanel( + $.newQueryPanel('Latency', 'ms') + + utils.latencyRecordingRulePanel( + 'loki_request_duration_seconds', + dashboards['loki-reads.json'].clusterMatchers + dashboards['loki-reads.json'].matchers.bloomGateway + [utils.selector.re('route', grpc_routes)], + sum_by=['route'] + ) + ) + .addPanel( + p99LatencyByPod( + 'loki_request_duration_seconds', + $.toPrometheusSelector( + dashboards['loki-reads.json'].clusterMatchers + + dashboards['loki-reads.json'].matchers.bloomGateway + + [utils.selector.re('route', grpc_routes)] + ), ) ) ) @@ -225,8 +324,27 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) ) - .addRow( - $.row('BoltDB Shipper') + .addRowIf( + !$._config.ssd.enabled, + $.row('TSDB Index') + .addPanel( + $.newQueryPanel('QPS') + + $.newQpsPanel('loki_index_request_duration_seconds_count{%s operation!="index_chunk"}' % dashboards['loki-reads.json'].querierSelector) + ) + .addPanel( + $.newQueryPanel('Latency', 'ms') + + $.latencyPanel('loki_index_request_duration_seconds', '{%s operation!="index_chunk"}' % dashboards['loki-reads.json'].querierSelector) + ) + .addPanel( + p99LatencyByPod( + 'loki_index_request_duration_seconds', + '{%s operation!="index_chunk"}' % dashboards['loki-reads.json'].querierSelector + ) + ) + ) + .addRowIf( + !$._config.ssd.enabled, + $.row('BoltDB Index') .addPanel( $.newQueryPanel('QPS') + $.newQpsPanel('loki_boltdb_shipper_request_duration_seconds_count{%s operation="Shipper.Query"}' % dashboards['loki-reads.json'].querierOrIndexGatewaySelector) diff --git
a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-retention.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-retention.libsonnet index 9896a524..c2e461c3 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-retention.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-retention.libsonnet @@ -1,8 +1,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { - local compactor_pod_matcher = if $._config.ssd.enabled then 'container="loki", pod=~"%s-read.*"' % $._config.ssd.pod_prefix_matcher else 'container="compactor"', - local compactor_job_matcher = if $._config.ssd.enabled then '%s-read' % $._config.ssd.pod_prefix_matcher else 'compactor', + local compactor_pod_matcher = if $._config.meta_monitoring.enabled + then 'pod=~"(compactor.*|%s-backend.*|loki-single-binary)"' % $._config.ssd.pod_prefix_matcher + else if $._config.ssd.enabled then 'container="loki", pod=~"%s-read.*"' % $._config.ssd.pod_prefix_matcher else 'container="compactor"', + local compactor_job_matcher = if $._config.meta_monitoring.enabled + then '(compactor|%s-backend.*|loki-single-binary)' % $._config.ssd.pod_prefix_matcher + else if $._config.ssd.enabled then '%s-backend' % $._config.ssd.pod_prefix_matcher else 'compactor', grafanaDashboards+:: { 'loki-retention.json': diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-writes-resources.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-writes-resources.libsonnet index f25aeb4b..1d4c693a 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-writes-resources.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-writes-resources.libsonnet @@ -2,8 +2,12 @@ local grafana = import 'grafonnet/grafana.libsonnet'; local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { - local ingester_pod_matcher = if $._config.ssd.enabled then 'container="loki", pod=~"%s-write.*"' % $._config.ssd.pod_prefix_matcher else 'container="ingester"', - local ingester_job_matcher = if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester.*', + local ingester_pod_matcher = if $._config.meta_monitoring.enabled + then 'container=~"loki|ingester", pod=~"(ingester.*|%s-write.*|loki-single-binary)"' % $._config.ssd.pod_prefix_matcher + else if $._config.ssd.enabled then 'container="loki", pod=~"%s-write.*"' % $._config.ssd.pod_prefix_matcher else 'container="ingester"', + local ingester_job_matcher = if $._config.meta_monitoring.enabled + then '(ingester.*|%s-write|loki-single-binary)' % $._config.ssd.pod_prefix_matcher + else if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester.*', grafanaDashboards+:: { diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-writes.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-writes.libsonnet index bedb9ca1..8cde2465 100644 ---
a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-writes.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/loki-writes.libsonnet @@ -9,19 +9,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; local cfg = self, showMultiCluster:: true, - clusterLabel:: $._config.per_cluster_label, clusterMatchers:: if cfg.showMultiCluster then - [utils.selector.re(cfg.clusterLabel, '$cluster')] + [utils.selector.re($._config.per_cluster_label, '$cluster')] else [], matchers:: { cortexgateway: [utils.selector.re('job', '($namespace)/cortex-gw(-internal)?')], - distributor: [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'distributor'))], - ingester: [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester'))], - ingester_zone: [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester-zone.*'))], - any_ingester: [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester.*'))], + distributor: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(distributor|%s-write|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'distributor'))], + ingester: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(ingester|%s-write|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester'))], + ingester_zone: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(ingester-zone.*|%s-write|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester-zone.*'))], + any_ingester: if $._config.meta_monitoring.enabled + then [utils.selector.re('job', '($namespace)/(ingester.*|%s-write|loki-single-binary)' % $._config.ssd.pod_prefix_matcher)] + else [utils.selector.re('job', '($namespace)/%s' % (if $._config.ssd.enabled then '%s-write' % $._config.ssd.pod_prefix_matcher else 'ingester.*'))], }, local selector(matcherId) = diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/recording-rules.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/recording-rules.libsonnet index 2d943807..46618da9 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/recording-rules.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/loki/production/loki-mixin/dashboards/recording-rules.libsonnet @@ -7,7 +7,7 @@ local template = import 'grafonnet/template.libsonnet'; template.new( 'tenant', '$datasource', - 'query_result(sum by (id) (grafanacloud_logs_instance_info) and sum(label_replace(loki_tenant:active_streams{cluster="$cluster",namespace="$namespace"},"id","$1","tenant","(.*)")) by(id))', + 
'query_result(sum by (id) (grafanacloud_logs_instance_info) and sum(label_replace(loki_tenant:active_streams{' + $._config.per_cluster_label + '="$cluster",namespace="$namespace"},"id","$1","tenant","(.*)")) by(id))', regex='/"([^"]+)"/', sort=1, includeAll=true, diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet index 03fe7d4c..832c7843 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts.libsonnet @@ -6,6 +6,6 @@ (import 'alerts/blocks.libsonnet') + (import 'alerts/compactor.libsonnet') + (import 'alerts/autoscaling.libsonnet') + - (import 'alerts/ingest-storage.libsonnet') + + (if $._config.ingest_storage_enabled then import 'alerts/ingest-storage.libsonnet' else {}) + (import 'alerts/continuous-test.libsonnet'), } diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet index ca67c94b..36119c5a 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/alerts.libsonnet @@ -34,10 +34,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Note if alert_aggregation_labels is "job", this will repeat the label. But // prometheus seems to tolerate that. expr: ||| - 100 * sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"%(excluded_routes)s"}[%(range_interval)s])) + # The following 5xx errors are considered non-errors: + # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) + # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body + ( + sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"%(excluded_routes)s"}[%(range_interval)s])) / - sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{route!~"%(excluded_routes)s"}[%(range_interval)s])) - > 1 + sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{route!~"%(excluded_routes)s"}[%(range_interval)s])) + ) * 100 > 1 ||| % { group_by: $._config.alert_aggregation_labels, job_label: $._config.per_job_label, @@ -79,26 +83,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| % $._config, }, }, - { - alert: $.alertName('QueriesIncorrect'), - expr: ||| - 100 * sum by (%(group_by)s) (rate(test_exporter_test_case_result_total{result="fail"}[%(range_interval)s])) - / - sum by (%(group_by)s) (rate(test_exporter_test_case_result_total[%(range_interval)s])) > 1 - ||| % { - group_by: $._config.alert_aggregation_labels, - range_interval: $.alertRangeInterval(5), - }, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - The %(product)s cluster %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% incorrect query results.
- ||| % $._config, - }, - }, { alert: $.alertName('InconsistentRuntimeConfig'), expr: ||| diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet index 8f6ed51d..eeec268f 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/alerts/blocks.libsonnet @@ -14,17 +14,18 @@ (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 0) and # Only if the ingester has ingested samples over the last 4h. - (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) + (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate%(recording_rules_range_interval)s[4h])) > 0) and # Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester replica # had ingested samples in the past, then no traffic was received for a long period and then it starts # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving # samples, while block shipping is expected within the next 4h. - (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0) + (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate%(recording_rules_range_interval)s[1h] offset 4h)) > 0) ||| % { alert_aggregation_labels: $._config.alert_aggregation_labels, per_instance_label: $._config.per_instance_label, alert_aggregation_rule_prefix: $._config.alert_aggregation_rule_prefix, + recording_rules_range_interval: $._config.recording_rules_range_interval, }, labels: { severity: 'critical', @@ -41,11 +42,12 @@ expr: ||| (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) == 0) and - (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) + (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate%(recording_rules_range_interval)s[4h])) > 0) ||| % { alert_aggregation_labels: $._config.alert_aggregation_labels, per_instance_label: $._config.per_instance_label, alert_aggregation_rule_prefix: $._config.alert_aggregation_rule_prefix, + recording_rules_range_interval: $._config.recording_rules_range_interval, }, labels: { severity: 'critical', @@ -218,10 +220,13 @@ }, }, { - // Alert if the bucket index has not been updated for a given user.
The default update interval is 900 seconds + // so we alert if we've missed two updates plus a 300-second buffer to avoid false positives. It's important + // that this alert fire before queriers start to return errors because the bucket index is too old (3600 seconds + // by default). alert: $.alertName('BucketIndexNotUpdated'), expr: ||| - min by(%(alert_aggregation_labels)s, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + min by(%(alert_aggregation_labels)s, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 ||| % $._config, labels: { severity: 'critical', diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet index d40c3a23..32158d6d 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet @@ -57,11 +57,13 @@ }, // Some dashboards show panels grouping together multiple components of a given "path". - // This mapping configures which components belong to each group. + // This mapping configures which components belong to each group. A component can belong + // to multiple groups. local componentGroups = { write: ['distributor', 'ingester', 'mimir_write'], read: ['query_frontend', 'querier', 'ruler_query_frontend', 'ruler_querier', 'mimir_read'], backend: ['query_scheduler', 'ruler_query_scheduler', 'ruler', 'store_gateway', 'compactor', 'alertmanager', 'overrides_exporter', 'mimir_backend'], + remote_ruler_read: ['ruler_query_frontend', 'ruler_query_scheduler', 'ruler_querier'], }, // These are used by the dashboards and allow for the simultaneous display of @@ -133,6 +135,7 @@ write: componentsGroupMatcher(componentGroups.write), read: componentsGroupMatcher(componentGroups.read), backend: componentsGroupMatcher(componentGroups.backend), + remote_ruler_read: componentsGroupMatcher(componentGroups.remote_ruler_read), }, all_instances: std.join('|', std.map(function(name) componentNameRegexp[name], componentGroups.write + componentGroups.read + componentGroups.backend)), @@ -194,6 +197,9 @@ // Used to add extra annotations to all alerts. Careful: takes precedence over default annotations. alert_extra_annotations: {}, + // Whether alerts for experimental ingest storage are enabled. + ingest_storage_enabled: true, + cortex_p99_latency_threshold_seconds: 2.5, // Whether resources dashboards are enabled (based on cAdvisor metrics).
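The new ingest_storage_enabled flag above and the autoscaling.ingester block added in the next hunk are ordinary _config fields, so a downstream consumer can flip them with a plain jsonnet overlay. A minimal sketch, assuming the mixin is consumed through its top-level mixin.libsonnet entry point (the import path is illustrative, not part of this diff):

  // Hypothetical consumer overlay: keep the experimental ingest-storage
  // alerts disabled and turn on the new ingester autoscaling panels.
  (import 'github.com/grafana/mimir/operations/mimir-mixin/mixin.libsonnet') + {
    _config+:: {
      ingest_storage_enabled: false,  // drops alerts/ingest-storage.libsonnet from the compiled alerts
      autoscaling+: {
        ingester+: { enabled: true },  // hpa_name keeps its default, autoscaling_hpa_prefix + 'ingester-zone-a'
      },
    },
  }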
@@ -278,7 +284,7 @@ sum by (%(alert_aggregation_labels)s, deployment) ( label_replace( label_replace( - sum by (%(alert_aggregation_labels)s, %(per_instance_label)s)(rate(container_cpu_usage_seconds_total[1m])), + sum by (%(alert_aggregation_labels)s, %(per_instance_label)s)(rate(container_cpu_usage_seconds_total[%(recording_rules_range_interval)s])), "deployment", "$1", "%(per_instance_label)s", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" ), # The question mark in "(.*?)" is used to make it non-greedy, otherwise it @@ -641,6 +647,10 @@ enabled: false, hpa_name: $._config.autoscaling_hpa_prefix + 'cortex-gw.*', }, + ingester: { + enabled: false, + hpa_name: $._config.autoscaling_hpa_prefix + 'ingester-zone-a', + }, }, diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards.libsonnet index c4e9d6f9..a9fb0265 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards.libsonnet @@ -23,6 +23,7 @@ (import 'dashboards/overview-networking.libsonnet') + (import 'dashboards/reads-resources.libsonnet') + (import 'dashboards/remote-ruler-reads-resources.libsonnet') + + (import 'dashboards/remote-ruler-reads-networking.libsonnet') + (import 'dashboards/reads-networking.libsonnet') + (import 'dashboards/writes-resources.libsonnet') + (import 'dashboards/writes-networking.libsonnet') + diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index 5ebf1bf8..70bbeb09 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -1,4 +1,30 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + { + // Helper function to produce failure rate in percentage queries for native and classic histograms. + // Takes a metric name and a selector as strings and returns a dictionary with classic and native queries. + nativeClassicFailureRate(metric, selector):: { + local template = ||| + ( + # gRPC errors are not tracked as 5xx but "error". + sum(%(countFailQuery)s) + or + # Handle the case no failure has been tracked yet. + vector(0) + ) + / + sum(%(countQuery)s) + |||, + classic: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(metric, selector + ',status_code=~"5.*|error"').classic, + countQuery: utils.nativeClassicHistogramCountRate(metric, selector).classic, + }, + native: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(metric, selector + ',status_code=~"5.*|error"').native, + countQuery: utils.nativeClassicHistogramCountRate(metric, selector).native, + }, + }, + // This object contains common queries used in the Mimir dashboards. // These queries are NOT intended to be configurable or overridable via jsonnet, // but they're defined in a common place just to share them between different dashboards.
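For orientation, this is roughly the classic-histogram query text the new nativeClassicFailureRate helper renders for a given metric and selector; a sketch only, since it assumes utils.nativeClassicHistogramCountRate(metric, selector).classic expands to rate(<metric>_count{<selector>}[$__rate_interval]), an expansion this diff does not show:

  (
    # gRPC errors are not tracked as 5xx but "error".
    sum(rate(cortex_request_duration_seconds_count{<selector>,status_code=~"5.*|error"}[$__rate_interval]))
    or
    # Handle the case no failure has been tracked yet.
    vector(0)
  )
  /
  sum(rate(cortex_request_duration_seconds_count{<selector>}[$__rate_interval]))

Modulo the selector, this is the same text as the inlined writeFailuresRate/readFailuresRate blocks that the hunks below delete, which is what makes the refactor behavior-preserving for the classic variant while adding a native-histogram twin.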
@@ -25,55 +51,43 @@ query_http_routes_regex: '(prometheus|api_prom)_api_v1_query(_range)?', gateway: { + // deprecated, will be removed writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}' % variables, readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, + local p = self, + requestsPerSecondMetric: 'cortex_request_duration_seconds', + writeRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"' % variables, + readRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, + // Write failures rate as percentage of total requests. - writeFailuresRate: ||| - ( - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) - or - # Handle the case no failure has been tracked yet. - vector(0) - ) - / - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + writeFailuresRate: $.nativeClassicFailureRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector), // Read failures rate as percentage of total requests. - readFailuresRate: ||| - ( - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) - or - # Handle the case no failure has been tracked yet. - vector(0) - ) - / - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + readFailuresRate: $.nativeClassicFailureRate(p.requestsPerSecondMetric, p.readRequestsPerSecondSelector), }, distributor: { + // deprecated, will be removed writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}' % variables, + + local p = self, + requestsPerSecondMetric: 'cortex_request_duration_seconds', + writeRequestsPerSecondSelector: '%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"' % variables, samplesPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_samples:rate5m{%(distributorMatcher)s})' % variables, exemplarsPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_exemplars:rate5m{%(distributorMatcher)s})' % variables, // Write failures rate as percentage of total requests. - writeFailuresRate: ||| - ( - # gRPC errors are not tracked as 5xx but "error". - sum(rate(cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s",status_code=~"5.*|error"}[$__rate_interval])) - or - # Handle the case no failure has been tracked yet. 
- vector(0) - ) - / - sum(rate(cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + writeFailuresRate: $.nativeClassicFailureRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector), }, query_frontend: { + // deprecated, will be removed readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, + + local p = self, + readRequestsPerSecondMetric: 'cortex_request_duration_seconds', + readRequestsPerSecondSelector: '%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, // These query routes are used in the overview and other dashboards, everything else is considered "other" queries. // Has to be a list to keep the same colors as before, see overridesNonErrorColorsPalette. local overviewRoutes = [ @@ -124,16 +138,7 @@ labelValuesCardinalityQueriesPerSecond: queryPerSecond('labelValuesCardinality'), // Read failures rate as percentage of total requests. - readFailuresRate: ||| - ( - sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) - or - # Handle the case no failure has been tracked yet. - vector(0) - ) - / - sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + readFailuresRate: $.nativeClassicFailureRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector), }, ruler: { diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index 47a347cb..a5df3935 100644 --- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -596,179 +596,160 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.latencyPanel('cortex_kv_request_duration_seconds', '{%s, kv_name=~"%s"}' % [$.jobMatcher($._config.job_names[jobName]), kvName]) ), - cpuAndMemoryBasedAutoScalingRow(componentTitle):: - local component = std.asciiLower(componentTitle); - local field = std.strReplace(component, '-', '_'); - super.row('%s - autoscaling' % [componentTitle]) - .addPanel( - local title = 'Replicas'; - $.timeseriesPanel(title) + - $.queryPanel( - [ - ||| - max by (scaletargetref_name) ( - kube_horizontalpodautoscaler_spec_max_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - # Add the scaletargetref_name label for readability - + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name) - 0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - ) - ||| % { - namespace_matcher: $.namespaceMatcher(), - hpa_name: $._config.autoscaling[field].hpa_name, - cluster_labels: std.join(', ', $._config.cluster_labels), - }, - ||| - max by (scaletargetref_name) ( - kube_horizontalpodautoscaler_status_current_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"} - # HPA doesn't go to 0 replicas, so we multiply by 0 if the HPA is not active - * on (%(cluster_labels)s, horizontalpodautoscaler) -
-              kube_horizontalpodautoscaler_status_condition{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s", condition="ScalingActive", status="true"}
-              # Add the scaletargetref_name label for readability
-              + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name)
-              0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"}
-            )
-          ||| % {
-            namespace_matcher: $.namespaceMatcher(),
-            hpa_name: $._config.autoscaling[field].hpa_name,
-            cluster_labels: std.join(', ', $._config.cluster_labels),
-          },
-          |||
-            max by (scaletargetref_name) (
-              kube_horizontalpodautoscaler_spec_min_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"}
-              # Add the scaletargetref_name label for readability
-              + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name)
-              0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"}
-            )
-          ||| % {
-            namespace_matcher: $.namespaceMatcher(),
-            hpa_name: $._config.autoscaling[field].hpa_name,
-            cluster_labels: std.join(', ', $._config.cluster_labels),
-          },
-        ],
-        [
-          'Max {{ scaletargetref_name }}',
-          'Current {{ scaletargetref_name }}',
-          'Min {{ scaletargetref_name }}',
-        ],
-      ) +
-      $.panelDescription(
-        title,
+  // The provided componentName should be the name of a component among the ones defined in $._config.autoscaling.
+  autoScalingActualReplicas(componentName)::
+    local title = 'Replicas';
+    local componentTitle = std.strReplace(componentName, '_', '-');
+
+    $.timeseriesPanel(title) +
+    $.queryPanel(
+      [
+        |||
-          The maximum and current number of %s replicas.
-          Note: The current number of replicas can still show 1 replica even when scaled to 0.
-          Because HPA never reports 0 replicas, the query will report 0 only if the HPA is not active.
-        ||| % [component]
-      ) +
-      {
-        fieldConfig+: {
-          overrides: [
-            $.overrideField('byRegexp', '/Max .+/', [
-              $.overrideProperty('custom.fillOpacity', 0),
-              $.overrideProperty('custom.lineStyle', { fill: 'dash' }),
-            ]),
-            $.overrideField('byRegexp', '/Current .+/', [
-              $.overrideProperty('custom.fillOpacity', 0),
-            ]),
-            $.overrideField('byRegexp', '/Min .+/', [
-              $.overrideProperty('custom.fillOpacity', 0),
-              $.overrideProperty('custom.lineStyle', { fill: 'dash' }),
-            ]),
-          ],
+          max by (scaletargetref_name) (
+            kube_horizontalpodautoscaler_spec_max_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"}
+            # Add the scaletargetref_name label for readability
+            + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name)
+            0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"}
+          )
+        ||| % {
+          namespace_matcher: $.namespaceMatcher(),
+          hpa_name: $._config.autoscaling[componentName].hpa_name,
+          cluster_labels: std.join(', ', $._config.cluster_labels),
+        },
+        |||
+          max by (scaletargetref_name) (
+            kube_horizontalpodautoscaler_status_current_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"}
+            # HPA doesn't go to 0 replicas, so we multiply by 0 if the HPA is not active
+            * on (%(cluster_labels)s, horizontalpodautoscaler)
+            kube_horizontalpodautoscaler_status_condition{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s", condition="ScalingActive", status="true"}
+            # Add the scaletargetref_name label for readability
+            + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name)
+            0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"}
+          )
+        ||| % {
+          namespace_matcher: $.namespaceMatcher(),
+          hpa_name: $._config.autoscaling[componentName].hpa_name,
+          cluster_labels: std.join(', ', $._config.cluster_labels),
+        },
+        |||
+          max by (scaletargetref_name) (
+            kube_horizontalpodautoscaler_spec_min_replicas{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"}
+            # Add the scaletargetref_name label for readability
+            + on (%(cluster_labels)s, horizontalpodautoscaler) group_left (scaletargetref_name)
+            0*kube_horizontalpodautoscaler_info{%(namespace_matcher)s, horizontalpodautoscaler=~"%(hpa_name)s"}
+          )
+        ||| % {
+          namespace_matcher: $.namespaceMatcher(),
+          hpa_name: $._config.autoscaling[componentName].hpa_name,
+          cluster_labels: std.join(', ', $._config.cluster_labels),
+        },
+      ],
+      [
+        'Max {{ scaletargetref_name }}',
+        'Current {{ scaletargetref_name }}',
+        'Min {{ scaletargetref_name }}',
+      ],
+    ) +
+    $.panelDescription(
+      title,
+      |||
+        The maximum and current number of %s replicas.
+
+        Note: The current number of replicas can still show 1 replica even when scaled to 0.
+        Because HPA never reports 0 replicas, the query will report 0 only if the HPA is not active.
+      ||| % [componentTitle]
+    ) +
+    {
+      fieldConfig+: {
+        overrides: [
+          $.overrideField('byRegexp', '/Max .+/', [
+            $.overrideProperty('custom.fillOpacity', 0),
+            $.overrideProperty('custom.lineStyle', { fill: 'dash' }),
+          ]),
+          $.overrideField('byRegexp', '/Current .+/', [
+            $.overrideProperty('custom.fillOpacity', 0),
+          ]),
+          $.overrideField('byRegexp', '/Min .+/', [
+            $.overrideProperty('custom.fillOpacity', 0),
+            $.overrideProperty('custom.lineStyle', { fill: 'dash' }),
+          ]),
+        ],
       },
-    )
-    .addPanel(
-      local title = 'Scaling metric (CPU): Desired replicas';
-      $.timeseriesPanel(title) +
-      $.queryPanel(
-        [
-          |||
-            sum by (scaler) (
+      },
+    },
+
+  // The provided componentName should be the name of a component among the ones defined in $._config.autoscaling.
+  autoScalingDesiredReplicasByScalingMetricPanel(componentName, scalingMetricName, scalingMetricID)::
+    local title = if scalingMetricName != '' then 'Scaling metric (%s): Desired replicas' % scalingMetricName else 'Desired replicas';
+    local scalerSelector = if scalingMetricID != '' then ('.*%s.*' % scalingMetricID) else '.+';
+
+    $.timeseriesPanel(title) +
+    $.queryPanel(
+      [
+        |||
+          sum by (scaler) (
+            label_replace(
+              keda_scaler_metrics_value{%(cluster_label)s=~"$cluster", exported_namespace=~"$namespace", scaler=~"%(scaler_selector)s"},
+              "namespace", "$1", "exported_namespace", "(.*)"
+            )
+            /
+            on(%(aggregation_labels)s, scaledObject, metric) group_left label_replace(
              label_replace(
-                keda_scaler_metrics_value{%(cluster_label)s=~"$cluster", exported_namespace=~"$namespace", scaler=~".*cpu.*"},
-                "namespace", "$1", "exported_namespace", "(.*)"
-              )
-              /
-              on(%(aggregation_labels)s, scaledObject, metric) group_left label_replace(
-                label_replace(
-                  kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"},
-                  "metric", "$1", "metric_name", "(.+)"
-                ),
-                "scaledObject", "$1", "horizontalpodautoscaler", "%(hpa_prefix)s(.*)"
-              )
+                kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"},
+                "metric", "$1", "metric_name", "(.+)"
+              ),
+              "scaledObject", "$1", "horizontalpodautoscaler", "%(hpa_prefix)s(.*)"
              )
-          ||| % {
-            aggregation_labels: $._config.alert_aggregation_labels,
-            cluster_label: $._config.per_cluster_label,
-            hpa_prefix: $._config.autoscaling_hpa_prefix,
-            hpa_name: $._config.autoscaling[field].hpa_name,
-            namespace: $.namespaceMatcher(),
-          },
-        ], [
-          '{{ scaler }}',
-        ]
-      ) +
-      $.panelDescription(
-        title,
-        |||
-          This panel shows the scaling metric exposed by KEDA divided by the target/threshold used.
-          It should represent the desired number of replicas, ignoring the min/max constraints applied later.
-        |||
-      ),
+          )
+        ||| % {
+          aggregation_labels: $._config.alert_aggregation_labels,
+          cluster_label: $._config.per_cluster_label,
+          hpa_prefix: $._config.autoscaling_hpa_prefix,
+          hpa_name: $._config.autoscaling[componentName].hpa_name,
+          namespace: $.namespaceMatcher(),
+          scaler_selector: scalerSelector,
+        },
+      ], [
+        '{{ scaler }}',
+      ]
+    ) +
+    $.panelDescription(
+      title,
+      |||
+        This panel shows the scaling metric exposed by KEDA divided by the target/threshold used.
+        It should represent the desired number of replicas, ignoring the min/max constraints applied later.
+      |||
+    ),
+
+  // The provided componentName should be the name of a component among the ones defined in $._config.autoscaling.
+  autoScalingFailuresPanel(componentName)::
+    local title = 'Autoscaler failures rate';
+
+    $.timeseriesPanel(title) +
+    $.queryPanel(
+      $.filterKedaScalerErrorsByHPA($._config.autoscaling[componentName].hpa_name),
+      '{{scaler}} failures'
+    ) +
+    $.panelDescription(
+      title,
+      |||
+        The rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom
+        metrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.
+      |||
+    ),
+
+  cpuAndMemoryBasedAutoScalingRow(componentTitle)::
+    local componentName = std.strReplace(std.asciiLower(componentTitle), '-', '_');
+    super.row('%s – autoscaling' % [componentTitle])
+    .addPanel(
+      $.autoScalingActualReplicas(componentName)
     )
     .addPanel(
-      local title = 'Scaling metric (memory): Desired replicas';
-      $.timeseriesPanel(title) +
-      $.queryPanel(
-        [
-          |||
-            sum by (scaler) (
-              label_replace(
-                keda_scaler_metrics_value{%(cluster_label)s=~"$cluster", exported_namespace=~"$namespace", scaler=~".*memory.*"},
-                "namespace", "$1", "exported_namespace", "(.*)"
-              )
-              /
-              on(%(aggregation_labels)s, scaledObject, metric) group_left label_replace(
-                label_replace(
-                  kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"},
-                  "metric", "$1", "metric_name", "(.+)"
-                ),
-                "scaledObject", "$1", "horizontalpodautoscaler", "%(hpa_prefix)s(.*)"
-              )
-            )
-          ||| % {
-            aggregation_labels: $._config.alert_aggregation_labels,
-            cluster_label: $._config.per_cluster_label,
-            hpa_prefix: $._config.autoscaling_hpa_prefix,
-            hpa_name: $._config.autoscaling[field].hpa_name,
-            namespace: $.namespaceMatcher(),
-          },
-        ], [
-          '{{ scaler }}',
-        ]
-      ) +
-      $.panelDescription(
-        title,
-        |||
-          This panel shows the scaling metric exposed by KEDA divided by the target/threshold used.
-          It should represent the desired number of replicas, ignoring the min/max constraints applied later.
-        |||
-      ),
+      $.autoScalingDesiredReplicasByScalingMetricPanel(componentName, 'CPU', 'cpu')
     )
     .addPanel(
-      local title = 'Autoscaler failures rate';
-      $.timeseriesPanel(title) +
-      $.queryPanel(
-        $.filterKedaScalerErrorsByHPA($._config.autoscaling[field].hpa_name),
-        '{{scaler}} failures'
-      ) +
-      $.panelDescription(
-        title,
-        |||
-          The rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom
-          metrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.
-        |||
-      ),
+      $.autoScalingDesiredReplicasByScalingMetricPanel(componentName, 'memory', 'memory')
+    )
+    .addPanel(
+      $.autoScalingFailuresPanel(componentName)
     ),

   newStatPanel(queries, legends='', unit='percentunit', decimals=1, thresholds=[], instant=false, novalue='')::
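The refactor above turns the old monolithic autoscaling row into three reusable panel builders plus the cpuAndMemoryBasedAutoScalingRow convenience wrapper, all keyed by a component name defined in $._config.autoscaling. A minimal usage sketch, assuming `$` mixes in the helpers above (the ruler_querier key and the 'queries' scaler ID mirror the remote-ruler-reads change further down; treat the exact row layout as illustrative):

  .addRowIf(
    $._config.autoscaling.ruler_querier.enabled,
    $.row('Ruler-querier - autoscaling')
    // Current/min/max replicas as reported by the HPA.
    .addPanel($.autoScalingActualReplicas('ruler_querier'))
    // Desired replicas for one KEDA scaler; the last argument is a fragment
    // matched against the scaler name (scaler=~".*queries.*").
    .addPanel($.autoScalingDesiredReplicasByScalingMetricPanel('ruler_querier', 'in-flight queries', 'queries'))
    // Errors returned by the KEDA custom metrics API server.
    .addPanel($.autoScalingFailuresPanel('ruler_querier'))
  )

Passing empty strings for the metric name and ID (as the writes dashboard below does for the ingest-storage ingester) falls back to the generic 'Desired replicas' title and a catch-all scaler=~".+" selector.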
diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet
index bbd038ca..233fd6e5 100644
--- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet
+++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/overview.libsonnet
@@ -33,6 +33,7 @@ local filename = 'mimir-overview.json';
   assert std.md5(filename) == 'ffcd83628d7d4b5a03d1cafd159e6c9c' : 'UID of the dashboard has changed, please update references to dashboard.';
   ($.dashboard('Overview') + { uid: std.md5(filename) })
   .addClusterSelectorTemplates()
+  .addShowNativeLatencyVariable()
   .addRow(
     $.row('%(product)s cluster health' % $._config)
@@ -53,9 +54,21 @@ local filename = 'mimir-overview.json';
         'Status',
         [
           // Write failures.
-          if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate else $.queries.distributor.writeFailuresRate,
+          utils.showNativeHistogramQuery(
+            if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate else $.queries.distributor.writeFailuresRate
+          ),
+          // Write failures but from classic histograms.
+          utils.showClassicHistogramQuery(
+            if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate else $.queries.distributor.writeFailuresRate
+          ),
           // Read failures.
-          if $._config.gateway_enabled then $.queries.gateway.readFailuresRate else $.queries.query_frontend.readFailuresRate,
+          utils.showNativeHistogramQuery(
+            if $._config.gateway_enabled then $.queries.gateway.readFailuresRate else $.queries.query_frontend.readFailuresRate,
+          ),
+          // Read failures but from classic histograms.
+          utils.showClassicHistogramQuery(
+            if $._config.gateway_enabled then $.queries.gateway.readFailuresRate else $.queries.query_frontend.readFailuresRate,
+          ),
           // Rule evaluation failures.
           $.queries.ruler.evaluations.failuresRate,
           // Alerting notifications.
@@ -84,7 +97,7 @@ local filename = 'mimir-overview.json';
           // Object storage failures.
           $.queries.storage.failuresRate,
         ],
-        ['Writes', 'Reads', 'Rule evaluations', 'Alerting notifications', 'Object storage']
+        ['Writes', 'Writes', 'Reads', 'Reads', 'Rule evaluations', 'Alerting notifications', 'Object storage']
       )
     )
     .addPanel(
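This is why each status legend now appears twice ('Writes', 'Writes', 'Reads', 'Reads'): every failure-rate query is emitted once per histogram flavour, and the flavour not selected by the new latency variable is expected to return no data. A sketch of how the vendored mixin-utils wrappers plausibly achieve that (paraphrased, not the verbatim vendored code):

  // $latency_metrics is the dashboard variable added by
  // addShowNativeLatencyVariable(), set to 1 (classic) or -1 (native).
  // `x < +Inf` always holds and `x < -Inf` never does, so exactly one of
  // the two wrapped queries survives for any value of the variable.
  showClassicHistogramQuery(query):: '%s < ($latency_metrics * +Inf)' % query,
  showNativeHistogramQuery(query):: '%s < ($latency_metrics * -Inf)' % query,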

diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads-resources.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads-resources.libsonnet
index ee82ed22..df04952f 100644
--- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads-resources.libsonnet
+++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads-resources.libsonnet
@@ -7,7 +7,7 @@ local filename = 'mimir-remote-ruler-reads-resources.json';
 ($.dashboard('Remote ruler reads resources') + { uid: std.md5(filename) })
 .addClusterSelectorTemplates(false)
 .addRow(
-  $.row('Query-frontend (dedicated to ruler)')
+  $.row('Ruler-query-frontend')
   .addPanel(
     $.containerCPUUsagePanelByComponent('ruler_query_frontend'),
   )
@@ -19,7 +19,7 @@ local filename = 'mimir-remote-ruler-reads-resources.json';
   )
 )
 .addRow(
-  $.row('Query-scheduler (dedicated to ruler)')
+  $.row('Ruler-query-scheduler')
   .addPanel(
     $.containerCPUUsagePanelByComponent('ruler_query_scheduler'),
   )
@@ -31,7 +31,7 @@ local filename = 'mimir-remote-ruler-reads-resources.json';
   )
 )
 .addRow(
-  $.row('Querier (dedicated to ruler)')
+  $.row('Ruler-querier')
   .addPanel(
     $.containerCPUUsagePanelByComponent('ruler_querier'),
   )
diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet
index df1f48f4..87164d27 100644
--- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet
+++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet
@@ -54,7 +54,7 @@ local filename = 'mimir-remote-ruler-reads.json';
     )
   )
   .addRow(
-    $.row('Query-frontend (dedicated to ruler)')
+    $.row('Ruler-query-frontend')
     .addPanel(
       $.timeseriesPanel('Requests / sec') +
       $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.ruler_query_frontend), rulerRoutesRegex])
@@ -80,7 +80,7 @@ local filename = 'mimir-remote-ruler-reads.json';
       these panels will show "No data."

     |||;
-    $.row('Query-scheduler (dedicated to ruler)')
+    $.row('Ruler-query-scheduler')
     .addPanel(
       local title = 'Requests / sec';
       $.timeseriesPanel(title) +
@@ -127,7 +127,7 @@ local filename = 'mimir-remote-ruler-reads.json';
         regex: '^$',
       },
     ];
-    $.row('Query-scheduler Latency (Time in Queue) Breakout by Additional Queue Dimensions')
+    $.row('Ruler-query-scheduler Latency (Time in Queue) Breakout by Additional Queue Dimensions')
     .addPanel(
       local title = '99th Percentile Latency by Queue Dimension';
       $.timeseriesPanel(title) +
@@ -169,7 +169,7 @@ local filename = 'mimir-remote-ruler-reads.json';
     )
   )
   .addRow(
-    $.row('Querier (dedicated to ruler)')
+    $.row('Ruler-querier')
     .addPanel(
       $.timeseriesPanel('Requests / sec') +
       $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.ruler_querier), $.queries.read_http_routes_regex])
@@ -187,6 +187,25 @@ local filename = 'mimir-remote-ruler-reads.json';
     )
     .addRowIf(
       $._config.autoscaling.ruler_querier.enabled,
-      $.cpuAndMemoryBasedAutoScalingRow('Ruler-Querier'),
+      $.row('Ruler-querier - autoscaling')
+      .addPanel(
+        $.autoScalingActualReplicas('ruler_querier')
+      )
+      .addPanel(
+        $.autoScalingFailuresPanel('ruler_querier')
+      )
+    )
+    .addRowIf(
+      $._config.autoscaling.ruler_querier.enabled,
+      $.row('')
+      .addPanel(
+        $.autoScalingDesiredReplicasByScalingMetricPanel('ruler_querier', 'CPU', 'cpu')
+      )
+      .addPanel(
+        $.autoScalingDesiredReplicasByScalingMetricPanel('ruler_querier', 'memory', 'memory')
+      )
+      .addPanel(
+        $.autoScalingDesiredReplicasByScalingMetricPanel('ruler_querier', 'in-flight queries', 'queries')
+      )
     ),
 }
diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet
index 61205581..44de3e1b 100644
--- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet
+++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/dashboards/writes.libsonnet
@@ -440,6 +440,41 @@ local filename = 'mimir-writes.json';
     $._config.autoscaling.distributor.enabled,
     $.cpuAndMemoryBasedAutoScalingRow('Distributor'),
   )
+  .addRowIf(
+    $._config.show_ingest_storage_panels && $._config.autoscaling.ingester.enabled,
+    $.row('Ingester (ingest storage) – autoscaling')
+    .addPanel(
+      $.autoScalingActualReplicas('ingester') + { title: 'Replicas (ReplicaTemplate)' } +
+      $.panelDescription(
+        'Replicas (ReplicaTemplate)',
+        |||
+          The maximum and current number of replicas for the ReplicaTemplate object.
+          Rollout-operator will keep ingester replicas updated based on this object.
+
+          Note: The current number of replicas can still show 1 replica even when scaled to 0.
+          Because HPA never reports 0 replicas, the query will report 0 only if the HPA is not active.
+        |||
+      )
+    )
+    .addPanel(
+      $.timeseriesPanel('Replicas') +
+      $.panelDescription('Replicas', 'Number of ingester replicas.') +
+      $.queryPanel(
+        [
+          'sum by (%s) (up{%s})' % [$._config.per_job_label, $.jobMatcher($._config.job_names.ingester)],
+        ],
+        [
+          '{{ %(per_job_label)s }}' % $._config.per_job_label,
+        ],
+      ),
+    )
+    .addPanel(
+      $.autoScalingDesiredReplicasByScalingMetricPanel('ingester', '', '') + { title: 'Desired replicas (ReplicaTemplate)' }
+    )
+    .addPanel(
+      $.autoScalingFailuresPanel('ingester') + { title: 'Autoscaler failures rate (ReplicaTemplate)' }
+    ),
+  )
   .addRow(
     $.kvStoreRow('Distributor - key-value store for high-availability (HA) deduplication', 'distributor', 'distributor-hatracker')
   )
diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json
index 3cad418e..f359690c 100644
--- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json
+++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/jsonnetfile.lock.json
@@ -8,7 +8,7 @@
         "subdir": "grafana-builder"
       }
     },
-    "version": "7561fd330312538d22b00e0c7caecb4ba66321ea",
+    "version": "653836fa707a591b36fe490be350b1540e4f14ce",
     "sum": "+z5VY+bPBNqXcmNAV8xbJcbsRA+pro1R3IM7aIY8OlU="
   },
   {
@@ -18,8 +18,8 @@
         "subdir": "mixin-utils"
       }
     },
-    "version": "7561fd330312538d22b00e0c7caecb4ba66321ea",
-    "sum": "0jg7qc3N8FtMnnQbunYCGSNcjHr9Y1krZW9OSTmWcEQ="
+    "version": "653836fa707a591b36fe490be350b1540e4f14ce",
+    "sum": "wi1o6t5nUZVcsatqMdGLOWIv1HxNnlaE84oRE0Cl0ec="
   }
 ],
 "legacyImports": false
diff --git a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/recording_rules.libsonnet b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/recording_rules.libsonnet
index 567bae04..5f0bb2a6 100644
--- a/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/recording_rules.libsonnet
+++ b/monitoring-mixins/loki-mixin/vendor/github.com/grafana/mimir/operations/mimir-mixin/recording_rules.libsonnet
@@ -333,9 +333,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
       rules: [
         {
           // cortex_ingester_ingested_samples_total is per user, in this rule we want to see the sum per cluster/namespace/instance
-          record: '%s_%s:cortex_ingester_ingested_samples_total:rate1m' % [$._config.alert_aggregation_rule_prefix, $._config.per_instance_label],
+          record: '%s_%s:cortex_ingester_ingested_samples_total:rate%s' % [$._config.alert_aggregation_rule_prefix, $._config.per_instance_label, $._config.recording_rules_range_interval],
           expr: |||
-            sum by(%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingester_ingested_samples_total[1m]))
+            sum by(%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingester_ingested_samples_total[%(recording_rules_range_interval)s]))
           ||| % $._config,
         },
       ],
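To see what the parameterized rule evaluates to, assume recording_rules_range_interval: '5m' together with illustrative values alert_aggregation_rule_prefix: 'cluster_namespace', per_instance_label: 'pod', and alert_aggregation_labels: 'cluster, namespace' (none of these values are taken from this diff); the rendered recording rule would be roughly:

  record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate5m
  expr: sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[5m]))

The point of the change is that the record suffix and the rate() range stay in lockstep when the interval is reconfigured, instead of both being hard-coded to 1m.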