Skip to content

Commit

Permalink
fix: mixin generation when cluster label is changed (#12613)
Browse files Browse the repository at this point in the history
Signed-off-by: QuentinBisson <[email protected]>
  • Loading branch information
QuentinBisson authored Apr 19, 2024
1 parent ffe684c commit 1ba7a30
Show file tree
Hide file tree
Showing 7 changed files with 54 additions and 42 deletions.
8 changes: 4 additions & 4 deletions production/loki-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,17 @@
{
alert: 'LokiTooManyCompactorsRunning',
expr: |||
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
|||,
sum(loki_boltdb_shipper_compactor_running) by (namespace, %s) > 1
||| % $._config.per_cluster_label,
'for': '5m',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Loki deployment is running more than one compactor.',
description: |||
description: std.strReplace(|||
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
|||,
|||, 'cluster', $._config.per_cluster_label),
},
},
],
Expand Down
48 changes: 24 additions & 24 deletions production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,16 @@ local grafana = import 'grafonnet/grafana.libsonnet';
// This logic is inherited from mimir-mixin.
dashboard.dashboard('Canary')
// We can't make use of simplified template selectors from the loki dashboard utils until we port the cortex dashboard utils panel/grid functionality.
.addTemplate('cluster', 'loki_build_info', 'cluster')
.addTemplate('namespace', 'loki_build_info{cluster=~"$cluster"}', 'namespace')
.addTemplate('cluster', 'loki_build_info', $._config.per_cluster_label)
.addTemplate('namespace', 'loki_build_info{' + $._config.per_cluster_label + '=~"$cluster"}', 'namespace')
+ {
// This dashboard uses the new grid system in order to place panels (using gridPos).
// Because of this we can't use the mixin's addRow() and addPanel().
schemaVersion: 27,
rows: null,
// ugly hack, copy pasta the tag/link
// code from the loki-mixin
tags: ['loki'],
tags: $._config.tags,
links: [
{
asDropdown: true,
Expand All @@ -49,60 +49,60 @@ local grafana = import 'grafonnet/grafana.libsonnet';
panels: [
// grid row 1
dashboard.panel('Canary Entries Total') +
dashboard.newStatPanel('sum(count(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}))', unit='short') +
dashboard.newStatPanel('sum(count(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster", namespace=~"$namespace"}))', unit='short') +
{ gridPos: { h: 4, w: 3, x: 0, y: 0 } },

dashboard.panel('Canary Logs Total') +
dashboard.newStatPanel('sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
dashboard.newStatPanel('sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
{ gridPos: { h: 4, w: 3, x: 3, y: 0 } },

dashboard.panel('Missing') +
dashboard.newStatPanel('sum(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
dashboard.newStatPanel('sum(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
{ gridPos: { h: 4, w: 3, x: 6, y: 0 } },

dashboard.panel('Spotcheck Missing') +
dashboard.newStatPanel('sum(increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
dashboard.newStatPanel('sum(increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
{ gridPos: { h: 4, w: 3, x: 9, y: 0 } },

// grid row 2
dashboard.panel('Spotcheck Total') +
dashboard.newStatPanel('sum(increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
dashboard.newStatPanel('sum(increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
{ gridPos: { h: 4, w: 3, x: 0, y: 4 } },

dashboard.panel('Metric Test Error %') +
dashboard.newStatPanel('((sum(loki_canary_metric_test_expected{cluster=~"$cluster",namespace=~"$namespace"}) - sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"}))/(sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"}))) * 100') +
dashboard.newStatPanel('((sum(loki_canary_metric_test_expected{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}) - sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}))/(sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}))) * 100') +
{ gridPos: { h: 4, w: 3, x: 3, y: 4 } },

dashboard.panel('Missing %') +
dashboard.newStatPanel('(sum(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range])))*100') +
dashboard.newStatPanel('(sum(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range])))*100') +
{ gridPos: { h: 4, w: 3, x: 6, y: 4 } },

dashboard.panel('Spotcheck Missing %') +
dashboard.newStatPanel('(sum(increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))) * 100') +
dashboard.newStatPanel('(sum(increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))) * 100') +
{ gridPos: { h: 4, w: 3, x: 9, y: 4 } },

// grid row 3
dashboard.panel('Metric Test Expected') +
dashboard.newStatPanel('sum(loki_canary_metric_test_expected{cluster=~"$cluster",namespace=~"$namespace"})', unit='short') +
dashboard.newStatPanel('sum(loki_canary_metric_test_expected{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"})', unit='short') +
{ gridPos: { h: 4, w: 3, x: 0, y: 8 } },

dashboard.panel('Metric Test Actual') +
dashboard.newStatPanel('sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"})', unit='short') +
dashboard.newStatPanel('sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"})', unit='short') +
{ gridPos: { h: 4, w: 3, x: 3, y: 8 } },

dashboard.panel('Websocket Missing') +
dashboard.newStatPanel('sum(increase(loki_canary_websocket_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
dashboard.newStatPanel('sum(increase(loki_canary_websocket_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
{ gridPos: { h: 4, w: 3, x: 6, y: 8 } },

dashboard.panel('Websocket Missing %') +
dashboard.newStatPanel('(sum(increase(loki_canary_websocket_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range])))*100') +
dashboard.newStatPanel('(sum(increase(loki_canary_websocket_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range])))*100') +
{ gridPos: { h: 4, w: 3, x: 9, y: 8 } },
// end of grid

dashboard.panel('Log Write to read Latency Percentiles') +
dashboard.queryPanel([
'histogram_quantile(0.95, sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
'histogram_quantile(0.50, sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
'histogram_quantile(0.95, sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
'histogram_quantile(0.50, sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
], ['p95', 'p50']) +
{ gridPos: { h: 6, w: 12, x: 12, y: 0 } },

Expand All @@ -115,7 +115,7 @@ local grafana = import 'grafonnet/grafana.libsonnet';
).addTargets(
[
grafana.prometheus.target(
'sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le)',
'sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le)',
legendFormat='{{le}}',
format='heatmap',
),
Expand All @@ -125,24 +125,24 @@ local grafana = import 'grafonnet/grafana.libsonnet';

dashboard.panel('Spot Check Query') +
dashboard.queryPanel([
'histogram_quantile(0.99, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
'histogram_quantile(0.50, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
'histogram_quantile(0.99, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
'histogram_quantile(0.50, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
], ['p99', 'p95']) +
{ gridPos: { h: 6, w: 12, x: 0, y: 14 } },

dashboard.panel('Metric Test Query') +
dashboard.queryPanel([
'histogram_quantile(0.99, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[15m])) by (le))',
'histogram_quantile(0.50, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[15m])) by (le))',
'histogram_quantile(0.99, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[15m])) by (le))',
'histogram_quantile(0.50, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[15m])) by (le))',
], ['p99', 'p95'],) +
{ gridPos: { h: 6, w: 12, x: 12, y: 14 } },

dashboard.panel('Spot Check Missing %') +
dashboard.queryPanel('topk(20, (sum by (cluster, pod) (increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (cluster, pod) (increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) * 100)) > 0', '') +
dashboard.queryPanel('topk(20, (sum by (' + $._config.per_cluster_label + ', pod) (increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (' + $._config.per_cluster_label + ', pod) (increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) * 100)) > 0', '') +
{ gridPos: { h: 6, w: 12, x: 0, y: 20 } },

g.panel('Missing logs') +
g.queryPanel('topk(20,(sum by (cluster, pod)(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (cluster, pod)(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])))*100) > 0', 'Missing {{ cluster }} {{ pod }}') +
g.queryPanel('topk(20,(sum by (' + $._config.per_cluster_label + ', pod)(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (' + $._config.per_cluster_label + ', pod)(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])))*100) > 0', 'Missing {{ ' + $._config.per_cluster_label + ' }} {{ pod }}') +
{ gridPos: { h: 6, w: 12, x: 12, y: 20 } },

],
Expand Down
6 changes: 3 additions & 3 deletions production/loki-mixin/dashboards/loki-logs.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ local template = import 'grafonnet/template.libsonnet';
local cfg = self,

showMultiCluster:: true,
clusterLabel:: $._config.per_cluster_label,

} + lokiLogs +
$.dashboard('Loki / Logs', uid='logs')
Expand All @@ -61,8 +60,9 @@ local template = import 'grafonnet/template.libsonnet';
p {
targets: [
e {
expr: if dashboards['loki-logs.json'].showMultiCluster then super.expr
else std.strReplace(super.expr, $._config.per_cluster_label + '="$cluster", ', ''),
expr: if dashboards['loki-logs.json'].showMultiCluster
then std.strReplace(super.expr, 'cluster="$cluster"', $._config.per_cluster_label + '="$cluster"')
else std.strReplace(super.expr, 'cluster="$cluster", ', ''),
}
for e in p.targets
],
Expand Down
26 changes: 20 additions & 6 deletions production/loki-mixin/dashboards/loki-operational.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ local utils = import 'mixin-utils/utils.libsonnet';
showAnnotations:: true,
showLinks:: true,
showMultiCluster:: true,
clusterLabel:: $._config.per_cluster_label,

hiddenRows:: [
'Cassandra',
Expand Down Expand Up @@ -62,7 +61,22 @@ local utils = import 'mixin-utils/utils.libsonnet';

local replaceClusterMatchers(expr) =
if dashboards['loki-operational.json'].showMultiCluster
then expr
// Replace the recording rules cluster label with the per-cluster label
then std.strReplace(
// Replace the cluster label for equality matchers with the per-cluster label
std.strReplace(
// Replace the cluster label for regex matchers with the per-cluster label
std.strReplace(
expr,
'cluster=~"$cluster"',
$._config.per_cluster_label + '=~"$cluster"'
),
'cluster="$cluster"',
$._config.per_cluster_label + '="$cluster"'
),
'cluster_',
$._config.per_cluster_label + '_'
)
else
std.strReplace(
std.strReplace(
Expand Down Expand Up @@ -143,7 +157,7 @@ local utils = import 'mixin-utils/utils.libsonnet';


local replaceAllMatchers(expr) =
replaceMatchers(replaceClusterMatchers(expr)),
replaceMatchers(expr),

local selectDatasource(ds) =
if ds == null || ds == '' then ds
Expand Down Expand Up @@ -179,7 +193,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
datasource: selectDatasource(super.datasource),
targets: if std.objectHas(p, 'targets') then [
e {
expr: removeInternalComponents(p.title, e.expr),
expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)),
}
for e in p.targets
] else [],
Expand All @@ -188,7 +202,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
datasource: selectDatasource(super.datasource),
targets: if std.objectHas(sp, 'targets') then [
e {
expr: removeInternalComponents(p.title, e.expr),
expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)),
}
for e in sp.targets
] else [],
Expand All @@ -197,7 +211,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
datasource: selectDatasource(super.datasource),
targets: if std.objectHas(ssp, 'targets') then [
e {
expr: removeInternalComponents(p.title, e.expr),
expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)),
}
for e in ssp.targets
] else [],
Expand Down
3 changes: 1 addition & 2 deletions production/loki-mixin/dashboards/loki-reads.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
local cfg = self,

showMultiCluster:: true,
clusterLabel:: $._config.per_cluster_label,
clusterMatchers::
if cfg.showMultiCluster then
[utils.selector.re(cfg.clusterLabel, '$cluster')]
[utils.selector.re($._config.per_cluster_label, '$cluster')]
else
[],

Expand Down
3 changes: 1 addition & 2 deletions production/loki-mixin/dashboards/loki-writes.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
local cfg = self,

showMultiCluster:: true,
clusterLabel:: $._config.per_cluster_label,
clusterMatchers::
if cfg.showMultiCluster then
[utils.selector.re(cfg.clusterLabel, '$cluster')]
[utils.selector.re($._config.per_cluster_label, '$cluster')]
else
[],

Expand Down
Loading

0 comments on commit 1ba7a30

Please sign in to comment.