From b579fd28aae83c416313fbee04c818fecb82c009 Mon Sep 17 00:00:00 2001
From: Jan Horstmann <horstmann@osism.tech>
Date: Thu, 17 Oct 2024 09:00:03 +0200
Subject: [PATCH 1/5] Use monitoring-mixins for dashboards and alerts

Monitoring-mixins ([1]) allow upstream collaboration on alerts and
grafana dashboards and convergence on monitoring best practices through
bundling mixins with the corresponding software.
Downstream customization of alerts and dashboards may be done using
jsonnet ([2]).
Usage of jsonnet-bundler ([3]) gives a clear reference of where
dashboards and alerts originated and which version is used through
`.src/jsonnetfile.json` and `.src/jsonnetfile.lock.json`.

[1]
https://monitoring.mixins.dev/

[2]
https://github.com/google/jsonnet

[3]
https://github.com/jsonnet-bundler/jsonnet-bundler

Signed-off-by: Jan Horstmann <horstmann@osism.tech>
---
 .gitignore                                    |    1 +
 .src/Containerfile                            |   27 +
 .src/Dockerfile                               |    1 +
 .src/files/mixins.py                          |   69 +
 .src/jsonnetfile.json                         |   42 +
 .src/jsonnetfile.lock.json                    |  116 +
 .src/mixins/ceph/ceph.libsonnet               |    7 +
 .../infrastructure/alertmanager.libsonnet     |    7 +
 .src/mixins/infrastructure/node.libsonnet     |   11 +
 .../infrastructure/prometheus.libsonnet       |    7 +
 README.md                                     |   48 +-
 .../ceph/multi-cluster-overview.json          |   15 +-
 .../infrastructure/alertmanager-overview.json |  329 ++
 .../infrastructure/node-cluster-rsrc-use.json |  923 +++
 .../infrastructure/node-rsrc-use.json         |  943 ++++
 grafana/dashboards/infrastructure/nodes.json  |  962 ++++
 .../prometheus-remote-write.json              | 1443 +++++
 .../dashboards/infrastructure/prometheus.json | 5003 ++++-------------
 grafana/provisioning.yaml                     |   16 +-
 prometheus/alertmanager.rec.rules             |    1 +
 prometheus/alertmanager.rules                 |  117 +
 prometheus/ceph.rec.rules                     |    1 +
 prometheus/ceph.rules                         | 1244 ++--
 prometheus/node.rec.rules                     |   68 +
 prometheus/node.rules                         |  290 +
 prometheus/prometheus-extra.rules             |  257 +
 prometheus/prometheus.rec.rules               |    1 +
 prometheus/prometheus.rules                   |  520 +-
 28 files changed, 7894 insertions(+), 4575 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 .src/Containerfile
 create mode 120000 .src/Dockerfile
 create mode 100755 .src/files/mixins.py
 create mode 100644 .src/jsonnetfile.json
 create mode 100644 .src/jsonnetfile.lock.json
 create mode 100644 .src/mixins/ceph/ceph.libsonnet
 create mode 100644 .src/mixins/infrastructure/alertmanager.libsonnet
 create mode 100644 .src/mixins/infrastructure/node.libsonnet
 create mode 100644 .src/mixins/infrastructure/prometheus.libsonnet
 create mode 100644 grafana/dashboards/infrastructure/alertmanager-overview.json
 create mode 100644 grafana/dashboards/infrastructure/node-cluster-rsrc-use.json
 create mode 100644 grafana/dashboards/infrastructure/node-rsrc-use.json
 create mode 100644 grafana/dashboards/infrastructure/nodes.json
 create mode 100644 grafana/dashboards/infrastructure/prometheus-remote-write.json
 create mode 100644 prometheus/alertmanager.rec.rules
 create mode 100644 prometheus/alertmanager.rules
 create mode 100644 prometheus/ceph.rec.rules
 create mode 100644 prometheus/node.rec.rules
 create mode 100644 prometheus/node.rules
 create mode 100644 prometheus/prometheus-extra.rules
 create mode 100644 prometheus/prometheus.rec.rules

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3b699f9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.src/vendor
diff --git a/.src/Containerfile b/.src/Containerfile
new file mode 100644
index 0000000..a7d85c7
--- /dev/null
+++ b/.src/Containerfile
@@ -0,0 +1,31 @@
+# Image providing jsonnet and jsonnet-bundler (jb) to render the mixins.
+FROM python:3.12-slim-bookworm
+
+ARG JB_VERSION=v0.6.0
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# --fail makes curl exit non-zero on HTTP errors, so a bad JB_VERSION aborts
+# the build instead of installing an error page as /usr/local/bin/jb.
+RUN apt-get update \
+    && apt-get install --no-install-recommends -y \
+        ca-certificates \
+        curl \
+        git \
+        jsonnet \
+    && curl --fail --location --output /usr/local/bin/jb https://github.com/jsonnet-bundler/jsonnet-bundler/releases/download/${JB_VERSION}/jb-linux-amd64 \
+    && chmod +x /usr/local/bin/jb \
+    && apt-get remove -y \
+        curl \
+    && apt-get clean \
+    && apt-get autoremove -y \
+    && rm -rf \
+      /var/lib/apt/lists/* \
+      /var/tmp/*
+
+COPY files/mixins.py /mixins.py
+
+# The repository is expected to be mounted at /srv; mixins.py uses paths
+# relative to .src.
+WORKDIR /srv/.src
+CMD ["/mixins.py"]
diff --git a/.src/Dockerfile b/.src/Dockerfile
new file mode 120000
index 0000000..5240dc0
--- /dev/null
+++ b/.src/Dockerfile
@@ -0,0 +1 @@
+Containerfile
\ No newline at end of file
diff --git a/.src/files/mixins.py b/.src/files/mixins.py
new file mode 100755
index 0000000..4c2c833
--- /dev/null
+++ b/.src/files/mixins.py
@@ -0,0 +1,95 @@
+#!/usr/local/bin/python3
+"""Render monitoring mixins into Grafana dashboards and Prometheus rules.
+
+Must be run with `.src` as working directory. Every
+`mixins/<subfolder>/<name>.libsonnet` is rendered to:
+
+* dashboards in `../grafana/dashboards/<subfolder>/`
+* alerting rules in `../prometheus/<name>.rules`
+* recording rules in `../prometheus/<name>.rec.rules`
+"""
+
+import os
+import subprocess
+import sys
+
+dashboards_out_prefix = "../grafana/dashboards"
+rules_out_prefix = "../prometheus"
+
+
+def run(cmd, **kwargs):
+    """Run `cmd`, re-raising on a non-zero exit status.
+
+    `check=True` is required: without it `subprocess.run` never raises
+    on a failing command and errors of `jb`/`jsonnet` go unnoticed.
+    Captured stderr (if any) is printed before the error is re-raised.
+    """
+    try:
+        return subprocess.run(cmd, check=True, **kwargs)
+    except subprocess.CalledProcessError as e:
+        if e.stderr:
+            print(e.stderr.decode(), file=sys.stderr)
+        raise
+
+
+def render_mixin(path, file):
+    """Create dashboards, alerts and recording rules for one mixin file."""
+    source = os.path.join(path, file)
+    name = file.removesuffix(".libsonnet")
+    # Mirror the folder structure below `mixins/` in the dashboards folder.
+    dashboards_out = os.path.join(
+        dashboards_out_prefix, *os.path.normpath(path).split(os.sep)[1:]
+    )
+    print("Processing mixin " + name)
+    print("Creating dashboard(s)...")
+    os.makedirs(dashboards_out, exist_ok=True)
+    run(
+        [
+            "jsonnet",
+            "-J",
+            "vendor",
+            "-m",
+            dashboards_out,
+            "-e",
+            '(import "' + source + '").grafanaDashboards',
+        ]
+    )
+    for kind, suffix in [
+        ("prometheusAlerts", ".rules"),
+        ("prometheusRules", ".rec.rules"),
+    ]:
+        print("Creating " + kind + "...")
+        prometheus_out = os.path.join(rules_out_prefix, name + suffix)
+        rendered = run(
+            [
+                "jsonnet",
+                "-J",
+                "vendor",
+                "-S",
+                "-e",
+                'std.manifestYamlDoc((import "' + source + '").' + kind + ")",
+            ],
+            stdout=subprocess.PIPE,
+        )
+        # Write only after jsonnet succeeded, so a failed run does not
+        # truncate an existing rules file.
+        with open(prometheus_out, "wb") as f:
+            f.write(rendered.stdout)
+        print(prometheus_out)
+
+
+def main():
+    """Install the jsonnet dependencies and render all local mixins."""
+    print("Installing dependencies...")
+    out = run(["/usr/local/bin/jb", "install"], capture_output=True)
+    print(out.stdout.decode())
+    print(out.stderr.decode())
+
+    for path, _, filenames in os.walk("mixins"):
+        for file in filenames:
+            if file.endswith(".libsonnet"):
+                render_mixin(path, file)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.src/jsonnetfile.json b/.src/jsonnetfile.json
new file mode 100644
index 0000000..148f54c
--- /dev/null
+++ b/.src/jsonnetfile.json
@@ -0,0 +1,42 @@
+{
+  "version": 1,
+  "dependencies": [
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/prometheus/alertmanager.git",
+          "subdir": "doc/alertmanager-mixin"
+        }
+      },
+      "version": "main"
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/ceph/ceph.git",
+          "subdir": "monitoring/ceph-mixin"
+        }
+      },
+      "version": "main"
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/prometheus/node_exporter.git",
+          "subdir": "docs/node-mixin"
+        }
+      },
+      "version": "master"
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/prometheus/prometheus.git",
+          "subdir": "documentation/prometheus-mixin"
+        }
+      },
+      "version": "main"
+    }
+  ],
+  "legacyImports": true
+}
diff --git a/.src/jsonnetfile.lock.json b/.src/jsonnetfile.lock.json
new file mode 100644
index 0000000..458fe6d
--- /dev/null
+++ b/.src/jsonnetfile.lock.json
@@ -0,0 +1,116 @@
+{
+  "version": 1,
+  "dependencies": [
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/ceph/ceph.git",
+          "subdir": "monitoring/ceph-mixin"
+        }
+      },
+      "version": "54a75a0e407aaa5c4fff987e7ad91001bfa79fbf",
+      "sum": "ZnyCIu25NBI6Q3Ru7QK1DHf7DBMEURSMQdEJXzCyIgA="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/grafonnet-lib.git",
+          "subdir": "grafonnet"
+        }
+      },
+      "version": "a1d61cce1da59c71409b99b5c7568511fec661ea",
+      "sum": "342u++/7rViR/zj2jeJOjshzglkZ1SY+hFNuyCBFMdc="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/grafonnet-lib.git",
+          "subdir": "grafonnet-7.0"
+        }
+      },
+      "version": "a1d61cce1da59c71409b99b5c7568511fec661ea",
+      "sum": "gCtR9s/4D5fxU9aKXg0Bru+/njZhA0YjLjPiASc61FM="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/grafonnet.git",
+          "subdir": "gen/grafonnet-latest"
+        }
+      },
+      "version": "82a19822e54a0a12a51e24dbd48fcde717dc0864",
+      "sum": "64fMUPI3frXGj4X1FqFd1t7r04w3CUSmXaDcJ23EYbQ="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/grafonnet.git",
+          "subdir": "gen/grafonnet-v11.1.0"
+        }
+      },
+      "version": "82a19822e54a0a12a51e24dbd48fcde717dc0864",
+      "sum": "41w7p/rwrNsITqNHMXtGSJAfAyKmnflg6rFhKBduUxM="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/jsonnet-libs.git",
+          "subdir": "grafana-builder"
+        }
+      },
+      "version": "a8fc2139d881ae632a8c956eb9dd4b84b24f362e",
+      "sum": "yxqWcq/N3E/a/XreeU6EuE6X7kYPnG0AspAQFKOjASo="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/jsonnet-libs/docsonnet.git",
+          "subdir": "doc-util"
+        }
+      },
+      "version": "6ac6c69685b8c29c54515448eaca583da2d88150",
+      "sum": "BrAL/k23jq+xy9oA7TWIhUx07dsA/QLm3g7ktCwe//U="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/jsonnet-libs/xtd.git",
+          "subdir": ""
+        }
+      },
+      "version": "63d430b69a95741061c2f7fc9d84b1a778511d9c",
+      "sum": "qiZi3axUSXCVzKUF83zSAxklwrnitMmrDK4XAfjPMdE="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/prometheus/alertmanager.git",
+          "subdir": "doc/alertmanager-mixin"
+        }
+      },
+      "version": "f6b942cf9b3a503d59192eada300d2ad97cba82f",
+      "sum": "Mf4h1BYLle2nrgjf/HXrBbl0Zk8N+xaoEM017o0BC+k="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/prometheus/node_exporter.git",
+          "subdir": "docs/node-mixin"
+        }
+      },
+      "version": "0fddfd1ba530c954dc042c5d138de82ecd4e4ff1",
+      "sum": "cQCW+1N0Xae5yXecCWDK2oAlN0luBS/5GrwBYSlaFms="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/prometheus/prometheus.git",
+          "subdir": "documentation/prometheus-mixin"
+        }
+      },
+      "version": "d3074b39c38493ebb81514c0ec962b7853ed0162",
+      "sum": "dYLcLzGH4yF3qB7OGC/7z4nqeTNjv42L7Q3BENU8XJI="
+    }
+  ],
+  "legacyImports": false
+}
diff --git a/.src/mixins/ceph/ceph.libsonnet b/.src/mixins/ceph/ceph.libsonnet
new file mode 100644
index 0000000..351d26f
--- /dev/null
+++ b/.src/mixins/ceph/ceph.libsonnet
@@ -0,0 +1,7 @@
+local ceph = import "ceph-mixin/mixin.libsonnet";
+
+ceph {
+  prometheusRules+: {},
+  prometheusAlerts+: {},
+  grafanaDashboards+: {}
+}
diff --git a/.src/mixins/infrastructure/alertmanager.libsonnet b/.src/mixins/infrastructure/alertmanager.libsonnet
new file mode 100644
index 0000000..71b3aba
--- /dev/null
+++ b/.src/mixins/infrastructure/alertmanager.libsonnet
@@ -0,0 +1,7 @@
+local alertmanager = import "alertmanager-mixin/mixin.libsonnet";
+
+alertmanager {
+  prometheusRules+: {},
+  prometheusAlerts+: {},
+  grafanaDashboards+: {}
+}
diff --git a/.src/mixins/infrastructure/node.libsonnet b/.src/mixins/infrastructure/node.libsonnet
new file mode 100644
index 0000000..8230a9a
--- /dev/null
+++ b/.src/mixins/infrastructure/node.libsonnet
@@ -0,0 +1,11 @@
+local node = import "node-mixin/mixin.libsonnet";
+
+node {
+  prometheusRules+: {},
+  prometheusAlerts+: {},
+  grafanaDashboards+: {
+    # Hide unused dashboards
+    'nodes-darwin.json':: super['nodes-darwin.json'],
+    'nodes-aix.json':: super['nodes-aix.json']
+  }
+}
diff --git a/.src/mixins/infrastructure/prometheus.libsonnet b/.src/mixins/infrastructure/prometheus.libsonnet
new file mode 100644
index 0000000..d38aeb0
--- /dev/null
+++ b/.src/mixins/infrastructure/prometheus.libsonnet
@@ -0,0 +1,7 @@
+local prometheus = import "prometheus-mixin/mixin.libsonnet";
+
+prometheus {
+  prometheusRules+: {},
+  prometheusAlerts+: {},
+  grafanaDashboards+: {}
+}
diff --git a/README.md b/README.md
index edad49b..036015a 100644
--- a/README.md
+++ b/README.md
@@ -3,5 +3,49 @@
 Repository for Grafana dashboards and Prometheus alerting rules.
 For use with the Prometheus exporters from Kolla.
 
-The alerts here build on those from:
-https://github.com/samber/awesome-prometheus-alerts
+## Usage
+
+While it is possible to directly place `.rules` files into the `prometheus/` folder and Grafana dashboards into `grafana/dashboards` or a subfolder thereof, use of [monitoring-mixins](https://monitoring.mixins.dev) is encouraged.
+
+### Build the jsonnet container
+
+Build the `mixins` container image, which is used to manage mixins:
+
+```
+podman build -t mixins .src
+```
+
+### Building mixins
+
+[jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler) is used to fetch mixins and their dependencies from upstream.
+
+* Install mixin dependencies
+
+  ```
+  podman run -it -v .:/srv localhost/mixins:latest jb install
+  ```
+
+* Build alerts, rules and dashboards
+  ```
+  podman run -it -v .:/srv localhost/mixins:latest
+  ```
+
+* Check all changes into git, including the generated alerts, rules and dashboards
+
+### Adding new mixins
+
+* Add any new upstream mixins to `.src/jsonnetfile.json`
+* Add a local libsonnet for any new mixins in `.src/mixins/$DASHBOARDSUBFOLDER/$NAME.libsonnet`
+* Apply any [customizations](https://monitoring.mixins.dev/#customising-the-mixin) to the local `$NAME.libsonnet`
+* [Build mixins](#building-mixins)
+* Check all changes into git, including the `.src/jsonnetfile.lock.json`
+
+### Updating mixins
+
+* Update the jsonnet-bundler dependencies in `.src/jsonnetfile.lock.json`
+
+  ```
+  podman run -it -v .:/srv localhost/mixins:latest jb update
+  ```
+
+* Check all changes into git, including the `.src/jsonnetfile.lock.json`
diff --git a/grafana/dashboards/ceph/multi-cluster-overview.json b/grafana/dashboards/ceph/multi-cluster-overview.json
index 25648cc..ba6d29c 100644
--- a/grafana/dashboards/ceph/multi-cluster-overview.json
+++ b/grafana/dashboards/ceph/multi-cluster-overview.json
@@ -22,7 +22,20 @@
    "graphTooltip": 0,
    "hideControls": false,
    "id": null,
-   "links": [ ],
+   "links": [
+      {
+         "asDropdown": true,
+         "icon": "external link",
+         "includeVars": true,
+         "keepTime": true,
+         "tags": [ ],
+         "targetBlank": false,
+         "title": "Browse Dashboards",
+         "tooltip": "",
+         "type": "dashboards",
+         "url": ""
+      }
+   ],
    "panels": [
       {
          "collapse": false,
diff --git a/grafana/dashboards/infrastructure/alertmanager-overview.json b/grafana/dashboards/infrastructure/alertmanager-overview.json
new file mode 100644
index 0000000..55f7a5c
--- /dev/null
+++ b/grafana/dashboards/infrastructure/alertmanager-overview.json
@@ -0,0 +1,329 @@
+{
+   "graphTooltip": 1,
+   "panels": [
+      {
+         "collapsed": false,
+         "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 0
+         },
+         "id": 1,
+         "panels": [ ],
+         "title": "Alerts",
+         "type": "row"
+      },
+      {
+         "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+         },
+         "description": "current set of alerts stored in the Alertmanager",
+         "fieldConfig": {
+            "defaults": {
+               "custom": {
+                  "fillOpacity": 10,
+                  "showPoints": "never",
+                  "stacking": {
+                     "mode": "normal"
+                  }
+               },
+               "unit": "none"
+            }
+         },
+         "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 0,
+            "y": 1
+         },
+         "id": 2,
+         "options": {
+            "legend": {
+               "showLegend": false
+            },
+            "tooltip": {
+               "mode": "multi"
+            }
+         },
+         "pluginVersion": "v11.1.0",
+         "targets": [
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "$datasource"
+               },
+               "expr": "sum(alertmanager_alerts{job=~\"$job\"}) by (job,instance)",
+               "intervalFactor": 2,
+               "legendFormat": "{{instance}}"
+            }
+         ],
+         "title": "Alerts",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+         },
+         "description": "rate of successful and invalid alerts received by the Alertmanager",
+         "fieldConfig": {
+            "defaults": {
+               "custom": {
+                  "fillOpacity": 10,
+                  "showPoints": "never",
+                  "stacking": {
+                     "mode": "normal"
+                  }
+               },
+               "unit": "ops"
+            }
+         },
+         "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 12,
+            "y": 1
+         },
+         "id": 3,
+         "options": {
+            "legend": {
+               "showLegend": false
+            },
+            "tooltip": {
+               "mode": "multi"
+            }
+         },
+         "pluginVersion": "v11.1.0",
+         "targets": [
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "$datasource"
+               },
+               "expr": "sum(rate(alertmanager_alerts_received_total{job=~\"$job\"}[$__rate_interval])) by (job,instance)",
+               "intervalFactor": 2,
+               "legendFormat": "{{instance}} Received"
+            },
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "$datasource"
+               },
+               "expr": "sum(rate(alertmanager_alerts_invalid_total{job=~\"$job\"}[$__rate_interval])) by (job,instance)",
+               "intervalFactor": 2,
+               "legendFormat": "{{instance}} Invalid"
+            }
+         ],
+         "title": "Alerts receive rate",
+         "type": "timeseries"
+      },
+      {
+         "collapsed": false,
+         "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 8
+         },
+         "id": 4,
+         "panels": [ ],
+         "title": "Notifications",
+         "type": "row"
+      },
+      {
+         "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+         },
+         "description": "rate of successful and invalid notifications sent by the Alertmanager",
+         "fieldConfig": {
+            "defaults": {
+               "custom": {
+                  "fillOpacity": 10,
+                  "showPoints": "never",
+                  "stacking": {
+                     "mode": "normal"
+                  }
+               },
+               "unit": "ops"
+            }
+         },
+         "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 0,
+            "y": 9
+         },
+         "id": 5,
+         "options": {
+            "legend": {
+               "showLegend": false
+            },
+            "tooltip": {
+               "mode": "multi"
+            }
+         },
+         "pluginVersion": "v11.1.0",
+         "repeat": "integration",
+         "targets": [
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "$datasource"
+               },
+               "expr": "sum(rate(alertmanager_notifications_total{job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (integration,job,instance)",
+               "intervalFactor": 2,
+               "legendFormat": "{{instance}} Total"
+            },
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "$datasource"
+               },
+               "expr": "sum(rate(alertmanager_notifications_failed_total{job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (integration,job,instance)",
+               "intervalFactor": 2,
+               "legendFormat": "{{instance}} Failed"
+            }
+         ],
+         "title": "$integration: Notifications Send Rate",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "type": "prometheus",
+            "uid": "$datasource"
+         },
+         "description": "latency of notifications sent by the Alertmanager",
+         "fieldConfig": {
+            "defaults": {
+               "custom": {
+                  "fillOpacity": 10,
+                  "showPoints": "never",
+                  "stacking": {
+                     "mode": "normal"
+                  }
+               },
+               "unit": "s"
+            }
+         },
+         "gridPos": {
+            "h": 7,
+            "w": 12,
+            "x": 12,
+            "y": 9
+         },
+         "id": 6,
+         "options": {
+            "legend": {
+               "showLegend": false
+            },
+            "tooltip": {
+               "mode": "multi"
+            }
+         },
+         "pluginVersion": "v11.1.0",
+         "repeat": "integration",
+         "targets": [
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "$datasource"
+               },
+               "expr": "histogram_quantile(0.99,\n  sum(rate(alertmanager_notification_latency_seconds_bucket{job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (le,job,instance)\n)\n",
+               "intervalFactor": 2,
+               "legendFormat": "{{instance}} 99th Percentile"
+            },
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "$datasource"
+               },
+               "expr": "histogram_quantile(0.50,\n  sum(rate(alertmanager_notification_latency_seconds_bucket{job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (le,job,instance)\n)\n",
+               "intervalFactor": 2,
+               "legendFormat": "{{instance}} Median"
+            },
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "$datasource"
+               },
+               "expr": "sum(rate(alertmanager_notification_latency_seconds_sum{job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (job,instance)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (job,instance)\n",
+               "intervalFactor": 2,
+               "legendFormat": "{{instance}} Average"
+            }
+         ],
+         "title": "$integration: Notification Duration",
+         "type": "timeseries"
+      }
+   ],
+   "schemaVersion": 39,
+   "tags": [
+      "alertmanager-mixin"
+   ],
+   "templating": {
+      "list": [
+         {
+            "current": {
+               "selected": false,
+               "text": "Prometheus",
+               "value": "Prometheus"
+            },
+            "hide": 0,
+            "label": "Data Source",
+            "name": "datasource",
+            "query": "prometheus",
+            "type": "datasource"
+         },
+         {
+            "current": {
+               "selected": false,
+               "text": "",
+               "value": ""
+            },
+            "datasource": {
+               "type": "prometheus",
+               "uid": "${datasource}"
+            },
+            "includeAll": false,
+            "label": "job",
+            "name": "job",
+            "query": "label_values(alertmanager_alerts, job)",
+            "refresh": 2,
+            "sort": 1,
+            "type": "query"
+         },
+         {
+            "current": {
+               "selected": false,
+               "text": "$__all",
+               "value": "$__all"
+            },
+            "datasource": {
+               "type": "prometheus",
+               "uid": "${datasource}"
+            },
+            "hide": 2,
+            "includeAll": true,
+            "name": "integration",
+            "query": "label_values(alertmanager_notifications_total{integration=~\".*\"}, integration)",
+            "refresh": 2,
+            "sort": 1,
+            "type": "query"
+         }
+      ]
+   },
+   "time": {
+      "from": "now-1h",
+      "to": "now"
+   },
+   "timepicker": {
+      "refresh_intervals": [
+         "30s"
+      ]
+   },
+   "timezone": "utc",
+   "title": "Alertmanager / Overview",
+   "uid": "alertmanager-overview"
+}
diff --git a/grafana/dashboards/infrastructure/node-cluster-rsrc-use.json b/grafana/dashboards/infrastructure/node-cluster-rsrc-use.json
new file mode 100644
index 0000000..b40d090
--- /dev/null
+++ b/grafana/dashboards/infrastructure/node-cluster-rsrc-use.json
@@ -0,0 +1,923 @@
+{
+   "__inputs": [ ],
+   "__requires": [ ],
+   "annotations": {
+      "list": [ ]
+   },
+   "editable": false,
+   "gnetId": null,
+   "graphTooltip": 1,
+   "hideControls": false,
+   "id": null,
+   "links": [ ],
+   "refresh": "30s",
+   "rows": [
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 2,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "((\n  instance:node_cpu_utilisation:rate5m{job=\"node\", cluster=\"$cluster\"}\n  *\n  instance:node_num_cpu:sum{job=\"node\", cluster=\"$cluster\"}\n) != 0 )\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node\", cluster=\"$cluster\"}))\n",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{ instance }}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "CPU Utilisation",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 3,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "(\n  instance:node_load1_per_cpu:ratio{job=\"node\", cluster=\"$cluster\"}\n  / scalar(count(instance:node_load1_per_cpu:ratio{job=\"node\", cluster=\"$cluster\"}))\n)  != 0\n",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{instance}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "CPU Saturation (Load1 per CPU)",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "CPU",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 4,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "(\n  instance:node_memory_utilisation:ratio{job=\"node\", cluster=\"$cluster\"}\n  / scalar(count(instance:node_memory_utilisation:ratio{job=\"node\", cluster=\"$cluster\"}))\n) != 0\n",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{instance}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Memory Utilisation",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 5,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "instance:node_vmstat_pgmajfault:rate5m{job=\"node\", cluster=\"$cluster\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{instance}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Memory Saturation (Major Page Faults)",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "rds",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "rds",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Memory",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 6,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [
+                  {
+                     "alias": "/Receive/",
+                     "stack": "A"
+                  },
+                  {
+                     "alias": "/Transmit/",
+                     "stack": "B",
+                     "transform": "negative-Y"
+                  }
+               ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node\", cluster=\"$cluster\"} != 0",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{instance}} Receive",
+                     "refId": "A"
+                  },
+                  {
+                     "expr": "instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node\", cluster=\"$cluster\"} != 0",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{instance}} Transmit",
+                     "refId": "B"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Network Utilisation (Bytes Receive/Transmit)",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "Bps",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "Bps",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 7,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [
+                  {
+                     "alias": "/ Receive/",
+                     "stack": "A"
+                  },
+                  {
+                     "alias": "/ Transmit/",
+                     "stack": "B",
+                     "transform": "negative-Y"
+                  }
+               ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node\", cluster=\"$cluster\"} != 0",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{instance}} Receive",
+                     "refId": "A"
+                  },
+                  {
+                     "expr": "instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node\", cluster=\"$cluster\"} != 0",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{instance}} Transmit",
+                     "refId": "B"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Network Saturation (Drops Receive/Transmit)",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "Bps",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "Bps",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Network",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 8,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "(\n  instance_device:node_disk_io_time_seconds:rate5m{job=\"node\", cluster=\"$cluster\"}\n  / scalar(count(instance_device:node_disk_io_time_seconds:rate5m{job=\"node\", cluster=\"$cluster\"}))\n) != 0\n",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{instance}} {{device}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Disk IO Utilisation",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 9,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "(\n  instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node\", cluster=\"$cluster\"}\n  / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node\", cluster=\"$cluster\"}))\n) != 0\n",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{instance}} {{device}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Disk IO Saturation",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Disk IO",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 10,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 12,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "sum without (device) (\n  max without (fstype, mountpoint) ((\n    node_filesystem_size_bytes{job=\"node\", fstype!=\"\", mountpoint!=\"\", cluster=\"$cluster\"}\n    -\n    node_filesystem_avail_bytes{job=\"node\", fstype!=\"\", mountpoint!=\"\", cluster=\"$cluster\"}\n  ) != 0)\n)\n/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{job=\"node\", fstype!=\"\", mountpoint!=\"\", cluster=\"$cluster\"})))\n",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{instance}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Disk Space Utilisation",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Disk Space",
+         "titleSize": "h6",
+         "type": "row"
+      }
+   ],
+   "schemaVersion": 14,
+   "style": "dark",
+   "tags": [
+      "node-exporter-mixin"
+   ],
+   "templating": {
+      "list": [
+         {
+            "current": {
+               "text": "default",
+               "value": "default"
+            },
+            "hide": 0,
+            "label": "Data Source",
+            "name": "datasource",
+            "options": [ ],
+            "query": "prometheus",
+            "refresh": 1,
+            "regex": "",
+            "type": "datasource"
+         },
+         {
+            "allValue": null,
+            "current": {
+               "text": "",
+               "value": ""
+            },
+            "datasource": "$datasource",
+            "hide": 2,
+            "includeAll": false,
+            "label": null,
+            "multi": false,
+            "name": "cluster",
+            "options": [ ],
+            "query": "label_values(node_time_seconds, cluster)",
+            "refresh": 2,
+            "regex": "",
+            "sort": 1,
+            "tagValuesQuery": "",
+            "tags": [ ],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+         }
+      ]
+   },
+   "time": {
+      "from": "now-1h",
+      "to": "now"
+   },
+   "timepicker": {
+      "refresh_intervals": [
+         "5s",
+         "10s",
+         "30s",
+         "1m",
+         "5m",
+         "15m",
+         "30m",
+         "1h",
+         "2h",
+         "1d"
+      ],
+      "time_options": [
+         "5m",
+         "15m",
+         "1h",
+         "6h",
+         "12h",
+         "24h",
+         "2d",
+         "7d",
+         "30d"
+      ]
+   },
+   "timezone": "utc",
+   "title": "Node Exporter / USE Method / Cluster",
+   "uid": "3e97d1d02672cdd0861f4c97c64f89b2",
+   "version": 0
+}
diff --git a/grafana/dashboards/infrastructure/node-rsrc-use.json b/grafana/dashboards/infrastructure/node-rsrc-use.json
new file mode 100644
index 0000000..1f8c1c8
--- /dev/null
+++ b/grafana/dashboards/infrastructure/node-rsrc-use.json
@@ -0,0 +1,943 @@
+{
+   "__inputs": [ ],
+   "__requires": [ ],
+   "annotations": {
+      "list": [ ]
+   },
+   "editable": false,
+   "gnetId": null,
+   "graphTooltip": 1,
+   "hideControls": false,
+   "id": null,
+   "links": [ ],
+   "refresh": "30s",
+   "rows": [
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 2,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "instance:node_cpu_utilisation:rate5m{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"} != 0",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "Utilisation",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "CPU Utilisation",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 3,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "instance:node_load1_per_cpu:ratio{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"} != 0",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "Saturation",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "CPU Saturation (Load1 per CPU)",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "CPU",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 4,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "instance:node_memory_utilisation:ratio{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"} != 0",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "Utilisation",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Memory Utilisation",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 5,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "instance:node_vmstat_pgmajfault:rate5m{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"} != 0",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "Major Page Faults",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Memory Saturation (Major Page Faults)",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "rds",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "rds",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Memory",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 6,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [
+                  {
+                     "alias": "/Receive/",
+                     "stack": "A"
+                  },
+                  {
+                     "alias": "/Transmit/",
+                     "stack": "B",
+                     "transform": "negative-Y"
+                  }
+               ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"} != 0",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "Receive",
+                     "refId": "A"
+                  },
+                  {
+                     "expr": "instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"} != 0",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "Transmit",
+                     "refId": "B"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Network Utilisation (Bytes Receive/Transmit)",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "Bps",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "Bps",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 7,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [
+                  {
+                     "alias": "/Receive/",
+                     "stack": "A"
+                  },
+                  {
+                     "alias": "/Transmit/",
+                     "stack": "B",
+                     "transform": "negative-Y"
+                  }
+               ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"} != 0",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "Receive",
+                     "refId": "A"
+                  },
+                  {
+                     "expr": "instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"} != 0",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "Transmit",
+                     "refId": "B"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Network Saturation (Drops Receive/Transmit)",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "Bps",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "Bps",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Network",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 8,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "instance_device:node_disk_io_time_seconds:rate5m{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"} != 0",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{device}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Disk IO Utilisation",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 9,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"} != 0",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{device}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Disk IO Saturation",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Disk IO",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 10,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": false,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 12,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "sort_desc(1 -\n  (\n   max without (mountpoint, fstype) (node_filesystem_avail_bytes{job=\"node\", fstype!=\"\", instance=\"$instance\", cluster=\"$cluster\"})\n   /\n   max without (mountpoint, fstype) (node_filesystem_size_bytes{job=\"node\", fstype!=\"\", instance=\"$instance\", cluster=\"$cluster\"})\n  ) != 0\n)\n",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{device}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Disk Space Utilisation",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Disk Space",
+         "titleSize": "h6",
+         "type": "row"
+      }
+   ],
+   "schemaVersion": 14,
+   "style": "dark",
+   "tags": [
+      "node-exporter-mixin"
+   ],
+   "templating": {
+      "list": [
+         {
+            "current": {
+               "text": "default",
+               "value": "default"
+            },
+            "hide": 0,
+            "label": "Data Source",
+            "name": "datasource",
+            "options": [ ],
+            "query": "prometheus",
+            "refresh": 1,
+            "regex": "",
+            "type": "datasource"
+         },
+         {
+            "allValue": null,
+            "current": {
+               "text": "",
+               "value": ""
+            },
+            "datasource": "$datasource",
+            "hide": 2,
+            "includeAll": false,
+            "label": null,
+            "multi": false,
+            "name": "cluster",
+            "options": [ ],
+            "query": "label_values(node_time_seconds, cluster)",
+            "refresh": 2,
+            "regex": "",
+            "sort": 1,
+            "tagValuesQuery": "",
+            "tags": [ ],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+         },
+         {
+            "allValue": null,
+            "current": { },
+            "datasource": "$datasource",
+            "hide": 0,
+            "includeAll": false,
+            "label": null,
+            "multi": false,
+            "name": "instance",
+            "options": [ ],
+            "query": "label_values(node_exporter_build_info{job=\"node\", cluster=\"$cluster\"}, instance)",
+            "refresh": 2,
+            "regex": "",
+            "sort": 1,
+            "tagValuesQuery": "",
+            "tags": [ ],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+         }
+      ]
+   },
+   "time": {
+      "from": "now-1h",
+      "to": "now"
+   },
+   "timepicker": {
+      "refresh_intervals": [
+         "5s",
+         "10s",
+         "30s",
+         "1m",
+         "5m",
+         "15m",
+         "30m",
+         "1h",
+         "2h",
+         "1d"
+      ],
+      "time_options": [
+         "5m",
+         "15m",
+         "1h",
+         "6h",
+         "12h",
+         "24h",
+         "2d",
+         "7d",
+         "30d"
+      ]
+   },
+   "timezone": "utc",
+   "title": "Node Exporter / USE Method / Node",
+   "uid": "fac67cfbe174d3ef53eb473d73d9212f",
+   "version": 0
+}
diff --git a/grafana/dashboards/infrastructure/nodes.json b/grafana/dashboards/infrastructure/nodes.json
new file mode 100644
index 0000000..f19045f
--- /dev/null
+++ b/grafana/dashboards/infrastructure/nodes.json
@@ -0,0 +1,962 @@
+{
+   "__inputs": [ ],
+   "__requires": [ ],
+   "annotations": {
+      "list": [ ]
+   },
+   "editable": false,
+   "gnetId": null,
+   "graphTooltip": 1,
+   "hideControls": false,
+   "id": null,
+   "links": [ ],
+   "refresh": "30s",
+   "rows": [
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 2,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "(\n  (1 - sum without (mode) (rate(node_cpu_seconds_total{job=\"node\", mode=~\"idle|iowait|steal\", instance=\"$instance\", cluster=\"$cluster\"}[$__rate_interval])))\n/ ignoring(cpu) group_left\n  count without (cpu, mode) (node_cpu_seconds_total{job=\"node\", mode=\"idle\", instance=\"$instance\", cluster=\"$cluster\"})\n)\n",
+                     "format": "time_series",
+                     "intervalFactor": 5,
+                     "legendFormat": "{{cpu}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "CPU Usage",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": 1,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": 1,
+                     "min": 0,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 0,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 3,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "node_load1{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "1m load average",
+                     "refId": "A"
+                  },
+                  {
+                     "expr": "node_load5{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "5m load average",
+                     "refId": "B"
+                  },
+                  {
+                     "expr": "node_load15{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "15m load average",
+                     "refId": "C"
+                  },
+                  {
+                     "expr": "count(node_cpu_seconds_total{job=\"node\", instance=\"$instance\", cluster=\"$cluster\", mode=\"idle\"})",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "logical cores",
+                     "refId": "D"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Load Average",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "CPU",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 4,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 9,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "(\n  node_memory_MemTotal_bytes{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"}\n-\n  node_memory_MemFree_bytes{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"}\n-\n  node_memory_Buffers_bytes{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"}\n-\n  node_memory_Cached_bytes{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"}\n)\n",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "memory used",
+                     "refId": "A"
+                  },
+                  {
+                     "expr": "node_memory_Buffers_bytes{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "memory buffers",
+                     "refId": "B"
+                  },
+                  {
+                     "expr": "node_memory_Cached_bytes{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "memory cached",
+                     "refId": "C"
+                  },
+                  {
+                     "expr": "node_memory_MemFree_bytes{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "memory free",
+                     "refId": "D"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Memory Usage",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "bytes",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "bytes",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "datasource": "$datasource",
+               "fieldConfig": {
+                  "defaults": {
+                     "max": 100,
+                     "min": 0,
+                     "thresholds": {
+                        "mode": "absolute",
+                        "steps": [
+                           {
+                              "color": "rgba(50, 172, 45, 0.97)"
+                           },
+                           {
+                              "color": "rgba(237, 129, 40, 0.89)",
+                              "value": 80
+                           },
+                           {
+                              "color": "rgba(245, 54, 54, 0.9)",
+                              "value": 90
+                           }
+                        ]
+                     },
+                     "unit": "percent"
+                  }
+               },
+               "gridPos": { },
+               "id": 5,
+               "span": 3,
+               "targets": [
+                  {
+                     "expr": "100 -\n(\n  avg(node_memory_MemAvailable_bytes{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"}) /\n  avg(node_memory_MemTotal_bytes{job=\"node\", instance=\"$instance\", cluster=\"$cluster\"})\n* 100\n)\n",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": ""
+                  }
+               ],
+               "title": "Memory Usage",
+               "transparent": false,
+               "type": "gauge"
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Memory",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 0,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 6,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [
+                  {
+                     "alias": "/ read| written/",
+                     "yaxis": 1
+                  },
+                  {
+                     "alias": "/ io time/",
+                     "yaxis": 2
+                  }
+               ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "rate(node_disk_read_bytes_total{job=\"node\", instance=\"$instance\", cluster=\"$cluster\", device!=\"\"}[$__rate_interval])",
+                     "format": "time_series",
+                     "intervalFactor": 1,
+                     "legendFormat": "{{device}} read",
+                     "refId": "A"
+                  },
+                  {
+                     "expr": "rate(node_disk_written_bytes_total{job=\"node\", instance=\"$instance\", cluster=\"$cluster\", device!=\"\"}[$__rate_interval])",
+                     "format": "time_series",
+                     "intervalFactor": 1,
+                     "legendFormat": "{{device}} written",
+                     "refId": "B"
+                  },
+                  {
+                     "expr": "rate(node_disk_io_time_seconds_total{job=\"node\", instance=\"$instance\", cluster=\"$cluster\", device!=\"\"}[$__rate_interval])",
+                     "format": "time_series",
+                     "intervalFactor": 1,
+                     "legendFormat": "{{device}} io time",
+                     "refId": "C"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Disk I/O",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "Bps",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "percentunit",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "datasource": "$datasource",
+               "fieldConfig": {
+                  "defaults": {
+                     "custom": { },
+                     "thresholds": {
+                        "mode": "absolute",
+                        "steps": [
+                           {
+                              "color": "green"
+                           },
+                           {
+                              "color": "yellow",
+                              "value": 0.80000000000000004
+                           },
+                           {
+                              "color": "red",
+                              "value": 0.90000000000000002
+                           }
+                        ]
+                     },
+                     "unit": "decbytes"
+                  },
+                  "overrides": [
+                     {
+                        "matcher": {
+                           "id": "byName",
+                           "options": "Mounted on"
+                        },
+                        "properties": [
+                           {
+                              "id": "custom.width",
+                              "value": 260
+                           }
+                        ]
+                     },
+                     {
+                        "matcher": {
+                           "id": "byName",
+                           "options": "Size"
+                        },
+                        "properties": [
+                           {
+                              "id": "custom.width",
+                              "value": 93
+                           }
+                        ]
+                     },
+                     {
+                        "matcher": {
+                           "id": "byName",
+                           "options": "Used"
+                        },
+                        "properties": [
+                           {
+                              "id": "custom.width",
+                              "value": 72
+                           }
+                        ]
+                     },
+                     {
+                        "matcher": {
+                           "id": "byName",
+                           "options": "Available"
+                        },
+                        "properties": [
+                           {
+                              "id": "custom.width",
+                              "value": 88
+                           }
+                        ]
+                     },
+                     {
+                        "matcher": {
+                           "id": "byName",
+                           "options": "Used, %"
+                        },
+                        "properties": [
+                           {
+                              "id": "unit",
+                              "value": "percentunit"
+                           },
+                           {
+                              "id": "custom.displayMode",
+                              "value": "gradient-gauge"
+                           },
+                           {
+                              "id": "max",
+                              "value": 1
+                           },
+                           {
+                              "id": "min",
+                              "value": 0
+                           }
+                        ]
+                     }
+                  ]
+               },
+               "gridPos": { },
+               "id": 7,
+               "span": 6,
+               "targets": [
+                  {
+                     "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node\", instance=\"$instance\", cluster=\"$cluster\", fstype!=\"\", mountpoint!=\"\"})\n",
+                     "format": "table",
+                     "instant": true,
+                     "intervalFactor": 2,
+                     "legendFormat": ""
+                  },
+                  {
+                     "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node\", instance=\"$instance\", cluster=\"$cluster\", fstype!=\"\", mountpoint!=\"\"})\n",
+                     "format": "table",
+                     "instant": true,
+                     "intervalFactor": 2,
+                     "legendFormat": ""
+                  }
+               ],
+               "title": "Disk Space Usage",
+               "transformations": [
+                  {
+                     "id": "groupBy",
+                     "options": {
+                        "fields": {
+                           "Value #A": {
+                              "aggregations": [
+                                 "lastNotNull"
+                              ],
+                              "operation": "aggregate"
+                           },
+                           "Value #B": {
+                              "aggregations": [
+                                 "lastNotNull"
+                              ],
+                              "operation": "aggregate"
+                           },
+                           "mountpoint": {
+                              "aggregations": [ ],
+                              "operation": "groupby"
+                           }
+                        }
+                     }
+                  },
+                  {
+                     "id": "merge",
+                     "options": { }
+                  },
+                  {
+                     "id": "calculateField",
+                     "options": {
+                        "alias": "Used",
+                        "binary": {
+                           "left": "Value #A (lastNotNull)",
+                           "operator": "-",
+                           "reducer": "sum",
+                           "right": "Value #B (lastNotNull)"
+                        },
+                        "mode": "binary",
+                        "reduce": {
+                           "reducer": "sum"
+                        }
+                     }
+                  },
+                  {
+                     "id": "calculateField",
+                     "options": {
+                        "alias": "Used, %",
+                        "binary": {
+                           "left": "Used",
+                           "operator": "/",
+                           "reducer": "sum",
+                           "right": "Value #A (lastNotNull)"
+                        },
+                        "mode": "binary",
+                        "reduce": {
+                           "reducer": "sum"
+                        }
+                     }
+                  },
+                  {
+                     "id": "organize",
+                     "options": {
+                        "excludeByName": { },
+                        "indexByName": { },
+                        "renameByName": {
+                           "Value #A (lastNotNull)": "Size",
+                           "Value #B (lastNotNull)": "Available",
+                           "mountpoint": "Mounted on"
+                        }
+                     }
+                  },
+                  {
+                     "id": "sortBy",
+                     "options": {
+                        "fields": { },
+                        "sort": [
+                           {
+                              "field": "Mounted on"
+                           }
+                        ]
+                     }
+                  }
+               ],
+               "transparent": false,
+               "type": "table"
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Disk",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "description": "Network received (bits/s)",
+               "fill": 0,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 8,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "rate(node_network_receive_bytes_total{job=\"node\", instance=\"$instance\", cluster=\"$cluster\", device!=\"lo\"}[$__rate_interval]) * 8",
+                     "format": "time_series",
+                     "intervalFactor": 1,
+                     "legendFormat": "{{device}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Network Received",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "bps",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "bps",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "description": "Network transmitted (bits/s)",
+               "fill": 0,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 9,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "rate(node_network_transmit_bytes_total{job=\"node\", instance=\"$instance\", cluster=\"$cluster\", device!=\"lo\"}[$__rate_interval]) * 8",
+                     "format": "time_series",
+                     "intervalFactor": 1,
+                     "legendFormat": "{{device}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Network Transmitted",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "bps",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "bps",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Network",
+         "titleSize": "h6",
+         "type": "row"
+      }
+   ],
+   "schemaVersion": 14,
+   "style": "dark",
+   "tags": [
+      "node-exporter-mixin"
+   ],
+   "templating": {
+      "list": [
+         {
+            "current": {
+               "text": "default",
+               "value": "default"
+            },
+            "hide": 0,
+            "label": "Data Source",
+            "name": "datasource",
+            "options": [ ],
+            "query": "prometheus",
+            "refresh": 1,
+            "regex": "",
+            "type": "datasource"
+         },
+         {
+            "allValue": null,
+            "current": { },
+            "datasource": "$datasource",
+            "hide": 2,
+            "includeAll": false,
+            "label": "Cluster",
+            "multi": false,
+            "name": "cluster",
+            "options": [ ],
+            "query": "label_values(node_uname_info{job=\"node\", sysname!=\"Darwin\"}, cluster)",
+            "refresh": 2,
+            "regex": "",
+            "sort": 0,
+            "tagValuesQuery": "",
+            "tags": [ ],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+         },
+         {
+            "allValue": null,
+            "current": { },
+            "datasource": "$datasource",
+            "hide": 0,
+            "includeAll": false,
+            "label": "Instance",
+            "multi": false,
+            "name": "instance",
+            "options": [ ],
+            "query": "label_values(node_uname_info{job=\"node\", cluster=\"$cluster\", sysname!=\"Darwin\"}, instance)",
+            "refresh": 2,
+            "regex": "",
+            "sort": 0,
+            "tagValuesQuery": "",
+            "tags": [ ],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+         }
+      ]
+   },
+   "time": {
+      "from": "now-1h",
+      "to": "now"
+   },
+   "timepicker": {
+      "refresh_intervals": [
+         "5s",
+         "10s",
+         "30s",
+         "1m",
+         "5m",
+         "15m",
+         "30m",
+         "1h",
+         "2h",
+         "1d"
+      ],
+      "time_options": [
+         "5m",
+         "15m",
+         "1h",
+         "6h",
+         "12h",
+         "24h",
+         "2d",
+         "7d",
+         "30d"
+      ]
+   },
+   "timezone": "utc",
+   "title": "Node Exporter / Nodes",
+   "uid": "7d57716318ee0dddbac5a7f451fb7753",
+   "version": 0
+}
diff --git a/grafana/dashboards/infrastructure/prometheus-remote-write.json b/grafana/dashboards/infrastructure/prometheus-remote-write.json
new file mode 100644
index 0000000..304f88e
--- /dev/null
+++ b/grafana/dashboards/infrastructure/prometheus-remote-write.json
@@ -0,0 +1,1443 @@
+{
+   "__inputs": [ ],
+   "__requires": [ ],
+   "annotations": {
+      "list": [ ]
+   },
+   "editable": true,
+   "gnetId": null,
+   "graphTooltip": 0,
+   "hideControls": false,
+   "id": null,
+   "links": [ ],
+   "refresh": "60s",
+   "rows": [
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 2,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "(\n  prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n-  \n  ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"} != 0)\n)\n",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Highest Timestamp In vs. Highest Timestamp Sent",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 3,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "clamp_min(\n  rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])  \n- \n  ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}[5m])\n, 0)\n",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Rate[5m]",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Timestamps",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 4,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 12,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "rate(\n  prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n  ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}[5m]))\n- \n  (rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}[5m]))\n",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Rate, in vs. succeeded or dropped [5m]",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Samples",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 5,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "minSpan": 6,
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 12,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Current Shards",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 6,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 4,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "prometheus_remote_storage_shards_max{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Max Shards",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 7,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 4,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "prometheus_remote_storage_shards_min{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Min Shards",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 8,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 4,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "prometheus_remote_storage_shards_desired{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Desired Shards",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Shards",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 9,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "prometheus_remote_storage_shard_capacity{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Shard Capacity",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 10,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"} or prometheus_remote_storage_samples_pending{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Pending Samples",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Shard Details",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 11,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "prometheus_tsdb_wal_segment_current{cluster=~\"$cluster\", instance=~\"$instance\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "TSDB Current Segment",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "none",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 12,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "prometheus_wal_watcher_current_segment{cluster=~\"$cluster\", instance=~\"$instance\"}",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}} {{consumer}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Remote Write Current Segment",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "none",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Segments",
+         "titleSize": "h6",
+         "type": "row"
+      },
+      {
+         "collapse": false,
+         "collapsed": false,
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 13,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 3,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}[5m])",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Dropped Samples",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 14,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 3,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "rate(prometheus_remote_storage_failed_samples_total{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}[5m])",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Failed Samples",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 15,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 3,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "rate(prometheus_remote_storage_retried_samples_total{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}[5m])",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Retried Samples",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            },
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "fillGradient": 0,
+               "gridPos": { },
+               "id": 16,
+               "legend": {
+                  "alignAsTable": false,
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "rightSide": false,
+                  "show": true,
+                  "sideWidth": null,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "repeat": null,
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 3,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "rate(prometheus_remote_storage_enqueue_retries_total{cluster=~\"$cluster\", instance=~\"$instance\", url=~\"$url\"}[5m])",
+                     "format": "time_series",
+                     "intervalFactor": 2,
+                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
+                     "refId": "A"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Enqueue Retries",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 0,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": true
+                  }
+               ]
+            }
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Misc. Rates",
+         "titleSize": "h6",
+         "type": "row"
+      }
+   ],
+   "schemaVersion": 14,
+   "style": "dark",
+   "tags": [
+      "prometheus-mixin"
+   ],
+   "templating": {
+      "list": [
+         {
+            "hide": 0,
+            "label": null,
+            "name": "datasource",
+            "options": [ ],
+            "query": "prometheus",
+            "refresh": 1,
+            "regex": "",
+            "type": "datasource"
+         },
+         {
+            "allValue": null,
+            "current": {
+               "text": {
+                  "selected": true,
+                  "text": "All",
+                  "value": "$__all"
+               },
+               "value": {
+                  "selected": true,
+                  "text": "All",
+                  "value": "$__all"
+               }
+            },
+            "datasource": "$datasource",
+            "hide": 0,
+            "includeAll": true,
+            "label": null,
+            "multi": false,
+            "name": "cluster",
+            "options": [ ],
+            "query": "label_values(prometheus_build_info, cluster)",
+            "refresh": 2,
+            "regex": "",
+            "sort": 0,
+            "tagValuesQuery": "",
+            "tags": [ ],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+         },
+         {
+            "allValue": null,
+            "current": {
+               "text": {
+                  "selected": true,
+                  "text": "All",
+                  "value": "$__all"
+               },
+               "value": {
+                  "selected": true,
+                  "text": "All",
+                  "value": "$__all"
+               }
+            },
+            "datasource": "$datasource",
+            "hide": 0,
+            "includeAll": true,
+            "label": null,
+            "multi": false,
+            "name": "instance",
+            "options": [ ],
+            "query": "label_values(prometheus_build_info{cluster=~\"$cluster\"}, instance)",
+            "refresh": 2,
+            "regex": "",
+            "sort": 0,
+            "tagValuesQuery": "",
+            "tags": [ ],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+         },
+         {
+            "allValue": null,
+            "current": { },
+            "datasource": "$datasource",
+            "hide": 0,
+            "includeAll": true,
+            "label": null,
+            "multi": false,
+            "name": "url",
+            "options": [ ],
+            "query": "label_values(prometheus_remote_storage_shards{cluster=~\"$cluster\", instance=~\"$instance\"}, url)",
+            "refresh": 2,
+            "regex": "",
+            "sort": 0,
+            "tagValuesQuery": "",
+            "tags": [ ],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+         }
+      ]
+   },
+   "time": {
+      "from": "now-6h",
+      "to": "now"
+   },
+   "timepicker": {
+      "refresh_intervals": [
+         "5s",
+         "10s",
+         "30s",
+         "1m",
+         "5m",
+         "15m",
+         "30m",
+         "1h",
+         "2h",
+         "1d"
+      ],
+      "time_options": [
+         "5m",
+         "15m",
+         "1h",
+         "6h",
+         "12h",
+         "24h",
+         "2d",
+         "7d",
+         "30d"
+      ]
+   },
+   "timezone": "browser",
+   "title": "Prometheus / Remote Write",
+   "version": 0
+}
diff --git a/grafana/dashboards/infrastructure/prometheus.json b/grafana/dashboards/infrastructure/prometheus.json
index 4f9948a..ab442f5 100644
--- a/grafana/dashboards/infrastructure/prometheus.json
+++ b/grafana/dashboards/infrastructure/prometheus.json
@@ -1,3966 +1,1083 @@
 {
-  "annotations": {
-    "list": [
+   "annotations": {
+      "list": [ ]
+   },
+   "editable": true,
+   "gnetId": null,
+   "graphTooltip": 0,
+   "hideControls": false,
+   "links": [ ],
+   "refresh": "60s",
+   "rows": [
       {
-        "builtIn": 1,
-        "datasource": {
-          "type": "grafana",
-          "uid": "-- Grafana --"
-        },
-        "enable": true,
-        "hide": true,
-        "iconColor": "rgba(0, 211, 255, 1)",
-        "name": "Annotations & Alerts",
-        "type": "dashboard"
-      },
-      {
-        "datasource": {
-          "uid": "${DS_THEMIS}"
-        },
-        "enable": true,
-        "expr": "sum(changes(prometheus_config_last_reload_success_timestamp_seconds{instance=~\"$instance\"}[10m])) by (instance)",
-        "hide": false,
-        "iconColor": "rgb(0, 96, 19)",
-        "limit": 100,
-        "name": "reloads",
-        "showIn": 0,
-        "step": "5m",
-        "type": "alert"
-      },
-      {
-        "datasource": {
-          "uid": "${DS_THEMIS}"
-        },
-        "enable": true,
-        "expr": "count(sum(up{instance=\"$instance\"}) by (instance) < 1)",
-        "hide": false,
-        "iconColor": "rgba(255, 96, 96, 1)",
-        "limit": 100,
-        "name": "down",
-        "showIn": 0,
-        "step": "5m",
-        "type": "alert"
-      }
-    ]
-  },
-  "description": "Get started faster with Grafana Cloud then easily build these dashboards. https://grafana.com/products/cloud/\nOverview of metrics from Prometheus 2.0.  \nUseful for using prometheus to monitor your prometheus.\nRevisions welcome!",
-  "editable": true,
-  "fiscalYearStartMonth": 0,
-  "gnetId": 3662,
-  "graphTooltip": 0,
-  "id": 69,
-  "links": [],
-  "panels": [
-    {
-      "collapsed": false,
-      "datasource": {
-        "type": "datasource",
-        "uid": "grafana"
-      },
-      "gridPos": {
-        "h": 1,
-        "w": 24,
-        "x": 0,
-        "y": 0
-      },
-      "id": 34,
-      "panels": [],
-      "targets": [
-        {
-          "datasource": {
-            "type": "datasource",
-            "uid": "grafana"
-          },
-          "refId": "A"
-        }
-      ],
-      "title": "at a glance",
-      "type": "row"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "Percentage of uptime during the most recent $interval period.  Change the period with the 'interval' dropdown above.",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "decimals": 3,
-          "mappings": [
+         "collapse": false,
+         "height": "250px",
+         "panels": [
             {
-              "options": {
-                "match": "null",
-                "result": {
-                  "text": "N/A"
-                }
-              },
-              "type": "special"
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "id": 1,
+               "legend": {
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "show": true,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null as zero",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 12,
+               "stack": false,
+               "steppedLine": false,
+               "styles": [
+                  {
+                     "alias": "Time",
+                     "dateFormat": "YYYY-MM-DD HH:mm:ss",
+                     "pattern": "Time",
+                     "type": "hidden"
+                  },
+                  {
+                     "alias": "Count",
+                     "colorMode": null,
+                     "colors": [ ],
+                     "dateFormat": "YYYY-MM-DD HH:mm:ss",
+                     "decimals": 2,
+                     "link": false,
+                     "linkTargetBlank": false,
+                     "linkTooltip": "Drill down",
+                     "linkUrl": "",
+                     "pattern": "Value #A",
+                     "thresholds": [ ],
+                     "type": "hidden",
+                     "unit": "short"
+                  },
+                  {
+                     "alias": "Uptime",
+                     "colorMode": null,
+                     "colors": [ ],
+                     "dateFormat": "YYYY-MM-DD HH:mm:ss",
+                     "decimals": 2,
+                     "link": false,
+                     "linkTargetBlank": false,
+                     "linkTooltip": "Drill down",
+                     "linkUrl": "",
+                     "pattern": "Value #B",
+                     "thresholds": [ ],
+                     "type": "number",
+                     "unit": "s"
+                  },
+                  {
+                     "alias": "Cluster",
+                     "colorMode": null,
+                     "colors": [ ],
+                     "dateFormat": "YYYY-MM-DD HH:mm:ss",
+                     "decimals": 2,
+                     "link": false,
+                     "linkTargetBlank": false,
+                     "linkTooltip": "Drill down",
+                     "linkUrl": "",
+                     "pattern": "cluster",
+                     "thresholds": [ ],
+                     "type": "number",
+                     "unit": "short"
+                  },
+                  {
+                     "alias": "Instance",
+                     "colorMode": null,
+                     "colors": [ ],
+                     "dateFormat": "YYYY-MM-DD HH:mm:ss",
+                     "decimals": 2,
+                     "link": false,
+                     "linkTargetBlank": false,
+                     "linkTooltip": "Drill down",
+                     "linkUrl": "",
+                     "pattern": "instance",
+                     "thresholds": [ ],
+                     "type": "number",
+                     "unit": "short"
+                  },
+                  {
+                     "alias": "Job",
+                     "colorMode": null,
+                     "colors": [ ],
+                     "dateFormat": "YYYY-MM-DD HH:mm:ss",
+                     "decimals": 2,
+                     "link": false,
+                     "linkTargetBlank": false,
+                     "linkTooltip": "Drill down",
+                     "linkUrl": "",
+                     "pattern": "job",
+                     "thresholds": [ ],
+                     "type": "number",
+                     "unit": "short"
+                  },
+                  {
+                     "alias": "Version",
+                     "colorMode": null,
+                     "colors": [ ],
+                     "dateFormat": "YYYY-MM-DD HH:mm:ss",
+                     "decimals": 2,
+                     "link": false,
+                     "linkTargetBlank": false,
+                     "linkTooltip": "Drill down",
+                     "linkUrl": "",
+                     "pattern": "version",
+                     "thresholds": [ ],
+                     "type": "number",
+                     "unit": "short"
+                  },
+                  {
+                     "alias": "",
+                     "colorMode": null,
+                     "colors": [ ],
+                     "dateFormat": "YYYY-MM-DD HH:mm:ss",
+                     "decimals": 2,
+                     "pattern": "/.*/",
+                     "thresholds": [ ],
+                     "type": "string",
+                     "unit": "short"
+                  }
+               ],
+               "targets": [
+                  {
+                     "expr": "count by (cluster, job, instance, version) (prometheus_build_info{cluster=~\"$cluster\", job=~\"$job\", instance=~\"$instance\"})",
+                     "format": "table",
+                     "instant": true,
+                     "legendFormat": "",
+                     "refId": "A"
+                  },
+                  {
+                     "expr": "max by (cluster, job, instance) (time() - process_start_time_seconds{cluster=~\"$cluster\", job=~\"$job\", instance=~\"$instance\"})",
+                     "format": "table",
+                     "instant": true,
+                     "legendFormat": "",
+                     "refId": "B"
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Prometheus Stats",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "transform": "table",
+               "type": "table",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": false
+                  }
+               ]
             }
-          ],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(245, 54, 54, 0.9)",
-                "value": null
-              },
-              {
-                "color": "rgba(237, 129, 40, 0.89)",
-                "value": 90
-              },
-              {
-                "color": "rgba(50, 172, 45, 0.97)",
-                "value": 99
-              }
-            ]
-          },
-          "unit": "none"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 6,
-        "x": 0,
-        "y": 1
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Prometheus Stats",
+         "titleSize": "h6"
       },
-      "id": 2,
-      "maxDataPoints": 100,
-      "options": {
-        "colorMode": "value",
-        "graphMode": "none",
-        "justifyMode": "auto",
-        "orientation": "horizontal",
-        "percentChangeColorMode": "standard",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "showPercentChange": false,
-        "textMode": "auto",
-        "wideLayout": true
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "avg(avg_over_time(up{instance=~\"$instance\",job=~\"$job\"}[$interval]) * 100)",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "",
-          "refId": "A",
-          "step": 40
-        }
-      ],
-      "title": "Uptime [$interval]",
-      "type": "stat"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "Servers which are DOWN RIGHT NOW! \nFIX THEM!!",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "custom": {
-            "align": "auto",
-            "cellOptions": {
-              "type": "auto"
-            },
-            "inspect": false
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          }
-        },
-        "overrides": [
-          {
-            "matcher": {
-              "id": "byName",
-              "options": "Time"
-            },
-            "properties": [
-              {
-                "id": "displayName",
-                "value": "Time"
-              },
-              {
-                "id": "custom.hidden",
-                "value": true
-              },
-              {
-                "id": "custom.align"
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byRegexp",
-              "options": "/__name__|job|Value/"
-            },
-            "properties": [
-              {
-                "id": "unit",
-                "value": "short"
-              },
-              {
-                "id": "decimals",
-                "value": 2
-              },
-              {
-                "id": "custom.hidden",
-                "value": true
-              },
-              {
-                "id": "custom.align"
-              }
-            ]
-          },
-          {
-            "matcher": {
-              "id": "byName",
-              "options": "instance"
+      {
+         "collapse": false,
+         "height": "250px",
+         "panels": [
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "id": 2,
+               "legend": {
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "show": true,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null as zero",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "sum(rate(prometheus_target_sync_length_seconds_sum{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}[5m])) by (cluster, job, scrape_job, instance) * 1e3",
+                     "format": "time_series",
+                     "legendFormat": "{{cluster}}:{{job}}:{{instance}}:{{scrape_job}}",
+                     "legendLink": null
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Target Sync",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "ms",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": false
+                  }
+               ]
             },
-            "properties": [
-              {
-                "id": "displayName",
-                "value": "   "
-              },
-              {
-                "id": "unit",
-                "value": "short"
-              },
-              {
-                "id": "decimals",
-                "value": 2
-              },
-              {
-                "id": "custom.cellOptions",
-                "value": {
-                  "type": "color-background"
-                }
-              },
-              {
-                "id": "custom.align"
-              },
-              {
-                "id": "thresholds",
-                "value": {
-                  "mode": "absolute",
-                  "steps": [
-                    {
-                      "color": "rgba(255, 0, 0, 0.9)",
-                      "value": null
-                    },
-                    {
-                      "color": "rgba(237, 129, 40, 0.89)",
-                      "value": null
-                    },
-                    {
-                      "color": "rgba(255, 0, 0, 0.97)",
-                      "value": null
-                    },
-                    {
-                      "value": null
-                    }
-                  ]
-                }
-              }
-            ]
-          }
-        ]
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 6,
-        "x": 6,
-        "y": 1
-      },
-      "hideTimeOverride": true,
-      "id": 25,
-      "options": {
-        "cellHeight": "sm",
-        "footer": {
-          "countRows": false,
-          "fields": "",
-          "reducer": [
-            "sum"
-          ],
-          "show": false
-        },
-        "showHeader": true
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "up{instance=~\"$instance\",job=~\"$job\"} < 1",
-          "format": "table",
-          "intervalFactor": 2,
-          "refId": "A",
-          "step": 2
-        }
-      ],
-      "timeFrom": "1s",
-      "title": "Currently Down",
-      "transformations": [
-        {
-          "id": "merge",
-          "options": {
-            "reducers": []
-          }
-        }
-      ],
-      "type": "table"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "Total number of time series in prometheus",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [
             {
-              "options": {
-                "match": "null",
-                "result": {
-                  "text": "N/A"
-                }
-              },
-              "type": "special"
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "id": 3,
+               "legend": {
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "show": true,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 0,
+               "links": [ ],
+               "nullPointMode": "null as zero",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "sum by (cluster, job, instance) (prometheus_sd_discovered_targets{cluster=~\"$cluster\", job=~\"$job\",instance=~\"$instance\"})",
+                     "format": "time_series",
+                     "legendFormat": "{{cluster}}:{{job}}:{{instance}}",
+                     "legendLink": null
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Targets",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": false
+                  }
+               ]
             }
-          ],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(50, 172, 45, 0.97)",
-                "value": null
-              },
-              {
-                "color": "rgba(237, 129, 40, 0.89)",
-                "value": 1000000
-              },
-              {
-                "color": "rgba(245, 54, 54, 0.9)",
-                "value": 2000000
-              }
-            ]
-          },
-          "unit": "none"
-        },
-        "overrides": []
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Discovery",
+         "titleSize": "h6"
       },
-      "gridPos": {
-        "h": 7,
-        "w": 6,
-        "x": 12,
-        "y": 1
-      },
-      "id": 12,
-      "maxDataPoints": 100,
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "auto",
-        "orientation": "horizontal",
-        "percentChangeColorMode": "standard",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "showPercentChange": false,
-        "textMode": "auto",
-        "wideLayout": true
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(prometheus_tsdb_head_series{job=~\"$job\",instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "refId": "B",
-          "step": 40
-        }
-      ],
-      "title": "Total Series",
-      "type": "stat"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "fixedColor": "rgb(31, 120, 193)",
-            "mode": "fixed"
-          },
-          "mappings": [
+      {
+         "collapse": false,
+         "height": "250px",
+         "panels": [
             {
-              "options": {
-                "match": "null",
-                "result": {
-                  "text": "N/A"
-                }
-              },
-              "type": "special"
-            }
-          ],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "none"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 6,
-        "x": 18,
-        "y": 1
-      },
-      "id": 14,
-      "maxDataPoints": 100,
-      "options": {
-        "colorMode": "none",
-        "graphMode": "area",
-        "justifyMode": "auto",
-        "orientation": "horizontal",
-        "percentChangeColorMode": "standard",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "showPercentChange": false,
-        "textMode": "auto",
-        "wideLayout": true
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(prometheus_tsdb_head_chunks{job=~\"$job\",instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "refId": "B",
-          "step": 40
-        }
-      ],
-      "title": "Memory Chunks",
-      "type": "stat"
-    },
-    {
-      "collapsed": false,
-      "datasource": {
-        "type": "datasource",
-        "uid": "grafana"
-      },
-      "gridPos": {
-        "h": 1,
-        "w": 24,
-        "x": 0,
-        "y": 8
-      },
-      "id": 35,
-      "panels": [],
-      "targets": [
-        {
-          "datasource": {
-            "type": "datasource",
-            "uid": "grafana"
-          },
-          "refId": "A"
-        }
-      ],
-      "title": "quick numbers",
-      "type": "row"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "The total number of rule group evaluations missed due to slow rule group evaluation.",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 1,
+               "id": 4,
+               "legend": {
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "show": true,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 1,
+               "links": [ ],
+               "nullPointMode": "null as zero",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 4,
+               "stack": false,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "rate(prometheus_target_interval_length_seconds_sum{cluster=~\"$cluster\", job=~\"$job\",instance=~\"$instance\"}[5m]) / rate(prometheus_target_interval_length_seconds_count{cluster=~\"$cluster\", job=~\"$job\",instance=~\"$instance\"}[5m]) * 1e3",
+                     "format": "time_series",
+                     "legendFormat": "{{cluster}}:{{job}}:{{instance}} {{interval}} configured",
+                     "legendLink": null
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Average Scrape Interval Duration",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "ms",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": false
+                  }
+               ]
+            },
             {
-              "options": {
-                "match": "null",
-                "result": {
-                  "text": "N/A"
-                }
-              },
-              "type": "special"
-            }
-          ],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(50, 172, 45, 0.97)",
-                "value": null
-              },
-              {
-                "color": "rgba(237, 129, 40, 0.89)",
-                "value": 1
-              },
-              {
-                "color": "rgba(245, 54, 54, 0.9)",
-                "value": 10
-              }
-            ]
-          },
-          "unit": "none"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 4,
-        "x": 0,
-        "y": 9
-      },
-      "id": 16,
-      "maxDataPoints": 100,
-      "options": {
-        "colorMode": "value",
-        "graphMode": "none",
-        "justifyMode": "auto",
-        "orientation": "horizontal",
-        "percentChangeColorMode": "standard",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "showPercentChange": false,
-        "textMode": "auto",
-        "wideLayout": true
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(sum_over_time(prometheus_evaluator_iterations_missed_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "refId": "A",
-          "step": 40
-        }
-      ],
-      "title": "Missed Iterations [$interval]",
-      "type": "stat"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "The total number of rule group evaluations skipped due to throttled metric storage.",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "id": 5,
+               "legend": {
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "show": true,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 0,
+               "links": [ ],
+               "nullPointMode": "null as zero",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 4,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "sum by (cluster, job, instance) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}[1m]))",
+                     "format": "time_series",
+                     "legendFormat": "exceeded body size limit: {{cluster}} {{job}} {{instance}}",
+                     "legendLink": null
+                  },
+                  {
+                     "expr": "sum by (cluster, job, instance) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}[1m]))",
+                     "format": "time_series",
+                     "legendFormat": "exceeded sample limit: {{cluster}} {{job}} {{instance}}",
+                     "legendLink": null
+                  },
+                  {
+                     "expr": "sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}[1m]))",
+                     "format": "time_series",
+                     "legendFormat": "duplicate timestamp: {{cluster}} {{job}} {{instance}}",
+                     "legendLink": null
+                  },
+                  {
+                     "expr": "sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}[1m]))",
+                     "format": "time_series",
+                     "legendFormat": "out of bounds: {{cluster}} {{job}} {{instance}}",
+                     "legendLink": null
+                  },
+                  {
+                     "expr": "sum by (cluster, job, instance) (rate(prometheus_target_scrapes_sample_out_of_order_total{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}[1m]))",
+                     "format": "time_series",
+                     "legendFormat": "out of order: {{cluster}} {{job}} {{instance}}",
+                     "legendLink": null
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Scrape failures",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": false
+                  }
+               ]
+            },
             {
-              "options": {
-                "match": "null",
-                "result": {
-                  "text": "N/A"
-                }
-              },
-              "type": "special"
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "id": 6,
+               "legend": {
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "show": true,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 0,
+               "links": [ ],
+               "nullPointMode": "null as zero",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 4,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "rate(prometheus_tsdb_head_samples_appended_total{cluster=~\"$cluster\", job=~\"$job\",instance=~\"$instance\"}[5m])",
+                     "format": "time_series",
+                     "legendFormat": "{{cluster}} {{job}} {{instance}}",
+                     "legendLink": null
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Appended Samples",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": false
+                  }
+               ]
             }
-          ],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(50, 172, 45, 0.97)",
-                "value": null
-              },
-              {
-                "color": "rgba(237, 129, 40, 0.89)",
-                "value": 1
-              },
-              {
-                "color": "rgba(245, 54, 54, 0.9)",
-                "value": 10
-              }
-            ]
-          },
-          "unit": "none"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 4,
-        "x": 4,
-        "y": 9
-      },
-      "id": 18,
-      "maxDataPoints": 100,
-      "options": {
-        "colorMode": "value",
-        "graphMode": "none",
-        "justifyMode": "auto",
-        "orientation": "horizontal",
-        "percentChangeColorMode": "standard",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "showPercentChange": false,
-        "textMode": "auto",
-        "wideLayout": true
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Retrieval",
+         "titleSize": "h6"
       },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(sum_over_time(prometheus_evaluator_iterations_skipped_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "refId": "A",
-          "step": 40
-        }
-      ],
-      "title": "Skipped Iterations [$interval]",
-      "type": "stat"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "Total number of scrapes that hit the sample limit and were rejected.",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [
+      {
+         "collapse": false,
+         "height": "250px",
+         "panels": [
             {
-              "options": {
-                "match": "null",
-                "result": {
-                  "text": "N/A"
-                }
-              },
-              "type": "special"
-            }
-          ],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(50, 172, 45, 0.97)",
-                "value": null
-              },
-              {
-                "color": "rgba(237, 129, 40, 0.89)",
-                "value": 1
-              },
-              {
-                "color": "rgba(245, 54, 54, 0.9)",
-                "value": 10
-              }
-            ]
-          },
-          "unit": "none"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 4,
-        "x": 8,
-        "y": 9
-      },
-      "id": 19,
-      "maxDataPoints": 100,
-      "options": {
-        "colorMode": "value",
-        "graphMode": "none",
-        "justifyMode": "auto",
-        "orientation": "horizontal",
-        "percentChangeColorMode": "standard",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "showPercentChange": false,
-        "textMode": "auto",
-        "wideLayout": true
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(sum_over_time(prometheus_target_scrapes_exceeded_sample_limit_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "refId": "A",
-          "step": 40
-        }
-      ],
-      "title": "Tardy Scrapes [$interval]",
-      "type": "stat"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "Number of times the database failed to reload block data from disk.",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "id": 7,
+               "legend": {
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "show": true,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 0,
+               "links": [ ],
+               "nullPointMode": "null as zero",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "prometheus_tsdb_head_series{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}",
+                     "format": "time_series",
+                     "legendFormat": "{{cluster}} {{job}} {{instance}} head series",
+                     "legendLink": null
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Head Series",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": false
+                  }
+               ]
+            },
             {
-              "options": {
-                "match": "null",
-                "result": {
-                  "text": "N/A"
-                }
-              },
-              "type": "special"
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "id": 8,
+               "legend": {
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "show": true,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 0,
+               "links": [ ],
+               "nullPointMode": "null as zero",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "prometheus_tsdb_head_chunks{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\"}",
+                     "format": "time_series",
+                     "legendFormat": "{{cluster}} {{job}} {{instance}} head chunks",
+                     "legendLink": null
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Head Chunks",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": false
+                  }
+               ]
             }
-          ],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(50, 172, 45, 0.97)",
-                "value": null
-              },
-              {
-                "color": "rgba(237, 129, 40, 0.89)",
-                "value": 1
-              },
-              {
-                "color": "rgba(245, 54, 54, 0.9)",
-                "value": 10
-              }
-            ]
-          },
-          "unit": "none"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 4,
-        "x": 12,
-        "y": 9
-      },
-      "id": 13,
-      "maxDataPoints": 100,
-      "options": {
-        "colorMode": "value",
-        "graphMode": "none",
-        "justifyMode": "auto",
-        "orientation": "horizontal",
-        "percentChangeColorMode": "standard",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "showPercentChange": false,
-        "textMode": "auto",
-        "wideLayout": true
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(sum_over_time(prometheus_tsdb_reloads_failures_total{job=~\"$job\",instance=~\"$instance\"}[$interval]))",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "refId": "A",
-          "step": 40
-        }
-      ],
-      "title": "Reload Failures [$interval]",
-      "type": "stat"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Storage",
+         "titleSize": "h6"
       },
-      "description": "Sum of all skipped scrapes",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [
+      {
+         "collapse": false,
+         "height": "250px",
+         "panels": [
             {
-              "options": {
-                "match": "null",
-                "result": {
-                  "text": "N/A"
-                }
-              },
-              "type": "special"
-            }
-          ],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "rgba(50, 172, 45, 0.97)",
-                "value": null
-              },
-              {
-                "color": "rgba(237, 129, 40, 0.89)",
-                "value": 1
-              },
-              {
-                "color": "rgba(245, 54, 54, 0.9)",
-                "value": 10
-              }
-            ]
-          },
-          "unit": "none"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 8,
-        "x": 16,
-        "y": 9
-      },
-      "id": 20,
-      "maxDataPoints": 100,
-      "options": {
-        "colorMode": "value",
-        "graphMode": "none",
-        "justifyMode": "auto",
-        "orientation": "horizontal",
-        "percentChangeColorMode": "standard",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "showPercentChange": false,
-        "textMode": "auto",
-        "wideLayout": true
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(sum_over_time(prometheus_target_scrapes_exceeded_sample_limit_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) + \nsum(sum_over_time(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) + \nsum(sum_over_time(prometheus_target_scrapes_sample_out_of_bounds_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) + \nsum(sum_over_time(prometheus_target_scrapes_sample_out_of_order_total{job=~\"$job\",instance=~\"$instance\"}[$interval])) ",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "refId": "A",
-          "step": 40
-        }
-      ],
-      "title": "Skipped Scrapes [$interval]",
-      "type": "stat"
-    },
-    {
-      "collapsed": false,
-      "datasource": {
-        "type": "datasource",
-        "uid": "grafana"
-      },
-      "gridPos": {
-        "h": 1,
-        "w": 24,
-        "x": 0,
-        "y": 16
-      },
-      "id": 36,
-      "panels": [],
-      "targets": [
-        {
-          "datasource": {
-            "type": "datasource",
-            "uid": "grafana"
-          },
-          "refId": "A"
-        }
-      ],
-      "title": "errors",
-      "type": "row"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "All non-zero failures and errors",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Errors",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 24,
-        "x": 0,
-        "y": 17
-      },
-      "id": 33,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(net_conntrack_dialer_conn_failed_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Failed Connections",
-          "refId": "A",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_evaluator_iterations_missed_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Missed Iterations",
-          "refId": "B",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_evaluator_iterations_skipped_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Skipped Iterations",
-          "refId": "C",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_rule_evaluation_failures_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Evaluation",
-          "refId": "D",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_sd_azure_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Azure Refresh",
-          "refId": "E",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_sd_consul_rpc_failures_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Consul RPC",
-          "refId": "F",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_sd_dns_lookup_failures_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "DNS Lookup",
-          "refId": "G",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_sd_ec2_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "EC2 Refresh",
-          "refId": "H",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_sd_gce_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "GCE Refresh",
-          "refId": "I",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_sd_marathon_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Marathon Refresh",
-          "refId": "J",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_sd_openstack_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Openstack Refresh",
-          "refId": "K",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_sd_triton_refresh_failures_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Triton Refresh",
-          "refId": "L",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_target_scrapes_exceeded_sample_limit_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Sample Limit",
-          "refId": "M",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Duplicate Timestamp",
-          "refId": "N",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_target_scrapes_sample_out_of_bounds_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Timestamp Out of Bounds",
-          "refId": "O",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_target_scrapes_sample_out_of_order_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Sample Out of Order",
-          "refId": "P",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_treecache_zookeeper_failures_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Zookeeper",
-          "refId": "Q",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_tsdb_compactions_failed_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "TSDB Compactions",
-          "refId": "R",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_tsdb_head_series_not_found{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Series Not Found",
-          "refId": "S",
-          "step": 2
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(increase(prometheus_tsdb_reloads_failures_total{instance=~\"$instance\"}[5m])) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Reload",
-          "refId": "T",
-          "step": 2
-        }
-      ],
-      "title": "Failures and Errors",
-      "type": "timeseries"
-    },
-    {
-      "collapsed": false,
-      "datasource": {
-        "type": "datasource",
-        "uid": "grafana"
-      },
-      "gridPos": {
-        "h": 1,
-        "w": 24,
-        "x": 0,
-        "y": 24
-      },
-      "id": 37,
-      "panels": [],
-      "targets": [
-        {
-          "datasource": {
-            "type": "datasource",
-            "uid": "grafana"
-          },
-          "refId": "A"
-        }
-      ],
-      "title": "up",
-      "type": "row"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Up",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "normal"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "decimals": 0,
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "none"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 0,
-        "y": 25
-      },
-      "id": 1,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "up{instance=~\"$instance\",job=~\"$job\"}",
-          "format": "time_series",
-          "interval": "",
-          "intervalFactor": 1,
-          "legendFormat": "{{instance}}",
-          "refId": "A",
-          "step": 2
-        }
-      ],
-      "title": "Upness (stacked)",
-      "type": "timeseries"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Chunks",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 12,
-        "y": 25
-      },
-      "id": 5,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "prometheus_tsdb_head_chunks{job=~\"$job\",instance=~\"$instance\"}",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{instance}}",
-          "refId": "A",
-          "step": 4
-        }
-      ],
-      "title": "Storage Memory Chunks",
-      "type": "timeseries"
-    },
-    {
-      "collapsed": false,
-      "datasource": {
-        "type": "datasource",
-        "uid": "grafana"
-      },
-      "gridPos": {
-        "h": 1,
-        "w": 24,
-        "x": 0,
-        "y": 32
-      },
-      "id": 38,
-      "panels": [],
-      "targets": [
-        {
-          "datasource": {
-            "type": "datasource",
-            "uid": "grafana"
-          },
-          "refId": "A"
-        }
-      ],
-      "title": "series",
-      "type": "row"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Series",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 0,
-        "y": 33
-      },
-      "id": 3,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "prometheus_tsdb_head_series{job=~\"$job\",instance=~\"$instance\"}",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{instance}}",
-          "refId": "A",
-          "step": 4
-        }
-      ],
-      "title": "Series Count",
-      "type": "timeseries"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Series Count",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": [
-          {
-            "matcher": {
-              "id": "byName",
-              "options": "removed"
-            },
-            "properties": [
-              {
-                "id": "custom.transform",
-                "value": "negative-Y"
-              }
-            ]
-          }
-        ]
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 12,
-        "y": 33
-      },
-      "id": 32,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum( increase(prometheus_tsdb_head_series_created_total{instance=~\"$instance\"}[5m]) )",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "created",
-          "refId": "A",
-          "step": 4
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum( increase(prometheus_tsdb_head_series_removed_total{instance=~\"$instance\"}[5m]) )",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "removed",
-          "refId": "B",
-          "step": 4
-        }
-      ],
-      "title": "Series Created / Removed",
-      "type": "timeseries"
-    },
-    {
-      "collapsed": false,
-      "datasource": {
-        "type": "datasource",
-        "uid": "grafana"
-      },
-      "gridPos": {
-        "h": 1,
-        "w": 24,
-        "x": 0,
-        "y": 40
-      },
-      "id": 39,
-      "panels": [],
-      "targets": [
-        {
-          "datasource": {
-            "type": "datasource",
-            "uid": "grafana"
-          },
-          "refId": "A"
-        }
-      ],
-      "title": "appended samples",
-      "type": "row"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "Rate of total number of appended samples",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Samples / Second",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": [
-          {
-            "matcher": {
-              "id": "byName",
-              "options": "10.58.3.10:80"
-            },
-            "properties": [
-              {
-                "id": "color",
-                "value": {
-                  "fixedColor": "#BA43A9",
-                  "mode": "fixed"
-                }
-              }
-            ]
-          }
-        ]
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 24,
-        "x": 0,
-        "y": 41
-      },
-      "id": 4,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "rate(prometheus_tsdb_head_samples_appended_total{job=~\"$job\",instance=~\"$instance\"}[1m])",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{instance}}",
-          "refId": "A",
-          "step": 2
-        }
-      ],
-      "title": "Appended Samples per Second",
-      "type": "timeseries"
-    },
-    {
-      "collapsed": false,
-      "datasource": {
-        "type": "datasource",
-        "uid": "grafana"
-      },
-      "gridPos": {
-        "h": 1,
-        "w": 24,
-        "x": 0,
-        "y": 48
-      },
-      "id": 40,
-      "panels": [],
-      "targets": [
-        {
-          "datasource": {
-            "type": "datasource",
-            "uid": "grafana"
-          },
-          "refId": "A"
-        }
-      ],
-      "title": "sync",
-      "type": "row"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "Total number of syncs that were executed on a scrape pool.",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Syncs",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 0,
-        "y": 49
-      },
-      "id": 6,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(prometheus_target_scrape_pool_sync_total{job=~\"$job\",instance=~\"$instance\"}) by (scrape_job)",
-          "format": "time_series",
-          "hide": false,
-          "intervalFactor": 2,
-          "legendFormat": "{{scrape_job}}",
-          "refId": "B",
-          "step": 4
-        }
-      ],
-      "title": "Scrape Sync Total",
-      "type": "timeseries"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "Actual interval to sync the scrape pool.",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Milliseconds",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "id": 9,
+               "legend": {
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "show": true,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 0,
+               "links": [ ],
+               "nullPointMode": "null as zero",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "rate(prometheus_engine_query_duration_seconds_count{cluster=~\"$cluster\",job=~\"$job\",instance=~\"$instance\",slice=\"inner_eval\"}[5m])",
+                     "format": "time_series",
+                     "legendFormat": "{{cluster}} {{job}} {{instance}}",
+                     "legendLink": null
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Query Rate",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": false
+                  }
+               ]
             },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 12,
-        "y": 49
-      },
-      "id": 21,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(rate(prometheus_target_sync_length_seconds_sum{job=~\"$job\",instance=~\"$instance\"}[2m])) by (scrape_job) * 1000",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{scrape_job}}",
-          "refId": "A",
-          "step": 4
-        }
-      ],
-      "title": "Target Sync",
-      "type": "timeseries"
-    },
-    {
-      "collapsed": false,
-      "datasource": {
-        "type": "datasource",
-        "uid": "grafana"
-      },
-      "gridPos": {
-        "h": 1,
-        "w": 24,
-        "x": 0,
-        "y": 56
-      },
-      "id": 41,
-      "panels": [],
-      "targets": [
-        {
-          "datasource": {
-            "type": "datasource",
-            "uid": "grafana"
-          },
-          "refId": "A"
-        }
-      ],
-      "title": "scrapes",
-      "type": "row"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Seconds",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 0,
-        "y": 57
-      },
-      "id": 29,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "scrape_duration_seconds{instance=~\"$instance\"}",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{instance}}",
-          "refId": "A",
-          "step": 4
-        }
-      ],
-      "title": "Scrape Duration",
-      "type": "timeseries"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "Total number of rejected scrapes",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Scrapes",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "decimals": 0,
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 12,
-        "y": 57
-      },
-      "id": 30,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(prometheus_target_scrapes_exceeded_sample_limit_total{job=~\"$job\",instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "exceeded sample limit",
-          "refId": "A",
-          "step": 4
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~\"$job\",instance=~\"$instance\"})",
-          "format": "time_series",
-          "hide": false,
-          "intervalFactor": 2,
-          "legendFormat": "duplicate timestamp",
-          "refId": "B",
-          "step": 4
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(prometheus_target_scrapes_sample_out_of_bounds_total{job=~\"$job\",instance=~\"$instance\"})",
-          "format": "time_series",
-          "hide": false,
-          "intervalFactor": 2,
-          "legendFormat": "out of bounds",
-          "refId": "C",
-          "step": 4
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(prometheus_target_scrapes_sample_out_of_order_total{job=~\"$job\",instance=~\"$instance\"}) ",
-          "format": "time_series",
-          "hide": false,
-          "intervalFactor": 2,
-          "legendFormat": "out of order",
-          "refId": "D",
-          "step": 4
-        }
-      ],
-      "title": "Rejected Scrapes",
-      "type": "timeseries"
-    },
-    {
-      "collapsed": false,
-      "datasource": {
-        "type": "datasource",
-        "uid": "grafana"
-      },
-      "gridPos": {
-        "h": 1,
-        "w": 24,
-        "x": 0,
-        "y": 64
-      },
-      "id": 42,
-      "panels": [],
-      "targets": [
-        {
-          "datasource": {
-            "type": "datasource",
-            "uid": "grafana"
-          },
-          "refId": "A"
-        }
-      ],
-      "title": "durations",
-      "type": "row"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "The duration of rule group evaluations",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Milliseconds",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 0,
-        "y": 65
-      },
-      "id": 10,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "1000 * rate(prometheus_evaluator_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\"}[5m]) / rate(prometheus_evaluator_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[5m])",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{instance}}",
-          "refId": "E",
-          "step": 4
-        }
-      ],
-      "title": "Average Rule Evaluation Duration",
-      "type": "timeseries"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Microseconds",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 12,
-        "y": 65
-      },
-      "id": 11,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(rate(http_request_duration_microseconds_count{job=~\"$job\",instance=~\"$instance\"}[1m])) by (handler) > 0",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{handler}}",
-          "refId": "A",
-          "step": 4
-        }
-      ],
-      "title": "HTTP Request Duration",
-      "type": "timeseries"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Seconds",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 0,
-        "y": 72
-      },
-      "id": 15,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(prometheus_engine_query_duration_seconds_sum{job=~\"$job\",instance=~\"$instance\"}) by (slice)",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{slice}}",
-          "refId": "A",
-          "step": 4
-        }
-      ],
-      "title": "Prometheus Engine Query Duration Seconds",
-      "type": "timeseries"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "Rule-group evaluations \n - total\n - missed due to slow rule group evaluation\n - skipped due to throttled metric storage",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "iterations",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 12,
-        "y": 72
-      },
-      "id": 31,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(rate(prometheus_evaluator_iterations_total{job=~\"$job\", instance=~\"$instance\"}[5m]))",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Total",
-          "refId": "B",
-          "step": 4
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(rate(prometheus_evaluator_iterations_missed_total{job=~\"$job\", instance=~\"$instance\"}[5m]))",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Missed",
-          "refId": "A",
-          "step": 4
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(rate(prometheus_evaluator_iterations_skipped_total{job=~\"$job\", instance=~\"$instance\"}[5m]))",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "Skipped",
-          "refId": "C",
-          "step": 4
-        }
-      ],
-      "title": "Rule Evaluator Iterations",
-      "type": "timeseries"
-    },
-    {
-      "collapsed": false,
-      "datasource": {
-        "type": "datasource",
-        "uid": "grafana"
-      },
-      "gridPos": {
-        "h": 1,
-        "w": 24,
-        "x": 0,
-        "y": 79
-      },
-      "id": 43,
-      "panels": [],
-      "targets": [
-        {
-          "datasource": {
-            "type": "datasource",
-            "uid": "grafana"
-          },
-          "refId": "A"
-        }
-      ],
-      "title": "notifications",
-      "type": "row"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Notifications",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 24,
-        "x": 0,
-        "y": 80
-      },
-      "id": 22,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "rate(prometheus_notifications_sent_total[5m])",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{instance}}",
-          "refId": "A",
-          "step": 2
-        }
-      ],
-      "title": "Notifications Sent",
-      "type": "timeseries"
-    },
-    {
-      "collapsed": false,
-      "datasource": {
-        "type": "datasource",
-        "uid": "grafana"
-      },
-      "gridPos": {
-        "h": 1,
-        "w": 24,
-        "x": 0,
-        "y": 87
-      },
-      "id": 44,
-      "panels": [],
-      "targets": [
-        {
-          "datasource": {
-            "type": "datasource",
-            "uid": "grafana"
-          },
-          "refId": "A"
-        }
-      ],
-      "title": "config",
-      "type": "row"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Minutes",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 0,
-        "y": 88
-      },
-      "id": 23,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "(time() - prometheus_config_last_reload_success_timestamp_seconds{job=~\"$job\",instance=~\"$instance\"}) / 60",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{instance}}",
-          "refId": "A",
-          "step": 4
-        }
-      ],
-      "title": "Minutes Since Successful Config Reload",
-      "type": "timeseries"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Success",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "decimals": 0,
-          "mappings": [],
-          "max": 1,
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 12,
-        "y": 88
-      },
-      "id": 24,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "prometheus_config_last_reload_successful{job=~\"$job\",instance=~\"$instance\"}",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{instance}}",
-          "refId": "A",
-          "step": 4
-        }
-      ],
-      "title": "Successful Config Reload",
-      "type": "timeseries"
-    },
-    {
-      "collapsed": false,
-      "datasource": {
-        "type": "datasource",
-        "uid": "grafana"
-      },
-      "gridPos": {
-        "h": 1,
-        "w": 24,
-        "x": 0,
-        "y": 95
-      },
-      "id": 45,
-      "panels": [],
-      "targets": [
-        {
-          "datasource": {
-            "type": "datasource",
-            "uid": "grafana"
-          },
-          "refId": "A"
-        }
-      ],
-      "title": "garbage collection",
-      "type": "row"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "GC invocation durations",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 24,
-        "x": 0,
-        "y": 96
-      },
-      "id": 28,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(rate(go_gc_duration_seconds_sum{instance=~\"$instance\",job=~\"$job\"}[2m])) by (instance)",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{instance}}",
-          "refId": "A",
-          "step": 2
-        }
-      ],
-      "title": "GC Rate / 2m",
-      "type": "timeseries"
-    },
-    {
-      "collapsed": false,
-      "datasource": {
-        "type": "datasource",
-        "uid": "grafana"
-      },
-      "gridPos": {
-        "h": 1,
-        "w": 24,
-        "x": 0,
-        "y": 103
-      },
-      "id": 46,
-      "panels": [],
-      "targets": [
-        {
-          "datasource": {
-            "type": "datasource",
-            "uid": "grafana"
-          },
-          "refId": "A"
-        }
-      ],
-      "title": "Broken, ignore",
-      "type": "row"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "description": "This is probably wrong!  Please help.",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "normal"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "bytes"
-        },
-        "overrides": [
-          {
-            "matcher": {
-              "id": "byName",
-              "options": "allocated"
-            },
-            "properties": [
-              {
-                "id": "custom.stacking",
-                "value": {
-                  "group": "A",
-                  "mode": "none"
-                }
-              }
-            ]
-          }
-        ]
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 12,
-        "x": 0,
-        "y": 104
-      },
-      "id": 26,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_alloc_bytes_total{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "hide": true,
-          "intervalFactor": 2,
-          "legendFormat": "alloc_bytes_total",
-          "refId": "A",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_alloc_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "hide": false,
-          "intervalFactor": 2,
-          "legendFormat": "allocated",
-          "refId": "B",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_buck_hash_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "profiling bucket hash table",
-          "refId": "C",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_gc_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "GC metadata",
-          "refId": "D",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_heap_alloc_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "heap in-use",
-          "refId": "E",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_heap_idle_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "heap idle",
-          "refId": "F",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_heap_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "heap in use",
-          "refId": "G",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_heap_released_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "heap released",
-          "refId": "H",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_heap_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "heap system",
-          "refId": "I",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_mcache_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "mcache in use",
-          "refId": "J",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_mcache_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "mcache sys",
-          "refId": "K",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_mspan_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "mspan in use",
-          "refId": "L",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_mspan_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "mspan sys",
-          "refId": "M",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_next_gc_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "heap next gc",
-          "refId": "N",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_other_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "other sys",
-          "refId": "O",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_stack_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "stack in use",
-          "refId": "P",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_stack_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "stack sys",
-          "refId": "Q",
-          "step": 10
-        },
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(go_memstats_sys_bytes{job=~\"$job\", instance=~\"$instance\"})",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "sys",
-          "refId": "R",
-          "step": 10
-        }
-      ],
-      "title": "Go Memory Usage (FIXME)",
-      "type": "timeseries"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Seconds",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 6,
-        "x": 12,
-        "y": 104
-      },
-      "id": 9,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "prometheus_target_interval_length_seconds{instance=~\"$instance\", job=~\"$job\"}",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{quantile}} {{interval}}",
-          "refId": "A",
-          "step": 20
-        }
-      ],
-      "title": "Scrape Duration",
-      "type": "timeseries"
-    },
-    {
-      "datasource": {
-        "uid": "${DS_THEMIS}"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Scrapes",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
+            {
+               "aliasColors": { },
+               "bars": false,
+               "dashLength": 10,
+               "dashes": false,
+               "datasource": "$datasource",
+               "fill": 10,
+               "id": 10,
+               "legend": {
+                  "avg": false,
+                  "current": false,
+                  "max": false,
+                  "min": false,
+                  "show": true,
+                  "total": false,
+                  "values": false
+               },
+               "lines": true,
+               "linewidth": 0,
+               "links": [ ],
+               "nullPointMode": "null as zero",
+               "percentage": false,
+               "pointradius": 5,
+               "points": false,
+               "renderer": "flot",
+               "seriesOverrides": [ ],
+               "spaceLength": 10,
+               "span": 6,
+               "stack": true,
+               "steppedLine": false,
+               "targets": [
+                  {
+                     "expr": "max by (slice) (prometheus_engine_query_duration_seconds{quantile=\"0.9\",cluster=~\"$cluster\", job=~\"$job\",instance=~\"$instance\"}) * 1e3",
+                     "format": "time_series",
+                     "legendFormat": "{{slice}}",
+                     "legendLink": null
+                  }
+               ],
+               "thresholds": [ ],
+               "timeFrom": null,
+               "timeShift": null,
+               "title": "Stage Duration",
+               "tooltip": {
+                  "shared": true,
+                  "sort": 2,
+                  "value_type": "individual"
+               },
+               "type": "graph",
+               "xaxis": {
+                  "buckets": null,
+                  "mode": "time",
+                  "name": null,
+                  "show": true,
+                  "values": [ ]
+               },
+               "yaxes": [
+                  {
+                     "format": "ms",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": 0,
+                     "show": true
+                  },
+                  {
+                     "format": "short",
+                     "label": null,
+                     "logBase": 1,
+                     "max": null,
+                     "min": null,
+                     "show": false
+                  }
+               ]
             }
-          },
-          "mappings": [],
-          "min": 0,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 7,
-        "w": 6,
-        "x": 18,
-        "y": 104
-      },
-      "id": 7,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "11.2.2",
-      "targets": [
-        {
-          "datasource": {
-            "uid": "${DS_THEMIS}"
-          },
-          "expr": "sum(rate(prometheus_target_interval_length_seconds_count{job=~\"$job\",instance=~\"$instance\"}[5m])) by (interval)",
-          "format": "time_series",
-          "intervalFactor": 2,
-          "legendFormat": "{{interval}}",
-          "refId": "A",
-          "step": 20
-        }
-      ],
-      "title": "Target Scrapes / 5m",
-      "type": "timeseries"
-    }
-  ],
-  "refresh": "30s",
-  "schemaVersion": 39,
-  "tags": [],
-  "templating": {
-    "list": [
-      {
-        "current": {
-          "selected": false,
-          "text": "Prometheus",
-          "value": "PBFA97CFB590B2093"
-        },
-        "hide": 0,
-        "includeAll": false,
-        "label": "datasource",
-        "multi": false,
-        "name": "DS_THEMIS",
-        "options": [],
-        "query": "prometheus",
-        "refresh": 1,
-        "regex": "",
-        "skipUrlSync": false,
-        "type": "datasource"
-      },
-      {
-        "current": {
-          "selected": false,
-          "text": "All",
-          "value": "$__all"
-        },
-        "datasource": {
-          "type": "datasource",
-          "uid": "${DS_THEMIS}"
-        },
-        "definition": "",
-        "hide": 0,
-        "includeAll": true,
-        "multi": true,
-        "name": "job",
-        "options": [],
-        "query": "query_result(prometheus_tsdb_head_samples_appended_total)",
-        "refresh": 2,
-        "regex": "/.*job=\"([^\"]+)/",
-        "skipUrlSync": false,
-        "sort": 1,
-        "tagValuesQuery": "",
-        "tagsQuery": "",
-        "type": "query",
-        "useTags": false
-      },
-      {
-        "current": {
-          "selected": false,
-          "text": "All",
-          "value": "$__all"
-        },
-        "datasource": {
-          "type": "datasource",
-          "uid": "${DS_THEMIS}"
-        },
-        "definition": "",
-        "hide": 0,
-        "includeAll": true,
-        "multi": true,
-        "name": "instance",
-        "options": [],
-        "query": "query_result(up{job=~\"$job\"})",
-        "refresh": 2,
-        "regex": "/.*instance=\"([^\"]+).*/",
-        "skipUrlSync": false,
-        "sort": 0,
-        "tagValuesQuery": "",
-        "tagsQuery": "",
-        "type": "query",
-        "useTags": false
-      },
-      {
-        "current": {
-          "selected": false,
-          "text": "1h",
-          "value": "1h"
-        },
-        "hide": 0,
-        "includeAll": false,
-        "multi": false,
-        "name": "interval",
-        "options": [
-          {
-            "selected": true,
-            "text": "1h",
-            "value": "1h"
-          },
-          {
-            "selected": false,
-            "text": "3h",
-            "value": "3h"
-          },
-          {
-            "selected": false,
-            "text": "6h",
-            "value": "6h"
-          },
-          {
-            "selected": false,
-            "text": "12h",
-            "value": "12h"
-          },
-          {
-            "selected": false,
-            "text": "1d",
-            "value": "1d"
-          },
-          {
-            "selected": false,
-            "text": "2d",
-            "value": "2d"
-          },
-          {
-            "selected": false,
-            "text": "7d",
-            "value": "7d"
-          },
-          {
-            "selected": false,
-            "text": "30d",
-            "value": "30d"
-          },
-          {
-            "selected": false,
-            "text": "90d",
-            "value": "90d"
-          },
-          {
-            "selected": false,
-            "text": "180d",
-            "value": "180d"
-          }
-        ],
-        "query": "1h, 3h, 6h, 12h, 1d, 2d, 7d, 30d, 90d, 180d",
-        "skipUrlSync": false,
-        "type": "custom"
+         ],
+         "repeat": null,
+         "repeatIteration": null,
+         "repeatRowId": null,
+         "showTitle": true,
+         "title": "Query",
+         "titleSize": "h6"
       }
-    ]
-  },
-  "time": {
-    "from": "now-30m",
-    "to": "now"
-  },
-  "timepicker": {
-    "refresh_intervals": [
-      "5s",
-      "10s",
-      "30s",
-      "1m",
-      "5m",
-      "15m",
-      "30m",
-      "1h",
-      "2h",
-      "1d"
-    ],
-    "time_options": [
-      "5m",
-      "15m",
-      "1h",
-      "6h",
-      "12h",
-      "24h",
-      "2d",
-      "7d",
-      "30d"
-    ]
-  },
-  "timezone": "",
-  "title": "Prometheus 2.0 Overview",
-  "uid": "fe0jbvq47lam8e",
-  "version": 1,
-  "weekStart": ""
+   ],
+   "schemaVersion": 14,
+   "style": "dark",
+   "tags": [
+      "prometheus-mixin"
+   ],
+   "templating": {
+      "list": [
+         {
+            "current": {
+               "text": "default",
+               "value": "default"
+            },
+            "hide": 0,
+            "label": "Data source",
+            "name": "datasource",
+            "options": [ ],
+            "query": "prometheus",
+            "refresh": 1,
+            "regex": "",
+            "type": "datasource"
+         },
+         {
+            "allValue": ".+",
+            "current": {
+               "selected": true,
+               "text": "All",
+               "value": "$__all"
+            },
+            "datasource": "$datasource",
+            "hide": 0,
+            "includeAll": true,
+            "label": "cluster",
+            "multi": true,
+            "name": "cluster",
+            "options": [ ],
+            "query": "label_values(prometheus_build_info{job=\"prometheus\"}, cluster)",
+            "refresh": 1,
+            "regex": "",
+            "sort": 2,
+            "tagValuesQuery": "",
+            "tags": [ ],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+         },
+         {
+            "allValue": ".+",
+            "current": {
+               "selected": true,
+               "text": "All",
+               "value": "$__all"
+            },
+            "datasource": "$datasource",
+            "hide": 0,
+            "includeAll": true,
+            "label": "job",
+            "multi": true,
+            "name": "job",
+            "options": [ ],
+            "query": "label_values(prometheus_build_info{cluster=~\"$cluster\"}, job)",
+            "refresh": 1,
+            "regex": "",
+            "sort": 2,
+            "tagValuesQuery": "",
+            "tags": [ ],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+         },
+         {
+            "allValue": ".+",
+            "current": {
+               "selected": true,
+               "text": "All",
+               "value": "$__all"
+            },
+            "datasource": "$datasource",
+            "hide": 0,
+            "includeAll": true,
+            "label": "instance",
+            "multi": true,
+            "name": "instance",
+            "options": [ ],
+            "query": "label_values(prometheus_build_info{cluster=~\"$cluster\", job=~\"$job\"}, instance)",
+            "refresh": 1,
+            "regex": "",
+            "sort": 2,
+            "tagValuesQuery": "",
+            "tags": [ ],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+         }
+      ]
+   },
+   "time": {
+      "from": "now-1h",
+      "to": "now"
+   },
+   "timepicker": {
+      "refresh_intervals": [
+         "5s",
+         "10s",
+         "30s",
+         "1m",
+         "5m",
+         "15m",
+         "30m",
+         "1h",
+         "2h",
+         "1d"
+      ],
+      "time_options": [
+         "5m",
+         "15m",
+         "1h",
+         "6h",
+         "12h",
+         "24h",
+         "2d",
+         "7d",
+         "30d"
+      ]
+   },
+   "timezone": "utc",
+   "title": "Prometheus / Overview",
+   "uid": "",
+   "version": 0
 }
diff --git a/grafana/provisioning.yaml b/grafana/provisioning.yaml
index 76d07f2..76db963 100644
--- a/grafana/provisioning.yaml
+++ b/grafana/provisioning.yaml
@@ -3,18 +3,8 @@
 apiVersion: 1
 
 providers:
-  - name: Ceph
-    folder: Ceph
+  - name: "kolla-operations"
     type: file
     options:
-      path: /var/lib/grafana/dashboards/ceph
-  - name: Infrastructure
-    folder: Infrastructure
-    type: file
-    options:
-      path: /var/lib/grafana/dashboards/infrastructure
-  - name: OpenStack
-    folder: OpenStack
-    type: file
-    options:
-      path: /var/lib/grafana/dashboards/openstack
+      path: /var/lib/grafana/dashboards
+      foldersFromFilesStructure: true
diff --git a/prometheus/alertmanager.rec.rules b/prometheus/alertmanager.rec.rules
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/prometheus/alertmanager.rec.rules
@@ -0,0 +1 @@
+{}
diff --git a/prometheus/alertmanager.rules b/prometheus/alertmanager.rules
new file mode 100644
index 0000000..0d6a13b
--- /dev/null
+++ b/prometheus/alertmanager.rules
@@ -0,0 +1,117 @@
+"groups":
+- "name": "alertmanager.rules"
+  "rules":
+  - "alert": "AlertmanagerFailedReload"
+    "annotations":
+      "description": "Configuration has failed to load for {{$labels.instance}}."
+      "summary": "Reloading an Alertmanager configuration has failed."
+    "expr": |
+      # Without max_over_time, failed scrapes could create false negatives, see
+      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+      max_over_time(alertmanager_config_last_reload_successful{job="alertmanager"}[5m]) == 0
+    "for": "10m"
+    "labels":
+      "severity": "critical"
+  - "alert": "AlertmanagerMembersInconsistent"
+    "annotations":
+      "description": "Alertmanager {{$labels.instance}} has only found {{ $value }} members of the {{$labels.job}} cluster."
+      "summary": "A member of an Alertmanager cluster has not found all other cluster members."
+    "expr": |
+      # Without max_over_time, failed scrapes could create false negatives, see
+      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        max_over_time(alertmanager_cluster_members{job="alertmanager"}[5m])
+      < on (job) group_left
+        count by (job) (max_over_time(alertmanager_cluster_members{job="alertmanager"}[5m]))
+    "for": "15m"
+    "labels":
+      "severity": "critical"
+  - "alert": "AlertmanagerFailedToSendAlerts"
+    "annotations":
+      "description": "Alertmanager {{$labels.instance}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}."
+      "summary": "An Alertmanager instance failed to send notifications."
+    "expr": |
+      (
+        rate(alertmanager_notifications_failed_total{job="alertmanager"}[5m])
+      /
+        ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager"}[5m])
+      )
+      > 0.01
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+  - "alert": "AlertmanagerClusterFailedToSendAlerts"
+    "annotations":
+      "description": "The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}."
+      "summary": "All Alertmanager instances in a cluster failed to send notifications to a critical integration."
+    "expr": |
+      min by (job, integration) (
+        rate(alertmanager_notifications_failed_total{job="alertmanager", integration=~`.*`}[5m])
+      /
+        ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager", integration=~`.*`}[5m])
+      )
+      > 0.01
+    "for": "5m"
+    "labels":
+      "severity": "critical"
+  - "alert": "AlertmanagerClusterFailedToSendAlerts"
+    "annotations":
+      "description": "The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}."
+      "summary": "All Alertmanager instances in a cluster failed to send notifications to a non-critical integration."
+    "expr": |
+      min by (job, integration) (
+        rate(alertmanager_notifications_failed_total{job="alertmanager", integration!~`.*`}[5m])
+      /
+        ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager", integration!~`.*`}[5m])
+      )
+      > 0.01
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+  - "alert": "AlertmanagerConfigInconsistent"
+    "annotations":
+      "description": "Alertmanager instances within the {{$labels.job}} cluster have different configurations."
+      "summary": "Alertmanager instances within the same cluster have different configurations."
+    "expr": |
+      count by (job) (
+        count_values by (job) ("config_hash", alertmanager_config_hash{job="alertmanager"})
+      )
+      != 1
+    "for": "20m"
+    "labels":
+      "severity": "critical"
+  - "alert": "AlertmanagerClusterDown"
+    "annotations":
+      "description": "{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m."
+      "summary": "Half or more of the Alertmanager instances within the same cluster are down."
+    "expr": |
+      (
+        count by (job) (
+          avg_over_time(up{job="alertmanager"}[5m]) < 0.5
+        )
+      /
+        count by (job) (
+          up{job="alertmanager"}
+        )
+      )
+      >= 0.5
+    "for": "5m"
+    "labels":
+      "severity": "critical"
+  - "alert": "AlertmanagerClusterCrashlooping"
+    "annotations":
+      "description": "{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m."
+      "summary": "Half or more of the Alertmanager instances within the same cluster are crashlooping."
+    "expr": |
+      (
+        count by (job) (
+          changes(process_start_time_seconds{job="alertmanager"}[10m]) > 4
+        )
+      /
+        count by (job) (
+          up{job="alertmanager"}
+        )
+      )
+      >= 0.5
+    "for": "5m"
+    "labels":
+      "severity": "critical"
diff --git a/prometheus/ceph.rec.rules b/prometheus/ceph.rec.rules
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/prometheus/ceph.rec.rules
@@ -0,0 +1 @@
+{}
diff --git a/prometheus/ceph.rules b/prometheus/ceph.rules
index e70ba72..bf6f257 100644
--- a/prometheus/ceph.rules
+++ b/prometheus/ceph.rules
@@ -1,359 +1,885 @@
-# Official set of upstream alerts https://github.com/ceph/ceph/blob/octopus/monitoring/prometheus/alerts/ceph_default_alerts.yml
-
-groups:
-- name: CEPH cluster health
-  rules:
-
-    - alert: health error
-      expr: ceph_health_status == 2
-      for: 5m
-      labels:
-        severity: critical
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.2.1
-      annotations:
-        description: >
-          Ceph in HEALTH_ERROR state for more than 5 minutes.
-          Please check "ceph health detail" for more information.
-
-    - alert: health warn
-      expr: ceph_health_status == 1
-      for: 15m
-      labels:
-        severity: warning
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.2.2
-      annotations:
-        description: >
-          Ceph has been in HEALTH_WARN for more than 15 minutes.
-          Please check "ceph health detail" for more information.
-
-- name: mon
-  rules:
-    - alert: low monitor quorum count
-      expr: sum(ceph_mon_quorum_status) < 3
-      labels:
-        severity: critical
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.3.1
-      annotations:
-        description: |
-          Monitor count in quorum is below three.
-
-          Only {{ $value }} of {{ with query "count(ceph_mon_quorum_status)" }}{{ . | first | value }}{{ end }} monitors are active.
-
-          The following monitors are down:
-          {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
-            - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
-          {{- end }}
-
-- name: osd
-  rules:
-    - alert: 10% OSDs down
-      expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
-      labels:
-        severity: critical
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.4.1
-      annotations:
-        description: |
-          {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (≥ 10%).
-
-          The following OSDs are down:
-          {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
-            - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
-          {{- end }}
-
-    - alert: OSD down
-      expr: count(ceph_osd_up == 0) > 0
-      for: 15m
-      labels:
-        severity: warning
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.4.2
-      annotations:
-        description: |
-          {{ $s := "" }}{{ if gt $value 1.0 }}{{ $s = "s" }}{{ end }}
-          {{ $value }} OSD{{ $s }} down for more than 15 minutes.
-
-          {{ $value }} of {{ query "count(ceph_osd_up)" | first | value }} OSDs are down.
-
-          The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down:
-            {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}}
-            - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
-            {{- end }}
-
-    - alert: OSDs near full
-      expr: |
-        (
-        ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) ceph_osd_up == 1)
-        * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
-        ) * 100 > 90
-      for: 5m
-      labels:
-        severity: critical
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.4.3
-      annotations:
-        description: >
-          OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} is
-          dangerously full: {{ $value | humanize }}%
-
-    - alert: flapping OSD
-      expr: |
-        (
-        rate(ceph_osd_up[5m])
-        * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
-        ) * 60 > 1
-      labels:
-        severity: warning
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.4.4
-      annotations:
-        description: >
-          OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
-          marked down and back up at {{ $value | humanize }} times once a
-          minute for 5 minutes.
-
-    - alert: Ceph OSD Reweighted needed
-      expr: ceph_osd_weight < 1
-      for: 1h
-      labels:
-        severity: warning
-      annotations:
-        description: >
-          Ceph OSD reweighted (OSD: {{ $labels.ceph_daemon }})
-
-
-#    # alert on high deviation from average PG count
-#    - alert: high pg count deviation
-#      expr: |
-#        abs(
-#        (
-#          (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)
-#        ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
-#        ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
-#      for: 5m
-#      labels:
-#        severity: warning
-#        type: ceph_default
-#        oid: 1.3.6.1.4.1.50495.15.1.2.4.5
-#      annotations:
-#        description: >
-#          OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
-#          by more than 30% from average PG count.
-
-- name: pgs
-  rules:
-    - alert: pgs inactive
-      expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
-      for: 5m
-      labels:
-        severity: critical
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.7.1
-      annotations:
-        description: >
-          {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
-          Inactive placement groups aren't able to serve read/write
-          requests.
-
-    - alert: pgs unclean
-      expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
-      for: 15m
-      labels:
-        severity: warning
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.7.2
-      annotations:
-        description: >
-          {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
-          Unclean PGs haven't been able to completely recover from a
-          previous failure.
-
-    - alert: pgs down
-      expr: ceph_pg_down > 0
-      for: 1m
-      labels:
-        severity: warning
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.7.3
-      annotations:
-        description: >
-          Ceph PG down (Pool ID {{ $labels.pool_id }})
-
-    - alert: pgs incomplete
-      expr: ceph_pg_incomplete > 0
-      for: 1m
-      labels:
-          severity: warning
-          type: ceph_default
-          oid: 1.3.6.1.4.1.50495.15.1.2.7.4
-      annotations:
-        description: >
-          PG incomplete (Pool ID {{ $labels.pool_id }})
-
-    - alert: pg inconsistant
-      expr: ceph_pg_inconsistent > 0
-      for: 1m
-      labels:
-        severity: warning
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.7.5
-      annotations:
-        description: >
-          pg inconsistant (Pool ID {{ $labels.pool_id }})
-
-    - alert: pg ActivationLong
-      expr: ceph_pg_activating > 0
-      for: 2m
-      labels:
-        severity: warning
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.7.6
-      annotations:
-        description: >
-          pg activation too long (Pool ID {{ $labels.pool_id }})
-
-    - alert: pg backfill full
-      expr: ceph_pg_backfill_toofull > 0
-      for: 2m
-      labels:
-        severity: warning
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.7.7
-      annotations:
-        description: >
-          pg backfill full (Pool ID {{ $labels.pool_id }})
-
-    - alert: CephPgUnavailable
-      expr: ceph_pg_total - ceph_pg_active > 0
-      for: 1m
-      labels:
-        severity: warning
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.7.8
-      annotations:
-        description: pg unavailable (Pool ID {{ $labels.pool_id }})
-
-- name: nodes
-  rules:
-    - alert: root volume full
-      expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
-      for: 5m
-      labels:
-        severity: critical
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.8.1
-      annotations:
-        description: "Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free."
-
-    # alert on nic packet errors and drops rates > 100 packet/s
-    #- alert: network packets dropped
-    #  expr: irate(node_network_receive_drop_total{device!~"lo|breth2-ovs"}[5m]) + irate(node_network_transmit_drop_total{device!~"lo|breth2-ovs"}[5m]) > 100
-    #  labels:
-    #    severity: warning
-    #    type: ceph_default
-    #    oid: 1.3.6.1.4.1.50495.15.1.2.8.2
-    #  annotations:
-    #    description: >
-    #      Node {{ $labels.instance }} experiences packet drop > 100
-    #      packet/s on interface {{ $labels.device }}.
-
-    - alert: network packet errors
-      expr: |
-        irate(node_network_receive_errs_total{device!="lo"}[5m]) +
-        irate(node_network_transmit_errs_total{device!="lo"}[5m]) > 1
-      labels:
-        severity: warning
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.8.3
-      annotations:
-        description: >
-          Node {{ $labels.instance }} experiences packet errors > 1
-          packet/s on interface {{ $labels.device }}.
-
-    - alert: storage filling up
-      expr: |
-        predict_linear(node_filesystem_free_bytes[2d], 3600 * 24 * 5) *
-        on(instance) group_left(nodename) node_uname_info < 0
-      labels:
-        severity: warning
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.8.4
-      annotations:
-        description: >
-          Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
-          will be full in less than 5 days assuming the average fill-up
-          rate of the past 48 hours.
-
-- name: pools
-  rules:
-    - alert: pool full
-      expr: |
-        ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
-        * on(pool_id) group_right ceph_pool_metadata * 100 > 90
-      labels:
-        severity: critical
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.9.1
-      annotations:
-        description: "Pool {{ $labels.name }} at {{ $value | humanize }}% capacity."
-
-    - alert: pool filling up
-      expr: |
-        (
-        predict_linear(ceph_pool_stored[2d], 3600 * 24 * 5)
-        >= ceph_pool_stored + ceph_pool_max_avail
-        ) * on(pool_id) group_left(name) ceph_pool_metadata
-      labels:
-        severity: warning
-        type: ceph_default
-        oid: 1.3.6.1.4.1.50495.15.1.2.9.2
-      annotations:
-        description: >
-          Pool {{ $labels.name }} will be full in less than 5 days
-          assuming the average fill-up rate of the past 48 hours.
-
-- name: healthchecks
-  rules:
-    - alert: Slow OSD Ops
-      expr: ceph_healthcheck_slow_ops > 0
-      for: 30s
-      labels:
-        severity: warning
-        type: ceph_default
-      annotations:
-        description: "{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
-
-- name: ceph exporter
-  rules:
-    - alert: CephMgrExporterDown
-      expr: up{job="ceph_mgr_exporter"} == 0
-      for: 1m
-      labels:
-        severity: critical
-      annotations:
-        description: >
-          the Ceph-Manager-Exporter is down.
-          message: CEPH target down for more than 1m, please check
-
-- name: Ceph Latency
-  rules:
-    - alert: CephHighLatency
-      expr: irate(ceph_osd_op_r_latency_sum[5m]) >= 15
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        description: "OSD read latencies {{ $labels.ceph_daemon }} > 15ms. It must be checked which host the OSD is on."
-        message: "Ceph OSD high read latency ({{ $labels.ceph_daemon }})"
-
-    - alert: CephHighLatency_crit
-      expr: irate(ceph_osd_op_r_latency_sum[5m]) >= 25
-      for: 2m
-      labels:
-        severity: critical
-      annotations:
-        description: "OSD read latencies {{ $labels.ceph_daemon }} > 25ms. It must be checked which host the OSD is on."
-        message: "Ceph OSD high read latency ({{ $labels.ceph_daemon }})"
+"groups":
+- "name": "cluster health"
+  "rules":
+  - "alert": "CephHealthError"
+    "annotations":
+      "description": "The cluster state has been HEALTH_ERROR for more than 5 minutes on cluster {{ $labels.cluster }}. Please check 'ceph health detail' for more information."
+      "summary": "Ceph is in the ERROR state on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_status == 2"
+    "for": "5m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.2.1"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephHealthWarning"
+    "annotations":
+      "description": "The cluster state has been HEALTH_WARN for more than 15 minutes on cluster {{ $labels.cluster }}. Please check 'ceph health detail' for more information."
+      "summary": "Ceph is in the WARNING state on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_status == 1"
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+- "name": "mon"
+  "rules":
+  - "alert": "CephMonDownQuorumAtRisk"
+    "annotations":
+      "description": "{{ $min := printf \"floor(count(ceph_mon_metadata{cluster='%s'}) / 2) + 1\" .Labels.cluster | query | first | value }}Quorum requires a majority of monitors (at least {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range printf \"(ceph_mon_quorum_status{cluster='%s'} == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down"
+      "summary": "Monitor quorum is at risk on cluster {{ $labels.cluster }}"
+    "expr": |
+      (
+        (ceph_health_detail{name="MON_DOWN"} == 1) * on() group_right(cluster) (
+          count(ceph_mon_quorum_status == 1) by(cluster)== bool (floor(count(ceph_mon_metadata) by(cluster) / 2) + 1)
+        )
+      ) == 1
+    "for": "30s"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.3.1"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephMonDown"
+    "annotations":
+      "description": "{{ $down := printf \"count(ceph_mon_quorum_status{cluster='%s'} == 0)\" .Labels.cluster | query | first | value }}{{ $s := \"\" }}{{ if gt $down 1.0 }}{{ $s = \"s\" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{- range printf \"(ceph_mon_quorum_status{cluster='%s'} == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down"
+      "summary": "One or more monitors down on cluster {{ $labels.cluster }}"
+    "expr": |
+      (count by (cluster) (ceph_mon_quorum_status == 0)) <= (count by (cluster) (ceph_mon_metadata) - floor((count by (cluster) (ceph_mon_metadata) / 2 + 1)))
+    "for": "30s"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephMonDiskspaceCritical"
+    "annotations":
+      "description": "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are: {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}"
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit"
+      "summary": "Filesystem space on at least one monitor is critically low on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"MON_DISK_CRIT\"} == 1"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.3.2"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephMonDiskspaceLow"
+    "annotations":
+      "description": "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are: {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}"
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low"
+      "summary": "Drive space on at least one monitor is approaching full on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"MON_DISK_LOW\"} == 1"
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephMonClockSkew"
+    "annotations":
+      "description": "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew"
+      "summary": "Clock skew detected among monitors on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"MON_CLOCK_SKEW\"} == 1"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+- "name": "osd"
+  "rules":
+  - "alert": "CephOSDDownHigh"
+    "annotations":
+      "description": "{{ $value | humanize }}% or {{ with printf \"count (ceph_osd_up{cluster='%s'} == 0)\" .Labels.cluster | query }}{{ . | first | value }}{{ end }} of {{ with printf \"count (ceph_osd_up{cluster='%s'})\" .Labels.cluster | query }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are down: {{- range printf \"(ceph_osd_up{cluster='%s'} * on(cluster, ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
+      "summary": "More than 10% of OSDs are down on cluster {{ $labels.cluster }}"
+    "expr": "count by (cluster) (ceph_osd_up == 0) / count by (cluster) (ceph_osd_up) * 100 >= 10"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.4.1"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephOSDHostDown"
+    "annotations":
+      "description": "The following OSDs are down: {{- range printf \"(ceph_osd_up{cluster='%s'} * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" .Labels.cluster | query }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}"
+      "summary": "An OSD host is offline on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"OSD_HOST_DOWN\"} == 1"
+    "for": "5m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.4.8"
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephOSDDown"
+    "annotations":
+      "description": "{{ $num := printf \"count(ceph_osd_up{cluster='%s'} == 0) \" .Labels.cluster | query | first | value }}{{ $s := \"\" }}{{ if gt $num 1.0 }}{{ $s = \"s\" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5 minutes. The following OSD{{ $s }} {{ if eq $s \"\" }}is{{ else }}are{{ end }} down: {{- range printf \"(ceph_osd_up{cluster='%s'} * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down"
+      "summary": "An OSD has been marked down on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"OSD_DOWN\"} == 1"
+    "for": "5m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.4.2"
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephOSDNearFull"
+    "annotations":
+      "description": "One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull"
+      "summary": "OSD(s) running low on free space (NEARFULL) on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"OSD_NEARFULL\"} == 1"
+    "for": "5m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.4.3"
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephOSDFull"
+    "annotations":
+      "description": "An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full"
+      "summary": "OSD full, writes blocked on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"OSD_FULL\"} > 0"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.4.6"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephOSDBackfillFull"
+    "annotations":
+      "description": "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull"
+      "summary": "OSD(s) too full for backfill operations on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"OSD_BACKFILLFULL\"} > 0"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephOSDTooManyRepairs"
+    "annotations":
+      "description": "Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs"
+      "summary": "OSD reports a high number of read errors on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"OSD_TOO_MANY_REPAIRS\"} == 1"
+    "for": "30s"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephOSDTimeoutsPublicNetwork"
+    "annotations":
+      "description": "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs."
+      "summary": "Network issues delaying OSD heartbeats (public network) on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_FRONT\"} == 1"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephOSDTimeoutsClusterNetwork"
+    "annotations":
+      "description": "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs."
+      "summary": "Network issues delaying OSD heartbeats (cluster network) on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_BACK\"} == 1"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephOSDInternalDiskSizeMismatch"
+    "annotations":
+      "description": "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch"
+      "summary": "OSD size inconsistency error on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"BLUESTORE_DISK_SIZE_MISMATCH\"} == 1"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephDeviceFailurePredicted"
+    "annotations":
+      "description": "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#id2"
+      "summary": "Device(s) predicted to fail soon on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"DEVICE_HEALTH\"} == 1"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephDeviceFailurePredictionTooHigh"
+    "annotations":
+      "description": "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany"
+      "summary": "Too many devices are predicted to fail on cluster {{ $labels.cluster }}, unable to resolve"
+    "expr": "ceph_health_detail{name=\"DEVICE_HEALTH_TOOMANY\"} == 1"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.4.7"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephDeviceFailureRelocationIncomplete"
+    "annotations":
+      "description": "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use"
+      "summary": "Device failure is predicted, but unable to relocate data on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"DEVICE_HEALTH_IN_USE\"} == 1"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephOSDFlapping"
+    "annotations":
+      "description": "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked down and back up {{ $value | humanize }} times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)."
+      "documentation": "https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds"
+      "summary": "Network issues are causing OSDs to flap (mark each other down) on cluster {{ $labels.cluster }}"
+    "expr": "(rate(ceph_osd_up[5m]) * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) * 60 > 1"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.4.4"
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephOSDReadErrors"
+    "annotations":
+      "description": "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors"
+      "summary": "Device read errors detected on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"BLUESTORE_SPURIOUS_READ_ERRORS\"} == 1"
+    "for": "30s"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephPGImbalance"
+    "annotations":
+      "description": "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count."
+      "summary": "PGs are not balanced across OSDs on cluster {{ $labels.cluster }}"
+    "expr": |
+      abs(
+        ((ceph_osd_numpg > 0) - on (cluster,job) group_left avg(ceph_osd_numpg > 0) by (cluster,job)) /
+        on (job) group_left avg(ceph_osd_numpg > 0) by (job)
+      ) * on (cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
+    "for": "5m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.4.5"
+      "severity": "warning"
+      "type": "ceph_default"
+- "name": "mds"
+  "rules":
+  - "alert": "CephFilesystemDamaged"
+    "annotations":
+      "description": "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support."
+      "documentation": "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages"
+      "summary": "CephFS filesystem is damaged on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"MDS_DAMAGE\"} > 0"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.5.1"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephFilesystemOffline"
+    "annotations":
+      "description": "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline."
+      "documentation": "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down"
+      "summary": "CephFS filesystem is offline on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"MDS_ALL_DOWN\"} > 0"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.5.3"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephFilesystemDegraded"
+    "annotations":
+      "description": "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable."
+      "documentation": "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded"
+      "summary": "CephFS filesystem is degraded on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"FS_DEGRADED\"} > 0"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.5.4"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephFilesystemMDSRanksLow"
+    "annotations":
+      "description": "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value."
+      "documentation": "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max"
+      "summary": "Ceph MDS daemon count is lower than configured on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"MDS_UP_LESS_THAN_MAX\"} > 0"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephFilesystemInsufficientStandby"
+    "annotations":
+      "description": "The current number of standby daemons is less than the minimum number required by standby_count_wanted. Adjust the standby count or increase the number of MDS daemons."
+      "documentation": "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby"
+      "summary": "Ceph filesystem standby daemons too few on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"MDS_INSUFFICIENT_STANDBY\"} > 0"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephFilesystemFailureNoStandby"
+    "annotations":
+      "description": "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS."
+      "documentation": "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds"
+      "summary": "MDS daemon failed, no further standby available on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"FS_WITH_FAILED_MDS\"} > 0"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.5.5"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephFilesystemReadOnly"
+    "annotations":
+      "description": "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support."
+      "documentation": "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages"
+      "summary": "CephFS filesystem in read only mode due to write error(s) on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"MDS_HEALTH_READ_ONLY\"} > 0"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.5.2"
+      "severity": "critical"
+      "type": "ceph_default"
+- "name": "mgr"
+  "rules":
+  - "alert": "CephMgrModuleCrash"
+    "annotations":
+      "description": "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash"
+      "summary": "A manager module has recently crashed on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"RECENT_MGR_MODULE_CRASH\"} == 1"
+    "for": "5m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.6.1"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephMgrPrometheusModuleInactive"
+    "annotations":
+      "description": "The mgr/prometheus module at {{ $labels.instance }} is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module, metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'."
+      "summary": "The mgr/prometheus module is not available"
+    "expr": "up{job=\"ceph\"} == 0"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.6.2"
+      "severity": "critical"
+      "type": "ceph_default"
+- "name": "pgs"
+  "rules":
+  - "alert": "CephPGsInactive"
+    "annotations":
+      "description": "{{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}. Inactive placement groups are not able to serve read/write requests."
+      "summary": "One or more placement groups are inactive on cluster {{ $labels.cluster }}"
+    "expr": "ceph_pool_metadata * on(cluster,pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0"
+    "for": "5m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.7.1"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephPGsUnclean"
+    "annotations":
+      "description": "{{ $value }} PGs have been unclean for more than 15 minutes in pool {{ $labels.name }}. Unclean PGs have not recovered from a previous failure."
+      "summary": "One or more placement groups are marked unclean on cluster {{ $labels.cluster }}"
+    "expr": "ceph_pool_metadata * on(cluster,pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0"
+    "for": "15m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.7.2"
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephPGsDamaged"
+    "annotations":
+      "description": "During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged"
+      "summary": "Placement group damaged, manual intervention needed on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=~\"PG_DAMAGED|OSD_SCRUB_ERRORS\"} == 1"
+    "for": "5m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.7.4"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephPGRecoveryAtRisk"
+    "annotations":
+      "description": "Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full"
+      "summary": "OSDs are too full for recovery on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"PG_RECOVERY_FULL\"} == 1"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.7.5"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephPGUnavailableBlockingIO"
+    "annotations":
+      "description": "Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability"
+      "summary": "PG is unavailable on cluster {{ $labels.cluster }}, blocking I/O"
+    "expr": "((ceph_health_detail{name=\"PG_AVAILABILITY\"} == 1) - scalar(ceph_health_detail{name=\"OSD_DOWN\"})) == 1"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.7.3"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephPGBackfillAtRisk"
+    "annotations":
+      "description": "Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full"
+      "summary": "Backfill operations are blocked due to lack of free space on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"PG_BACKFILL_FULL\"} == 1"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.7.6"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephPGNotScrubbed"
+    "annotations":
+      "description": "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>"
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed"
+      "summary": "Placement group(s) have not been scrubbed on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"PG_NOT_SCRUBBED\"} == 1"
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephPGsHighPerOSD"
+    "annotations":
+      "description": "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs"
+      "summary": "Placement groups per OSD is too high on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"TOO_MANY_PGS\"} == 1"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephPGNotDeepScrubbed"
+    "annotations":
+      "description": "One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed"
+      "summary": "Placement group(s) have not been deep scrubbed on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"PG_NOT_DEEP_SCRUBBED\"} == 1"
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+- "name": "nodes"
+  "rules":
+  - "alert": "CephNodeRootFilesystemFull"
+    "annotations":
+      "description": "Root volume is dangerously full: {{ $value | humanize }}% free."
+      "summary": "Root filesystem is dangerously full"
+    "expr": "node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"} * 100 < 5"
+    "for": "5m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.8.1"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephNodeNetworkPacketDrops"
+    "annotations":
+      "description": "Node {{ $labels.instance }} experiences packet drop > 0.5% and > 10 packets/s on interface {{ $labels.device }}."
+      "summary": "One or more NICs reports packet drops"
+    "expr": |
+      (
+        rate(node_network_receive_drop_total{device!="lo"}[1m]) +
+        rate(node_network_transmit_drop_total{device!="lo"}[1m])
+      ) / (
+        rate(node_network_receive_packets_total{device!="lo"}[1m]) +
+        rate(node_network_transmit_packets_total{device!="lo"}[1m])
+      ) >= 0.0050000000000000001 and (
+        rate(node_network_receive_drop_total{device!="lo"}[1m]) +
+        rate(node_network_transmit_drop_total{device!="lo"}[1m])
+      ) >= 10
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.8.2"
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephNodeNetworkPacketErrors"
+    "annotations":
+      "description": "Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}."
+      "summary": "One or more NICs reports packet errors on cluster {{ $labels.cluster }}"
+    "expr": |
+      (
+        rate(node_network_receive_errs_total{device!="lo"}[1m]) +
+        rate(node_network_transmit_errs_total{device!="lo"}[1m])
+      ) / (
+        rate(node_network_receive_packets_total{device!="lo"}[1m]) +
+        rate(node_network_transmit_packets_total{device!="lo"}[1m])
+      ) >= 0.0001 or (
+        rate(node_network_receive_errs_total{device!="lo"}[1m]) +
+        rate(node_network_transmit_errs_total{device!="lo"}[1m])
+      ) >= 10
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.8.3"
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephNodeNetworkBondDegraded"
+    "annotations":
+      "description": "Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}."
+      "summary": "Degraded Bond on Node {{ $labels.instance }} on cluster {{ $labels.cluster }}"
+    "expr": |
+      node_bonding_slaves - node_bonding_active != 0
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephNodeDiskspaceWarning"
+    "annotations":
+      "description": "Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate."
+      "summary": "Host filesystem free space is getting low on cluster {{ $labels.cluster }}"
+    "expr": "predict_linear(node_filesystem_free_bytes{device=~\"/.*\"}[2d], 3600 * 24 * 5) * on(cluster, instance) group_left(nodename) node_uname_info < 0"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.8.4"
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephNodeInconsistentMTU"
+    "annotations":
+      "description": "Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}."
+      "summary": "MTU settings across Ceph hosts are inconsistent on cluster {{ $labels.cluster }}"
+    "expr": "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) ==  scalar(    max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) !=      quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0))  )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) ==  scalar(    min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) !=      quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0))  )"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+- "name": "pools"
+  "rules":
+  - "alert": "CephPoolGrowthWarning"
+    "annotations":
+      "description": "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours."
+      "summary": "Pool growth rate may soon exceed capacity on cluster {{ $labels.cluster }}"
+    "expr": "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(cluster,pool_id, instance) group_right() ceph_pool_metadata) >= 95"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.9.2"
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephPoolBackfillFull"
+    "annotations":
+      "description": "A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity."
+      "summary": "Free space in a pool is too low for recovery/backfill on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"POOL_BACKFILLFULL\"} > 0"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephPoolFull"
+    "annotations":
+      "description": "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range printf \"topk(5, sort_desc(ceph_pool_percent_used{cluster='%s'} * on(cluster,pool_id) group_right ceph_pool_metadata))\" .Labels.cluster | query }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)"
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full"
+      "summary": "Pool is full - writes are blocked on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"POOL_FULL\"} > 0"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.9.1"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephPoolNearFull"
+    "annotations":
+      "description": "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active."
+      "summary": "One or more Ceph pools are nearly full on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"POOL_NEAR_FULL\"} > 0"
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+- "name": "healthchecks"
+  "rules":
+  - "alert": "CephSlowOps"
+    "annotations":
+      "description": "{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
+      "summary": "OSD operations are slow to complete on cluster {{ $labels.cluster }}"
+    "expr": "ceph_healthcheck_slow_ops > 0"
+    "for": "30s"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "CephDaemonSlowOps"
+    "annotations":
+      "description": "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)"
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
+      "summary": "{{ $labels.ceph_daemon }} operations are slow to complete on cluster {{ $labels.cluster }}"
+    "expr": "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
+    "for": "30s"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+- "name": "cephadm"
+  "rules":
+  - "alert": "CephadmUpgradeFailed"
+    "annotations":
+      "description": "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs to understand the nature of the issue."
+      "summary": "Ceph version upgrade has failed on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"UPGRADE_EXCEPTION\"} > 0"
+    "for": "30s"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.11.2"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephadmDaemonFailed"
+    "annotations":
+      "description": "A daemon managed by cephadm is no longer active. Determine which daemon is down with 'ceph health detail'. You may start daemons with 'ceph orch daemon start <daemon_id>'."
+      "summary": "A ceph daemon managed by cephadm is down on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"CEPHADM_FAILED_DAEMON\"} > 0"
+    "for": "30s"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.11.1"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephadmPaused"
+    "annotations":
+      "description": "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'"
+      "documentation": "https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused"
+      "summary": "Orchestration tasks via cephadm are PAUSED on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"CEPHADM_PAUSED\"} > 0"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+- "name": "hardware"
+  "rules":
+  - "alert": "HardwareStorageError"
+    "annotations":
+      "description": "Some storage devices are in error. Check `ceph health detail`."
+      "summary": "Storage devices error(s) detected on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"HARDWARE_STORAGE\"} > 0"
+    "for": "30s"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.13.1"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "HardwareMemoryError"
+    "annotations":
+      "description": "DIMM error(s) detected. Check `ceph health detail`."
+      "summary": "DIMM error(s) detected on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"HARDWARE_MEMORY\"} > 0"
+    "for": "30s"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.13.2"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "HardwareProcessorError"
+    "annotations":
+      "description": "Processor error(s) detected. Check `ceph health detail`."
+      "summary": "Processor error(s) detected on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"HARDWARE_PROCESSOR\"} > 0"
+    "for": "30s"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.13.3"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "HardwareNetworkError"
+    "annotations":
+      "description": "Network error(s) detected. Check `ceph health detail`."
+      "summary": "Network error(s) detected on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"HARDWARE_NETWORK\"} > 0"
+    "for": "30s"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.13.4"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "HardwarePowerError"
+    "annotations":
+      "description": "Power supply error(s) detected. Check `ceph health detail`."
+      "summary": "Power supply error(s) detected on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"HARDWARE_POWER\"} > 0"
+    "for": "30s"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.13.5"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "HardwareFanError"
+    "annotations":
+      "description": "Fan error(s) detected. Check `ceph health detail`."
+      "summary": "Fan error(s) detected on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"HARDWARE_FANS\"} > 0"
+    "for": "30s"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.13.6"
+      "severity": "critical"
+      "type": "ceph_default"
+- "name": "PrometheusServer"
+  "rules":
+  - "alert": "PrometheusJobMissing"
+    "annotations":
+      "description": "The prometheus job that scrapes from Ceph is no longer defined; this effectively means you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance."
+      "summary": "The scrape job for Ceph is missing from Prometheus"
+    "expr": "absent(up{job=\"ceph\"})"
+    "for": "30s"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.12.1"
+      "severity": "critical"
+      "type": "ceph_default"
+- "name": "rados"
+  "rules":
+  - "alert": "CephObjectMissing"
+    "annotations":
+      "description": "The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound"
+      "summary": "Object(s) marked UNFOUND on cluster {{ $labels.cluster }}"
+    "expr": "(ceph_health_detail{name=\"OBJECT_UNFOUND\"} == 1) * on() group_right(cluster) (count(ceph_osd_up == 1) by (cluster) == bool count(ceph_osd_metadata) by(cluster)) == 1"
+    "for": "30s"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.10.1"
+      "severity": "critical"
+      "type": "ceph_default"
+- "name": "generic"
+  "rules":
+  - "alert": "CephDaemonCrash"
+    "annotations":
+      "description": "One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command."
+      "documentation": "https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash"
+      "summary": "One or more Ceph daemons have crashed, and are pending acknowledgement on cluster {{ $labels.cluster }}"
+    "expr": "ceph_health_detail{name=\"RECENT_CRASH\"} == 1"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.1.2"
+      "severity": "critical"
+      "type": "ceph_default"
+- "name": "rbdmirror"
+  "rules":
+  - "alert": "CephRBDMirrorImagesPerDaemonHigh"
+    "annotations":
+      "description": "Number of image replications per daemon is not supposed to go beyond threshold 100"
+      "summary": "Number of image replications is now above 100 on cluster {{ $labels.cluster }}"
+    "expr": "sum by (cluster, ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.10.2"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephRBDMirrorImagesNotInSync"
+    "annotations":
+      "description": "Both local and remote RBD mirror images should be in sync."
+      "summary": "Some of the RBD mirror images are not in sync with the remote counterparts on cluster {{ $labels.cluster }}"
+    "expr": "sum by (cluster, ceph_daemon, image, namespace, pool) (topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.10.3"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephRBDMirrorImagesNotInSyncVeryHigh"
+    "annotations":
+      "description": "More than 10% of the images have synchronization problems."
+      "summary": "Number of unsynchronized images is very high on cluster {{ $labels.cluster }}"
+    "expr": "count by (ceph_daemon, cluster) ((topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon, cluster) (ceph_rbd_mirror_snapshot_snapshots)*.1)"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.10.4"
+      "severity": "critical"
+      "type": "ceph_default"
+  - "alert": "CephRBDMirrorImageTransferBandwidthHigh"
+    "annotations":
+      "description": "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously"
+      "summary": "The replication network usage on cluster {{ $labels.cluster }} has increased by over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleared automatically after 30 minutes"
+    "expr": "rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > 0.80"
+    "for": "1m"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.10.5"
+      "severity": "warning"
+      "type": "ceph_default"
+- "name": "nvmeof"
+  "rules":
+  - "alert": "NVMeoFSubsystemNamespaceLimit"
+    "annotations":
+      "description": "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to {{ $labels.nqn }}"
+      "summary": "{{ $labels.nqn }} subsystem has reached its maximum number of namespaces on cluster {{ $labels.cluster }}"
+    "expr": "(count by(nqn, cluster) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "NVMeoFTooManyGateways"
+    "annotations":
+      "description": "You may create many gateways, but 4 is the tested limit"
+      "summary": "Max supported gateways exceeded on cluster {{ $labels.cluster }}"
+    "expr": "count(ceph_nvmeof_gateway_info) by (cluster) > 4.00"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "NVMeoFMaxGatewayGroupSize"
+    "annotations":
+      "description": "You may create many gateways in a gateway group, but 4 is the tested limit"
+      "summary": "Max gateways within a gateway group ({{ $labels.group }}) exceeded on cluster {{ $labels.cluster }}"
+    "expr": "count(ceph_nvmeof_gateway_info) by (cluster,group) > 4.00"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "NVMeoFSingleGatewayGroup"
+    "annotations":
+      "description": "Although a single member gateway group is valid, it should only be used for test purposes"
+      "summary": "The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible on cluster {{ $labels.cluster }}"
+    "expr": "count(ceph_nvmeof_gateway_info) by(cluster,group) == 1"
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "NVMeoFHighGatewayCPU"
+    "annotations":
+      "description": "Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores"
+      "summary": "CPU used by {{ $labels.instance }} NVMe-oF Gateway is high on cluster {{ $labels.cluster }}"
+    "expr": "label_replace(avg by(instance, cluster) (rate(ceph_nvmeof_reactor_seconds_total{mode=\"busy\"}[1m])),\"instance\",\"$1\",\"instance\",\"(.*):.*\") > 80.00"
+    "for": "10m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "NVMeoFGatewayOpenSecurity"
+    "annotations":
+      "description": "It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss"
+      "summary": "Subsystem {{ $labels.nqn }} has been defined without host level security on cluster {{ $labels.cluster }}"
+    "expr": "ceph_nvmeof_subsystem_metadata{allow_any_host=\"yes\"}"
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "NVMeoFTooManySubsystems"
+    "annotations":
+      "description": "Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported"
+      "summary": "The number of subsystems defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}"
+    "expr": "count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*):.*\")) > 16.00"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "NVMeoFVersionMismatch"
+    "annotations":
+      "description": "This may indicate an issue with deployment. Check cephadm logs"
+      "summary": "Too many different NVMe-oF gateway releases active on cluster {{ $labels.cluster }}"
+    "expr": "count(count(ceph_nvmeof_gateway_info) by (cluster, version)) by (cluster) > 1"
+    "for": "1h"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "NVMeoFHighClientCount"
+    "annotations":
+      "description": "The supported limit for clients connecting to a subsystem is 32"
+      "summary": "The number of clients connected to {{ $labels.nqn }} is too high on cluster {{ $labels.cluster }}"
+    "expr": "ceph_nvmeof_subsystem_host_count > 32.00"
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "NVMeoFHighHostCPU"
+    "annotations":
+      "description": "High CPU on a gateway host can lead to CPU contention and performance degradation"
+      "summary": "The CPU is high ({{ $value }}%) on NVMeoF Gateway host ({{ $labels.host }}) on cluster {{ $labels.cluster }}"
+    "expr": "100-((100*(avg by(cluster,host) (label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]),\"host\",\"$1\",\"instance\",\"(.*):.*\")) * on(cluster, host) group_right label_replace(ceph_nvmeof_gateway_info,\"host\",\"$1\",\"instance\",\"(.*):.*\")))) >= 80.00"
+    "for": "10m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "NVMeoFInterfaceDown"
+    "annotations":
+      "description": "A NIC used by one or more subsystems is in a down state"
+      "summary": "Network interface {{ $labels.device }} is down on cluster {{ $labels.cluster }}"
+    "expr": "ceph_nvmeof_subsystem_listener_iface_info{operstate=\"down\"}"
+    "for": "30s"
+    "labels":
+      "oid": "1.3.6.1.4.1.50495.1.2.1.14.1"
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "NVMeoFInterfaceDuplex"
+    "annotations":
+      "description": "Until this is resolved, performance from the gateway will be degraded"
+      "summary": "Network interface {{ $labels.device }} is not running in full duplex mode on cluster {{ $labels.cluster }}"
+    "expr": "ceph_nvmeof_subsystem_listener_iface_info{duplex!=\"full\"}"
+    "for": "30s"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "NVMeoFHighReadLatency"
+    "annotations":
+      "description": "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
+      "summary": "The average read latency over the last 5 mins has reached 10 ms or more on {{ $labels.gateway }}"
+    "expr": "label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[1m]) / rate(ceph_nvmeof_bdev_reads_completed_total[1m])))),\"gateway\",\"$1\",\"instance\",\"(.*):.*\") > 0.01"
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
+  - "alert": "NVMeoFHighWriteLatency"
+    "annotations":
+      "description": "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
+      "summary": "The average write latency over the last 5 mins has reached 20 ms or more on {{ $labels.gateway }}"
+    "expr": "label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m]) / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),\"gateway\",\"$1\",\"instance\",\"(.*):.*\") > 0.02"
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+      "type": "ceph_default"
diff --git a/prometheus/node.rec.rules b/prometheus/node.rec.rules
new file mode 100644
index 0000000..a68530e
--- /dev/null
+++ b/prometheus/node.rec.rules
@@ -0,0 +1,68 @@
+"groups":
+- "name": "node-exporter.rules"
+  "rules":
+  - "expr": |
+      count without (cpu, mode) (
+        node_cpu_seconds_total{job="node",mode="idle"}
+      )
+    "record": "instance:node_num_cpu:sum"
+  - "expr": |
+      1 - avg without (cpu) (
+        sum without (mode) (rate(node_cpu_seconds_total{job="node", mode=~"idle|iowait|steal"}[5m]))
+      )
+    "record": "instance:node_cpu_utilisation:rate5m"
+  - "expr": |
+      (
+        node_load1{job="node"}
+      /
+        instance:node_num_cpu:sum{job="node"}
+      )
+    "record": "instance:node_load1_per_cpu:ratio"
+  - "expr": |
+      1 - (
+        (
+          node_memory_MemAvailable_bytes{job="node"}
+          or
+          (
+            node_memory_Buffers_bytes{job="node"}
+            +
+            node_memory_Cached_bytes{job="node"}
+            +
+            node_memory_MemFree_bytes{job="node"}
+            +
+            node_memory_Slab_bytes{job="node"}
+          )
+        )
+      /
+        node_memory_MemTotal_bytes{job="node"}
+      )
+    "record": "instance:node_memory_utilisation:ratio"
+  - "expr": |
+      rate(node_vmstat_pgmajfault{job="node"}[5m])
+    "record": "instance:node_vmstat_pgmajfault:rate5m"
+  - "expr": |
+      rate(node_disk_io_time_seconds_total{job="node", device!=""}[5m])
+    "record": "instance_device:node_disk_io_time_seconds:rate5m"
+  - "expr": |
+      rate(node_disk_io_time_weighted_seconds_total{job="node", device!=""}[5m])
+    "record": "instance_device:node_disk_io_time_weighted_seconds:rate5m"
+  - "expr": |
+      sum without (device) (
+        rate(node_network_receive_bytes_total{job="node", device!="lo"}[5m])
+      )
+    "record": "instance:node_network_receive_bytes_excluding_lo:rate5m"
+  - "expr": |
+      sum without (device) (
+        rate(node_network_transmit_bytes_total{job="node", device!="lo"}[5m])
+      )
+    "record": "instance:node_network_transmit_bytes_excluding_lo:rate5m"
+  - "expr": |
+      sum without (device) (
+        rate(node_network_receive_drop_total{job="node", device!="lo"}[5m])
+      )
+    "record": "instance:node_network_receive_drop_excluding_lo:rate5m"
+  - "expr": |
+      sum without (device) (
+        rate(node_network_transmit_drop_total{job="node", device!="lo"}[5m])
+      )
+    "record": "instance:node_network_transmit_drop_excluding_lo:rate5m"
diff --git a/prometheus/node.rules b/prometheus/node.rules
new file mode 100644
index 0000000..8cb7bf6
--- /dev/null
+++ b/prometheus/node.rules
@@ -0,0 +1,290 @@
+"groups":
+- "name": "node-exporter"
+  "rules":
+  - "alert": "NodeFilesystemSpaceFillingUp"
+    "annotations":
+      "description": "Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up."
+      "summary": "Filesystem is predicted to run out of space within the next 24 hours."
+    "expr": |
+      (
+        node_filesystem_avail_bytes{job="node",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node",fstype!="",mountpoint!=""} * 100 < 40
+      and
+        predict_linear(node_filesystem_avail_bytes{job="node",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
+      and
+        node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
+      )
+    "for": "1h"
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeFilesystemSpaceFillingUp"
+    "annotations":
+      "description": "Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up fast."
+      "summary": "Filesystem is predicted to run out of space within the next 4 hours."
+    "expr": |
+      (
+        node_filesystem_avail_bytes{job="node",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node",fstype!="",mountpoint!=""} * 100 < 20
+      and
+        predict_linear(node_filesystem_avail_bytes{job="node",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
+      and
+        node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
+      )
+    "for": "1h"
+    "labels":
+      "severity": "critical"
+  - "alert": "NodeFilesystemAlmostOutOfSpace"
+    "annotations":
+      "description": "Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left."
+      "summary": "Filesystem has less than 5% space left."
+    "expr": |
+      (
+        node_filesystem_avail_bytes{job="node",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node",fstype!="",mountpoint!=""} * 100 < 5
+      and
+        node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
+      )
+    "for": "30m"
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeFilesystemAlmostOutOfSpace"
+    "annotations":
+      "description": "Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left."
+      "summary": "Filesystem has less than 3% space left."
+    "expr": |
+      (
+        node_filesystem_avail_bytes{job="node",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node",fstype!="",mountpoint!=""} * 100 < 3
+      and
+        node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
+      )
+    "for": "30m"
+    "labels":
+      "severity": "critical"
+  - "alert": "NodeFilesystemFilesFillingUp"
+    "annotations":
+      "description": "Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up."
+      "summary": "Filesystem is predicted to run out of inodes within the next 24 hours."
+    "expr": |
+      (
+        node_filesystem_files_free{job="node",fstype!="",mountpoint!=""} / node_filesystem_files{job="node",fstype!="",mountpoint!=""} * 100 < 40
+      and
+        predict_linear(node_filesystem_files_free{job="node",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
+      and
+        node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
+      )
+    "for": "1h"
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeFilesystemFilesFillingUp"
+    "annotations":
+      "description": "Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up fast."
+      "summary": "Filesystem is predicted to run out of inodes within the next 4 hours."
+    "expr": |
+      (
+        node_filesystem_files_free{job="node",fstype!="",mountpoint!=""} / node_filesystem_files{job="node",fstype!="",mountpoint!=""} * 100 < 20
+      and
+        predict_linear(node_filesystem_files_free{job="node",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
+      and
+        node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
+      )
+    "for": "1h"
+    "labels":
+      "severity": "critical"
+  - "alert": "NodeFilesystemAlmostOutOfFiles"
+    "annotations":
+      "description": "Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left."
+      "summary": "Filesystem has less than 5% inodes left."
+    "expr": |
+      (
+        node_filesystem_files_free{job="node",fstype!="",mountpoint!=""} / node_filesystem_files{job="node",fstype!="",mountpoint!=""} * 100 < 5
+      and
+        node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
+      )
+    "for": "1h"
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeFilesystemAlmostOutOfFiles"
+    "annotations":
+      "description": "Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left."
+      "summary": "Filesystem has less than 3% inodes left."
+    "expr": |
+      (
+        node_filesystem_files_free{job="node",fstype!="",mountpoint!=""} / node_filesystem_files{job="node",fstype!="",mountpoint!=""} * 100 < 3
+      and
+        node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
+      )
+    "for": "1h"
+    "labels":
+      "severity": "critical"
+  - "alert": "NodeNetworkReceiveErrs"
+    "annotations":
+      "description": "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes."
+      "summary": "Network interface is reporting many receive errors."
+    "expr": |
+      rate(node_network_receive_errs_total{job="node"}[2m]) / rate(node_network_receive_packets_total{job="node"}[2m]) > 0.01
+    "for": "1h"
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeNetworkTransmitErrs"
+    "annotations":
+      "description": "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes."
+      "summary": "Network interface is reporting many transmit errors."
+    "expr": |
+      rate(node_network_transmit_errs_total{job="node"}[2m]) / rate(node_network_transmit_packets_total{job="node"}[2m]) > 0.01
+    "for": "1h"
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeHighNumberConntrackEntriesUsed"
+    "annotations":
+      "description": "{{ $value | humanizePercentage }} of conntrack entries are used."
+      "summary": "Number of conntrack entries is getting close to the limit."
+    "expr": |
+      (node_nf_conntrack_entries{job="node"} / node_nf_conntrack_entries_limit) > 0.75
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeTextFileCollectorScrapeError"
+    "annotations":
+      "description": "Node Exporter text file collector on {{ $labels.instance }} failed to scrape."
+      "summary": "Node Exporter text file collector failed to scrape."
+    "expr": |
+      node_textfile_scrape_error{job="node"} == 1
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeClockSkewDetected"
+    "annotations":
+      "description": "Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host."
+      "summary": "Clock skew detected."
+    "expr": |
+      (
+        node_timex_offset_seconds{job="node"} > 0.05
+      and
+        deriv(node_timex_offset_seconds{job="node"}[5m]) >= 0
+      )
+      or
+      (
+        node_timex_offset_seconds{job="node"} < -0.05
+      and
+        deriv(node_timex_offset_seconds{job="node"}[5m]) <= 0
+      )
+    "for": "10m"
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeClockNotSynchronising"
+    "annotations":
+      "description": "Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host."
+      "summary": "Clock not synchronising."
+    "expr": |
+      min_over_time(node_timex_sync_status{job="node"}[5m]) == 0
+      and
+      node_timex_maxerror_seconds{job="node"} >= 16
+    "for": "10m"
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeRAIDDegraded"
+    "annotations":
+      "description": "RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disk failures. Number of spare drives is insufficient to fix the issue automatically."
+      "summary": "RAID Array is degraded."
+    "expr": |
+      node_md_disks_required{job="node",device!=""} - ignoring (state) (node_md_disks{state="active",job="node",device!=""}) > 0
+    "for": "15m"
+    "labels":
+      "severity": "critical"
+  - "alert": "NodeRAIDDiskFailure"
+    "annotations":
+      "description": "At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap."
+      "summary": "Failed device in RAID array."
+    "expr": |
+      node_md_disks{state="failed",job="node",device!=""} > 0
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeFileDescriptorLimit"
+    "annotations":
+      "description": "File descriptors limit at {{ $labels.instance }} is currently at {{ printf \"%.2f\" $value }}%."
+      "summary": "Kernel is predicted to exhaust file descriptors limit soon."
+    "expr": |
+      (
+        node_filefd_allocated{job="node"} * 100 / node_filefd_maximum{job="node"} > 70
+      )
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeFileDescriptorLimit"
+    "annotations":
+      "description": "File descriptors limit at {{ $labels.instance }} is currently at {{ printf \"%.2f\" $value }}%."
+      "summary": "Kernel is predicted to exhaust file descriptors limit soon."
+    "expr": |
+      (
+        node_filefd_allocated{job="node"} * 100 / node_filefd_maximum{job="node"} > 90
+      )
+    "for": "15m"
+    "labels":
+      "severity": "critical"
+  - "alert": "NodeCPUHighUsage"
+    "annotations":
+      "description": |
+        CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
+      "summary": "High CPU usage."
+    "expr": |
+      sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node", mode!="idle"}[2m]))) * 100 > 90
+    "for": "15m"
+    "labels":
+      "severity": "info"
+  - "alert": "NodeSystemSaturation"
+    "annotations":
+      "description": |
+        System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
+        This might indicate resource saturation on this instance and can cause it to become unresponsive.
+      "summary": "System saturated, load per core is very high."
+    "expr": |
+      node_load1{job="node"}
+      / count without (cpu, mode) (node_cpu_seconds_total{job="node", mode="idle"}) > 2
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeMemoryMajorPagesFaults"
+    "annotations":
+      "description": |
+        Memory major page faults are occurring at a very high rate at {{ $labels.instance }}, above 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
+        Please check that there is enough memory available at this instance.
+      "summary": "Memory major page faults are occurring at very high rate."
+    "expr": |
+      rate(node_vmstat_pgmajfault{job="node"}[5m]) > 500
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeMemoryHighUtilization"
+    "annotations":
+      "description": |
+        Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
+      "summary": "Host is running out of memory."
+    "expr": |
+      100 - (node_memory_MemAvailable_bytes{job="node"} / node_memory_MemTotal_bytes{job="node"} * 100) > 90
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeDiskIOSaturation"
+    "annotations":
+      "description": |
+        Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}.
+        This symptom might indicate disk saturation.
+      "summary": "Disk IO queue is high."
+    "expr": |
+      rate(node_disk_io_time_weighted_seconds_total{job="node", device!=""}[5m]) > 10
+    "for": "30m"
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeSystemdServiceFailed"
+    "annotations":
+      "description": "Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}"
+      "summary": "Systemd service has entered failed state."
+    "expr": |
+      node_systemd_unit_state{job="node", state="failed"} == 1
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+  - "alert": "NodeBondingDegraded"
+    "annotations":
+      "description": "Bonding interface {{ $labels.master }} on {{ $labels.instance }} is in degraded state due to one or more slave failures."
+      "summary": "Bonding interface is degraded"
+    "expr": |
+      (node_bonding_slaves - node_bonding_active) != 0
+    "for": "5m"
+    "labels":
+      "severity": "warning"
diff --git a/prometheus/prometheus-extra.rules b/prometheus/prometheus-extra.rules
new file mode 100644
index 0000000..40df8ce
--- /dev/null
+++ b/prometheus/prometheus-extra.rules
@@ -0,0 +1,257 @@
+# Taken from https://awesome-prometheus-alerts.grep.to/rules
+
+groups:
+- name: Prometheus
+  rules:
+
+    - alert: PrometheusJobMissing
+      expr: 'absent(up{job="prometheus"})'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Prometheus job missing (instance {{ $labels.instance }})
+        description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTargetMissing
+      expr: 'up == 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus target missing (instance {{ $labels.instance }})
+        description: "A Prometheus target has disappeared. An exporter might have crashed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusAllTargetsMissing
+      expr: 'sum by (job) (up) == 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus all targets missing (instance {{ $labels.instance }})
+        description: "A Prometheus job no longer has any living targets.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTargetMissingWithWarmupTime
+      expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
+        description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusConfigurationReloadFailure
+      expr: 'prometheus_config_last_reload_successful != 1'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
+        description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTooManyRestarts
+      expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Prometheus too many restarts (instance {{ $labels.instance }})
+        description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusAlertmanagerJobMissing
+      expr: 'absent(up{job="alertmanager"})'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
+        description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusAlertmanagerConfigurationReloadFailure
+      expr: 'alertmanager_config_last_reload_successful != 1'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
+        description: "AlertManager configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusAlertmanagerConfigNotSynced
+      expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
+        description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusAlertmanagerE2eDeadManSwitch
+      expr: 'vector(1)'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
+        description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusNotConnectedToAlertmanager
+      expr: 'prometheus_notifications_alertmanagers_discovered < 1'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
+        description: "Prometheus cannot connect to the alertmanager\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusRuleEvaluationFailures
+      expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTemplateTextExpansionFailures
+      expr: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusRuleEvaluationSlow
+      expr: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
+        description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates slow storage backend access or an overly complex query.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusNotificationsBacklog
+      expr: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Prometheus notifications backlog (instance {{ $labels.instance }})
+        description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusAlertmanagerNotificationFailing
+      expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
+        description: "Alertmanager is failing to send notifications\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTargetEmpty
+      expr: 'prometheus_sd_discovered_targets == 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus target empty (instance {{ $labels.instance }})
+        description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTargetScrapingSlow
+      expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+        description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusLargeScrape
+      expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Prometheus large scrape (instance {{ $labels.instance }})
+        description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTargetScrapeDuplicate
+      expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
+        description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTsdbCheckpointCreationFailures
+      expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTsdbCheckpointDeletionFailures
+      expr: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTsdbCompactionsFailed
+      expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTsdbHeadTruncationsFailed
+      expr: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTsdbReloadFailures
+      expr: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTsdbWalCorruptions
+      expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTsdbWalTruncationsFailed
+      expr: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PrometheusTimeseriesCardinality
+      expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
+        description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/prometheus/prometheus.rec.rules b/prometheus/prometheus.rec.rules
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/prometheus/prometheus.rec.rules
@@ -0,0 +1 @@
+{}
diff --git a/prometheus/prometheus.rules b/prometheus/prometheus.rules
index 40df8ce..33f65ab 100644
--- a/prometheus/prometheus.rules
+++ b/prometheus/prometheus.rules
@@ -1,257 +1,263 @@
-# Taken from https://awesome-prometheus-alerts.grep.to/rules
-
-groups:
-- name: Prometheus
-  rules:
-
-    - alert: PrometheusJobMissing
-      expr: 'absent(up{job="prometheus"})'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus job missing (instance {{ $labels.instance }})
-        description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTargetMissing
-      expr: 'up == 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus target missing (instance {{ $labels.instance }})
-        description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusAllTargetsMissing
-      expr: 'sum by (job) (up) == 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus all targets missing (instance {{ $labels.instance }})
-        description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTargetMissingWithWarmupTime
-      expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
-        description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusConfigurationReloadFailure
-      expr: 'prometheus_config_last_reload_successful != 1'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
-        description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTooManyRestarts
-      expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus too many restarts (instance {{ $labels.instance }})
-        description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusAlertmanagerJobMissing
-      expr: 'absent(up{job="alertmanager"})'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
-        description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusAlertmanagerConfigurationReloadFailure
-      expr: 'alertmanager_config_last_reload_successful != 1'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
-        description: "AlertManager configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusAlertmanagerConfigNotSynced
-      expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
-        description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusAlertmanagerE2eDeadManSwitch
-      expr: 'vector(1)'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
-        description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusNotConnectedToAlertmanager
-      expr: 'prometheus_notifications_alertmanagers_discovered < 1'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
-        description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusRuleEvaluationFailures
-      expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
-        description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTemplateTextExpansionFailures
-      expr: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
-        description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusRuleEvaluationSlow
-      expr: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
-        description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusNotificationsBacklog
-      expr: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus notifications backlog (instance {{ $labels.instance }})
-        description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusAlertmanagerNotificationFailing
-      expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
-        description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTargetEmpty
-      expr: 'prometheus_sd_discovered_targets == 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus target empty (instance {{ $labels.instance }})
-        description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTargetScrapingSlow
-      expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus target scraping slow (instance {{ $labels.instance }})
-        description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusLargeScrape
-      expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus large scrape (instance {{ $labels.instance }})
-        description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTargetScrapeDuplicate
-      expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
-        description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTsdbCheckpointCreationFailures
-      expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
-        description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTsdbCheckpointDeletionFailures
-      expr: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
-        description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTsdbCompactionsFailed
-      expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
-        description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTsdbHeadTruncationsFailed
-      expr: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
-        description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTsdbReloadFailures
-      expr: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
-        description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTsdbWalCorruptions
-      expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
-        description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTsdbWalTruncationsFailed
-      expr: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
-        description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTimeseriesCardinality
-      expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
-        description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+"groups":
+- "name": "prometheus"
+  "rules":
+  - "alert": "PrometheusBadConfig"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} has failed to reload its configuration."
+      "summary": "Failed Prometheus configuration reload."
+    "expr": |
+      # Without max_over_time, failed scrapes could create false negatives, see
+      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+      max_over_time(prometheus_config_last_reload_successful{job="prometheus"}[5m]) == 0
+    "for": "10m"
+    "labels":
+      "severity": "critical"
+  - "alert": "PrometheusSDRefreshFailure"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} has failed to refresh SD with mechanism {{$labels.mechanism}}."
+      "summary": "Failed Prometheus SD refresh."
+    "expr": |
+      increase(prometheus_sd_refresh_failures_total{job="prometheus"}[10m]) > 0
+    "for": "20m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusKubernetesListWatchFailures"
+    "annotations":
+      "description": "Kubernetes service discovery of Prometheus {{$labels.instance}} is experiencing {{ printf \"%.0f\" $value }} failures with LIST/WATCH requests to the Kubernetes API in the last 5 minutes."
+      "summary": "Requests in Kubernetes SD are failing."
+    "expr": |
+      increase(prometheus_sd_kubernetes_failures_total{job="prometheus"}[5m]) > 0
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusNotificationQueueRunningFull"
+    "annotations":
+      "description": "Alert notification queue of Prometheus {{$labels.instance}} is running full."
+      "summary": "Prometheus alert notification queue predicted to run full in less than 30m."
+    "expr": |
+      # Without min_over_time, failed scrapes could create false negatives, see
+      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+      (
+        predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30)
+      >
+        min_over_time(prometheus_notifications_queue_capacity{job="prometheus"}[5m])
+      )
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusErrorSendingAlertsToSomeAlertmanagers"
+    "annotations":
+      "description": "{{ printf \"%.1f\" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}."
+      "summary": "Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager."
+    "expr": |
+      (
+        rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
+      /
+        rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
+      )
+      * 100
+      > 1
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusNotConnectedToAlertmanagers"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} is not connected to any Alertmanagers."
+      "summary": "Prometheus is not connected to any Alertmanagers."
+    "expr": |
+      # Without max_over_time, failed scrapes could create false negatives, see
+      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+      max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus"}[5m]) < 1
+    "for": "10m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusTSDBReloadsFailing"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h."
+      "summary": "Prometheus has issues reloading blocks from disk."
+    "expr": |
+      increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0
+    "for": "4h"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusTSDBCompactionsFailing"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h."
+      "summary": "Prometheus has issues compacting blocks."
+    "expr": |
+      increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0
+    "for": "4h"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusNotIngestingSamples"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} is not ingesting samples."
+      "summary": "Prometheus is not ingesting samples."
+    "expr": |
+      (
+        sum without(type) (rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m])) <= 0
+      and
+        (
+          sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus"}) > 0
+        or
+          sum without(rule_group) (prometheus_rule_group_rules{job="prometheus"}) > 0
+        )
+      )
+    "for": "10m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusDuplicateTimestamps"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} is dropping {{ printf \"%.4g\" $value  }} samples/s with different values but duplicated timestamp."
+      "summary": "Prometheus is dropping samples with duplicate timestamps."
+    "expr": |
+      rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0
+    "for": "10m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusOutOfOrderTimestamps"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} is dropping {{ printf \"%.4g\" $value  }} samples/s with timestamps arriving out of order."
+      "summary": "Prometheus drops samples with out-of-order timestamps."
+    "expr": |
+      rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0
+    "for": "10m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusRemoteStorageFailures"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} failed to send {{ printf \"%.1f\" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}"
+      "summary": "Prometheus fails to send samples to remote storage."
+    "expr": |
+      (
+        (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m]))
+      /
+        (
+          (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m]))
+        +
+          (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus"}[5m]))
+        )
+      )
+      * 100
+      > 1
+    "for": "15m"
+    "labels":
+      "severity": "critical"
+  - "alert": "PrometheusRemoteWriteBehind"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} remote write is {{ printf \"%.1f\" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}."
+      "summary": "Prometheus remote write is behind."
+    "expr": |
+      # Without max_over_time, failed scrapes could create false negatives, see
+      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+      (
+        max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus"}[5m])
+      - ignoring(remote_name, url) group_right
+        max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus"}[5m])
+      )
+      > 120
+    "for": "15m"
+    "labels":
+      "severity": "critical"
+  - "alert": "PrometheusRemoteWriteDesiredShards"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance=\"%s\",job=\"prometheus\"}` $labels.instance | query | first | value }}."
+      "summary": "Prometheus remote write desired shards calculation wants to run more than configured max shards."
+    "expr": |
+      # Without max_over_time, failed scrapes could create false negatives, see
+      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+      (
+        max_over_time(prometheus_remote_storage_shards_desired{job="prometheus"}[5m])
+      >
+        max_over_time(prometheus_remote_storage_shards_max{job="prometheus"}[5m])
+      )
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusRuleFailures"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} has failed to evaluate {{ printf \"%.0f\" $value }} rules in the last 5m."
+      "summary": "Prometheus is failing rule evaluations."
+    "expr": |
+      increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0
+    "for": "15m"
+    "labels":
+      "severity": "critical"
+  - "alert": "PrometheusMissingRuleEvaluations"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} has missed {{ printf \"%.0f\" $value }} rule group evaluations in the last 5m."
+      "summary": "Prometheus is missing rule evaluations due to slow rule group evaluation."
+    "expr": |
+      increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusTargetLimitHit"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} has dropped {{ printf \"%.0f\" $value }} targets because the number of targets exceeded the configured target_limit."
+      "summary": "Prometheus has dropped targets because some scrape configs have exceeded the targets limit."
+    "expr": |
+      increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusLabelLimitHit"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} has dropped {{ printf \"%.0f\" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit."
+      "summary": "Prometheus has dropped targets because some scrape configs have exceeded the labels limit."
+    "expr": |
+      increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus"}[5m]) > 0
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusScrapeBodySizeLimitHit"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} has failed {{ printf \"%.0f\" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit."
+      "summary": "Prometheus has dropped some targets that exceeded body size limit."
+    "expr": |
+      increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus"}[5m]) > 0
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusScrapeSampleLimitHit"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} has failed {{ printf \"%.0f\" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit."
+      "summary": "Prometheus has failed scrapes that have exceeded the configured sample limit."
+    "expr": |
+      increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus"}[5m]) > 0
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusTargetSyncFailure"
+    "annotations":
+      "description": "{{ printf \"%.0f\" $value }} targets in Prometheus {{$labels.instance}} have failed to sync because invalid configuration was supplied."
+      "summary": "Prometheus has failed to sync targets."
+    "expr": |
+      increase(prometheus_target_sync_failed_total{job="prometheus"}[30m]) > 0
+    "for": "5m"
+    "labels":
+      "severity": "critical"
+  - "alert": "PrometheusHighQueryLoad"
+    "annotations":
+      "description": "Prometheus {{$labels.instance}} query API has less than 20% available capacity in its query engine for the last 15 minutes."
+      "summary": "Prometheus is reaching its maximum capacity serving concurrent requests."
+    "expr": |
+      avg_over_time(prometheus_engine_queries{job="prometheus"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus"}[5m]) > 0.8
+    "for": "15m"
+    "labels":
+      "severity": "warning"
+  - "alert": "PrometheusErrorSendingAlertsToAnyAlertmanager"
+    "annotations":
+      "description": "{{ printf \"%.1f\" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager."
+      "summary": "Prometheus encounters more than 3% errors sending alerts to any Alertmanager."
+    "expr": |
+      min without (alertmanager) (
+        rate(prometheus_notifications_errors_total{job="prometheus",alertmanager!~``}[5m])
+      /
+        rate(prometheus_notifications_sent_total{job="prometheus",alertmanager!~``}[5m])
+      )
+      * 100
+      > 3
+    "for": "15m"
+    "labels":
+      "severity": "critical"

From 790b3fa3d3a70dc02a2f38a08c55bf27f6b0d536 Mon Sep 17 00:00:00 2001
From: Jan Horstmann <horstmann@osism.tech>
Date: Thu, 7 Nov 2024 10:24:45 +0100
Subject: [PATCH 2/5] Add tests for whether mixin output is checked-in

With the proposed workflow, mixin output is generated offline and
checked into version control for direct usage.
A test is added to check whether the generated rules and dashboards
match the checked-in version.
Additionally, a python-black formatting check is added for the script
generating the mixins.

Signed-off-by: Jan Horstmann <horstmann@osism.tech>
---
 .../mixin-ouput-ensure-git-checkin.yml        | 46 +++++++++++++++++++
 .zuul.yaml                                    | 16 +++++--
 2 files changed, 57 insertions(+), 5 deletions(-)
 create mode 100644 .src/playbooks/mixin-ouput-ensure-git-checkin.yml

diff --git a/.src/playbooks/mixin-ouput-ensure-git-checkin.yml b/.src/playbooks/mixin-ouput-ensure-git-checkin.yml
new file mode 100644
index 0000000..ade9e6c
--- /dev/null
+++ b/.src/playbooks/mixin-ouput-ensure-git-checkin.yml
@@ -0,0 +1,46 @@
+---
+- name: Ensure monitoring-mixin output is checked in
+  hosts: all
+
+  tasks:
+    - name: Update package cache
+      become: true
+      ansible.builtin.apt:
+        update_cache: true
+
+    - name: Apply ensure-podman role
+      ansible.builtin.import_role:
+        name: ensure-podman
+
+    - name: Build monitoring-mixin container
+      containers.podman.podman_image:
+        name: mixin-builder
+        path: "./{{ zuul.project.src_dir }}/.src"
+        build:
+          file: "./{{ zuul.project.src_dir }}/.src/Containerfile"
+
+    - name: Install monitoring-mixin dependencies
+      containers.podman.podman_container:
+        command: jb install
+        <<: &mixin_container_options  # anchor: options reused by the render task
+          name: mixin-builder
+          image: localhost/mixin-builder
+          detach: false
+          volume:
+            - "./{{ zuul.project.src_dir }}:/srv"
+          rm: true
+
+    - name: Render monitoring-mixin rules / dashboards
+      containers.podman.podman_container:
+        <<: *mixin_container_options  # no command set: presumably runs the image default (verify in Containerfile)
+
+    - name: Check difference between git and generated rules / dashboards
+      ansible.builtin.command:
+        argv:
+          - /usr/bin/git
+          - diff  # NOTE: git diff does not report untracked (newly generated) files
+          - --exit-code  # non-zero rc when tracked files differ
+        chdir: "./{{ zuul.project.src_dir }}"
+      register: git_diff
+      changed_when: false
+      failed_when: git_diff.rc != 0
diff --git a/.zuul.yaml b/.zuul.yaml
index 11b9930..69547e2 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -1,13 +1,19 @@
 ---
+- job:
+    name: mixin-ouput-ensure-git-checkin
+    description: Check for differences between monitoring-mixin output and checked in rules/dashboards
+    run: .src/playbooks/mixin-ouput-ensure-git-checkin.yml
 - project:
     default-branch: main
     merge-mode: squash-merge
     check:
-      jobs:
+      jobs: &jobs
         - yamllint
+        - python-black:
+            files: ^\.src/files/mixins\.py$
+        - mixin-ouput-ensure-git-checkin:
+            files: ^\.src/.*$
     gate:
-      jobs:
-        - yamllint
+      jobs: *jobs
     periodic-daily:
-      jobs:
-        - yamllint
+      jobs: *jobs

From 5abedaf6234bc1b9bd89a518fa79112cf8957f31 Mon Sep 17 00:00:00 2001
From: Jan Horstmann <horstmann@osism.tech>
Date: Fri, 8 Nov 2024 12:18:29 +0100
Subject: [PATCH 3/5] Remove redundant node-exporter alerts

With the alerts of the added node-exporter mixin some alerts became
redundant and are removed.

Signed-off-by: Jan Horstmann <horstmann@osism.tech>
---
 prometheus/hardware.rules | 243 --------------------------------------
 1 file changed, 243 deletions(-)

diff --git a/prometheus/hardware.rules b/prometheus/hardware.rules
index 090c704..9619270 100644
--- a/prometheus/hardware.rules
+++ b/prometheus/hardware.rules
@@ -4,96 +4,6 @@ groups:
 - name: hardware.rules
   rules:
 
-    - alert: HostOutOfMemory
-      expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host out of memory (instance {{ $labels.instance }})
-        description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostMemoryUnderMemoryPressure
-      expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host memory under memory pressure (instance {{ $labels.instance }})
-        description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostMemoryIsUnderutilized
-      expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 1w
-      labels:
-        severity: info
-      annotations:
-        summary: Host Memory is underutilized (instance {{ $labels.instance }})
-        description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostUnusualNetworkThroughputIn
-      expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host unusual network throughput in (instance {{ $labels.instance }})
-        description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostUnusualNetworkThroughputOut
-      expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host unusual network throughput out (instance {{ $labels.instance }})
-        description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostUnusualDiskReadRate
-      expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host unusual disk read rate (instance {{ $labels.instance }})
-        description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostUnusualDiskWriteRate
-      expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host unusual disk write rate (instance {{ $labels.instance }})
-        description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostOutOfDiskSpace
-      expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host out of disk space (instance {{ $labels.instance }})
-        description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostDiskWillFillIn24Hours
-      expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
-        description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostOutOfInodes
-      expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host out of inodes (instance {{ $labels.instance }})
-        description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
     - alert: HostFilesystemDeviceError
       expr: 'node_filesystem_device_error == 1'
       for: 0m
@@ -103,51 +13,6 @@ groups:
         summary: Host filesystem device error (instance {{ $labels.instance }})
         description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: HostInodesWillFillIn24Hours
-      expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
-        description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostUnusualDiskReadLatency
-      expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host unusual disk read latency (instance {{ $labels.instance }})
-        description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostUnusualDiskWriteLatency
-      expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host unusual disk write latency (instance {{ $labels.instance }})
-        description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostHighCpuLoad
-      expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 10m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host high CPU load (instance {{ $labels.instance }})
-        description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostCpuIsUnderutilized
-      expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 1w
-      labels:
-        severity: info
-      annotations:
-        summary: Host CPU is underutilized (instance {{ $labels.instance }})
-        description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
     - alert: HostCpuStealNoisyNeighbor
       expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
       for: 0m
@@ -166,42 +31,6 @@ groups:
         summary: Host CPU high iowait (instance {{ $labels.instance }})
         description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: HostUnusualDiskIo
-      expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host unusual disk IO (instance {{ $labels.instance }})
-        description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostContextSwitching
-      expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host context switching (instance {{ $labels.instance }})
-        description: "Context switching is growing on the node (> 10000 / CPU / s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostSwapIsFillingUp
-      expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host swap is filling up (instance {{ $labels.instance }})
-        description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostSystemdServiceCrashed
-      expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host systemd service crashed (instance {{ $labels.instance }})
-        description: "systemd service crashed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
     - alert: HostPhysicalComponentTooHot
       expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
       for: 5m
@@ -220,24 +49,6 @@ groups:
         summary: Host node overtemperature alarm (instance {{ $labels.instance }})
         description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: HostRaidArrayGotInactive
-      expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Host RAID array got inactive (instance {{ $labels.instance }})
-        description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostRaidDiskFailure
-      expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host RAID disk failure (instance {{ $labels.instance }})
-        description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
     - alert: HostKernelVersionDeviations
       expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
       for: 6h
@@ -274,24 +85,6 @@ groups:
         summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
         description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: HostNetworkReceiveErrors
-      expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host Network Receive Errors (instance {{ $labels.instance }})
-        description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostNetworkTransmitErrors
-      expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host Network Transmit Errors (instance {{ $labels.instance }})
-        description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
     - alert: HostNetworkInterfaceSaturated
       expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
       for: 1m
@@ -301,42 +94,6 @@ groups:
         summary: Host Network Interface Saturated (instance {{ $labels.instance }})
         description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: HostNetworkBondDegraded
-      expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host Network Bond Degraded (instance {{ $labels.instance }})
-        description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostConntrackLimit
-      expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host conntrack limit (instance {{ $labels.instance }})
-        description: "The number of conntrack is approaching limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostClockSkew
-      expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 10m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host clock skew (instance {{ $labels.instance }})
-        description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostClockNotSynchronising
-      expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host clock not synchronising (instance {{ $labels.instance }})
-        description: "Clock not synchronising. Ensure NTP is configured on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
     - alert: HostRequiresReboot
       expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
       for: 4h

From 299b20e441fa89d82a0c48da8859df572fa3fdbf Mon Sep 17 00:00:00 2001
From: Jan Horstmann <horstmann@osism.tech>
Date: Fri, 8 Nov 2024 12:52:09 +0100
Subject: [PATCH 4/5] Remove redundant prometheus/alertmanager rules

With the alerts of the added prometheus and alertmanager mixins, some
alerts became redundant and are removed.

Signed-off-by: Jan Horstmann <horstmann@osism.tech>
---
 prometheus/prometheus-extra.rules | 101 +-----------------------------
 1 file changed, 1 insertion(+), 100 deletions(-)

diff --git a/prometheus/prometheus-extra.rules b/prometheus/prometheus-extra.rules
index 40df8ce..7f1df79 100644
--- a/prometheus/prometheus-extra.rules
+++ b/prometheus/prometheus-extra.rules
@@ -40,17 +40,8 @@ groups:
         summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
         description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: PrometheusConfigurationReloadFailure
-      expr: 'prometheus_config_last_reload_successful != 1'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
-        description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
     - alert: PrometheusTooManyRestarts
-      expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
+      expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway"}[15m]) > 2'
       for: 0m
       labels:
         severity: warning
@@ -67,24 +58,6 @@ groups:
         summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
         description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: PrometheusAlertmanagerConfigurationReloadFailure
-      expr: 'alertmanager_config_last_reload_successful != 1'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
-        description: "AlertManager configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusAlertmanagerConfigNotSynced
-      expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
-        description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
     - alert: PrometheusAlertmanagerE2eDeadManSwitch
       expr: 'vector(1)'
       for: 0m
@@ -94,24 +67,6 @@ groups:
         summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
         description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: PrometheusNotConnectedToAlertmanager
-      expr: 'prometheus_notifications_alertmanagers_discovered < 1'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
-        description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusRuleEvaluationFailures
-      expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
-        description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
     - alert: PrometheusTemplateTextExpansionFailures
       expr: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0'
       for: 0m
@@ -130,24 +85,6 @@ groups:
         summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
         description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: PrometheusNotificationsBacklog
-      expr: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus notifications backlog (instance {{ $labels.instance }})
-        description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusAlertmanagerNotificationFailing
-      expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
-        description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
     - alert: PrometheusTargetEmpty
       expr: 'prometheus_sd_discovered_targets == 0'
       for: 0m
@@ -166,24 +103,6 @@ groups:
         summary: Prometheus target scraping slow (instance {{ $labels.instance }})
         description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: PrometheusLargeScrape
-      expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus large scrape (instance {{ $labels.instance }})
-        description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: PrometheusTargetScrapeDuplicate
-      expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
-      for: 0m
-      labels:
-        severity: warning
-      annotations:
-        summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
-        description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
     - alert: PrometheusTsdbCheckpointCreationFailures
       expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
       for: 0m
@@ -202,15 +121,6 @@ groups:
         summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
         description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: PrometheusTsdbCompactionsFailed
-      expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
-        description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
     - alert: PrometheusTsdbHeadTruncationsFailed
       expr: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0'
       for: 0m
@@ -220,15 +130,6 @@ groups:
         summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
         description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: PrometheusTsdbReloadFailures
-      expr: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
-        description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
     - alert: PrometheusTsdbWalCorruptions
       expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0'
       for: 0m

From 665ae7da5450b5c456a6ec7fbb7aa892ee95881c Mon Sep 17 00:00:00 2001
From: Jan Horstmann <horstmann@osism.tech>
Date: Sat, 9 Nov 2024 16:59:05 +0100
Subject: [PATCH 5/5] Add opensearch-mixin

Signed-off-by: Jan Horstmann <horstmann@osism.tech>
---
 .src/jsonnetfile.json                         |    9 +
 .src/jsonnetfile.lock.json                    |   50 +
 .../infrastructure/opensearch.libsonnet       |   10 +
 .../infrastructure/node-overview.json         | 1661 +++++++++++
 .../opensearch-cluster-overview.json          | 2089 ++++++++++++++
 .../search-and-index-overview.json            | 2470 +++++++++++++++++
 prometheus/opensearch.rec.rules               |    1 +
 prometheus/opensearch.rules                   |  121 +
 8 files changed, 6411 insertions(+)
 create mode 100644 .src/mixins/infrastructure/opensearch.libsonnet
 create mode 100644 grafana/dashboards/infrastructure/node-overview.json
 create mode 100644 grafana/dashboards/infrastructure/opensearch-cluster-overview.json
 create mode 100644 grafana/dashboards/infrastructure/search-and-index-overview.json
 create mode 100644 prometheus/opensearch.rec.rules
 create mode 100644 prometheus/opensearch.rules

diff --git a/.src/jsonnetfile.json b/.src/jsonnetfile.json
index 148f54c..22d7d00 100644
--- a/.src/jsonnetfile.json
+++ b/.src/jsonnetfile.json
@@ -19,6 +19,15 @@
       },
       "version": "main"
     },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/jsonnet-libs.git",
+          "subdir": "opensearch-mixin"
+        }
+      },
+      "version": "master"
+    },
     {
       "source": {
         "git": {
diff --git a/.src/jsonnetfile.lock.json b/.src/jsonnetfile.lock.json
index 458fe6d..44c9d09 100644
--- a/.src/jsonnetfile.lock.json
+++ b/.src/jsonnetfile.lock.json
@@ -41,6 +41,26 @@
       "version": "82a19822e54a0a12a51e24dbd48fcde717dc0864",
       "sum": "64fMUPI3frXGj4X1FqFd1t7r04w3CUSmXaDcJ23EYbQ="
     },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/grafonnet.git",
+          "subdir": "gen/grafonnet-v10.0.0"
+        }
+      },
+      "version": "82a19822e54a0a12a51e24dbd48fcde717dc0864",
+      "sum": "xdcrJPJlpkq4+5LpGwN4tPAuheNNLXZjE6tDcyvFjr0="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/grafonnet.git",
+          "subdir": "gen/grafonnet-v11.0.0"
+        }
+      },
+      "version": "82a19822e54a0a12a51e24dbd48fcde717dc0864",
+      "sum": "0BvzR0i4bS4hc2O3xDv6i9m52z7mPrjvqxtcPrGhynA="
+    },
     {
       "source": {
         "git": {
@@ -51,6 +71,16 @@
       "version": "82a19822e54a0a12a51e24dbd48fcde717dc0864",
       "sum": "41w7p/rwrNsITqNHMXtGSJAfAyKmnflg6rFhKBduUxM="
     },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/jsonnet-libs.git",
+          "subdir": "common-lib"
+        }
+      },
+      "version": "a8fc2139d881ae632a8c956eb9dd4b84b24f362e",
+      "sum": "c2Omoqo8FTwR/V3VC+hRN5CEyI0UDD1OyYXnEqwiKLY="
+    },
     {
       "source": {
         "git": {
@@ -61,6 +91,16 @@
       "version": "a8fc2139d881ae632a8c956eb9dd4b84b24f362e",
       "sum": "yxqWcq/N3E/a/XreeU6EuE6X7kYPnG0AspAQFKOjASo="
     },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/jsonnet-libs.git",
+          "subdir": "opensearch-mixin"
+        }
+      },
+      "version": "a8fc2139d881ae632a8c956eb9dd4b84b24f362e",
+      "sum": "AK83KBy5roMxhT0taG54ERV20oG9mhaCJA+EHRzuPO4="
+    },
     {
       "source": {
         "git": {
@@ -110,6 +150,16 @@
       },
       "version": "d3074b39c38493ebb81514c0ec962b7853ed0162",
       "sum": "dYLcLzGH4yF3qB7OGC/7z4nqeTNjv42L7Q3BENU8XJI="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/yugui/jsonnetunit.git",
+          "subdir": "jsonnetunit"
+        }
+      },
+      "version": "6927c58cae7624a00f368b977ccc477d4f74071f",
+      "sum": "9FFqqln65hooRF0l6rjICDtnTxUlmDj34+sKMh4sjPI="
     }
   ],
   "legacyImports": false
diff --git a/.src/mixins/infrastructure/opensearch.libsonnet b/.src/mixins/infrastructure/opensearch.libsonnet
new file mode 100644
index 0000000..9f9ba68
--- /dev/null
+++ b/.src/mixins/infrastructure/opensearch.libsonnet
@@ -0,0 +1,10 @@
+local opensearch = import "opensearch-mixin/mixin.libsonnet";  // upstream OpenSearch monitoring mixin, resolved through the jsonnet-bundler vendor path
+
+opensearch {  // extend the upstream mixin object with local overrides
+  _config+:: {  // merge into the mixin's hidden config object instead of replacing it
+    enableLokiLogs: false,  // NOTE(review): presumably disables the mixin's Loki-based log panels; confirm against the mixin's config
+  },
+  prometheusRules+: {},  // empty merge: changes nothing; presumably a placeholder for local recording-rule overrides
+  prometheusAlerts+: {},  // empty merge: changes nothing; presumably a placeholder for local alert overrides
+  grafanaDashboards+: {}  // empty merge: changes nothing; presumably a placeholder for local dashboard overrides
+}
diff --git a/grafana/dashboards/infrastructure/node-overview.json b/grafana/dashboards/infrastructure/node-overview.json
new file mode 100644
index 0000000..cebcac7
--- /dev/null
+++ b/grafana/dashboards/infrastructure/node-overview.json
@@ -0,0 +1,1661 @@
+{
+   "links": [
+      {
+         "asDropdown": false,
+         "includeVars": true,
+         "keepTime": true,
+         "tags": [
+            "opensearch-mixin"
+         ],
+         "title": "Other Opensearch dashboards",
+         "type": "dashboards"
+      }
+   ],
+   "panels": [
+      {
+         "datasource": {
+            "type": "datasource",
+            "uid": "-- Mixed --"
+         },
+         "description": "OpenSearch node roles over time.",
+         "fieldConfig": {
+            "defaults": {
+               "mappings": [
+                  {
+                     "options": {
+                        "2": {
+                           "color": "light-purple",
+                           "index": 0,
+                           "text": "data"
+                        },
+                        "3": {
+                           "color": "light-green",
+                           "index": 1,
+                           "text": "master"
+                        },
+                        "4": {
+                           "color": "light-blue",
+                           "index": 2,
+                           "text": "ingest"
+                        },
+                        "5": {
+                           "color": "light-yellow",
+                           "index": 3,
+                           "text": "cluster_manager"
+                        },
+                        "6": {
+                           "color": "super-light-red",
+                           "index": 4,
+                           "text": "remote_cluster_client"
+                        }
+                     },
+                     "type": "value"
+                  }
+               ]
+            }
+         },
+         "gridPos": {
+            "h": 5,
+            "w": 24,
+            "x": 0,
+            "y": 0
+         },
+         "id": 1,
+         "maxDataPoints": 100,
+         "options": {
+            "legend": false,
+            "showValue": "never"
+         },
+         "pluginVersion": "v10.0.0",
+         "targets": [
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "max by (node,role) (\n    max_over_time(opensearch_node_role_bool{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\", role=\"data\"}[1m]) == 1\n) * 2\n",
+               "legendFormat": "{{node}}"
+            },
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "max by (node,role) (\n    max_over_time(opensearch_node_role_bool{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\", role=\"master\"}[1m]) == 1\n) * 3\n",
+               "legendFormat": "{{node}}"
+            },
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "max by (node,role) (\n    max_over_time(opensearch_node_role_bool{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\", role=\"ingest\"}[1m]) == 1\n) * 4\n",
+               "legendFormat": "{{node}}"
+            },
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "max by (node,role) (\n    max_over_time(opensearch_node_role_bool{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\", role=\"cluster_manager\"}[1m]) == 1\n) * 5\n",
+               "legendFormat": "{{node}}"
+            },
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "max by (node,role) (\n    max_over_time(opensearch_node_role_bool{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\", role=\"remote_cluster_client\"}[1m]) == 1\n) * 6\n",
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Roles timeline",
+         "type": "status-history"
+      },
+      {
+         "collapsed": false,
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 1
+         },
+         "id": 2,
+         "targets": [ ],
+         "title": "Node health",
+         "type": "row"
+      },
+      {
+         "datasource": {
+            "type": "prometheus",
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "CPU usage percentage of the node's Operating System.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "continuous-BlYlRd"
+               },
+               "custom": {
+                  "fillOpacity": 5,
+                  "gradientMode": "scheme",
+                  "lineInterpolation": "smooth",
+                  "lineWidth": 2,
+                  "showPoints": "never"
+               },
+               "decimals": 1,
+               "max": 100,
+               "min": 0,
+               "unit": "percent"
+            }
+         },
+         "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 0,
+            "y": 2
+         },
+         "id": 3,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list"
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "desc"
+            }
+         },
+         "pluginVersion": "v11.0.0",
+         "targets": [
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "opensearch_os_cpu_percent{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"}",
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Node CPU usage",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "type": "prometheus",
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Memory usage percentage of the node for the Operating System and OpenSearch",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "continuous-BlYlRd"
+               },
+               "custom": {
+                  "fillOpacity": 5,
+                  "gradientMode": "scheme",
+                  "lineInterpolation": "smooth",
+                  "lineWidth": 2,
+                  "showPoints": "never"
+               },
+               "decimals": 1,
+               "max": 100,
+               "min": 0,
+               "unit": "percent"
+            }
+         },
+         "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 6,
+            "y": 2
+         },
+         "id": 4,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list"
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "desc"
+            }
+         },
+         "pluginVersion": "v11.0.0",
+         "targets": [
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "opensearch_os_mem_used_percent{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"}",
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Node memory usage",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "type": "prometheus",
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Node file system read and write data.",
+         "fieldConfig": {
+            "defaults": {
+               "custom": {
+                  "fillOpacity": 1,
+                  "gradientMode": "opacity",
+                  "lineInterpolation": "smooth",
+                  "lineWidth": 2,
+                  "showPoints": "never",
+                  "stacking": "normal"
+               },
+               "unit": "Bps"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byRegexp",
+                     "options": "/time|used|busy|util/"
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.axisSoftMax",
+                        "value": 100
+                     },
+                     {
+                        "id": "custom.drawStyle",
+                        "value": "points"
+                     },
+                     {
+                        "id": "unit",
+                        "value": "percent"
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 12,
+            "y": 2
+         },
+         "id": 5,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list"
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "desc"
+            }
+         },
+         "pluginVersion": "v11.0.0",
+         "targets": [
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by(job,opensearch_cluster,node) (rate(opensearch_fs_io_total_read_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"}[$__rate_interval]))",
+               "legendFormat": "{{node}} - read"
+            },
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by(job,opensearch_cluster,node) (rate(opensearch_fs_io_total_write_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"}[$__rate_interval]))",
+               "legendFormat": "{{node}} - write"
+            }
+         ],
+         "title": "Node I/O",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "type": "prometheus",
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Number of open connections for the selected node.",
+         "fieldConfig": {
+            "defaults": {
+               "custom": {
+                  "fillOpacity": 30,
+                  "gradientMode": "opacity",
+                  "lineInterpolation": "smooth",
+                  "lineWidth": 2,
+                  "showPoints": "never",
+                  "stacking": "normal"
+               },
+               "unit": ""
+            }
+         },
+         "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 18,
+            "y": 2
+         },
+         "id": 6,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list"
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "desc"
+            }
+         },
+         "pluginVersion": "v11.0.0",
+         "targets": [
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by (job,opensearch_cluster,node) (opensearch_transport_server_open_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"})",
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Node open connections",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "type": "prometheus",
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Disk usage percentage of the selected node.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "continuous-BlYlRd"
+               },
+               "custom": {
+                  "fillOpacity": 1,
+                  "gradientMode": "scheme",
+                  "lineInterpolation": "smooth",
+                  "lineWidth": 2,
+                  "showPoints": "never"
+               },
+               "decimals": 1,
+               "max": 100,
+               "min": 0,
+               "unit": "percent"
+            }
+         },
+         "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 0,
+            "y": 8
+         },
+         "id": 7,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list"
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "desc"
+            }
+         },
+         "pluginVersion": "v11.0.0",
+         "targets": [
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "100 - (100 * opensearch_fs_path_free_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"} / clamp_min(opensearch_fs_path_total_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"}, 1))",
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Node disk usage",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "type": "prometheus",
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Percentage of swap space used by OpenSearch and the Operating System on the selected node.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "continuous-BlYlRd"
+               },
+               "custom": {
+                  "fillOpacity": 5,
+                  "gradientMode": "scheme",
+                  "lineInterpolation": "smooth",
+                  "lineWidth": 2,
+                  "showPoints": "never"
+               },
+               "decimals": 1,
+               "max": 100,
+               "min": 0,
+               "unit": "percent"
+            }
+         },
+         "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 6,
+            "y": 8
+         },
+         "id": 8,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list"
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "desc"
+            }
+         },
+         "pluginVersion": "v11.0.0",
+         "targets": [
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "100 * opensearch_os_swap_used_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"} / clamp_min((opensearch_os_swap_used_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"} + opensearch_os_swap_free_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"}), 1)",
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Node memory swap",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "type": "prometheus",
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Node network traffic sent and received.",
+         "fieldConfig": {
+            "defaults": {
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisLabel": "out(-) | in(+)",
+                  "fillOpacity": 5,
+                  "gradientMode": "opacity",
+                  "lineInterpolation": "smooth",
+                  "lineWidth": 2,
+                  "showPoints": "never"
+               },
+               "decimals": 1,
+               "unit": "bps"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byRegexp",
+                     "options": "/sent/"
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.transform",
+                        "value": "negative-Y"
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 12,
+            "y": 8
+         },
+         "id": 9,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list"
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "desc"
+            }
+         },
+         "pluginVersion": "v11.0.0",
+         "targets": [
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by (job,opensearch_cluster,node) (rate(opensearch_transport_tx_bytes_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"}[$__rate_interval])) * 8",
+               "legendFormat": "{{node}} - sent"
+            },
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by (job,opensearch_cluster,node) (rate(opensearch_transport_rx_bytes_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"}[$__rate_interval])) * 8",
+               "legendFormat": "{{node}} - received"
+            }
+         ],
+         "title": "Node network traffic",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Circuit breakers tripped on the selected node by type",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     }
+                  ]
+               },
+               "unit": "trips"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 7,
+            "w": 6,
+            "x": 18,
+            "y": 8
+         },
+         "id": 10,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by(name, job,opensearch_cluster,node) (increase(opensearch_circuitbreaker_tripped_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"}[$__interval:]))",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}} - {{ name }}"
+            }
+         ],
+         "title": "Circuit breakers",
+         "type": "timeseries"
+      },
+      {
+         "collapsed": false,
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 15
+         },
+         "id": 11,
+         "targets": [ ],
+         "title": "Node JVM",
+         "type": "row"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The amount of heap memory used vs committed on the selected node.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     }
+                  ]
+               },
+               "unit": "bytes"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 6,
+            "w": 6,
+            "x": 0,
+            "y": 16
+         },
+         "id": 12,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by (job,opensearch_cluster,node) (opensearch_jvm_mem_heap_used_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}} - used"
+            },
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by (job,opensearch_cluster,node) (opensearch_jvm_mem_heap_committed_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}} - committed"
+            }
+         ],
+         "title": "JVM heap used vs. committed",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The amount of non-heap memory used vs committed on the selected node.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     }
+                  ]
+               },
+               "unit": "bytes"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 6,
+            "w": 6,
+            "x": 6,
+            "y": 16
+         },
+         "id": 13,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by (job,opensearch_cluster,node) (opensearch_jvm_mem_nonheap_used_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}} - used"
+            },
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by (job,opensearch_cluster,node) (opensearch_jvm_mem_nonheap_committed_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}} - committed"
+            }
+         ],
+         "title": "JVM non-heap used vs. committed",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The number of threads running in the JVM on the selected node.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "threads"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 6,
+            "w": 6,
+            "x": 12,
+            "y": 16
+         },
+         "id": 14,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by (job,opensearch_cluster,node) (opensearch_jvm_threads_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "JVM threads",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The number of buffer pools available on the selected node.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "buffer pools"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 6,
+            "w": 6,
+            "x": 18,
+            "y": 16
+         },
+         "id": 15,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by( job,opensearch_cluster,node, bufferpool) (opensearch_jvm_bufferpool_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}} - {{bufferpool}}"
+            }
+         ],
+         "title": "JVM buffer pools",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The uptime of the JVM in seconds on the selected node.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "s"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 6,
+            "w": 6,
+            "x": 0,
+            "y": 22
+         },
+         "id": 16,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "pluginVersion": "9.4.3",
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by(job,opensearch_cluster,node) (opensearch_jvm_uptime_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "JVM uptime",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The number of garbage collection operations on the selected node.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "operations"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 6,
+            "w": 6,
+            "x": 6,
+            "y": 22
+         },
+         "id": 17,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by (job,opensearch_cluster,node) (increase(opensearch_jvm_gc_collection_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"}[$__interval:]))",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "JVM garbage collections",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The amount of time spent on garbage collection on the selected node.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     }
+                  ]
+               },
+               "unit": "s"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 6,
+            "w": 6,
+            "x": 12,
+            "y": 22
+         },
+         "id": 18,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by (job,opensearch_cluster,node) (increase(opensearch_jvm_gc_collection_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"}[$__interval:]))",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "JVM garbage collection time",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The percent used of JVM buffer pool memory.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "percent"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 6,
+            "w": 6,
+            "x": 18,
+            "y": 22
+         },
+         "id": 19,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "100 * (sum by (job,opensearch_cluster,node, bufferpool) (opensearch_jvm_bufferpool_used_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"})) / clamp_min((sum by (job,opensearch_cluster,node, bufferpool) (opensearch_jvm_bufferpool_total_capacity_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"})),1)",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}} - {{bufferpool}}"
+            }
+         ],
+         "title": "JVM buffer pool usage",
+         "type": "timeseries"
+      },
+      {
+         "collapsed": false,
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 28
+         },
+         "id": 20,
+         "targets": [ ],
+         "title": "Thread pools",
+         "type": "row"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The number of threads in the thread pool for the selected node.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green"
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "threads"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 29
+         },
+         "id": 21,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by(job,opensearch_cluster,node) ((opensearch_threadpool_threads_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"}))",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Thread pool threads",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The number of tasks in the thread pool for the selected node.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green"
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "tasks"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 29
+         },
+         "id": 22,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by (job,opensearch_cluster,node) (opensearch_threadpool_tasks_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",node=~\"$node\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Thread pool tasks",
+         "type": "timeseries"
+      }
+   ],
+   "refresh": "1m",
+   "schemaVersion": 36,
+   "tags": [
+      "opensearch-mixin"
+   ],
+   "templating": {
+      "list": [
+         {
+            "label": "Prometheus data source",
+            "name": "prometheus_datasource",
+            "query": "prometheus",
+            "regex": "",
+            "type": "datasource"
+         },
+         {
+            "allValue": ".+",
+            "datasource": {
+               "type": "prometheus",
+               "uid": "${prometheus_datasource}"
+            },
+            "includeAll": true,
+            "label": "Job",
+            "multi": true,
+            "name": "job",
+            "query": "label_values(opensearch_os_cpu_percent{opensearch_cluster!=\"\"}, job)",
+            "refresh": 2,
+            "sort": 1,
+            "type": "query"
+         },
+         {
+            "allValue": ".+",
+            "datasource": {
+               "type": "prometheus",
+               "uid": "${prometheus_datasource}"
+            },
+            "includeAll": true,
+            "label": "OpenSearch cluster",
+            "multi": true,
+            "name": "opensearch_cluster",
+            "query": "label_values(opensearch_os_cpu_percent{opensearch_cluster!=\"\",job=~\"$job\"}, opensearch_cluster)",
+            "refresh": 2,
+            "sort": 1,
+            "type": "query"
+         },
+         {
+            "allValue": ".+",
+            "datasource": {
+               "type": "prometheus",
+               "uid": "${prometheus_datasource}"
+            },
+            "includeAll": true,
+            "label": "Node",
+            "multi": true,
+            "name": "node",
+            "query": "label_values(opensearch_os_cpu_percent{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"}, node)",
+            "refresh": 2,
+            "sort": 1,
+            "type": "query"
+         }
+      ]
+   },
+   "time": {
+      "from": "now-1h",
+      "to": "now"
+   },
+   "timezone": "default",
+   "title": "OpenSearch node overview",
+   "uid": "opensearch-node-overview"
+}
diff --git a/grafana/dashboards/infrastructure/opensearch-cluster-overview.json b/grafana/dashboards/infrastructure/opensearch-cluster-overview.json
new file mode 100644
index 0000000..27af57f
--- /dev/null
+++ b/grafana/dashboards/infrastructure/opensearch-cluster-overview.json
@@ -0,0 +1,2089 @@
+{
+   "links": [
+      {
+         "asDropdown": false,
+         "includeVars": true,
+         "keepTime": true,
+         "tags": [
+            "opensearch-mixin"
+         ],
+         "title": "Other OpenSearch dashboards",
+         "type": "dashboards"
+      }
+   ],
+   "panels": [
+      {
+         "datasource": {
+            "type": "datasource",
+            "uid": "-- Mixed --"
+         },
+         "description": "OpenSearch node roles.",
+         "fieldConfig": {
+            "defaults": {
+               "mappings": [
+                  {
+                     "options": {
+                        "0": {
+                           "color": "super-light-orange",
+                           "index": 5,
+                           "text": "False"
+                        },
+                        "1": {
+                           "color": "light-green",
+                           "index": 3,
+                           "text": "True"
+                        },
+                        "Data": {
+                           "color": "light-purple",
+                           "index": 0,
+                           "text": "data"
+                        },
+                        "Ingest": {
+                           "color": "light-blue",
+                           "index": 2,
+                           "text": "ingest"
+                        },
+                        "Master": {
+                           "color": "light-green",
+                           "index": 1,
+                           "text": "master"
+                        },
+                        "Remote cluster client": {
+                           "color": "light-orange",
+                           "index": 4,
+                           "text": "remote_cluster_client"
+                        }
+                     },
+                     "type": "value"
+                  }
+               ]
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byRegexp",
+                     "options": "/Data|Master|Ingest|Remote.+|Cluster.+/"
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.cellOptions",
+                        "value": {
+                           "type": "color-text"
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 6,
+            "w": 24,
+            "x": 0,
+            "y": 0
+         },
+         "id": 1,
+         "pluginVersion": "v10.0.0",
+         "targets": [
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "max by (job,opensearch_cluster,node,nodeid,role,primary_ip) (last_over_time(opensearch_node_role_bool{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"}[1d]))",
+               "instant": true,
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Roles",
+         "transformations": [
+            {
+               "id": "labelsToFields",
+               "options": {
+                  "mode": "columns",
+                  "valueLabel": "role"
+               }
+            },
+            {
+               "id": "merge",
+               "options": { }
+            },
+            {
+               "id": "organize",
+               "options": {
+                  "excludeByName": {
+                     "Time": true
+                  },
+                  "indexByName": {
+                     "Time": 0,
+                     "cluster_manager": 108,
+                     "data": 105,
+                     "ingest": 106,
+                     "job": 3,
+                     "master": 104,
+                     "node": 3,
+                     "nodeid": 3,
+                     "opensearch_cluster": 3,
+                     "remote_cluster_client": 107
+                  },
+                  "renameByName": {
+                     "Time": "",
+                     "cluster": "Cluster",
+                     "cluster_manager": "Cluster manager",
+                     "data": "Data",
+                     "ingest": "Ingest",
+                     "master": "Master",
+                     "node": "Node",
+                     "nodeid": "Nodeid",
+                     "remote_cluster_client": "Remote cluster client"
+                  }
+               }
+            }
+         ],
+         "type": "table"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The overall health and availability of the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "thresholds"
+               },
+               "mappings": [
+                  {
+                     "options": {
+                        "0": {
+                           "index": 0,
+                           "text": "Green"
+                        },
+                        "1": {
+                           "index": 1,
+                           "text": "Yellow"
+                        },
+                        "2": {
+                           "index": 2,
+                           "text": "Red"
+                        }
+                     },
+                     "type": "value"
+                  }
+               ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "green",
+                        "value": 0
+                     },
+                     {
+                        "color": "yellow",
+                        "value": 1
+                     },
+                     {
+                        "color": "red",
+                        "value": 2
+                     }
+                  ]
+               }
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 5,
+            "w": 3,
+            "x": 0,
+            "y": 2
+         },
+         "id": 2,
+         "options": {
+            "colorMode": "value",
+            "graphMode": "none",
+            "justifyMode": "auto",
+            "orientation": "auto",
+            "reduceOptions": {
+               "calcs": [
+                  "lastNotNull"
+               ],
+               "fields": "",
+               "values": false
+            },
+            "textMode": "auto"
+         },
+         "pluginVersion": "9.4.3",
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "min by(job,opensearch_cluster) (opensearch_cluster_status{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{job}}/{{opensearch_cluster}}"
+            }
+         ],
+         "title": "Cluster status",
+         "type": "stat"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The number of running nodes across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "thresholds"
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 0
+                     },
+                     {
+                        "color": "green",
+                        "value": 1
+                     }
+                  ]
+               }
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 5,
+            "w": 3,
+            "x": 3,
+            "y": 2
+         },
+         "id": 3,
+         "options": {
+            "colorMode": "value",
+            "graphMode": "none",
+            "justifyMode": "auto",
+            "orientation": "auto",
+            "reduceOptions": {
+               "calcs": [
+                  "lastNotNull"
+               ],
+               "fields": "",
+               "values": false
+            },
+            "textMode": "auto"
+         },
+         "pluginVersion": "9.4.3",
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "min by(job,opensearch_cluster) (opensearch_cluster_nodes_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{job}}/{{opensearch_cluster}}"
+            }
+         ],
+         "title": "Node count",
+         "type": "stat"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The number of data nodes in the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "thresholds"
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 0
+                     },
+                     {
+                        "color": "green",
+                        "value": 1
+                     }
+                  ]
+               }
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 5,
+            "w": 3,
+            "x": 6,
+            "y": 2
+         },
+         "id": 4,
+         "options": {
+            "colorMode": "value",
+            "graphMode": "none",
+            "justifyMode": "auto",
+            "orientation": "auto",
+            "reduceOptions": {
+               "calcs": [
+                  "lastNotNull"
+               ],
+               "fields": "",
+               "values": false
+            },
+            "textMode": "auto"
+         },
+         "pluginVersion": "9.4.3",
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "min by(job,opensearch_cluster) (opensearch_cluster_datanodes_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{job}}/{{opensearch_cluster}}"
+            }
+         ],
+         "title": "Data node count",
+         "type": "stat"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The number of shards in the OpenSearch cluster across all indices.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "thresholds"
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 0
+                     },
+                     {
+                        "color": "green",
+                        "value": 1
+                     }
+                  ]
+               }
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 5,
+            "w": 3,
+            "x": 9,
+            "y": 2
+         },
+         "id": 5,
+         "options": {
+            "colorMode": "value",
+            "graphMode": "none",
+            "justifyMode": "auto",
+            "orientation": "auto",
+            "reduceOptions": {
+               "calcs": [
+                  "lastNotNull"
+               ],
+               "fields": "",
+               "values": false
+            },
+            "textMode": "auto"
+         },
+         "pluginVersion": "9.4.3",
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum(max by (type) (opensearch_cluster_shards_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"}))",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{job}}/{{opensearch_cluster}}"
+            }
+         ],
+         "title": "Shard count",
+         "type": "stat"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Percent of active shards across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "thresholds"
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 0
+                     },
+                     {
+                        "color": "yellow",
+                        "value": 1
+                     },
+                     {
+                        "color": "green",
+                        "value": 100
+                     }
+                  ]
+               },
+               "unit": "percent"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 5,
+            "w": 3,
+            "x": 12,
+            "y": 2
+         },
+         "id": 6,
+         "options": {
+            "colorMode": "value",
+            "graphMode": "none",
+            "justifyMode": "auto",
+            "orientation": "auto",
+            "reduceOptions": {
+               "calcs": [
+                  "lastNotNull"
+               ],
+               "fields": "",
+               "values": false
+            },
+            "textMode": "auto"
+         },
+         "pluginVersion": "9.4.3",
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "min by(job,opensearch_cluster) (opensearch_cluster_shards_active_percent{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{job}}/{{opensearch_cluster}}"
+            }
+         ],
+         "title": "Active shards %",
+         "type": "stat"
+      },
+      {
+         "datasource": {
+            "type": "datasource",
+            "uid": "-- Mixed --"
+         },
+         "description": "OpenSearch node roles over time.",
+         "fieldConfig": {
+            "defaults": {
+               "mappings": [
+                  {
+                     "options": {
+                        "2": {
+                           "color": "light-purple",
+                           "index": 0,
+                           "text": "data"
+                        },
+                        "3": {
+                           "color": "light-green",
+                           "index": 1,
+                           "text": "master"
+                        },
+                        "4": {
+                           "color": "light-blue",
+                           "index": 2,
+                           "text": "ingest"
+                        },
+                        "5": {
+                           "color": "light-yellow",
+                           "index": 3,
+                           "text": "cluster_manager"
+                        },
+                        "6": {
+                           "color": "super-light-red",
+                           "index": 4,
+                           "text": "remote_cluster_client"
+                        }
+                     },
+                     "type": "value"
+                  }
+               ]
+            }
+         },
+         "gridPos": {
+            "h": 5,
+            "w": 9,
+            "x": 15,
+            "y": 2
+         },
+         "id": 7,
+         "maxDataPoints": 100,
+         "options": {
+            "legend": false,
+            "showValue": "never"
+         },
+         "pluginVersion": "v10.0.0",
+         "targets": [
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "max by (node,role) (\n    max_over_time(opensearch_node_role_bool{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", role=\"data\"}[1m]) == 1\n) * 2\n",
+               "legendFormat": "{{node}}"
+            },
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "max by (node,role) (\n    max_over_time(opensearch_node_role_bool{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", role=\"master\"}[1m]) == 1\n) * 3\n",
+               "legendFormat": "{{node}}"
+            },
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "max by (node,role) (\n    max_over_time(opensearch_node_role_bool{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", role=\"ingest\"}[1m]) == 1\n) * 4\n",
+               "legendFormat": "{{node}}"
+            },
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "max by (node,role) (\n    max_over_time(opensearch_node_role_bool{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", role=\"cluster_manager\"}[1m]) == 1\n) * 5\n",
+               "legendFormat": "{{node}}"
+            },
+            {
+               "datasource": {
+                  "type": "prometheus",
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "max by (node,role) (\n    max_over_time(opensearch_node_role_bool{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", role=\"remote_cluster_client\"}[1m]) == 1\n) * 6\n",
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Roles timeline",
+         "type": "status-history"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Top nodes by OS CPU usage across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "thresholds"
+               },
+               "mappings": [ ],
+               "max": 100,
+               "min": 0,
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "percent"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 9,
+            "w": 8,
+            "x": 0,
+            "y": 4
+         },
+         "id": 8,
+         "options": {
+            "displayMode": "gradient",
+            "minVizHeight": 10,
+            "minVizWidth": 0,
+            "orientation": "horizontal",
+            "reduceOptions": {
+               "calcs": [
+                  "lastNotNull"
+               ],
+               "fields": "",
+               "values": false
+            },
+            "showUnfilled": true
+         },
+         "pluginVersion": "9.4.3",
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "topk(10, sort_desc(sum by(node, job,opensearch_cluster) (opensearch_os_cpu_percent{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"})))",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Top nodes by CPU usage",
+         "type": "bargauge"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The total count of circuit breakers tripped across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "thresholds"
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "trips"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 9,
+            "w": 8,
+            "x": 8,
+            "y": 4
+         },
+         "id": 9,
+         "options": {
+            "displayMode": "gradient",
+            "minVizHeight": 10,
+            "minVizWidth": 0,
+            "orientation": "horizontal",
+            "reduceOptions": {
+               "calcs": [
+                  "lastNotNull"
+               ],
+               "fields": "",
+               "values": false
+            },
+            "showUnfilled": true
+         },
+         "pluginVersion": "9.4.3",
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by(job,opensearch_cluster, node) (increase(opensearch_circuitbreaker_tripped_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"}[$__interval:]))",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Breakers tripped",
+         "type": "bargauge"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Shard status counts across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "thresholds"
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "shards"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 9,
+            "w": 8,
+            "x": 16,
+            "y": 4
+         },
+         "id": 10,
+         "options": {
+            "displayMode": "gradient",
+            "minVizHeight": 10,
+            "minVizWidth": 0,
+            "orientation": "horizontal",
+            "reduceOptions": {
+               "calcs": [
+                  "lastNotNull"
+               ],
+               "fields": "",
+               "values": false
+            },
+            "showUnfilled": true
+         },
+         "pluginVersion": "9.4.3",
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "min by(type, job,opensearch_cluster) (opensearch_cluster_shards_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{type}}"
+            }
+         ],
+         "title": "Shard status",
+         "type": "bargauge"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Top nodes by disk usage across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "thresholds"
+               },
+               "mappings": [ ],
+               "max": 100,
+               "min": 0,
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "percent"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 10,
+            "w": 8,
+            "x": 0,
+            "y": 13
+         },
+         "id": 11,
+         "options": {
+            "displayMode": "gradient",
+            "minVizHeight": 10,
+            "minVizWidth": 0,
+            "orientation": "horizontal",
+            "reduceOptions": {
+               "calcs": [
+                  "lastNotNull"
+               ],
+               "fields": "",
+               "values": false
+            },
+            "showUnfilled": true
+         },
+         "pluginVersion": "9.4.3",
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "topk(10, sort_desc((100 * (sum by(node, job,opensearch_cluster) (opensearch_fs_path_total_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"})- sum by(node, job,opensearch_cluster) (opensearch_fs_path_free_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"})) / sum by(node, job,opensearch_cluster) (opensearch_fs_path_total_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"}))))",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Top nodes by disk usage",
+         "type": "bargauge"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The total count of documents indexed across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     }
+                  ]
+               },
+               "unit": "documents"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 5,
+            "w": 8,
+            "x": 8,
+            "y": 13
+         },
+         "id": 12,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster) (opensearch_indices_indexing_index_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{job}}/{{opensearch_cluster}}"
+            }
+         ],
+         "title": "Total documents",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The number of tasks waiting to be executed across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     }
+                  ]
+               },
+               "unit": "tasks"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 5,
+            "w": 8,
+            "x": 16,
+            "y": 13
+         },
+         "id": 13,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster) (opensearch_cluster_pending_tasks_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{job}}/{{opensearch_cluster}}"
+            }
+         ],
+         "title": "Pending tasks",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The total size of the store across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     }
+                  ]
+               },
+               "unit": "bytes"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 5,
+            "w": 8,
+            "x": 8,
+            "y": 18
+         },
+         "id": 14,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster) (opensearch_indices_store_size_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{job}}/{{opensearch_cluster}}"
+            }
+         ],
+         "title": "Store size",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The max wait time for tasks to be executed across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     }
+                  ]
+               },
+               "unit": "s"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 5,
+            "w": 8,
+            "x": 16,
+            "y": 18
+         },
+         "id": 15,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "max by(job,opensearch_cluster) (opensearch_cluster_task_max_waiting_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{job}}/{{opensearch_cluster}}"
+            }
+         ],
+         "title": "Max task wait time",
+         "type": "timeseries"
+      },
+      {
+         "collapsed": false,
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 23
+         },
+         "id": 16,
+         "targets": [ ],
+         "title": "Cluster search and index summary",
+         "type": "row"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Top indices by combined fetch, query, and scroll request rate across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "reqps"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 8,
+            "x": 0,
+            "y": 24
+         },
+         "id": 17,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "topk(10, sort_desc(avg by(index, job,opensearch_cluster) (\n  opensearch_index_search_fetch_current_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"} + \n  opensearch_index_search_query_current_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"} + \n  opensearch_index_search_scroll_current_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"}\n)))\n",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Top indices by request rate",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Top indices by combined fetch, query, and scroll latency across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     }
+                  ]
+               },
+               "unit": "s"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 8,
+            "x": 8,
+            "y": 24
+         },
+         "id": 18,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "topk(10, sort_desc(sum by(index, job,opensearch_cluster) ((increase(opensearch_index_search_fetch_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"}[$__interval:])\n+increase(opensearch_index_search_query_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"}[$__interval:])\n+increase(opensearch_index_search_scroll_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"}[$__interval:]))\n/ clamp_min(increase(opensearch_index_search_fetch_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"}[$__interval:])\n+increase(opensearch_index_search_query_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"}[$__interval:])\n+increase(opensearch_index_search_scroll_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"}[$__interval:]), 1))))\n",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Top indices by request latency",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Top indices by cache hit ratio for the combined request and query cache across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "percent"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 8,
+            "x": 16,
+            "y": 24
+         },
+         "id": 19,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "topk(10, sort_desc(avg by(index, job,opensearch_cluster) (\n  100 * (opensearch_index_requestcache_hit_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"} + \n  opensearch_index_querycache_hit_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"}) / \n  clamp_min((opensearch_index_requestcache_hit_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"} + \n  opensearch_index_querycache_hit_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"} + \n  opensearch_index_requestcache_miss_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"} + \n  opensearch_index_querycache_miss_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"}), 1\n  ))))\n",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Top indices by combined cache hit ratio",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Top nodes by rate of ingest across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "Bps"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 8,
+            "x": 0,
+            "y": 32
+         },
+         "id": 20,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "topk(10, sum by(node, job,opensearch_cluster) (rate(opensearch_ingest_total_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"}[$__rate_interval])))",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Top nodes by ingest rate",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Top nodes by ingestion latency across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "s"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 8,
+            "x": 8,
+            "y": 32
+         },
+         "id": 21,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "topk(10, sum by(job,opensearch_cluster, node) (\n  increase(opensearch_ingest_total_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"}[$__interval:]) / \n  clamp_min(increase(opensearch_ingest_total_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"}[$__interval:]), 1)))\n",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Top nodes by ingest latency",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Top nodes by ingestion failures across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     }
+                  ]
+               },
+               "unit": "errors"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 8,
+            "x": 16,
+            "y": 32
+         },
+         "id": 22,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "topk(10, sum by(job,opensearch_cluster, node) (increase(opensearch_ingest_total_failed_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"}[$__interval:])))",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{node}}"
+            }
+         ],
+         "title": "Top nodes by ingest errors",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Top indices by rate of document indexing across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     }
+                  ]
+               },
+               "unit": "documents/s"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 8,
+            "x": 0,
+            "y": 40
+         },
+         "id": 23,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "topk(10, avg by(job,opensearch_cluster, index) (opensearch_index_indexing_index_current_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"}))",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Top indices by index rate",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Top indices by indexing latency across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     }
+                  ]
+               },
+               "unit": "s"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 8,
+            "x": 8,
+            "y": 40
+         },
+         "id": 24,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "topk(10, avg by(job,opensearch_cluster, index) \n(increase(opensearch_index_indexing_index_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"}[$__interval:]) / \nclamp_min(increase(opensearch_index_indexing_index_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\", context=\"total\"}[$__interval:]), 1)))\n",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Top indices by index latency",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Top indices by document indexing failures across the OpenSearch cluster.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     }
+                  ]
+               },
+               "unit": "failures"
+            },
+            "overrides": [ ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 8,
+            "x": 16,
+            "y": 40
+         },
+         "id": 25,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "topk(10, avg by(job,opensearch_cluster, index) (increase(opensearch_index_indexing_index_failed_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"}[$__interval:])))",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Top indices by index failures",
+         "type": "timeseries"
+      }
+   ],
+   "refresh": "1m",
+   "schemaVersion": 36,
+   "tags": [
+      "opensearch-mixin"
+   ],
+   "templating": {
+      "list": [
+         {
+            "label": "Prometheus data source",
+            "name": "prometheus_datasource",
+            "query": "prometheus",
+            "regex": "",
+            "type": "datasource"
+         },
+         {
+            "allValue": ".+",
+            "datasource": {
+               "type": "prometheus",
+               "uid": "${prometheus_datasource}"
+            },
+            "includeAll": true,
+            "label": "Job",
+            "multi": true,
+            "name": "job",
+            "query": "label_values(opensearch_cluster_status{opensearch_cluster!=\"\"}, job)",
+            "refresh": 2,
+            "sort": 1,
+            "type": "query"
+         },
+         {
+            "allValue": ".+",
+            "datasource": {
+               "type": "prometheus",
+               "uid": "${prometheus_datasource}"
+            },
+            "includeAll": true,
+            "label": "OpenSearch cluster",
+            "multi": true,
+            "name": "opensearch_cluster",
+            "query": "label_values(opensearch_cluster_status{opensearch_cluster!=\"\",job=~\"$job\"}, opensearch_cluster)",
+            "refresh": 2,
+            "sort": 1,
+            "type": "query"
+         }
+      ]
+   },
+   "time": {
+      "from": "now-1h",
+      "to": "now"
+   },
+   "timezone": "default",
+   "title": "OpenSearch cluster overview",
+   "uid": "opensearch-cluster-overview"
+}
diff --git a/grafana/dashboards/infrastructure/search-and-index-overview.json b/grafana/dashboards/infrastructure/search-and-index-overview.json
new file mode 100644
index 0000000..6e5ce4a
--- /dev/null
+++ b/grafana/dashboards/infrastructure/search-and-index-overview.json
@@ -0,0 +1,2470 @@
+{
+   "links": [
+      {
+         "asDropdown": false,
+         "includeVars": true,
+         "keepTime": true,
+         "tags": [
+            "opensearch-mixin"
+         ],
+         "title": "Other OpenSearch dashboards",
+         "type": "dashboards"
+      }
+   ],
+   "panels": [
+      {
+         "collapsed": false,
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 0
+         },
+         "id": 1,
+         "targets": [ ],
+         "title": "Request performance",
+         "type": "row"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Rate of fetch, scroll, and query requests by selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "reqps"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 0,
+            "y": 1
+         },
+         "id": 2,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by (job,opensearch_cluster,index) (opensearch_index_search_query_current_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=~\"total\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - query"
+            },
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by (job,opensearch_cluster,index) (opensearch_index_search_fetch_current_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=~\"total\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - fetch"
+            },
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by (job,opensearch_cluster,index) (opensearch_index_search_scroll_current_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=~\"total\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - scroll"
+            }
+         ],
+         "title": "Request rate",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Latency of fetch, scroll, and query requests by selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "s"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 6,
+            "y": 1
+         },
+         "id": 3,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by (job,opensearch_cluster,index) (increase(opensearch_index_search_query_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\"}[$__interval:]) / clamp_min(increase(opensearch_index_search_query_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:]), 1))",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - query"
+            },
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by (job,opensearch_cluster,index) (increase(opensearch_index_search_fetch_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:]) / clamp_min(increase(opensearch_index_search_fetch_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:]), 1))",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - fetch"
+            },
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by (job,opensearch_cluster,index) (increase(opensearch_index_search_scroll_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:]) / clamp_min(increase(opensearch_index_search_scroll_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:]), 1))",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - scroll"
+            }
+         ],
+         "title": "Request latency",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Ratio of query cache and request cache hits and misses.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "percent"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 12,
+            "y": 1
+         },
+         "id": 4,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (100 * (opensearch_index_requestcache_hit_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}) / clamp_min(opensearch_index_requestcache_hit_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"} + opensearch_index_requestcache_miss_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}, 1))",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - request"
+            },
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (100 * (opensearch_index_querycache_hit_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}) / clamp_min(opensearch_index_querycache_hit_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"} + opensearch_index_querycache_miss_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}, 1))",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - query"
+            }
+         ],
+         "title": "Cache hit ratio",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Total evictions count by cache type for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "evictions"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 18,
+            "y": 1
+         },
+         "id": 5,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (increase(opensearch_index_querycache_evictions_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:]))",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - query cache"
+            },
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (increase(opensearch_index_requestcache_evictions_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:]))",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - request cache"
+            },
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (increase(opensearch_index_fielddata_evictions_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:]))",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - field data"
+            }
+         ],
+         "title": "Evictions",
+         "type": "timeseries"
+      },
+      {
+         "collapsed": false,
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 9
+         },
+         "id": 6,
+         "targets": [ ],
+         "title": "Index performance",
+         "type": "row"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Rate of indexed documents for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "documents/s"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 0,
+            "y": 10
+         },
+         "id": 7,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (opensearch_index_indexing_index_current_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Index rate",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Document indexing latency for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "s"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 6,
+            "y": 10
+         },
+         "id": 8,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (increase(opensearch_index_indexing_index_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=~\"total\"}[$__interval:]) / clamp_min(increase(opensearch_index_indexing_index_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=~\"total\"}[$__interval:]),1))",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Index latency",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Number of indexing failures for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "failures"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 12,
+            "y": 10
+         },
+         "id": 9,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (increase(opensearch_index_indexing_index_failed_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:]))",
+               "format": "time_series",
+               "interval": "1m",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Index failures",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Index flush latency for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "s"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 18,
+            "y": 10
+         },
+         "id": 10,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (increase(opensearch_index_flush_total_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:]) / clamp_min(increase(opensearch_index_flush_total_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:]),1))",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Flush latency",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Index merge time for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "points",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "s"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 0,
+            "y": 18
+         },
+         "id": 11,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (increase(opensearch_index_merges_total_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:])) > 0",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - total"
+            },
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (increase(opensearch_index_merges_total_stopped_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:])) > 0",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - stopped"
+            },
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (increase(opensearch_index_merges_total_throttled_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:])) > 0",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - throttled"
+            }
+         ],
+         "title": "Merge time",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Index refresh latency for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "s"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 6,
+            "y": 18
+         },
+         "id": 12,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (increase(opensearch_index_refresh_total_time_seconds{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:]) / clamp_min(increase(opensearch_index_refresh_total_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:]),1))",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Refresh latency",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Current number of translog operations for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "operations"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 12,
+            "y": 18
+         },
+         "id": 13,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (opensearch_index_translog_operations_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Translog operations",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Rate of documents deleted for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "documents/s"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 18,
+            "y": 18
+         },
+         "id": 14,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by (job,opensearch_cluster,index) (opensearch_index_indexing_delete_current_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Docs deleted",
+         "type": "timeseries"
+      },
+      {
+         "collapsed": false,
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 26
+         },
+         "id": 15,
+         "targets": [ ],
+         "title": "Index capacity",
+         "type": "row"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Number of indexed documents for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "documents"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 0,
+            "y": 27
+         },
+         "id": 16,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by (job,opensearch_cluster,index) (opensearch_index_indexing_index_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Documents indexed",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Current number of segments for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "segments"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 6,
+            "y": 27
+         },
+         "id": 17,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (opensearch_index_segments_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Segment count",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Number of merge operations for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "points",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "merges"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 12,
+            "y": 27
+         },
+         "id": 18,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (increase(opensearch_index_merges_total_docs_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}[$__interval:])) > 0",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Merge count",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Size of query cache and request cache.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "bytes"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 18,
+            "y": 27
+         },
+         "id": 19,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "multi",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (opensearch_index_querycache_memory_size_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - query"
+            },
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (opensearch_index_requestcache_memory_size_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}} - request"
+            }
+         ],
+         "title": "Cache size",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Size of the store in bytes for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "bytes"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 0,
+            "y": 35
+         },
+         "id": 20,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (opensearch_index_store_size_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Store size",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Memory used by segments for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "bytes"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 6,
+            "y": 35
+         },
+         "id": 21,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (opensearch_index_segments_memory_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"})",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Segment size",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "Size of merge operations in bytes for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "points",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "bytes"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 12,
+            "y": 35
+         },
+         "id": 22,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "avg by(job,opensearch_cluster,index) (opensearch_index_merges_current_size_bytes{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", context=\"total\"}) > 0",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Merge size",
+         "type": "timeseries"
+      },
+      {
+         "datasource": {
+            "uid": "${prometheus_datasource}"
+         },
+         "description": "The number of index shards for the selected index.",
+         "fieldConfig": {
+            "defaults": {
+               "color": {
+                  "mode": "palette-classic"
+               },
+               "custom": {
+                  "axisCenteredZero": false,
+                  "axisColorMode": "text",
+                  "axisLabel": "",
+                  "axisPlacement": "auto",
+                  "barAlignment": 0,
+                  "drawStyle": "line",
+                  "fillOpacity": 0,
+                  "gradientMode": "none",
+                  "hideFrom": {
+                     "legend": false,
+                     "tooltip": false,
+                     "viz": false
+                  },
+                  "lineInterpolation": "linear",
+                  "lineWidth": 1,
+                  "pointSize": 5,
+                  "scaleDistribution": {
+                     "type": "linear"
+                  },
+                  "showPoints": "auto",
+                  "spanNulls": false,
+                  "stacking": {
+                     "group": "A",
+                     "mode": "none"
+                  },
+                  "thresholdsStyle": {
+                     "mode": "off"
+                  }
+               },
+               "mappings": [ ],
+               "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                     {
+                        "color": "green",
+                        "value": null
+                     },
+                     {
+                        "color": "red",
+                        "value": 80
+                     }
+                  ]
+               },
+               "unit": "shards"
+            },
+            "overrides": [
+               {
+                  "matcher": {
+                     "id": "byValue",
+                     "options": {
+                        "op": "gte",
+                        "reducer": "allIsZero",
+                        "value": 0
+                     }
+                  },
+                  "properties": [
+                     {
+                        "id": "custom.hideFrom",
+                        "value": {
+                           "legend": true,
+                           "tooltip": true,
+                           "viz": false
+                        }
+                     }
+                  ]
+               }
+            ]
+         },
+         "gridPos": {
+            "h": 8,
+            "w": 6,
+            "x": 18,
+            "y": 35
+         },
+         "id": 23,
+         "options": {
+            "legend": {
+               "calcs": [ ],
+               "displayMode": "list",
+               "placement": "bottom",
+               "showLegend": true
+            },
+            "tooltip": {
+               "mode": "single",
+               "sort": "none"
+            }
+         },
+         "targets": [
+            {
+               "datasource": {
+                  "uid": "${prometheus_datasource}"
+               },
+               "expr": "sum by (index) (avg by(job,opensearch_cluster,index) (opensearch_index_shards_number{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\",index=~\"$index\", type=~\"active|active_primary\"}))",
+               "format": "time_series",
+               "intervalFactor": 2,
+               "legendFormat": "{{index}}"
+            }
+         ],
+         "title": "Shard count",
+         "type": "timeseries"
+      }
+   ],
+   "refresh": "1m",
+   "schemaVersion": 36,
+   "tags": [
+      "opensearch-mixin"
+   ],
+   "templating": {
+      "list": [
+         {
+            "label": "Prometheus data source",
+            "name": "prometheus_datasource",
+            "query": "prometheus",
+            "regex": "",
+            "type": "datasource"
+         },
+         {
+            "allValue": ".+",
+            "datasource": {
+               "type": "prometheus",
+               "uid": "${prometheus_datasource}"
+            },
+            "includeAll": true,
+            "label": "Job",
+            "multi": true,
+            "name": "job",
+            "query": "label_values(opensearch_index_search_fetch_count{opensearch_cluster!=\"\"}, job)",
+            "refresh": 2,
+            "sort": 1,
+            "type": "query"
+         },
+         {
+            "allValue": ".+",
+            "datasource": {
+               "type": "prometheus",
+               "uid": "${prometheus_datasource}"
+            },
+            "includeAll": true,
+            "label": "Opensearch_cluster",
+            "multi": true,
+            "name": "opensearch_cluster",
+            "query": "label_values(opensearch_index_search_fetch_count{opensearch_cluster!=\"\",job=~\"$job\"}, opensearch_cluster)",
+            "refresh": 2,
+            "sort": 1,
+            "type": "query"
+         },
+         {
+            "allValue": ".+",
+            "datasource": {
+               "type": "prometheus",
+               "uid": "${prometheus_datasource}"
+            },
+            "includeAll": true,
+            "label": "Index",
+            "multi": true,
+            "name": "index",
+            "query": "label_values(opensearch_index_search_fetch_count{opensearch_cluster!=\"\",job=~\"$job\",opensearch_cluster=~\"$opensearch_cluster\"}, index)",
+            "refresh": 2,
+            "sort": 1,
+            "type": "query"
+         }
+      ]
+   },
+   "time": {
+      "from": "now-1h",
+      "to": "now"
+   },
+   "timezone": "default",
+   "title": "OpenSearch search and index overview",
+   "uid": "opensearch-search-and-index-overview"
+}
diff --git a/prometheus/opensearch.rec.rules b/prometheus/opensearch.rec.rules
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/prometheus/opensearch.rec.rules
@@ -0,0 +1 @@
+{}
diff --git a/prometheus/opensearch.rules b/prometheus/opensearch.rules
new file mode 100644
index 0000000..4aa4ade
--- /dev/null
+++ b/prometheus/opensearch.rules
@@ -0,0 +1,121 @@
+"groups":
+- "name": "opensearch-alerts"
+  "rules":
+  - "alert": "OpenSearchYellowCluster"
+    "annotations":
+      "description": "{{$labels.cluster}} health status is yellow over the last 5 minutes"
+      "summary": "At least one of the clusters is reporting a yellow status."
+    "expr": |
+      opensearch_cluster_status{opensearch_cluster!=""} == 1
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+  - "alert": "OpenSearchRedCluster"
+    "annotations":
+      "description": "{{$labels.cluster}} health status is red over the last 5 minutes"
+      "summary": "At least one of the clusters is reporting a red status."
+    "expr": |
+      opensearch_cluster_status{opensearch_cluster!=""} == 2
+    "for": "5m"
+    "labels":
+      "severity": "critical"
+  - "alert": "OpenSearchUnstableShardReallocation"
+    "annotations":
+      "description": |
+        {{$labels.cluster}} has had {{ printf "%.0f" $value }} shard reallocations over the last 1m which is above the threshold of 0.
+      "summary": "A node has gone offline or has been disconnected triggering shard reallocation."
+    "expr": |
+      sum without(type) (opensearch_cluster_shards_number{opensearch_cluster!="", type="relocating"}) > 0
+    "for": "1m"
+    "labels":
+      "severity": "warning"
+  - "alert": "OpenSearchUnstableShardUnassigned"
+    "annotations":
+      "description": |
+        {{$labels.cluster}} has had {{ printf "%.0f" $value }} unassigned shards over the last 5m which is above the threshold of 0.
+      "summary": "There are shards that have been detected as unassigned."
+    "expr": |
+      sum without(type) (opensearch_cluster_shards_number{opensearch_cluster!="", type="unassigned"}) > 0
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+  - "alert": "OpenSearchHighNodeDiskUsage"
+    "annotations":
+      "description": |
+        {{$labels.node}} has had {{ printf "%.0f" $value }}% disk usage over the last 5m which is above the threshold of 60.
+      "summary": "The node disk usage has exceeded the warning threshold."
+    "expr": |
+      100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{opensearch_cluster!=""} - opensearch_fs_path_free_bytes{opensearch_cluster!=""}) / opensearch_fs_path_total_bytes{opensearch_cluster!=""}) > 60
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+  - "alert": "OpenSearchHighNodeDiskUsage"
+    "annotations":
+      "description": |
+        {{$labels.node}} has had {{ printf "%.0f" $value }}% disk usage over the last 5m which is above the threshold of 80.
+      "summary": "The node disk usage has exceeded the critical threshold."
+    "expr": |
+      100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{opensearch_cluster!=""} - opensearch_fs_path_free_bytes) / opensearch_fs_path_total_bytes{opensearch_cluster!=""}) > 80
+    "for": "5m"
+    "labels":
+      "severity": "critical"
+  - "alert": "OpenSearchHighNodeCpuUsage"
+    "annotations":
+      "description": |
+        {{$labels.node}} has had {{ printf "%.0f" $value }}% CPU usage over the last 5m which is above the threshold of 70.
+      "summary": "The node CPU usage has exceeded the warning threshold."
+    "expr": |
+      sum without(nodeid) (opensearch_os_cpu_percent{opensearch_cluster!=""}) > 70
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+  - "alert": "OpenSearchHighNodeCpuUsage"
+    "annotations":
+      "description": |
+        {{$labels.node}} has had {{ printf "%.0f" $value }}% CPU usage over the last 5m which is above the threshold of 85.
+      "summary": "The node CPU usage has exceeded the critical threshold."
+    "expr": |
+      sum without(nodeid) (opensearch_os_cpu_percent{opensearch_cluster!=""}) > 85
+    "for": "5m"
+    "labels":
+      "severity": "critical"
+  - "alert": "OpenSearchHighNodeMemoryUsage"
+    "annotations":
+      "description": |
+        {{$labels.node}} has had {{ printf "%.0f" $value }}% memory usage over the last 5m which is above the threshold of 70.
+      "summary": "The node memory usage has exceeded the warning threshold."
+    "expr": |
+      sum without(nodeid) (opensearch_os_mem_used_percent{opensearch_cluster!=""}) > 70
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+  - "alert": "OpenSearchHighNodeMemoryUsage"
+    "annotations":
+      "description": |
+        {{$labels.node}} has had {{ printf "%.0f" $value }}% memory usage over the last 5m which is above the threshold of 85.
+      "summary": "The node memory usage has exceeded the critical threshold."
+    "expr": |
+      sum without(nodeid) (opensearch_os_mem_used_percent{opensearch_cluster!=""}) > 85
+    "for": "5m"
+    "labels":
+      "severity": "critical"
+  - "alert": "OpenSearchModerateRequestLatency"
+    "annotations":
+      "description": |
+        {{$labels.index}} has had {{ printf "%.2f" $value }}s of request latency over the last 5m which is above the threshold of 0.5.
+      "summary": "The request latency has exceeded the warning threshold."
+    "expr": |
+      sum without(context) ((increase(opensearch_index_search_fetch_time_seconds{opensearch_cluster!="", context="total"}[5m])+increase(opensearch_index_search_query_time_seconds{context="total"}[5m])+increase(opensearch_index_search_scroll_time_seconds{context="total"}[5m])) / clamp_min(increase(opensearch_index_search_fetch_count{context="total"}[5m])+increase(opensearch_index_search_query_count{context="total"}[5m])+increase(opensearch_index_search_scroll_count{context="total"}[5m]), 1)) > 0.5
+    "for": "5m"
+    "labels":
+      "severity": "warning"
+  - "alert": "OpenSearchModerateIndexLatency"
+    "annotations":
+      "description": |
+        {{$labels.index}} has had {{ printf "%.2f" $value }}s of index latency over the last 5m which is above the threshold of 0.5.
+      "summary": "The index latency has exceeded the warning threshold."
+    "expr": |
+      sum without(context) (increase(opensearch_index_indexing_index_time_seconds{opensearch_cluster!="", context="total"}[5m]) / clamp_min(increase(opensearch_index_indexing_index_count{context="total"}[5m]), 1)) > 0.5
+    "for": "5m"
+    "labels":
+      "severity": "warning"