From d8dad8b1374954bb7ded53faef1dae25c17aaac8 Mon Sep 17 00:00:00 2001 From: Teddy Andrieux Date: Fri, 18 Jun 2021 10:48:16 +0200 Subject: [PATCH] charts,salt,build: Bump kube-prometheus-stack version to 16.9.1 Update kube-prometheus-stack chart to 16.9.1 ``` rm -rf charts/kube-prometheus-stack helm repo add prometheus-community https://prometheus-community.github.io/helm-charts helm repo update helm fetch -d charts --untar prometheus-community/kube-prometheus-stack ``` Re-render chart to salt state using ``` ./charts/render.py prometheus-operator \ charts/kube-prometheus-stack.yaml \ charts/kube-prometheus-stack/ \ --namespace metalk8s-monitoring \ --service-config grafana \ metalk8s-grafana-config \ metalk8s/addons/prometheus-operator/config/grafana.yaml \ metalk8s-monitoring \ --service-config prometheus \ metalk8s-prometheus-config \ metalk8s/addons/prometheus-operator/config/prometheus.yaml \ metalk8s-monitoring \ --service-config alertmanager \ metalk8s-alertmanager-config \ metalk8s/addons/prometheus-operator/config/alertmanager.yaml \ metalk8s-monitoring \ --service-config dex \ metalk8s-dex-config \ metalk8s/addons/dex/config/dex.yaml.j2 metalk8s-auth \ --drop-prometheus-rules charts/drop-prometheus-rules.yaml \ > salt/metalk8s/addons/prometheus-operator/deployed/chart.sls ``` Update vendored rules Update alert rules extract ``` ./tools/rule_extractor/rule_extractor.py \ -i -p 8443 -t rules ``` --- CHANGELOG.md | 19 +- buildchain/buildchain/constants.py | 4 +- buildchain/buildchain/image.py | 6 +- buildchain/buildchain/versions.py | 28 +- charts/kube-prometheus-stack/Chart.lock | 10 +- charts/kube-prometheus-stack/Chart.yaml | 15 +- charts/kube-prometheus-stack/README.md | 7 +- .../charts/grafana/Chart.yaml | 4 +- .../charts/grafana/README.md | 4 +- .../charts/grafana/templates/_helpers.tpl | 13 + .../charts/grafana/templates/_pod.tpl | 12 +- .../charts/grafana/templates/deployment.yaml | 4 +- .../charts/grafana/templates/hpa.yaml | 20 + .../charts/grafana/templates/secret.yaml | 4 +- .../charts/grafana/templates/statefulset.yaml | 2 +- .../charts/grafana/values.yaml | 22 +- .../charts/kube-state-metrics/Chart.yaml | 7 +- .../charts/kube-state-metrics/README.md | 34 +- .../templates/deployment.yaml | 63 +- .../kube-state-metrics/templates/role.yaml | 12 +- .../templates/rolebinding.yaml | 2 +- .../charts/kube-state-metrics/values.yaml | 8 +- .../prometheus-node-exporter/Chart.yaml | 2 +- .../templates/psp.yaml | 4 + .../prometheus-node-exporter/values.yaml | 1 + .../crds/crd-alertmanagerconfigs.yaml | 2 +- .../crds/crd-alertmanagers.yaml | 2 +- .../crds/crd-podmonitors.yaml | 2 +- .../crds/crd-probes.yaml | 2 +- .../crds/crd-prometheuses.yaml | 2 +- .../crds/crd-prometheusrules.yaml | 2 +- .../crds/crd-servicemonitors.yaml | 2 +- .../crds/crd-thanosrulers.yaml | 2 +- .../templates/_helpers.tpl | 13 +- .../templates/alertmanager/ingress.yaml | 5 +- .../alertmanager/ingressperreplica.yaml | 5 +- .../kube-api-server/servicemonitor.yaml | 4 +- .../exporters/kubelet/servicemonitor.yaml | 22 +- .../node-exporter/servicemonitor.yaml | 9 +- .../grafana/configmap-dashboards.yaml | 4 +- .../grafana/configmaps-datasources.yaml | 6 +- .../alertmanager-overview.yaml} | 310 +- .../grafana/dashboards-1.14/apiserver.yaml | 4 +- .../dashboards-1.14/cluster-total.yaml | 6 +- .../dashboards-1.14/controller-manager.yaml | 56 +- .../grafana/dashboards-1.14/etcd.yaml | 6 +- .../grafana/dashboards-1.14/k8s-coredns.yaml | 2 +- .../k8s-resources-cluster.yaml | 588 +- .../k8s-resources-namespace.yaml | 584 +- .../dashboards-1.14/k8s-resources-node.yaml | 28 +- .../dashboards-1.14/k8s-resources-pod.yaml | 863 +- .../k8s-resources-workload.yaml | 114 +- .../k8s-resources-workloads-namespace.yaml | 158 +- .../grafana/dashboards-1.14/kubelet.yaml | 4139 ++++--- .../dashboards-1.14/namespace-by-pod.yaml | 8 +- .../namespace-by-workload.yaml | 10 +- .../node-cluster-rsrc-use.yaml | 38 +- .../dashboards-1.14/node-rsrc-use.yaml | 38 +- .../grafana/dashboards-1.14/nodes.yaml | 22 +- .../persistentvolumesusage.yaml | 16 +- .../grafana/dashboards-1.14/pod-total.yaml | 10 +- .../prometheus-remote-write.yaml | 4 +- .../grafana/dashboards-1.14/prometheus.yaml | 28 +- .../grafana/dashboards-1.14/proxy.yaml | 60 +- .../grafana/dashboards-1.14/scheduler.yaml | 84 +- .../grafana/dashboards-1.14/statefulset.yaml | 4 +- .../dashboards-1.14/workload-total.yaml | 12 +- .../templates/grafana/dashboards/etcd.yaml | 1116 -- .../dashboards/k8s-cluster-rsrc-use.yaml | 959 -- .../grafana/dashboards/k8s-node-rsrc-use.yaml | 986 -- .../dashboards/k8s-resources-cluster.yaml | 1479 --- .../dashboards/k8s-resources-namespace.yaml | 963 -- .../grafana/dashboards/k8s-resources-pod.yaml | 1006 -- .../dashboards/k8s-resources-workload.yaml | 936 -- .../k8s-resources-workloads-namespace.yaml | 972 -- .../templates/grafana/dashboards/nodes.yaml | 1383 --- .../dashboards/persistentvolumesusage.yaml | 573 - .../grafana/dashboards/statefulset.yaml | 926 -- .../job-patch/job-createSecret.yaml | 6 +- .../job-patch/job-patchWebhook.yaml | 6 +- .../mutatingWebhookConfiguration.yaml | 4 +- .../validatingWebhookConfiguration.yaml | 4 +- .../templates/prometheus/ingress.yaml | 5 +- .../prometheus/ingressThanosSidecar.yaml | 5 +- .../prometheus/ingressperreplica.yaml | 5 +- .../prometheus/podDisruptionBudget.yaml | 2 +- .../templates/prometheus/prometheus.yaml | 4 +- .../rules-1.14/alertmanager.rules.yaml | 147 +- .../prometheus/rules-1.14/general.rules.yaml | 10 +- .../prometheus/rules-1.14/k8s.rules.yaml | 6 +- .../kube-apiserver-availability.rules.yaml | 90 +- .../rules-1.14/kube-apiserver-slos.yaml | 2 +- .../rules-1.14/kube-apiserver.rules.yaml | 150 +- .../kube-prometheus-general.rules.yaml | 2 +- .../kube-prometheus-node-recording.rules.yaml | 2 +- .../rules-1.14/kube-scheduler.rules.yaml | 2 +- .../rules-1.14/kube-state-metrics.yaml | 30 +- .../prometheus/rules-1.14/kubelet.rules.yaml | 2 +- .../rules-1.14/kubernetes-apps.yaml | 4 +- .../rules-1.14/kubernetes-resources.yaml | 2 +- .../rules-1.14/kubernetes-storage.yaml | 2 +- .../kubernetes-system-apiserver.yaml | 2 +- .../kubernetes-system-controller-manager.yaml | 2 +- .../rules-1.14/kubernetes-system-kubelet.yaml | 2 +- .../kubernetes-system-scheduler.yaml | 2 +- .../rules-1.14/kubernetes-system.yaml | 2 +- .../rules-1.14/node-exporter.rules.yaml | 34 +- .../prometheus/rules-1.14/node-exporter.yaml | 24 +- .../prometheus/rules-1.14/node-network.yaml | 5 +- .../prometheus/rules-1.14/node.rules.yaml | 2 +- .../rules-1.14/prometheus-operator.yaml | 2 +- .../prometheus/rules-1.14/prometheus.yaml | 85 +- .../prometheus/rules/alertmanager.rules.yaml | 63 - .../templates/prometheus/rules/etcd.yaml | 179 - .../prometheus/rules/general.rules.yaml | 56 - .../templates/prometheus/rules/k8s.rules.yaml | 83 - .../rules/kube-apiserver.rules.yaml | 39 - .../kube-prometheus-node-alerting.rules.yaml | 47 - .../kube-prometheus-node-recording.rules.yaml | 41 - .../rules/kube-scheduler.rules.yaml | 63 - .../prometheus/rules/kubernetes-absent.yaml | 159 - .../prometheus/rules/kubernetes-apps.yaml | 200 - .../rules/kubernetes-resources.yaml | 121 - .../prometheus/rules/kubernetes-storage.yaml | 72 - .../prometheus/rules/kubernetes-system.yaml | 184 - .../prometheus/rules/node-network.yaml | 57 - .../templates/prometheus/rules/node-time.yaml | 37 - .../prometheus/rules/node.rules.yaml | 202 - .../prometheus/rules/prometheus-operator.yaml | 49 - .../prometheus/rules/prometheus.rules.yaml | 139 - .../templates/prometheus/service.yaml | 2 +- .../prometheus/serviceThanosSidecar.yaml | 2 +- .../serviceThanosSidecarExternal.yaml | 2 +- .../prometheus/serviceperreplica.yaml | 2 +- charts/kube-prometheus-stack/values.yaml | 54 +- .../config/prometheus.yaml | 4 +- .../prometheus-operator/deployed/chart.sls | 9679 ++++++++++------- .../deployed/prometheus-rules.sls | 13 +- tools/rule_extractor/alerting_rules.csv | 29 +- tools/rule_extractor/alerting_rules.json | 90 +- tools/rule_extractor/rules.json | 1577 +-- 141 files changed, 11678 insertions(+), 21103 deletions(-) create mode 100644 charts/kube-prometheus-stack/charts/grafana/templates/hpa.yaml rename charts/kube-prometheus-stack/templates/grafana/{dashboards/pods.yaml => dashboards-1.14/alertmanager-overview.yaml} (63%) delete mode 100644 charts/kube-prometheus-stack/templates/grafana/dashboards/etcd.yaml delete mode 100644 charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-cluster-rsrc-use.yaml delete mode 100644 charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-node-rsrc-use.yaml delete mode 100644 charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-cluster.yaml delete mode 100644 charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-namespace.yaml delete mode 100644 charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-pod.yaml delete mode 100644 charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-workload.yaml delete mode 100644 charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-workloads-namespace.yaml delete mode 100644 charts/kube-prometheus-stack/templates/grafana/dashboards/nodes.yaml delete mode 100644 charts/kube-prometheus-stack/templates/grafana/dashboards/persistentvolumesusage.yaml delete mode 100644 charts/kube-prometheus-stack/templates/grafana/dashboards/statefulset.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/alertmanager.rules.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/etcd.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/general.rules.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/k8s.rules.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/kube-apiserver.rules.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/kube-scheduler.rules.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-absent.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-apps.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-resources.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-storage.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-system.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/node-network.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/node-time.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/node.rules.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/prometheus-operator.yaml delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules/prometheus.rules.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 401887eab3..124e5853eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,17 +28,16 @@ dex image from v2.27.0 to v2.28.1 (PR[#3370](https://github.com/scality/metalk8s/pull/3370)) -- [#3279](https://github.com/scality/metalk8s/issues/3279) - Bump - kube-prometheus-stack charts version from 12.2.3 to 15.4.4. +- Bump kube-prometheus-stack charts version to 16.9.1 The following images have also been bumped accordingly: - - grafana from 7.3.5 to 7.5.5 - - k8s-sidecar from 1.1.0 to 1.10.7 - - kube-state-metrics from v1.9.7 to v1.9.8 - - node-exporter from v1.0.1 to v1.1.2 - - prometheus from v2.22.1 to v2.26.0 - - prometheus-config-reloader fomr v0.43.2 to v0.47.0 - - prometheus-operator from v0.43.2 to v0.47.0 - (PR[#3360](https://github.com/scality/metalk8s/pull/3360)) + - grafana to 8.0.1 + - k8s-sidecar to 1.12.2 + - kube-state-metrics to v2.0.0 + - node-exporter to v1.1.2 + - prometheus to v2.27.1 + - prometheus-config-reloader to v0.48.1 + - prometheus-operator to v0.48.1 + (PR[#3422](https://github.com/scality/metalk8s/pull/3422)) - [#3279](https://github.com/scality/metalk8s/issues/3279) - Bump ingress-nginx chart version from 3.13.0 to 3.30.0 diff --git a/buildchain/buildchain/constants.py b/buildchain/buildchain/constants.py index 59b376436b..6ac38e1d8a 100644 --- a/buildchain/buildchain/constants.py +++ b/buildchain/buildchain/constants.py @@ -19,13 +19,13 @@ # URLs of the main container repositories. CALICO_REPOSITORY: str = "docker.io/calico" COREDNS_REPOSITORY: str = "k8s.gcr.io/coredns" -COREOS_REPOSITORY: str = "quay.io/coreos" DEX_REPOSITORY: str = "ghcr.io/dexidp" DOCKER_REPOSITORY: str = "docker.io/library" GOOGLE_REPOSITORY: str = "k8s.gcr.io" GRAFANA_REPOSITORY: str = "docker.io/grafana" INGRESS_REPOSITORY: str = "k8s.gcr.io/ingress-nginx" -KIWIGRID_REPOSITORY: str = "docker.io/kiwigrid" +KIWIGRID_REPOSITORY: str = "quay.io/kiwigrid" +KUBE_STATE_METRICS_REPOSITORY: str = "k8s.gcr.io/kube-state-metrics" PROMETHEUS_ADAPTER_REPOSITORY: str = "docker.io/directxman12" PROMETHEUS_OPERATOR_REPOSITORY: str = "quay.io/prometheus-operator" PROMETHEUS_REPOSITORY: str = "quay.io/prometheus" diff --git a/buildchain/buildchain/image.py b/buildchain/buildchain/image.py index 756f6e05d6..1b74ee2f1e 100644 --- a/buildchain/buildchain/image.py +++ b/buildchain/buildchain/image.py @@ -181,9 +181,6 @@ def _operator_image(name: str, **kwargs: Any) -> targets.OperatorImage: constants.COREDNS_REPOSITORY: [ "coredns", ], - constants.COREOS_REPOSITORY: [ - "kube-state-metrics", - ], constants.DEX_REPOSITORY: [ "dex", ], @@ -209,6 +206,9 @@ def _operator_image(name: str, **kwargs: Any) -> targets.OperatorImage: constants.KIWIGRID_REPOSITORY: [ "k8s-sidecar", ], + constants.KUBE_STATE_METRICS_REPOSITORY: [ + "kube-state-metrics", + ], constants.PROMETHEUS_ADAPTER_REPOSITORY: [ "k8s-prometheus-adapter-amd64", ], diff --git a/buildchain/buildchain/versions.py b/buildchain/buildchain/versions.py index d39d99ed5b..08e9c1f9f4 100644 --- a/buildchain/buildchain/versions.py +++ b/buildchain/buildchain/versions.py @@ -92,8 +92,8 @@ def _version_prefix(version: str, prefix: str = "v") -> str: # Remote images Image( name="alertmanager", - version="v0.21.0", - digest="sha256:24a5204b418e8fa0214cfb628486749003b039c279c56b5bddb5b10cd100d926", + version="v0.22.2", + digest="sha256:624c1a5063c7c80635081a504c3e1b020d89809651978eb5d0b652a394f3022d", ), Image( name="calico-node", @@ -122,13 +122,13 @@ def _version_prefix(version: str, prefix: str = "v") -> str: ), Image( name="grafana", - version="7.5.5", - digest="sha256:075819791b43f30aab18497fff217d3aec918d7d55bf873818de6a916da04d54", + version="8.0.1", + digest="sha256:1c3e2fc7896adf9e33be5d062c08066087cb556f63b0a95f8aefe92bd37a6f38", ), Image( name="k8s-sidecar", - version="1.10.7", - digest="sha256:18feb3906286814364b2f42eeaf82649b4847a4d0a779222a613c79c9da7ad87", + version="1.12.2", + digest="sha256:ca760f94b35eb78575b170e41d1e19e27359b29245dacfd1c42ae90452ecc08e", ), Image( name="kube-apiserver", @@ -152,8 +152,8 @@ def _version_prefix(version: str, prefix: str = "v") -> str: ), Image( name="kube-state-metrics", - version="v1.9.8", - digest="sha256:47d3a12d9da6699a9d95df8aaff235305229ef08203fae3fc1f1d47b2a409f89", + version="v2.0.0", + digest="sha256:eb2f41024a583e8795213726099c6f9432f2d64ab3754cc8ab8d00bdbc328910", ), Image( name="nginx", @@ -182,8 +182,8 @@ def _version_prefix(version: str, prefix: str = "v") -> str: ), Image( name="prometheus", - version="v2.26.0", - digest="sha256:38d40a760569b1c5aec4a36e8a7f11e86299e9191b9233672a5d41296d8fa74e", + version="v2.27.1", + digest="sha256:5accb68b56ba452e449a5e552411acaeabbbe0f087acf19a1157ce3dd10a8bed", ), Image( name="k8s-prometheus-adapter-amd64", @@ -192,13 +192,13 @@ def _version_prefix(version: str, prefix: str = "v") -> str: ), Image( name="prometheus-config-reloader", - version="v0.47.0", - digest="sha256:0029252e7cf8cf38fc58795928d4e1c746b9e609432a2ee5417a9cab4633b864", + version="v0.48.1", + digest="sha256:9bed3dae8023c7a83b906c6c8abd92900697fb3806f14b7f2dbfbc37fe4b7941", ), Image( name="prometheus-operator", - version="v0.47.0", - digest="sha256:89a2d121b1a8f9a4a45dd20fdcf081a4468a0a0ad4e0cbe1aa7fd289e5a85cb3", + version="v0.48.1", + digest="sha256:2e7b61c86ee8b0aef4f5da8b6a4e51ecef249c9ccf4a329c5aa0c81e3fd074c1", ), # Local images Image( diff --git a/charts/kube-prometheus-stack/Chart.lock b/charts/kube-prometheus-stack/Chart.lock index 1af21766c7..373cf3a60a 100644 --- a/charts/kube-prometheus-stack/Chart.lock +++ b/charts/kube-prometheus-stack/Chart.lock @@ -1,12 +1,12 @@ dependencies: - name: kube-state-metrics repository: https://prometheus-community.github.io/helm-charts - version: 2.13.2 + version: 3.1.1 - name: prometheus-node-exporter repository: https://prometheus-community.github.io/helm-charts - version: 1.17.0 + version: 1.18.1 - name: grafana repository: https://grafana.github.io/helm-charts - version: 6.8.3 -digest: sha256:0891efa2b27146cbbde43ec58903f634f65153a7a19b811566bfbc276bf85891 -generated: "2021-05-05T00:44:27.452086517Z" + version: 6.13.0 +digest: sha256:3ca182deee748d72f0ad4d2a984c45eb1d2685018dc29d06a784e93cb281f776 +generated: "2021-06-17T14:10:55.043306+02:00" diff --git a/charts/kube-prometheus-stack/Chart.yaml b/charts/kube-prometheus-stack/Chart.yaml index 54a73d5968..e3bdf79354 100644 --- a/charts/kube-prometheus-stack/Chart.yaml +++ b/charts/kube-prometheus-stack/Chart.yaml @@ -6,21 +6,24 @@ annotations: url: https://github.com/prometheus-operator/kube-prometheus artifacthub.io/operator: "true" apiVersion: v2 -appVersion: 0.47.0 +appVersion: 0.48.1 dependencies: - condition: kubeStateMetrics.enabled name: kube-state-metrics repository: https://prometheus-community.github.io/helm-charts - version: 2.13.* + version: 3.1.* - condition: nodeExporter.enabled name: prometheus-node-exporter repository: https://prometheus-community.github.io/helm-charts - version: 1.17.* + version: 1.18.* - condition: grafana.enabled name: grafana repository: https://grafana.github.io/helm-charts - version: 6.8.* -description: kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator. + version: 6.13.* +description: kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards, + and Prometheus rules combined with documentation and scripts to provide easy to + operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus + Operator. home: https://github.com/prometheus-operator/kube-prometheus icon: https://raw.githubusercontent.com/prometheus/prometheus.github.io/master/assets/prometheus_logo-cb55bb5c346.png keywords: @@ -44,4 +47,4 @@ sources: - https://github.com/prometheus-community/helm-charts - https://github.com/prometheus-operator/kube-prometheus type: application -version: 15.4.4 +version: 16.9.1 diff --git a/charts/kube-prometheus-stack/README.md b/charts/kube-prometheus-stack/README.md index eeaad6b9d6..747c2b9779 100644 --- a/charts/kube-prometheus-stack/README.md +++ b/charts/kube-prometheus-stack/README.md @@ -35,7 +35,7 @@ _See [helm install](https://helm.sh/docs/helm/helm_install/) for command documen By default this chart installs additional, dependent charts: -- [kubernetes/kube-state-metrics](https://github.com/kubernetes/kube-state-metrics/tree/master/charts/kube-state-metrics) +- [prometheus-community/kube-state-metrics](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics) - [prometheus-community/prometheus-node-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter) - [grafana/grafana](https://github.com/grafana/helm-charts/tree/main/charts/grafana) @@ -83,6 +83,9 @@ _See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade/) for command documen A major chart version change (like v1.2.3 -> v2.0.0) indicates that there is an incompatible breaking change needing manual actions. +### From 15.x to 16.x +Version 16 upgrades kube-state-metrics to v2.0.0. This includes changed command-line arguments and removed metrics, see this [blog post](https://kubernetes.io/blog/2021/04/13/kube-state-metrics-v-2-0/). This version also removes Grafana dashboards that supported Kubernetes 1.14 or earlier. + ### From 14.x to 15.x Version 15 upgrades prometheus-operator from 0.46.x to 0.47.x. Helm does not automatically upgrade or install new CRDs on a chart upgrade, so you have to install the CRDs manually before updating: @@ -392,7 +395,7 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: labels: - app: prometheus + app.kubernetes.io/name: prometheus prometheus: prometheus-migration-prometheus name: prometheus-prometheus-migration-prometheus-db-prometheus-prometheus-migration-prometheus-0 namespace: monitoring diff --git a/charts/kube-prometheus-stack/charts/grafana/Chart.yaml b/charts/kube-prometheus-stack/charts/grafana/Chart.yaml index 167090c60d..204c2bd365 100644 --- a/charts/kube-prometheus-stack/charts/grafana/Chart.yaml +++ b/charts/kube-prometheus-stack/charts/grafana/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 -appVersion: 7.5.5 +appVersion: 8.0.1 description: The leading tool for querying and visualizing time series and metrics. home: https://grafana.net icon: https://raw.githubusercontent.com/grafana/grafana/master/public/img/logo_transparent_400x.png @@ -19,4 +19,4 @@ name: grafana sources: - https://github.com/grafana/grafana type: application -version: 6.8.3 +version: 6.13.0 diff --git a/charts/kube-prometheus-stack/charts/grafana/README.md b/charts/kube-prometheus-stack/charts/grafana/README.md index 8719100b64..fd19a3bc09 100644 --- a/charts/kube-prometheus-stack/charts/grafana/README.md +++ b/charts/kube-prometheus-stack/charts/grafana/README.md @@ -83,7 +83,7 @@ This version requires Helm >= 3.1.0. | `ingress.path` | Ingress accepted path | `/` | | `ingress.pathType` | Ingress type of path | `Prefix` | | `ingress.hosts` | Ingress accepted hostnames | `["chart-example.local"]` | -| `ingress.extraPaths` | Ingress extra paths to prepend to every host configuration. Useful when configuring [custom actions with AWS ALB Ingress Controller](https://kubernetes-sigs.github.io/aws-alb-ingress-controller/guide/ingress/annotation/#actions). | `[]` | +| `ingress.extraPaths` | Ingress extra paths to prepend to every host configuration. Useful when configuring [custom actions with AWS ALB Ingress Controller](https://kubernetes-sigs.github.io/aws-alb-ingress-controller/guide/ingress/annotation/#actions). Requires `ingress.hosts` to have one or more host entries. | `[]` | | `ingress.tls` | Ingress TLS configuration | `[]` | | `resources` | CPU/Memory resource requests/limits | `{}` | | `nodeSelector` | Node labels for pod assignment | `{}` | @@ -136,7 +136,7 @@ This version requires Helm >= 3.1.0. | `podLabels` | Pod labels | `{}` | | `podPortName` | Name of the grafana port on the pod | `grafana` | | `sidecar.image.repository` | Sidecar image repository | `quay.io/kiwigrid/k8s-sidecar` | -| `sidecar.image.tag` | Sidecar image tag | `1.10.7` | +| `sidecar.image.tag` | Sidecar image tag | `1.12.2` | | `sidecar.image.sha` | Sidecar image sha (optional) | `""` | | `sidecar.imagePullPolicy` | Sidecar image pull policy | `IfNotPresent` | | `sidecar.resources` | Sidecar resources | `{}` | diff --git a/charts/kube-prometheus-stack/charts/grafana/templates/_helpers.tpl b/charts/kube-prometheus-stack/charts/grafana/templates/_helpers.tpl index 70e055883f..fd40868cb7 100644 --- a/charts/kube-prometheus-stack/charts/grafana/templates/_helpers.tpl +++ b/charts/kube-prometheus-stack/charts/grafana/templates/_helpers.tpl @@ -114,3 +114,16 @@ Return the appropriate apiVersion for rbac. {{- print "rbac.authorization.k8s.io/v1beta1" -}} {{- end -}} {{- end -}} + +{{/* +Looks if there's an existing secret and reuse its password. If not it generates +new password and use it. +*/}} +{{- define "grafana.password" -}} +{{- $secret := (lookup "v1" "Secret" (include "grafana.namespace" .) (include "grafana.fullname" .) ) -}} + {{- if $secret -}} + {{- index $secret "data" "admin-password" -}} + {{- else -}} + {{- (randAlphaNum 40) | b64enc | quote -}} + {{- end -}} +{{- end -}} diff --git a/charts/kube-prometheus-stack/charts/grafana/templates/_pod.tpl b/charts/kube-prometheus-stack/charts/grafana/templates/_pod.tpl index 4a4aeea4ac..f8e7c9ed7a 100644 --- a/charts/kube-prometheus-stack/charts/grafana/templates/_pod.tpl +++ b/charts/kube-prometheus-stack/charts/grafana/templates/_pod.tpl @@ -313,14 +313,14 @@ containers: containerPort: 3000 protocol: TCP env: - {{- if not .Values.env.GF_SECURITY_ADMIN_USER }} + {{- if and (not .Values.env.GF_SECURITY_ADMIN_USER) (not .Values.env.GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION) }} - name: GF_SECURITY_ADMIN_USER valueFrom: secretKeyRef: name: {{ .Values.admin.existingSecret | default (include "grafana.fullname" .) }} key: {{ .Values.admin.userKey | default "admin-user" }} {{- end }} - {{- if and (not .Values.env.GF_SECURITY_ADMIN_PASSWORD) (not .Values.env.GF_SECURITY_ADMIN_PASSWORD__FILE) }} + {{- if and (not .Values.env.GF_SECURITY_ADMIN_PASSWORD) (not .Values.env.GF_SECURITY_ADMIN_PASSWORD__FILE) (not .Values.env.GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION) }} - name: GF_SECURITY_ADMIN_PASSWORD valueFrom: secretKeyRef: @@ -352,6 +352,14 @@ containers: - name: GF_RENDERING_CALLBACK_URL value: http://{{ template "grafana.fullname" . }}.{{ template "grafana.namespace" . }}:{{ .Values.service.port }}/{{ .Values.imageRenderer.grafanaSubPath }} {{ end }} + - name: GF_PATHS_DATA + value: {{ (get .Values "grafana.ini").paths.data }} + - name: GF_PATHS_LOGS + value: {{ (get .Values "grafana.ini").paths.logs }} + - name: GF_PATHS_PLUGINS + value: {{ (get .Values "grafana.ini").paths.plugins }} + - name: GF_PATHS_PROVISIONING + value: {{ (get .Values "grafana.ini").paths.provisioning }} {{- range $key, $value := .Values.envValueFrom }} - name: {{ $key | quote }} valueFrom: diff --git a/charts/kube-prometheus-stack/charts/grafana/templates/deployment.yaml b/charts/kube-prometheus-stack/charts/grafana/templates/deployment.yaml index 4d77794cd9..1c9ae86381 100644 --- a/charts/kube-prometheus-stack/charts/grafana/templates/deployment.yaml +++ b/charts/kube-prometheus-stack/charts/grafana/templates/deployment.yaml @@ -14,7 +14,9 @@ metadata: {{ toYaml . | indent 4 }} {{- end }} spec: + {{- if not .Values.autoscaling.enabled }} replicas: {{ .Values.replicas }} + {{- end }} revisionHistoryLimit: {{ .Values.revisionHistoryLimit }} selector: matchLabels: @@ -34,7 +36,7 @@ spec: checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} checksum/dashboards-json-config: {{ include (print $.Template.BasePath "/dashboards-json-configmap.yaml") . | sha256sum }} checksum/sc-dashboard-provider-config: {{ include (print $.Template.BasePath "/configmap-dashboard-provider.yaml") . | sha256sum }} -{{- if or (and (not .Values.admin.existingSecret) (not .Values.env.GF_SECURITY_ADMIN_PASSWORD__FILE) (not .Values.env.GF_SECURITY_ADMIN_PASSWORD)) (and .Values.ldap.enabled (not .Values.ldap.existingSecret)) }} +{{- if and (or (and (not .Values.admin.existingSecret) (not .Values.env.GF_SECURITY_ADMIN_PASSWORD__FILE) (not .Values.env.GF_SECURITY_ADMIN_PASSWORD)) (and .Values.ldap.enabled (not .Values.ldap.existingSecret))) (not .Values.env.GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION) }} checksum/secret: {{ include (print $.Template.BasePath "/secret.yaml") . | sha256sum }} {{- end }} {{- if .Values.envRenderSecret }} diff --git a/charts/kube-prometheus-stack/charts/grafana/templates/hpa.yaml b/charts/kube-prometheus-stack/charts/grafana/templates/hpa.yaml new file mode 100644 index 0000000000..9c186d74ac --- /dev/null +++ b/charts/kube-prometheus-stack/charts/grafana/templates/hpa.yaml @@ -0,0 +1,20 @@ +{{- if .Values.autoscaling.enabled }} +apiVersion: autoscaling/v2beta1 +kind: HorizontalPodAutoscaler +metadata: + name: {{ template "grafana.fullname" . }} + labels: + app.kubernetes.io/name: {{ template "grafana.name" . }} + helm.sh/chart: {{ template "grafana.chart" . }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/instance: {{ .Release.Name }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ template "grafana.fullname" . }} + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: +{{ toYaml .Values.autoscaling.metrics | indent 4 }} +{{- end }} diff --git a/charts/kube-prometheus-stack/charts/grafana/templates/secret.yaml b/charts/kube-prometheus-stack/charts/grafana/templates/secret.yaml index 57d2e5fcf7..6d06cf584f 100644 --- a/charts/kube-prometheus-stack/charts/grafana/templates/secret.yaml +++ b/charts/kube-prometheus-stack/charts/grafana/templates/secret.yaml @@ -1,4 +1,4 @@ -{{- if or (and (not .Values.admin.existingSecret) (not .Values.env.GF_SECURITY_ADMIN_PASSWORD__FILE) (not .Values.env.GF_SECURITY_ADMIN_PASSWORD)) (and .Values.ldap.enabled (not .Values.ldap.existingSecret)) }} +{{- if and (or (and (not .Values.admin.existingSecret) (not .Values.env.GF_SECURITY_ADMIN_PASSWORD__FILE) (not .Values.env.GF_SECURITY_ADMIN_PASSWORD)) (and .Values.ldap.enabled (not .Values.ldap.existingSecret))) (not .Values.env.GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION) }} apiVersion: v1 kind: Secret metadata: @@ -17,7 +17,7 @@ data: {{- if .Values.adminPassword }} admin-password: {{ .Values.adminPassword | b64enc | quote }} {{- else }} - admin-password: {{ randAlphaNum 40 | b64enc | quote }} + admin-password: {{ template "grafana.password" . }} {{- end }} {{- end }} {{- if not .Values.ldap.existingSecret }} diff --git a/charts/kube-prometheus-stack/charts/grafana/templates/statefulset.yaml b/charts/kube-prometheus-stack/charts/grafana/templates/statefulset.yaml index b2b4616f32..ad3dd06963 100644 --- a/charts/kube-prometheus-stack/charts/grafana/templates/statefulset.yaml +++ b/charts/kube-prometheus-stack/charts/grafana/templates/statefulset.yaml @@ -27,7 +27,7 @@ spec: checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} checksum/dashboards-json-config: {{ include (print $.Template.BasePath "/dashboards-json-configmap.yaml") . | sha256sum }} checksum/sc-dashboard-provider-config: {{ include (print $.Template.BasePath "/configmap-dashboard-provider.yaml") . | sha256sum }} - {{- if or (and (not .Values.admin.existingSecret) (not .Values.env.GF_SECURITY_ADMIN_PASSWORD__FILE) (not .Values.env.GF_SECURITY_ADMIN_PASSWORD)) (and .Values.ldap.enabled (not .Values.ldap.existingSecret)) }} + {{- if and (or (and (not .Values.admin.existingSecret) (not .Values.env.GF_SECURITY_ADMIN_PASSWORD__FILE) (not .Values.env.GF_SECURITY_ADMIN_PASSWORD)) (and .Values.ldap.enabled (not .Values.ldap.existingSecret))) (not .Values.env.GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION) }} checksum/secret: {{ include (print $.Template.BasePath "/secret.yaml") . | sha256sum }} {{- end }} {{- with .Values.podAnnotations }} diff --git a/charts/kube-prometheus-stack/charts/grafana/values.yaml b/charts/kube-prometheus-stack/charts/grafana/values.yaml index 5782ae0370..5045231fe9 100644 --- a/charts/kube-prometheus-stack/charts/grafana/values.yaml +++ b/charts/kube-prometheus-stack/charts/grafana/values.yaml @@ -22,6 +22,22 @@ serviceAccount: replicas: 1 +## Create HorizontalPodAutoscaler object for deployment type +# +autoscaling: + enabled: false +# minReplicas: 1 +# maxReplicas: 10 +# metrics: +# - type: Resource +# resource: +# name: cpu +# targetAverageUtilization: 60 +# - type: Resource +# resource: +# name: memory +# targetAverageUtilization: 60 + ## See `kubectl explain poddisruptionbudget.spec` for more ## ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/ podDisruptionBudget: {} @@ -53,7 +69,7 @@ livenessProbe: image: repository: grafana/grafana - tag: 7.5.5 + tag: 8.0.1 sha: "" pullPolicy: IfNotPresent @@ -516,7 +532,7 @@ dashboardsConfigMaps: {} ## grafana.ini: paths: - data: /var/lib/grafana/data + data: /var/lib/grafana/ logs: /var/log/grafana plugins: /var/lib/grafana/plugins provisioning: /etc/grafana/provisioning @@ -587,7 +603,7 @@ smtp: sidecar: image: repository: quay.io/kiwigrid/k8s-sidecar - tag: 1.10.7 + tag: 1.12.2 sha: "" imagePullPolicy: IfNotPresent resources: {} diff --git a/charts/kube-prometheus-stack/charts/kube-state-metrics/Chart.yaml b/charts/kube-prometheus-stack/charts/kube-state-metrics/Chart.yaml index b7f6ddf904..6bc1ce12f7 100644 --- a/charts/kube-prometheus-stack/charts/kube-state-metrics/Chart.yaml +++ b/charts/kube-prometheus-stack/charts/kube-state-metrics/Chart.yaml @@ -1,5 +1,5 @@ -apiVersion: v1 -appVersion: 1.9.8 +apiVersion: v2 +appVersion: 2.0.0 description: Install kube-state-metrics to generate and expose cluster-level metrics home: https://github.com/kubernetes/kube-state-metrics/ keywords: @@ -15,4 +15,5 @@ maintainers: name: kube-state-metrics sources: - https://github.com/kubernetes/kube-state-metrics/ -version: 2.13.2 +type: application +version: 3.1.1 diff --git a/charts/kube-prometheus-stack/charts/kube-state-metrics/README.md b/charts/kube-prometheus-stack/charts/kube-state-metrics/README.md index e93a3d2524..7c2e16918f 100644 --- a/charts/kube-prometheus-stack/charts/kube-state-metrics/README.md +++ b/charts/kube-prometheus-stack/charts/kube-state-metrics/README.md @@ -5,7 +5,7 @@ Installs the [kube-state-metrics agent](https://github.com/kubernetes/kube-state ## Get Repo Info ```console -helm repo add kube-state-metrics https://kubernetes.github.io/kube-state-metrics +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts helm repo update ``` @@ -14,11 +14,7 @@ _See [helm repo](https://helm.sh/docs/helm/helm_repo/) for command documentation ## Install Chart ```console -# Helm 3 -$ helm install [RELEASE_NAME] kube-state-metrics/kube-state-metrics [flags] - -# Helm 2 -$ helm install --name [RELEASE_NAME] kube-state-metrics/kube-state-metrics [flags] +helm install [RELEASE_NAME] prometheus-community/kube-state-metrics [flags] ``` _See [configuration](#configuration) below._ @@ -28,11 +24,7 @@ _See [helm install](https://helm.sh/docs/helm/helm_install/) for command documen ## Uninstall Chart ```console -# Helm 3 -$ helm uninstall [RELEASE_NAME] - -# Helm 2 -# helm delete --purge [RELEASE_NAME] +helm uninstall [RELEASE_NAME] ``` This removes all the Kubernetes components associated with the chart and deletes the release. @@ -42,25 +34,35 @@ _See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall/) for command doc ## Upgrading Chart ```console -# Helm 3 or 2 -$ helm upgrade [RELEASE_NAME] kube-state-metrics/kube-state-metrics [flags] +helm upgrade [RELEASE_NAME] prometheus-community/kube-state-metrics [flags] ``` _See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade/) for command documentation._ -### From stable/kube-state-metrics +### Migrating from stable/kube-state-metrics and kubernetes/kube-state-metrics You can upgrade in-place: 1. [get repo info](#get-repo-info) 1. [upgrade](#upgrading-chart) your existing release name using the new chart repo + +## Upgrading to v3.0.0 + +v3.0.0 includes kube-state-metrics v2.0, see the [changelog](https://github.com/kubernetes/kube-state-metrics/blob/release-2.0/CHANGELOG.md) for major changes on the application-side. + +The upgraded chart now the following changes: +* Dropped support for helm v2 (helm v3 or later is required) +* collectors key was renamed to resources +* namespace key was renamed to namespaces + + ## Configuration See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). To see all configurable options with detailed comments: ```console -helm show values kube-state-metrics/kube-state-metrics +helm show values prometheus-community/kube-state-metrics ``` -You may also `helm show values` on this chart's [dependencies](#dependencies) for additional options. +You may also run `helm show values` on this chart's [dependencies](#dependencies) for additional options. diff --git a/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/deployment.yaml b/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/deployment.yaml index 5f6b644573..090d880a3a 100644 --- a/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/deployment.yaml +++ b/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/deployment.yaml @@ -69,91 +69,91 @@ spec: {{- end }} {{ end }} {{ if .Values.collectors.certificatesigningrequests }} - - --collectors=certificatesigningrequests + - --resources=certificatesigningrequests {{ end }} {{ if .Values.collectors.configmaps }} - - --collectors=configmaps + - --resources=configmaps {{ end }} {{ if .Values.collectors.cronjobs }} - - --collectors=cronjobs + - --resources=cronjobs {{ end }} {{ if .Values.collectors.daemonsets }} - - --collectors=daemonsets + - --resources=daemonsets {{ end }} {{ if .Values.collectors.deployments }} - - --collectors=deployments + - --resources=deployments {{ end }} {{ if .Values.collectors.endpoints }} - - --collectors=endpoints + - --resources=endpoints {{ end }} {{ if .Values.collectors.horizontalpodautoscalers }} - - --collectors=horizontalpodautoscalers + - --resources=horizontalpodautoscalers {{ end }} {{ if .Values.collectors.ingresses }} - - --collectors=ingresses + - --resources=ingresses {{ end }} {{ if .Values.collectors.jobs }} - - --collectors=jobs + - --resources=jobs {{ end }} {{ if .Values.collectors.limitranges }} - - --collectors=limitranges + - --resources=limitranges {{ end }} {{ if .Values.collectors.mutatingwebhookconfigurations }} - - --collectors=mutatingwebhookconfigurations + - --resources=mutatingwebhookconfigurations {{ end }} {{ if .Values.collectors.namespaces }} - - --collectors=namespaces + - --resources=namespaces {{ end }} {{ if .Values.collectors.networkpolicies }} - - --collectors=networkpolicies + - --resources=networkpolicies {{ end }} {{ if .Values.collectors.nodes }} - - --collectors=nodes + - --resources=nodes {{ end }} {{ if .Values.collectors.persistentvolumeclaims }} - - --collectors=persistentvolumeclaims + - --resources=persistentvolumeclaims {{ end }} {{ if .Values.collectors.persistentvolumes }} - - --collectors=persistentvolumes + - --resources=persistentvolumes {{ end }} {{ if .Values.collectors.poddisruptionbudgets }} - - --collectors=poddisruptionbudgets + - --resources=poddisruptionbudgets {{ end }} {{ if .Values.collectors.pods }} - - --collectors=pods + - --resources=pods {{ end }} {{ if .Values.collectors.replicasets }} - - --collectors=replicasets + - --resources=replicasets {{ end }} {{ if .Values.collectors.replicationcontrollers }} - - --collectors=replicationcontrollers + - --resources=replicationcontrollers {{ end }} {{ if .Values.collectors.resourcequotas }} - - --collectors=resourcequotas + - --resources=resourcequotas {{ end }} {{ if .Values.collectors.secrets }} - - --collectors=secrets + - --resources=secrets {{ end }} {{ if .Values.collectors.services }} - - --collectors=services + - --resources=services {{ end }} {{ if .Values.collectors.statefulsets }} - - --collectors=statefulsets + - --resources=statefulsets {{ end }} {{ if .Values.collectors.storageclasses }} - - --collectors=storageclasses + - --resources=storageclasses {{ end }} {{ if .Values.collectors.validatingwebhookconfigurations }} - - --collectors=validatingwebhookconfigurations + - --resources=validatingwebhookconfigurations {{ end }} {{ if .Values.collectors.verticalpodautoscalers }} - - --collectors=verticalpodautoscalers + - --resources=verticalpodautoscalers {{ end }} {{ if .Values.collectors.volumeattachments }} - - --collectors=volumeattachments + - --resources=volumeattachments {{ end }} -{{ if .Values.namespace }} - - --namespace={{ .Values.namespace | join "," }} +{{ if .Values.namespaces }} + - --namespaces={{ tpl .Values.namespaces $ | join "," }} {{ end }} {{ if .Values.autosharding.enabled }} - --pod=$(POD_NAME) @@ -176,6 +176,9 @@ spec: image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" ports: - containerPort: 8080 +{{- if .Values.selfMonitor.enabled }} + - containerPort: 8081 +{{- end }} livenessProbe: httpGet: path: /healthz diff --git a/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/role.yaml b/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/role.yaml index c493f16753..25c8bc8933 100644 --- a/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/role.yaml +++ b/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/role.yaml @@ -1,11 +1,8 @@ -{{- if and (eq $.Values.rbac.create true) (not .Values.rbac.useExistingRole) -}} -{{- if eq .Values.rbac.useClusterRole false }} -{{- range (split "," $.Values.namespace) }} -{{- end }} -{{- end -}} +{{- if and (eq .Values.rbac.create true) (not .Values.rbac.useExistingRole) -}} +{{- range (split "," .Values.namespaces) }} --- apiVersion: rbac.authorization.k8s.io/v1 -{{- if eq .Values.rbac.useClusterRole false }} +{{- if eq $.Values.rbac.useClusterRole false }} kind: Role {{- else }} kind: ClusterRole @@ -17,7 +14,7 @@ metadata: app.kubernetes.io/managed-by: {{ $.Release.Service }} app.kubernetes.io/instance: {{ $.Release.Name }} name: {{ template "kube-state-metrics.fullname" $ }} -{{- if eq .Values.rbac.useClusterRole false }} +{{- if eq $.Values.rbac.useClusterRole false }} namespace: {{ . }} {{- end }} rules: @@ -190,3 +187,4 @@ rules: verbs: ["list", "watch"] {{ end -}} {{- end -}} +{{- end -}} diff --git a/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/rolebinding.yaml b/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/rolebinding.yaml index 732174a334..72a1a2e904 100644 --- a/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/rolebinding.yaml +++ b/charts/kube-prometheus-stack/charts/kube-state-metrics/templates/rolebinding.yaml @@ -1,5 +1,5 @@ {{- if and (eq .Values.rbac.create true) (eq .Values.rbac.useClusterRole false) -}} -{{- range (split "," $.Values.namespace) }} +{{- range (split "," $.Values.namespaces) }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding diff --git a/charts/kube-prometheus-stack/charts/kube-state-metrics/values.yaml b/charts/kube-prometheus-stack/charts/kube-state-metrics/values.yaml index aaf97bd794..62e178d2cd 100644 --- a/charts/kube-prometheus-stack/charts/kube-state-metrics/values.yaml +++ b/charts/kube-prometheus-stack/charts/kube-state-metrics/values.yaml @@ -2,7 +2,7 @@ prometheusScrape: true image: repository: k8s.gcr.io/kube-state-metrics/kube-state-metrics - tag: v1.9.8 + tag: v2.0.0 pullPolicy: IfNotPresent imagePullSecrets: [] @@ -110,7 +110,7 @@ podAnnotations: {} podDisruptionBudget: {} # Available collectors for kube-state-metrics. By default all available -# collectors are enabled. +# resources are enabled. collectors: certificatesigningrequests: true configmaps: true @@ -147,8 +147,8 @@ kubeconfig: # base64 encoded kube-config file secret: -# Namespace to be enabled for collecting resources. By default all namespaces are collected. -# namespace: "" +# Comma-separated list of namespaces to be enabled for collecting resources. By default all namespaces are collected. +namespaces: "" ## Override the deployment namespace ## diff --git a/charts/kube-prometheus-stack/charts/prometheus-node-exporter/Chart.yaml b/charts/kube-prometheus-stack/charts/prometheus-node-exporter/Chart.yaml index b045551af8..1628ee12e3 100644 --- a/charts/kube-prometheus-stack/charts/prometheus-node-exporter/Chart.yaml +++ b/charts/kube-prometheus-stack/charts/prometheus-node-exporter/Chart.yaml @@ -14,4 +14,4 @@ maintainers: name: prometheus-node-exporter sources: - https://github.com/prometheus/node_exporter/ -version: 1.17.0 +version: 1.18.1 diff --git a/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/psp.yaml b/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/psp.yaml index f00506c980..ec1259e01e 100644 --- a/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/psp.yaml +++ b/charts/kube-prometheus-stack/charts/prometheus-node-exporter/templates/psp.yaml @@ -6,6 +6,10 @@ metadata: name: {{ template "prometheus-node-exporter.fullname" . }} namespace: {{ template "prometheus-node-exporter.namespace" . }} labels: {{ include "prometheus-node-exporter.labels" . | indent 4 }} +{{- if .Values.rbac.pspAnnotations }} + annotations: +{{ toYaml .Values.rbac.pspAnnotations | indent 4 }} +{{- end}} spec: privileged: false # Required to prevent escalations to root. diff --git a/charts/kube-prometheus-stack/charts/prometheus-node-exporter/values.yaml b/charts/kube-prometheus-stack/charts/prometheus-node-exporter/values.yaml index 5cb6981ae1..400ce113b6 100644 --- a/charts/kube-prometheus-stack/charts/prometheus-node-exporter/values.yaml +++ b/charts/kube-prometheus-stack/charts/prometheus-node-exporter/values.yaml @@ -72,6 +72,7 @@ rbac: ## If true, create & use Pod Security Policy resources ## https://kubernetes.io/docs/concepts/policy/pod-security-policy/ pspEnabled: true + pspAnnotations: {} # for deployments that have node_exporter deployed outside of the cluster, list # their addresses here diff --git a/charts/kube-prometheus-stack/crds/crd-alertmanagerconfigs.yaml b/charts/kube-prometheus-stack/crds/crd-alertmanagerconfigs.yaml index e8c2828928..e84eb6bb97 100644 --- a/charts/kube-prometheus-stack/crds/crd-alertmanagerconfigs.yaml +++ b/charts/kube-prometheus-stack/crds/crd-alertmanagerconfigs.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.47.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kube-prometheus-stack/crds/crd-alertmanagers.yaml b/charts/kube-prometheus-stack/crds/crd-alertmanagers.yaml index 41128a682a..76f90f7ace 100644 --- a/charts/kube-prometheus-stack/crds/crd-alertmanagers.yaml +++ b/charts/kube-prometheus-stack/crds/crd-alertmanagers.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.47.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kube-prometheus-stack/crds/crd-podmonitors.yaml b/charts/kube-prometheus-stack/crds/crd-podmonitors.yaml index 45294963b6..745b866b24 100644 --- a/charts/kube-prometheus-stack/crds/crd-podmonitors.yaml +++ b/charts/kube-prometheus-stack/crds/crd-podmonitors.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.47.0/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kube-prometheus-stack/crds/crd-probes.yaml b/charts/kube-prometheus-stack/crds/crd-probes.yaml index 6e5af51ed1..ca2ef5b816 100644 --- a/charts/kube-prometheus-stack/crds/crd-probes.yaml +++ b/charts/kube-prometheus-stack/crds/crd-probes.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.47.0/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kube-prometheus-stack/crds/crd-prometheuses.yaml b/charts/kube-prometheus-stack/crds/crd-prometheuses.yaml index be23c9f551..ce033a66b7 100644 --- a/charts/kube-prometheus-stack/crds/crd-prometheuses.yaml +++ b/charts/kube-prometheus-stack/crds/crd-prometheuses.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.47.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kube-prometheus-stack/crds/crd-prometheusrules.yaml b/charts/kube-prometheus-stack/crds/crd-prometheusrules.yaml index cd8705a203..8e625d67a8 100644 --- a/charts/kube-prometheus-stack/crds/crd-prometheusrules.yaml +++ b/charts/kube-prometheus-stack/crds/crd-prometheusrules.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.47.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kube-prometheus-stack/crds/crd-servicemonitors.yaml b/charts/kube-prometheus-stack/crds/crd-servicemonitors.yaml index 543546af39..3bf6572d0d 100644 --- a/charts/kube-prometheus-stack/crds/crd-servicemonitors.yaml +++ b/charts/kube-prometheus-stack/crds/crd-servicemonitors.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.47.0/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kube-prometheus-stack/crds/crd-thanosrulers.yaml b/charts/kube-prometheus-stack/crds/crd-thanosrulers.yaml index aae4b9eeb7..faf36d25b8 100644 --- a/charts/kube-prometheus-stack/crds/crd-thanosrulers.yaml +++ b/charts/kube-prometheus-stack/crds/crd-thanosrulers.yaml @@ -1,4 +1,4 @@ -# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.47.0/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.48.1/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml --- apiVersion: apiextensions.k8s.io/v1 diff --git a/charts/kube-prometheus-stack/templates/_helpers.tpl b/charts/kube-prometheus-stack/templates/_helpers.tpl index 8336cb8918..61989b574d 100644 --- a/charts/kube-prometheus-stack/templates/_helpers.tpl +++ b/charts/kube-prometheus-stack/templates/_helpers.tpl @@ -98,7 +98,12 @@ Allow the release namespace to be overridden for multi-namespace deployments in {{/* Allow KubeVersion to be overridden. */}} {{- define "kube-prometheus-stack.ingress.kubeVersion" -}} - {{- default .Capabilities.KubeVersion.Version .Values.kubeVersionOverride -}} + {{- $kubeVersion := default .Capabilities.KubeVersion.Version .Values.kubeVersionOverride -}} + {{/* Special use case for Amazon EKS, Google GKE */}} + {{- if and (regexMatch "\\d+\\.\\d+\\.\\d+-(?:eks|gke).+" $kubeVersion) (not .Values.kubeVersionOverride) -}} + {{- $kubeVersion = regexFind "\\d+\\.\\d+\\.\\d+" $kubeVersion -}} + {{- end -}} + {{- $kubeVersion -}} {{- end -}} {{/* Get Ingress API Version */}} @@ -116,3 +121,9 @@ Allow the release namespace to be overridden for multi-namespace deployments in {{- define "kube-prometheus-stack.ingress.isStable" -}} {{- eq (include "kube-prometheus-stack.ingress.apiVersion" .) "networking.k8s.io/v1" -}} {{- end -}} + +{{/* Check Ingress supports pathType */}} +{{/* pathType was added to networking.k8s.io/v1beta1 in Kubernetes 1.18 */}} +{{- define "kube-prometheus-stack.ingress.supportsPathType" -}} + {{- or (eq (include "kube-prometheus-stack.ingress.isStable" .) "true") (and (eq (include "kube-prometheus-stack.ingress.apiVersion" .) "networking.k8s.io/v1beta1") (semverCompare ">= 1.18.x" (include "kube-prometheus-stack.ingress.kubeVersion" .))) -}} +{{- end -}} diff --git a/charts/kube-prometheus-stack/templates/alertmanager/ingress.yaml b/charts/kube-prometheus-stack/templates/alertmanager/ingress.yaml index 8ade27093c..f337502e97 100644 --- a/charts/kube-prometheus-stack/templates/alertmanager/ingress.yaml +++ b/charts/kube-prometheus-stack/templates/alertmanager/ingress.yaml @@ -5,6 +5,7 @@ {{- $routePrefix := list .Values.alertmanager.alertmanagerSpec.routePrefix }} {{- $paths := .Values.alertmanager.ingress.paths | default $routePrefix -}} {{- $apiIsStable := eq (include "kube-prometheus-stack.ingress.isStable" .) "true" -}} +{{- $ingressSupportsPathType := eq (include "kube-prometheus-stack.ingress.supportsPathType" .) "true" -}} apiVersion: {{ include "kube-prometheus-stack.ingress.apiVersion" . }} kind: Ingress metadata: @@ -34,7 +35,7 @@ spec: paths: {{- range $p := $paths }} - path: {{ tpl $p $ }} - {{- if $pathType }} + {{- if and $pathType $ingressSupportsPathType }} pathType: {{ $pathType }} {{- end }} backend: @@ -54,7 +55,7 @@ spec: paths: {{- range $p := $paths }} - path: {{ tpl $p $ }} - {{- if $pathType }} + {{- if and $pathType $ingressSupportsPathType }} pathType: {{ $pathType }} {{- end }} backend: diff --git a/charts/kube-prometheus-stack/templates/alertmanager/ingressperreplica.yaml b/charts/kube-prometheus-stack/templates/alertmanager/ingressperreplica.yaml index c55ec2adfe..f21bf96169 100644 --- a/charts/kube-prometheus-stack/templates/alertmanager/ingressperreplica.yaml +++ b/charts/kube-prometheus-stack/templates/alertmanager/ingressperreplica.yaml @@ -4,6 +4,7 @@ {{- $servicePort := .Values.alertmanager.service.port -}} {{- $ingressValues := .Values.alertmanager.ingressPerReplica -}} {{- $apiIsStable := eq (include "kube-prometheus-stack.ingress.isStable" .) "true" -}} +{{- $ingressSupportsPathType := eq (include "kube-prometheus-stack.ingress.supportsPathType" .) "true" -}} apiVersion: v1 kind: List metadata: @@ -12,7 +13,7 @@ metadata: items: {{ range $i, $e := until $count }} - kind: Ingress - apiVersion: {{ include "kube-prometheus-stack.ingress.apiVersion" . }} + apiVersion: {{ include "kube-prometheus-stack.ingress.apiVersion" $ }} metadata: name: {{ include "kube-prometheus-stack.fullname" $ }}-alertmanager-{{ $i }} namespace: {{ template "kube-prometheus-stack.namespace" $ }} @@ -38,7 +39,7 @@ items: paths: {{- range $p := $ingressValues.paths }} - path: {{ tpl $p $ }} - {{- if $pathType }} + {{- if and $pathType $ingressSupportsPathType }} pathType: {{ $pathType }} {{- end }} backend: diff --git a/charts/kube-prometheus-stack/templates/exporters/kube-api-server/servicemonitor.yaml b/charts/kube-prometheus-stack/templates/exporters/kube-api-server/servicemonitor.yaml index f34c1876fc..33a57b82e2 100644 --- a/charts/kube-prometheus-stack/templates/exporters/kube-api-server/servicemonitor.yaml +++ b/charts/kube-prometheus-stack/templates/exporters/kube-api-server/servicemonitor.yaml @@ -22,9 +22,9 @@ spec: metricRelabelings: {{ tpl (toYaml .Values.kubeApiServer.serviceMonitor.metricRelabelings | indent 6) . }} {{- end }} -{{- if .Values.kubeApiServer.relabelings }} +{{- if .Values.kubeApiServer.serviceMonitor.relabelings }} relabelings: -{{ toYaml .Values.kubeApiServer.relabelings | indent 6 }} +{{ toYaml .Values.kubeApiServer.serviceMonitor.relabelings | indent 6 }} {{- end }} tlsConfig: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt diff --git a/charts/kube-prometheus-stack/templates/exporters/kubelet/servicemonitor.yaml b/charts/kube-prometheus-stack/templates/exporters/kubelet/servicemonitor.yaml index e802922378..9933ecb1c3 100644 --- a/charts/kube-prometheus-stack/templates/exporters/kubelet/servicemonitor.yaml +++ b/charts/kube-prometheus-stack/templates/exporters/kubelet/servicemonitor.yaml @@ -16,7 +16,7 @@ spec: interval: {{ .Values.kubelet.serviceMonitor.interval }} {{- end }} {{- if .Values.kubelet.serviceMonitor.proxyUrl }} - proxyUrl: {{ .Values.kubelet.serviceMonitor.proxyUrl}} + proxyUrl: {{ .Values.kubelet.serviceMonitor.proxyUrl }} {{- end }} tlsConfig: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt @@ -38,6 +38,9 @@ spec: {{- if .Values.kubelet.serviceMonitor.interval }} interval: {{ .Values.kubelet.serviceMonitor.interval }} {{- end }} + {{- if .Values.kubelet.serviceMonitor.proxyUrl }} + proxyUrl: {{ .Values.kubelet.serviceMonitor.proxyUrl }} + {{- end }} {{- if .Values.kubelet.serviceMonitor.scrapeTimeout }} scrapeTimeout: {{ .Values.kubelet.serviceMonitor.scrapeTimeout }} {{- end }} @@ -62,6 +65,9 @@ spec: {{- if .Values.kubelet.serviceMonitor.interval }} interval: {{ .Values.kubelet.serviceMonitor.interval }} {{- end }} + {{- if .Values.kubelet.serviceMonitor.proxyUrl }} + proxyUrl: {{ .Values.kubelet.serviceMonitor.proxyUrl }} + {{- end }} honorLabels: true tlsConfig: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt @@ -83,6 +89,9 @@ spec: {{- if .Values.kubelet.serviceMonitor.interval }} interval: {{ .Values.kubelet.serviceMonitor.interval }} {{- end }} + {{- if .Values.kubelet.serviceMonitor.proxyUrl }} + proxyUrl: {{ .Values.kubelet.serviceMonitor.proxyUrl }} + {{- end }} honorLabels: true tlsConfig: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt @@ -102,6 +111,9 @@ spec: {{- if .Values.kubelet.serviceMonitor.interval }} interval: {{ .Values.kubelet.serviceMonitor.interval }} {{- end }} + {{- if .Values.kubelet.serviceMonitor.proxyUrl }} + proxyUrl: {{ .Values.kubelet.serviceMonitor.proxyUrl }} + {{- end }} honorLabels: true {{- if .Values.kubelet.serviceMonitor.metricRelabelings }} metricRelabelings: @@ -117,6 +129,9 @@ spec: {{- if .Values.kubelet.serviceMonitor.interval }} interval: {{ .Values.kubelet.serviceMonitor.interval }} {{- end }} + {{- if .Values.kubelet.serviceMonitor.proxyUrl }} + proxyUrl: {{ .Values.kubelet.serviceMonitor.proxyUrl }} + {{- end }} honorLabels: true {{- if .Values.kubelet.serviceMonitor.cAdvisorMetricRelabelings }} metricRelabelings: @@ -132,6 +147,9 @@ spec: {{- if .Values.kubelet.serviceMonitor.interval }} interval: {{ .Values.kubelet.serviceMonitor.interval }} {{- end }} + {{- if .Values.kubelet.serviceMonitor.proxyUrl }} + proxyUrl: {{ .Values.kubelet.serviceMonitor.proxyUrl }} + {{- end }} honorLabels: true {{- if .Values.kubelet.serviceMonitor.resourceMetricRelabelings }} metricRelabelings: @@ -150,6 +168,6 @@ spec: - {{ .Values.kubelet.namespace }} selector: matchLabels: - app.kubernetes.io/managed-by: prometheus-operator + app.kubernetes.io/name: kubelet k8s-app: kubelet {{- end}} diff --git a/charts/kube-prometheus-stack/templates/exporters/node-exporter/servicemonitor.yaml b/charts/kube-prometheus-stack/templates/exporters/node-exporter/servicemonitor.yaml index adcc419450..5615ba7e87 100644 --- a/charts/kube-prometheus-stack/templates/exporters/node-exporter/servicemonitor.yaml +++ b/charts/kube-prometheus-stack/templates/exporters/node-exporter/servicemonitor.yaml @@ -13,13 +13,18 @@ spec: matchLabels: app: prometheus-node-exporter release: {{ $.Release.Name }} + {{- if (index .Values "prometheus-node-exporter" "namespaceOverride") }} + namespaceSelector: + matchNames: + - {{ index .Values "prometheus-node-exporter" "namespaceOverride" }} + {{- end }} endpoints: - port: metrics {{- if .Values.nodeExporter.serviceMonitor.interval }} interval: {{ .Values.nodeExporter.serviceMonitor.interval }} {{- end }} - {{- if .Values.kubeApiServer.serviceMonitor.proxyUrl }} - proxyUrl: {{ .Values.kubeApiServer.serviceMonitor.proxyUrl}} + {{- if .Values.nodeExporter.serviceMonitor.proxyUrl }} + proxyUrl: {{ .Values.nodeExporter.serviceMonitor.proxyUrl}} {{- end }} {{- if .Values.nodeExporter.serviceMonitor.scrapeTimeout }} scrapeTimeout: {{ .Values.nodeExporter.serviceMonitor.scrapeTimeout }} diff --git a/charts/kube-prometheus-stack/templates/grafana/configmap-dashboards.yaml b/charts/kube-prometheus-stack/templates/grafana/configmap-dashboards.yaml index fdd64ce6f4..ce5dd06844 100644 --- a/charts/kube-prometheus-stack/templates/grafana/configmap-dashboards.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/configmap-dashboards.yaml @@ -1,5 +1,5 @@ -{{- if and .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} -{{- $files := .Files.Glob "dashboards/*.json" }} +{{- if or (and .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled) .Values.grafana.forceDeployDashboards }} +{{- $files := .Files.Glob "dashboards-1.14/*.json" }} {{- if $files }} apiVersion: v1 kind: ConfigMapList diff --git a/charts/kube-prometheus-stack/templates/grafana/configmaps-datasources.yaml b/charts/kube-prometheus-stack/templates/grafana/configmaps-datasources.yaml index db62d537e9..c7c82efcb6 100644 --- a/charts/kube-prometheus-stack/templates/grafana/configmaps-datasources.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/configmaps-datasources.yaml @@ -1,4 +1,4 @@ -{{- if and .Values.grafana.enabled .Values.grafana.sidecar.datasources.enabled }} +{{- if or (and .Values.grafana.enabled .Values.grafana.sidecar.datasources.enabled) .Values.grafana.forceDeployDatasources }} apiVersion: v1 kind: ConfigMap metadata: @@ -20,7 +20,11 @@ data: {{- if .Values.grafana.sidecar.datasources.defaultDatasourceEnabled }} - name: Prometheus type: prometheus + {{- if .Values.grafana.sidecar.datasources.url }} + url: {{ .Values.grafana.sidecar.datasources.url }} + {{- else }} url: http://{{ template "kube-prometheus-stack.fullname" . }}-prometheus:{{ .Values.prometheus.service.port }}/{{ trimPrefix "/" .Values.prometheus.prometheusSpec.routePrefix }} + {{- end }} access: proxy isDefault: true jsonData: diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards/pods.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/alertmanager-overview.yaml similarity index 63% rename from charts/kube-prometheus-stack/templates/grafana/dashboards/pods.yaml rename to charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/alertmanager-overview.yaml index e2534d7fdf..2135644ab0 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards/pods.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/alertmanager-overview.yaml @@ -1,15 +1,15 @@ {{- /* -Generated from 'pods' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/grafana-dashboardDefinitions.yaml +Generated from 'alertmanager-overview' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: namespace: {{ template "kube-prometheus-stack.namespace" . }} - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "pods" | trunc 63 | trimSuffix "-" }} + name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "alertmanager-overview" | trunc 63 | trimSuffix "-" }} annotations: {{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }} labels: @@ -19,7 +19,7 @@ metadata: app: {{ template "kube-prometheus-stack.name" $ }}-grafana {{ include "kube-prometheus-stack.labels" $ | indent 4 }} data: - pods.json: |- + alertmanager-overview.json: |- { "__inputs": [ @@ -29,31 +29,18 @@ data: ], "annotations": { "list": [ - { - "builtIn": 1, - "datasource": "$datasource", - "enable": true, - "expr": "time() == BOOL timestamp(rate(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[2m]) > 0)", - "hide": false, - "iconColor": "rgba(215, 44, 44, 1)", - "name": "Restarts", - "showIn": 0, - "tags": [ - "restart" - ], - "type": "rows" - } + ] }, "editable": false, "gnetId": null, - "graphTooltip": 0, + "graphTooltip": 1, "hideControls": false, "id": null, "links": [ ], - "refresh": "", + "refresh": "30s", "rows": [ { "collapse": false, @@ -68,18 +55,20 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, "id": 2, "legend": { - "alignAsTable": true, - "avg": true, - "current": true, + "alignAsTable": false, + "avg": false, + "current": false, "max": false, "min": false, - "rightSide": true, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -98,37 +87,16 @@ data: ], "spaceLength": 10, - "span": 12, - "stack": false, + "span": 6, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", + "expr": "sum(alertmanager_alerts{namespace=\"$namespace\",service=\"$service\"}) by (namespace,service,instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Current: {{`{{`}} container_name {{`}}`}}", + "legendFormat": "{{`{{`}}instance{{`}}`}}", "refId": "A" - }, - { - "expr": "sum by(container) (kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\", pod=\"$pod\", container=~\"$container\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Requested: {{`{{`}} container {{`}}`}}", - "refId": "B" - }, - { - "expr": "sum by(container) (kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\", pod=\"$pod\", container=~\"$container\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Limit: {{`{{`}} container {{`}}`}}", - "refId": "C" - }, - { - "expr": "sum by(container_name) (container_memory_cache{job=\"kubelet\", namespace=\"$namespace\", pod_name=~\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Cache: {{`{{`}} container_name {{`}}`}}", - "refId": "D" } ], "thresholds": [ @@ -136,9 +104,9 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Memory Usage", + "title": "Alerts", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -154,36 +122,23 @@ data: }, "yaxes": [ { - "format": "bytes", + "format": "none", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "bytes", + "format": "none", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + }, { "aliasColors": { @@ -193,18 +148,20 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, "id": 3, "legend": { - "alignAsTable": true, - "avg": true, - "current": true, + "alignAsTable": false, + "avg": false, + "current": false, "max": false, "min": false, - "rightSide": true, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -223,30 +180,23 @@ data: ], "spaceLength": 10, - "span": 12, - "stack": false, + "span": 6, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", image!=\"\", pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"}[1m]))", + "expr": "sum(rate(alertmanager_alerts_received_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Current: {{`{{`}} container_name {{`}}`}}", + "legendFormat": "{{`{{`}}instance{{`}}`}} Received", "refId": "A" }, { - "expr": "sum by(container) (kube_pod_container_resource_requests{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\", pod=\"$pod\", container=~\"$container\"})", + "expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Requested: {{`{{`}} container {{`}}`}}", + "legendFormat": "{{`{{`}}instance{{`}}`}} Invalid", "refId": "B" - }, - { - "expr": "sum by(container) (kube_pod_container_resource_limits{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\", pod=\"$pod\", container=~\"$container\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Limit: {{`{{`}} container {{`}}`}}", - "refId": "C" } ], "thresholds": [ @@ -254,9 +204,9 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "CPU Usage", + "title": "Alerts receive rate", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -272,19 +222,19 @@ data: }, "yaxes": [ { - "format": "short", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -293,8 +243,8 @@ data: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", + "showTitle": true, + "title": "Alerts", "titleSize": "h6", "type": "row" }, @@ -311,18 +261,20 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, "id": 4, "legend": { - "alignAsTable": true, - "avg": true, - "current": true, + "alignAsTable": false, + "avg": false, + "current": false, "max": false, "min": false, - "rightSide": true, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -336,27 +288,26 @@ data: "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, + "repeat": "integration", "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\"}[1m])))", + "expr": "sum(rate(alertmanager_notifications_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "RX: {{`{{`}} pod_name {{`}}`}}", + "legendFormat": "{{`{{`}}instance{{`}}`}} Total", "refId": "A" }, { - "expr": "sort_desc(sum by (pod_name) (rate(container_network_transmit_bytes_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\"}[1m])))", + "expr": "sum(rate(alertmanager_notifications_failed_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "TX: {{`{{`}} pod_name {{`}}`}}", + "legendFormat": "{{`{{`}}instance{{`}}`}} Failed", "refId": "B" } ], @@ -365,9 +316,9 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Network I/O", + "title": "$integration: Notifications Send Rate", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -383,36 +334,23 @@ data: }, "yaxes": [ { - "format": "bytes", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "bytes", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + }, { "aliasColors": { @@ -422,18 +360,20 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, "id": 5, "legend": { - "alignAsTable": true, - "avg": true, - "current": true, + "alignAsTable": false, + "avg": false, + "current": false, "max": false, "min": false, - "rightSide": true, - "show": true, + "rightSide": false, + "show": false, + "sideWidth": null, "total": false, "values": false }, @@ -447,21 +387,34 @@ data: "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, + "repeat": "integration", "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, "stack": false, "steppedLine": false, "targets": [ { - "expr": "max by (container) (kube_pod_container_status_restarts_total{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", + "expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "Restarts: {{`{{`}} container {{`}}`}}", + "legendFormat": "{{`{{`}}instance{{`}}`}} 99th Percentile", "refId": "A" + }, + { + "expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} Median", + "refId": "B" + }, + { + "expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} Average", + "refId": "C" } ], "thresholds": [ @@ -469,9 +422,9 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Total Restarts Per Container", + "title": "$integration: Notification Duration", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -487,19 +440,19 @@ data: }, "yaxes": [ { - "format": "short", + "format": "s", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "short", + "format": "s", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -508,8 +461,8 @@ data: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", + "showTitle": true, + "title": "Notifications", "titleSize": "h6", "type": "row" } @@ -517,7 +470,7 @@ data: "schemaVersion": 14, "style": "dark", "tags": [ - "kubernetes-mixin" + "alertmanager-mixin" ], "templating": { "list": [ @@ -540,47 +493,22 @@ data: { "allValue": null, "current": { - - }, - "datasource": "$datasource", - "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 0, "includeAll": false, - "label": "Namespace", + "label": null, "multi": false, "name": "namespace", "options": [ ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "query": "label_values(alertmanager_alerts, namespace)", "refresh": 2, "regex": "", - "sort": 0, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -592,21 +520,22 @@ data: { "allValue": null, "current": { - + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 0, "includeAll": false, - "label": "Pod", + "label": null, "multi": false, - "name": "pod", + "name": "service", "options": [ ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=~\"$namespace\"}, pod)", + "query": "label_values(alertmanager_alerts, service)", "refresh": 2, "regex": "", - "sort": 0, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -618,21 +547,22 @@ data: { "allValue": null, "current": { - + "text": "all", + "value": "$__all" }, "datasource": "$datasource", - "hide": 0, + "hide": 2, "includeAll": true, - "label": "Container", + "label": null, "multi": false, - "name": "container", + "name": "integration", "options": [ ], - "query": "label_values(kube_pod_container_info{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}, container)", + "query": "label_values(alertmanager_notifications_total{integration=~\".*\"}, integration)", "refresh": 2, "regex": "", - "sort": 0, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -672,9 +602,9 @@ data: "30d" ] }, - "timezone": "", - "title": "Kubernetes / Pods", - "uid": "ab4f13a9892a76a4d21ce8c2445bf4ea", + "timezone": "utc", + "title": "Alertmanager / Overview", + "uid": "alertmanager-overview", "version": 0 } {{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/apiserver.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/apiserver.yaml index 1fa3a19350..5ad552919b 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/apiserver.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/apiserver.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'apiserver' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'apiserver' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeApiServer.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeApiServer.enabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/cluster-total.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/cluster-total.yaml index 1d1c3e91fd..a10f7524be 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/cluster-total.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/cluster-total.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'cluster-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'cluster-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -1832,7 +1832,7 @@ data: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 0, "tagValuesQuery": "", diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/controller-manager.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/controller-manager.yaml index 76d561aed3..3717c8e860 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/controller-manager.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/controller-manager.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'controller-manager' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'controller-manager' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeControllerManager.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeControllerManager.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -107,7 +107,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(up{job=\"kube-controller-manager\"})", + "expr": "sum(up{cluster=\"$cluster\", job=\"kube-controller-manager\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -176,10 +176,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_adds_total{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)", + "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", "refId": "A" } ], @@ -282,10 +282,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_depth{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)", + "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", "refId": "A" } ], @@ -388,10 +388,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name, le))", + "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}", "refId": "A" } ], @@ -608,7 +608,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}", @@ -714,7 +714,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}", @@ -820,7 +820,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"kube-controller-manager\",instance=~\"$instance\"}", + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}}", @@ -913,7 +913,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"kube-controller-manager\",instance=~\"$instance\"}[5m])", + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}}", @@ -1006,7 +1006,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"kube-controller-manager\",instance=~\"$instance\"}", + "expr": "go_goroutines{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}}", @@ -1090,6 +1090,32 @@ data: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, @@ -1100,7 +1126,7 @@ data: "options": [ ], - "query": "label_values(process_cpu_seconds_total{job=\"kube-controller-manager\"}, instance)", + "query": "label_values(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-controller-manager\"}, instance)", "refresh": 2, "regex": "", "sort": 1, diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/etcd.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/etcd.yaml index 66768d9ada..2185b7d813 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/etcd.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/etcd.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'etcd' from https://raw.githubusercontent.com/etcd-io/website/master/content/docs/current/op-guide/grafana.json +Generated from 'etcd' from https://raw.githubusercontent.com/etcd-io/website/master/content/en/docs/v3.4/op-guide/grafana.json Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeEtcd.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeEtcd.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -28,6 +28,7 @@ data: "editable": true, "gnetId": null, "hideControls": false, + "id": 6, "links": [], "refresh": "10s", "rows": [ @@ -1110,7 +1111,6 @@ data: }, "timezone": "browser", "title": "etcd", - "uid": "c2f4e12cdf69feb95caa41a5a1b423d9", "version": 215 } {{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-coredns.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-coredns.yaml index 3665b57ae8..a8cfd818a9 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-coredns.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-coredns.yaml @@ -1,6 +1,6 @@ {{- /* Added manually, can be changed in-place. */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.coreDns.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.coreDns.enabled }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-cluster.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-cluster.yaml index b1aadb1776..2388875a19 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-cluster.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-cluster.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'k8s-resources-cluster' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'k8s-resources-cluster' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -163,7 +163,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_cpu_cores{cluster=\"$cluster\"})", + "expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -247,7 +247,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_cpu_cores{cluster=\"$cluster\"})", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -331,7 +331,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "1 - sum(:node_memory_MemAvailable_bytes:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_memory_bytes{cluster=\"$cluster\"})", + "expr": "1 - sum(:node_memory_MemAvailable_bytes:sum{cluster=\"$cluster\"}) / sum(node_memory_MemTotal_bytes{cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -415,7 +415,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_memory_bytes{cluster=\"$cluster\"})", + "expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -499,7 +499,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_memory_bytes{cluster=\"$cluster\"})", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -594,7 +594,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}namespace{{`}}`}}", @@ -885,7 +885,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -894,7 +894,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -903,7 +903,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -912,7 +912,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -921,7 +921,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1321,7 +1321,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1330,7 +1330,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1339,7 +1339,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1348,7 +1348,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1705,7 +1705,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -1745,7 +1745,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1797,19 +1797,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -1843,7 +1831,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1901,7 +1889,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -1941,7 +1929,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1993,19 +1981,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -2039,7 +2015,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -2097,7 +2073,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Average Container Bandwidth by Namespace", "titleSize": "h6" }, { @@ -2137,7 +2113,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -2189,19 +2165,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -2235,7 +2199,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -2293,7 +2257,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -2333,7 +2297,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -2385,13 +2349,99 @@ data: "show": false } ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}namespace{{`}}`}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of Transmitted Packets Dropped", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" }, { @@ -2406,8 +2456,9 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "decimals": -1, "fill": 10, - "id": 19, + "id": 20, "legend": { "avg": false, "current": false, @@ -2431,12 +2482,12 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", + "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}namespace{{`}}`}}", @@ -2449,7 +2500,93 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets Dropped", + "title": "IOPS(Reads+Writes)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}namespace{{`}}`}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut(Read+Write)", "tooltip": { "shared": false, "sort": 0, @@ -2489,7 +2626,312 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "sort": { + "col": 4, + "desc": true + }, + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "IOPS(Reads)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Reads + Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Throughput(Read)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Read + Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Namespace", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTargetBlank": false, + "linkTooltip": "Drill down to pods", + "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", + "pattern": "namespace", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Storage IO", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution", "titleSize": "h6" } ], diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-namespace.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-namespace.yaml index 4694b59967..f14b36d92a 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-namespace.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-namespace.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'k8s-resources-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'k8s-resources-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -78,7 +78,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -162,7 +162,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -246,7 +246,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"})", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -256,7 +256,7 @@ data: "thresholds": "70,80", "timeFrom": null, "timeShift": null, - "title": "Memory Utilization (from requests)", + "title": "Memory Utilisation (from requests)", "tooltip": { "shared": false, "sort": 0, @@ -330,7 +330,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"})", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -422,8 +422,9 @@ data: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -432,8 +433,9 @@ data: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -444,7 +446,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -695,7 +697,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -704,7 +706,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -713,7 +715,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -722,7 +724,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -731,7 +733,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -828,8 +830,9 @@ data: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -838,8 +841,9 @@ data: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -1167,7 +1171,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1176,7 +1180,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1185,7 +1189,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1194,7 +1198,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1578,7 +1582,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -1618,7 +1622,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1670,19 +1674,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -1716,7 +1708,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1774,7 +1766,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -1814,7 +1806,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1866,19 +1858,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -1912,7 +1892,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1970,7 +1950,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -2010,7 +1990,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -2062,13 +2042,99 @@ data: "show": false } ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}pod{{`}}`}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of Transmitted Packets Dropped", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" }, { @@ -2083,8 +2149,9 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "decimals": -1, "fill": 10, - "id": 15, + "id": 16, "legend": { "avg": false, "current": false, @@ -2108,12 +2175,12 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", + "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -2126,7 +2193,93 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets Dropped", + "title": "IOPS(Reads+Writes)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}pod{{`}}`}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut(Read+Write)", "tooltip": { "shared": false, "sort": 0, @@ -2166,7 +2319,312 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "sort": { + "col": 4, + "desc": true + }, + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "IOPS(Reads)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Reads + Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Throughput(Read)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Read + Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTargetBlank": false, + "linkTooltip": "Drill down to pods", + "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Storage IO", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution", "titleSize": "h6" } ], @@ -2209,7 +2667,7 @@ data: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -2236,7 +2694,7 @@ data: ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-node.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-node.yaml index b99e40ff1a..cfdf0acb35 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-node.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-node.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'k8s-resources-node' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'k8s-resources-node' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -77,7 +77,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -312,7 +312,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -321,7 +321,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -330,7 +330,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -339,7 +339,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -348,7 +348,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -749,7 +749,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -758,7 +758,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -767,7 +767,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -776,7 +776,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -901,7 +901,7 @@ data: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -928,7 +928,7 @@ data: ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, node)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-pod.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-pod.yaml index acee659c4b..b813ee22e9 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-pod.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-pod.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'k8s-resources-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'k8s-resources-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -94,7 +94,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", cluster=\"$cluster\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}container{{`}}`}}", @@ -102,7 +102,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "requests", @@ -110,7 +110,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "limits", @@ -208,7 +208,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container)", + "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}container{{`}}`}}", @@ -450,7 +450,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -459,7 +459,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -468,7 +468,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -477,7 +477,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -486,7 +486,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -584,7 +584,7 @@ data: "dashes": true, "fill": 0, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -594,7 +594,7 @@ data: "dashes": true, "fill": 0, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -605,7 +605,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", image!=\"\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}container{{`}}`}}", @@ -613,7 +613,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "requests", @@ -621,7 +621,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "limits", @@ -634,7 +634,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Memory Usage", + "title": "Memory Usage (WSS)", "tooltip": { "shared": false, "sort": 0, @@ -725,7 +725,7 @@ data: "type": "hidden" }, { - "alias": "Memory Usage", + "alias": "Memory Usage (WSS)", "colorMode": null, "colors": [ @@ -913,7 +913,7 @@ data: ], "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", image!=\"\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -922,7 +922,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -931,7 +931,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -940,7 +940,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -949,7 +949,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1072,12 +1072,12 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -1124,19 +1124,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -1171,12 +1159,12 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -1229,7 +1217,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -1270,12 +1258,12 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -1322,19 +1310,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -1369,12 +1345,12 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -1427,7 +1403,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -1468,12 +1444,12 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -1520,13 +1496,100 @@ data: "show": false } ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 11, + "interval": "1m", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}pod{{`}}`}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of Transmitted Packets Dropped", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" }, { @@ -1541,9 +1604,9 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "decimals": -1, "fill": 10, - "id": 11, - "interval": "1m", + "id": 12, "legend": { "avg": false, "current": false, @@ -1567,15 +1630,23 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m])))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}pod{{`}}`}}", + "legendFormat": "Reads", + "legendLink": null, + "step": 10 + }, + { + "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Writes", "legendLink": null, "step": 10 } @@ -1585,7 +1656,101 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets Dropped", + "title": "IOPS", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Reads", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Writes", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut", "tooltip": { "shared": false, "sort": 0, @@ -1625,42 +1790,532 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO - Distribution(Pod - Read & Writes)", "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "default", - "value": "default" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "", - "value": "" - }, - "datasource": "$datasource", - "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, - "includeAll": false, + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": -1, + "fill": 10, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "ceil(sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}container{{`}}`}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS(Reads+Writes)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}container{{`}}`}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut(Read+Write)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution(Containers)", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "sort": { + "col": 4, + "desc": true + }, + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "IOPS(Reads)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Reads + Writes)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Throughput(Read)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Read + Write)", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Container", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "container", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Storage IO", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, + "includeAll": false, "label": null, "multi": false, "name": "cluster", @@ -1668,7 +2323,7 @@ data: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -1695,7 +2350,7 @@ data: ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workload.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workload.yaml index 0df1bea025..57b40886fd 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workload.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workload.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'k8s-resources-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'k8s-resources-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -77,7 +77,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}pod{{`}}`}}", @@ -312,7 +312,7 @@ data: ], "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -321,7 +321,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -330,7 +330,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -339,7 +339,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -348,7 +348,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -692,7 +692,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -701,7 +701,7 @@ data: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -710,7 +710,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -719,7 +719,7 @@ data: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -1076,7 +1076,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -1116,7 +1116,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1168,19 +1168,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -1214,7 +1202,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1272,7 +1260,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -1312,7 +1300,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1364,19 +1352,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -1410,7 +1386,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1468,7 +1444,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Average Container Bandwidth by Pod", "titleSize": "h6" }, { @@ -1508,7 +1484,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1560,19 +1536,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -1606,7 +1570,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1664,7 +1628,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -1704,7 +1668,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1756,19 +1720,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -1802,7 +1754,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1860,7 +1812,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" } ], @@ -1903,7 +1855,7 @@ data: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -1930,7 +1882,7 @@ data: ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -1957,7 +1909,7 @@ data: ], "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\"}, workload)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -1984,7 +1936,7 @@ data: ], "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\"}, workload_type)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workloads-namespace.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workloads-namespace.yaml index 9ff5394fa1..a485147ed7 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workloads-namespace.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/k8s-resources-workloads-namespace.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'k8s-resources-workloads-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'k8s-resources-workloads-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -74,8 +74,9 @@ data: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -84,8 +85,9 @@ data: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -96,7 +98,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}workload{{`}}`}} - {{`{{`}}workload_type{{`}}`}}", @@ -394,7 +396,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -403,7 +405,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -412,7 +414,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -421,7 +423,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -430,7 +432,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -527,8 +529,9 @@ data: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -537,8 +540,9 @@ data: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -856,7 +860,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -865,7 +869,7 @@ data: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -874,7 +878,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -883,7 +887,7 @@ data: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -1259,7 +1263,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -1299,7 +1303,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1351,19 +1355,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -1397,7 +1389,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1455,7 +1447,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -1495,7 +1487,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1547,19 +1539,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -1593,7 +1573,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1651,7 +1631,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Average Container Bandwidth by Workload", "titleSize": "h6" }, { @@ -1691,7 +1671,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1743,19 +1723,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -1789,7 +1757,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1847,7 +1815,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -1887,7 +1855,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -1939,19 +1907,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { @@ -1985,7 +1941,7 @@ data: ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -2043,7 +1999,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" } ], @@ -2072,28 +2028,23 @@ data: }, { "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", "current": { - "text": "deployment", - "value": "deployment" + "text": "", + "value": "" }, "datasource": "$datasource", - "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", - "hide": 0, + "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, "includeAll": false, "label": null, "multi": false, - "name": "type", + "name": "cluster", "options": [ ], - "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", - "refresh": 1, + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, "regex": "", - "skipUrlSync": false, - "sort": 0, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -2104,23 +2055,28 @@ data: }, { "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", "current": { - "text": "", - "value": "" + "text": "deployment", + "value": "deployment" }, "datasource": "$datasource", - "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, + "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "hide": 0, "includeAll": false, "label": null, "multi": false, - "name": "cluster", + "name": "type", "options": [ ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "refresh": 2, "regex": "", - "sort": 1, + "skipUrlSync": false, + "sort": 0, "tagValuesQuery": "", "tags": [ @@ -2145,7 +2101,7 @@ data: ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/kubelet.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/kubelet.yaml index 72382997ca..613a6c2872 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/kubelet.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/kubelet.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'kubelet' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'kubelet' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubelet.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubelet.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -40,2384 +40,2099 @@ data: "links": [ ], - "refresh": "10s", - "rows": [ + "panels": [ { - "collapse": false, - "collapsed": false, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(up{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Up", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Running Pods", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, - "interval": null, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Running Container", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ + "mappings": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Actual Volume Count", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { + "thresholds": { + "mode": "absolute", + "steps": [ + ] }, - "id": 6, - "interval": null, - "links": [ + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 2, + "links": [ - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Desired Volume Count", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" - }, + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Config Error Count", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" + "expr": "sum(kubelet_node_name{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "title": "Running Kubelets", + "transparent": false, + "type": "stat" }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + "mappings": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Operation Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "thresholds": { + "mode": "absolute", + "steps": [ ] }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 3, + "links": [ - }, - "id": 9, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Running Pods", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + "mappings": [ ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Operation Error Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "thresholds": { + "mode": "absolute", + "steps": [ ] }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "unit": "none" } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 4, + "links": [ + ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Running Container", + "transparent": false, + "type": "stat" }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + "mappings": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Operation duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "thresholds": { + "mode": "absolute", + "steps": [ ] }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "unit": "none" } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 12, + "y": 0 + }, + "id": 5, + "links": [ + ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Actual Volume Count", + "transparent": false, + "type": "stat" }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 11, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} pod", - "refId": "A" - }, - { - "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} worker", - "refId": "B" - } - ], - "thresholds": [ + "mappings": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Pod Start Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "thresholds": { + "mode": "absolute", + "steps": [ ] }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 16, + "y": 0 + }, + "id": 6, + "links": [ - }, - "id": 12, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Desired Volume Count", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} pod", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} worker", - "refId": "B" - } - ], - "thresholds": [ + "mappings": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Pod Start Duration", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + "thresholds": { + "mode": "absolute", + "steps": [ ] }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "unit": "none" } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 7, + "links": [ + ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "title": "Config Error Count", + "transparent": false, + "type": "stat" }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 13, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + "aliasColors": { - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 14, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Error Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", + "refId": "A" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - }, - "id": 15, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", + "refId": "A" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 16, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}operation_type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Cgroup manager operation rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} pod", + "refId": "A" }, { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 17, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} worker", + "refId": "B" + } + ], + "thresholds": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Pod Start Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Cgroup manager 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} pod", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} worker", + "refId": "B" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Pod Start Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "Pod lifecycle event generator", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 18, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + "aliasColors": { - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - }, - "id": 19, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist interval", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - }, - "id": 20, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}operation_type{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Cgroup manager operation rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist duration", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Cgroup manager 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Pod lifecycle event generator", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 49 + }, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - }, - "id": 21, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "2xx", - "refId": "A" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "3xx", - "refId": "B" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "4xx", - "refId": "C" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "5xx", - "refId": "D" - } - ], - "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "timeFrom": null, - "timeShift": null, - "title": "RPC Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 49 + }, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist interval", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 56 + }, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - }, - "id": 22, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "timeFrom": null, - "timeShift": null, - "title": "Request duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 63 + }, + "id": 21, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - } ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { + "seriesOverrides": [ - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "2xx", + "refId": "A" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "3xx", + "refId": "B" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "4xx", + "refId": "C" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5xx", + "refId": "D" + } + ], + "thresholds": [ - }, - "id": 23, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "RPC Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 70 + }, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + ], + "timeFrom": null, + "timeShift": null, + "title": "Request duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 24, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 77 + }, + "id": 23, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 77 + }, + "id": 24, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - }, - "id": 25, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { - ], - "timeFrom": null, - "timeShift": null, - "title": "Goroutines", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 77 + }, + "id": 25, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{`{{`}}instance{{`}}`}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] } + ], + "refresh": "10s", + "rows": [ + ], "schemaVersion": 14, "style": "dark", diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-pod.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-pod.yaml index c131e683d7..288aeed672 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-pod.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-pod.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'namespace-by-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'namespace-by-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -1302,7 +1302,7 @@ data: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 0, "tagValuesQuery": "", @@ -1333,7 +1333,7 @@ data: ], "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-workload.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-workload.yaml index 097d7f55ee..f533068ffa 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-workload.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/namespace-by-workload.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'namespace-by-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'namespace-by-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -1542,7 +1542,7 @@ data: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 0, "tagValuesQuery": "", @@ -1573,7 +1573,7 @@ data: ], "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -1605,7 +1605,7 @@ data: ], "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-cluster-rsrc-use.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-cluster-rsrc-use.yaml index 62ab619454..22f5aea96c 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-cluster-rsrc-use.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-cluster-rsrc-use.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'node-cluster-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'node-cluster-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -77,7 +77,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n", + "expr": "(\n instance:node_cpu_utilisation:rate5m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}}", @@ -92,7 +92,7 @@ data: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -178,7 +178,7 @@ data: "timeShift": null, "title": "CPU Saturation (load1 per CPU)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -276,7 +276,7 @@ data: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -347,7 +347,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_vmstat_pgmajfault:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}}", @@ -362,7 +362,7 @@ data: "timeShift": null, "title": "Memory Saturation (Major Page Faults)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -453,7 +453,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}} Receive", @@ -461,7 +461,7 @@ data: "step": 10 }, { - "expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}} Transmit", @@ -476,7 +476,7 @@ data: "timeShift": null, "title": "Net Utilisation (Bytes Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -555,7 +555,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}} Receive", @@ -563,7 +563,7 @@ data: "step": 10 }, { - "expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}} Transmit", @@ -578,7 +578,7 @@ data: "timeShift": null, "title": "Net Saturation (Drops Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -661,7 +661,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}))\n", + "expr": "instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}device{{`}}`}}", @@ -676,7 +676,7 @@ data: "timeShift": null, "title": "Disk IO Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -747,7 +747,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}))\n", + "expr": "instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}device{{`}}`}}", @@ -762,7 +762,7 @@ data: "timeShift": null, "title": "Disk IO Saturation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -860,7 +860,7 @@ data: "timeShift": null, "title": "Disk Space Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-rsrc-use.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-rsrc-use.yaml index cd21961b54..52d24bfcbd 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-rsrc-use.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/node-rsrc-use.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'node-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'node-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.nodeExporter.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -77,7 +77,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_cpu_utilisation:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_cpu_utilisation:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Utilisation", @@ -92,7 +92,7 @@ data: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -178,7 +178,7 @@ data: "timeShift": null, "title": "CPU Saturation (Load1 per CPU)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -276,7 +276,7 @@ data: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -347,7 +347,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_vmstat_pgmajfault:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Major page faults", @@ -362,7 +362,7 @@ data: "timeShift": null, "title": "Memory Saturation (Major Page Faults)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -453,7 +453,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Receive", @@ -461,7 +461,7 @@ data: "step": 10 }, { - "expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Transmit", @@ -476,7 +476,7 @@ data: "timeShift": null, "title": "Net Utilisation (Bytes Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -555,7 +555,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Receive drops", @@ -563,7 +563,7 @@ data: "step": 10 }, { - "expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Transmit drops", @@ -578,7 +578,7 @@ data: "timeShift": null, "title": "Net Saturation (Drops Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -661,7 +661,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}device{{`}}`}}", @@ -676,7 +676,7 @@ data: "timeShift": null, "title": "Disk IO Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -747,7 +747,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}device{{`}}`}}", @@ -762,7 +762,7 @@ data: "timeShift": null, "title": "Disk IO Saturation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -860,7 +860,7 @@ data: "timeShift": null, "title": "Disk Space Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/nodes.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/nodes.yaml index 2a29fc0598..9da7444b56 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/nodes.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/nodes.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'nodes' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'nodes' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -92,9 +92,8 @@ data: "steppedLine": false, "targets": [ { - "expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n", + "expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__rate_interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n", "format": "time_series", - "interval": "1m", "intervalFactor": 5, "legendFormat": "{{`{{`}}cpu{{`}}`}}", "refId": "A" @@ -528,25 +527,22 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{`{{`}}device{{`}}`}} read", "refId": "A" }, { - "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{`{{`}}device{{`}}`}} written", "refId": "B" }, { - "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{`{{`}}device{{`}}`}} io time", "refId": "C" @@ -758,9 +754,8 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])", + "expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{`{{`}}device{{`}}`}}", "refId": "A" @@ -852,9 +847,8 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])", + "expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{`{{`}}device{{`}}`}}", "refId": "A" diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/persistentvolumesusage.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/persistentvolumesusage.yaml index 06fb315351..93065ffcc1 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/persistentvolumesusage.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/persistentvolumesusage.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'persistentvolumesusage' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'persistentvolumesusage' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -92,14 +92,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "(\n sum without(instance, node) (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n", + "expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Used Space", "refId": "A" }, { - "expr": "sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n", + "expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Free Space", @@ -207,7 +207,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "max without(instance,node) (\n(\n kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n -\n kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n", + "expr": "max without(instance,node) (\n(\n topk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n topk(1, kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n/\ntopk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -289,14 +289,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n", + "expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Used inodes", "refId": "A" }, { - "expr": "(\n sum without(instance, node) (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n", + "expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": " Free inodes", @@ -404,7 +404,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "max without(instance,node) (\nkubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n", + "expr": "max without(instance,node) (\ntopk(1, kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n/\ntopk(1, kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/pod-total.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/pod-total.yaml index 95abda4d14..d2842c3ba6 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/pod-total.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/pod-total.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'pod-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'pod-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -1034,7 +1034,7 @@ data: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 0, "tagValuesQuery": "", @@ -1065,7 +1065,7 @@ data: ], "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -1097,7 +1097,7 @@ data: ], "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}, pod)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus-remote-write.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus-remote-write.yaml index b33b738058..a98581d41f 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus-remote-write.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus-remote-write.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'prometheus-remote-write' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'prometheus-remote-write' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.prometheus.prometheusSpec.remoteWriteDashboards }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.prometheus.prometheusSpec.remoteWriteDashboards }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus.yaml index 7095fb7ad9..71d3a621ad 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/prometheus.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'prometheus' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'prometheus' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -220,7 +220,7 @@ data: "timeShift": null, "title": "Prometheus Stats", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -319,7 +319,7 @@ data: "timeShift": null, "title": "Target Sync", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -405,7 +405,7 @@ data: "timeShift": null, "title": "Targets", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -503,7 +503,7 @@ data: "timeShift": null, "title": "Average Scrape Interval Duration", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -613,7 +613,7 @@ data: "timeShift": null, "title": "Scrape failures", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -699,7 +699,7 @@ data: "timeShift": null, "title": "Appended Samples", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -797,7 +797,7 @@ data: "timeShift": null, "title": "Head Series", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -883,7 +883,7 @@ data: "timeShift": null, "title": "Head Chunks", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -981,7 +981,7 @@ data: "timeShift": null, "title": "Query Rate", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1067,7 +1067,7 @@ data: "timeShift": null, "title": "Stage Duration", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -1133,7 +1133,7 @@ data: "type": "datasource" }, { - "allValue": null, + "allValue": ".+", "current": { "selected": true, "text": "All", @@ -1161,7 +1161,7 @@ data: "useTags": false }, { - "allValue": null, + "allValue": ".+", "current": { "selected": true, "text": "All", diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/proxy.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/proxy.yaml index 8c6b77c093..e6b9b98d33 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/proxy.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/proxy.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'proxy' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'proxy' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeProxy.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeProxy.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -107,7 +107,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(up{job=\"kube-proxy\"})", + "expr": "sum(up{cluster=\"$cluster\", job=\"kube-proxy\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -176,7 +176,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", + "expr": "sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "rate", @@ -269,7 +269,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", + "expr": "histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}}", @@ -375,7 +375,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(kubeproxy_network_programming_duration_seconds_count{job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", + "expr": "sum(rate(kubeproxy_network_programming_duration_seconds_count{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "rate", @@ -468,7 +468,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{job=\"kube-proxy\", instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m])) by (instance, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}}", @@ -574,28 +574,28 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"2..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "2xx", "refId": "A" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"3..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "3xx", "refId": "B" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"4..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "4xx", "refId": "C" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"5..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "5xx", @@ -688,7 +688,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}", @@ -794,7 +794,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}", @@ -900,7 +900,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"kube-proxy\",instance=~\"$instance\"}", + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}}", @@ -993,7 +993,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"kube-proxy\",instance=~\"$instance\"}[5m])", + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}}", @@ -1086,7 +1086,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"kube-proxy\",instance=~\"$instance\"}", + "expr": "go_goroutines{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}}", @@ -1170,6 +1170,32 @@ data: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, @@ -1180,7 +1206,7 @@ data: "options": [ ], - "query": "label_values(kubeproxy_network_programming_duration_seconds_bucket{job=\"kube-proxy\"}, instance)", + "query": "label_values(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\"}, instance)", "refresh": 2, "regex": "", "sort": 1, diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/scheduler.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/scheduler.yaml index 860fdc8bb4..82edd364f7 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/scheduler.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/scheduler.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'scheduler' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'scheduler' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeScheduler.enabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled .Values.kubeScheduler.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -107,7 +107,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(up{job=\"kube-scheduler\"})", + "expr": "sum(up{cluster=\"$cluster\", job=\"kube-scheduler\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -176,31 +176,31 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} e2e", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} e2e", "refId": "A" }, { - "expr": "sum(rate(scheduler_binding_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} binding", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} binding", "refId": "B" }, { - "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} scheduling algorithm", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} scheduling algorithm", "refId": "C" }, { - "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} volume", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} volume", "refId": "D" } ], @@ -290,31 +290,31 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} e2e", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} e2e", "refId": "A" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} binding", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} binding", "refId": "B" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} scheduling algorithm", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} scheduling algorithm", "refId": "C" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} volume", + "legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} volume", "refId": "D" } ], @@ -417,28 +417,28 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"2..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "2xx", "refId": "A" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"3..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "3xx", "refId": "B" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"4..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "4xx", "refId": "C" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"5..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "5xx", @@ -531,7 +531,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}", @@ -637,7 +637,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}", @@ -743,7 +743,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"kube-scheduler\", instance=~\"$instance\"}", + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}}", @@ -836,7 +836,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])", + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}}", @@ -929,7 +929,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"kube-scheduler\",instance=~\"$instance\"}", + "expr": "go_goroutines{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{`{{`}}instance{{`}}`}}", @@ -1013,6 +1013,32 @@ data: "allValue": null, "current": { + }, + "datasource": "$datasource", + "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, "datasource": "$datasource", "hide": 0, @@ -1023,7 +1049,7 @@ data: "options": [ ], - "query": "label_values(process_cpu_seconds_total{job=\"kube-scheduler\"}, instance)", + "query": "label_values(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-scheduler\"}, instance)", "refresh": 2, "regex": "", "sort": 1, diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/statefulset.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/statefulset.yaml index 9b3ce7d640..f1ffea50fd 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/statefulset.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/statefulset.yaml @@ -4,7 +4,7 @@ Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -925,4 +925,4 @@ data: "uid": "a31c1f46e6f727cb37c0d731a7245005", "version": 0 } -{{- end }} \ No newline at end of file +{{- end }} diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/workload-total.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/workload-total.yaml index 07f53532e7..dc1e4ed16a 100644 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/workload-total.yaml +++ b/charts/kube-prometheus-stack/templates/grafana/dashboards-1.14/workload-total.yaml @@ -1,10 +1,10 @@ {{- /* -Generated from 'workload-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml +Generated from 'workload-total' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} +{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -1212,7 +1212,7 @@ data: ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 0, "tagValuesQuery": "", @@ -1243,7 +1243,7 @@ data: ], "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -1275,7 +1275,7 @@ data: ], "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\"}, workload)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -1307,7 +1307,7 @@ data: ], "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\"}, workload_type)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards/etcd.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards/etcd.yaml deleted file mode 100644 index 157843af04..0000000000 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards/etcd.yaml +++ /dev/null @@ -1,1116 +0,0 @@ -{{- /* -Generated from 'etcd' from https://raw.githubusercontent.com/etcd-io/website/master/content/docs/current/op-guide/grafana.json -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled .Values.kubeEtcd.enabled }} -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: {{ template "kube-prometheus-stack.namespace" . }} - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "etcd" | trunc 63 | trimSuffix "-" }} - annotations: -{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }} - labels: - {{- if $.Values.grafana.sidecar.dashboards.label }} - {{ $.Values.grafana.sidecar.dashboards.label }}: "1" - {{- end }} - app: {{ template "kube-prometheus-stack.name" $ }}-grafana -{{ include "kube-prometheus-stack.labels" $ | indent 4 }} -data: - etcd.json: |- - { - "annotations": { - "list": [] - }, - "description": "etcd sample Grafana dashboard with Prometheus", - "editable": true, - "gnetId": null, - "hideControls": false, - "links": [], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "$datasource", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 28, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [ - { - "expr": "sum(etcd_server_has_leader{job=\"$cluster\"})", - "intervalFactor": 2, - "legendFormat": "", - "metric": "etcd_server_has_leader", - "refId": "A", - "step": 20 - } - ], - "thresholds": "", - "title": "Up", - "type": "singlestat", - "valueFontSize": "200%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "aliasColors": {}, - "bars": false, - "datasource": "$datasource", - "editable": true, - "error": false, - "fill": 0, - "id": 23, - "isNew": true, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 5, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "RPC Rate", - "metric": "grpc_server_started_total", - "refId": "A", - "step": 2 - }, - { - "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "RPC Failed Rate", - "metric": "grpc_server_handled_total", - "refId": "B", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "RPC Rate", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "datasource": "$datasource", - "editable": true, - "error": false, - "fill": 0, - "id": 41, - "isNew": true, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 4, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(grpc_server_started_total{job=\"$cluster\",grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{job=\"$cluster\",grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})", - "intervalFactor": 2, - "legendFormat": "Watch Streams", - "metric": "grpc_server_handled_total", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(grpc_server_started_total{job=\"$cluster\",grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{job=\"$cluster\",grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})", - "intervalFactor": 2, - "legendFormat": "Lease Streams", - "metric": "grpc_server_handled_total", - "refId": "B", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Active Streams", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "showTitle": false, - "title": "Row" - }, - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "$datasource", - "decimals": null, - "editable": true, - "error": false, - "fill": 0, - "grid": {}, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "etcd_mvcc_db_total_size_in_bytes{job=\"$cluster\"}", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} DB Size", - "metric": "", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "DB Size", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "datasource": "$datasource", - "editable": true, - "error": false, - "fill": 0, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 1, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 4, - "stack": false, - "steppedLine": true, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))", - "hide": false, - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} WAL fsync", - "metric": "etcd_disk_wal_fsync_duration_seconds_bucket", - "refId": "A", - "step": 4 - }, - { - "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} DB fsync", - "metric": "etcd_disk_backend_commit_duration_seconds_bucket", - "refId": "B", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk Sync Duration", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "datasource": "$datasource", - "editable": true, - "error": false, - "fill": 0, - "id": 29, - "isNew": true, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "process_resident_memory_bytes{job=\"$cluster\"}", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} Resident Memory", - "metric": "process_resident_memory_bytes", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "New row" - }, - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "$datasource", - "editable": true, - "error": false, - "fill": 5, - "id": 22, - "isNew": true, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 3, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[5m])", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} Client Traffic In", - "metric": "etcd_network_client_grpc_received_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Client Traffic In", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "datasource": "$datasource", - "editable": true, - "error": false, - "fill": 5, - "id": 21, - "isNew": true, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 3, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[5m])", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} Client Traffic Out", - "metric": "etcd_network_client_grpc_sent_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Client Traffic Out", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "datasource": "$datasource", - "editable": true, - "error": false, - "fill": 0, - "id": 20, - "isNew": true, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[5m])) by (instance)", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} Peer Traffic In", - "metric": "etcd_network_peer_received_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Peer Traffic In", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "datasource": "$datasource", - "decimals": null, - "editable": true, - "error": false, - "fill": 0, - "grid": {}, - "id": 16, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[5m])) by (instance)", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} Peer Traffic Out", - "metric": "etcd_network_peer_sent_bytes_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Peer Traffic Out", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "New row" - }, - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "datasource": "$datasource", - "editable": true, - "error": false, - "fill": 0, - "id": 40, - "isNew": true, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[5m]))", - "intervalFactor": 2, - "legendFormat": "Proposal Failure Rate", - "metric": "etcd_server_proposals_failed_total", - "refId": "A", - "step": 2 - }, - { - "expr": "sum(etcd_server_proposals_pending{job=\"$cluster\"})", - "intervalFactor": 2, - "legendFormat": "Proposal Pending Total", - "metric": "etcd_server_proposals_pending", - "refId": "B", - "step": 2 - }, - { - "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[5m]))", - "intervalFactor": 2, - "legendFormat": "Proposal Commit Rate", - "metric": "etcd_server_proposals_committed_total", - "refId": "C", - "step": 2 - }, - { - "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[5m]))", - "intervalFactor": 2, - "legendFormat": "Proposal Apply Rate", - "refId": "D", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Raft Proposals", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "datasource": "$datasource", - "decimals": 0, - "editable": true, - "error": false, - "fill": 0, - "id": 19, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "changes(etcd_server_leader_changes_seen_total{job=\"$cluster\"}[1d])", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}instance{{`}}`}} Total Leader Elections Per Day", - "metric": "etcd_server_leader_changes_seen_total", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Total Leader Elections Per Day", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "title": "New row" - } - ], - "schemaVersion": 13, - "sharedCrosshair": false, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [], - "query": "label_values(etcd_server_has_leader, job)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-15m", - "to": "now" - }, - "timepicker": { - "now": true, - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "etcd", - "uid": "c2f4e12cdf69feb95caa41a5a1b423d9", - "version": 215 - } -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-cluster-rsrc-use.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-cluster-rsrc-use.yaml deleted file mode 100644 index fbe5e313b8..0000000000 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-cluster-rsrc-use.yaml +++ /dev/null @@ -1,959 +0,0 @@ -{{- /* -Generated from 'k8s-cluster-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/grafana-dashboardDefinitions.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: {{ template "kube-prometheus-stack.namespace" . }} - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-cluster-rsrc-use" | trunc 63 | trimSuffix "-" }} - annotations: -{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }} - labels: - {{- if $.Values.grafana.sidecar.dashboards.label }} - {{ $.Values.grafana.sidecar.dashboards.label }}: "1" - {{- end }} - app: {{ template "kube-prometheus-stack.name" $ }}-grafana -{{ include "kube-prometheus-stack.labels" $ | indent 4 }} -data: - k8s-cluster-rsrc-use.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:cluster_cpu_utilisation:ratio{cluster=\"$cluster\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}node{{`}}`}}", - "legendLink": "./d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_cpu_saturation_load1:{cluster=\"$cluster\"} / scalar(sum(min(kube_pod_info{cluster=\"$cluster\"}) by (node)))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}node{{`}}`}}", - "legendLink": "./d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Saturation (Load1)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:cluster_memory_utilisation:ratio{cluster=\"$cluster\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}node{{`}}`}}", - "legendLink": "./d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_memory_swap_io_bytes:sum_rate{cluster=\"$cluster\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}node{{`}}`}}", - "legendLink": "./d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Saturation (Swap I/O)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 5, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_disk_utilisation:avg_irate{cluster=\"$cluster\"} / scalar(:kube_pod_info_node_count:{cluster=\"$cluster\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}node{{`}}`}}", - "legendLink": "./d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk IO Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_disk_saturation:avg_irate{cluster=\"$cluster\"} / scalar(:kube_pod_info_node_count:{cluster=\"$cluster\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}node{{`}}`}}", - "legendLink": "./d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk IO Saturation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Disk", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 7, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_net_utilisation:sum_irate{cluster=\"$cluster\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}node{{`}}`}}", - "legendLink": "./d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Net Utilisation (Transmitted)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_net_saturation:sum_irate{cluster=\"$cluster\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}node{{`}}`}}", - "legendLink": "./d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Net Saturation (Dropped)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"} - node_filesystem_avail_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=~\"ext[234]|btrfs|xfs|zfs\", cluster=\"$cluster\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{cluster=\"$cluster\"}\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}node{{`}}`}}", - "legendLink": "./d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk Capacity", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": 1, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Storage", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(:kube_pod_info_node_count:, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes / USE Method / Cluster", - "uid": "a6e7d1362e1ddbb79db21d5bb40d7137", - "version": 0 - } -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-node-rsrc-use.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-node-rsrc-use.yaml deleted file mode 100644 index 6d01578544..0000000000 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-node-rsrc-use.yaml +++ /dev/null @@ -1,986 +0,0 @@ -{{- /* -Generated from 'k8s-node-rsrc-use' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/grafana-dashboardDefinitions.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: {{ template "kube-prometheus-stack.namespace" . }} - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-node-rsrc-use" | trunc 63 | trimSuffix "-" }} - annotations: -{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }} - labels: - {{- if $.Values.grafana.sidecar.dashboards.label }} - {{ $.Values.grafana.sidecar.dashboards.label }}: "1" - {{- end }} - app: {{ template "kube-prometheus-stack.name" $ }}-grafana -{{ include "kube-prometheus-stack.labels" $ | indent 4 }} -data: - k8s-node-rsrc-use.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_cpu_utilisation:avg1m{cluster=\"$cluster\", node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Utilisation", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_cpu_saturation_load1:{cluster=\"$cluster\", node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Saturation", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Saturation (Load1)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_memory_utilisation:{cluster=\"$cluster\", node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Memory", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_memory_swap_io_bytes:sum_rate{cluster=\"$cluster\", node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Swap IO", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Saturation (Swap I/O)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 5, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_disk_utilisation:avg_irate{cluster=\"$cluster\", node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Utilisation", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk IO Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_disk_saturation:avg_irate{cluster=\"$cluster\", node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Saturation", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk IO Saturation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Disk", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 7, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_net_utilisation:sum_irate{cluster=\"$cluster\", node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Utilisation", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Net Utilisation (Transmitted)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_net_saturation:sum_irate{cluster=\"$cluster\", node=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Saturation", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Net Saturation (Dropped)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Net", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node:node_filesystem_usage:{cluster=\"$cluster\"}\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{cluster=\"$cluster\", node=\"$node\"}\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}device{{`}}`}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Disk", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(:kube_pod_info_node_count:, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "node", - "multi": false, - "name": "node", - "options": [ - - ], - "query": "label_values(kube_node_info{cluster=\"$cluster\"}, node)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes / USE Method / Node", - "uid": "4ac4f123aae0ff6dbaf4f4f66120033b", - "version": 0 - } -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-cluster.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-cluster.yaml deleted file mode 100644 index ecfbb3cbf8..0000000000 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-cluster.yaml +++ /dev/null @@ -1,1479 +0,0 @@ -{{- /* -Generated from 'k8s-resources-cluster' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/grafana-dashboardDefinitions.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: {{ template "kube-prometheus-stack.namespace" . }} - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-resources-cluster" | trunc 63 | trimSuffix "-" }} - annotations: -{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }} - labels: - {{- if $.Values.grafana.sidecar.dashboards.label }} - {{ $.Values.grafana.sidecar.dashboards.label }}: "1" - {{- end }} - app: {{ template "kube-prometheus-stack.name" $ }}-grafana -{{ include "kube-prometheus-stack.labels" $ | indent 4 }} -data: - k8s-resources-cluster.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "height": "100px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "format": "percentunit", - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 2, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[1m]))", - "format": "time_series", - "instant": true, - "intervalFactor": 2, - "refId": "A" - } - ], - "thresholds": "70,80", - "timeFrom": null, - "timeShift": null, - "title": "CPU Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "singlestat", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "format": "percentunit", - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 2, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) / sum(node:node_num_cpu:sum{cluster=\"$cluster\"})", - "format": "time_series", - "instant": true, - "intervalFactor": 2, - "refId": "A" - } - ], - "thresholds": "70,80", - "timeFrom": null, - "timeShift": null, - "title": "CPU Requests Commitment", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "singlestat", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "format": "percentunit", - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 2, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) / sum(node:node_num_cpu:sum{cluster=\"$cluster\"})", - "format": "time_series", - "instant": true, - "intervalFactor": 2, - "refId": "A" - } - ], - "thresholds": "70,80", - "timeFrom": null, - "timeShift": null, - "title": "CPU Limits Commitment", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "singlestat", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "format": "percentunit", - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 2, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "1 - sum(:node_memory_MemFreeCachedBuffers_bytes:sum{cluster=\"$cluster\"}) / sum(:node_memory_MemTotal_bytes:sum{cluster=\"$cluster\"})", - "format": "time_series", - "instant": true, - "intervalFactor": 2, - "refId": "A" - } - ], - "thresholds": "70,80", - "timeFrom": null, - "timeShift": null, - "title": "Memory Utilisation", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "singlestat", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "format": "percentunit", - "id": 5, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 2, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) / sum(:node_memory_MemTotal_bytes:sum{cluster=\"$cluster\"})", - "format": "time_series", - "instant": true, - "intervalFactor": 2, - "refId": "A" - } - ], - "thresholds": "70,80", - "timeFrom": null, - "timeShift": null, - "title": "Memory Requests Commitment", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "singlestat", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "format": "percentunit", - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 2, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) / sum(:node_memory_MemTotal_bytes:sum{cluster=\"$cluster\"})", - "format": "time_series", - "instant": true, - "intervalFactor": 2, - "refId": "A" - } - ], - "thresholds": "70,80", - "timeFrom": null, - "timeShift": null, - "title": "Memory Limits Commitment", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "singlestat", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Headlines", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 7, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}namespace{{`}}`}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Pods", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 0, - "link": true, - "linkTooltip": "Drill down to pods", - "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "Workloads", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 0, - "link": true, - "linkTooltip": "Drill down to workloads", - "linkUrl": "./d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "CPU Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #F", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #G", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Namespace", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down to pods", - "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", - "pattern": "namespace", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "count(mixin_pod_workload{cluster=\"$cluster\"}) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "count(avg(mixin_pod_workload{cluster=\"$cluster\"}) by (workload, namespace)) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "D", - "step": 10 - }, - { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "E", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "F", - "step": 10 - }, - { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "G", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Quota", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Quota", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}namespace{{`}}`}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage (w/o cache)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Pods", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 0, - "link": true, - "linkTooltip": "Drill down to pods", - "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "Workloads", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 0, - "link": true, - "linkTooltip": "Drill down to workloads", - "linkUrl": "./d/a87fb0d919ec0ea5f6543124e16c42a5/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "Memory Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Memory Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #F", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #G", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Namespace", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down to pods", - "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", - "pattern": "namespace", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "count(mixin_pod_workload{cluster=\"$cluster\"}) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "count(avg(mixin_pod_workload{cluster=\"$cluster\"}) by (workload, namespace)) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "D", - "step": 10 - }, - { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "E", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "F", - "step": 10 - }, - { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "G", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Requests by Namespace", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Requests", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(:kube_pod_info_node_count:, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes / Compute Resources / Cluster", - "uid": "efa86fd1d0c121a26444b636a3f509a8", - "version": 0 - } -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-namespace.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-namespace.yaml deleted file mode 100644 index 64da49cb5b..0000000000 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-namespace.yaml +++ /dev/null @@ -1,963 +0,0 @@ -{{- /* -Generated from 'k8s-resources-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/grafana-dashboardDefinitions.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: {{ template "kube-prometheus-stack.namespace" . }} - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-resources-namespace" | trunc 63 | trimSuffix "-" }} - annotations: -{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }} - labels: - {{- if $.Values.grafana.sidecar.dashboards.label }} - {{ $.Values.grafana.sidecar.dashboards.label }}: "1" - {{- end }} - app: {{ template "kube-prometheus-stack.name" $ }}-grafana -{{ include "kube-prometheus-stack.labels" $ | indent 4 }} -data: - k8s-resources-namespace.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod_name)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}pod_name{{`}}`}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "CPU Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "CPU Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Pod", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down", - "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", - "pattern": "pod", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "D", - "step": 10 - }, - { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "E", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Quota", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Quota", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"}) by (pod_name)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}pod_name{{`}}`}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage (w/o cache)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Memory Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Memory Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Memory Usage (RSS)", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #F", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Usage (Cache)", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #G", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Usage (Swap", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #H", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Pod", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down", - "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", - "pattern": "pod", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "D", - "step": 10 - }, - { - "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "E", - "step": 10 - }, - { - "expr": "sum(label_replace(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "F", - "step": 10 - }, - { - "expr": "sum(label_replace(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "G", - "step": 10 - }, - { - "expr": "sum(label_replace(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "H", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Quota", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Quota", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(:kube_pod_info_node_count:, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes / Compute Resources / Namespace (Pods)", - "uid": "85a562078cdf77779eaa1add43ccec1e", - "version": 0 - } -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-pod.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-pod.yaml deleted file mode 100644 index 78cd0452f3..0000000000 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-pod.yaml +++ /dev/null @@ -1,1006 +0,0 @@ -{{- /* -Generated from 'k8s-resources-pod' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/grafana-dashboardDefinitions.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: {{ template "kube-prometheus-stack.namespace" . }} - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-resources-pod" | trunc 63 | trimSuffix "-" }} - annotations: -{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }} - labels: - {{- if $.Values.grafana.sidecar.dashboards.label }} - {{ $.Values.grafana.sidecar.dashboards.label }}: "1" - {{- end }} - app: {{ template "kube-prometheus-stack.name" $ }}-grafana -{{ include "kube-prometheus-stack.labels" $ | indent 4 }} -data: - k8s-resources-pod.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", cluster=\"$cluster\"}) by (container_name)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}container_name{{`}}`}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "CPU Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "CPU Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Container", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "container", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "D", - "step": 10 - }, - { - "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "E", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Quota", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Quota", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}container_name{{`}}`}} (RSS)", - "legendLink": null, - "step": 10 - }, - { - "expr": "sum(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}container_name{{`}}`}} (Cache)", - "legendLink": null, - "step": 10 - }, - { - "expr": "sum(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}container_name{{`}}`}} (Swap)", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Memory Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Memory Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Memory Usage (RSS)", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #F", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Usage (Cache)", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #G", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Usage (Swap", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #H", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Container", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "container", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "D", - "step": 10 - }, - { - "expr": "sum(label_replace(container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "E", - "step": 10 - }, - { - "expr": "sum(label_replace(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "F", - "step": 10 - }, - { - "expr": "sum(label_replace(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "G", - "step": 10 - }, - { - "expr": "sum(label_replace(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=\"$pod\", container_name != \"\", container_name != \"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "H", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Quota", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Quota", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(:kube_pod_info_node_count:, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "pod", - "multi": false, - "name": "pod", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=\"$namespace\"}, pod)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes / Compute Resources / Pod", - "uid": "6581e46e4e5c7ba40a07646395ef7b23", - "version": 0 - } -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-workload.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-workload.yaml deleted file mode 100644 index af1a153d61..0000000000 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-workload.yaml +++ /dev/null @@ -1,936 +0,0 @@ -{{- /* -Generated from 'k8s-resources-workload' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/grafana-dashboardDefinitions.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: {{ template "kube-prometheus-stack.namespace" . }} - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-resources-workload" | trunc 63 | trimSuffix "-" }} - annotations: -{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }} - labels: - {{- if $.Values.grafana.sidecar.dashboards.label }} - {{ $.Values.grafana.sidecar.dashboards.label }}: "1" - {{- end }} - app: {{ template "kube-prometheus-stack.name" $ }}-grafana -{{ include "kube-prometheus-stack.labels" $ | indent 4 }} -data: - k8s-resources-workload.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}pod{{`}}`}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "CPU Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "CPU Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Pod", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down", - "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", - "pattern": "pod", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "D", - "step": 10 - }, - { - "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "E", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Quota", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Quota", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n ) by (pod)\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}pod{{`}}`}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Memory Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Memory Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Pod", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down", - "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", - "pattern": "pod", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n ) by (pod)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n ) by (pod)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "D", - "step": 10 - }, - { - "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n ) by (pod)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "E", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Quota", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Quota", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(:kube_pod_info_node_count:, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "workload", - "multi": false, - "name": "workload", - "options": [ - - ], - "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}, workload)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "type", - "multi": false, - "name": "type", - "options": [ - - ], - "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\"}, workload_type)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes / Compute Resources / Workload", - "uid": "a164a7f0339f99e89cea5cb47e9be617", - "version": 0 - } -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-workloads-namespace.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-workloads-namespace.yaml deleted file mode 100644 index c11f51bb27..0000000000 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards/k8s-resources-workloads-namespace.yaml +++ /dev/null @@ -1,972 +0,0 @@ -{{- /* -Generated from 'k8s-resources-workloads-namespace' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/grafana-dashboardDefinitions.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: {{ template "kube-prometheus-stack.namespace" . }} - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-resources-workloads-namespace" | trunc 63 | trimSuffix "-" }} - annotations: -{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }} - labels: - {{- if $.Values.grafana.sidecar.dashboards.label }} - {{ $.Values.grafana.sidecar.dashboards.label }}: "1" - {{- end }} - app: {{ template "kube-prometheus-stack.name" $ }}-grafana -{{ include "kube-prometheus-stack.labels" $ | indent 4 }} -data: - k8s-resources-workloads-namespace.json: |- - { - "annotations": { - "list": [ - - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - - ], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}workload{{`}}`}} - {{`{{`}}workload_type{{`}}`}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Running Pods", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 0, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "CPU Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "CPU Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #F", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Workload", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down", - "linkUrl": "./d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2", - "pattern": "workload", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "Workload Type", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "workload_type", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "count(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}) by (workload, workload_type)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "D", - "step": 10 - }, - { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "E", - "step": 10 - }, - { - "expr": "sum(\n label_replace(\n namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "F", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Quota", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "CPU Quota", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n ) by (workload, workload_type)\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}workload{{`}}`}} - {{`{{`}}workload_type{{`}}`}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Usage", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "pattern": "Time", - "type": "hidden" - }, - { - "alias": "Running Pods", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 0, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #A", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "Memory Usage", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #B", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Requests", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #C", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Requests %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #D", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Memory Limits", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #E", - "thresholds": [ - - ], - "type": "number", - "unit": "bytes" - }, - { - "alias": "Memory Limits %", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "Value #F", - "thresholds": [ - - ], - "type": "number", - "unit": "percentunit" - }, - { - "alias": "Workload", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": true, - "linkTooltip": "Drill down", - "linkUrl": "./d/a164a7f0339f99e89cea5cb47e9be617/k8s-resources-workload?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-workload=$__cell&var-type=$__cell_2", - "pattern": "workload", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "Workload Type", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "linkTooltip": "Drill down", - "linkUrl": "", - "pattern": "workload_type", - "thresholds": [ - - ], - "type": "number", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "/.*/", - "thresholds": [ - - ], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "count(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}) by (workload, workload_type)", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 10 - }, - { - "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n ) by (workload, workload_type)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - }, - { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "C", - "step": 10 - }, - { - "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n ) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "D", - "step": 10 - }, - { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "E", - "step": 10 - }, - { - "expr": "sum(\n label_replace(\n container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container_name!=\"\"},\n \"pod\", \"$1\", \"pod_name\", \"(.*)\"\n ) * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n ) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod) group_left(workload, workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}\n) by (workload, workload_type)\n", - "format": "table", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "F", - "step": 10 - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Quota", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "transform": "table", - "type": "table", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Memory Quota", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(:kube_pod_info_node_count:, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes / Compute Resources / Namespace (Workloads)", - "uid": "a87fb0d919ec0ea5f6543124e16c42a5", - "version": 0 - } -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards/nodes.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards/nodes.yaml deleted file mode 100644 index 82d332a5c4..0000000000 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards/nodes.yaml +++ /dev/null @@ -1,1383 +0,0 @@ -{{- /* -Generated from 'nodes' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/grafana-dashboardDefinitions.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: {{ template "kube-prometheus-stack.namespace" . }} - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "nodes" | trunc 63 | trimSuffix "-" }} - annotations: -{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }} - labels: - {{- if $.Values.grafana.sidecar.dashboards.label }} - {{ $.Values.grafana.sidecar.dashboards.label }}: "1" - {{- end }} - app: {{ template "kube-prometheus-stack.name" $ }}-grafana -{{ include "kube-prometheus-stack.labels" $ | indent 4 }} -data: - nodes.json: |- - { - "__inputs": [ - - ], - "__requires": [ - - ], - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "", - "rows": [ - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 2, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(node_load1{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "load 1m", - "refId": "A" - }, - { - "expr": "max(node_load5{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "load 5m", - "refId": "B" - }, - { - "expr": "max(node_load15{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "load 15m", - "refId": "C" - }, - { - "expr": "count(node_cpu_seconds_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\", mode=\"user\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "logical cores", - "refId": "D" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "System load", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 3, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (cpu) (irate(node_cpu_seconds_total{cluster=\"$cluster\", job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}cpu{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Usage Per Core", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 4, - "legend": { - "alignAsTable": "true", - "avg": "true", - "current": "true", - "max": "false", - "min": "false", - "rightSide": "true", - "show": "true", - "total": "false", - "values": "true" - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max (sum by (cpu) (irate(node_cpu_seconds_total{cluster=\"$cluster\", job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m])) ) * 100\n", - "format": "time_series", - "intervalFactor": 10, - "legendFormat": "{{`{{`}} cpu {{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Utilization", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percent", - "label": null, - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "percent", - "label": null, - "logBase": 1, - "max": 100, - "min": 0, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "$datasource", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "avg(sum by (cpu) (irate(node_cpu_seconds_total{cluster=\"$cluster\", job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "80, 90", - "title": "CPU Usage", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 6, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(\n node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n)\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory used", - "refId": "A" - }, - { - "expr": "max(node_memory_Buffers_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory buffers", - "refId": "B" - }, - { - "expr": "max(node_memory_Cached_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory cached", - "refId": "C" - }, - { - "expr": "max(node_memory_MemFree_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "memory free", - "refId": "D" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "$datasource", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(\n (\n (\n node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "80, 90", - "title": "Memory Usage", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 8, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - { - "alias": "read", - "yaxis": 1 - }, - { - "alias": "io time", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(rate(node_disk_read_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}[2m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "read", - "refId": "A" - }, - { - "expr": "max(rate(node_disk_written_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}[2m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "written", - "refId": "B" - }, - { - "expr": "max(rate(node_disk_io_time_seconds_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}[2m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "io time", - "refId": "C" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk I/O", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 9, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max by (namespace, pod, device) ((node_filesystem_size_bytes{cluster=\"$cluster\", fstype=~\"ext[234]|btrfs|xfs|zfs\", instance=\"$instance\", job=\"node-exporter\"} - node_filesystem_avail_bytes{cluster=\"$cluster\", fstype=~\"ext[234]|btrfs|xfs|zfs\", instance=\"$instance\", job=\"node-exporter\"}) / node_filesystem_size_bytes{cluster=\"$cluster\", fstype=~\"ext[234]|btrfs|xfs|zfs\", instance=\"$instance\", job=\"node-exporter\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "disk used", - "refId": "A" - }, - { - "expr": "max by (namespace, pod, device) (node_filesystem_avail_bytes{cluster=\"$cluster\", fstype=~\"ext[234]|btrfs|xfs|zfs\", instance=\"$instance\", job=\"node-exporter\"} / node_filesystem_size_bytes{cluster=\"$cluster\", fstype=~\"ext[234]|btrfs|xfs|zfs\", instance=\"$instance\", job=\"node-exporter\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "disk free", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Disk Space Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 10, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(rate(node_network_receive_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}device{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Network Received", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 11, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(rate(node_network_transmit_bytes_total{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\", device!~\"lo\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{`{{`}}device{{`}}`}}", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Network Transmitted", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 12, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(\n node_filesystem_files{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_filesystem_files_free{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n)\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "inodes used", - "refId": "A" - }, - { - "expr": "max(node_filesystem_files_free{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "inodes free", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Inodes Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "$datasource", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 13, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(\n (\n (\n node_filesystem_files{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n - node_filesystem_files_free{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_filesystem_files{cluster=\"$cluster\", job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "80, 90", - "title": "Inodes Usage", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "instance", - "options": [ - - ], - "query": "label_values(node_boot_time_seconds{cluster=\"$cluster\", job=\"node-exporter\"}, instance)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes / Nodes", - "uid": "fa49a4706d07a042595b664c87fb33ea", - "version": 0 - } -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards/persistentvolumesusage.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards/persistentvolumesusage.yaml deleted file mode 100644 index 3c51e11787..0000000000 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards/persistentvolumesusage.yaml +++ /dev/null @@ -1,573 +0,0 @@ -{{- /* -Generated from 'persistentvolumesusage' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/grafana-dashboardDefinitions.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: {{ template "kube-prometheus-stack.namespace" . }} - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "persistentvolumesusage" | trunc 63 | trimSuffix "-" }} - annotations: -{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }} - labels: - {{- if $.Values.grafana.sidecar.dashboards.label }} - {{ $.Values.grafana.sidecar.dashboards.label }}: "1" - {{- end }} - app: {{ template "kube-prometheus-stack.name" $ }}-grafana -{{ include "kube-prometheus-stack.labels" $ | indent 4 }} -data: - persistentvolumesusage.json: |- - { - "__inputs": [ - - ], - "__requires": [ - - ], - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "", - "rows": [ - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 2, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": true, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "(\n sum without(instance, node) (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Used Space", - "refId": "A" - }, - { - "expr": "sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Free Space", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Volume Space Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "$datasource", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(\n kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n -\n kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "80, 90", - "title": "Volume Space Usage", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 4, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": true, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 9, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Used inodes", - "refId": "A" - }, - { - "expr": "(\n sum without(instance, node) (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": " Free inodes", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Volume inodes Usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "$datasource", - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "80, 90", - "title": "Volume inodes Usage", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kubelet_volume_stats_capacity_bytes, cluster)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\"}, namespace)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "PersistentVolumeClaim", - "multi": false, - "name": "volume", - "options": [ - - ], - "query": "label_values(kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\"}, persistentvolumeclaim)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-7d", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes / Persistent Volumes", - "uid": "919b92a8e8041bd567af9edab12c840c", - "version": 0 - } -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/grafana/dashboards/statefulset.yaml b/charts/kube-prometheus-stack/templates/grafana/dashboards/statefulset.yaml deleted file mode 100644 index 04b563be59..0000000000 --- a/charts/kube-prometheus-stack/templates/grafana/dashboards/statefulset.yaml +++ /dev/null @@ -1,926 +0,0 @@ -{{- /* -Generated from 'statefulset' from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/grafana-dashboardDefinitions.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.grafana.enabled .Values.grafana.defaultDashboardsEnabled }} -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: {{ template "kube-prometheus-stack.namespace" . }} - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "statefulset" | trunc 63 | trimSuffix "-" }} - annotations: -{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }} - labels: - {{- if $.Values.grafana.sidecar.dashboards.label }} - {{ $.Values.grafana.sidecar.dashboards.label }}: "1" - {{- end }} - app: {{ template "kube-prometheus-stack.name" $ }}-grafana -{{ include "kube-prometheus-stack.labels" $ | indent 4 }} -data: - statefulset.json: |- - { - "__inputs": [ - - ], - "__requires": [ - - ], - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "", - "rows": [ - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 2, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "cores", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "CPU", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 3, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "GB", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}) / 1024^3", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Memory", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 4, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "Bps", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", pod_name=~\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\",pod_name=~\"$statefulset.*\"}[3m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Network", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "height": "100px", - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 5, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Desired Replicas", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 6, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Replicas of current version", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 7, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Observed Generation", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - - }, - "id": 8, - "interval": null, - "links": [ - - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Metadata Generation", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "0", - "value": "null" - } - ], - "valueName": "current" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "gridPos": { - - }, - "id": 9, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "replicas specified", - "refId": "A" - }, - { - "expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "replicas created", - "refId": "B" - }, - { - "expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "ready", - "refId": "C" - }, - { - "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "replicas of current version", - "refId": "D" - }, - { - "expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\", cluster=\"$cluster\", namespace=\"$namespace\"}) without (instance, pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "updated", - "refId": "E" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Replicas", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": {{ if .Values.grafana.sidecar.dashboards.multicluster }}0{{ else }}2{{ end }}, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_statefulset_metadata_generation, cluster)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\"}, namespace)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "Name", - "multi": false, - "name": "statefulset", - "options": [ - - ], - "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", namespace=\"$namespace\"}, statefulset)", - "refresh": 2, - "regex": "", - "sort": 0, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Kubernetes / StatefulSets", - "uid": "a31c1f46e6f727cb37c0d731a7245005", - "version": 0 - } -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-createSecret.yaml b/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-createSecret.yaml index f8afcb854c..de81974421 100644 --- a/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-createSecret.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-createSecret.yaml @@ -58,8 +58,8 @@ spec: tolerations: {{ toYaml . | indent 8 }} {{- end }} +{{- if .Values.prometheusOperator.admissionWebhooks.patch.securityContext }} securityContext: - runAsGroup: 2000 - runAsNonRoot: true - runAsUser: 2000 +{{ toYaml .Values.prometheusOperator.admissionWebhooks.patch.securityContext | indent 8 }} +{{- end }} {{- end }} diff --git a/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-patchWebhook.yaml b/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-patchWebhook.yaml index b2d8912f88..23dc9f43c6 100644 --- a/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-patchWebhook.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/job-patch/job-patchWebhook.yaml @@ -59,8 +59,8 @@ spec: tolerations: {{ toYaml . | indent 8 }} {{- end }} +{{- if .Values.prometheusOperator.admissionWebhooks.patch.securityContext }} securityContext: - runAsGroup: 2000 - runAsNonRoot: true - runAsUser: 2000 +{{ toYaml .Values.prometheusOperator.admissionWebhooks.patch.securityContext | indent 8 }} +{{- end }} {{- end }} diff --git a/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml b/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml index b67df54bf0..f42e33e0d0 100644 --- a/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml @@ -5,8 +5,8 @@ metadata: name: {{ template "kube-prometheus-stack.fullname" . }}-admission {{- if .Values.prometheusOperator.admissionWebhooks.certManager.enabled }} annotations: - certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} - cert-manager.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} + certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} + cert-manager.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} {{- end }} labels: app: {{ template "kube-prometheus-stack.name" $ }}-admission diff --git a/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/validatingWebhookConfiguration.yaml b/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/validatingWebhookConfiguration.yaml index 249488e417..1439ed54e6 100644 --- a/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/validatingWebhookConfiguration.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/validatingWebhookConfiguration.yaml @@ -5,8 +5,8 @@ metadata: name: {{ template "kube-prometheus-stack.fullname" . }}-admission {{- if .Values.prometheusOperator.admissionWebhooks.certManager.enabled }} annotations: - certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} - cert-manager.io/inject-ca-from: {{ printf "%s/%s-root-cert" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} + certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} + cert-manager.io/inject-ca-from: {{ printf "%s/%s-admission" .Release.Namespace (include "kube-prometheus-stack.fullname" .) | quote }} {{- end }} labels: app: {{ template "kube-prometheus-stack.name" $ }}-admission diff --git a/charts/kube-prometheus-stack/templates/prometheus/ingress.yaml b/charts/kube-prometheus-stack/templates/prometheus/ingress.yaml index 67f6ecea2c..3992789ba0 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/ingress.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/ingress.yaml @@ -5,6 +5,7 @@ {{- $routePrefix := list .Values.prometheus.prometheusSpec.routePrefix -}} {{- $paths := .Values.prometheus.ingress.paths | default $routePrefix -}} {{- $apiIsStable := eq (include "kube-prometheus-stack.ingress.isStable" .) "true" -}} + {{- $ingressSupportsPathType := eq (include "kube-prometheus-stack.ingress.supportsPathType" .) "true" -}} apiVersion: {{ include "kube-prometheus-stack.ingress.apiVersion" . }} kind: Ingress metadata: @@ -34,7 +35,7 @@ spec: paths: {{- range $p := $paths }} - path: {{ tpl $p $ }} - {{- if $pathType }} + {{- if and $pathType $ingressSupportsPathType }} pathType: {{ $pathType }} {{- end }} backend: @@ -54,7 +55,7 @@ spec: paths: {{- range $p := $paths }} - path: {{ tpl $p $ }} - {{- if $pathType }} + {{- if and $pathType $ingressSupportsPathType }} pathType: {{ $pathType }} {{- end }} backend: diff --git a/charts/kube-prometheus-stack/templates/prometheus/ingressThanosSidecar.yaml b/charts/kube-prometheus-stack/templates/prometheus/ingressThanosSidecar.yaml index 5a4d6e194d..ace4058678 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/ingressThanosSidecar.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/ingressThanosSidecar.yaml @@ -5,6 +5,7 @@ {{- $routePrefix := list .Values.prometheus.prometheusSpec.routePrefix }} {{- $paths := .Values.prometheus.thanosIngress.paths | default $routePrefix -}} {{- $apiIsStable := eq (include "kube-prometheus-stack.ingress.isStable" .) "true" -}} +{{- $ingressSupportsPathType := eq (include "kube-prometheus-stack.ingress.supportsPathType" .) "true" -}} apiVersion: {{ include "kube-prometheus-stack.ingress.apiVersion" . }} kind: Ingress metadata: @@ -33,7 +34,7 @@ spec: paths: {{- range $p := $paths }} - path: {{ tpl $p $ }} - {{- if $pathType }} + {{- if and $pathType $ingressSupportsPathType }} pathType: {{ $pathType }} {{- end }} backend: @@ -53,7 +54,7 @@ spec: paths: {{- range $p := $paths }} - path: {{ tpl $p $ }} - {{- if $pathType }} + {{- if and $pathType $ingressSupportsPathType }} pathType: {{ $pathType }} {{- end }} backend: diff --git a/charts/kube-prometheus-stack/templates/prometheus/ingressperreplica.yaml b/charts/kube-prometheus-stack/templates/prometheus/ingressperreplica.yaml index a89c1a98b9..df631993ba 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/ingressperreplica.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/ingressperreplica.yaml @@ -4,6 +4,7 @@ {{- $servicePort := .Values.prometheus.servicePerReplica.port -}} {{- $ingressValues := .Values.prometheus.ingressPerReplica -}} {{- $apiIsStable := eq (include "kube-prometheus-stack.ingress.isStable" .) "true" -}} +{{- $ingressSupportsPathType := eq (include "kube-prometheus-stack.ingress.supportsPathType" .) "true" -}} apiVersion: v1 kind: List metadata: @@ -12,7 +13,7 @@ metadata: items: {{ range $i, $e := until $count }} - kind: Ingress - apiVersion: {{ include "kube-prometheus-stack.ingress.apiVersion" . }} + apiVersion: {{ include "kube-prometheus-stack.ingress.apiVersion" $ }} metadata: name: {{ include "kube-prometheus-stack.fullname" $ }}-prometheus-{{ $i }} namespace: {{ template "kube-prometheus-stack.namespace" $ }} @@ -38,7 +39,7 @@ items: paths: {{- range $p := $ingressValues.paths }} - path: {{ tpl $p $ }} - {{- if $pathType }} + {{- if and $pathType $ingressSupportsPathType }} pathType: {{ $pathType }} {{- end }} backend: diff --git a/charts/kube-prometheus-stack/templates/prometheus/podDisruptionBudget.yaml b/charts/kube-prometheus-stack/templates/prometheus/podDisruptionBudget.yaml index 573317a32f..cce4a855c0 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/podDisruptionBudget.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/podDisruptionBudget.yaml @@ -16,6 +16,6 @@ spec: {{- end }} selector: matchLabels: - app: prometheus + app.kubernetes.io/name: prometheus prometheus: {{ template "kube-prometheus-stack.fullname" . }}-prometheus {{- end }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/prometheus.yaml b/charts/kube-prometheus-stack/templates/prometheus/prometheus.yaml index 7b47e3846e..6f00a8fa15 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/prometheus.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/prometheus.yaml @@ -218,7 +218,7 @@ spec: - topologyKey: {{ .Values.prometheus.prometheusSpec.podAntiAffinityTopologyKey }} labelSelector: matchExpressions: - - {key: app, operator: In, values: [prometheus]} + - {key: app.kubernetes.io/name, operator: In, values: [prometheus]} - {key: prometheus, operator: In, values: [{{ template "kube-prometheus-stack.fullname" . }}-prometheus]} {{- else if eq .Values.prometheus.prometheusSpec.podAntiAffinity "soft" }} podAntiAffinity: @@ -228,7 +228,7 @@ spec: topologyKey: {{ .Values.prometheus.prometheusSpec.podAntiAffinityTopologyKey }} labelSelector: matchExpressions: - - {key: app, operator: In, values: [prometheus]} + - {key: app.kubernetes.io/name, operator: In, values: [prometheus]} - {key: prometheus, operator: In, values: [{{ template "kube-prometheus-stack.fullname" . }}-prometheus]} {{- end }} {{- end }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml index 387a677152..49e78b5e05 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/alertmanager-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -26,41 +26,146 @@ spec: groups: - name: alertmanager.rules rules: - - alert: AlertmanagerConfigInconsistent + - alert: AlertmanagerFailedReload annotations: - message: 'The configuration of the instances of the Alertmanager cluster `{{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.service {{`}}`}}` are out of sync. - - {{`{{`}} range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query {{`}}`}} - - Configuration hash for pod {{`{{`}} .Labels.pod {{`}}`}} is "{{`{{`}} printf "%.f" .Value {{`}}`}}" - - {{`{{`}} end {{`}}`}} - - ' - expr: count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})) != 1 - for: 5m + description: Configuration has failed to load for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerfailedreload + summary: Reloading an Alertmanager configuration has failed. + expr: |- + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0 + for: 10m labels: severity: critical {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} - - alert: AlertmanagerFailedReload + - alert: AlertmanagerMembersInconsistent annotations: - message: Reloading Alertmanager's configuration has failed for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}. - expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0 + description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} has only found {{`{{`}} $value {{`}}`}} members of the {{`{{`}}$labels.job{{`}}`}} cluster. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagermembersinconsistent + summary: A member of an Alertmanager cluster has not found all other cluster members. + expr: |- + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) + < on (namespace,service) group_left + count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])) for: 10m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} failed to send {{`{{`}} $value | humanizePercentage {{`}}`}} of notifications to {{`{{`}} $labels.integration {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerfailedtosendalerts + summary: An Alertmanager instance failed to send notifications. + expr: |- + ( + rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) + / + rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) + ) + > 0.01 + for: 5m labels: severity: warning {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} - - alert: AlertmanagerMembersInconsistent + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. + expr: |- + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. + expr: |- + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: AlertmanagerConfigInconsistent + annotations: + description: Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have different configurations. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerconfiginconsistent + summary: Alertmanager instances within the same cluster have different configurations. + expr: |- + count by (namespace,service) ( + count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) + ) + != 1 + for: 20m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: AlertmanagerClusterDown + annotations: + description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have been up for less than half of the last 5m.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterdown + summary: Half or more of the Alertmanager instances within the same cluster are down. + expr: |- + ( + count by (namespace,service) ( + avg_over_time(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) < 0.5 + ) + / + count by (namespace,service) ( + up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: AlertmanagerClusterCrashlooping annotations: - message: Alertmanager has not found all other members of the cluster. + description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have restarted at least 5 times in the last 10m.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclustercrashlooping + summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. expr: |- - alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} - != on (service) GROUP_LEFT() - count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) + ( + count by (namespace,service) ( + changes(process_start_time_seconds{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[10m]) > 4 + ) + / + count by (namespace,service) ( + up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} + ) + ) + >= 0.5 for: 5m labels: severity: critical diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml index 80771f4f8b..e41175f56d 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -26,7 +26,9 @@ spec: rules: - alert: TargetDown annotations: - message: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.' + description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-targetdown + summary: One or more targets are unreachable. expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 for: 10m labels: @@ -36,7 +38,7 @@ spec: {{- end }} - alert: Watchdog annotations: - message: 'This is an alert meant to ensure that the entire alerting pipeline is functional. + description: 'This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager @@ -47,6 +49,8 @@ spec: "DeadMansSnitch" integration in PagerDuty. ' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-watchdog + summary: An alert that should always be firing to certify that Alertmanager is working properly. expr: vector(1) labels: severity: none diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml index 011a4a77fe..b7863cbf9d 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -26,11 +26,11 @@ spec: rules: - expr: |- sum by (cluster, namespace, pod, container) ( - rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) + irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate - expr: |- container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml index 7b00b54a7f..8dda6b7a71 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kube-apiserver-availability.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kube-apiserver-availability.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -29,56 +29,56 @@ spec: 1 - ( ( # write too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) ) + ( # read too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) - ( ( - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) or vector(0) ) + - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) ) ) + # errors - sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) + sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) ) / - sum(code:apiserver_request_total:increase30d) + sum by (cluster) (code:apiserver_request_total:increase30d) labels: verb: all record: apiserver_request:availability30d - expr: |- 1 - ( - sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) - ( # too slow ( - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) or vector(0) ) + - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) ) + # errors - sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) ) / - sum(code:apiserver_request_total:increase30d{verb="read"}) + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"}) labels: verb: read record: apiserver_request:availability30d @@ -86,74 +86,74 @@ spec: 1 - ( ( # too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) ) + # errors - sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) ) / - sum(code:apiserver_request_total:increase30d{verb="write"}) + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"}) labels: verb: write record: apiserver_request:availability30d - expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 record: code_verb:apiserver_request_total:increase30d - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) labels: verb: read record: code:apiserver_request_total:increase30d - - expr: sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) labels: verb: write record: code:apiserver_request_total:increase30d diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml index 0f44ccc104..22ede0519a 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kube-apiserver-slos' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kube-apiserver-slos' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml index eddc1e40f1..4bb373d64e 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -28,26 +28,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) labels: verb: read record: apiserver_request:burnrate1d @@ -55,26 +55,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) labels: verb: read record: apiserver_request:burnrate1h @@ -82,26 +82,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) labels: verb: read record: apiserver_request:burnrate2h @@ -109,26 +109,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) labels: verb: read record: apiserver_request:burnrate30m @@ -136,26 +136,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) labels: verb: read record: apiserver_request:burnrate3d @@ -163,26 +163,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: verb: read record: apiserver_request:burnrate5m @@ -190,26 +190,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) labels: verb: read record: apiserver_request:burnrate6h @@ -217,15 +217,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) labels: verb: write record: apiserver_request:burnrate1d @@ -233,15 +233,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) labels: verb: write record: apiserver_request:burnrate1h @@ -249,15 +249,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) labels: verb: write record: apiserver_request:burnrate2h @@ -265,15 +265,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) labels: verb: write record: apiserver_request:burnrate30m @@ -281,15 +281,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) labels: verb: write record: apiserver_request:burnrate3d @@ -297,15 +297,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write record: apiserver_request:burnrate5m @@ -313,32 +313,32 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) labels: verb: write record: apiserver_request:burnrate6h - - expr: sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: verb: read record: code_resource:apiserver_request_total:rate5m - - expr: sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write record: code_resource:apiserver_request_total:rate5m - - expr: histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 + - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 labels: quantile: '0.99' verb: read record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 + - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 labels: quantile: '0.99' verb: write diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml index e54bee5879..1b95d21476 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kube-prometheus-general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'kube-prometheus-general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml index 27271f1b58..2d38ef8655 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml index 24df268bb1..4e44196445 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml index 0fa5032ba7..f7e0c439e3 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-state-metrics-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -55,5 +55,33 @@ spec: severity: critical {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: KubeStateMetricsShardingMismatch + annotations: + description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardingmismatch + summary: kube-state-metrics sharding is misconfigured. + expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0 + for: 15m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: KubeStateMetricsShardsMissing + annotations: + description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsshardsmissing + summary: kube-state-metrics shards are missing. + expr: |- + 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1 + - + sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) + != 0 + for: 15m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} {{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml index 8712b9ff5e..a073214645 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml index 77bb40a1ee..83f2488aa3 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -30,7 +30,7 @@ spec: description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 10 minutes. runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping summary: Pod is crash looping. - expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) * 60 * 5 > 0 + expr: increase(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) > 0 for: 15m labels: severity: warning diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml index 27babbd378..7a2ecbc3d0 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml index 527e6e3088..3e06766a39 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml index c3110cfb35..96945b8237 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml index 3d1ace13b7..07ff37a2d0 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-system-controller-manager' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-system-controller-manager' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml index 5671b1c7c4..1ea531f24a 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml index 098f6fb5ef..47376e3bc2 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-system-scheduler' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-system-scheduler' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml index ea2f2589ce..657e5c23cd 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml index ddb7376473..f734e8dcb0 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/node-exporter-prometheusRule.yaml +Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/node-exporter-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -33,9 +33,9 @@ spec: record: instance:node_num_cpu:sum - expr: |- 1 - avg without (cpu, mode) ( - rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m]) + rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m]) ) - record: instance:node_cpu_utilisation:rate1m + record: instance:node_cpu_utilisation:rate5m - expr: |- ( node_load1{job="node-exporter"} @@ -50,30 +50,30 @@ spec: node_memory_MemTotal_bytes{job="node-exporter"} ) record: instance:node_memory_utilisation:ratio - - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) - record: instance:node_vmstat_pgmajfault:rate1m - - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_seconds:rate1m - - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_weighted_seconds:rate1m + - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) + record: instance:node_vmstat_pgmajfault:rate5m + - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) + record: instance_device:node_disk_io_time_seconds:rate5m + - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) + record: instance_device:node_disk_io_time_weighted_seconds:rate5m - expr: |- sum without (device) ( - rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_receive_bytes_excluding_lo:rate1m + record: instance:node_network_receive_bytes_excluding_lo:rate5m - expr: |- sum without (device) ( - rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_transmit_bytes_excluding_lo:rate1m + record: instance:node_network_transmit_bytes_excluding_lo:rate5m - expr: |- sum without (device) ( - rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_receive_drop_excluding_lo:rate1m + record: instance:node_network_receive_drop_excluding_lo:rate5m - expr: |- sum without (device) ( - rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_transmit_drop_excluding_lo:rate1m + record: instance:node_network_transmit_drop_excluding_lo:rate5m {{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml index 3be497c1f0..d69a6350a8 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'node-exporter' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/node-exporter-prometheusRule.yaml +Generated from 'node-exporter' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/node-exporter-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -27,6 +27,7 @@ spec: - alert: NodeFilesystemSpaceFillingUp annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup summary: Filesystem is predicted to run out of space within the next 24 hours. expr: |- ( @@ -45,6 +46,7 @@ spec: - alert: NodeFilesystemSpaceFillingUp annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup summary: Filesystem is predicted to run out of space within the next 4 hours. expr: |- ( @@ -63,6 +65,7 @@ spec: - alert: NodeFilesystemAlmostOutOfSpace annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace summary: Filesystem has less than 5% space left. expr: |- ( @@ -79,6 +82,7 @@ spec: - alert: NodeFilesystemAlmostOutOfSpace annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace summary: Filesystem has less than 3% space left. expr: |- ( @@ -95,6 +99,7 @@ spec: - alert: NodeFilesystemFilesFillingUp annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup summary: Filesystem is predicted to run out of inodes within the next 24 hours. expr: |- ( @@ -113,6 +118,7 @@ spec: - alert: NodeFilesystemFilesFillingUp annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup summary: Filesystem is predicted to run out of inodes within the next 4 hours. expr: |- ( @@ -131,6 +137,7 @@ spec: - alert: NodeFilesystemAlmostOutOfFiles annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles summary: Filesystem has less than 5% inodes left. expr: |- ( @@ -147,6 +154,7 @@ spec: - alert: NodeFilesystemAlmostOutOfFiles annotations: description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles summary: Filesystem has less than 3% inodes left. expr: |- ( @@ -163,6 +171,7 @@ spec: - alert: NodeNetworkReceiveErrs annotations: description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkreceiveerrs summary: Network interface is reporting many receive errors. expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 for: 1h @@ -174,6 +183,7 @@ spec: - alert: NodeNetworkTransmitErrs annotations: description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworktransmiterrs summary: Network interface is reporting many transmit errors. expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 for: 1h @@ -185,6 +195,7 @@ spec: - alert: NodeHighNumberConntrackEntriesUsed annotations: description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodehighnumberconntrackentriesused summary: Number of conntrack are getting close to the limit. expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 labels: @@ -195,6 +206,7 @@ spec: - alert: NodeTextFileCollectorScrapeError annotations: description: Node Exporter text file collector failed to scrape. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodetextfilecollectorscrapeerror summary: Node Exporter text file collector failed to scrape. expr: node_textfile_scrape_error{job="node-exporter"} == 1 labels: @@ -204,7 +216,8 @@ spec: {{- end }} - alert: NodeClockSkewDetected annotations: - message: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. + description: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclockskewdetected summary: Clock skew detected. expr: |- ( @@ -226,7 +239,8 @@ spec: {{- end }} - alert: NodeClockNotSynchronising annotations: - message: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host. + description: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclocknotsynchronising summary: Clock not synchronising. expr: |- min_over_time(node_timex_sync_status[5m]) == 0 @@ -241,6 +255,7 @@ spec: - alert: NodeRAIDDegraded annotations: description: RAID array '{{`{{`}} $labels.device {{`}}`}}' on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddegraded summary: RAID Array is degraded expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 for: 15m @@ -252,8 +267,9 @@ spec: - alert: NodeRAIDDiskFailure annotations: description: At least one device in RAID array on {{`{{`}} $labels.instance {{`}}`}} failed. Array '{{`{{`}} $labels.device {{`}}`}}' needs attention and possibly a disk swap. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-noderaiddiskfailure summary: Failed device in RAID array - expr: node_md_disks{state="fail"} > 0 + expr: node_md_disks{state="failed"} > 0 labels: severity: warning {{- if .Values.defaultRules.additionalRuleLabels }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml index 9a6955ae97..8964334d8e 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -26,7 +26,8 @@ spec: rules: - alert: NodeNetworkInterfaceFlapping annotations: - message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing it's up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}" + message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing it's up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkinterfaceflapping expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 for: 2m labels: diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml index f24c5550ba..c6dd9e1848 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml +Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml index d1c1f6545a..d3010bcd98 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-operator-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml index c9c805eeaa..52cc0d8b2f 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml @@ -1,5 +1,5 @@ {{- /* -Generated from 'prometheus' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/prometheus-rules.yaml +Generated from 'prometheus' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} @@ -29,6 +29,7 @@ spec: - alert: PrometheusBadConfig annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to reload its configuration. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusbadconfig summary: Failed Prometheus configuration reload. expr: |- # Without max_over_time, failed scrapes could create false negatives, see @@ -43,6 +44,7 @@ spec: - alert: PrometheusNotificationQueueRunningFull annotations: description: Alert notification queue of Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is running full. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotificationqueuerunningfull summary: Prometheus alert notification queue predicted to run full in less than 30m. expr: |- # Without min_over_time, failed scrapes could create false negatives, see @@ -61,6 +63,7 @@ spec: - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuserrorsendingalertstosomealertmanagers summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. expr: |- ( @@ -75,28 +78,11 @@ spec: severity: warning {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusErrorSendingAlertsToAnyAlertmanager - annotations: - description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.' - summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. - expr: |- - min without(alertmanager) ( - rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) - / - rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) - ) - * 100 - > 3 - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} - alert: PrometheusNotConnectedToAlertmanagers annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not connected to any Alertmanagers. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotconnectedtoalertmanagers summary: Prometheus is not connected to any Alertmanagers. expr: |- # Without max_over_time, failed scrapes could create false negatives, see @@ -111,6 +97,7 @@ spec: - alert: PrometheusTSDBReloadsFailing annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} reload failures over the last 3h. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustsdbreloadsfailing summary: Prometheus has issues reloading blocks from disk. expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0 for: 4h @@ -122,6 +109,7 @@ spec: - alert: PrometheusTSDBCompactionsFailing annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last 3h. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustsdbcompactionsfailing summary: Prometheus has issues compacting blocks. expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0 for: 4h @@ -133,8 +121,18 @@ spec: - alert: PrometheusNotIngestingSamples annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not ingesting samples. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusnotingestingsamples summary: Prometheus is not ingesting samples. - expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0 + expr: |- + ( + rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0 + and + ( + sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0 + or + sum without(rule_group) (prometheus_rule_group_rules{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0 + ) + ) for: 10m labels: severity: warning @@ -144,6 +142,7 @@ spec: - alert: PrometheusDuplicateTimestamps annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with different values but duplicated timestamp. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusduplicatetimestamps summary: Prometheus is dropping samples with duplicate timestamps. expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 10m @@ -155,6 +154,7 @@ spec: - alert: PrometheusOutOfOrderTimestamps annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with timestamps arriving out of order. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoutofordertimestamps summary: Prometheus drops samples with out-of-order timestamps. expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 10m @@ -166,15 +166,16 @@ spec: - alert: PrometheusRemoteStorageFailures annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} failed to send {{`{{`}} printf "%.1f" $value {{`}}`}}% of the samples to {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}} + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotestoragefailures summary: Prometheus fails to send samples to remote storage. expr: |- ( - rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + (rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])) / ( - rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + (rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])) + - rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + (rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])) ) ) * 100 @@ -188,13 +189,14 @@ spec: - alert: PrometheusRemoteWriteBehind annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write is {{`{{`}} printf "%.1f" $value {{`}}`}}s behind for {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotewritebehind summary: Prometheus remote write is behind. expr: |- # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) - - on(job, instance) group_right + - ignoring(remote_name, url) group_right max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) ) > 120 @@ -207,6 +209,7 @@ spec: - alert: PrometheusRemoteWriteDesiredShards annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write desired shards calculation wants to run {{`{{`}} $value {{`}}`}} shards for queue {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}, which is more than the max of {{`{{`}} printf `prometheus_remote_storage_shards_max{instance="%s",job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}` $labels.instance | query | first | value {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusremotewritedesiredshards summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. expr: |- # Without max_over_time, failed scrapes could create false negatives, see @@ -225,6 +228,7 @@ spec: - alert: PrometheusRuleFailures annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to evaluate {{`{{`}} printf "%.0f" $value {{`}}`}} rules in the last 5m. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusrulefailures summary: Prometheus is failing rule evaluations. expr: increase(prometheus_rule_evaluation_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 15m @@ -236,6 +240,7 @@ spec: - alert: PrometheusMissingRuleEvaluations annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has missed {{`{{`}} printf "%.0f" $value {{`}}`}} rule group evaluations in the last 5m. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusmissingruleevaluations summary: Prometheus is missing rule evaluations due to slow rule group evaluation. expr: increase(prometheus_rule_group_iterations_missed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 15m @@ -247,6 +252,7 @@ spec: - alert: PrometheusTargetLimitHit annotations: description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because the number of targets exceeded the configured target_limit. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheustargetlimithit summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 15m @@ -254,5 +260,36 @@ spec: severity: warning {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: PrometheusLabelLimitHit + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuslabellimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit. + expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 + for: 15m + labels: + severity: warning +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} +{{- end }} + - alert: PrometheusErrorSendingAlertsToAnyAlertmanager + annotations: + description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheuserrorsendingalertstoanyalertmanager + summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. + expr: |- + min without (alertmanager) ( + rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m]) + / + rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m]) + ) + * 100 + > 3 + for: 15m + labels: + severity: critical +{{- if .Values.defaultRules.additionalRuleLabels }} +{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} {{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/alertmanager.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/alertmanager.rules.yaml deleted file mode 100644 index 71159849c0..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/alertmanager.rules.yaml +++ /dev/null @@ -1,63 +0,0 @@ -{{- /* -Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }} -{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }} -{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }} -{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: alertmanager.rules - rules: - - alert: AlertmanagerConfigInconsistent - annotations: - message: The configuration of the instances of the Alertmanager cluster `{{`{{`}}$labels.service{{`}}`}}` are out of sync. - expr: count_values("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="{{ $operatorJob }}",namespace="{{ $namespace }}",controller="alertmanager"}) by (name, job, namespace, controller), "service", "$1", "name", "(.*)") != 1 - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: AlertmanagerFailedReload - annotations: - message: Reloading Alertmanager's configuration has failed for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}. - expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: AlertmanagerMembersInconsistent - annotations: - message: Alertmanager has not found all other members of the cluster. - expr: |- - alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} - != on (service) GROUP_LEFT() - count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/etcd.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/etcd.yaml deleted file mode 100644 index ce4e87bf45..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/etcd.yaml +++ /dev/null @@ -1,179 +0,0 @@ -{{- /* -Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/website/master/content/en/docs/v3.4/op-guide/etcd3_alert.rules.yml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: etcd - rules: - - alert: etcdInsufficientMembers - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).' - expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2) - for: 3m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdNoLeader - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.' - expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 - for: 1m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfLeaderChanges - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": instance {{`{{`}} $labels.instance {{`}}`}} has seen {{`{{`}} $value {{`}}`}} leader changes within the last hour.' - expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfFailedGRPCRequests - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) - / - sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method) - > 1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfFailedGRPCRequests - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) - / - sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method) - > 5 - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdGRPCRequestsSlow - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le)) - > 0.15 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdMemberCommunicationSlow - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.15 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfFailedProposals - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last hour on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighFsyncDurations - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.5 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighCommitDurations - annotations: - message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.25 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfFailedHTTPRequests - annotations: - message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}' - expr: |- - sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) - BY (method) > 0.01 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHighNumberOfFailedHTTPRequests - annotations: - message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' - expr: |- - sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) - BY (method) > 0.05 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: etcdHTTPRequestsSlow - annotations: - message: etcd instance {{`{{`}} $labels.instance {{`}}`}} HTTP requests to {{`{{`}} $labels.method {{`}}`}} are slow. - expr: |- - histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) - > 0.15 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/general.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/general.rules.yaml deleted file mode 100644 index cde6feb5c9..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/general.rules.yaml +++ /dev/null @@ -1,56 +0,0 @@ -{{- /* -Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.general }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: general.rules - rules: - - alert: TargetDown - annotations: - message: '{{`{{`}} $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}} targets are down.' - expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: Watchdog - annotations: - message: 'This is an alert meant to ensure that the entire alerting pipeline is functional. - - This alert is always firing, therefore it should always be firing in Alertmanager - - and always fire against a receiver. There are integrations with various notification - - mechanisms that send a notification when this alert is not firing. For example the - - "DeadMansSnitch" integration in PagerDuty. - - ' - expr: vector(1) - labels: - severity: none -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/k8s.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/k8s.rules.yaml deleted file mode 100644 index 08aa7fe2b3..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/k8s.rules.yaml +++ /dev/null @@ -1,83 +0,0 @@ -{{- /* -Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8s }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: k8s.rules - rules: - - expr: sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace) - record: namespace:container_cpu_usage_seconds_total:sum_rate - - expr: sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace) - record: namespace:container_memory_usage_bytes:sum - - expr: |- - sum by (namespace, pod_name, container_name) ( - rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m]) - ) - record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate - - expr: |- - sum by(namespace) ( - kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} - * on (endpoint, instance, job, namespace, pod, service) - group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1) - ) - record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum - - expr: |- - sum by (namespace) ( - kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} - * on (endpoint, instance, job, namespace, pod, service) - group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1) - ) - record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum - - expr: |- - sum( - label_replace( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, - "replicaset", "$1", "owner_name", "(.*)" - ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) by (namespace, workload, pod) - labels: - workload_type: deployment - record: mixin_pod_workload - - expr: |- - sum( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) by (namespace, workload, pod) - labels: - workload_type: daemonset - record: mixin_pod_workload - - expr: |- - sum( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) by (namespace, workload, pod) - labels: - workload_type: statefulset - record: mixin_pod_workload -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/kube-apiserver.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/kube-apiserver.rules.yaml deleted file mode 100644 index e3a9296923..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/kube-apiserver.rules.yaml +++ /dev/null @@ -1,39 +0,0 @@ -{{- /* -Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserver }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-apiserver.rules - rules: - - expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.99' - record: cluster_quantile:apiserver_request_latencies:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.9' - record: cluster_quantile:apiserver_request_latencies:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.5' - record: cluster_quantile:apiserver_request_latencies:histogram_quantile -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml deleted file mode 100644 index a8d5400cb4..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml +++ /dev/null @@ -1,47 +0,0 @@ -{{- /* -Generated from 'kube-prometheus-node-alerting.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeAlerting }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-node-alerting.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-prometheus-node-alerting.rules - rules: - - alert: NodeDiskRunningFull - annotations: - message: Device {{`{{`}} $labels.device {{`}}`}} of node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} will be full within the next 24 hours. - expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)' - for: 30m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeDiskRunningFull - annotations: - message: Device {{`{{`}} $labels.device {{`}}`}} of node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} will be full within the next 2 hours. - expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)' - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml deleted file mode 100644 index 87f072fd02..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml +++ /dev/null @@ -1,41 +0,0 @@ -{{- /* -Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeRecording }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-node-recording.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-prometheus-node-recording.rules - rules: - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance) - record: instance:node_cpu:rate:sum - - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance) - record: instance:node_filesystem_usage:sum - - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) - record: instance:node_network_receive_bytes:rate:sum - - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) - record: instance:node_network_transmit_bytes:rate:sum - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) - record: instance:node_cpu:ratio - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) - record: cluster:node_cpu:sum_rate5m - - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) - record: cluster:node_cpu:ratio -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/kube-scheduler.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/kube-scheduler.rules.yaml deleted file mode 100644 index 46c8d1d4a9..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/kube-scheduler.rules.yaml +++ /dev/null @@ -1,63 +0,0 @@ -{{- /* -Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-scheduler.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kube-scheduler.rules - rules: - - expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.99' - record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile - - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.99' - record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile - - expr: histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.99' - record: cluster_quantile:scheduler_binding_latency:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.9' - record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.9' - record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.9' - record: cluster_quantile:scheduler_binding_latency:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.5' - record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.5' - record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 - labels: - quantile: '0.5' - record: cluster_quantile:scheduler_binding_latency:histogram_quantile -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-absent.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-absent.yaml deleted file mode 100644 index 5c1ebce9ea..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-absent.yaml +++ /dev/null @@ -1,159 +0,0 @@ -{{- /* -Generated from 'kubernetes-absent' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesAbsent }} -{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }} -{{- $prometheusJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" }} -{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }} -{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-absent" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-absent - rules: -{{- if .Values.alertmanager.enabled }} - - alert: AlertmanagerDown - annotations: - message: Alertmanager has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerdown - expr: absent(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.kubeDns.enabled }} - - alert: CoreDNSDown - annotations: - message: CoreDNS has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-corednsdown - expr: absent(up{job="kube-dns"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.kubeApiServer.enabled }} - - alert: KubeAPIDown - annotations: - message: KubeAPI has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapidown - expr: absent(up{job="apiserver"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.kubeControllerManager.enabled }} - - alert: KubeControllerManagerDown - annotations: - message: KubeControllerManager has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontrollermanagerdown - expr: absent(up{job="kube-controller-manager"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.kubeScheduler.enabled }} - - alert: KubeSchedulerDown - annotations: - message: KubeScheduler has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeschedulerdown - expr: absent(up{job="kube-scheduler"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.kubeStateMetrics.enabled }} - - alert: KubeStateMetricsDown - annotations: - message: KubeStateMetrics has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsdown - expr: absent(up{job="kube-state-metrics"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.prometheusOperator.kubeletService.enabled }} - - alert: KubeletDown - annotations: - message: Kubelet has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletdown - expr: absent(up{job="kubelet"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- if .Values.nodeExporter.enabled }} - - alert: NodeExporterDown - annotations: - message: NodeExporter has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeexporterdown - expr: absent(up{job="node-exporter"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} - - alert: PrometheusDown - annotations: - message: Prometheus has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusdown - expr: absent(up{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- if .Values.prometheusOperator.enabled }} - - alert: PrometheusOperatorDown - annotations: - message: PrometheusOperator has disappeared from Prometheus target discovery. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatordown - expr: absent(up{job="{{ $operatorJob }}",namespace="{{ $namespace }}"} == 1) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-apps.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-apps.yaml deleted file mode 100644 index e7a41ca2ab..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-apps.yaml +++ /dev/null @@ -1,200 +0,0 @@ -{{- /* -Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesApps }} -{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-apps - rules: - - alert: KubePodCrashLooping - annotations: - message: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 5 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping - expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) * 60 * 5 > 0 - for: 1h - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubePodNotReady - annotations: - message: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than an hour. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodnotready - expr: sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown"}) > 0 - for: 1h - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDeploymentGenerationMismatch - annotations: - message: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentgenerationmismatch - expr: |- - kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDeploymentReplicasMismatch - annotations: - message: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than an hour. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentreplicasmismatch - expr: |- - kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - for: 1h - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeStatefulSetReplicasMismatch - annotations: - message: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetreplicasmismatch - expr: |- - kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeStatefulSetGenerationMismatch - annotations: - message: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetgenerationmismatch - expr: |- - kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeStatefulSetUpdateNotRolledOut - annotations: - message: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetupdatenotrolledout - expr: |- - max without (revision) ( - kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - unless - kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - ) - * - ( - kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - != - kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - ) - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDaemonSetRolloutStuck - annotations: - message: Only {{`{{`}} $value {{`}}`}}% of the desired Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are scheduled and ready. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetrolloutstuck - expr: |- - kube_daemonset_status_number_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - / - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} * 100 < 100 - for: 15m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDaemonSetNotScheduled - annotations: - message: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetnotscheduled - expr: |- - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - - - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeDaemonSetMisScheduled - annotations: - message: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetmisscheduled - expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeCronJobRunning - annotations: - message: CronJob {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.cronjob {{`}}`}} is taking more than 1h to complete. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecronjobrunning - expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 3600 - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeJobCompletion - annotations: - message: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than one hour to complete. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobcompletion - expr: kube_job_spec_completions{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - kube_job_status_succeeded{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeJobFailed - annotations: - message: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobfailed - expr: kube_job_status_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-resources.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-resources.yaml deleted file mode 100644 index b34b442f3b..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-resources.yaml +++ /dev/null @@ -1,121 +0,0 @@ -{{- /* -Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesResources }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-resources" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-resources - rules: - - alert: KubeCPUOvercommit - annotations: - message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuovercommit - expr: |- - sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) - / - sum(node:node_num_cpu:sum) - > - (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum) - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeMemOvercommit - annotations: - message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememovercommit - expr: |- - sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) - / - sum(node_memory_MemTotal_bytes) - > - (count(node:node_num_cpu:sum)-1) - / - count(node:node_num_cpu:sum) - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeCPUOvercommit - annotations: - message: Cluster has overcommitted CPU resource requests for Namespaces. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuovercommit - expr: |- - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) - / - sum(node:node_num_cpu:sum) - > 1.5 - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeMemOvercommit - annotations: - message: Cluster has overcommitted memory resource requests for Namespaces. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememovercommit - expr: |- - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) - / - sum(node_memory_MemTotal_bytes{job="node-exporter"}) - > 1.5 - for: 5m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeQuotaExceeded - annotations: - message: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} printf "%0.0f" $value {{`}}`}}% of its {{`{{`}} $labels.resource {{`}}`}} quota. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotaexceeded - expr: |- - 100 * kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 90 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: CPUThrottlingHigh - annotations: - message: '{{`{{`}} printf "%0.0f" $value {{`}}`}}% throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container_name {{`}}`}} in pod {{`{{`}} $labels.pod_name {{`}}`}}.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-cputhrottlinghigh - expr: |- - 100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!="", }[5m])) by (container_name, pod_name, namespace) - / - sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace) - > 25 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-storage.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-storage.yaml deleted file mode 100644 index 6469fffc52..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-storage.yaml +++ /dev/null @@ -1,72 +0,0 @@ -{{- /* -Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesStorage }} -{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-storage" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-storage - rules: - - alert: KubePersistentVolumeUsageCritical - annotations: - message: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is only {{`{{`}} printf "%0.2f" $value {{`}}`}}% free. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumeusagecritical - expr: |- - 100 * kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"} - < 3 - for: 1m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubePersistentVolumeFullInFourDays - annotations: - message: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} printf "%0.2f" $value {{`}}`}}% is available. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefullinfourdays - expr: |- - 100 * ( - kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"} - ) < 15 - and - predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"}[6h], 4 * 24 * 3600) < 0 - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubePersistentVolumeErrors - annotations: - message: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumeerrors - expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 - for: 5m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-system.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-system.yaml deleted file mode 100644 index da232057be..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/kubernetes-system.yaml +++ /dev/null @@ -1,184 +0,0 @@ -{{- /* -Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: kubernetes-system - rules: - - alert: KubeNodeNotReady - annotations: - message: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than an hour.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodenotready - expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeVersionMismatch - annotations: - message: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeversionmismatch - expr: count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1 - for: 1h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeClientErrors - annotations: - message: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} printf "%0.0f" $value {{`}}`}}% errors.' - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors - expr: |- - (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) - / - sum(rate(rest_client_requests_total[5m])) by (instance, job)) - * 100 > 1 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeClientErrors - annotations: - message: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} printf "%0.0f" $value {{`}}`}} errors / second. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors - expr: sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeletTooManyPods - annotations: - message: Kubelet {{`{{`}} $labels.instance {{`}}`}} is running {{`{{`}} $value {{`}}`}} Pods, close to the limit of 110. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubelettoomanypods - expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 - for: 15m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPILatencyHigh - annotations: - message: The API server has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapilatencyhigh - expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPILatencyHigh - annotations: - message: The API server has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapilatencyhigh - expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPIErrorsHigh - annotations: - message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh - expr: |- - sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) - / - sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPIErrorsHigh - annotations: - message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh - expr: |- - sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) - / - sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPIErrorsHigh - annotations: - message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}} {{`{{`}} $labels.subresource {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh - expr: |- - sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) - / - sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeAPIErrorsHigh - annotations: - message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}} {{`{{`}} $labels.subresource {{`}}`}}. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh - expr: |- - sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) - / - sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeClientCertificateExpiration - annotations: - message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration - expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: KubeClientCertificateExpiration - annotations: - message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. - runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration - expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/node-network.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/node-network.yaml deleted file mode 100644 index c75f1ae074..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/node-network.yaml +++ /dev/null @@ -1,57 +0,0 @@ -{{- /* -Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.network }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-network" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: node-network - rules: - - alert: NetworkReceiveErrors - annotations: - message: Network interface "{{`{{`}} $labels.device {{`}}`}}" showing receive errors on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}" - expr: rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0 - for: 2m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NetworkTransmitErrors - annotations: - message: Network interface "{{`{{`}} $labels.device {{`}}`}}" showing transmit errors on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}" - expr: rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0 - for: 2m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: NodeNetworkInterfaceFlapping - annotations: - message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing it's up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}" - expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 - for: 2m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/node-time.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/node-time.yaml deleted file mode 100644 index b7a2fc92fd..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/node-time.yaml +++ /dev/null @@ -1,37 +0,0 @@ -{{- /* -Generated from 'node-time' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.time }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-time" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: node-time - rules: - - alert: ClockSkewDetected - annotations: - message: Clock skew detected on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}. Ensure NTP is configured correctly on this host. - expr: abs(node_timex_offset_seconds{job="node-exporter"}) > 0.03 - for: 2m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/node.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/node.rules.yaml deleted file mode 100644 index 2bc7af3a97..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/node.rules.yaml +++ /dev/null @@ -1,202 +0,0 @@ -{{- /* -Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.node }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: node.rules - rules: - - expr: sum(min(kube_pod_info) by (node)) - record: ':kube_pod_info_node_count:' - - expr: max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod) - record: 'node_namespace_pod:kube_pod_info:' - - expr: |- - count by (node) (sum by (node, cpu) ( - node_cpu_seconds_total{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - )) - record: node:node_num_cpu:sum - - expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) - record: :node_cpu_utilisation:avg1m - - expr: |- - 1 - avg by (node) ( - rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info:) - record: node:node_cpu_utilisation:avg1m - - expr: |- - node:node_cpu_utilisation:avg1m - * - node:node_num_cpu:sum - / - scalar(sum(node:node_num_cpu:sum)) - record: node:cluster_cpu_utilisation:ratio - - expr: |- - sum(node_load1{job="node-exporter"}) - / - sum(node:node_num_cpu:sum) - record: ':node_cpu_saturation_load1:' - - expr: |- - sum by (node) ( - node_load1{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - / - node:node_num_cpu:sum - record: 'node:node_cpu_saturation_load1:' - - expr: |- - 1 - - sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) - / - sum(node_memory_MemTotal_bytes{job="node-exporter"}) - record: ':node_memory_utilisation:' - - expr: sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) - record: :node_memory_MemFreeCachedBuffers_bytes:sum - - expr: sum(node_memory_MemTotal_bytes{job="node-exporter"}) - record: :node_memory_MemTotal_bytes:sum - - expr: |- - sum by (node) ( - (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_bytes_available:sum - - expr: |- - sum by (node) ( - node_memory_MemTotal_bytes{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_bytes_total:sum - - expr: |- - (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) - / - node:node_memory_bytes_total:sum - record: node:node_memory_utilisation:ratio - - expr: |- - (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) - / - scalar(sum(node:node_memory_bytes_total:sum)) - record: node:cluster_memory_utilisation:ratio - - expr: |- - 1e3 * sum( - (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) - + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) - ) - record: :node_memory_swap_io_bytes:sum_rate - - expr: |- - 1 - - sum by (node) ( - (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - / - sum by (node) ( - node_memory_MemTotal_bytes{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: 'node:node_memory_utilisation:' - - expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) - record: 'node:node_memory_utilisation_2:' - - expr: |- - 1e3 * sum by (node) ( - (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) - + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_swap_io_bytes:sum_rate - - expr: avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) - record: :node_disk_utilisation:avg_irate - - expr: |- - avg by (node) ( - irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_disk_utilisation:avg_irate - - expr: avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) - record: :node_disk_saturation:avg_irate - - expr: |- - avg by (node) ( - irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_disk_saturation:avg_irate - - expr: |- - max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - record: 'node:node_filesystem_usage:' - - expr: max by (instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - record: 'node:node_filesystem_avail:' - - expr: |- - sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) + - sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) - record: :node_net_utilisation:sum_irate - - expr: |- - sum by (node) ( - (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) + - irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_net_utilisation:sum_irate - - expr: |- - sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) + - sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) - record: :node_net_saturation:sum_irate - - expr: |- - sum by (node) ( - (irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) + - irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_net_saturation:sum_irate - - expr: |- - max( - max( - kube_pod_info{job="kube-state-metrics", host_ip!=""} - ) by (node, host_ip) - * on (host_ip) group_right (node) - label_replace( - (max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" - ) - ) by (node) - record: 'node:node_inodes_total:' - - expr: |- - max( - max( - kube_pod_info{job="kube-state-metrics", host_ip!=""} - ) by (node, host_ip) - * on (host_ip) group_right (node) - label_replace( - (max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" - ) - ) by (node) - record: 'node:node_inodes_free:' -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/prometheus-operator.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/prometheus-operator.yaml deleted file mode 100644 index a8a8915b62..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/prometheus-operator.yaml +++ /dev/null @@ -1,49 +0,0 @@ -{{- /* -Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheusOperator }} -{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }} -{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus-operator" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: prometheus-operator - rules: - - alert: PrometheusOperatorReconcileErrors - annotations: - message: Errors while reconciling {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} Namespace. - expr: rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusOperatorNodeLookupErrors - annotations: - message: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace. - expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules/prometheus.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules/prometheus.rules.yaml deleted file mode 100644 index 0480c83b56..0000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules/prometheus.rules.yaml +++ /dev/null @@ -1,139 +0,0 @@ -{{- /* -Generated from 'prometheus.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheus }} -{{- $prometheusJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" }} -{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . }} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: prometheus.rules - rules: - - alert: PrometheusConfigReloadFailed - annotations: - description: Reloading Prometheus' configuration has failed for {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} - summary: Reloading Prometheus' configuration failed - expr: prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} == 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusNotificationQueueRunningFull - annotations: - description: Prometheus' alert notification queue is running full for {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} - summary: Prometheus' alert notification queue is running full - expr: predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusErrorSendingAlerts - annotations: - description: Errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.Alertmanager{{`}}`}} - summary: Errors while sending alert from Prometheus - expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.01 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusErrorSendingAlerts - annotations: - description: Errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.Alertmanager{{`}}`}} - summary: Errors while sending alerts from Prometheus - expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.03 - for: 10m - labels: - severity: critical -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusNotConnectedToAlertmanagers - annotations: - description: Prometheus {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} is not connected to any Alertmanagers - summary: Prometheus is not connected to any Alertmanagers - expr: prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} < 1 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusTSDBReloadsFailing - annotations: - description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} had {{`{{`}}$value | humanize{{`}}`}} reload failures over the last four hours.' - summary: Prometheus has issues reloading data blocks from disk - expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0 - for: 12h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusTSDBCompactionsFailing - annotations: - description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} had {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last four hours.' - summary: Prometheus has issues compacting sample blocks - expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0 - for: 12h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusTSDBWALCorruptions - annotations: - description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} has a corrupted write-ahead log (WAL).' - summary: Prometheus write-ahead log is corrupted - expr: prometheus_tsdb_wal_corruptions_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} > 0 - for: 4h - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusNotIngestingSamples - annotations: - description: Prometheus {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} isn't ingesting samples. - summary: Prometheus isn't ingesting samples - expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} - - alert: PrometheusTargetScrapesDuplicate - annotations: - description: '{{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has many samples rejected due to duplicate timestamps but different values' - summary: Prometheus has many samples rejected - expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 10m - labels: - severity: warning -{{- if .Values.defaultRules.additionalRuleLabels }} -{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/service.yaml b/charts/kube-prometheus-stack/templates/prometheus/service.yaml index 8676b81ead..c6420060ae 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/service.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/service.yaml @@ -51,7 +51,7 @@ spec: {{ toYaml .Values.prometheus.service.additionalPorts | indent 2 }} {{- end }} selector: - app: prometheus + app.kubernetes.io/name: prometheus prometheus: {{ template "kube-prometheus-stack.fullname" . }}-prometheus {{- if .Values.prometheus.service.sessionAffinity }} sessionAffinity: {{ .Values.prometheus.service.sessionAffinity }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/serviceThanosSidecar.yaml b/charts/kube-prometheus-stack/templates/prometheus/serviceThanosSidecar.yaml index 7c33379cb4..c3d52ef806 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/serviceThanosSidecar.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/serviceThanosSidecar.yaml @@ -25,6 +25,6 @@ spec: nodePort: {{ .Values.prometheus.thanosService.nodePort }} {{- end }} selector: - app: prometheus + app.kubernetes.io/name: prometheus prometheus: {{ template "kube-prometheus-stack.fullname" . }}-prometheus {{- end }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/serviceThanosSidecarExternal.yaml b/charts/kube-prometheus-stack/templates/prometheus/serviceThanosSidecarExternal.yaml index f9a0331152..99668f425d 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/serviceThanosSidecarExternal.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/serviceThanosSidecarExternal.yaml @@ -23,6 +23,6 @@ spec: nodePort: {{ .Values.prometheus.thanosServiceExternal.nodePort }} {{- end }} selector: - app: prometheus + app.kubernetes.io/name: prometheus prometheus: {{ template "kube-prometheus-stack.fullname" . }}-prometheus {{- end }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/serviceperreplica.yaml b/charts/kube-prometheus-stack/templates/prometheus/serviceperreplica.yaml index 1a55433621..470ce79f20 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/serviceperreplica.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/serviceperreplica.yaml @@ -38,7 +38,7 @@ items: port: {{ $serviceValues.port }} targetPort: {{ $serviceValues.targetPort }} selector: - app: prometheus + app.kubernetes.io/name: prometheus prometheus: {{ include "kube-prometheus-stack.fullname" $ }}-prometheus statefulset.kubernetes.io/pod-name: prometheus-{{ include "kube-prometheus-stack.fullname" $ }}-prometheus-{{ $i }} type: "{{ $serviceValues.type }}" diff --git a/charts/kube-prometheus-stack/values.yaml b/charts/kube-prometheus-stack/values.yaml index 5178836669..c4b34a1a5a 100644 --- a/charts/kube-prometheus-stack/values.yaml +++ b/charts/kube-prometheus-stack/values.yaml @@ -396,7 +396,7 @@ alertmanager: ## image: repository: quay.io/prometheus/alertmanager - tag: v0.21.0 + tag: v0.22.2 sha: "" ## If true then the user will be responsible to provide a secret with alertmanager configuration @@ -613,6 +613,14 @@ grafana: enabled: true namespaceOverride: "" + ## ForceDeployDatasources Create datasource configmap even if grafana deployment has been disabled + ## + forceDeployDatasources: false + + ## ForceDeployDashboard Create dashboard configmap even if grafana deployment has been disabled + ## + forceDeployDashboards: false + ## Deploy default dashboards. ## defaultDashboardsEnabled: true @@ -665,6 +673,10 @@ grafana: enabled: true defaultDatasourceEnabled: true + ## URL of prometheus datasource + ## + # url: http://prometheus-stack-prometheus:9090/ + # If not defined, will use prometheus.prometheusSpec.scrapeInterval or its default # defaultDatasourceScrapeInterval: 15s @@ -742,19 +754,6 @@ kubeApiServer: tlsConfig: serverName: kubernetes insecureSkipVerify: false - - ## If your API endpoint address is not reachable (as in AKS) you can replace it with the kubernetes service - ## - relabelings: [] - # - sourceLabels: - # - __meta_kubernetes_namespace - # - __meta_kubernetes_service_name - # - __meta_kubernetes_endpoint_port_name - # action: keep - # regex: default;kubernetes;https - # - targetLabel: __address__ - # replacement: kubernetes.default.svc:443 - serviceMonitor: ## Scrape interval. If not set, the Prometheus default scrape interval is used. ## @@ -775,6 +774,15 @@ kubeApiServer: # - action: keep # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' # sourceLabels: [__name__] + relabelings: [] + # - sourceLabels: + # - __meta_kubernetes_namespace + # - __meta_kubernetes_service_name + # - __meta_kubernetes_endpoint_port_name + # action: keep + # regex: default;kubernetes;https + # - targetLabel: __address__ + # replacement: kubernetes.default.svc:443 ## Component scraping the kubelet and kubelet-hosted cAdvisor ## @@ -1342,7 +1350,7 @@ prometheusOperator: enabled: true image: repository: jettech/kube-webhook-certgen - tag: v1.5.0 + tag: v1.5.2 sha: "" pullPolicy: IfNotPresent resources: {} @@ -1353,6 +1361,16 @@ prometheusOperator: nodeSelector: {} affinity: {} tolerations: [] + + ## SecurityContext holds pod-level security attributes and common container settings. + ## This defaults to non root user with uid 2000 and gid 2000. *v1.PodSecurityContext false + ## ref: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ + ## + securityContext: + runAsGroup: 2000 + runAsNonRoot: true + runAsUser: 2000 + # Use certmanager to generate webhook certs certManager: enabled: false @@ -1540,7 +1558,7 @@ prometheusOperator: ## image: repository: quay.io/prometheus-operator/prometheus-operator - tag: v0.47.0 + tag: v0.48.1 sha: "" pullPolicy: IfNotPresent @@ -1556,7 +1574,7 @@ prometheusOperator: ## prometheusConfigReloaderImage: repository: quay.io/prometheus-operator/prometheus-config-reloader - tag: v0.47.0 + tag: v0.48.1 sha: "" ## Set the prometheus config reloader side-car CPU limit @@ -1899,7 +1917,7 @@ prometheus: ## image: repository: quay.io/prometheus/prometheus - tag: v2.26.0 + tag: v2.27.1 sha: "" ## Tolerations for use with node taints diff --git a/salt/metalk8s/addons/prometheus-operator/config/prometheus.yaml b/salt/metalk8s/addons/prometheus-operator/config/prometheus.yaml index 7dcd6dca57..41506c5af1 100644 --- a/salt/metalk8s/addons/prometheus-operator/config/prometheus.yaml +++ b/salt/metalk8s/addons/prometheus-operator/config/prometheus.yaml @@ -39,10 +39,10 @@ spec: available: 3 node_network_receive_errors: warning: - errors: 10 # Number of receive errors for the last 2m + error_rate: 0.01 # Rate of receive errors for the last 2m node_network_transmit_errors: warning: - errors: 10 # Number of transmit errors for the last 2m + error_rate: 0.01 # Rate of transmit errors for the last 2m node_high_number_conntrack_entries_used: warning: threshold: 0.75 diff --git a/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls b/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls index 29ac18f01d..ed78f2b460 100644 --- a/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls +++ b/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls @@ -20422,8 +20422,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 7.5.5 - helm.sh/chart: grafana-6.8.3 + app.kubernetes.io/version: 8.0.1 + helm.sh/chart: grafana-6.13.0 heritage: metalk8s name: prometheus-operator-grafana namespace: metalk8s-monitoring @@ -20467,7 +20467,7 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/part-of: metalk8s - helm.sh/chart: kube-state-metrics-2.13.2 + helm.sh/chart: kube-state-metrics-3.1.1 heritage: metalk8s name: prometheus-operator-kube-state-metrics namespace: metalk8s-monitoring @@ -20502,7 +20502,7 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-node-exporter app.kubernetes.io/part-of: metalk8s - chart: prometheus-node-exporter-1.17.0 + chart: prometheus-node-exporter-1.18.1 heritage: metalk8s jobLabel: node-exporter release: prometheus-operator @@ -20549,8 +20549,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-alertmanager app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -20593,8 +20593,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -20637,8 +20637,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-prometheus app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -20680,8 +20680,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 7.5.5 - helm.sh/chart: grafana-6.8.3 + app.kubernetes.io/version: 8.0.1 + helm.sh/chart: grafana-6.13.0 heritage: metalk8s name: prometheus-operator-grafana namespace: metalk8s-monitoring @@ -20695,7 +20695,7 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/part-of: metalk8s - helm.sh/chart: kube-state-metrics-2.13.2 + helm.sh/chart: kube-state-metrics-3.1.1 heritage: metalk8s name: prometheus-operator-kube-state-metrics namespace: metalk8s-monitoring @@ -20710,7 +20710,7 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-node-exporter app.kubernetes.io/part-of: metalk8s - chart: prometheus-node-exporter-1.17.0 + chart: prometheus-node-exporter-1.18.1 heritage: metalk8s release: prometheus-operator name: prometheus-operator-prometheus-node-exporter @@ -20726,8 +20726,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-alertmanager app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -20744,8 +20744,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -20762,8 +20762,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-prometheus app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -20782,8 +20782,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 7.5.5 - helm.sh/chart: grafana-6.8.3 + app.kubernetes.io/version: 8.0.1 + helm.sh/chart: grafana-6.13.0 heritage: metalk8s name: prometheus-operator-grafana namespace: metalk8s-monitoring @@ -20811,8 +20811,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 7.5.5 - helm.sh/chart: grafana-6.8.3 + app.kubernetes.io/version: 8.0.1 + helm.sh/chart: grafana-6.13.0 heritage: metalk8s name: prometheus-operator-grafana-config-dashboards namespace: metalk8s-monitoring @@ -20840,7 +20840,7 @@ data: [log] mode = console [paths] - data = /var/lib/grafana/data + data = /var/lib/grafana/ logs = /var/log/grafana plugins = /var/lib/grafana/plugins provisioning = /etc/grafana/provisioning @@ -20853,8 +20853,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 7.5.5 - helm.sh/chart: grafana-6.8.3 + app.kubernetes.io/version: 8.0.1 + helm.sh/chart: grafana-6.13.0 heritage: metalk8s name: prometheus-operator-grafana namespace: metalk8s-monitoring @@ -20879,8 +20879,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_datasource: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -20890,7 +20890,7 @@ metadata: --- apiVersion: v1 data: - apiserver.json: |- + alertmanager-overview.json: |- { "__inputs": [ ], @@ -20902,119 +20902,17 @@ data: }, "editable": false, "gnetId": null, - "graphTooltip": 0, + "graphTooltip": 1, "hideControls": false, "id": null, "links": [ ], - "panels": [ - { - "content": "The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only.", - "datasource": null, - "description": "The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only.", - "gridPos": { - "h": 2, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 2, - "mode": "markdown", - "span": 12, - "title": "Notice", - "type": "text" - } - ], - "refresh": "10s", + "refresh": "30s", "rows": [ { "collapse": false, "collapsed": false, "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "decimals": 3, - "description": "How many percent of requests (both read and write) in 30 days have been answered successfully and fast enough?", - "format": "percentunit", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - }, - "id": 3, - "interval": null, - "links": [ - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "apiserver_request:availability30d{verb=\"all\", cluster=\"$cluster\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Availability (30d) > 99.000%", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, { "aliasColors": { }, @@ -21022,13 +20920,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "decimals": 3, - "description": "How much error budget is left looking at our 0.990% availability guarantees?", - "fill": 10, + "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 4, + "id": 2, "legend": { "alignAsTable": false, "avg": false, @@ -21036,7 +20932,7 @@ data: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -21054,15 +20950,15 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 8, - "stack": false, + "span": 6, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "100 * (apiserver_request:availability30d{verb=\"all\", cluster=\"$cluster\"} - 0.990000)", + "expr": "sum(alertmanager_alerts{namespace=\"$namespace\",service=\"$service\"}) by (namespace,service,instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "errorbudget", + "legendFormat": "{{instance}}", "refId": "A" } ], @@ -21070,9 +20966,9 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "ErrorBudget (30d) > 99.000%", + "title": "Alerts", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -21087,8 +20983,7 @@ data: }, "yaxes": [ { - "decimals": 3, - "format": "percentunit", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -21096,8 +20991,7 @@ data: "show": true }, { - "decimals": 3, - "format": "percentunit", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -21105,103 +20999,6 @@ data: "show": true } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "decimals": 3, - "description": "How many percent of read requests (LIST,GET) in 30 days have been answered successfully and fast enough?", - "format": "percentunit", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - }, - "id": 5, - "interval": null, - "links": [ - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "apiserver_request:availability30d{verb=\"read\", cluster=\"$cluster\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": "", - "title": "Read Availability (30d)", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" }, { "aliasColors": { @@ -21210,12 +21007,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "How many read requests (LIST,GET) per second do the apiservers get by code?", - "fill": 10, + "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 6, + "id": 3, "legend": { "alignAsTable": false, "avg": false, @@ -21223,7 +21019,7 @@ data: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -21239,43 +21035,34 @@ data: "renderer": "flot", "repeat": null, "seriesOverrides": [ - { - "alias": "/2../i", - "color": "#56A64B" - }, - { - "alias": "/3../i", - "color": "#F2CC0C" - }, - { - "alias": "/4../i", - "color": "#3274D9" - }, - { - "alias": "/5../i", - "color": "#E02F44" - } ], "spaceLength": 10, - "span": 3, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum by (code) (code_resource:apiserver_request_total:rate5m{verb=\"read\", cluster=\"$cluster\"})", + "expr": "sum(rate(alertmanager_alerts_received_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ code }}", + "legendFormat": "{{instance}} Received", "refId": "A" + }, + { + "expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=\"$namespace\",service=\"$service\"}[5m])) by (namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Invalid", + "refId": "B" } ], "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Read SLI - Requests", + "title": "Alerts receive rate", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -21290,7 +21077,7 @@ data: }, "yaxes": [ { - "format": "reqps", + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -21298,7 +21085,7 @@ data: "show": true }, { - "format": "reqps", + "format": "ops", "label": null, "logBase": 1, "max": null, @@ -21306,7 +21093,20 @@ data: "show": true } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Alerts", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ { "aliasColors": { }, @@ -21314,12 +21114,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?", "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 7, + "id": 4, "legend": { "alignAsTable": false, "avg": false, @@ -21327,7 +21126,7 @@ data: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -21341,29 +21140,35 @@ data: "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, + "repeat": "integration", "seriesOverrides": [ ], "spaceLength": 10, - "span": 3, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\",code=~\"5..\", cluster=\"$cluster\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\", cluster=\"$cluster\"})", + "expr": "sum(rate(alertmanager_notifications_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ resource }}", + "legendFormat": "{{instance}} Total", "refId": "A" + }, + { + "expr": "sum(rate(alertmanager_notifications_failed_total{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (integration,namespace,service,instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Failed", + "refId": "B" } ], "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Read SLI - Errors", + "title": "$integration: Notifications Send Rate", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -21378,19 +21183,19 @@ data: }, "yaxes": [ { - "format": "percentunit", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "percentunit", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -21402,12 +21207,11 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "How many seconds is the 99th percentile for reading (LIST|GET) a given resource?", "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 8, + "id": 5, "legend": { "alignAsTable": false, "avg": false, @@ -21415,7 +21219,7 @@ data: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -21429,29 +21233,42 @@ data: "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, + "repeat": "integration", "seriesOverrides": [ ], "spaceLength": 10, - "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb=\"read\", cluster=\"$cluster\"}", + "expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ resource }}", + "legendFormat": "{{instance}} 99th Percentile", "refId": "A" + }, + { + "expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (le,namespace,service,instance)\n) \n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Median", + "refId": "B" + }, + { + "expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=\"$namespace\",service=\"$service\", integration=\"$integration\"}[5m])) by (namespace,service,instance)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Average", + "refId": "C" } ], "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Read SLI - Duration", + "title": "$integration: Notification Duration", "tooltip": { - "shared": false, + "shared": true, "sort": 0, "value_type": "individual" }, @@ -21487,11 +21304,202 @@ data: "repeat": null, "repeatIteration": null, "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", + "showTitle": true, + "title": "Notifications", "titleSize": "h6", "type": "row" - }, + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "alertmanager-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + ], + "query": "label_values(alertmanager_alerts, namespace)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "service", + "options": [ + ], + "query": "label_values(alertmanager_alerts, service)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": true, + "label": null, + "multi": false, + "name": "integration", + "options": [ + ], + "query": "label_values(alertmanager_notifications_total{integration=~\".*\"}, integration)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Alertmanager / Overview", + "uid": "alertmanager-overview", + "version": 0 + } +kind: ConfigMap +metadata: + annotations: {} + labels: + app: prometheus-operator-grafana + app.kubernetes.io/instance: prometheus-operator + app.kubernetes.io/managed-by: salt + app.kubernetes.io/name: prometheus-operator-grafana + app.kubernetes.io/part-of: metalk8s + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 + grafana_dashboard: '1' + heritage: metalk8s + metalk8s.scality.com/monitor: '' + release: prometheus-operator + name: prometheus-operator-alertmanager-overview + namespace: metalk8s-monitoring +--- +apiVersion: v1 +data: + apiserver.json: |- + { + "__inputs": [ + ], + "__requires": [ + ], + "annotations": { + "list": [ + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + ], + "panels": [ + { + "content": "The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only.", + "datasource": null, + "description": "The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only.", + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "mode": "markdown", + "span": 12, + "title": "Notice", + "type": "text" + } + ], + "refresh": "10s", + "rows": [ { "collapse": false, "collapsed": false, @@ -21507,7 +21515,7 @@ data: ], "datasource": "$datasource", "decimals": 3, - "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) in 30 days have been answered successfully and fast enough?", + "description": "How many percent of requests (both read and write) in 30 days have been answered successfully and fast enough?", "format": "percentunit", "gauge": { "maxValue": 100, @@ -21518,7 +21526,195 @@ data: }, "gridPos": { }, - "id": 9, + "id": 3, + "interval": null, + "links": [ + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "apiserver_request:availability30d{verb=\"all\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Availability (30d) > 99.000%", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": 3, + "description": "How much error budget is left looking at our 0.990% availability guarantees?", + "fill": 10, + "fillGradient": 0, + "gridPos": { + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 8, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 * (apiserver_request:availability30d{verb=\"all\", cluster=\"$cluster\"} - 0.990000)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "errorbudget", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "ErrorBudget (30d) > 99.000%", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "decimals": 3, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": 3, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "decimals": 3, + "description": "How many percent of read requests (LIST,GET) in 30 days have been answered successfully and fast enough?", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + }, + "id": 5, "interval": null, "links": [ ], @@ -21557,7 +21753,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "apiserver_request:availability30d{verb=\"write\", cluster=\"$cluster\"}", + "expr": "apiserver_request:availability30d{verb=\"read\", cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -21565,7 +21761,7 @@ data: } ], "thresholds": "", - "title": "Write Availability (30d)", + "title": "Read Availability (30d)", "tooltip": { "shared": false }, @@ -21587,12 +21783,12 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "How many write requests (POST|PUT|PATCH|DELETE) per second do the apiservers get by code?", + "description": "How many read requests (LIST,GET) per second do the apiservers get by code?", "fill": 10, "fillGradient": 0, "gridPos": { }, - "id": 10, + "id": 6, "legend": { "alignAsTable": false, "avg": false, @@ -21639,7 +21835,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (code) (code_resource:apiserver_request_total:rate5m{verb=\"write\", cluster=\"$cluster\"})", + "expr": "sum by (code) (code_resource:apiserver_request_total:rate5m{verb=\"read\", cluster=\"$cluster\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ code }}", @@ -21650,7 +21846,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Write SLI - Requests", + "title": "Read SLI - Requests", "tooltip": { "shared": false, "sort": 0, @@ -21691,12 +21887,12 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?", + "description": "How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?", "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 11, + "id": 7, "legend": { "alignAsTable": false, "avg": false, @@ -21727,7 +21923,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\",code=~\"5..\", cluster=\"$cluster\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\", cluster=\"$cluster\"})", + "expr": "sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\",code=~\"5..\", cluster=\"$cluster\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"read\", cluster=\"$cluster\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ resource }}", @@ -21738,7 +21934,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Write SLI - Errors", + "title": "Read SLI - Errors", "tooltip": { "shared": false, "sort": 0, @@ -21779,12 +21975,12 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "description": "How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?", + "description": "How many seconds is the 99th percentile for reading (LIST|GET) a given resource?", "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 12, + "id": 8, "legend": { "alignAsTable": false, "avg": false, @@ -21815,7 +22011,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb=\"write\", cluster=\"$cluster\"}", + "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb=\"read\", cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ resource }}", @@ -21826,7 +22022,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Write SLI - Duration", + "title": "Read SLI - Duration", "tooltip": { "shared": false, "sort": 0, @@ -21873,6 +22069,90 @@ data: "collapse": false, "collapsed": false, "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "decimals": 3, + "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) in 30 days have been answered successfully and fast enough?", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + }, + "id": 9, + "interval": null, + "links": [ + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "apiserver_request:availability30d{verb=\"write\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Write Availability (30d)", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, { "aliasColors": { }, @@ -21880,11 +22160,12 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "description": "How many write requests (POST|PUT|PATCH|DELETE) per second do the apiservers get by code?", + "fill": 10, "fillGradient": 0, "gridPos": { }, - "id": 13, + "id": 10, "legend": { "alignAsTable": false, "avg": false, @@ -21892,7 +22173,7 @@ data: "max": false, "min": false, "rightSide": false, - "show": false, + "show": true, "sideWidth": null, "total": false, "values": false @@ -21908,17 +22189,33 @@ data: "renderer": "flot", "repeat": null, "seriesOverrides": [ + { + "alias": "/2../i", + "color": "#56A64B" + }, + { + "alias": "/3../i", + "color": "#F2CC0C" + }, + { + "alias": "/4../i", + "color": "#3274D9" + }, + { + "alias": "/5../i", + "color": "#E02F44" + } ], "spaceLength": 10, - "span": 6, - "stack": false, + "span": 3, + "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_adds_total{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)", + "expr": "sum by (code) (code_resource:apiserver_request_total:rate5m{verb=\"write\", cluster=\"$cluster\"})", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{ code }}", "refId": "A" } ], @@ -21926,7 +22223,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Work Queue Add Rate", + "title": "Write SLI - Requests", "tooltip": { "shared": false, "sort": 0, @@ -21943,19 +22240,19 @@ data: }, "yaxes": [ { - "format": "ops", + "format": "reqps", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true }, { - "format": "ops", + "format": "reqps", "label": null, "logBase": 1, "max": null, - "min": 0, + "min": null, "show": true } ] @@ -21967,11 +22264,12 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?", "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 14, + "id": 11, "legend": { "alignAsTable": false, "avg": false, @@ -21979,7 +22277,7 @@ data: "max": false, "min": false, "rightSide": false, - "show": false, + "show": true, "sideWidth": null, "total": false, "values": false @@ -21997,15 +22295,15 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 6, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_depth{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)", + "expr": "sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\",code=~\"5..\", cluster=\"$cluster\"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb=\"write\", cluster=\"$cluster\"})", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{ resource }}", "refId": "A" } ], @@ -22013,7 +22311,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Work Queue Depth", + "title": "Write SLI - Errors", "tooltip": { "shared": false, "sort": 0, @@ -22030,7 +22328,7 @@ data: }, "yaxes": [ { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -22038,7 +22336,7 @@ data: "show": true }, { - "format": "short", + "format": "percentunit", "label": null, "logBase": 1, "max": null, @@ -22054,22 +22352,23 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "description": "How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?", "fill": 1, "fillGradient": 0, "gridPos": { }, - "id": 15, + "id": 12, "legend": { - "alignAsTable": true, + "alignAsTable": false, "avg": false, - "current": true, + "current": false, "max": false, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "sideWidth": null, "total": false, - "values": true + "values": false }, "lines": true, "linewidth": 1, @@ -22084,15 +22383,15 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name, le))", + "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb=\"write\", cluster=\"$cluster\"}", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{ resource }}", "refId": "A" } ], @@ -22100,7 +22399,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Work Queue Latency", + "title": "Write SLI - Duration", "tooltip": { "shared": false, "sort": 0, @@ -22158,7 +22457,7 @@ data: "fillGradient": 0, "gridPos": { }, - "id": 16, + "id": 13, "legend": { "alignAsTable": false, "avg": false, @@ -22166,7 +22465,7 @@ data: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -22184,15 +22483,15 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, + "span": 6, "stack": false, "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}", + "expr": "sum(rate(workqueue_adds_total{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", + "legendFormat": "{{instance}} {{name}}", "refId": "A" } ], @@ -22200,7 +22499,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Memory", + "title": "Work Queue Add Rate", "tooltip": { "shared": false, "sort": 0, @@ -22217,19 +22516,19 @@ data: }, "yaxes": [ { - "format": "bytes", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true }, { - "format": "bytes", + "format": "ops", "label": null, "logBase": 1, "max": null, - "min": null, + "min": 0, "show": true } ] @@ -22245,7 +22544,7 @@ data: "fillGradient": 0, "gridPos": { }, - "id": 17, + "id": 14, "legend": { "alignAsTable": false, "avg": false, @@ -22253,7 +22552,7 @@ data: "max": false, "min": false, "rightSide": false, - "show": true, + "show": false, "sideWidth": null, "total": false, "values": false @@ -22271,15 +22570,15 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, + "span": 6, "stack": false, "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])", + "expr": "sum(rate(workqueue_depth{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", + "legendFormat": "{{instance}} {{name}}", "refId": "A" } ], @@ -22287,7 +22586,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "CPU usage", + "title": "Work Queue Depth", "tooltip": { "shared": false, "sort": 0, @@ -22332,18 +22631,18 @@ data: "fillGradient": 0, "gridPos": { }, - "id": 18, + "id": 15, "legend": { - "alignAsTable": false, + "alignAsTable": true, "avg": false, - "current": false, + "current": true, "max": false, "min": false, - "rightSide": false, + "rightSide": true, "show": true, "sideWidth": null, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -22358,15 +22657,15 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, + "span": 12, "stack": false, "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}", + "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"apiserver\", instance=~\"$instance\", cluster=\"$cluster\"}[5m])) by (instance, name, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", + "legendFormat": "{{instance}} {{name}}", "refId": "A" } ], @@ -22374,7 +22673,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Goroutines", + "title": "Work Queue Latency", "tooltip": { "shared": false, "sort": 0, @@ -22391,7 +22690,7 @@ data: }, "yaxes": [ { - "format": "short", + "format": "s", "label": null, "logBase": 1, "max": null, @@ -22399,7 +22698,7 @@ data: "show": true }, { - "format": "short", + "format": "s", "label": null, "logBase": 1, "max": null, @@ -22416,83 +22715,357 @@ data: "title": "Dashboard Row", "titleSize": "h6", "type": "row" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "default", - "value": "default" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - ], - "query": "label_values(apiserver_request_total, cluster)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "instance", - "options": [ - ], - "query": "label_values(apiserver_request_total{job=\"apiserver\", cluster=\"$cluster\"}, instance)", - "refresh": 2, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + }, + "id": 16, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + }, + "id": 17, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + }, + "id": 18, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{job=\"apiserver\",instance=~\"$instance\", cluster=\"$cluster\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + ], + "query": "label_values(apiserver_request_total, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "instance", + "options": [ + ], + "query": "label_values(apiserver_request_total{job=\"apiserver\", cluster=\"$cluster\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { "refresh_intervals": [ "5s", "10s", @@ -22531,8 +23104,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -24269,7 +24842,7 @@ data: "options": [ ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 0, "tagValuesQuery": "", @@ -24324,8 +24897,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -24417,7 +24990,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(up{job=\"kube-controller-manager\"})", + "expr": "sum(up{cluster=\"$cluster\", job=\"kube-controller-manager\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -24482,10 +25055,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_adds_total{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)", + "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{cluster}} {{instance}} {{name}}", "refId": "A" } ], @@ -24582,10 +25155,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(workqueue_depth{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)", + "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{cluster}} {{instance}} {{name}}", "refId": "A" } ], @@ -24682,10 +25255,10 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name, le))", + "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} {{name}}", + "legendFormat": "{{cluster}} {{instance}} {{name}}", "refId": "A" } ], @@ -24890,7 +25463,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -24990,7 +25563,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -25090,7 +25663,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"kube-controller-manager\",instance=~\"$instance\"}", + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -25177,7 +25750,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"kube-controller-manager\",instance=~\"$instance\"}[5m])", + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -25264,7 +25837,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"kube-controller-manager\",instance=~\"$instance\"}", + "expr": "go_goroutines{cluster=\"$cluster\", job=\"kube-controller-manager\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -25341,6 +25914,29 @@ data: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { @@ -25353,7 +25949,7 @@ data: "name": "instance", "options": [ ], - "query": "label_values(process_cpu_seconds_total{job=\"kube-controller-manager\"}, instance)", + "query": "label_values(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-controller-manager\"}, instance)", "refresh": 2, "regex": "", "sort": 1, @@ -25409,8 +26005,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -25429,6 +26025,7 @@ data: "editable": true, "gnetId": null, "hideControls": false, + "id": 6, "links": [], "refresh": "10s", "rows": [ @@ -26511,7 +27108,6 @@ data: }, "timezone": "browser", "title": "etcd", - "uid": "c2f4e12cdf69feb95caa41a5a1b423d9", "version": 215 } kind: ConfigMap @@ -26523,8 +27119,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -28056,8 +28652,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -28202,7 +28798,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_cpu_cores{cluster=\"$cluster\"})", + "expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -28282,7 +28878,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_cpu_cores{cluster=\"$cluster\"})", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) / sum(kube_node_status_allocatable{resource=\"cpu\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -28362,7 +28958,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "1 - sum(:node_memory_MemAvailable_bytes:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_memory_bytes{cluster=\"$cluster\"})", + "expr": "1 - sum(:node_memory_MemAvailable_bytes:sum{cluster=\"$cluster\"}) / sum(node_memory_MemTotal_bytes{cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -28442,7 +29038,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_memory_bytes{cluster=\"$cluster\"})", + "expr": "sum(namespace_memory:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -28522,7 +29118,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) / sum(kube_node_status_allocatable_memory_bytes{cluster=\"$cluster\"})", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) / sum(kube_node_status_allocatable{resource=\"memory\",cluster=\"$cluster\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -28613,7 +29209,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -28881,7 +29477,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -28890,7 +29486,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(namespace_cpu:kube_pod_container_resource_requests:sum{cluster=\"$cluster\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -28899,7 +29495,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -28908,7 +29504,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -28917,7 +29513,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"cpu\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -29289,7 +29885,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -29298,7 +29894,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -29307,7 +29903,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -29316,7 +29912,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\"}) by (namespace)", + "expr": "sum(container_memory_rss{cluster=\"$cluster\", container!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", resource=\"memory\"}) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -29650,7 +30246,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -29687,7 +30283,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -29737,13 +30333,94 @@ data: "show": false } ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Transmit Bandwidth", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -29758,7 +30435,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 13, + "id": 14, "legend": { "avg": false, "current": false, @@ -29780,12 +30457,12 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", + "expr": "avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -29797,7 +30474,88 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Transmit Bandwidth", + "title": "Average Container Bandwidth by Namespace: Received", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Average Container Bandwidth by Namespace: Transmitted", "tooltip": { "shared": false, "sort": 0, @@ -29836,7 +30594,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Average Container Bandwidth by Namespace", "titleSize": "h6" }, { @@ -29851,7 +30609,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 14, + "id": 16, "legend": { "avg": false, "current": false, @@ -29873,12 +30631,12 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -29890,7 +30648,88 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Average Container Bandwidth by Namespace: Received", + "title": "Rate of Received Packets", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of Transmitted Packets", "tooltip": { "shared": false, "sort": 0, @@ -29929,7 +30768,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -29944,7 +30783,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 15, + "id": 18, "legend": { "avg": false, "current": false, @@ -29966,12 +30805,12 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -29983,7 +30822,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Average Container Bandwidth by Namespace: Transmitted", + "title": "Rate of Received Packets Dropped", "tooltip": { "shared": false, "sort": 0, @@ -30016,19 +30855,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { }, @@ -30037,7 +30864,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 16, + "id": 19, "legend": { "avg": false, "current": false, @@ -30059,12 +30886,12 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -30076,7 +30903,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Received Packets", + "title": "Rate of Transmitted Packets Dropped", "tooltip": { "shared": false, "sort": 0, @@ -30115,7 +30942,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" }, { @@ -30129,8 +30956,9 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "decimals": -1, "fill": 10, - "id": 17, + "id": 20, "legend": { "avg": false, "current": false, @@ -30152,12 +30980,12 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", + "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -30169,7 +30997,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets", + "title": "IOPS(Reads+Writes)", "tooltip": { "shared": false, "sort": 0, @@ -30186,7 +31014,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -30202,19 +31030,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { }, @@ -30223,7 +31039,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 18, + "id": 21, "legend": { "avg": false, "current": false, @@ -30245,12 +31061,12 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -30262,7 +31078,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Received Packets Dropped", + "title": "ThroughPut(Read+Write)", "tooltip": { "shared": false, "sort": 0, @@ -30301,7 +31117,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO", "titleSize": "h6" }, { @@ -30315,8 +31131,8 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 19, + "fill": 1, + "id": 22, "legend": { "avg": false, "current": false, @@ -30327,7 +31143,7 @@ data: "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], "nullPointMode": "null as zero", @@ -30337,17 +31153,207 @@ data: "renderer": "flot", "seriesOverrides": [ ], + "sort": { + "col": 4, + "desc": true + }, "spaceLength": 10, "span": 12, - "stack": true, + "stack": false, "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "IOPS(Reads)", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Writes)", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Reads + Writes)", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Throughput(Read)", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Write)", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Read + Write)", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Namespace", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTargetBlank": false, + "linkTooltip": "Drill down to pods", + "linkUrl": "./d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell", + "pattern": "namespace", + "thresholds": [ + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + ], + "type": "string", + "unit": "short" + } + ], "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", - "format": "time_series", + "expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, "intervalFactor": 2, - "legendFormat": "{{namespace}}", - "legendLink": null, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", "step": 10 } ], @@ -30355,13 +31361,14 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets Dropped", + "title": "Current Storage IO", "tooltip": { "shared": false, "sort": 0, "value_type": "individual" }, - "type": "graph", + "transform": "table", + "type": "table", "xaxis": { "buckets": null, "mode": "time", @@ -30372,7 +31379,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -30394,7 +31401,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO - Distribution", "titleSize": "h6" } ], @@ -30490,8 +31497,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -30555,7 +31562,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -30635,7 +31642,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"})", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -30715,7 +31722,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"})", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -30725,7 +31732,7 @@ data: "thresholds": "70,80", "timeFrom": null, "timeShift": null, - "title": "Memory Utilization (from requests)", + "title": "Memory Utilisation (from requests)", "tooltip": { "shared": false, "sort": 0, @@ -30795,7 +31802,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"})", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"})", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -30884,8 +31891,9 @@ data: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -30894,8 +31902,9 @@ data: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -30906,7 +31915,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -31138,7 +32147,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -31147,7 +32156,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -31156,7 +32165,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -31165,7 +32174,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -31174,7 +32183,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -31267,8 +32276,9 @@ data: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -31277,8 +32287,9 @@ data: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -31581,7 +32592,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -31590,7 +32601,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -31599,7 +32610,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -31608,7 +32619,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\", image!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -31969,7 +32980,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -32006,7 +33017,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -32056,19 +33067,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { }, @@ -32099,7 +33098,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -32155,7 +33154,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -32192,7 +33191,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -32242,19 +33241,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { }, @@ -32285,7 +33272,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -32341,7 +33328,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -32378,7 +33365,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -32428,19 +33415,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { }, @@ -32471,7 +33446,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -32527,152 +33502,9 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [ - "kubernetes-mixin" - ], - "templating": { - "list": [ - { - "current": { - "text": "default", - "value": "default" - }, - "hide": 0, - "label": null, - "name": "datasource", - "options": [ - ], - "query": "prometheus", - "refresh": 1, - "regex": "", - "type": "datasource" - }, - { - "allValue": null, - "current": { - "text": "", - "value": "" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": null, - "multi": false, - "name": "cluster", - "options": [ - ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "", - "value": "" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "namespace", - "options": [ - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "UTC", - "title": "Kubernetes / Compute Resources / Namespace (Pods)", - "uid": "85a562078cdf77779eaa1add43ccec1e", - "version": 0 - } -kind: ConfigMap -metadata: - annotations: {} - labels: - app: prometheus-operator-grafana - app.kubernetes.io/instance: prometheus-operator - app.kubernetes.io/managed-by: salt - app.kubernetes.io/name: prometheus-operator-grafana - app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 - grafana_dashboard: '1' - heritage: metalk8s - metalk8s.scality.com/monitor: '' - release: prometheus-operator - name: prometheus-operator-k8s-resources-namespace - namespace: metalk8s-monitoring ---- -apiVersion: v1 -data: - k8s-resources-node.json: |- - { - "annotations": { - "list": [ - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "links": [ - ], - "refresh": "10s", - "rows": [ + }, { "collapse": false, "height": "250px", @@ -32684,8 +33516,9 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "decimals": -1, "fill": 10, - "id": 1, + "id": 16, "legend": { "avg": false, "current": false, @@ -32707,12 +33540,12 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -32724,7 +33557,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "CPU Usage", + "title": "IOPS(Reads+Writes)", "tooltip": { "shared": false, "sort": 0, @@ -32757,13 +33590,94 @@ data: "show": false } ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut(Read+Write)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "CPU Usage", + "title": "Storage IO", "titleSize": "h6" }, { @@ -32778,7 +33692,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 2, + "id": 18, "legend": { "avg": false, "current": false, @@ -32799,6 +33713,10 @@ data: "renderer": "flot", "seriesOverrides": [ ], + "sort": { + "col": 4, + "desc": true + }, "spaceLength": 10, "span": 12, "stack": false, @@ -32811,12 +33729,12 @@ data: "type": "hidden" }, { - "alias": "CPU Usage", + "alias": "IOPS(Reads)", "colorMode": null, "colors": [ ], "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, + "decimals": -1, "link": false, "linkTargetBlank": false, "linkTooltip": "Drill down", @@ -32828,12 +33746,12 @@ data: "unit": "short" }, { - "alias": "CPU Requests", + "alias": "IOPS(Writes)", "colorMode": null, "colors": [ ], "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, + "decimals": -1, "link": false, "linkTargetBlank": false, "linkTooltip": "Drill down", @@ -32845,12 +33763,12 @@ data: "unit": "short" }, { - "alias": "CPU Requests %", + "alias": "IOPS(Reads + Writes)", "colorMode": null, "colors": [ ], "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, + "decimals": -1, "link": false, "linkTargetBlank": false, "linkTooltip": "Drill down", @@ -32859,10 +33777,10 @@ data: "thresholds": [ ], "type": "number", - "unit": "percentunit" + "unit": "short" }, { - "alias": "CPU Limits", + "alias": "Throughput(Read)", "colorMode": null, "colors": [ ], @@ -32876,10 +33794,10 @@ data: "thresholds": [ ], "type": "number", - "unit": "short" + "unit": "Bps" }, { - "alias": "CPU Limits %", + "alias": "Throughput(Write)", "colorMode": null, "colors": [ ], @@ -32893,10 +33811,10 @@ data: "thresholds": [ ], "type": "number", - "unit": "percentunit" + "unit": "Bps" }, { - "alias": "Pod", + "alias": "Throughput(Read + Write)", "colorMode": null, "colors": [ ], @@ -32906,6 +33824,23 @@ data: "linkTargetBlank": false, "linkTooltip": "Drill down", "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTargetBlank": false, + "linkTooltip": "Drill down to pods", + "linkUrl": "./d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$namespace&var-pod=$__cell", "pattern": "pod", "thresholds": [ ], @@ -32928,7 +33863,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", "format": "table", "instant": true, "intervalFactor": 2, @@ -32937,7 +33872,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", "format": "table", "instant": true, "intervalFactor": 2, @@ -32946,7 +33881,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", "format": "table", "instant": true, "intervalFactor": 2, @@ -32955,7 +33890,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", "format": "table", "instant": true, "intervalFactor": 2, @@ -32964,7 +33899,506 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", + "step": 10 + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Storage IO", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage IO - Distribution", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "UTC", + "title": "Kubernetes / Compute Resources / Namespace (Pods)", + "uid": "85a562078cdf77779eaa1add43ccec1e", + "version": 0 + } +kind: ConfigMap +metadata: + annotations: {} + labels: + app: prometheus-operator-grafana + app.kubernetes.io/instance: prometheus-operator + app.kubernetes.io/managed-by: salt + app.kubernetes.io/name: prometheus-operator-grafana + app.kubernetes.io/part-of: metalk8s + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 + grafana_dashboard: '1' + heritage: metalk8s + metalk8s.scality.com/monitor: '' + release: prometheus-operator + name: prometheus-operator-k8s-resources-namespace + namespace: metalk8s-monitoring +--- +apiVersion: v1 +data: + k8s-resources-node.json: |- + { + "annotations": { + "list": [ + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "CPU Usage", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests %", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "CPU Limits", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Limits %", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "pod", + "thresholds": [ + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", node=~\"$node\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"cpu\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -33335,7 +34769,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -33344,7 +34778,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -33353,7 +34787,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", node=~\"$node\"}) by (pod)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -33362,7 +34796,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{node=~\"$node\"}) by (pod)", + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{cluster=\"$cluster\", node=~\"$node\",container!=\"\"}) by (pod) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", node=~\"$node\", resource=\"memory\"}) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -33483,7 +34917,7 @@ data: "options": [ ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -33508,7 +34942,7 @@ data: "options": [ ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, node)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -33563,8 +34997,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -33645,7 +35079,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", cluster=\"$cluster\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container}}", @@ -33653,7 +35087,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "requests", @@ -33661,7 +35095,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "limits", @@ -33754,7 +35188,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container)", + "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"\", cluster=\"$cluster\"}[5m])) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container}}", @@ -33978,7 +35412,7 @@ data: ], "targets": [ { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -33987,7 +35421,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -33996,7 +35430,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -34005,7 +35439,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -34014,7 +35448,7 @@ data: "step": 10 }, { - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"cpu\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -34108,7 +35542,7 @@ data: "dashes": true, "fill": 0, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -34118,7 +35552,7 @@ data: "dashes": true, "fill": 0, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -34129,7 +35563,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", image!=\"\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{container}}", @@ -34137,7 +35571,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "requests", @@ -34145,7 +35579,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "limits", @@ -34157,7 +35591,7 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Memory Usage", + "title": "Memory Usage (WSS)", "tooltip": { "shared": false, "sort": 0, @@ -34244,7 +35678,7 @@ data: "type": "hidden" }, { - "alias": "Memory Usage", + "alias": "Memory Usage (WSS)", "colorMode": null, "colors": [ ], @@ -34412,7 +35846,7 @@ data: ], "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\", image!=\"\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -34421,7 +35855,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -34430,7 +35864,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -34439,7 +35873,7 @@ data: "step": 10 }, { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)", + "expr": "sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -34448,7 +35882,7 @@ data: "step": 10 }, { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"\", image!=\"\"}) by (container) / sum(kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", resource=\"memory\"}) by (container)", "format": "table", "instant": true, "intervalFactor": 2, @@ -34566,12 +36000,12 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -34616,13 +36050,95 @@ data: "show": false } ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 7, + "interval": "1m", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Transmit Bandwidth", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -34637,7 +36153,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 7, + "id": 8, "interval": "1m", "legend": { "avg": false, @@ -34660,12 +36176,12 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -34677,7 +36193,89 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Transmit Bandwidth", + "title": "Rate of Received Packets", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 9, + "interval": "1m", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of Transmitted Packets", "tooltip": { "shared": false, "sort": 0, @@ -34716,7 +36314,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -34731,7 +36329,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 10, "interval": "1m", "legend": { "avg": false, @@ -34754,12 +36352,12 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -34771,7 +36369,89 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Received Packets", + "title": "Rate of Received Packets Dropped", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 11, + "interval": "1m", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of Transmitted Packets Dropped", "tooltip": { "shared": false, "sort": 0, @@ -34810,7 +36490,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" }, { @@ -34824,9 +36504,9 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "decimals": -1, "fill": 10, - "id": 9, - "interval": "1m", + "id": 12, "legend": { "avg": false, "current": false, @@ -34848,15 +36528,23 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m])))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{pod}}", + "legendFormat": "Reads", + "legendLink": null, + "step": 10 + }, + { + "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Writes", "legendLink": null, "step": 10 } @@ -34865,7 +36553,96 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets", + "title": "IOPS", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Reads", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Writes", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut", "tooltip": { "shared": false, "sort": 0, @@ -34904,7 +36681,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO - Distribution(Pod - Read & Writes)", "titleSize": "h6" }, { @@ -34918,9 +36695,9 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", + "decimals": -1, "fill": 10, - "id": 10, - "interval": "1m", + "id": 14, "legend": { "avg": false, "current": false, @@ -34942,15 +36719,15 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "expr": "ceil(sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m])))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{pod}}", + "legendFormat": "{{container}}", "legendLink": null, "step": 10 } @@ -34959,7 +36736,88 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Received Packets Dropped", + "title": "IOPS(Reads+Writes)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "ThroughPut(Read+Write)", "tooltip": { "shared": false, "sort": 0, @@ -34998,7 +36856,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO - Distribution(Containers)", "titleSize": "h6" }, { @@ -35012,9 +36870,8 @@ data: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, - "id": 11, - "interval": "1m", + "fill": 1, + "id": 16, "legend": { "avg": false, "current": false, @@ -35025,7 +36882,7 @@ data: "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], "nullPointMode": "null as zero", @@ -35035,17 +36892,207 @@ data: "renderer": "flot", "seriesOverrides": [ ], + "sort": { + "col": 4, + "desc": true + }, "spaceLength": 10, "span": 12, - "stack": true, + "stack": false, "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "IOPS(Reads)", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Writes)", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + ], + "type": "number", + "unit": "short" + }, + { + "alias": "IOPS(Reads + Writes)", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": -1, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + ], + "type": "number", + "unit": "short" + }, + { + "alias": "Throughput(Read)", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Write)", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Throughput(Read + Write)", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #F", + "thresholds": [ + ], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Container", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTargetBlank": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "container", + "thresholds": [ + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + ], + "type": "string", + "unit": "short" + } + ], "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", - "format": "time_series", + "expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, "intervalFactor": 2, - "legendFormat": "{{pod}}", - "legendLink": null, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_reads_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + }, + { + "expr": "sum by(container) (rate(container_fs_reads_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]) + rate(container_fs_writes_bytes_total{container!=\"\", cluster=\"$cluster\",namespace=~\"$namespace\", pod=\"$pod\"}[5m]))", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "F", "step": 10 } ], @@ -35053,13 +37100,14 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Rate of Transmitted Packets Dropped", + "title": "Current Storage IO", "tooltip": { "shared": false, "sort": 0, "value_type": "individual" }, - "type": "graph", + "transform": "table", + "type": "table", "xaxis": { "buckets": null, "mode": "time", @@ -35070,7 +37118,7 @@ data: }, "yaxes": [ { - "format": "Bps", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -35092,7 +37140,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Storage IO - Distribution", "titleSize": "h6" } ], @@ -35133,7 +37181,7 @@ data: "options": [ ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -35158,7 +37206,7 @@ data: "options": [ ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -35238,8 +37286,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -35302,7 +37350,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -35518,7 +37566,7 @@ data: ], "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -35527,7 +37575,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -35536,7 +37584,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -35545,7 +37593,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -35554,7 +37602,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -35874,7 +37922,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -35883,7 +37931,7 @@ data: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -35892,7 +37940,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -35901,7 +37949,7 @@ data: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\", workload_type=\"$type\"}\n) by (pod)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -36235,7 +38283,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -36272,7 +38320,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -36322,19 +38370,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { }, @@ -36365,7 +38401,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -36421,7 +38457,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -36458,7 +38494,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -36508,19 +38544,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { }, @@ -36551,7 +38575,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -36607,7 +38631,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Average Container Bandwidth by Pod", "titleSize": "h6" }, { @@ -36644,7 +38668,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -36694,19 +38718,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { }, @@ -36737,7 +38749,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -36793,7 +38805,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -36830,7 +38842,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -36880,19 +38892,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { }, @@ -36923,7 +38923,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -36979,7 +38979,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" } ], @@ -37020,7 +39020,7 @@ data: "options": [ ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -37045,7 +39045,7 @@ data: "options": [ ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -37070,7 +39070,7 @@ data: "options": [ ], "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\"}, workload)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -37095,7 +39095,7 @@ data: "options": [ ], "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\"}, workload_type)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -37150,8 +39150,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -37212,8 +39212,9 @@ data: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -37222,8 +39223,9 @@ data: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -37234,7 +39236,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}} - {{workload_type}}", @@ -37509,7 +39511,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -37518,7 +39520,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -37527,7 +39529,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -37536,7 +39538,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -37545,7 +39547,7 @@ data: "step": 10 }, { - "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"cpu\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -37638,8 +39640,9 @@ data: "color": "#F2495C", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false }, @@ -37648,8 +39651,9 @@ data: "color": "#FF9830", "dashes": true, "fill": 0, + "hiddenSeries": true, "hideTooltip": true, - "legend": false, + "legend": true, "linewidth": 2, "stack": false } @@ -37944,7 +39948,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -37953,7 +39957,7 @@ data: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_requests{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -37962,7 +39966,7 @@ data: "step": 10 }, { - "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -37971,7 +39975,7 @@ data: "step": 10 }, { - "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", + "expr": "sum(\n container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", container!=\"\", image!=\"\"}\n * on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n/sum(\n kube_pod_container_resource_limits{cluster=\"$cluster\", namespace=\"$namespace\", resource=\"memory\"}\n* on(namespace,pod)\n group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=\"$namespace\", workload_type=\"$type\"}\n) by (workload, workload_type)\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -38322,7 +40326,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Current Network Usage", "titleSize": "h6" }, { @@ -38359,7 +40363,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -38409,13 +40413,94 @@ data: "show": false } ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{workload}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Transmit Bandwidth", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Bandwidth", "titleSize": "h6" }, { @@ -38430,7 +40515,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -38452,12 +40537,12 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -38469,7 +40554,88 @@ data: ], "timeFrom": null, "timeShift": null, - "title": "Transmit Bandwidth", + "title": "Average Container Bandwidth by Workload: Received", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{workload}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Average Container Bandwidth by Workload: Transmitted", "tooltip": { "shared": false, "sort": 0, @@ -38508,7 +40674,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Average Container Bandwidth by Workload", "titleSize": "h6" }, { @@ -38523,7 +40689,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 10, "legend": { "avg": false, "current": false, @@ -38545,193 +40711,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{workload}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Average Container Bandwidth by Workload: Received", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 12, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{workload}}", - "legendLink": null, - "step": 10 - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Average Container Bandwidth by Workload: Transmitted", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 10, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [ - ], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -38781,19 +40761,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { }, @@ -38824,7 +40792,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -38880,7 +40848,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets", "titleSize": "h6" }, { @@ -38917,7 +40885,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -38967,19 +40935,7 @@ data: "show": false } ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Network", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ + }, { "aliasColors": { }, @@ -39010,7 +40966,7 @@ data: "seriesOverrides": [ ], "spaceLength": 10, - "span": 12, + "span": 6, "stack": true, "steppedLine": false, "targets": [ @@ -39066,7 +41022,7 @@ data: "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Network", + "title": "Rate of Packets Dropped", "titleSize": "h6" } ], @@ -39094,27 +41050,22 @@ data: }, { "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", "current": { - "text": "deployment", - "value": "deployment" + "text": "", + "value": "" }, "datasource": "$datasource", - "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", - "hide": 0, + "hide": 2, "includeAll": false, "label": null, "multi": false, - "name": "type", + "name": "cluster", "options": [ ], - "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", - "refresh": 1, + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, "regex": "", - "skipUrlSync": false, - "sort": 0, + "sort": 1, "tagValuesQuery": "", "tags": [ ], @@ -39124,22 +41075,27 @@ data: }, { "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", "current": { - "text": "", - "value": "" + "text": "deployment", + "value": "deployment" }, "datasource": "$datasource", - "hide": 2, + "definition": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "hide": 0, "includeAll": false, "label": null, "multi": false, - "name": "cluster", + "name": "type", "options": [ ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", + "refresh": 2, "regex": "", - "sort": 1, + "skipUrlSync": false, + "sort": 0, "tagValuesQuery": "", "tags": [ ], @@ -39162,7 +41118,7 @@ data: "options": [ ], "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 1, "tagValuesQuery": "", @@ -39217,8 +41173,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -39245,2265 +41201,1985 @@ data: "id": null, "links": [ ], - "refresh": "10s", - "rows": [ + "panels": [ { - "collapse": false, - "collapsed": false, - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - }, - "id": 2, - "interval": null, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(up{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } + "mappings": [ ], - "thresholds": "", - "title": "Up", - "tooltip": { - "shared": false + "thresholds": { + "mode": "absolute", + "steps": [ + ] }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 2, + "links": [ + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "valueName": "min" - }, + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - }, - "id": 3, - "interval": null, + "expr": "sum(kubelet_node_name{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Running Kubelets", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } + "mappings": [ ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false + "thresholds": { + "mode": "absolute", + "steps": [ + ] }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 3, + "links": [ + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "thresholds": "", - "title": "Running Pods", - "tooltip": { - "shared": false + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ + { + "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Running Pods", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + ], + "mappings": [ + ], + "thresholds": { + "mode": "absolute", + "steps": [ + ] }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 4, + "links": [ + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "valueName": "min" - }, + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" + "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Running Container", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true + "mappings": [ + ], + "thresholds": { + "mode": "absolute", + "steps": [ + ] }, - "gridPos": { - }, - "id": 4, - "interval": null, - "links": [ - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Running Container", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 12, + "y": 0 + }, + "id": 5, + "links": [ + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "valueName": "min" - }, + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - }, - "id": 5, - "interval": null, + "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Actual Volume Count", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } + "mappings": [ ], - "thresholds": "", - "title": "Actual Volume Count", - "tooltip": { - "shared": false + "thresholds": { + "mode": "absolute", + "steps": [ + ] }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 16, + "y": 0 + }, + "id": 6, + "links": [ + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "valueName": "min" - }, + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - }, - "id": 6, - "interval": null, + "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Desired Volume Count", + "transparent": false, + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { "links": [ ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } + "mappings": [ ], - "thresholds": "", - "title": "Desired Volume Count", - "tooltip": { - "shared": false + "thresholds": { + "mode": "absolute", + "steps": [ + ] }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 7, + "links": [ + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" ], - "valueName": "min" - }, + "fields": "", + "values": false + } + }, + "pluginVersion": "7", + "targets": [ { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "#299c46", - "rgba(237, 129, 40, 0.89)", - "#d44a3a" - ], - "datasource": "$datasource", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - }, - "id": 7, - "interval": null, - "links": [ - ], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": "", - "title": "Config Error Count", - "tooltip": { - "shared": false - }, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "min" + "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "title": "Config Error Count", + "transparent": false, + "type": "stat" }, { - "collapse": false, - "collapsed": false, - "panels": [ + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Operation Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, + "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_type}}", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 9, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Operation Error Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_type}}", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_type}}", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Operation duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} pod", + "refId": "A" + }, + { + "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} worker", + "refId": "B" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Pod Start Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} pod", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} worker", + "refId": "B" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Pod Start Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Error Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Operation duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", + "refId": "A" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Storage Operation Duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 11, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} pod", - "refId": "A" - }, - { - "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} worker", - "refId": "B" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Pod Start Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 12, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} pod", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} worker", - "refId": "B" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Pod Start Duration", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 13, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{operation_type}}", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Cgroup manager operation rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 14, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Error Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{operation_type}}", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Cgroup manager 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Pod lifecycle event generator", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 49 + }, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 15, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_name}} {{volume_plugin}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Storage Operation Duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 49 + }, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 16, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Cgroup manager operation rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist interval", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 17, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{operation_type}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Cgroup manager 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 56 + }, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "PLEG relist duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 63 + }, + "id": 21, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "description": "Pod lifecycle event generator", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 18, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "2xx", + "refId": "A" }, { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 19, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist interval", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "3xx", + "refId": "B" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "4xx", + "refId": "C" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5xx", + "refId": "D" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": false, - "collapsed": false, - "panels": [ + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "RPC Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 20, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "PLEG relist duration", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 70 + }, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 21, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "2xx", - "refId": "A" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "3xx", - "refId": "B" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "4xx", - "refId": "C" - }, - { - "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "5xx", - "refId": "D" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "RPC Rate", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} {{verb}} {{url}}", + "refId": "A" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Request duration 99th quantile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 77 + }, + "id": 23, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 22, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}} {{verb}} {{url}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Request duration 99th quantile", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, { - "collapse": false, - "collapsed": false, - "panels": [ + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 77 + }, + "id": 24, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 23, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 24, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU usage", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 0, - "show": true - } - ] + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - }, - "id": 25, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - ], - "timeFrom": null, - "timeShift": null, - "title": "Goroutines", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } + ] + }, + { + "aliasColors": { + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 77 + }, + "id": 25, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6", - "type": "row" + "seriesOverrides": [ + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] } ], + "refresh": "10s", + "rows": [ + ], "schemaVersion": 14, "style": "dark", "tags": [ @@ -41617,8 +43293,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -42849,7 +44525,7 @@ data: "options": [ ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 0, "tagValuesQuery": "", @@ -42878,7 +44554,7 @@ data: "options": [ ], "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -43012,8 +44688,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -44470,7 +46146,7 @@ data: "options": [ ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 0, "tagValuesQuery": "", @@ -44499,7 +46175,7 @@ data: "options": [ ], "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -44529,7 +46205,7 @@ data: "options": [ ], "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\".+\"}, workload_type)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -44663,8 +46339,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -44727,7 +46403,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(\n instance:node_cpu_utilisation:rate1m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n", + "expr": "(\n instance:node_cpu_utilisation:rate5m{job=\"node-exporter\"}\n*\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n/ scalar(sum(instance:node_num_cpu:sum{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -44741,7 +46417,7 @@ data: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -44822,7 +46498,7 @@ data: "timeShift": null, "title": "CPU Saturation (load1 per CPU)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -44915,7 +46591,7 @@ data: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -44982,7 +46658,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_vmstat_pgmajfault:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -44996,7 +46672,7 @@ data: "timeShift": null, "title": "Memory Saturation (Major Page Faults)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -45084,7 +46760,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} Receive", @@ -45092,7 +46768,7 @@ data: "step": 10 }, { - "expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} Transmit", @@ -45106,7 +46782,7 @@ data: "timeShift": null, "title": "Net Utilisation (Bytes Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -45182,7 +46858,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} Receive", @@ -45190,7 +46866,7 @@ data: "step": 10 }, { - "expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node-exporter\"}", + "expr": "instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node-exporter\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} Transmit", @@ -45204,7 +46880,7 @@ data: "timeShift": null, "title": "Net Saturation (Drops Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -45283,7 +46959,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\"}))\n", + "expr": "instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{device}}", @@ -45297,7 +46973,7 @@ data: "timeShift": null, "title": "Disk IO Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -45364,7 +47040,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\"}))\n", + "expr": "instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\"}\n/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\"}))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{device}}", @@ -45378,7 +47054,7 @@ data: "timeShift": null, "title": "Disk IO Saturation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -45471,7 +47147,7 @@ data: "timeShift": null, "title": "Disk Space Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -45578,8 +47254,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -45642,7 +47318,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_cpu_utilisation:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_cpu_utilisation:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Utilisation", @@ -45656,7 +47332,7 @@ data: "timeShift": null, "title": "CPU Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -45737,7 +47413,7 @@ data: "timeShift": null, "title": "CPU Saturation (Load1 per CPU)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -45830,7 +47506,7 @@ data: "timeShift": null, "title": "Memory Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -45897,7 +47573,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_vmstat_pgmajfault:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_vmstat_pgmajfault:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Major page faults", @@ -45911,7 +47587,7 @@ data: "timeShift": null, "title": "Memory Saturation (Major Page Faults)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -45999,7 +47675,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_bytes_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_receive_bytes_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Receive", @@ -46007,7 +47683,7 @@ data: "step": 10 }, { - "expr": "instance:node_network_transmit_bytes_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_transmit_bytes_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Transmit", @@ -46021,7 +47697,7 @@ data: "timeShift": null, "title": "Net Utilisation (Bytes Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -46097,7 +47773,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance:node_network_receive_drop_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_receive_drop_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Receive drops", @@ -46105,7 +47781,7 @@ data: "step": 10 }, { - "expr": "instance:node_network_transmit_drop_excluding_lo:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance:node_network_transmit_drop_excluding_lo:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Transmit drops", @@ -46119,7 +47795,7 @@ data: "timeShift": null, "title": "Net Saturation (Drops Receive/Transmit)", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -46198,7 +47874,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_seconds:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance_device:node_disk_io_time_seconds:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -46212,7 +47888,7 @@ data: "timeShift": null, "title": "Disk IO Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -46279,7 +47955,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "instance_device:node_disk_io_time_weighted_seconds:rate1m{job=\"node-exporter\", instance=\"$instance\"}", + "expr": "instance_device:node_disk_io_time_weighted_seconds:rate5m{job=\"node-exporter\", instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -46293,7 +47969,7 @@ data: "timeShift": null, "title": "Disk IO Saturation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -46386,7 +48062,7 @@ data: "timeShift": null, "title": "Disk Space Utilisation", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -46518,8 +48194,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -46594,9 +48270,8 @@ data: "steppedLine": false, "targets": [ { - "expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n", + "expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__rate_interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n", "format": "time_series", - "interval": "1m", "intervalFactor": 5, "legendFormat": "{{cpu}}", "refId": "A" @@ -47011,25 +48686,22 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{device}} read", "refId": "A" }, { - "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{device}} written", "refId": "B" }, { - "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__interval])", + "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{device}} io time", "refId": "C" @@ -47230,9 +48902,8 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])", + "expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{device}}", "refId": "A" @@ -47318,9 +48989,8 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])", + "expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__rate_interval])", "format": "time_series", - "interval": "1m", "intervalFactor": 2, "legendFormat": "{{device}}", "refId": "A" @@ -47462,8 +49132,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -47538,14 +49208,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "(\n sum without(instance, node) (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n", + "expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Used Space", "refId": "A" }, { - "expr": "sum without(instance, node) (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n", + "expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Free Space", @@ -47649,7 +49319,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "max without(instance,node) (\n(\n kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n -\n kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n", + "expr": "max without(instance,node) (\n(\n topk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n topk(1, kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n/\ntopk(1, kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -47727,14 +49397,14 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n", + "expr": "sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "Used inodes", "refId": "A" }, { - "expr": "(\n sum without(instance, node) (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n -\n sum without(instance, node) (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n)\n", + "expr": "(\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n -\n sum without(instance, node) (topk(1, (kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})))\n)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": " Free inodes", @@ -47838,7 +49508,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "max without(instance,node) (\nkubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n", + "expr": "max without(instance,node) (\ntopk(1, kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n/\ntopk(1, kubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"})\n* 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -48007,8 +49677,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -48988,7 +50658,7 @@ data: "options": [ ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 0, "tagValuesQuery": "", @@ -49017,7 +50687,7 @@ data: "options": [ ], "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -49047,7 +50717,7 @@ data: "options": [ ], "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\",namespace=~\"$namespace\"}, pod)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -49181,8 +50851,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -49375,7 +51045,7 @@ data: "timeShift": null, "title": "Prometheus Stats", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -49469,7 +51139,7 @@ data: "timeShift": null, "title": "Target Sync", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -49550,7 +51220,7 @@ data: "timeShift": null, "title": "Targets", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -49643,7 +51313,7 @@ data: "timeShift": null, "title": "Average Scrape Interval Duration", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -49748,7 +51418,7 @@ data: "timeShift": null, "title": "Scrape failures", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -49829,7 +51499,7 @@ data: "timeShift": null, "title": "Appended Samples", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -49922,7 +51592,7 @@ data: "timeShift": null, "title": "Head Series", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -50003,7 +51673,7 @@ data: "timeShift": null, "title": "Head Chunks", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -50096,7 +51766,7 @@ data: "timeShift": null, "title": "Query Rate", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -50177,7 +51847,7 @@ data: "timeShift": null, "title": "Stage Duration", "tooltip": { - "shared": true, + "shared": false, "sort": 0, "value_type": "individual" }, @@ -50241,7 +51911,7 @@ data: "type": "datasource" }, { - "allValue": null, + "allValue": ".+", "current": { "selected": true, "text": "All", @@ -50267,7 +51937,7 @@ data: "useTags": false }, { - "allValue": null, + "allValue": ".+", "current": { "selected": true, "text": "All", @@ -50337,8 +52007,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -50430,7 +52100,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(up{job=\"kube-proxy\"})", + "expr": "sum(up{cluster=\"$cluster\", job=\"kube-proxy\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -50495,7 +52165,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", + "expr": "sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "rate", @@ -50582,7 +52252,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", + "expr": "histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -50682,7 +52352,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(kubeproxy_network_programming_duration_seconds_count{job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", + "expr": "sum(rate(kubeproxy_network_programming_duration_seconds_count{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "rate", @@ -50769,7 +52439,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{job=\"kube-proxy\", instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\"}[5m])) by (instance, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -50869,28 +52539,28 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"2..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "2xx", "refId": "A" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"3..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "3xx", "refId": "B" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"4..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "4xx", "refId": "C" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-proxy\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\",code=~\"5..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "5xx", @@ -50977,7 +52647,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\",verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -51077,7 +52747,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -51177,7 +52847,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"kube-proxy\",instance=~\"$instance\"}", + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -51264,7 +52934,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"kube-proxy\",instance=~\"$instance\"}[5m])", + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -51351,7 +53021,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"kube-proxy\",instance=~\"$instance\"}", + "expr": "go_goroutines{cluster=\"$cluster\", job=\"kube-proxy\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -51428,6 +53098,29 @@ data: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { @@ -51440,7 +53133,7 @@ data: "name": "instance", "options": [ ], - "query": "label_values(kubeproxy_network_programming_duration_seconds_bucket{job=\"kube-proxy\"}, instance)", + "query": "label_values(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\"}, instance)", "refresh": 2, "regex": "", "sort": 1, @@ -51496,8 +53189,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -51589,7 +53282,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(up{job=\"kube-scheduler\"})", + "expr": "sum(up{cluster=\"$cluster\", job=\"kube-scheduler\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -51654,31 +53347,31 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} e2e", + "legendFormat": "{{cluster}} {{instance}} e2e", "refId": "A" }, { - "expr": "sum(rate(scheduler_binding_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} binding", + "legendFormat": "{{cluster}} {{instance}} binding", "refId": "B" }, { - "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} scheduling algorithm", + "legendFormat": "{{cluster}} {{instance}} scheduling algorithm", "refId": "C" }, { - "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)", + "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} volume", + "legendFormat": "{{cluster}} {{instance}} volume", "refId": "D" } ], @@ -51762,31 +53455,31 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} e2e", + "legendFormat": "{{cluster}} {{instance}} e2e", "refId": "A" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} binding", + "legendFormat": "{{cluster}} {{instance}} binding", "refId": "B" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} scheduling algorithm", + "legendFormat": "{{cluster}} {{instance}} scheduling algorithm", "refId": "C" }, { - "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} volume", + "legendFormat": "{{cluster}} {{instance}} volume", "refId": "D" } ], @@ -51883,28 +53576,28 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"2..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "2xx", "refId": "A" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"3..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "3xx", "refId": "B" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"4..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "4xx", "refId": "C" }, { - "expr": "sum(rate(rest_client_requests_total{job=\"kube-scheduler\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\",code=~\"5..\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "5xx", @@ -51991,7 +53684,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -52091,7 +53784,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{verb}} {{url}}", @@ -52191,7 +53884,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "process_resident_memory_bytes{job=\"kube-scheduler\", instance=~\"$instance\"}", + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -52278,7 +53971,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=\"kube-scheduler\", instance=~\"$instance\"}[5m])", + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -52365,7 +54058,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "go_goroutines{job=\"kube-scheduler\",instance=~\"$instance\"}", + "expr": "go_goroutines{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -52442,6 +54135,29 @@ data: "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "allValue": null, "current": { @@ -52454,7 +54170,7 @@ data: "name": "instance", "options": [ ], - "query": "label_values(process_cpu_seconds_total{job=\"kube-scheduler\"}, instance)", + "query": "label_values(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-scheduler\"}, instance)", "refresh": 2, "regex": "", "sort": 1, @@ -52510,8 +54226,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -53402,8 +55118,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -54551,7 +56267,7 @@ data: "options": [ ], "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, + "refresh": 2, "regex": "", "sort": 0, "tagValuesQuery": "", @@ -54580,7 +56296,7 @@ data: "options": [ ], "query": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -54610,7 +56326,7 @@ data: "options": [ ], "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\"}, workload)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -54640,7 +56356,7 @@ data: "options": [ ], "query": "label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\"}, workload_type)", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, @@ -54774,8 +56490,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 grafana_dashboard: '1' heritage: metalk8s metalk8s.scality.com/monitor: '' @@ -54791,8 +56507,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 7.5.5 - helm.sh/chart: grafana-6.8.3 + app.kubernetes.io/version: 8.0.1 + helm.sh/chart: grafana-6.13.0 heritage: metalk8s name: prometheus-operator-grafana-clusterrole namespace: metalk8s-monitoring @@ -54815,7 +56531,7 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/part-of: metalk8s - helm.sh/chart: kube-state-metrics-2.13.2 + helm.sh/chart: kube-state-metrics-3.1.1 heritage: metalk8s name: psp-prometheus-operator-kube-state-metrics namespace: metalk8s-monitoring @@ -54837,7 +56553,7 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/part-of: metalk8s - helm.sh/chart: kube-state-metrics-2.13.2 + helm.sh/chart: kube-state-metrics-3.1.1 heritage: metalk8s name: prometheus-operator-kube-state-metrics namespace: metalk8s-monitoring @@ -55044,7 +56760,7 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-node-exporter app.kubernetes.io/part-of: metalk8s - chart: prometheus-node-exporter-1.17.0 + chart: prometheus-node-exporter-1.18.1 heritage: metalk8s jobLabel: node-exporter release: prometheus-operator @@ -55069,8 +56785,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -55157,8 +56873,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -55183,8 +56899,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-prometheus app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -55226,8 +56942,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-prometheus app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -55251,8 +56967,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 7.5.5 - helm.sh/chart: grafana-6.8.3 + app.kubernetes.io/version: 8.0.1 + helm.sh/chart: grafana-6.13.0 heritage: metalk8s name: prometheus-operator-grafana-clusterrolebinding namespace: metalk8s-monitoring @@ -55273,7 +56989,7 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/part-of: metalk8s - helm.sh/chart: kube-state-metrics-2.13.2 + helm.sh/chart: kube-state-metrics-3.1.1 heritage: metalk8s name: prometheus-operator-kube-state-metrics namespace: metalk8s-monitoring @@ -55294,7 +57010,7 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/part-of: metalk8s - helm.sh/chart: kube-state-metrics-2.13.2 + helm.sh/chart: kube-state-metrics-3.1.1 heritage: metalk8s name: psp-prometheus-operator-kube-state-metrics namespace: metalk8s-monitoring @@ -55315,7 +57031,7 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-node-exporter app.kubernetes.io/part-of: metalk8s - chart: prometheus-node-exporter-1.17.0 + chart: prometheus-node-exporter-1.18.1 heritage: metalk8s jobLabel: node-exporter release: prometheus-operator @@ -55339,8 +57055,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -55364,8 +57080,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -55389,8 +57105,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-prometheus app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -55414,8 +57130,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-prometheus app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -55438,8 +57154,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 7.5.5 - helm.sh/chart: grafana-6.8.3 + app.kubernetes.io/version: 8.0.1 + helm.sh/chart: grafana-6.13.0 heritage: metalk8s name: prometheus-operator-grafana namespace: metalk8s-monitoring @@ -55462,8 +57178,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-alertmanager app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -55487,8 +57203,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 7.5.5 - helm.sh/chart: grafana-6.8.3 + app.kubernetes.io/version: 8.0.1 + helm.sh/chart: grafana-6.13.0 heritage: metalk8s name: prometheus-operator-grafana namespace: metalk8s-monitoring @@ -55510,8 +57226,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-alertmanager app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -55534,8 +57250,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 7.5.5 - helm.sh/chart: grafana-6.8.3 + app.kubernetes.io/version: 8.0.1 + helm.sh/chart: grafana-6.13.0 heritage: metalk8s name: prometheus-operator-grafana namespace: metalk8s-monitoring @@ -55560,7 +57276,7 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/part-of: metalk8s - helm.sh/chart: kube-state-metrics-2.13.2 + helm.sh/chart: kube-state-metrics-3.1.1 heritage: metalk8s name: prometheus-operator-kube-state-metrics namespace: metalk8s-monitoring @@ -55585,7 +57301,7 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-node-exporter app.kubernetes.io/part-of: metalk8s - chart: prometheus-node-exporter-1.17.0 + chart: prometheus-node-exporter-1.18.1 heritage: metalk8s jobLabel: node-exporter release: prometheus-operator @@ -55611,8 +57327,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-alertmanager app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -55639,8 +57355,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-coredns app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s jobLabel: coredns metalk8s.scality.com/monitor: '' @@ -55666,8 +57382,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-kube-controller-manager app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s jobLabel: kube-controller-manager metalk8s.scality.com/monitor: '' @@ -55694,8 +57410,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-kube-etcd app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s jobLabel: kube-etcd metalk8s.scality.com/monitor: '' @@ -55722,8 +57438,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-kube-proxy app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s jobLabel: kube-proxy metalk8s.scality.com/monitor: '' @@ -55750,8 +57466,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-kube-scheduler app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s jobLabel: kube-scheduler metalk8s.scality.com/monitor: '' @@ -55778,8 +57494,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -55804,8 +57520,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-prometheus app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -55818,7 +57534,7 @@ spec: port: 9090 targetPort: 9090 selector: - app: prometheus + app.kubernetes.io/name: prometheus prometheus: prometheus-operator-prometheus type: ClusterIP --- @@ -55830,7 +57546,7 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-node-exporter app.kubernetes.io/part-of: metalk8s - chart: prometheus-node-exporter-1.17.0 + chart: prometheus-node-exporter-1.18.1 heritage: metalk8s jobLabel: node-exporter release: prometheus-operator @@ -55850,7 +57566,7 @@ spec: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-node-exporter app.kubernetes.io/part-of: metalk8s - chart: prometheus-node-exporter-1.17.0 + chart: prometheus-node-exporter-1.18.1 heritage: metalk8s jobLabel: node-exporter release: prometheus-operator @@ -55928,8 +57644,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 7.5.5 - helm.sh/chart: grafana-6.8.3 + app.kubernetes.io/version: 8.0.1 + helm.sh/chart: grafana-6.13.0 heritage: metalk8s name: prometheus-operator-grafana namespace: metalk8s-monitoring @@ -55949,8 +57665,8 @@ spec: apiVersion="v1", namespace="metalk8s-monitoring", name="prometheus-operator-grafana", path="data:grafana.ini") checksum/dashboards-json-config: 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b - checksum/sc-dashboard-provider-config: 46ac8d1a40db8fbce04a7da8b284bd517053e468534a5f64c0d671777be82574 - checksum/secret: e2bd6e1021d2f448aeb36d04122e4efb343466de2fafbf7be9bdb8507ab4bf5a + checksum/sc-dashboard-provider-config: a173a86f2a73a38694f15b1c29166d2f21d5fe505644c17bc48414e0431618ee + checksum/secret: 3a961265bddf6e6e02f7b3d81656b4762b8a5c8e56e6d8ce0bc7eb6dd2fe83e6 labels: app.kubernetes.io/instance: prometheus-operator app.kubernetes.io/name: grafana @@ -55965,7 +57681,7 @@ spec: value: /tmp/dashboards - name: RESOURCE value: both - image: {% endraw -%}{{ build_image_name("k8s-sidecar", False) }}{%- raw %}:1.10.7 + image: {% endraw -%}{{ build_image_name("k8s-sidecar", False) }}{%- raw %}:1.12.2 imagePullPolicy: IfNotPresent name: grafana-sc-dashboard resources: {} @@ -55983,7 +57699,15 @@ spec: secretKeyRef: key: admin-password name: prometheus-operator-grafana - image: {% endraw -%}{{ build_image_name("grafana", False) }}{%- raw %}:7.5.5 + - name: GF_PATHS_DATA + value: /var/lib/grafana/ + - name: GF_PATHS_LOGS + value: /var/log/grafana + - name: GF_PATHS_PLUGINS + value: /var/lib/grafana/plugins + - name: GF_PATHS_PROVISIONING + value: /etc/grafana/provisioning + image: {% endraw -%}{{ build_image_name("grafana", False) }}{%- raw %}:8.0.1 imagePullPolicy: IfNotPresent livenessProbe: failureThreshold: 10 @@ -56028,7 +57752,7 @@ spec: value: /etc/grafana/provisioning/datasources - name: RESOURCE value: both - image: {% endraw -%}{{ build_image_name("k8s-sidecar", False) }}{%- raw %}:1.10.7 + image: {% endraw -%}{{ build_image_name("k8s-sidecar", False) }}{%- raw %}:1.12.2 imagePullPolicy: IfNotPresent name: grafana-sc-datasources resources: {} @@ -56071,8 +57795,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 1.9.8 - helm.sh/chart: kube-state-metrics-2.13.2 + app.kubernetes.io/version: 2.0.0 + helm.sh/chart: kube-state-metrics-3.1.1 heritage: metalk8s name: prometheus-operator-kube-state-metrics namespace: metalk8s-monitoring @@ -56089,35 +57813,35 @@ spec: spec: containers: - args: - - --collectors=certificatesigningrequests - - --collectors=configmaps - - --collectors=cronjobs - - --collectors=daemonsets - - --collectors=deployments - - --collectors=endpoints - - --collectors=horizontalpodautoscalers - - --collectors=ingresses - - --collectors=jobs - - --collectors=limitranges - - --collectors=mutatingwebhookconfigurations - - --collectors=namespaces - - --collectors=networkpolicies - - --collectors=nodes - - --collectors=persistentvolumeclaims - - --collectors=persistentvolumes - - --collectors=poddisruptionbudgets - - --collectors=pods - - --collectors=replicasets - - --collectors=replicationcontrollers - - --collectors=resourcequotas - - --collectors=secrets - - --collectors=services - - --collectors=statefulsets - - --collectors=storageclasses - - --collectors=validatingwebhookconfigurations - - --collectors=volumeattachments + - --resources=certificatesigningrequests + - --resources=configmaps + - --resources=cronjobs + - --resources=daemonsets + - --resources=deployments + - --resources=endpoints + - --resources=horizontalpodautoscalers + - --resources=ingresses + - --resources=jobs + - --resources=limitranges + - --resources=mutatingwebhookconfigurations + - --resources=namespaces + - --resources=networkpolicies + - --resources=nodes + - --resources=persistentvolumeclaims + - --resources=persistentvolumes + - --resources=poddisruptionbudgets + - --resources=pods + - --resources=replicasets + - --resources=replicationcontrollers + - --resources=resourcequotas + - --resources=secrets + - --resources=services + - --resources=statefulsets + - --resources=storageclasses + - --resources=validatingwebhookconfigurations + - --resources=volumeattachments - --telemetry-port=8081 - image: {% endraw -%}{{ build_image_name("kube-state-metrics", False) }}{%- raw %}:v1.9.8 + image: {% endraw -%}{{ build_image_name("kube-state-metrics", False) }}{%- raw %}:v2.0.0 imagePullPolicy: IfNotPresent livenessProbe: httpGet: @@ -56159,8 +57883,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -56180,8 +57904,8 @@ spec: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -56190,10 +57914,10 @@ spec: - args: - --kubelet-service=kube-system/prometheus-operator-kubelet - --localhost=127.0.0.1 - - --prometheus-config-reloader={% endraw -%}{{ build_image_name("prometheus-config-reloader", False) }}{%- raw %}:v0.47.0 + - --prometheus-config-reloader={% endraw -%}{{ build_image_name("prometheus-config-reloader", False) }}{%- raw %}:v0.48.1 - --config-reloader-cpu=100m - --config-reloader-memory=50Mi - image: {% endraw -%}{{ build_image_name("prometheus-operator", False) }}{%- raw %}:v0.47.0 + image: {% endraw -%}{{ build_image_name("prometheus-operator", False) }}{%- raw %}:v0.48.1 imagePullPolicy: IfNotPresent name: prometheus-operator ports: @@ -56230,8 +57954,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 7.5.5 - helm.sh/chart: grafana-6.8.3 + app.kubernetes.io/version: 8.0.1 + helm.sh/chart: grafana-6.13.0 heritage: metalk8s name: prometheus-operator-grafana namespace: metalk8s-monitoring @@ -56254,8 +57978,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-alertmanager app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -56281,7 +58005,7 @@ spec: alertmanagerConfigNamespaceSelector: {} alertmanagerConfigSelector: {} externalUrl: http://prometheus-operator-alertmanager.metalk8s-monitoring:9093 - image: {% endraw -%}{{ build_image_name("alertmanager", False) }}{%- raw %}:v0.21.0 + image: {% endraw -%}{{ build_image_name("alertmanager", False) }}{%- raw %}:v0.22.2 listenLocal: false logFormat: logfmt logLevel: info @@ -56324,7 +58048,7 @@ spec: - effect: NoSchedule key: node-role.kubernetes.io/infra operator: Exists - version: v0.21.0 + version: v0.22.2 --- apiVersion: monitoring.coreos.com/v1 kind: Prometheus @@ -56335,8 +58059,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-prometheus app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -56349,7 +58073,7 @@ spec: - podAffinityTerm: labelSelector: matchExpressions: - - key: app + - key: app.kubernetes.io/name operator: In values: - prometheus @@ -56368,7 +58092,7 @@ spec: port: web enableAdminAPI: {% endraw -%}{{ prometheus.spec.config.enable_admin_api }}{%- raw %} externalUrl: http://prometheus-operator-prometheus.metalk8s-monitoring:9090 - image: {% endraw -%}{{ build_image_name("prometheus", False) }}{%- raw %}:v2.26.0 + image: {% endraw -%}{{ build_image_name("prometheus", False) }}{%- raw %}:v2.27.1 listenLocal: false logFormat: logfmt logLevel: info @@ -56422,7 +58146,7 @@ spec: - effect: NoSchedule key: node-role.kubernetes.io/infra operator: Exists - version: v2.26.0 + version: v2.27.1 --- apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -56433,8 +58157,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -56444,35 +58168,143 @@ spec: groups: - name: alertmanager.rules rules: - - alert: AlertmanagerConfigInconsistent + - alert: AlertmanagerFailedReload annotations: - message: |- - The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync. - {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }} - Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}" - {{ end }} - expr: count by(namespace,service) (count_values by(namespace,service) ("config_hash", - alertmanager_config_hash{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring"})) - != 1 - for: 5m + description: Configuration has failed to load for {{ $labels.namespace }}/{{ + $labels.pod}}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerfailedreload + summary: Reloading an Alertmanager configuration has failed. + expr: |- + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_config_last_reload_successful{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring"}[5m]) == 0 + for: 10m labels: severity: critical - - alert: AlertmanagerFailedReload + - alert: AlertmanagerMembersInconsistent annotations: - message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace - }}/{{ $labels.pod}}. - expr: alertmanager_config_last_reload_successful{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring"} - == 0 + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only + found {{ $value }} members of the {{$labels.job}} cluster. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagermembersinconsistent + summary: A member of an Alertmanager cluster has not found all other cluster + members. + expr: |- + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_cluster_members{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring"}[5m]) + < on (namespace,service) group_left + count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring"}[5m])) for: 10m + labels: + severity: critical + - alert: AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed + to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration + }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerfailedtosendalerts + summary: An Alertmanager instance failed to send notifications. + expr: |- + ( + rate(alertmanager_notifications_failed_total{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring"}[5m]) + / + rate(alertmanager_notifications_total{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring"}[5m]) + ) + > 0.01 + for: 5m labels: severity: warning - - alert: AlertmanagerMembersInconsistent + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration + }} sent from any instance in the {{$labels.job}} cluster is {{ $value | + humanizePercentage }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications + to a critical integration. + expr: |- + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring", integration=~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring", integration=~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration + }} sent from any instance in the {{$labels.job}} cluster is {{ $value | + humanizePercentage }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications + to a non-critical integration. + expr: |- + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring", integration!~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring", integration!~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerConfigInconsistent + annotations: + description: Alertmanager instances within the {{$labels.job}} cluster have + different configurations. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerconfiginconsistent + summary: Alertmanager instances within the same cluster have different configurations. + expr: |- + count by (namespace,service) ( + count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring"}) + ) + != 1 + for: 20m + labels: + severity: critical + - alert: AlertmanagerClusterDown + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances + within the {{$labels.job}} cluster have been up for less than half of the + last 5m.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerclusterdown + summary: Half or more of the Alertmanager instances within the same cluster + are down. + expr: |- + ( + count by (namespace,service) ( + avg_over_time(up{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring"}[5m]) < 0.5 + ) + / + count by (namespace,service) ( + up{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterCrashlooping annotations: - message: Alertmanager has not found all other members of the cluster. + description: '{{ $value | humanizePercentage }} of Alertmanager instances + within the {{$labels.job}} cluster have restarted at least 5 times in the + last 10m.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerclustercrashlooping + summary: Half or more of the Alertmanager instances within the same cluster + are crashlooping. expr: |- - alertmanager_cluster_members{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring"} - != on (service) GROUP_LEFT() - count by (service) (alertmanager_cluster_members{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring"}) + ( + count by (namespace,service) ( + changes(process_start_time_seconds{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring"}[10m]) > 4 + ) + / + count by (namespace,service) ( + up{job="prometheus-operator-alertmanager",namespace="metalk8s-monitoring"} + ) + ) + >= 0.5 for: 5m labels: severity: critical @@ -56486,8 +58318,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -56634,8 +58466,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -56647,8 +58479,10 @@ spec: rules: - alert: TargetDown annotations: - message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service + description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-targetdown + summary: One or more targets are unreachable. expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 for: 10m @@ -56656,12 +58490,15 @@ spec: severity: warning - alert: Watchdog annotations: - message: |- + description: |- This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the "DeadMansSnitch" integration in PagerDuty. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-watchdog + summary: An alert that should always be firing to certify that Alertmanager + is working properly. expr: vector(1) labels: severity: none @@ -56675,8 +58512,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -56688,11 +58525,11 @@ spec: rules: - expr: |- sum by (cluster, namespace, pod, container) ( - rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) + irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate - expr: |- container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, @@ -56786,8 +58623,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -56802,56 +58639,56 @@ spec: 1 - ( ( # write too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) ) + ( # read too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) - ( ( - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) or vector(0) ) + - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) ) ) + # errors - sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) + sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) ) / - sum(code:apiserver_request_total:increase30d) + sum by (cluster) (code:apiserver_request_total:increase30d) labels: verb: all record: apiserver_request:availability30d - expr: |- 1 - ( - sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) - ( # too slow ( - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) or vector(0) ) + - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) ) + # errors - sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) ) / - sum(code:apiserver_request_total:increase30d{verb="read"}) + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"}) labels: verb: read record: apiserver_request:availability30d @@ -56859,75 +58696,75 @@ spec: 1 - ( ( # too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) + sum by (cluster) (increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) ) + # errors - sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) ) / - sum(code:apiserver_request_total:increase30d{verb="write"}) + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"}) labels: verb: write record: apiserver_request:availability30d - expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 record: code_verb:apiserver_request_total:increase30d - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h])) + - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) labels: verb: read record: code:apiserver_request_total:increase30d - - expr: sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) labels: verb: write record: code:apiserver_request_total:increase30d @@ -56941,8 +58778,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -57018,8 +58855,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -57033,26 +58870,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) labels: verb: read record: apiserver_request:burnrate1d @@ -57060,26 +58897,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) labels: verb: read record: apiserver_request:burnrate1h @@ -57087,26 +58924,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) labels: verb: read record: apiserver_request:burnrate2h @@ -57114,26 +58951,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) labels: verb: read record: apiserver_request:burnrate30m @@ -57141,26 +58978,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) labels: verb: read record: apiserver_request:burnrate3d @@ -57168,26 +59005,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: verb: read record: apiserver_request:burnrate5m @@ -57195,26 +59032,26 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) - ( ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) or vector(0) ) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) ) ) + # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) labels: verb: read record: apiserver_request:burnrate6h @@ -57222,15 +59059,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) labels: verb: write record: apiserver_request:burnrate1d @@ -57238,15 +59075,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) labels: verb: write record: apiserver_request:burnrate1h @@ -57254,15 +59091,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) labels: verb: write record: apiserver_request:burnrate2h @@ -57270,15 +59107,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) labels: verb: write record: apiserver_request:burnrate30m @@ -57286,15 +59123,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) labels: verb: write record: apiserver_request:burnrate3d @@ -57302,15 +59139,15 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write record: apiserver_request:burnrate5m @@ -57318,33 +59155,33 @@ spec: ( ( # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) + sum by (cluster) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) ) + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) ) / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) labels: verb: write record: apiserver_request:burnrate6h - - expr: sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: verb: read record: code_resource:apiserver_request_total:rate5m - - expr: sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write record: code_resource:apiserver_request_total:rate5m - - expr: histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) + - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 labels: quantile: '0.99' verb: read record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) + - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 labels: quantile: '0.99' @@ -57375,8 +59212,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -57400,8 +59237,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -57437,8 +59274,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -57503,8 +59340,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -57544,6 +59381,32 @@ spec: for: 15m labels: severity: critical + - alert: KubeStateMetricsShardingMismatch + annotations: + description: kube-state-metrics pods are running with different --total-shards + configuration, some Kubernetes objects may be exposed multiple times or + not exposed at all. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsshardingmismatch + summary: kube-state-metrics sharding is misconfigured. + expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != + 0 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsShardsMissing + annotations: + description: kube-state-metrics shards are missing, some Kubernetes objects + are not being exposed. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsshardsmissing + summary: kube-state-metrics shards are missing. + expr: |- + 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1 + - + sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) + != 0 + for: 15m + labels: + severity: critical --- apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -57554,8 +59417,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -57593,8 +59456,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -57610,8 +59473,8 @@ spec: }}) is restarting {{ printf "%.2f" $value }} times / 10 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping summary: Pod is crash looping. - expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", - namespace=~".*"}[10m]) * 60 * 5 > 0 + expr: increase(kube_pod_container_status_restarts_total{job="kube-state-metrics", + namespace=~".*"}[10m]) > 0 for: 15m labels: severity: warning @@ -57860,8 +59723,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -57996,8 +59859,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -58062,8 +59925,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -58149,8 +60012,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -58180,8 +60043,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -58340,8 +60203,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -58370,8 +60233,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -58416,8 +60279,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -58436,9 +60299,9 @@ spec: record: instance:node_num_cpu:sum - expr: |- 1 - avg without (cpu, mode) ( - rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m]) + rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m]) ) - record: instance:node_cpu_utilisation:rate1m + record: instance:node_cpu_utilisation:rate5m - expr: |- ( node_load1{job="node-exporter"} @@ -58453,32 +60316,32 @@ spec: node_memory_MemTotal_bytes{job="node-exporter"} ) record: instance:node_memory_utilisation:ratio - - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) - record: instance:node_vmstat_pgmajfault:rate1m - - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_seconds:rate1m - - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_weighted_seconds:rate1m + - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) + record: instance:node_vmstat_pgmajfault:rate5m + - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) + record: instance_device:node_disk_io_time_seconds:rate5m + - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m]) + record: instance_device:node_disk_io_time_weighted_seconds:rate5m - expr: |- sum without (device) ( - rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_receive_bytes_excluding_lo:rate1m + record: instance:node_network_receive_bytes_excluding_lo:rate5m - expr: |- sum without (device) ( - rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_transmit_bytes_excluding_lo:rate1m + record: instance:node_network_transmit_bytes_excluding_lo:rate5m - expr: |- sum without (device) ( - rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_receive_drop_excluding_lo:rate1m + record: instance:node_network_receive_drop_excluding_lo:rate5m - expr: |- sum without (device) ( - rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) ) - record: instance:node_network_transmit_drop_excluding_lo:rate1m + record: instance:node_network_transmit_drop_excluding_lo:rate5m --- apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -58489,8 +60352,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -58503,7 +60366,8 @@ spec: - alert: NodeNetworkInterfaceFlapping annotations: message: Network interface "{{ $labels.device }}" changing it's up status - often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" + often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkinterfaceflapping expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 for: 2m labels: @@ -58518,8 +60382,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -58563,8 +60427,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -58666,8 +60530,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -58681,6 +60545,7 @@ spec: annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusbadconfig summary: Failed Prometheus configuration reload. expr: |- # Without max_over_time, failed scrapes could create false negatives, see @@ -58693,6 +60558,7 @@ spec: annotations: description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusnotificationqueuerunningfull summary: Prometheus alert notification queue predicted to run full in less than 30m. expr: |- @@ -58710,6 +60576,7 @@ spec: annotations: description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheuserrorsendingalertstosomealertmanagers summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. expr: |- @@ -58723,26 +60590,11 @@ spec: for: 15m labels: severity: warning - - alert: PrometheusErrorSendingAlertsToAnyAlertmanager - annotations: - description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts - from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' - summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. - expr: |- - min without(alertmanager) ( - rate(prometheus_notifications_errors_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) - / - rate(prometheus_notifications_sent_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) - ) - * 100 - > 3 - for: 15m - labels: - severity: critical - alert: PrometheusNotConnectedToAlertmanagers annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusnotconnectedtoalertmanagers summary: Prometheus is not connected to any Alertmanagers. expr: |- # Without max_over_time, failed scrapes could create false negatives, see @@ -58755,6 +60607,7 @@ spec: annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheustsdbreloadsfailing summary: Prometheus has issues reloading blocks from disk. expr: increase(prometheus_tsdb_reloads_failures_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[3h]) > 0 @@ -58765,6 +60618,7 @@ spec: annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheustsdbcompactionsfailing summary: Prometheus has issues compacting blocks. expr: increase(prometheus_tsdb_compactions_failed_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[3h]) > 0 @@ -58775,9 +60629,18 @@ spec: annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusnotingestingsamples summary: Prometheus is not ingesting samples. - expr: rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) - <= 0 + expr: |- + ( + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) <= 0 + and + ( + sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}) > 0 + or + sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}) > 0 + ) + ) for: 10m labels: severity: warning @@ -58786,6 +60649,7 @@ spec: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusduplicatetimestamps summary: Prometheus is dropping samples with duplicate timestamps. expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) > 0 @@ -58796,6 +60660,7 @@ spec: annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoutofordertimestamps summary: Prometheus drops samples with out-of-order timestamps. expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) > 0 @@ -58807,15 +60672,16 @@ spec: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusremotestoragefailures summary: Prometheus fails to send samples to remote storage. expr: |- ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m])) / ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m])) + - rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) + (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m])) ) ) * 100 @@ -58828,13 +60694,14 @@ spec: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusremotewritebehind summary: Prometheus remote write is behind. expr: |- # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) - - on(job, instance) group_right + - ignoring(remote_name, url) group_right max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) ) > 120 @@ -58848,6 +60715,7 @@ spec: $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}` $labels.instance | query | first | value }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusremotewritedesiredshards summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. expr: |- @@ -58865,6 +60733,7 @@ spec: annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusrulefailures summary: Prometheus is failing rule evaluations. expr: increase(prometheus_rule_evaluation_failures_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) > 0 @@ -58875,6 +60744,7 @@ spec: annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusmissingruleevaluations summary: Prometheus is missing rule evaluations due to slow rule group evaluation. expr: increase(prometheus_rule_group_iterations_missed_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) > 0 @@ -58886,6 +60756,7 @@ spec: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheustargetlimithit summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) @@ -58893,6 +60764,36 @@ spec: for: 15m labels: severity: warning + - alert: PrometheusLabelLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped + {{ printf "%.0f" $value }} targets because some samples exceeded the configured + label_limit, label_name_length_limit or label_value_length_limit. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheuslabellimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded + the labels limit. + expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring"}[5m]) + > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusErrorSendingAlertsToAnyAlertmanager + annotations: + description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts + from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheuserrorsendingalertstoanyalertmanager + summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. + expr: |- + min without (alertmanager) ( + rate(prometheus_notifications_errors_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring",alertmanager!~``}[5m]) + / + rate(prometheus_notifications_sent_total{job="prometheus-operator-prometheus",namespace="metalk8s-monitoring",alertmanager!~``}[5m]) + ) + * 100 + > 3 + for: 15m + labels: + severity: critical --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor @@ -58903,8 +60804,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-alertmanager app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -58932,8 +60833,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-coredns app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -58961,8 +60862,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-apiserver app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -58995,8 +60896,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-kube-controller-manager app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -59028,8 +60929,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-kube-etcd app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -59057,8 +60958,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-kube-proxy app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -59086,8 +60987,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-kube-scheduler app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -59119,8 +61020,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-kube-state-metrics app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -59145,8 +61046,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-kubelet app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -59195,7 +61096,7 @@ spec: - kube-system selector: matchLabels: - app.kubernetes.io/managed-by: prometheus-operator + app.kubernetes.io/name: kubelet k8s-app: kubelet --- apiVersion: monitoring.coreos.com/v1 @@ -59207,8 +61108,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-node-exporter app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -59232,8 +61133,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-grafana app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -59260,8 +61161,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-operator app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator @@ -59288,8 +61189,8 @@ metadata: app.kubernetes.io/managed-by: salt app.kubernetes.io/name: prometheus-operator-prometheus app.kubernetes.io/part-of: metalk8s - app.kubernetes.io/version: 15.4.4 - chart: kube-prometheus-stack-15.4.4 + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 heritage: metalk8s metalk8s.scality.com/monitor: '' release: prometheus-operator diff --git a/salt/metalk8s/addons/prometheus-operator/deployed/prometheus-rules.sls b/salt/metalk8s/addons/prometheus-operator/deployed/prometheus-rules.sls index 6549e83362..66be046e3c 100644 --- a/salt/metalk8s/addons/prometheus-operator/deployed/prometheus-rules.sls +++ b/salt/metalk8s/addons/prometheus-operator/deployed/prometheus-rules.sls @@ -168,7 +168,8 @@ spec: {{ printf "%.0f" $value }} receive errors in the last two minutes.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs summary: Network interface is reporting many receive errors. - expr: increase(node_network_receive_errs_total[2m]) > {% endraw %}{{ rules.node_exporter.node_network_receive_errors.warning.errors }}{% raw %} + expr: increase(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) + > {% endraw %}{{ rules.node_exporter.node_network_receive_errors.warning.error_rate }}{% raw %} for: 1h labels: severity: warning @@ -178,7 +179,8 @@ spec: {{ printf "%.0f" $value }} transmit errors in the last two minutes.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs summary: Network interface is reporting many transmit errors. - expr: increase(node_network_transmit_errs_total[2m]) > {% endraw %}{{ rules.node_exporter.node_network_transmit_errors.warning.errors }}{% raw %} + expr: increase(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) + > {% endraw %}{{ rules.node_exporter.node_network_transmit_errors.warning.error_rate }}{% raw %} for: 1h labels: severity: warning @@ -217,7 +219,10 @@ spec: is configured on this host. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising summary: Clock not synchronising. - expr: min_over_time(node_timex_sync_status[5m]) == {% endraw %}{{ rules.node_exporter.node_clock_not_synchronising.warning.threshold }}{% raw %} + expr: |- + min_over_time(node_timex_sync_status[5m]) == {% endraw %}{{ rules.node_exporter.node_clock_not_synchronising.warning.threshold }}{% raw %} + and + node_timex_maxerror_seconds >= 16 for: 10m labels: severity: warning @@ -247,7 +252,7 @@ spec: Array '{{ $labels.device }}' needs attention and possibly a disk swap. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddiskfailure summary: Failed device in RAID array - expr: node_md_disks{state="fail"} >= {% endraw %}{{ rules.node_exporter.node_raid_disk_failure.warning.threshold }}{% raw %} + expr: node_md_disks{state="failed"} >= {% endraw %}{{ rules.node_exporter.node_raid_disk_failure.warning.threshold }}{% raw %} labels: severity: warning {%- endraw %} diff --git a/tools/rule_extractor/alerting_rules.csv b/tools/rule_extractor/alerting_rules.csv index ace327c0b9..a19ef8fbee 100644 --- a/tools/rule_extractor/alerting_rules.csv +++ b/tools/rule_extractor/alerting_rules.csv @@ -1,9 +1,11 @@ -AlertmanagerConfigInconsistent,critical,"The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync. -{{ range printf ""alertmanager_config_hash{namespace=\""%s\"",service=\""%s\""}"" $labels.namespace $labels.service | query }} -Configuration hash for pod {{ .Labels.pod }} is ""{{ printf ""%.f"" .Value }}"" -{{ end }}" -AlertmanagerFailedReload,warning,Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. -AlertmanagerMembersInconsistent,critical,Alertmanager has not found all other members of the cluster. +AlertmanagerFailedReload,critical,Reloading an Alertmanager configuration has failed. +AlertmanagerMembersInconsistent,critical,A member of an Alertmanager cluster has not found all other cluster members. +AlertmanagerFailedToSendAlerts,warning,An Alertmanager instance failed to send notifications. +AlertmanagerClusterFailedToSendAlerts,critical,All Alertmanager instances in a cluster failed to send notifications to a critical integration. +AlertmanagerClusterFailedToSendAlerts,warning,All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. +AlertmanagerConfigInconsistent,critical,Alertmanager instances within the same cluster have different configurations. +AlertmanagerClusterDown,critical,Half or more of the Alertmanager instances within the same cluster are down. +AlertmanagerClusterCrashlooping,critical,Half or more of the Alertmanager instances within the same cluster are crashlooping. etcdInsufficientMembers,critical,"etcd cluster ""{{ $labels.job }}"": insufficient members ({{ $value }})." etcdNoLeader,critical,"etcd cluster ""{{ $labels.job }}"": member {{ $labels.instance }} has no leader." etcdHighNumberOfLeaderChanges,warning,"etcd cluster ""{{ $labels.job }}"": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour." @@ -17,18 +19,16 @@ etcdHighCommitDurations,warning,"etcd cluster ""{{ $labels.job }}"": 99th percen etcdHighNumberOfFailedHTTPRequests,warning,{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }} etcdHighNumberOfFailedHTTPRequests,critical,{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}. etcdHTTPRequestsSlow,warning,etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow. -TargetDown,warning,"{{ printf ""%.4g"" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down." -Watchdog,none,"This is an alert meant to ensure that the entire alerting pipeline is functional. -This alert is always firing, therefore it should always be firing in Alertmanager -and always fire against a receiver. There are integrations with various notification -mechanisms that send a notification when this alert is not firing. For example the -""DeadMansSnitch"" integration in PagerDuty." +TargetDown,warning,One or more targets are unreachable. +Watchdog,none,An alert that should always be firing to certify that Alertmanager is working properly. KubeAPIErrorBudgetBurn,critical,The API server is burning too much error budget. KubeAPIErrorBudgetBurn,critical,The API server is burning too much error budget. KubeAPIErrorBudgetBurn,warning,The API server is burning too much error budget. KubeAPIErrorBudgetBurn,warning,The API server is burning too much error budget. KubeStateMetricsListErrors,critical,kube-state-metrics is experiencing errors in list operations. KubeStateMetricsWatchErrors,critical,kube-state-metrics is experiencing errors in watch operations. +KubeStateMetricsShardingMismatch,critical,kube-state-metrics sharding is misconfigured. +KubeStateMetricsShardsMissing,critical,kube-state-metrics shards are missing. KubePodCrashLooping,warning,Pod is crash looping. KubePodNotReady,warning,Pod has been in a non-ready state for more than 15 minutes. KubeDeploymentGenerationMismatch,warning,Deployment generation mismatch due to possible roll-back @@ -94,7 +94,7 @@ NodeClockNotSynchronising,warning,Clock on {{ $labels.instance }} is not synchro NodeTextFileCollectorScrapeError,warning,Node Exporter text file collector failed to scrape. NodeRAIDDegraded,critical,RAID Array is degraded NodeRAIDDiskFailure,warning,Failed device in RAID array -NodeNetworkInterfaceFlapping,warning,"Network interface ""{{ $labels.device }}"" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}""" +NodeNetworkInterfaceFlapping,warning,"Network interface ""{{ $labels.device }}"" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" PrometheusOperatorListErrors,warning,Errors while performing list operations in controller. PrometheusOperatorWatchErrors,warning,Errors while performing watch operations in controller. PrometheusOperatorSyncFailed,warning,Last controller reconciliation failed @@ -105,7 +105,6 @@ PrometheusOperatorRejectedResources,warning,Resources rejected by Prometheus ope PrometheusBadConfig,critical,Failed Prometheus configuration reload. PrometheusNotificationQueueRunningFull,warning,Prometheus alert notification queue predicted to run full in less than 30m. PrometheusErrorSendingAlertsToSomeAlertmanagers,warning,Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. -PrometheusErrorSendingAlertsToAnyAlertmanager,critical,Prometheus encounters more than 3% errors sending alerts to any Alertmanager. PrometheusNotConnectedToAlertmanagers,warning,Prometheus is not connected to any Alertmanagers. PrometheusTSDBReloadsFailing,warning,Prometheus has issues reloading blocks from disk. PrometheusTSDBCompactionsFailing,warning,Prometheus has issues compacting blocks. @@ -118,3 +117,5 @@ PrometheusRemoteWriteDesiredShards,warning,Prometheus remote write desired shard PrometheusRuleFailures,critical,Prometheus is failing rule evaluations. PrometheusMissingRuleEvaluations,warning,Prometheus is missing rule evaluations due to slow rule group evaluation. PrometheusTargetLimitHit,warning,Prometheus has dropped targets because some scrape configs have exceeded the targets limit. +PrometheusLabelLimitHit,warning,Prometheus has dropped targets because some scrape configs have exceeded the labels limit. +PrometheusErrorSendingAlertsToAnyAlertmanager,critical,Prometheus encounters more than 3% errors sending alerts to any Alertmanager. diff --git a/tools/rule_extractor/alerting_rules.json b/tools/rule_extractor/alerting_rules.json index 46417bd2b8..6d4bc5474c 100644 --- a/tools/rule_extractor/alerting_rules.json +++ b/tools/rule_extractor/alerting_rules.json @@ -1,20 +1,50 @@ [ { - "message": "The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.\n{{ range printf \"alertmanager_config_hash{namespace=\\\"%s\\\",service=\\\"%s\\\"}\" $labels.namespace $labels.service | query }}\nConfiguration hash for pod {{ .Labels.pod }} is \"{{ printf \"%.f\" .Value }}\"\n{{ end }}", + "message": "Reloading an Alertmanager configuration has failed.", + "name": "AlertmanagerFailedReload", + "query": "max_over_time(alertmanager_config_last_reload_successful{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) == 0", + "severity": "critical" + }, + { + "message": "A member of an Alertmanager cluster has not found all other cluster members.", + "name": "AlertmanagerMembersInconsistent", + "query": "max_over_time(alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) < on(namespace, service) group_left() count by(namespace, service) (max_over_time(alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]))", + "severity": "critical" + }, + { + "message": "An Alertmanager instance failed to send notifications.", + "name": "AlertmanagerFailedToSendAlerts", + "query": "(rate(alertmanager_notifications_failed_total{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01", + "severity": "warning" + }, + { + "message": "All Alertmanager instances in a cluster failed to send notifications to a critical integration.", + "name": "AlertmanagerClusterFailedToSendAlerts", + "query": "min by(namespace, service, integration) (rate(alertmanager_notifications_failed_total{integration=~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{integration=~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01", + "severity": "critical" + }, + { + "message": "All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.", + "name": "AlertmanagerClusterFailedToSendAlerts", + "query": "min by(namespace, service, integration) (rate(alertmanager_notifications_failed_total{integration!~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{integration!~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01", + "severity": "warning" + }, + { + "message": "Alertmanager instances within the same cluster have different configurations.", "name": "AlertmanagerConfigInconsistent", "query": "count by(namespace, service) (count_values by(namespace, service) (\"config_hash\", alertmanager_config_hash{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) != 1", "severity": "critical" }, { - "message": "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.", - "name": "AlertmanagerFailedReload", - "query": "alertmanager_config_last_reload_successful{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"} == 0", - "severity": "warning" + "message": "Half or more of the Alertmanager instances within the same cluster are down.", + "name": "AlertmanagerClusterDown", + "query": "(count by(namespace, service) (avg_over_time(up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) < 0.5) / count by(namespace, service) (up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) >= 0.5", + "severity": "critical" }, { - "message": "Alertmanager has not found all other members of the cluster.", - "name": "AlertmanagerMembersInconsistent", - "query": "alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"} != on(service) group_left() count by(service) (alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})", + "message": "Half or more of the Alertmanager instances within the same cluster are crashlooping.", + "name": "AlertmanagerClusterCrashlooping", + "query": "(count by(namespace, service) (changes(process_start_time_seconds{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[10m]) > 4) / count by(namespace, service) (up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) >= 0.5", "severity": "critical" }, { @@ -96,13 +126,13 @@ "severity": "warning" }, { - "message": "{{ printf \"%.4g\" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.", + "message": "One or more targets are unreachable.", "name": "TargetDown", "query": "100 * (count by(job, namespace, service) (up == 0) / count by(job, namespace, service) (up)) > 10", "severity": "warning" }, { - "message": "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty.", + "message": "An alert that should always be firing to certify that Alertmanager is working properly.", "name": "Watchdog", "query": "vector(1)", "severity": "none" @@ -143,10 +173,22 @@ "query": "(sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) / sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"}[5m]))) > 0.01", "severity": "critical" }, + { + "message": "kube-state-metrics sharding is misconfigured.", + "name": "KubeStateMetricsShardingMismatch", + "query": "stdvar(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) != 0", + "severity": "critical" + }, + { + "message": "kube-state-metrics shards are missing.", + "name": "KubeStateMetricsShardsMissing", + "query": "2 ^ max(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) - 1 - sum(2 ^ max by(shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"kube-state-metrics\"})) != 0", + "severity": "critical" + }, { "message": "Pod is crash looping.", "name": "KubePodCrashLooping", - "query": "rate(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) * 60 * 5 > 0", + "query": "increase(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) > 0", "severity": "warning" }, { @@ -534,7 +576,7 @@ "severity": "warning" }, { - "message": "Network interface \"{{ $labels.device }}\" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}\"", + "message": "Network interface \"{{ $labels.device }}\" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}", "name": "NodeNetworkInterfaceFlapping", "query": "changes(node_network_up{device!~\"veth.+\",job=\"node-exporter\"}[2m]) > 2", "severity": "warning" @@ -599,12 +641,6 @@ "query": "(rate(prometheus_notifications_errors_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(prometheus_notifications_sent_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) * 100 > 1", "severity": "warning" }, - { - "message": "Prometheus encounters more than 3% errors sending alerts to any Alertmanager.", - "name": "PrometheusErrorSendingAlertsToAnyAlertmanager", - "query": "min without(alertmanager) (rate(prometheus_notifications_errors_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(prometheus_notifications_sent_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) * 100 > 3", - "severity": "critical" - }, { "message": "Prometheus is not connected to any Alertmanagers.", "name": "PrometheusNotConnectedToAlertmanagers", @@ -626,7 +662,7 @@ { "message": "Prometheus is not ingesting samples.", "name": "PrometheusNotIngestingSamples", - "query": "rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) <= 0", + "query": "(rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) <= 0 and (sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}) > 0 or sum without(rule_group) (prometheus_rule_group_rules{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}) > 0))", "severity": "warning" }, { @@ -644,13 +680,13 @@ { "message": "Prometheus fails to send samples to remote storage.", "name": "PrometheusRemoteStorageFailures", - "query": "(rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) + rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]))) * 100 > 1", + "query": "((rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) / ((rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) + (rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])))) * 100 > 1", "severity": "critical" }, { "message": "Prometheus remote write is behind.", "name": "PrometheusRemoteWriteBehind", - "query": "(max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) - on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) > 120", + "query": "(max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) - ignoring(remote_name, url) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) > 120", "severity": "critical" }, { @@ -676,5 +712,17 @@ "name": "PrometheusTargetLimitHit", "query": "increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", "severity": "warning" + }, + { + "message": "Prometheus has dropped targets because some scrape configs have exceeded the labels limit.", + "name": "PrometheusLabelLimitHit", + "query": "increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", + "severity": "warning" + }, + { + "message": "Prometheus encounters more than 3% errors sending alerts to any Alertmanager.", + "name": "PrometheusErrorSendingAlertsToAnyAlertmanager", + "query": "min without(alertmanager) (rate(prometheus_notifications_errors_total{alertmanager!~\"\",job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(prometheus_notifications_sent_total{alertmanager!~\"\",job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) * 100 > 3", + "severity": "critical" } ] \ No newline at end of file diff --git a/tools/rule_extractor/rules.json b/tools/rule_extractor/rules.json index db311f8200..498881bfa4 100644 --- a/tools/rule_extractor/rules.json +++ b/tools/rule_extractor/rules.json @@ -2,24 +2,121 @@ "data": { "groups": [ { - "evaluationTime": 0.000876476, + "evaluationTime": 0.001673568, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-alertmanager.rules.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:54:00.311437754Z", + "lastEvaluation": "2021-06-18T11:25:00.312324905Z", "name": "alertmanager.rules", "rules": [ { "alerts": [], "annotations": { - "message": "The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.\n{{ range printf \"alertmanager_config_hash{namespace=\\\"%s\\\",service=\\\"%s\\\"}\" $labels.namespace $labels.service | query }}\nConfiguration hash for pod {{ .Labels.pod }} is \"{{ printf \"%.f\" .Value }}\"\n{{ end }}" + "description": "Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerfailedreload", + "summary": "Reloading an Alertmanager configuration has failed." + }, + "duration": 600, + "evaluationTime": 0.000285671, + "health": "ok", + "labels": { + "severity": "critical" + }, + "lastEvaluation": "2021-06-18T11:25:00.312332096Z", + "name": "AlertmanagerFailedReload", + "query": "max_over_time(alertmanager_config_last_reload_successful{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) == 0", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "description": "Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagermembersinconsistent", + "summary": "A member of an Alertmanager cluster has not found all other cluster members." + }, + "duration": 600, + "evaluationTime": 0.000219725, + "health": "ok", + "labels": { + "severity": "critical" + }, + "lastEvaluation": "2021-06-18T11:25:00.312619485Z", + "name": "AlertmanagerMembersInconsistent", + "query": "max_over_time(alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) < on(namespace, service) group_left() count by(namespace, service) (max_over_time(alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]))", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "description": "Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerfailedtosendalerts", + "summary": "An Alertmanager instance failed to send notifications." + }, + "duration": 300, + "evaluationTime": 0.000278557, + "health": "ok", + "labels": { + "severity": "warning" + }, + "lastEvaluation": "2021-06-18T11:25:00.312840913Z", + "name": "AlertmanagerFailedToSendAlerts", + "query": "(rate(alertmanager_notifications_failed_total{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "description": "The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerclusterfailedtosendalerts", + "summary": "All Alertmanager instances in a cluster failed to send notifications to a critical integration." }, "duration": 300, - "evaluationTime": 0.000564221, + "evaluationTime": 0.000291097, + "health": "ok", + "labels": { + "severity": "critical" + }, + "lastEvaluation": "2021-06-18T11:25:00.31312021Z", + "name": "AlertmanagerClusterFailedToSendAlerts", + "query": "min by(namespace, service, integration) (rate(alertmanager_notifications_failed_total{integration=~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{integration=~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "description": "The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerclusterfailedtosendalerts", + "summary": "All Alertmanager instances in a cluster failed to send notifications to a non-critical integration." + }, + "duration": 300, + "evaluationTime": 0.000174729, + "health": "ok", + "labels": { + "severity": "warning" + }, + "lastEvaluation": "2021-06-18T11:25:00.313412234Z", + "name": "AlertmanagerClusterFailedToSendAlerts", + "query": "min by(namespace, service, integration) (rate(alertmanager_notifications_failed_total{integration!~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{integration!~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "description": "Alertmanager instances within the {{$labels.job}} cluster have different configurations.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerconfiginconsistent", + "summary": "Alertmanager instances within the same cluster have different configurations." + }, + "duration": 1200, + "evaluationTime": 8.9444e-05, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:00.311444253Z", + "lastEvaluation": "2021-06-18T11:25:00.313587756Z", "name": "AlertmanagerConfigInconsistent", "query": "count by(namespace, service) (count_values by(namespace, service) (\"config_hash\", alertmanager_config_hash{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) != 1", "state": "inactive", @@ -28,44 +125,48 @@ { "alerts": [], "annotations": { - "message": "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}." + "description": "{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerclusterdown", + "summary": "Half or more of the Alertmanager instances within the same cluster are down." }, - "duration": 600, - "evaluationTime": 0.000104842, + "duration": 300, + "evaluationTime": 0.000190184, "health": "ok", "labels": { - "severity": "warning" + "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:00.312010647Z", - "name": "AlertmanagerFailedReload", - "query": "alertmanager_config_last_reload_successful{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"} == 0", + "lastEvaluation": "2021-06-18T11:25:00.31367777Z", + "name": "AlertmanagerClusterDown", + "query": "(count by(namespace, service) (avg_over_time(up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) < 0.5) / count by(namespace, service) (up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) >= 0.5", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "message": "Alertmanager has not found all other members of the cluster." + "description": "{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerclustercrashlooping", + "summary": "Half or more of the Alertmanager instances within the same cluster are crashlooping." }, "duration": 300, - "evaluationTime": 0.000194148, + "evaluationTime": 0.000127591, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:00.312116072Z", - "name": "AlertmanagerMembersInconsistent", - "query": "alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"} != on(service) group_left() count by(service) (alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})", + "lastEvaluation": "2021-06-18T11:25:00.31386853Z", + "name": "AlertmanagerClusterCrashlooping", + "query": "(count by(namespace, service) (changes(process_start_time_seconds{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[10m]) > 4) / count by(namespace, service) (up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) >= 0.5", "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0.001972902, + "evaluationTime": 0.025678742, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-etcd.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:54:15.281840261Z", + "lastEvaluation": "2021-06-18T11:24:45.2818526Z", "name": "etcd", "rules": [ { @@ -74,12 +175,12 @@ "message": "etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value }})." }, "duration": 180, - "evaluationTime": 0.000536485, + "evaluationTime": 0.000482653, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:15.281848377Z", + "lastEvaluation": "2021-06-18T11:24:45.281862175Z", "name": "etcdInsufficientMembers", "query": "sum by(job) (up{job=~\".*etcd.*\"} == bool 1) < ((count by(job) (up{job=~\".*etcd.*\"}) + 1) / 2)", "state": "inactive", @@ -91,12 +192,12 @@ "message": "etcd cluster \"{{ $labels.job }}\": member {{ $labels.instance }} has no leader." }, "duration": 60, - "evaluationTime": 0.000103058, + "evaluationTime": 0.000184015, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:15.282386715Z", + "lastEvaluation": "2021-06-18T11:24:45.282346476Z", "name": "etcdNoLeader", "query": "etcd_server_has_leader{job=~\".*etcd.*\"} == 0", "state": "inactive", @@ -108,12 +209,12 @@ "message": "etcd cluster \"{{ $labels.job }}\": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour." }, "duration": 900, - "evaluationTime": 0.000116403, + "evaluationTime": 0.000197612, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:15.282490679Z", + "lastEvaluation": "2021-06-18T11:24:45.282531934Z", "name": "etcdHighNumberOfLeaderChanges", "query": "rate(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}[15m]) > 3", "state": "inactive", @@ -125,12 +226,12 @@ "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}." }, "duration": 600, - "evaluationTime": 0.000177722, + "evaluationTime": 0.012266921, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:15.282607849Z", + "lastEvaluation": "2021-06-18T11:24:45.282730491Z", "name": "etcdHighNumberOfFailedGRPCRequests", "query": "100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!=\"OK\",job=~\".*etcd.*\"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) > 1", "state": "inactive", @@ -142,12 +243,12 @@ "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}." }, "duration": 300, - "evaluationTime": 0.000139364, + "evaluationTime": 0.011098986, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:15.282786239Z", + "lastEvaluation": "2021-06-18T11:24:45.294998899Z", "name": "etcdHighNumberOfFailedGRPCRequests", "query": "100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!=\"OK\",job=~\".*etcd.*\"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) > 5", "state": "inactive", @@ -159,12 +260,12 @@ "message": "etcd cluster \"{{ $labels.job }}\": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}." }, "duration": 600, - "evaluationTime": 0.000181967, + "evaluationTime": 0.000154908, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:15.282926199Z", + "lastEvaluation": "2021-06-18T11:24:45.306099187Z", "name": "etcdGRPCRequestsSlow", "query": "histogram_quantile(0.99, sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type=\"unary\",job=~\".*etcd.*\"}[5m]))) > 0.15", "state": "inactive", @@ -176,12 +277,12 @@ "message": "etcd cluster \"{{ $labels.job }}\": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}." }, "duration": 600, - "evaluationTime": 9.2268e-05, + "evaluationTime": 0.000285608, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:15.283109007Z", + "lastEvaluation": "2021-06-18T11:24:45.306257235Z", "name": "etcdMemberCommunicationSlow", "query": "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.15", "state": "inactive", @@ -193,12 +294,12 @@ "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}." }, "duration": 900, - "evaluationTime": 6.4979e-05, + "evaluationTime": 0.000107662, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:15.283201861Z", + "lastEvaluation": "2021-06-18T11:24:45.306543658Z", "name": "etcdHighNumberOfFailedProposals", "query": "rate(etcd_server_proposals_failed_total{job=~\".*etcd.*\"}[15m]) > 5", "state": "inactive", @@ -210,12 +311,12 @@ "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}." }, "duration": 600, - "evaluationTime": 7.6261e-05, + "evaluationTime": 0.000257879, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:15.283267387Z", + "lastEvaluation": "2021-06-18T11:24:45.306651946Z", "name": "etcdHighFsyncDurations", "query": "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.5", "state": "inactive", @@ -227,12 +328,12 @@ "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}." }, "duration": 600, - "evaluationTime": 7.5741e-05, + "evaluationTime": 0.000260581, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:15.2833442Z", + "lastEvaluation": "2021-06-18T11:24:45.306910527Z", "name": "etcdHighCommitDurations", "query": "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.25", "state": "inactive", @@ -244,12 +345,12 @@ "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}" }, "duration": 600, - "evaluationTime": 0.000149562, + "evaluationTime": 0.00015661, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:15.283420515Z", + "lastEvaluation": "2021-06-18T11:24:45.307171959Z", "name": "etcdHighNumberOfFailedHTTPRequests", "query": "sum by(method) (rate(etcd_http_failed_total{code!=\"404\",job=~\".*etcd.*\"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m])) > 0.01", "state": "inactive", @@ -261,12 +362,12 @@ "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}." }, "duration": 600, - "evaluationTime": 0.000168126, + "evaluationTime": 0.000142069, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:15.283570718Z", + "lastEvaluation": "2021-06-18T11:24:45.307329327Z", "name": "etcdHighNumberOfFailedHTTPRequests", "query": "sum by(method) (rate(etcd_http_failed_total{code!=\"404\",job=~\".*etcd.*\"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m])) > 0.05", "state": "inactive", @@ -278,12 +379,12 @@ "message": "etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow." }, "duration": 600, - "evaluationTime": 7.0066e-05, + "evaluationTime": 5.7149e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:15.283740182Z", + "lastEvaluation": "2021-06-18T11:24:45.307472096Z", "name": "etcdHTTPRequestsSlow", "query": "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15", "state": "inactive", @@ -292,35 +393,57 @@ ] }, { - "evaluationTime": 0.000848027, + "evaluationTime": 0.002006797, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-general.rules.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:53:55.521741543Z", + "lastEvaluation": "2021-06-18T11:24:55.522706453Z", "name": "general.rules", "rules": [ { - "alerts": [], + "alerts": [ + { + "activeAt": "2021-06-18T11:23:55.520605063Z", + "annotations": { + "description": "100% of the fluent-bit-headless/fluent-bit-headless targets in metalk8s-logging namespace are down.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-targetdown", + "summary": "One or more targets are unreachable." + }, + "labels": { + "alertname": "TargetDown", + "job": "fluent-bit-headless", + "namespace": "metalk8s-logging", + "service": "fluent-bit-headless", + "severity": "warning" + }, + "state": "pending", + "value": "1e+02" + } + ], "annotations": { - "message": "{{ printf \"%.4g\" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down." + "description": "{{ printf \"%.4g\" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-targetdown", + "summary": "One or more targets are unreachable." }, "duration": 600, - "evaluationTime": 0.000583798, + "evaluationTime": 0.001751202, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:55.521748116Z", + "lastEvaluation": "2021-06-18T11:24:55.522736446Z", "name": "TargetDown", "query": "100 * (count by(job, namespace, service) (up == 0) / count by(job, namespace, service) (up)) > 10", - "state": "inactive", + "state": "pending", "type": "alerting" }, { "alerts": [ { - "activeAt": "2021-05-07T11:53:55.520605063Z", + "activeAt": "2021-06-18T11:23:55.520605063Z", "annotations": { - "message": "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty." + "description": "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-watchdog", + "summary": "An alert that should always be firing to certify that Alertmanager is working properly." }, "labels": { "alertname": "Watchdog", @@ -331,15 +454,17 @@ } ], "annotations": { - "message": "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty." + "description": "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-watchdog", + "summary": "An alert that should always be firing to certify that Alertmanager is working properly." }, "duration": 0, - "evaluationTime": 0.000249851, + "evaluationTime": 0.000218943, "health": "ok", "labels": { "severity": "none" }, - "lastEvaluation": "2021-05-07T11:53:55.522334015Z", + "lastEvaluation": "2021-06-18T11:24:55.524490821Z", "name": "Watchdog", "query": "vector(1)", "state": "firing", @@ -348,97 +473,97 @@ ] }, { - "evaluationTime": 0.002106024, + "evaluationTime": 0.014231397, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-k8s.rules.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:54:13.317215717Z", + "lastEvaluation": "2021-06-18T11:24:43.316875265Z", "name": "k8s.rules", "rules": [ { - "evaluationTime": 0.000447507, + "evaluationTime": 0.002241871, "health": "ok", - "lastEvaluation": "2021-05-07T11:54:13.317229535Z", - "name": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate", - "query": "sum by(cluster, namespace, pod, container) (rate(container_cpu_usage_seconds_total{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"}[5m])) * on(cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\"}))", + "lastEvaluation": "2021-06-18T11:24:43.316881495Z", + "name": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate", + "query": "sum by(cluster, namespace, pod, container) (irate(container_cpu_usage_seconds_total{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"}[5m])) * on(cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\"}))", "type": "recording" }, { - "evaluationTime": 0.000111462, + "evaluationTime": 0.002409345, "health": "ok", - "lastEvaluation": "2021-05-07T11:54:13.317678925Z", + "lastEvaluation": "2021-06-18T11:24:43.319127593Z", "name": "node_namespace_pod_container:container_memory_working_set_bytes", "query": "container_memory_working_set_bytes{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))", "type": "recording" }, { - "evaluationTime": 9.2276e-05, + "evaluationTime": 0.002291498, "health": "ok", - "lastEvaluation": "2021-05-07T11:54:13.317791083Z", + "lastEvaluation": "2021-06-18T11:24:43.321539874Z", "name": "node_namespace_pod_container:container_memory_rss", "query": "container_memory_rss{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))", "type": "recording" }, { - "evaluationTime": 0.000323204, + "evaluationTime": 0.002365833, "health": "ok", - "lastEvaluation": "2021-05-07T11:54:13.317884193Z", + "lastEvaluation": "2021-06-18T11:24:43.323834589Z", "name": "node_namespace_pod_container:container_memory_cache", "query": "container_memory_cache{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))", "type": "recording" }, { - "evaluationTime": 0.000120924, + "evaluationTime": 0.00238141, "health": "ok", - "lastEvaluation": "2021-05-07T11:54:13.318208802Z", + "lastEvaluation": "2021-06-18T11:24:43.326204136Z", "name": "node_namespace_pod_container:container_memory_swap", "query": "container_memory_swap{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))", "type": "recording" }, { - "evaluationTime": 0.000418846, + "evaluationTime": 0.000721923, "health": "ok", - "lastEvaluation": "2021-05-07T11:54:13.318330599Z", + "lastEvaluation": "2021-06-18T11:24:43.328588875Z", "name": "namespace_memory:kube_pod_container_resource_requests:sum", "query": "sum by(namespace, cluster) (sum by(namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"memory\"}) * on(namespace, pod, cluster) group_left() max by(namespace, pod) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))", "type": "recording" }, { - "evaluationTime": 0.000197581, + "evaluationTime": 0.000760596, "health": "ok", - "lastEvaluation": "2021-05-07T11:54:13.318751237Z", + "lastEvaluation": "2021-06-18T11:24:43.329312055Z", "name": "namespace_cpu:kube_pod_container_resource_requests:sum", "query": "sum by(namespace, cluster) (sum by(namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\"}) * on(namespace, pod, cluster) group_left() max by(namespace, pod) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))", "type": "recording" }, { - "evaluationTime": 0.000210718, + "evaluationTime": 0.00061948, "health": "ok", "labels": { "workload_type": "deployment" }, - "lastEvaluation": "2021-05-07T11:54:13.318949502Z", + "lastEvaluation": "2021-06-18T11:24:43.330074043Z", "name": "namespace_workload_pod:kube_pod_owner:relabel", "query": "max by(cluster, namespace, workload, pod) (label_replace(label_replace(kube_pod_owner{job=\"kube-state-metrics\",owner_kind=\"ReplicaSet\"}, \"replicaset\", \"$1\", \"owner_name\", \"(.*)\") * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (1, max by(replicaset, namespace, owner_name) (kube_replicaset_owner{job=\"kube-state-metrics\"})), \"workload\", \"$1\", \"owner_name\", \"(.*)\"))", "type": "recording" }, { - "evaluationTime": 7.3376e-05, + "evaluationTime": 0.00027986, "health": "ok", "labels": { "workload_type": "daemonset" }, - "lastEvaluation": "2021-05-07T11:54:13.319160895Z", + "lastEvaluation": "2021-06-18T11:24:43.330695452Z", "name": "namespace_workload_pod:kube_pod_owner:relabel", "query": "max by(cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\",owner_kind=\"DaemonSet\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))", "type": "recording" }, { - "evaluationTime": 8.3187e-05, + "evaluationTime": 0.000127807, "health": "ok", "labels": { "workload_type": "statefulset" }, - "lastEvaluation": "2021-05-07T11:54:13.319235159Z", + "lastEvaluation": "2021-06-18T11:24:43.330976339Z", "name": "namespace_workload_pod:kube_pod_owner:relabel", "query": "max by(cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\",owner_kind=\"StatefulSet\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))", "type": "recording" @@ -446,274 +571,274 @@ ] }, { - "evaluationTime": 0, + "evaluationTime": 0.018239183, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kube-apiserver-availability.rules.yaml", "interval": 180, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:48.620584875Z", "name": "kube-apiserver-availability.rules", "rules": [ { - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.006842853, + "health": "ok", "labels": { "verb": "all" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:48.620589101Z", "name": "apiserver_request:availability30d", - "query": "1 - ((sum(increase(apiserver_request_duration_seconds_count{verb=~\"POST|PUT|PATCH|DELETE\"}[30d])) - sum(increase(apiserver_request_duration_seconds_bucket{le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[30d]))) + (sum(increase(apiserver_request_duration_seconds_count{verb=~\"LIST|GET\"}[30d])) - ((sum(increase(apiserver_request_duration_seconds_bucket{le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[30d])) or vector(0)) + sum(increase(apiserver_request_duration_seconds_bucket{le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[30d])) + sum(increase(apiserver_request_duration_seconds_bucket{le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[30d])))) + sum(code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))) / sum(code:apiserver_request_total:increase30d)", + "query": "1 - ((sum by(cluster) (increase(apiserver_request_duration_seconds_count{verb=~\"POST|PUT|PATCH|DELETE\"}[30d])) - sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[30d]))) + (sum by(cluster) (increase(apiserver_request_duration_seconds_count{verb=~\"LIST|GET\"}[30d])) - ((sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[30d])) or vector(0)) + sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[30d])) + sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[30d])))) + sum by(cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))) / sum by(cluster) (code:apiserver_request_total:increase30d)", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.004175904, + "health": "ok", "labels": { "verb": "read" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:48.627434046Z", "name": "apiserver_request:availability30d", - "query": "1 - (sum(increase(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30d])) - ((sum(increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[30d])) or vector(0)) + sum(increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[30d])) + sum(increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[30d]))) + sum(code:apiserver_request_total:increase30d{code=~\"5..\",verb=\"read\"} or vector(0))) / sum(code:apiserver_request_total:increase30d{verb=\"read\"})", + "query": "1 - (sum by(cluster) (increase(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30d])) - ((sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[30d])) or vector(0)) + sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[30d])) + sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[30d]))) + sum by(cluster) (code:apiserver_request_total:increase30d{code=~\"5..\",verb=\"read\"} or vector(0))) / sum by(cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.00154929, + "health": "ok", "labels": { "verb": "write" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:48.631611281Z", "name": "apiserver_request:availability30d", - "query": "1 - ((sum(increase(apiserver_request_duration_seconds_count{verb=~\"POST|PUT|PATCH|DELETE\"}[30d])) - sum(increase(apiserver_request_duration_seconds_bucket{le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[30d]))) + sum(code:apiserver_request_total:increase30d{code=~\"5..\",verb=\"write\"} or vector(0))) / sum(code:apiserver_request_total:increase30d{verb=\"write\"})", + "query": "1 - ((sum by(cluster) (increase(apiserver_request_duration_seconds_count{verb=~\"POST|PUT|PATCH|DELETE\"}[30d])) - sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[30d]))) + sum by(cluster) (code:apiserver_request_total:increase30d{code=~\"5..\",verb=\"write\"} or vector(0))) / sum by(cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 9.2052e-05, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.633162254Z", "name": "code_verb:apiserver_request_total:increase30d", "query": "avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.001236631, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.63325501Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000482869, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.6344942Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"GET\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"GET\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000405999, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.634978298Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"POST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"POST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000356356, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.635385369Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000151023, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.635742707Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"PATCH\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"PATCH\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000118601, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.635894722Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 8.2379e-05, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.636014288Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 5.2686e-05, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.63609727Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"GET\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"GET\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 5.666e-05, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.636150513Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"POST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"POST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 5.6811e-05, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.636207672Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 6.7332e-05, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.636265059Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"PATCH\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"PATCH\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 5.306e-05, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.63633291Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000314749, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.636386459Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.00041558, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.636702536Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"GET\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"GET\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000173257, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.637119706Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"POST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"POST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000191637, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.637294005Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000135997, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.637486586Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"PATCH\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"PATCH\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 9.6701e-05, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.637623434Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000250027, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.637720827Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000194963, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.637972113Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"GET\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"GET\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000171608, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.638168221Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"POST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"POST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000163967, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.638340885Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000138541, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.638506147Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"PATCH\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"PATCH\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 6.5054e-05, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.638645621Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 5.7712e-05, + "health": "ok", "labels": { "verb": "read" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:48.638711268Z", "name": "code:apiserver_request_total:increase30d", - "query": "sum by(code) (code_verb:apiserver_request_total:increase30d{verb=~\"LIST|GET\"})", + "query": "sum by(cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~\"LIST|GET\"})", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 5.2741e-05, + "health": "ok", "labels": { "verb": "write" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:48.638769494Z", "name": "code:apiserver_request_total:increase30d", - "query": "sum by(code) (code_verb:apiserver_request_total:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})", + "query": "sum by(cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})", "type": "recording" } ] }, { - "evaluationTime": 0.000995342, + "evaluationTime": 0.001112405, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kube-apiserver-slos.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:54:02.746732767Z", + "lastEvaluation": "2021-06-18T11:25:02.746337278Z", "name": "kube-apiserver-slos", "rules": [ { @@ -724,14 +849,14 @@ "summary": "The API server is burning too much error budget." }, "duration": 120, - "evaluationTime": 0.000505161, + "evaluationTime": 0.000378711, "health": "ok", "labels": { "long": "1h", "severity": "critical", "short": "5m" }, - "lastEvaluation": "2021-05-07T11:54:02.746739383Z", + "lastEvaluation": "2021-06-18T11:25:02.746344741Z", "name": "KubeAPIErrorBudgetBurn", "query": "sum(apiserver_request:burnrate1h) > (14.4 * 0.01) and sum(apiserver_request:burnrate5m) > (14.4 * 0.01)", "state": "inactive", @@ -745,14 +870,14 @@ "summary": "The API server is burning too much error budget." }, "duration": 900, - "evaluationTime": 0.000154635, + "evaluationTime": 0.000124355, "health": "ok", "labels": { "long": "6h", "severity": "critical", "short": "30m" }, - "lastEvaluation": "2021-05-07T11:54:02.74724678Z", + "lastEvaluation": "2021-06-18T11:25:02.746725217Z", "name": "KubeAPIErrorBudgetBurn", "query": "sum(apiserver_request:burnrate6h) > (6 * 0.01) and sum(apiserver_request:burnrate30m) > (6 * 0.01)", "state": "inactive", @@ -766,14 +891,14 @@ "summary": "The API server is burning too much error budget." }, "duration": 3600, - "evaluationTime": 0.000183328, + "evaluationTime": 0.000345195, "health": "ok", "labels": { "long": "1d", "severity": "warning", "short": "2h" }, - "lastEvaluation": "2021-05-07T11:54:02.74740203Z", + "lastEvaluation": "2021-06-18T11:25:02.746850471Z", "name": "KubeAPIErrorBudgetBurn", "query": "sum(apiserver_request:burnrate1d) > (3 * 0.01) and sum(apiserver_request:burnrate2h) > (3 * 0.01)", "state": "inactive", @@ -787,14 +912,14 @@ "summary": "The API server is burning too much error budget." }, "duration": 10800, - "evaluationTime": 0.000138795, + "evaluationTime": 0.000246896, "health": "ok", "labels": { "long": "3d", "severity": "warning", "short": "6h" }, - "lastEvaluation": "2021-05-07T11:54:02.747586416Z", + "lastEvaluation": "2021-06-18T11:25:02.747198646Z", "name": "KubeAPIErrorBudgetBurn", "query": "sum(apiserver_request:burnrate3d) > (1 * 0.01) and sum(apiserver_request:burnrate6h) > (1 * 0.01)", "state": "inactive", @@ -803,241 +928,241 @@ ] }, { - "evaluationTime": 0.013675876, + "evaluationTime": 0.521168867, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kube-apiserver.rules.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:54:01.976869389Z", + "lastEvaluation": "2021-06-18T11:25:01.977484002Z", "name": "kube-apiserver.rules", "rules": [ { - "evaluationTime": 0.001144918, + "evaluationTime": 0.007416587, "health": "ok", "labels": { "verb": "read" }, - "lastEvaluation": "2021-05-07T11:54:01.976876501Z", + "lastEvaluation": "2021-06-18T11:25:01.977495589Z", "name": "apiserver_request:burnrate1d", - "query": "((sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1d])) - ((sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[1d])) or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[1d])) + sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[1d])))) + sum(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))) / sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1d])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[1d])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[1d])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[1d])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))", "type": "recording" }, { - "evaluationTime": 0.000813245, + "evaluationTime": 0.006599565, "health": "ok", "labels": { "verb": "read" }, - "lastEvaluation": "2021-05-07T11:54:01.978025644Z", + "lastEvaluation": "2021-06-18T11:25:01.984915357Z", "name": "apiserver_request:burnrate1h", - "query": "((sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1h])) - ((sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[1h])) or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[1h])) + sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[1h])))) + sum(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))) / sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1h])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[1h])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[1h])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[1h])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0.00055396, + "evaluationTime": 0.005224842, "health": "ok", "labels": { "verb": "read" }, - "lastEvaluation": "2021-05-07T11:54:01.978841277Z", + "lastEvaluation": "2021-06-18T11:25:01.99151752Z", "name": "apiserver_request:burnrate2h", - "query": "((sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[2h])) - ((sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[2h])) or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[2h])) + sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[2h])))) + sum(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))) / sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[2h])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[2h])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[2h])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[2h])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))", "type": "recording" }, { - "evaluationTime": 0.000803037, + "evaluationTime": 0.005207308, "health": "ok", "labels": { "verb": "read" }, - "lastEvaluation": "2021-05-07T11:54:01.979396255Z", + "lastEvaluation": "2021-06-18T11:25:01.996745481Z", "name": "apiserver_request:burnrate30m", - "query": "((sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30m])) - ((sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[30m])) or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[30m])) + sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[30m])))) + sum(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))) / sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30m])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[30m])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[30m])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[30m])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))", "type": "recording" }, { - "evaluationTime": 0.000484351, + "evaluationTime": 0.005351042, "health": "ok", "labels": { "verb": "read" }, - "lastEvaluation": "2021-05-07T11:54:01.980200375Z", + "lastEvaluation": "2021-06-18T11:25:02.001954971Z", "name": "apiserver_request:burnrate3d", - "query": "((sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[3d])) - ((sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[3d])) or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[3d])) + sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[3d])))) + sum(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))) / sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[3d])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[3d])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[3d])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[3d])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))", "type": "recording" }, { - "evaluationTime": 0.006218982, + "evaluationTime": 0.005641993, "health": "ok", "labels": { "verb": "read" }, - "lastEvaluation": "2021-05-07T11:54:01.980685701Z", + "lastEvaluation": "2021-06-18T11:25:02.007308042Z", "name": "apiserver_request:burnrate5m", - "query": "((sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[5m])) - ((sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[5m])) or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[5m])) + sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[5m])))) + sum(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))) / sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[5m])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[5m])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[5m])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[5m])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))", "type": "recording" }, { - "evaluationTime": 0.000753218, + "evaluationTime": 0.005416728, "health": "ok", "labels": { "verb": "read" }, - "lastEvaluation": "2021-05-07T11:54:01.986908259Z", + "lastEvaluation": "2021-06-18T11:25:02.012952379Z", "name": "apiserver_request:burnrate6h", - "query": "((sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[6h])) - ((sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[6h])) or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[6h])) + sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[6h])))) + sum(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))) / sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[6h])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[6h])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[6h])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[6h])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))", "type": "recording" }, { - "evaluationTime": 0.00044547, + "evaluationTime": 0.002826878, "health": "ok", "labels": { "verb": "write" }, - "lastEvaluation": "2021-05-07T11:54:01.987663968Z", + "lastEvaluation": "2021-06-18T11:25:02.018371198Z", "name": "apiserver_request:burnrate1d", - "query": "((sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d])) - sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))) + sum(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))) / sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))", "type": "recording" }, { - "evaluationTime": 0.000322431, + "evaluationTime": 0.002228424, "health": "ok", "labels": { "verb": "write" }, - "lastEvaluation": "2021-05-07T11:54:01.988111393Z", + "lastEvaluation": "2021-06-18T11:25:02.021200075Z", "name": "apiserver_request:burnrate1h", - "query": "((sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h])) - sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))) + sum(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))) / sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0.000255712, + "evaluationTime": 0.002263326, "health": "ok", "labels": { "verb": "write" }, - "lastEvaluation": "2021-05-07T11:54:01.988435564Z", + "lastEvaluation": "2021-06-18T11:25:02.02343019Z", "name": "apiserver_request:burnrate2h", - "query": "((sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h])) - sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))) + sum(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))) / sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))", "type": "recording" }, { - "evaluationTime": 0.000248511, + "evaluationTime": 0.002124364, "health": "ok", "labels": { "verb": "write" }, - "lastEvaluation": "2021-05-07T11:54:01.988691854Z", + "lastEvaluation": "2021-06-18T11:25:02.025695667Z", "name": "apiserver_request:burnrate30m", - "query": "((sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m])) - sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))) + sum(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))) / sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))", "type": "recording" }, { - "evaluationTime": 0.00028714, + "evaluationTime": 0.002445016, "health": "ok", "labels": { "verb": "write" }, - "lastEvaluation": "2021-05-07T11:54:01.98894108Z", + "lastEvaluation": "2021-06-18T11:25:02.027822107Z", "name": "apiserver_request:burnrate3d", - "query": "((sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d])) - sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))) + sum(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))) / sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))", "type": "recording" }, { - "evaluationTime": 0.000236354, + "evaluationTime": 0.002518516, "health": "ok", "labels": { "verb": "write" }, - "lastEvaluation": "2021-05-07T11:54:01.989228993Z", + "lastEvaluation": "2021-06-18T11:25:02.030269109Z", "name": "apiserver_request:burnrate5m", - "query": "((sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m])) - sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) + sum(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) / sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))", "type": "recording" }, { - "evaluationTime": 0.000244994, + "evaluationTime": 0.002352141, "health": "ok", "labels": { "verb": "write" }, - "lastEvaluation": "2021-05-07T11:54:01.98946598Z", + "lastEvaluation": "2021-06-18T11:25:02.032789778Z", "name": "apiserver_request:burnrate6h", - "query": "((sum(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h])) - sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))) + sum(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))) / sum(rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))", "type": "recording" }, { - "evaluationTime": 6.5445e-05, + "evaluationTime": 0.002905741, "health": "ok", "labels": { "verb": "read" }, - "lastEvaluation": "2021-05-07T11:54:01.989711525Z", + "lastEvaluation": "2021-06-18T11:25:02.035143613Z", "name": "code_resource:apiserver_request_total:rate5m", - "query": "sum by(code, resource) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))", + "query": "sum by(cluster, code, resource) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))", "type": "recording" }, { - "evaluationTime": 6.9099e-05, + "evaluationTime": 0.001366403, "health": "ok", "labels": { "verb": "write" }, - "lastEvaluation": "2021-05-07T11:54:01.989777512Z", + "lastEvaluation": "2021-06-18T11:25:02.038052379Z", "name": "code_resource:apiserver_request_total:rate5m", - "query": "sum by(code, resource) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))", + "query": "sum by(cluster, code, resource) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))", "type": "recording" }, { - "evaluationTime": 0.000107092, + "evaluationTime": 0.176679726, "health": "ok", "labels": { "quantile": "0.99", "verb": "read" }, - "lastEvaluation": "2021-05-07T11:54:01.989847218Z", + "lastEvaluation": "2021-06-18T11:25:02.039421177Z", "name": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.99, sum by(le, resource) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))) > 0", + "query": "histogram_quantile(0.99, sum by(cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))) > 0", "type": "recording" }, { - "evaluationTime": 0.000118494, + "evaluationTime": 0.085210535, "health": "ok", "labels": { "quantile": "0.99", "verb": "write" }, - "lastEvaluation": "2021-05-07T11:54:01.989954975Z", + "lastEvaluation": "2021-06-18T11:25:02.216104086Z", "name": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.99, sum by(le, resource) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) > 0", + "query": "histogram_quantile(0.99, sum by(cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) > 0", "type": "recording" }, { - "evaluationTime": 0.000127885, + "evaluationTime": 0.072845781, "health": "ok", "labels": { "quantile": "0.99" }, - "lastEvaluation": "2021-05-07T11:54:01.990074534Z", + "lastEvaluation": "2021-06-18T11:25:02.301317879Z", "name": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.99, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])))", "type": "recording" }, { - "evaluationTime": 9.9516e-05, + "evaluationTime": 0.062390997, "health": "ok", "labels": { "quantile": "0.9" }, - "lastEvaluation": "2021-05-07T11:54:01.99020298Z", + "lastEvaluation": "2021-06-18T11:25:02.374170819Z", "name": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.9, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])))", "type": "recording" }, { - "evaluationTime": 0.000239072, + "evaluationTime": 0.062074272, "health": "ok", "labels": { "quantile": "0.5" }, - "lastEvaluation": "2021-05-07T11:54:01.990303074Z", + "lastEvaluation": "2021-06-18T11:25:02.43656955Z", "name": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.5, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])))", "type": "recording" @@ -1045,24 +1170,24 @@ ] }, { - "evaluationTime": 0.000378776, + "evaluationTime": 0.000813556, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kube-prometheus-general.rules.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:53:57.649635585Z", + "lastEvaluation": "2021-06-18T11:24:57.650113232Z", "name": "kube-prometheus-general.rules", "rules": [ { - "evaluationTime": 0.000311649, + "evaluationTime": 0.000574371, "health": "ok", - "lastEvaluation": "2021-05-07T11:53:57.649642394Z", + "lastEvaluation": "2021-06-18T11:24:57.650119725Z", "name": "count:up1", "query": "count without(instance, pod, node) (up == 1)", "type": "recording" }, { - "evaluationTime": 5.5395e-05, + "evaluationTime": 0.000227173, "health": "ok", - "lastEvaluation": "2021-05-07T11:53:57.64995591Z", + "lastEvaluation": "2021-06-18T11:24:57.650696442Z", "name": "count:up0", "query": "count without(instance, pod, node) (up == 0)", "type": "recording" @@ -1070,56 +1195,56 @@ ] }, { - "evaluationTime": 0.001361737, + "evaluationTime": 0.001937398, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kube-prometheus-node-recording.rules.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:54:13.690356682Z", + "lastEvaluation": "2021-06-18T11:24:43.691014152Z", "name": "kube-prometheus-node-recording.rules", "rules": [ { - "evaluationTime": 0.000343424, + "evaluationTime": 0.000518612, "health": "ok", - "lastEvaluation": "2021-05-07T11:54:13.690362971Z", + "lastEvaluation": "2021-06-18T11:24:43.691021293Z", "name": "instance:node_cpu:rate:sum", "query": "sum by(instance) (rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[3m]))", "type": "recording" }, { - "evaluationTime": 9.9183e-05, + "evaluationTime": 0.000283108, "health": "ok", - "lastEvaluation": "2021-05-07T11:54:13.690708388Z", + "lastEvaluation": "2021-06-18T11:24:43.69154152Z", "name": "instance:node_network_receive_bytes:rate:sum", "query": "sum by(instance) (rate(node_network_receive_bytes_total[3m]))", "type": "recording" }, { - "evaluationTime": 5.8018e-05, + "evaluationTime": 0.000208558, "health": "ok", - "lastEvaluation": "2021-05-07T11:54:13.690808274Z", + "lastEvaluation": "2021-06-18T11:24:43.691826031Z", "name": "instance:node_network_transmit_bytes:rate:sum", "query": "sum by(instance) (rate(node_network_transmit_bytes_total[3m]))", "type": "recording" }, { - "evaluationTime": 0.000138124, + "evaluationTime": 0.000460296, "health": "ok", - "lastEvaluation": "2021-05-07T11:54:13.690866888Z", + "lastEvaluation": "2021-06-18T11:24:43.692035613Z", "name": "instance:node_cpu:ratio", "query": "sum without(cpu, mode) (rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m])) / on(instance) group_left() count by(instance) (sum by(instance, cpu) (node_cpu_seconds_total))", "type": "recording" }, { - "evaluationTime": 7.8834e-05, + "evaluationTime": 0.000220965, "health": "ok", - "lastEvaluation": "2021-05-07T11:54:13.691005933Z", + "lastEvaluation": "2021-06-18T11:24:43.692497481Z", "name": "cluster:node_cpu:sum_rate5m", "query": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m]))", "type": "recording" }, { - "evaluationTime": 0.000626706, + "evaluationTime": 0.000228472, "health": "ok", - "lastEvaluation": "2021-05-07T11:54:13.69108539Z", + "lastEvaluation": "2021-06-18T11:24:43.692719695Z", "name": "cluster:node_cpu:ratio", "query": "cluster:node_cpu_seconds_total:rate5m / count(sum by(instance, cpu) (node_cpu_seconds_total))", "type": "recording" @@ -1127,107 +1252,107 @@ ] }, { - "evaluationTime": 0.001545733, + "evaluationTime": 0.002752249, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kube-scheduler.rules.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:54:00.190569462Z", + "lastEvaluation": "2021-06-18T11:25:00.191359472Z", "name": "kube-scheduler.rules", "rules": [ { - "evaluationTime": 0.000508416, + "evaluationTime": 0.000622717, "health": "ok", "labels": { "quantile": "0.99" }, - "lastEvaluation": "2021-05-07T11:54:00.190576976Z", + "lastEvaluation": "2021-06-18T11:25:00.191369201Z", "name": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 0.000150656, + "evaluationTime": 0.000391144, "health": "ok", "labels": { "quantile": "0.99" }, - "lastEvaluation": "2021-05-07T11:54:00.191087529Z", + "lastEvaluation": "2021-06-18T11:25:00.191994478Z", "name": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 0.000179002, + "evaluationTime": 0.000330333, "health": "ok", "labels": { "quantile": "0.99" }, - "lastEvaluation": "2021-05-07T11:54:00.191239788Z", + "lastEvaluation": "2021-06-18T11:25:00.192386773Z", "name": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 0.000142047, + "evaluationTime": 0.000408995, "health": "ok", "labels": { "quantile": "0.9" }, - "lastEvaluation": "2021-05-07T11:54:00.191420721Z", + "lastEvaluation": "2021-06-18T11:25:00.192718309Z", "name": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 0.000113893, + "evaluationTime": 0.000262053, "health": "ok", "labels": { "quantile": "0.9" }, - "lastEvaluation": "2021-05-07T11:54:00.191563795Z", + "lastEvaluation": "2021-06-18T11:25:00.193128918Z", "name": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 0.000109705, + "evaluationTime": 8.5896e-05, "health": "ok", "labels": { "quantile": "0.9" }, - "lastEvaluation": "2021-05-07T11:54:00.191678724Z", + "lastEvaluation": "2021-06-18T11:25:00.193392054Z", "name": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 0.000109098, + "evaluationTime": 0.000315065, "health": "ok", "labels": { "quantile": "0.5" }, - "lastEvaluation": "2021-05-07T11:54:00.191789534Z", + "lastEvaluation": "2021-06-18T11:25:00.193478564Z", "name": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 9.3432e-05, + "evaluationTime": 0.000244859, "health": "ok", "labels": { "quantile": "0.5" }, - "lastEvaluation": "2021-05-07T11:54:00.191899842Z", + "lastEvaluation": "2021-06-18T11:25:00.193794776Z", "name": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 0.000117234, + "evaluationTime": 6.8814e-05, "health": "ok", "labels": { "quantile": "0.5" }, - "lastEvaluation": "2021-05-07T11:54:00.191994418Z", + "lastEvaluation": "2021-06-18T11:25:00.19404041Z", "name": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" @@ -1235,10 +1360,10 @@ ] }, { - "evaluationTime": 0, + "evaluationTime": 0.00090805, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kube-state-metrics.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:46.306384915Z", "name": "kube-state-metrics", "rules": [ { @@ -1249,12 +1374,12 @@ "summary": "kube-state-metrics is experiencing errors in list operations." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.00036131, + "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:46.306391891Z", "name": "KubeStateMetricsListErrors", "query": "(sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) / sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"}[5m]))) > 0.01", "state": "inactive", @@ -1268,55 +1393,93 @@ "summary": "kube-state-metrics is experiencing errors in watch operations." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000166227, + "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:46.306754786Z", "name": "KubeStateMetricsWatchErrors", "query": "(sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) / sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"}[5m]))) > 0.01", "state": "inactive", "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "description": "kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsshardingmismatch", + "summary": "kube-state-metrics sharding is misconfigured." + }, + "duration": 900, + "evaluationTime": 0.000124954, + "health": "ok", + "labels": { + "severity": "critical" + }, + "lastEvaluation": "2021-06-18T11:24:46.306921752Z", + "name": "KubeStateMetricsShardingMismatch", + "query": "stdvar(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) != 0", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "description": "kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsshardsmissing", + "summary": "kube-state-metrics shards are missing." + }, + "duration": 900, + "evaluationTime": 0.000241615, + "health": "ok", + "labels": { + "severity": "critical" + }, + "lastEvaluation": "2021-06-18T11:24:46.307048387Z", + "name": "KubeStateMetricsShardsMissing", + "query": "2 ^ max(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) - 1 - sum(2 ^ max by(shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"kube-state-metrics\"})) != 0", + "state": "inactive", + "type": "alerting" } ] }, { - "evaluationTime": 0.000667492, + "evaluationTime": 0.001706121, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubelet.rules.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:53:56.101954582Z", + "lastEvaluation": "2021-06-18T11:24:56.102127681Z", "name": "kubelet.rules", "rules": [ { - "evaluationTime": 0.000476581, + "evaluationTime": 0.00079737, "health": "ok", "labels": { "quantile": "0.99" }, - "lastEvaluation": "2021-05-07T11:53:56.10196087Z", + "lastEvaluation": "2021-06-18T11:24:56.102137485Z", "name": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.99, sum by(instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\",metrics_path=\"/metrics\"})", "type": "recording" }, { - "evaluationTime": 0.000106191, + "evaluationTime": 0.000432262, "health": "ok", "labels": { "quantile": "0.9" }, - "lastEvaluation": "2021-05-07T11:53:56.102439652Z", + "lastEvaluation": "2021-06-18T11:24:56.102937447Z", "name": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.9, sum by(instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\",metrics_path=\"/metrics\"})", "type": "recording" }, { - "evaluationTime": 7.3055e-05, + "evaluationTime": 0.000458707, "health": "ok", "labels": { "quantile": "0.5" }, - "lastEvaluation": "2021-05-07T11:53:56.102546416Z", + "lastEvaluation": "2021-06-18T11:24:56.103370738Z", "name": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.5, sum by(instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\",metrics_path=\"/metrics\"})", "type": "recording" @@ -1324,10 +1487,10 @@ ] }, { - "evaluationTime": 0.004738015, + "evaluationTime": 0.005286722, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-apps.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:54:10.003687644Z", + "lastEvaluation": "2021-06-18T11:24:40.005568382Z", "name": "kubernetes-apps", "rules": [ { @@ -1338,14 +1501,14 @@ "summary": "Pod is crash looping." }, "duration": 900, - "evaluationTime": 0.000544943, + "evaluationTime": 0.000703572, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.003694374Z", + "lastEvaluation": "2021-06-18T11:24:40.005588318Z", "name": "KubePodCrashLooping", - "query": "rate(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) * 60 * 5 > 0", + "query": "increase(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) > 0", "state": "inactive", "type": "alerting" }, @@ -1357,12 +1520,12 @@ "summary": "Pod has been in a non-ready state for more than 15 minutes." }, "duration": 900, - "evaluationTime": 0.000459652, + "evaluationTime": 0.001196149, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.004242086Z", + "lastEvaluation": "2021-06-18T11:24:40.006293358Z", "name": "KubePodNotReady", "query": "sum by(namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\",namespace=~\".*\",phase=~\"Pending|Unknown\"}) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"Job\"}))) > 0", "state": "inactive", @@ -1376,12 +1539,12 @@ "summary": "Deployment generation mismatch due to possible roll-back" }, "duration": 900, - "evaluationTime": 0.000170955, + "evaluationTime": 0.000321936, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.004704179Z", + "lastEvaluation": "2021-06-18T11:24:40.007492015Z", "name": "KubeDeploymentGenerationMismatch", "query": "kube_deployment_status_observed_generation{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_deployment_metadata_generation{job=\"kube-state-metrics\",namespace=~\".*\"}", "state": "inactive", @@ -1395,34 +1558,57 @@ "summary": "Deployment has not matched the expected number of replicas." }, "duration": 900, - "evaluationTime": 0.000288162, + "evaluationTime": 0.000381873, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.004875947Z", + "lastEvaluation": "2021-06-18T11:24:40.007814993Z", "name": "KubeDeploymentReplicasMismatch", "query": "(kube_deployment_spec_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_deployment_status_replicas_available{job=\"kube-state-metrics\",namespace=~\".*\"}) and (changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) == 0)", "state": "inactive", "type": "alerting" }, { - "alerts": [], + "alerts": [ + { + "activeAt": "2021-06-18T11:24:10.003560608Z", + "annotations": { + "description": "StatefulSet metalk8s-logging/loki has not matched the expected number of replicas for longer than 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch", + "summary": "Deployment has not matched the expected number of replicas." + }, + "labels": { + "alertname": "KubeStatefulSetReplicasMismatch", + "container": "kube-state-metrics", + "endpoint": "http", + "instance": "10.233.132.67:8080", + "job": "kube-state-metrics", + "namespace": "metalk8s-logging", + "pod": "prometheus-operator-kube-state-metrics-56466d4b9c-b4kgd", + "service": "prometheus-operator-kube-state-metrics", + "severity": "warning", + "statefulset": "loki" + }, + "state": "pending", + "value": "0e+00" + } + ], "annotations": { "description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch", "summary": "Deployment has not matched the expected number of replicas." }, "duration": 900, - "evaluationTime": 0.000233724, + "evaluationTime": 0.000547482, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.005165324Z", + "lastEvaluation": "2021-06-18T11:24:40.0081978Z", "name": "KubeStatefulSetReplicasMismatch", "query": "(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) == 0)", - "state": "inactive", + "state": "pending", "type": "alerting" }, { @@ -1433,12 +1619,12 @@ "summary": "StatefulSet generation mismatch due to possible roll-back" }, "duration": 900, - "evaluationTime": 0.000179212, + "evaluationTime": 0.000162902, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.005400219Z", + "lastEvaluation": "2021-06-18T11:24:40.008746798Z", "name": "KubeStatefulSetGenerationMismatch", "query": "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_metadata_generation{job=\"kube-state-metrics\",namespace=~\".*\"}", "state": "inactive", @@ -1452,12 +1638,12 @@ "summary": "StatefulSet update has not been rolled out." }, "duration": 900, - "evaluationTime": 0.000307299, + "evaluationTime": 0.000315396, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.005580909Z", + "lastEvaluation": "2021-06-18T11:24:40.008910407Z", "name": "KubeStatefulSetUpdateNotRolledOut", "query": "(max without(revision) (kube_statefulset_status_current_revision{job=\"kube-state-metrics\",namespace=~\".*\"} unless kube_statefulset_status_update_revision{job=\"kube-state-metrics\",namespace=~\".*\"}) * (kube_statefulset_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"})) and (changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[5m]) == 0)", "state": "inactive", @@ -1471,12 +1657,12 @@ "summary": "DaemonSet rollout is stuck." }, "duration": 900, - "evaluationTime": 0.000504864, + "evaluationTime": 0.000562858, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.005889373Z", + "lastEvaluation": "2021-06-18T11:24:40.009226538Z", "name": "KubeDaemonSetRolloutStuck", "query": "((kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}) or (kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != 0) or (kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}) or (kube_daemonset_status_number_available{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"})) and (changes(kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}[5m]) == 0)", "state": "inactive", @@ -1490,12 +1676,12 @@ "summary": "Pod container waiting longer than 1 hour" }, "duration": 3600, - "evaluationTime": 0.000550359, + "evaluationTime": 9.2189e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.006395616Z", + "lastEvaluation": "2021-06-18T11:24:40.009790714Z", "name": "KubeContainerWaiting", "query": "sum by(namespace, pod, container) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\",namespace=~\".*\"}) > 0", "state": "inactive", @@ -1509,12 +1695,12 @@ "summary": "DaemonSet pods are not scheduled." }, "duration": 600, - "evaluationTime": 0.000504993, + "evaluationTime": 0.000194653, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.00694784Z", + "lastEvaluation": "2021-06-18T11:24:40.0098836Z", "name": "KubeDaemonSetNotScheduled", "query": "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} - kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", "state": "inactive", @@ -1528,12 +1714,12 @@ "summary": "DaemonSet pods are misscheduled." }, "duration": 900, - "evaluationTime": 0.000154867, + "evaluationTime": 7.8301e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.007455286Z", + "lastEvaluation": "2021-06-18T11:24:40.010079003Z", "name": "KubeDaemonSetMisScheduled", "query": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", "state": "inactive", @@ -1547,12 +1733,12 @@ "summary": "Job did not complete in time" }, "duration": 43200, - "evaluationTime": 0.000185855, + "evaluationTime": 8.7612e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.007611872Z", + "lastEvaluation": "2021-06-18T11:24:40.010157847Z", "name": "KubeJobCompletion", "query": "kube_job_spec_completions{job=\"kube-state-metrics\",namespace=~\".*\"} - kube_job_status_succeeded{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", "state": "inactive", @@ -1566,12 +1752,12 @@ "summary": "Job failed to complete." }, "duration": 900, - "evaluationTime": 7.9369e-05, + "evaluationTime": 5.6082e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.007799142Z", + "lastEvaluation": "2021-06-18T11:24:40.010245917Z", "name": "KubeJobFailed", "query": "kube_job_failed{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", "state": "inactive", @@ -1585,12 +1771,12 @@ "summary": "HPA has not matched descired number of replicas." }, "duration": 900, - "evaluationTime": 0.00043403, + "evaluationTime": 0.000421375, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.007879187Z", + "lastEvaluation": "2021-06-18T11:24:40.01030259Z", "name": "KubeHpaReplicasMismatch", "query": "(kube_hpa_status_desired_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} > kube_hpa_spec_min_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} < kube_hpa_spec_max_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and changes(kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}[15m]) == 0", "state": "inactive", @@ -1604,12 +1790,12 @@ "summary": "HPA is running at max replicas" }, "duration": 900, - "evaluationTime": 0.00010834, + "evaluationTime": 0.000127034, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:10.008314226Z", + "lastEvaluation": "2021-06-18T11:24:40.010725712Z", "name": "KubeHpaMaxedOut", "query": "kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} == kube_hpa_spec_max_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}", "state": "inactive", @@ -1618,10 +1804,10 @@ ] }, { - "evaluationTime": 0.001756099, + "evaluationTime": 0.002152213, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-resources.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:54:02.941671699Z", + "lastEvaluation": "2021-06-18T11:25:02.941702475Z", "name": "kubernetes-resources", "rules": [ { @@ -1632,12 +1818,12 @@ "summary": "Cluster has overcommitted CPU resource requests." }, "duration": 300, - "evaluationTime": 0.00084701, + "evaluationTime": 0.000445326, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:02.941679568Z", + "lastEvaluation": "2021-06-18T11:25:02.941709832Z", "name": "KubeCPUOvercommit", "query": "sum(namespace_cpu:kube_pod_container_resource_requests:sum) / sum(kube_node_status_allocatable{resource=\"cpu\"}) > ((count(kube_node_status_allocatable{resource=\"cpu\"}) > 1) - 1) / count(kube_node_status_allocatable{resource=\"cpu\"})", "state": "inactive", @@ -1651,12 +1837,12 @@ "summary": "Cluster has overcommitted memory resource requests." }, "duration": 300, - "evaluationTime": 0.000235045, + "evaluationTime": 0.000195045, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:02.942529041Z", + "lastEvaluation": "2021-06-18T11:25:02.942157136Z", "name": "KubeMemoryOvercommit", "query": "sum(namespace_memory:kube_pod_container_resource_requests:sum) / sum(kube_node_status_allocatable{resource=\"memory\"}) > ((count(kube_node_status_allocatable{resource=\"memory\"}) > 1) - 1) / count(kube_node_status_allocatable{resource=\"memory\"})", "state": "inactive", @@ -1670,12 +1856,12 @@ "summary": "Cluster has overcommitted CPU resource requests." }, "duration": 300, - "evaluationTime": 9.8925e-05, + "evaluationTime": 0.00012626, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:02.942765062Z", + "lastEvaluation": "2021-06-18T11:25:02.942353167Z", "name": "KubeCPUQuotaOvercommit", "query": "sum(kube_resourcequota{job=\"kube-state-metrics\",resource=\"cpu\",type=\"hard\"}) / sum(kube_node_status_allocatable{resource=\"cpu\"}) > 1.5", "state": "inactive", @@ -1689,12 +1875,12 @@ "summary": "Cluster has overcommitted memory resource requests." }, "duration": 300, - "evaluationTime": 8.9432e-05, + "evaluationTime": 0.000461438, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:02.942865729Z", + "lastEvaluation": "2021-06-18T11:25:02.942480055Z", "name": "KubeMemoryQuotaOvercommit", "query": "sum(kube_resourcequota{job=\"kube-state-metrics\",resource=\"memory\",type=\"hard\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"memory\"}) > 1.5", "state": "inactive", @@ -1708,12 +1894,12 @@ "summary": "Namespace quota is going to be full." }, "duration": 900, - "evaluationTime": 0.000122212, + "evaluationTime": 0.000372597, "health": "ok", "labels": { "severity": "info" }, - "lastEvaluation": "2021-05-07T11:54:02.942956237Z", + "lastEvaluation": "2021-06-18T11:25:02.942945936Z", "name": "KubeQuotaAlmostFull", "query": "kube_resourcequota{job=\"kube-state-metrics\",type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\",type=\"hard\"} > 0) > 0.9 < 1", "state": "inactive", @@ -1727,12 +1913,12 @@ "summary": "Namespace quota is fully used." }, "duration": 900, - "evaluationTime": 0.000121128, + "evaluationTime": 0.000125757, "health": "ok", "labels": { "severity": "info" }, - "lastEvaluation": "2021-05-07T11:54:02.943079222Z", + "lastEvaluation": "2021-06-18T11:25:02.943319294Z", "name": "KubeQuotaFullyUsed", "query": "kube_resourcequota{job=\"kube-state-metrics\",type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\",type=\"hard\"} > 0) == 1", "state": "inactive", @@ -1746,12 +1932,12 @@ "summary": "Namespace quota has exceeded the limits." }, "duration": 900, - "evaluationTime": 8.2982e-05, + "evaluationTime": 0.000117394, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:02.943201342Z", + "lastEvaluation": "2021-06-18T11:25:02.943445843Z", "name": "KubeQuotaExceeded", "query": "kube_resourcequota{job=\"kube-state-metrics\",type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\",type=\"hard\"} > 0) > 1", "state": "inactive", @@ -1765,12 +1951,12 @@ "summary": "Processes experience elevated CPU throttling." }, "duration": 900, - "evaluationTime": 0.000139781, + "evaluationTime": 0.000287323, "health": "ok", "labels": { "severity": "info" }, - "lastEvaluation": "2021-05-07T11:54:02.943284909Z", + "lastEvaluation": "2021-06-18T11:25:02.943564325Z", "name": "CPUThrottlingHigh", "query": "sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=\"\"}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > (25 / 100)", "state": "inactive", @@ -1779,10 +1965,10 @@ ] }, { - "evaluationTime": 0.002370923, + "evaluationTime": 0.0010178, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-storage.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:53:56.33140494Z", + "lastEvaluation": "2021-06-18T11:24:56.330748474Z", "name": "kubernetes-storage", "rules": [ { @@ -1793,12 +1979,12 @@ "summary": "PersistentVolume is filling up." }, "duration": 60, - "evaluationTime": 0.001160203, + "evaluationTime": 0.000440356, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:53:56.331411661Z", + "lastEvaluation": "2021-06-18T11:24:56.330757817Z", "name": "KubePersistentVolumeFillingUp", "query": "kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} < 0.03", "state": "inactive", @@ -1812,12 +1998,12 @@ "summary": "PersistentVolume is filling up." }, "duration": 3600, - "evaluationTime": 0.001077033, + "evaluationTime": 0.00043012, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:56.332574067Z", + "lastEvaluation": "2021-06-18T11:24:56.331200026Z", "name": "KubePersistentVolumeFillingUp", "query": "(kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"}) < 0.15 and predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"}[6h], 4 * 24 * 3600) < 0", "state": "inactive", @@ -1831,12 +2017,12 @@ "summary": "PersistentVolume is having issues with provisioning." }, "duration": 300, - "evaluationTime": 0.000119521, + "evaluationTime": 0.000132208, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:53:56.333653347Z", + "lastEvaluation": "2021-06-18T11:24:56.33163112Z", "name": "KubePersistentVolumeErrors", "query": "kube_persistentvolume_status_phase{job=\"kube-state-metrics\",phase=~\"Failed|Pending\"} > 0", "state": "inactive", @@ -1845,10 +2031,10 @@ ] }, { - "evaluationTime": 0.001766946, + "evaluationTime": 0.004525335, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-system-apiserver.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:53:52.146615657Z", + "lastEvaluation": "2021-06-18T11:24:52.147596585Z", "name": "kubernetes-system-apiserver", "rules": [ { @@ -1859,12 +2045,12 @@ "summary": "Client certificate is about to expire." }, "duration": 0, - "evaluationTime": 0.000618219, + "evaluationTime": 0.00055716, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:52.146621803Z", + "lastEvaluation": "2021-06-18T11:24:52.147604484Z", "name": "KubeClientCertificateExpiration", "query": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800", "state": "inactive", @@ -1878,12 +2064,12 @@ "summary": "Client certificate is about to expire." }, "duration": 0, - "evaluationTime": 0.000177085, + "evaluationTime": 0.000300639, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:53:52.147242087Z", + "lastEvaluation": "2021-06-18T11:24:52.148163534Z", "name": "KubeClientCertificateExpiration", "query": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 86400", "state": "inactive", @@ -1897,12 +2083,12 @@ "summary": "An aggregated API has reported errors." }, "duration": 0, - "evaluationTime": 0.000189102, + "evaluationTime": 0.00018474, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:52.147420607Z", + "lastEvaluation": "2021-06-18T11:24:52.148465217Z", "name": "AggregatedAPIErrors", "query": "sum by(name, namespace) (increase(aggregator_unavailable_apiservice_total[10m])) > 4", "state": "inactive", @@ -1916,49 +2102,34 @@ "summary": "An aggregated API is down." }, "duration": 300, - "evaluationTime": 0.000219021, + "evaluationTime": 0.000642564, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:52.147610863Z", + "lastEvaluation": "2021-06-18T11:24:52.14865154Z", "name": "AggregatedAPIDown", "query": "(1 - max by(name, namespace) (avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85", "state": "inactive", "type": "alerting" }, { - "alerts": [ - { - "activeAt": "2021-05-07T11:53:52.14654001Z", - "annotations": { - "description": "KubeAPI has disappeared from Prometheus target discovery.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown", - "summary": "Target disappeared from Prometheus target discovery." - }, - "labels": { - "alertname": "KubeAPIDown", - "severity": "critical" - }, - "state": "pending", - "value": "1e+00" - } - ], + "alerts": [], "annotations": { "description": "KubeAPI has disappeared from Prometheus target discovery.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown", "summary": "Target disappeared from Prometheus target discovery." }, "duration": 900, - "evaluationTime": 0.000314115, + "evaluationTime": 0.000113863, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:53:52.147830778Z", + "lastEvaluation": "2021-06-18T11:24:52.149295614Z", "name": "KubeAPIDown", "query": "absent(up{job=\"apiserver\"} == 1)", - "state": "pending", + "state": "inactive", "type": "alerting" }, { @@ -1969,12 +2140,12 @@ "summary": "The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests." }, "duration": 300, - "evaluationTime": 0.000232154, + "evaluationTime": 0.00270748, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:52.148146995Z", + "lastEvaluation": "2021-06-18T11:24:52.149410809Z", "name": "KubeAPITerminatedRequests", "query": "sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m])) / (sum(rate(apiserver_request_total{job=\"apiserver\"}[10m])) + sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m]))) > 0.2", "state": "inactive", @@ -1983,53 +2154,38 @@ ] }, { - "evaluationTime": 0.000511166, + "evaluationTime": 0.000312673, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-system-controller-manager.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:54:05.676135045Z", + "lastEvaluation": "2021-06-18T11:25:05.676870049Z", "name": "kubernetes-system-controller-manager", "rules": [ { - "alerts": [ - { - "activeAt": "2021-05-07T11:54:05.675120003Z", - "annotations": { - "description": "KubeControllerManager has disappeared from Prometheus target discovery.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown", - "summary": "Target disappeared from Prometheus target discovery." - }, - "labels": { - "alertname": "KubeControllerManagerDown", - "severity": "critical" - }, - "state": "pending", - "value": "1e+00" - } - ], + "alerts": [], "annotations": { "description": "KubeControllerManager has disappeared from Prometheus target discovery.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown", "summary": "Target disappeared from Prometheus target discovery." }, "duration": 900, - "evaluationTime": 0.000500134, + "evaluationTime": 0.000300689, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:05.676141912Z", + "lastEvaluation": "2021-06-18T11:25:05.676877615Z", "name": "KubeControllerManagerDown", "query": "absent(up{job=\"kube-controller-manager\"} == 1)", - "state": "pending", + "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0.003391065, + "evaluationTime": 0.00248392, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-system-kubelet.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:54:08.336943264Z", + "lastEvaluation": "2021-06-18T11:24:38.337580679Z", "name": "kubernetes-system-kubelet", "rules": [ { @@ -2040,12 +2196,12 @@ "summary": "Node is not ready." }, "duration": 900, - "evaluationTime": 0.000666004, + "evaluationTime": 0.000232751, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:08.336950567Z", + "lastEvaluation": "2021-06-18T11:24:38.337586367Z", "name": "KubeNodeNotReady", "query": "kube_node_status_condition{condition=\"Ready\",job=\"kube-state-metrics\",status=\"true\"} == 0", "state": "inactive", @@ -2059,12 +2215,12 @@ "summary": "Node is unreachable." }, "duration": 900, - "evaluationTime": 0.000466827, + "evaluationTime": 0.000201179, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:08.337619993Z", + "lastEvaluation": "2021-06-18T11:24:38.337820771Z", "name": "KubeNodeUnreachable", "query": "(kube_node_spec_taint{effect=\"NoSchedule\",job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\"} unless ignoring(key, value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1", "state": "inactive", @@ -2078,12 +2234,12 @@ "summary": "Kubelet is running at capacity." }, "duration": 900, - "evaluationTime": 0.000499166, + "evaluationTime": 0.001104386, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:08.338089393Z", + "lastEvaluation": "2021-06-18T11:24:38.33802266Z", "name": "KubeletTooManyPods", "query": "count by(node) ((kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance, pod, namespace, cluster) group_left(node) topk by(instance, pod, namespace, cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})) / max by(node) (kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1) > 0.95", "state": "inactive", @@ -2097,12 +2253,12 @@ "summary": "Node readiness status is flapping." }, "duration": 900, - "evaluationTime": 0.000212117, + "evaluationTime": 0.000143268, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:08.338591702Z", + "lastEvaluation": "2021-06-18T11:24:38.339128675Z", "name": "KubeNodeReadinessFlapping", "query": "sum by(node) (changes(kube_node_status_condition{condition=\"Ready\",status=\"true\"}[15m])) > 2", "state": "inactive", @@ -2116,12 +2272,12 @@ "summary": "Kubelet Pod Lifecycle Event Generator is taking too long to relist." }, "duration": 300, - "evaluationTime": 0.000107239, + "evaluationTime": 6.2581e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:08.33880514Z", + "lastEvaluation": "2021-06-18T11:24:38.339272803Z", "name": "KubeletPlegDurationHigh", "query": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} >= 10", "state": "inactive", @@ -2135,12 +2291,12 @@ "summary": "Kubelet Pod startup latency is too high." }, "duration": 900, - "evaluationTime": 0.000290021, + "evaluationTime": 0.000392748, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:08.338913489Z", + "lastEvaluation": "2021-06-18T11:24:38.339335826Z", "name": "KubeletPodStartUpLatencyHigh", "query": "histogram_quantile(0.99, sum by(instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\",metrics_path=\"/metrics\"}[5m]))) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\",metrics_path=\"/metrics\"} > 60", "state": "inactive", @@ -2154,12 +2310,12 @@ "summary": "Kubelet client certificate is about to expire." }, "duration": 0, - "evaluationTime": 0.00012213, + "evaluationTime": 5.205e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:08.339205138Z", + "lastEvaluation": "2021-06-18T11:24:38.339729449Z", "name": "KubeletClientCertificateExpiration", "query": "kubelet_certificate_manager_client_ttl_seconds < 604800", "state": "inactive", @@ -2173,12 +2329,12 @@ "summary": "Kubelet client certificate is about to expire." }, "duration": 0, - "evaluationTime": 8.409e-05, + "evaluationTime": 3.8136e-05, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:08.339328419Z", + "lastEvaluation": "2021-06-18T11:24:38.339782124Z", "name": "KubeletClientCertificateExpiration", "query": "kubelet_certificate_manager_client_ttl_seconds < 86400", "state": "inactive", @@ -2192,12 +2348,12 @@ "summary": "Kubelet server certificate is about to expire." }, "duration": 0, - "evaluationTime": 9.123e-05, + "evaluationTime": 2.8335e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:08.339413514Z", + "lastEvaluation": "2021-06-18T11:24:38.339820895Z", "name": "KubeletServerCertificateExpiration", "query": "kubelet_certificate_manager_server_ttl_seconds < 604800", "state": "inactive", @@ -2211,12 +2367,12 @@ "summary": "Kubelet server certificate is about to expire." }, "duration": 0, - "evaluationTime": 7.7262e-05, + "evaluationTime": 3.6317e-05, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:08.339505659Z", + "lastEvaluation": "2021-06-18T11:24:38.33984967Z", "name": "KubeletServerCertificateExpiration", "query": "kubelet_certificate_manager_server_ttl_seconds < 86400", "state": "inactive", @@ -2230,12 +2386,12 @@ "summary": "Kubelet has failed to renew its client certificate." }, "duration": 900, - "evaluationTime": 0.000133196, + "evaluationTime": 4.592e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:08.339584784Z", + "lastEvaluation": "2021-06-18T11:24:38.33988643Z", "name": "KubeletClientCertificateRenewalErrors", "query": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0", "state": "inactive", @@ -2249,101 +2405,71 @@ "summary": "Kubelet has failed to renew its server certificate." }, "duration": 900, - "evaluationTime": 9.8099e-05, + "evaluationTime": 4.6076e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:08.339720603Z", + "lastEvaluation": "2021-06-18T11:24:38.339933254Z", "name": "KubeletServerCertificateRenewalErrors", "query": "increase(kubelet_server_expiration_renew_errors[5m]) > 0", "state": "inactive", "type": "alerting" }, { - "alerts": [ - { - "activeAt": "2021-05-07T11:54:08.336137741Z", - "annotations": { - "description": "Kubelet has disappeared from Prometheus target discovery.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown", - "summary": "Target disappeared from Prometheus target discovery." - }, - "labels": { - "alertname": "KubeletDown", - "severity": "critical" - }, - "state": "pending", - "value": "1e+00" - } - ], + "alerts": [], "annotations": { "description": "Kubelet has disappeared from Prometheus target discovery.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown", "summary": "Target disappeared from Prometheus target discovery." }, "duration": 900, - "evaluationTime": 0.000509168, + "evaluationTime": 8.212e-05, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:08.339819811Z", + "lastEvaluation": "2021-06-18T11:24:38.339980533Z", "name": "KubeletDown", "query": "absent(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1)", - "state": "pending", + "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0.001782009, + "evaluationTime": 0.000485672, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-system-scheduler.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:53:49.593406866Z", + "lastEvaluation": "2021-06-18T11:24:49.588614557Z", "name": "kubernetes-system-scheduler", "rules": [ { - "alerts": [ - { - "activeAt": "2021-05-07T11:53:49.587750705Z", - "annotations": { - "description": "KubeScheduler has disappeared from Prometheus target discovery.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown", - "summary": "Target disappeared from Prometheus target discovery." - }, - "labels": { - "alertname": "KubeSchedulerDown", - "severity": "critical" - }, - "state": "pending", - "value": "1e+00" - } - ], + "alerts": [], "annotations": { "description": "KubeScheduler has disappeared from Prometheus target discovery.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown", "summary": "Target disappeared from Prometheus target discovery." }, "duration": 900, - "evaluationTime": 0.00175813, + "evaluationTime": 0.000471376, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:53:49.593418185Z", + "lastEvaluation": "2021-06-18T11:24:49.588624907Z", "name": "KubeSchedulerDown", "query": "absent(up{job=\"kube-scheduler\"} == 1)", - "state": "pending", + "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0, + "evaluationTime": 0.001098469, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-system.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:47.92774153Z", "name": "kubernetes-system", "rules": [ { @@ -2354,12 +2480,12 @@ "summary": "Different semantic versions of Kubernetes components running." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000533389, + "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:47.927749512Z", "name": "KubeVersionMismatch", "query": "count(count by(git_version) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"}, \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", "state": "inactive", @@ -2373,12 +2499,12 @@ "summary": "Kubernetes API server client is experiencing errors." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000551151, + "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:47.928285555Z", "name": "KubeClientErrors", "query": "(sum by(instance, job) (rate(rest_client_requests_total{code=~\"5..\"}[5m])) / sum by(instance, job) (rate(rest_client_requests_total[5m]))) > 0.01", "state": "inactive", @@ -2387,107 +2513,107 @@ ] }, { - "evaluationTime": 0.001198775, + "evaluationTime": 0.002881344, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-node-exporter.rules.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:53:57.325144618Z", + "lastEvaluation": "2021-06-18T11:24:57.325078213Z", "name": "node-exporter.rules", "rules": [ { - "evaluationTime": 0.000396275, + "evaluationTime": 0.000592422, "health": "ok", - "lastEvaluation": "2021-05-07T11:53:57.325153996Z", + "lastEvaluation": "2021-06-18T11:24:57.325091023Z", "name": "instance:node_num_cpu:sum", "query": "count without(cpu) (count without(mode) (node_cpu_seconds_total{job=\"node-exporter\"}))", "type": "recording" }, { - "evaluationTime": 0.000133177, + "evaluationTime": 0.000431526, "health": "ok", - "lastEvaluation": "2021-05-07T11:53:57.325552462Z", - "name": "instance:node_cpu_utilisation:rate1m", - "query": "1 - avg without(cpu, mode) (rate(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}[1m]))", + "lastEvaluation": "2021-06-18T11:24:57.325685736Z", + "name": "instance:node_cpu_utilisation:rate5m", + "query": "1 - avg without(cpu, mode) (rate(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}[5m]))", "type": "recording" }, { - "evaluationTime": 5.2619e-05, + "evaluationTime": 0.000224943, "health": "ok", - "lastEvaluation": "2021-05-07T11:53:57.32568638Z", + "lastEvaluation": "2021-06-18T11:24:57.32612068Z", "name": "instance:node_load1_per_cpu:ratio", "query": "(node_load1{job=\"node-exporter\"} / instance:node_num_cpu:sum{job=\"node-exporter\"})", "type": "recording" }, { - "evaluationTime": 5.7179e-05, + "evaluationTime": 0.000172692, "health": "ok", - "lastEvaluation": "2021-05-07T11:53:57.325739539Z", + "lastEvaluation": "2021-06-18T11:24:57.326346653Z", "name": "instance:node_memory_utilisation:ratio", "query": "1 - (node_memory_MemAvailable_bytes{job=\"node-exporter\"} / node_memory_MemTotal_bytes{job=\"node-exporter\"})", "type": "recording" }, { - "evaluationTime": 3.4477e-05, + "evaluationTime": 0.000112864, "health": "ok", - "lastEvaluation": "2021-05-07T11:53:57.325797642Z", - "name": "instance:node_vmstat_pgmajfault:rate1m", - "query": "rate(node_vmstat_pgmajfault{job=\"node-exporter\"}[1m])", + "lastEvaluation": "2021-06-18T11:24:57.326520256Z", + "name": "instance:node_vmstat_pgmajfault:rate5m", + "query": "rate(node_vmstat_pgmajfault{job=\"node-exporter\"}[5m])", "type": "recording" }, { - "evaluationTime": 0.000123701, + "evaluationTime": 0.000208176, "health": "ok", - "lastEvaluation": "2021-05-07T11:53:57.325832737Z", - "name": "instance_device:node_disk_io_time_seconds:rate1m", - "query": "rate(node_disk_io_time_seconds_total{device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\",job=\"node-exporter\"}[1m])", + "lastEvaluation": "2021-06-18T11:24:57.326634133Z", + "name": "instance_device:node_disk_io_time_seconds:rate5m", + "query": "rate(node_disk_io_time_seconds_total{device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\",job=\"node-exporter\"}[5m])", "type": "recording" }, { - "evaluationTime": 0.000124634, + "evaluationTime": 0.000148599, "health": "ok", - "lastEvaluation": "2021-05-07T11:53:57.325956945Z", - "name": "instance_device:node_disk_io_time_weighted_seconds:rate1m", - "query": "rate(node_disk_io_time_weighted_seconds_total{device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\",job=\"node-exporter\"}[1m])", + "lastEvaluation": "2021-06-18T11:24:57.32684311Z", + "name": "instance_device:node_disk_io_time_weighted_seconds:rate5m", + "query": "rate(node_disk_io_time_weighted_seconds_total{device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\",job=\"node-exporter\"}[5m])", "type": "recording" }, { - "evaluationTime": 8.2229e-05, + "evaluationTime": 0.000235378, "health": "ok", - "lastEvaluation": "2021-05-07T11:53:57.3260827Z", - "name": "instance:node_network_receive_bytes_excluding_lo:rate1m", - "query": "sum without(device) (rate(node_network_receive_bytes_total{device!=\"lo\",job=\"node-exporter\"}[1m]))", + "lastEvaluation": "2021-06-18T11:24:57.326992444Z", + "name": "instance:node_network_receive_bytes_excluding_lo:rate5m", + "query": "sum without(device) (rate(node_network_receive_bytes_total{device!=\"lo\",job=\"node-exporter\"}[5m]))", "type": "recording" }, { - "evaluationTime": 6.0207e-05, + "evaluationTime": 0.000230802, "health": "ok", - "lastEvaluation": "2021-05-07T11:53:57.326165982Z", - "name": "instance:node_network_transmit_bytes_excluding_lo:rate1m", - "query": "sum without(device) (rate(node_network_transmit_bytes_total{device!=\"lo\",job=\"node-exporter\"}[1m]))", + "lastEvaluation": "2021-06-18T11:24:57.327228613Z", + "name": "instance:node_network_transmit_bytes_excluding_lo:rate5m", + "query": "sum without(device) (rate(node_network_transmit_bytes_total{device!=\"lo\",job=\"node-exporter\"}[5m]))", "type": "recording" }, { - "evaluationTime": 5.6091e-05, + "evaluationTime": 0.000261766, "health": "ok", - "lastEvaluation": "2021-05-07T11:53:57.326227179Z", - "name": "instance:node_network_receive_drop_excluding_lo:rate1m", - "query": "sum without(device) (rate(node_network_receive_drop_total{device!=\"lo\",job=\"node-exporter\"}[1m]))", + "lastEvaluation": "2021-06-18T11:24:57.327460361Z", + "name": "instance:node_network_receive_drop_excluding_lo:rate5m", + "query": "sum without(device) (rate(node_network_receive_drop_total{device!=\"lo\",job=\"node-exporter\"}[5m]))", "type": "recording" }, { - "evaluationTime": 5.6112e-05, + "evaluationTime": 0.0002338, "health": "ok", - "lastEvaluation": "2021-05-07T11:53:57.326284177Z", - "name": "instance:node_network_transmit_drop_excluding_lo:rate1m", - "query": "sum without(device) (rate(node_network_transmit_drop_total{device!=\"lo\",job=\"node-exporter\"}[1m]))", + "lastEvaluation": "2021-06-18T11:24:57.32772303Z", + "name": "instance:node_network_transmit_drop_excluding_lo:rate5m", + "query": "sum without(device) (rate(node_network_transmit_drop_total{device!=\"lo\",job=\"node-exporter\"}[5m]))", "type": "recording" } ] }, { - "evaluationTime": 0.003122097, + "evaluationTime": 0.012385452, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-node-exporter.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:54:05.358712703Z", + "lastEvaluation": "2021-06-18T11:25:05.359317999Z", "name": "node-exporter", "rules": [ { @@ -2498,12 +2624,12 @@ "summary": "Filesystem is predicted to run out of space within the next 24 hours." }, "duration": 3600, - "evaluationTime": 0.000826976, + "evaluationTime": 0.001722095, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:05.358719119Z", + "lastEvaluation": "2021-06-18T11:25:05.359325132Z", "name": "NodeFilesystemSpaceFillingUp", "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 40 and predict_linear(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", @@ -2517,12 +2643,12 @@ "summary": "Filesystem is predicted to run out of space within the next 4 hours." }, "duration": 3600, - "evaluationTime": 0.00030962, + "evaluationTime": 0.00227843, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:05.359548707Z", + "lastEvaluation": "2021-06-18T11:25:05.361049089Z", "name": "NodeFilesystemSpaceFillingUp", "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and predict_linear(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", @@ -2536,12 +2662,12 @@ "summary": "Filesystem has less than 5% space left." }, "duration": 3600, - "evaluationTime": 0.000155787, + "evaluationTime": 0.001013755, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:05.359860182Z", + "lastEvaluation": "2021-06-18T11:25:05.363328594Z", "name": "NodeFilesystemAlmostOutOfSpace", "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 5 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", @@ -2555,12 +2681,12 @@ "summary": "Filesystem has less than 3% space left." }, "duration": 3600, - "evaluationTime": 0.000195325, + "evaluationTime": 0.001043328, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:05.360016733Z", + "lastEvaluation": "2021-06-18T11:25:05.364343423Z", "name": "NodeFilesystemAlmostOutOfSpace", "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 3 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", @@ -2574,12 +2700,12 @@ "summary": "Filesystem is predicted to run out of inodes within the next 24 hours." }, "duration": 3600, - "evaluationTime": 0.000358031, + "evaluationTime": 0.001303358, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:05.360213426Z", + "lastEvaluation": "2021-06-18T11:25:05.365387874Z", "name": "NodeFilesystemFilesFillingUp", "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 40 and predict_linear(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", @@ -2593,12 +2719,12 @@ "summary": "Filesystem is predicted to run out of inodes within the next 4 hours." }, "duration": 3600, - "evaluationTime": 0.000232544, + "evaluationTime": 0.001463305, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:05.360572931Z", + "lastEvaluation": "2021-06-18T11:25:05.366692155Z", "name": "NodeFilesystemFilesFillingUp", "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and predict_linear(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", @@ -2612,12 +2738,12 @@ "summary": "Filesystem has less than 5% inodes left." }, "duration": 3600, - "evaluationTime": 0.000164823, + "evaluationTime": 0.000957354, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:05.360806458Z", + "lastEvaluation": "2021-06-18T11:25:05.368156639Z", "name": "NodeFilesystemAlmostOutOfFiles", "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 5 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", @@ -2631,12 +2757,12 @@ "summary": "Filesystem has less than 3% inodes left." }, "duration": 3600, - "evaluationTime": 0.000188264, + "evaluationTime": 0.000987361, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:05.360972429Z", + "lastEvaluation": "2021-06-18T11:25:05.369115415Z", "name": "NodeFilesystemAlmostOutOfFiles", "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 3 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", @@ -2650,12 +2776,12 @@ "summary": "Network interface is reporting many receive errors." }, "duration": 3600, - "evaluationTime": 8.2948e-05, + "evaluationTime": 0.000229894, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:05.361161643Z", + "lastEvaluation": "2021-06-18T11:25:05.370103922Z", "name": "NodeNetworkReceiveErrs", "query": "increase(node_network_receive_errs_total[2m]) > 10", "state": "inactive", @@ -2669,12 +2795,12 @@ "summary": "Network interface is reporting many transmit errors." }, "duration": 3600, - "evaluationTime": 7.9241e-05, + "evaluationTime": 0.000300009, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:05.361245117Z", + "lastEvaluation": "2021-06-18T11:25:05.370334579Z", "name": "NodeNetworkTransmitErrs", "query": "increase(node_network_transmit_errs_total[2m]) > 10", "state": "inactive", @@ -2688,12 +2814,12 @@ "summary": "Number of conntrack are getting close to the limit" }, "duration": 0, - "evaluationTime": 8.8827e-05, + "evaluationTime": 0.00016906, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:05.361325481Z", + "lastEvaluation": "2021-06-18T11:25:05.370635955Z", "name": "NodeHighNumberConntrackEntriesUsed", "query": "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75", "state": "inactive", @@ -2707,34 +2833,77 @@ "summary": "Clock skew detected." }, "duration": 600, - "evaluationTime": 0.000186416, + "evaluationTime": 0.000234783, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:05.361414923Z", + "lastEvaluation": "2021-06-18T11:25:05.370805765Z", "name": "NodeClockSkewDetected", "query": "(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)", "state": "inactive", "type": "alerting" }, { - "alerts": [], + "alerts": [ + { + "activeAt": "2021-06-18T11:24:05.358542542Z", + "annotations": { + "message": "Clock on 192.168.1.101:9100 is not synchronising. Ensure NTP is configured on this host.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising", + "summary": "Clock not synchronising." + }, + "labels": { + "alertname": "NodeClockNotSynchronising", + "container": "node-exporter", + "endpoint": "metrics", + "instance": "192.168.1.101:9100", + "job": "node-exporter", + "namespace": "metalk8s-monitoring", + "pod": "prometheus-operator-prometheus-node-exporter-ch6h7", + "service": "prometheus-operator-prometheus-node-exporter", + "severity": "warning" + }, + "state": "pending", + "value": "0e+00" + }, + { + "activeAt": "2021-06-18T11:24:05.358542542Z", + "annotations": { + "message": "Clock on 192.168.1.100:9100 is not synchronising. Ensure NTP is configured on this host.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising", + "summary": "Clock not synchronising." + }, + "labels": { + "alertname": "NodeClockNotSynchronising", + "container": "node-exporter", + "endpoint": "metrics", + "instance": "192.168.1.100:9100", + "job": "node-exporter", + "namespace": "metalk8s-monitoring", + "pod": "prometheus-operator-prometheus-node-exporter-bfknc", + "service": "prometheus-operator-prometheus-node-exporter", + "severity": "warning" + }, + "state": "pending", + "value": "0e+00" + } + ], "annotations": { "message": "Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising", "summary": "Clock not synchronising." }, "duration": 600, - "evaluationTime": 5.4439e-05, + "evaluationTime": 0.000459361, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:05.361602037Z", + "lastEvaluation": "2021-06-18T11:25:05.371041571Z", "name": "NodeClockNotSynchronising", "query": "min_over_time(node_timex_sync_status[5m]) == 0", - "state": "inactive", + "state": "pending", "type": "alerting" }, { @@ -2745,12 +2914,12 @@ "summary": "Node Exporter text file collector failed to scrape." }, "duration": 0, - "evaluationTime": 5.6123e-05, + "evaluationTime": 8.1699e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:05.361656971Z", + "lastEvaluation": "2021-06-18T11:25:05.371502307Z", "name": "NodeTextFileCollectorScrapeError", "query": "node_textfile_scrape_error{job=\"node-exporter\"} == 1", "state": "inactive", @@ -2764,12 +2933,12 @@ "summary": "RAID Array is degraded" }, "duration": 900, - "evaluationTime": 7.2225e-05, + "evaluationTime": 6.7002e-05, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:54:05.361713665Z", + "lastEvaluation": "2021-06-18T11:25:05.371584592Z", "name": "NodeRAIDDegraded", "query": "node_md_disks_required - ignoring(state) (node_md_disks{state=\"active\"}) >= 1", "state": "inactive", @@ -2783,12 +2952,12 @@ "summary": "Failed device in RAID array" }, "duration": 0, - "evaluationTime": 4.5285e-05, + "evaluationTime": 4.8292e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:54:05.361786423Z", + "lastEvaluation": "2021-06-18T11:25:05.371652265Z", "name": "NodeRAIDDiskFailure", "query": "node_md_disks{state=\"fail\"} >= 1", "state": "inactive", @@ -2797,24 +2966,25 @@ ] }, { - "evaluationTime": 0, + "evaluationTime": 0.00094999, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-node-network.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:46.988294952Z", "name": "node-network", "rules": [ { "alerts": [], "annotations": { - "message": "Network interface \"{{ $labels.device }}\" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}\"" + "message": "Network interface \"{{ $labels.device }}\" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkinterfaceflapping" }, "duration": 120, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000938181, + "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:46.988302942Z", "name": "NodeNetworkInterfaceFlapping", "query": "changes(node_network_up{device!~\"veth.+\",job=\"node-exporter\"}[2m]) > 2", "state": "inactive", @@ -2823,32 +2993,32 @@ ] }, { - "evaluationTime": 0, + "evaluationTime": 0.002062397, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-node.rules.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:48.107472278Z", "name": "node.rules", "rules": [ { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.001004127, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.107479934Z", "name": "node_namespace_pod:kube_pod_info:", "query": "topk by(namespace, pod) (1, max by(node, namespace, pod) (label_replace(kube_pod_info{job=\"kube-state-metrics\",node!=\"\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000705795, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.108487271Z", "name": "node:node_num_cpu:sum", "query": "count by(cluster, node) (sum by(node, cpu) (node_cpu_seconds_total{job=\"node-exporter\"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000336015, + "health": "ok", + "lastEvaluation": "2021-06-18T11:24:48.109195056Z", "name": ":node_memory_MemAvailable_bytes:sum", "query": "sum by(cluster) (node_memory_MemAvailable_bytes{job=\"node-exporter\"} or (node_memory_Buffers_bytes{job=\"node-exporter\"} + node_memory_Cached_bytes{job=\"node-exporter\"} + node_memory_MemFree_bytes{job=\"node-exporter\"} + node_memory_Slab_bytes{job=\"node-exporter\"}))", "type": "recording" @@ -2856,10 +3026,10 @@ ] }, { - "evaluationTime": 0, + "evaluationTime": 0.001369177, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-prometheus-operator.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:46.674470047Z", "name": "prometheus-operator", "rules": [ { @@ -2870,12 +3040,12 @@ "summary": "Errors while performing list operations in controller." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000456169, + "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:46.674477397Z", "name": "PrometheusOperatorListErrors", "query": "(sum by(controller, namespace) (rate(prometheus_operator_list_operations_failed_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m])) / sum by(controller, namespace) (rate(prometheus_operator_list_operations_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m]))) > 0.4", "state": "inactive", @@ -2889,12 +3059,12 @@ "summary": "Errors while performing watch operations in controller." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000190625, + "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:46.674935146Z", "name": "PrometheusOperatorWatchErrors", "query": "(sum by(controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m])) / sum by(controller, namespace) (rate(prometheus_operator_watch_operations_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m]))) > 0.4", "state": "inactive", @@ -2908,12 +3078,12 @@ "summary": "Last controller reconciliation failed" }, "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 9.0145e-05, + "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:46.675126789Z", "name": "PrometheusOperatorSyncFailed", "query": "min_over_time(prometheus_operator_syncs{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\",status=\"failed\"}[5m]) > 0", "state": "inactive", @@ -2927,12 +3097,12 @@ "summary": "Errors while reconciling controller." }, "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000168298, + "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:46.675217734Z", "name": "PrometheusOperatorReconcileErrors", "query": "(sum by(controller, namespace) (rate(prometheus_operator_reconcile_errors_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]))) / (sum by(controller, namespace) (rate(prometheus_operator_reconcile_operations_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]))) > 0.1", "state": "inactive", @@ -2946,12 +3116,12 @@ "summary": "Errors while reconciling Prometheus." }, "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000105146, + "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:46.675386876Z", "name": "PrometheusOperatorNodeLookupErrors", "query": "rate(prometheus_operator_node_address_lookup_errors_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]) > 0.1", "state": "inactive", @@ -2965,12 +3135,12 @@ "summary": "Prometheus operator not ready" }, "duration": 300, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000177994, + "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:46.675492644Z", "name": "PrometheusOperatorNotReady", "query": "min by(namespace, controller) (max_over_time(prometheus_operator_ready{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]) == 0)", "state": "inactive", @@ -2984,12 +3154,12 @@ "summary": "Resources rejected by Prometheus operator" }, "duration": 300, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000164409, + "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-06-18T11:24:46.675672222Z", "name": "PrometheusOperatorRejectedResources", "query": "min_over_time(prometheus_operator_managed_resources{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\",state=\"rejected\"}[5m]) > 0", "state": "inactive", @@ -2998,25 +3168,26 @@ ] }, { - "evaluationTime": 0.002194673, + "evaluationTime": 0.002851569, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-prometheus.yaml", "interval": 30, - "lastEvaluation": "2021-05-07T11:53:54.860108721Z", + "lastEvaluation": "2021-06-18T11:24:54.86217086Z", "name": "prometheus", "rules": [ { "alerts": [], "annotations": { "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusbadconfig", "summary": "Failed Prometheus configuration reload." }, "duration": 600, - "evaluationTime": 0.000436673, + "evaluationTime": 0.000332297, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:53:54.860115332Z", + "lastEvaluation": "2021-06-18T11:24:54.862176892Z", "name": "PrometheusBadConfig", "query": "max_over_time(prometheus_config_last_reload_successful{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) == 0", "state": "inactive", @@ -3026,15 +3197,16 @@ "alerts": [], "annotations": { "description": "Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusnotificationqueuerunningfull", "summary": "Prometheus alert notification queue predicted to run full in less than 30m." }, "duration": 900, - "evaluationTime": 0.00020589, + "evaluationTime": 0.000345237, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:54.860553699Z", + "lastEvaluation": "2021-06-18T11:24:54.862511094Z", "name": "PrometheusNotificationQueueRunningFull", "query": "(predict_linear(prometheus_notifications_queue_length{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m], 60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]))", "state": "inactive", @@ -3044,51 +3216,35 @@ "alerts": [], "annotations": { "description": "{{ printf \"%.1f\" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheuserrorsendingalertstosomealertmanagers", "summary": "Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager." }, "duration": 900, - "evaluationTime": 0.000119124, + "evaluationTime": 0.000210797, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:54.860760576Z", + "lastEvaluation": "2021-06-18T11:24:54.862859344Z", "name": "PrometheusErrorSendingAlertsToSomeAlertmanagers", "query": "(rate(prometheus_notifications_errors_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(prometheus_notifications_sent_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) * 100 > 1", "state": "inactive", "type": "alerting" }, - { - "alerts": [], - "annotations": { - "description": "{{ printf \"%.1f\" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.", - "summary": "Prometheus encounters more than 3% errors sending alerts to any Alertmanager." - }, - "duration": 900, - "evaluationTime": 0.00018408, - "health": "ok", - "labels": { - "severity": "critical" - }, - "lastEvaluation": "2021-05-07T11:53:54.860880285Z", - "name": "PrometheusErrorSendingAlertsToAnyAlertmanager", - "query": "min without(alertmanager) (rate(prometheus_notifications_errors_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(prometheus_notifications_sent_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) * 100 > 3", - "state": "inactive", - "type": "alerting" - }, { "alerts": [], "annotations": { "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusnotconnectedtoalertmanagers", "summary": "Prometheus is not connected to any Alertmanagers." }, "duration": 600, - "evaluationTime": 7.5748e-05, + "evaluationTime": 7.8948e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:54.861065064Z", + "lastEvaluation": "2021-06-18T11:24:54.863070706Z", "name": "PrometheusNotConnectedToAlertmanagers", "query": "max_over_time(prometheus_notifications_alertmanagers_discovered{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) < 1", "state": "inactive", @@ -3098,15 +3254,16 @@ "alerts": [], "annotations": { "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheustsdbreloadsfailing", "summary": "Prometheus has issues reloading blocks from disk." }, "duration": 14400, - "evaluationTime": 0.000127935, + "evaluationTime": 6.5162e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:54.861141751Z", + "lastEvaluation": "2021-06-18T11:24:54.863150178Z", "name": "PrometheusTSDBReloadsFailing", "query": "increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[3h]) > 0", "state": "inactive", @@ -3116,15 +3273,16 @@ "alerts": [], "annotations": { "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheustsdbcompactionsfailing", "summary": "Prometheus has issues compacting blocks." }, "duration": 14400, - "evaluationTime": 7.3651e-05, + "evaluationTime": 6.8084e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:54.861270837Z", + "lastEvaluation": "2021-06-18T11:24:54.863215904Z", "name": "PrometheusTSDBCompactionsFailing", "query": "increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[3h]) > 0", "state": "inactive", @@ -3134,17 +3292,18 @@ "alerts": [], "annotations": { "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusnotingestingsamples", "summary": "Prometheus is not ingesting samples." }, "duration": 600, - "evaluationTime": 5.7867e-05, + "evaluationTime": 0.000414331, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:54.861345021Z", + "lastEvaluation": "2021-06-18T11:24:54.863284539Z", "name": "PrometheusNotIngestingSamples", - "query": "rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) <= 0", + "query": "(rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) <= 0 and (sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}) > 0 or sum without(rule_group) (prometheus_rule_group_rules{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}) > 0))", "state": "inactive", "type": "alerting" }, @@ -3152,15 +3311,16 @@ "alerts": [], "annotations": { "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf \"%.4g\" $value }} samples/s with different values but duplicated timestamp.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusduplicatetimestamps", "summary": "Prometheus is dropping samples with duplicate timestamps." }, "duration": 600, - "evaluationTime": 8.3437e-05, + "evaluationTime": 9.006e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:54.861403487Z", + "lastEvaluation": "2021-06-18T11:24:54.863700017Z", "name": "PrometheusDuplicateTimestamps", "query": "rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", "state": "inactive", @@ -3170,15 +3330,16 @@ "alerts": [], "annotations": { "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf \"%.4g\" $value }} samples/s with timestamps arriving out of order.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoutofordertimestamps", "summary": "Prometheus drops samples with out-of-order timestamps." }, "duration": 600, - "evaluationTime": 0.00011754, + "evaluationTime": 6.7852e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:54.861487734Z", + "lastEvaluation": "2021-06-18T11:24:54.863790694Z", "name": "PrometheusOutOfOrderTimestamps", "query": "rate(prometheus_target_scrapes_sample_out_of_order_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", "state": "inactive", @@ -3188,17 +3349,18 @@ "alerts": [], "annotations": { "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf \"%.1f\" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusremotestoragefailures", "summary": "Prometheus fails to send samples to remote storage." }, "duration": 900, - "evaluationTime": 0.000268627, + "evaluationTime": 0.000265043, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:53:54.861606421Z", + "lastEvaluation": "2021-06-18T11:24:54.863859068Z", "name": "PrometheusRemoteStorageFailures", - "query": "(rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / (rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) + rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]))) * 100 > 1", + "query": "((rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) / ((rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) + (rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])))) * 100 > 1", "state": "inactive", "type": "alerting" }, @@ -3206,17 +3368,18 @@ "alerts": [], "annotations": { "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf \"%.1f\" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusremotewritebehind", "summary": "Prometheus remote write is behind." }, "duration": 900, - "evaluationTime": 0.000130138, + "evaluationTime": 0.000108802, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:53:54.861875657Z", + "lastEvaluation": "2021-06-18T11:24:54.864124718Z", "name": "PrometheusRemoteWriteBehind", - "query": "(max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) - on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) > 120", + "query": "(max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) - ignoring(remote_name, url) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) > 120", "state": "inactive", "type": "alerting" }, @@ -3224,15 +3387,16 @@ "alerts": [], "annotations": { "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance=\"%s\",job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}` $labels.instance | query | first | value }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusremotewritedesiredshards", "summary": "Prometheus remote write desired shards calculation wants to run more than configured max shards." }, "duration": 900, - "evaluationTime": 0.000111533, + "evaluationTime": 8.0293e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:54.862006458Z", + "lastEvaluation": "2021-06-18T11:24:54.864234017Z", "name": "PrometheusRemoteWriteDesiredShards", "query": "(max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]))", "state": "inactive", @@ -3242,15 +3406,16 @@ "alerts": [], "annotations": { "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf \"%.0f\" $value }} rules in the last 5m.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusrulefailures", "summary": "Prometheus is failing rule evaluations." }, "duration": 900, - "evaluationTime": 7.6758e-05, + "evaluationTime": 0.000181708, "health": "ok", "labels": { "severity": "critical" }, - "lastEvaluation": "2021-05-07T11:53:54.862118555Z", + "lastEvaluation": "2021-06-18T11:24:54.864314802Z", "name": "PrometheusRuleFailures", "query": "increase(prometheus_rule_evaluation_failures_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", "state": "inactive", @@ -3260,15 +3425,16 @@ "alerts": [], "annotations": { "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf \"%.0f\" $value }} rule group evaluations in the last 5m.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusmissingruleevaluations", "summary": "Prometheus is missing rule evaluations due to slow rule group evaluation." }, "duration": 900, - "evaluationTime": 5.5317e-05, + "evaluationTime": 0.000183435, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:54.862195875Z", + "lastEvaluation": "2021-06-18T11:24:54.864497174Z", "name": "PrometheusMissingRuleEvaluations", "query": "increase(prometheus_rule_group_iterations_missed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", "state": "inactive", @@ -3278,19 +3444,58 @@ "alerts": [], "annotations": { "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf \"%.0f\" $value }} targets because the number of targets exceeded the configured target_limit.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheustargetlimithit", "summary": "Prometheus has dropped targets because some scrape configs have exceeded the targets limit." }, "duration": 900, - "evaluationTime": 4.8855e-05, + "evaluationTime": 6.8522e-05, "health": "ok", "labels": { "severity": "warning" }, - "lastEvaluation": "2021-05-07T11:53:54.862251661Z", + "lastEvaluation": "2021-06-18T11:24:54.864681216Z", "name": "PrometheusTargetLimitHit", "query": "increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", "state": "inactive", "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf \"%.0f\" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheuslabellimithit", + "summary": "Prometheus has dropped targets because some scrape configs have exceeded the labels limit." + }, + "duration": 900, + "evaluationTime": 6.2719e-05, + "health": "ok", + "labels": { + "severity": "warning" + }, + "lastEvaluation": "2021-06-18T11:24:54.864750348Z", + "name": "PrometheusLabelLimitHit", + "query": "increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "description": "{{ printf \"%.1f\" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheuserrorsendingalertstoanyalertmanager", + "summary": "Prometheus encounters more than 3% errors sending alerts to any Alertmanager." + }, + "duration": 900, + "evaluationTime": 0.000205877, + "health": "ok", + "labels": { + "severity": "critical" + }, + "lastEvaluation": "2021-06-18T11:24:54.864813627Z", + "name": "PrometheusErrorSendingAlertsToAnyAlertmanager", + "query": "min without(alertmanager) (rate(prometheus_notifications_errors_total{alertmanager!~\"\",job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(prometheus_notifications_sent_total{alertmanager!~\"\",job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) * 100 > 3", + "state": "inactive", + "type": "alerting" } ] }