From 889fee79da69893dcb791279f774bbe571f9c70f Mon Sep 17 00:00:00 2001
From: Teddy Andrieux
Date: Tue, 19 Oct 2021 10:49:11 +0200
Subject: [PATCH] charts,salt,tools: Add Thanos querier in front of Prometheus

As part of the kube-prometheus-stack chart, deploy the Thanos sidecar inside
the Prometheus Pod, as well as the Thanos sidecar service used by the Thanos
querier to discover the Thanos sidecars' IPs.

Re-render the chart using:

```
./charts/render.py prometheus-operator \
charts/kube-prometheus-stack.yaml \
charts/kube-prometheus-stack/ \
--namespace metalk8s-monitoring \
--service-config grafana \
metalk8s-grafana-config \
metalk8s/addons/prometheus-operator/config/grafana.yaml \
metalk8s-monitoring \
--service-config prometheus \
metalk8s-prometheus-config \
metalk8s/addons/prometheus-operator/config/prometheus.yaml \
metalk8s-monitoring \
--service-config alertmanager \
metalk8s-alertmanager-config \
metalk8s/addons/prometheus-operator/config/alertmanager.yaml \
metalk8s-monitoring \
--service-config dex \
metalk8s-dex-config \
metalk8s/addons/dex/config/dex.yaml.j2 metalk8s-auth \
--drop-prometheus-rules charts/drop-prometheus-rules.yaml \
> salt/metalk8s/addons/prometheus-operator/deployed/chart.sls
```

Import the Thanos helm chart from banzaicloud:

```
helm repo add banzaicloud-stable https://kubernetes-charts.banzaicloud.com
helm repo update
helm fetch -d charts --untar banzaicloud-stable/thanos
```

Note that we only deploy the Thanos querier, not the other Thanos components.
We also bump the Thanos image, since the one set in the helm chart is a bit
old and does not support all the Prometheus endpoints we need.

Render the Thanos helm chart using:

```
./charts/render.py thanos \
charts/thanos.yaml charts/thanos/ \
--namespace metalk8s-monitoring \
> salt/metalk8s/addons/prometheus-operator/deployed/thanos-chart.sls
```

Then we replace the Prometheus datasource for Grafana so that it uses this
Thanos querier, and we also point the proxy ingress used by the MetalK8s UI
at this Thanos querier.
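For reference, the values changes described above boil down to the following
condensed sketch of `charts/kube-prometheus-stack.yaml` (nesting simplified;
the `__image__`/`__full_image__` placeholders are MetalK8s render-time macros,
and `thanos-query-http` is the HTTP service created by the Thanos chart):

```yaml
prometheus:
  # Service used by the Thanos querier to discover the Thanos sidecars
  thanosService:
    enabled: true
  prometheusSpec:
    # Run the Thanos sidecar inside the Prometheus Pod
    thanos:
      image: '__full_image__(thanos)'

grafana:
  datasources:
    # Grafana now queries the Thanos querier instead of Prometheus directly
    url: http://thanos-query-http:10902/
```

The MetalK8s UI proxy ingress is redirected to the same querier service (see
the salt/metalk8s/addons/ui/deployed/ingress.sls change in the diff below).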
Since we now reach Thanos to retrieve the Prometheus information, the order of
the Prometheus rules changed a bit, so they were extracted again using:

```
./tools/rule_extractor/rule_extractor.py \
-i -p 8443 -t rules
```

NOTE: Tests did not get updated much; we only added a sanity check to ensure
the Thanos querier Pod is running. Since we already have tests covering
Prometheus and all of those tests now go through Thanos, the Thanos querier is
implicitly tested.
---
 CHANGELOG.md | 4 +
 buildchain/buildchain/constants.py | 1 +
 buildchain/buildchain/image.py | 3 +
 buildchain/buildchain/salt_tree.py | 1 +
 buildchain/buildchain/versions.py | 5 +
 charts/kube-prometheus-stack.yaml | 10 +
 charts/thanos.yaml | 46 +
 charts/thanos/.helmignore | 21 +
 charts/thanos/Chart.yaml | 18 +
 charts/thanos/README.md | 361 ++
 charts/thanos/requirements.yaml | 0
 charts/thanos/templates/NOTES.txt | 0
 charts/thanos/templates/_helpers.tpl | 62 +
 .../thanos/templates/bucket-deployment.yaml | 98 +
 charts/thanos/templates/bucket-ingress.yaml | 50 +
 .../templates/bucket-poddisruptionbudget.yaml | 25 +
 charts/thanos/templates/bucket-service.yaml | 28 +
 .../thanos/templates/compact-deployment.yaml | 108 +
 .../compact-persistentvolumeclaim.yaml | 20 +
 charts/thanos/templates/compact-service.yaml | 28 +
 .../templates/compact-servicemonitor.yaml | 30 +
 charts/thanos/templates/query-deployment.yaml | 159 +
 .../templates/query-frontend-deployment.yaml | 186 +
 ...uery-frontend-horizontalpodautoscaler.yaml | 35 +
 .../templates/query-frontend-ingress.yml | 107 +
 .../query-frontend-poddisruptionbudget.yaml | 25 +
 .../thanos/templates/query-frontend-psp.yaml | 27 +
 .../thanos/templates/query-frontend-rbac.yaml | 55 +
 .../templates/query-frontend-service.yaml | 63 +
 .../query-frontend-servicemonitor.yaml | 30 +
 .../query-horizontalpodautoscaler.yaml | 35 +
 charts/thanos/templates/query-ingress.yml | 108 +
 .../templates/query-poddisruptionbudget.yaml | 26 +
 charts/thanos/templates/query-psp.yaml | 27 +
 charts/thanos/templates/query-rbac.yaml | 55 +
 charts/thanos/templates/query-service.yaml | 63 +
 .../templates/query-servicemonitor.yaml | 30 +
 charts/thanos/templates/rule-configmap.yaml | 19 +
 charts/thanos/templates/rule-ingress.yml | 107 +
 charts/thanos/templates/rule-service.yaml | 63 +
 .../thanos/templates/rule-servicemonitor.yaml | 30 +
 charts/thanos/templates/rule-statefulset.yaml | 183 +
 charts/thanos/templates/secret.yaml | 20 +
 charts/thanos/templates/sidecar-ingress.yaml | 107 +
 charts/thanos/templates/sidecar-service.yaml | 63 +
 .../templates/sidecar-servicemonitor.yaml | 30 +
 charts/thanos/templates/store-deployment.yaml | 193 +
 charts/thanos/templates/store-ingress.yaml | 103 +
 .../store-persistentvolumeclaim.yaml | 31 +
 charts/thanos/templates/store-service.yaml | 63 +
 .../templates/store-servicemonitor.yaml | 30 +
 charts/thanos/values.yaml | 1222 ++++++
 .../prometheus-operator/deployed/chart.sls | 31 +-
 .../prometheus-operator/deployed/init.sls | 1 +
 .../deployed/thanos-chart.sls | 131 +
 .../addons/ui/deployed/dependencies.sls | 6 +-
 salt/metalk8s/addons/ui/deployed/ingress.sls | 4 +-
 tests/post/features/sanity.feature | 1 +
 tools/rule_extractor/alerting_rules.csv | 155 +-
 tools/rule_extractor/alerting_rules.json | 1778 ++++----
 tools/rule_extractor/rules.json | 3700 +++++++++++------
 61 files changed, 7731 insertions(+), 2290 deletions(-)
 create mode 100644 charts/thanos.yaml
 create mode 100644 charts/thanos/.helmignore
 create mode 100644 charts/thanos/Chart.yaml
 create mode 100644 charts/thanos/README.md
 create mode 100644
charts/thanos/requirements.yaml create mode 100644 charts/thanos/templates/NOTES.txt create mode 100644 charts/thanos/templates/_helpers.tpl create mode 100644 charts/thanos/templates/bucket-deployment.yaml create mode 100644 charts/thanos/templates/bucket-ingress.yaml create mode 100644 charts/thanos/templates/bucket-poddisruptionbudget.yaml create mode 100644 charts/thanos/templates/bucket-service.yaml create mode 100644 charts/thanos/templates/compact-deployment.yaml create mode 100644 charts/thanos/templates/compact-persistentvolumeclaim.yaml create mode 100644 charts/thanos/templates/compact-service.yaml create mode 100644 charts/thanos/templates/compact-servicemonitor.yaml create mode 100644 charts/thanos/templates/query-deployment.yaml create mode 100644 charts/thanos/templates/query-frontend-deployment.yaml create mode 100644 charts/thanos/templates/query-frontend-horizontalpodautoscaler.yaml create mode 100644 charts/thanos/templates/query-frontend-ingress.yml create mode 100644 charts/thanos/templates/query-frontend-poddisruptionbudget.yaml create mode 100644 charts/thanos/templates/query-frontend-psp.yaml create mode 100644 charts/thanos/templates/query-frontend-rbac.yaml create mode 100644 charts/thanos/templates/query-frontend-service.yaml create mode 100644 charts/thanos/templates/query-frontend-servicemonitor.yaml create mode 100644 charts/thanos/templates/query-horizontalpodautoscaler.yaml create mode 100644 charts/thanos/templates/query-ingress.yml create mode 100644 charts/thanos/templates/query-poddisruptionbudget.yaml create mode 100644 charts/thanos/templates/query-psp.yaml create mode 100644 charts/thanos/templates/query-rbac.yaml create mode 100644 charts/thanos/templates/query-service.yaml create mode 100644 charts/thanos/templates/query-servicemonitor.yaml create mode 100644 charts/thanos/templates/rule-configmap.yaml create mode 100644 charts/thanos/templates/rule-ingress.yml create mode 100644 charts/thanos/templates/rule-service.yaml create mode 100644 charts/thanos/templates/rule-servicemonitor.yaml create mode 100644 charts/thanos/templates/rule-statefulset.yaml create mode 100644 charts/thanos/templates/secret.yaml create mode 100644 charts/thanos/templates/sidecar-ingress.yaml create mode 100644 charts/thanos/templates/sidecar-service.yaml create mode 100644 charts/thanos/templates/sidecar-servicemonitor.yaml create mode 100644 charts/thanos/templates/store-deployment.yaml create mode 100644 charts/thanos/templates/store-ingress.yaml create mode 100644 charts/thanos/templates/store-persistentvolumeclaim.yaml create mode 100644 charts/thanos/templates/store-service.yaml create mode 100644 charts/thanos/templates/store-servicemonitor.yaml create mode 100644 charts/thanos/values.yaml create mode 100644 salt/metalk8s/addons/prometheus-operator/deployed/thanos-chart.sls diff --git a/CHANGELOG.md b/CHANGELOG.md index 44b832c0d6..8c63a92262 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,6 +48,10 @@ PR[#3420](https://github.com/scality/metalk8s/pull/3420), PR[#3501](https://github.com/scality/metalk8s/pull/3501)) +- Deploy Thanos querier in front of Prometheus in order to make metrics + highly-available when we have multiple Prometheus instances + (PR[#3573](https://github.com/scality/metalk8s/pull/3573)) + ## Release 2.10.5 (in development) ## Release 2.10.4 diff --git a/buildchain/buildchain/constants.py b/buildchain/buildchain/constants.py index 1f7189de1d..66dc0557a7 100644 --- a/buildchain/buildchain/constants.py +++ 
b/buildchain/buildchain/constants.py @@ -30,6 +30,7 @@ PROMETHEUS_ADAPTER_REPOSITORY: str = "docker.io/directxman12" PROMETHEUS_OPERATOR_REPOSITORY: str = "quay.io/prometheus-operator" PROMETHEUS_REPOSITORY: str = "quay.io/prometheus" +THANOS_REPOSITORY: str = "quay.io/thanos" # Paths {{{ diff --git a/buildchain/buildchain/image.py b/buildchain/buildchain/image.py index 936ed545c6..c358978714 100644 --- a/buildchain/buildchain/image.py +++ b/buildchain/buildchain/image.py @@ -222,6 +222,9 @@ def _operator_image(name: str, **kwargs: Any) -> targets.OperatorImage: "node-exporter", "prometheus", ], + constants.THANOS_REPOSITORY: [ + "thanos", + ], } REMOTE_NAMES: Dict[str, str] = { diff --git a/buildchain/buildchain/salt_tree.py b/buildchain/buildchain/salt_tree.py index fa546d9279..a525038c6a 100644 --- a/buildchain/buildchain/salt_tree.py +++ b/buildchain/buildchain/salt_tree.py @@ -357,6 +357,7 @@ def task(self) -> types.TaskDict: Path("salt/metalk8s/addons/prometheus-operator/deployed/namespace.sls"), Path("salt/metalk8s/addons/prometheus-operator/deployed/prometheus-rules.sls"), Path("salt/metalk8s/addons/prometheus-operator/deployed/service-configuration.sls"), + Path("salt/metalk8s/addons/prometheus-operator/deployed/thanos-chart.sls"), Path("salt/metalk8s/addons/ui/deployed/dependencies.sls"), Path("salt/metalk8s/addons/ui/deployed/ingress.sls"), Path("salt/metalk8s/addons/ui/deployed/init.sls"), diff --git a/buildchain/buildchain/versions.py b/buildchain/buildchain/versions.py index 57e7cc0ee3..eb367f9521 100644 --- a/buildchain/buildchain/versions.py +++ b/buildchain/buildchain/versions.py @@ -210,6 +210,11 @@ def _version_prefix(version: str, prefix: str = "v") -> str: version="v0.48.1", digest="sha256:2e7b61c86ee8b0aef4f5da8b6a4e51ecef249c9ccf4a329c5aa0c81e3fd074c1", ), + Image( + name="thanos", + version="v0.23.1", + digest="sha256:2f7d1ddc7877b076efbc3fa626b5003f7f197efbd777cff0eec2b20c2cd68d20", + ), # Local images Image( name="metalk8s-alert-logger", diff --git a/charts/kube-prometheus-stack.yaml b/charts/kube-prometheus-stack.yaml index 7e93b5ece7..9dc1bc905d 100644 --- a/charts/kube-prometheus-stack.yaml +++ b/charts/kube-prometheus-stack.yaml @@ -87,7 +87,13 @@ prometheusOperator: prometheus: + thanosService: + enabled: true + prometheusSpec: + thanos: + image: '__full_image__(thanos)' + image: repository: '__image__(prometheus)' @@ -149,6 +155,10 @@ grafana: image: repository: '__image__(k8s-sidecar)' + datasources: + # Service deployed by Thanos + url: http://thanos-query-http:10902/ + nodeSelector: node-role.kubernetes.io/infra: '' diff --git a/charts/thanos.yaml b/charts/thanos.yaml new file mode 100644 index 0000000000..bfae47459e --- /dev/null +++ b/charts/thanos.yaml @@ -0,0 +1,46 @@ +store: + enabled: false + +queryFrontend: + enabled: false + +compact: + enabled: false + +bucket: + enabled: false + +rule: + enabled: false + +# This one is deployed by Prometheus operator +sidecar: + enabled: false + +image: + repository: '__image__(thanos)' + tag: v0.23.1 + +query: + enabled: true + + replicaLabels: + - prometheus_replica + + storeDNSDiscovery: false + sidecarDNSDiscovery: false + + stores: + # Service deployed by Prometheus operator to expose Thanos sidecars + - dnssrv+_grpc._tcp.prometheus-operator-thanos-discovery + + tolerations: + - key: 'node-role.kubernetes.io/bootstrap' + operator: 'Exists' + effect: 'NoSchedule' + - key: 'node-role.kubernetes.io/infra' + operator: 'Exists' + effect: 'NoSchedule' + + nodeSelector: + node-role.kubernetes.io/infra: '' diff 
--git a/charts/thanos/.helmignore b/charts/thanos/.helmignore new file mode 100644 index 0000000000..f0c1319444 --- /dev/null +++ b/charts/thanos/.helmignore @@ -0,0 +1,21 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*~ +# Various IDEs +.project +.idea/ +*.tmproj diff --git a/charts/thanos/Chart.yaml b/charts/thanos/Chart.yaml new file mode 100644 index 0000000000..acbef6af0f --- /dev/null +++ b/charts/thanos/Chart.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +appVersion: 0.17.1 +description: Thanos is a set of components that can be composed into a highly available + metric system with unlimited storage capacity, which can be added seamlessly on + top of existing Prometheus deployments. +icon: https://raw.githubusercontent.com/thanos-io/thanos/master/docs/img/Thanos-logo_fullmedium.png +keywords: +- thanos +- prometheus +- metrics +maintainers: +- email: info@banzaicloud.com + name: Banzai Cloud +name: thanos +sources: +- https://github.com/thanos-io/thanos +- https://github.com/banzaicloud/banzai-charts/tree/master/thanos +version: 0.4.6 diff --git a/charts/thanos/README.md b/charts/thanos/README.md new file mode 100644 index 0000000000..da33a0fe92 --- /dev/null +++ b/charts/thanos/README.md @@ -0,0 +1,361 @@ +# Thanos Helm chart + +This is a Helm Chart for Thanos. It does not include the required Prometheus and sidecar installation. + +## Thanos + +Thanos is a set of components that can be composed into a highly available metric system with unlimited storage capacity, which can be added seamlessly on top of existing Prometheus deployments. + +Thanos leverages the Prometheus 2.0 storage format to cost-efficiently store historical metric data in any object storage while retaining fast query latencies. Additionally, it provides a global query view across all Prometheus installations and can merge data from Prometheus HA pairs on the fly. + +Concretely the aims of the project are: + +- Global query view of metrics. +- Unlimited retention of metrics. +- High availability of components, including Prometheus. + +## Helm Chart + +This chart is in **Beta** state to provide easy installation via Helm chart. +Things that we are improving in near future: +- [ ] Automatic TLS generation for communicating between in-cluster components +- [ ] Support for tracing configuration +- [ ] Grafana dashboards +- [ ] Informative NOTES.txt + +## Architecture + +This Chart will install a complete [Thanos](https://github.com/improbable-eng/thanos) solution. To understand how Thanos works please read it's official [Architecture design](https://github.com/improbable-eng/thanos/blob/master/docs/design.md). 
+ +## Installing the Chart + +Add Banzai Cloud repository: + +```bash +$ helm repo add banzaicloud-stable https://kubernetes-charts.banzaicloud.com +``` + +## Storage examples + +### Example GCS configuration for `object-store.yaml` +``` +type: GCS +config: + bucket: "thanos" + service_account: |- + { + "type": "service_account", + "project_id": "project", + "private_key_id": "abcdefghijklmnopqrstuvwxyz12345678906666", + "private_key": "-----BEGIN PRIVATE KEY-----\...\n-----END PRIVATE KEY-----\n", + "client_email": "project@thanos.iam.gserviceaccount.com", + "client_id": "123456789012345678901", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/thanos%40gitpods.iam.gserviceaccount.com" + } +``` + +### Example S3 configuration for `object-store.yaml` +This is an example configuration using thanos with S3. Check endpoints here: https://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region + +``` +type: S3 +config: + bucket: "" + endpoint: "" + region: "" + access_key: "" + insecure: false + signature_version2: false + secret_key: "" + put_user_metadata: {} + http_config: + idle_conn_timeout: 0s + response_header_timeout: 0s + insecure_skip_verify: false + trace: + enable: false + part_size: 0 +``` + +### Example Azure configuration for `object-store.yaml` + +``` +type: AZURE +config: + storage_account: "" + storage_account_key: "" + container: "" + endpoint: "" + max_retries: 0 +``` +Create the Service Account and Bucket at Google cloud. + +#### Install the chart: +```bash +helm install banzaicloud-stable/thanos --name thanos -f my-values.yaml --set-file objstoreFile=object-store.yaml + +``` + +Visit the Bucket browser +```bash +kubectl port-forward svc/thanos-bucket 8080 & +open http://localhost:8080 +``` + +## Install prometheus-operator + +Extra configuration for prometheus operator. + +> Note: Prometheus-operator and Thanos **MUST** be in the same namespace. + +```yaml +prometheus: + prometheusSpec: + thanos: + image: quay.io/thanos/thanos:v0.9.0 + version: v0.9.0 + objectStorageConfig: + name: thanos + key: object-store.yaml +``` + +Install prometheus-operator + +```bash +helm install stable/prometheus-operator -f thanos-sidecar.yaml +``` + +# Configuration + +This section describes the values available + +## General +|Name|Description| Default Value| +|----|-----------|--------------| +| image.repository | Thanos image repository and name | 'quay.io/thanos/thanos' **For Thanos version 0.6.0 or older change this to 'improbable/thanos'** | +| image.tag | Thanos image tag | v0.9.0 | +| image.pullPolicy | Image Kubernetes pull policy | IfNotPresent | +| objstore | Configuration for the backend object storage in yaml format. Mutually exclusive with other objstore options. | {} | +| objstoreFile | Configuration for the backend object storage in string format. Mutually exclusive with other objstore options. | "" | +| objstoreSecretOverride | Configuration for the backend object storage in an existing secret. Mutually exclusive with other objstore options. | "" | + +## Common settings for all components + +These setting applicable to nearly all components. 
+ +|Name|Description| Default Value| +|----|-----------|--------------| +| $component.labels | Additional labels to the Pod | {} | +| $component.annotations | Additional annotations to the Pod | {} | +| $component.deploymentLabels | Additional labels to the deployment | {} | +| $component.deploymentAnnotations | Additional annotations to the deployment | {} | +| $component.extraEnv | Add extra environment variables | [] | +| $component.strategy | Kubernetes [deployment update strategy](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy) object | {} | +| $component.updateStrategy | Kubernetes [statefulset update strategy](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#update-strategies) object | {} | +| $component.metrics.annotations.enabled | Prometheus annotation for component | false | +| $component.metrics.serviceMonitor.enabled | Prometheus ServiceMonitor definition for component | false | +| $component.securityContext | SecurityContext for Pod | {} | +| $component.resources | Resource definition for container | {} | +| $component.tolerations | [Node tolerations for server scheduling to nodes with taints](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/) | {} | +| $component.nodeSelector | [Node labels for compact pod assignment](https://kubernetes.io/docs/user-guide/node-selection/) | {} | +| $component.affinity | [Pod affinity](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#node-affinity) | {} | +| $component.grpc.port | grpc listen port number | 10901 | +| $component.grpc.service.annotations | Service definition for grpc service | {} | +| $component.grpc.service.matchLabels | Pod label selector to match grpc service on. | `{}` | +| $component.grpc.ingress.enabled | Set up ingress for the grpc service | false | +| $component.grpc.ingress.defaultBackend | Set up default backend for ingress | false | +| $component.grpc.ingress.annotations | Add annotations to ingress | {} | +| $component.grpc.ingress.labels | Add labels to ingress | {} | +| $component.grpc.ingress.path | Ingress path | "/" | +| $component.grpc.ingress.hosts | Ingress hosts | [] | +| $component.grpc.ingress.tls | Ingress TLS configuration | [] | +| $component.http.port | http listen port number | 10902 | +| $component.http.service.annotations | Service definition for http service | {} | +| $component.http.service.matchLabels | Pod label selector to match http service on. | `{}` | +| $component.http.ingress.enabled | Set up ingress for the http service | false | +| $component.http.ingress.apiVersion | Set API version for ingress | extensions/v1beta1 | +| $component.http.ingress.defaultBackend | Set up default backend for ingress | false | +| $component.http.ingress.annotations | Add annotations to ingress | {} | +| $component.http.ingress.labels | Add labels to ingress | {} | +| $component.http.ingress.path | Ingress path | "/" | +| $component.http.ingress.hosts | Ingress hosts | [] | +| $component.http.ingress.tls | Ingress TLS configuration | [] | + +## Store + +These values are just samples, for more fine-tuning please check the values.yaml. + +|Name|Description| Default Value| +|----|-----------|--------------| +| store.enabled | Enable component | true | +| store.replicaCount | Pod replica count | 1 | +| store.logLevel | Log level | info | +| store.logFormat | Log format to use. Possible options: logfmt or json. | logfmt | +| store.indexCacheSize | Maximum size of items held in the index cache. 
| 250MB | +| store.chunkPoolSize | Maximum size of concurrently allocatable bytes for chunks. | 2GB | +| store.grpcSeriesSampleLimit | Maximum amount of samples returned via a single series call. 0 means no limit. NOTE: for efficiency we take 120 as the number of samples in chunk (it cannot be bigger than that), so the actual number of samples might be lower, even though the maximum could be hit. | 0 | +| store.grpcSeriesMaxConcurrency | Maximum number of concurrent Series calls. | 20 | +| store.syncBlockDuration |Repeat interval for syncing the blocks between local and remote view. | 3m | +| store.blockSyncConcurrency | Number of goroutines to use when syncing blocks from object storage. | 20 | +| store.extraEnv | Add extra environment variables | [] | +| store.extraArgs | Add extra arguments | [] | +| store.serviceAccount | Name of the Kubernetes service account to use | "" | +| store.livenessProbe | Set up liveness probe for store available for Thanos v0.8.0+) | {} | +| store.readinessProbe | Set up readinessProbe for store (available for Thanos v0.8.0+) | {} | +| timePartioning | list of min/max time for store partitions. See more details below. Setting this will create mutlipale thanos store deployments based on the number of items in the list | [{min: "", max: ""}] | +| hashPartioning.shards | The number of shared used to partition the blocks based on the hashmod of the blocks. Can not be used with time partitioning | "" | +| initContainers | InitContainers allows injecting specialized containers that run before app containers. This is meant to pre-configure and tune mounted volume permissions. | [] | + + +### Store time partions +Thanos store supports partition based on time. +Setting time partitions will create n number of store deployment based on the number of items in the list. Each item must contain min and max time for querying in the supported format (see details here See details at https://thanos.io/components/store.md/#time-based-partioning ). +Leaving this empty list ([]) will create a single store for all data. +Example - This will create 3 stores: +```yaml +timePartioning: + # One store for data older than 6 weeks + - min: "" + max: -6w + # One store for data newer than 6 weeks and older than 2 weeks + - min: -6w + max: -2w + # One store for data newer than 2 weeks + - min: -2w + max: "" +``` + + +## Query + +|Name|Description| Default Value| +|----|-----------|--------------| +| query.enabled | Enable component | true | +| query.replicaCount | Pod replica count | 1 | +| query.logLevel | Log level | info | +| query.logFormat | Log format to use. Possible options: logfmt or json. | logfmt | +| query.replicaLabels | Labels to treat as a replica indicator along which data is deduplicated. Still you will be able to query without deduplication using 'dedup=false' parameter. | [] | +| query.autoDownsampling | Enable --query.auto-downsampling option for query. | true | +| query.webRoutePrefix |Prefix for API and UI endpoints. This allows thanos UI to be served on a sub-path. This option is analogous to --web.route-prefix of Promethus. | "" | +| query.webExternalPrefix |Static prefix for all HTML links and redirect URLs in the UI query web interface. Actual endpoints are still served on / or the web.route-prefix. This allows thanos UI to be served behind a reverse proxy that strips a URL sub-path | "" | +| query.webPrefixHeader | Name of HTTP request header used for dynamic prefixing of UI links and redirects. This option is ignored if web.external-prefix argument is set. 
Security risk: enable this option only if a reverse proxy in front of thanos is resetting the header. The --web.prefix-header=X-Forwarded-Prefix option can be useful, for example, if Thanos UI is served via Traefik reverse proxy with PathPrefixStrip option enabled, which sends the stripped prefix value in X-Forwarded-Prefix header. This allows thanos UI to be served on a sub-path | "" | +| query.storeDNSResolver | Custome DNS resolver because of [issue](https://github.com/improbable-eng/thanos/issues/1015) | miekgdns | +| query.storeDNSDiscovery | Enable DNS discovery for stores | true | +| query.sidecarDNSDiscovery | Enable DNS discovery for sidecars (this is for the chart built-in sidecar service) | true | +| query.stores | Addresses of statically configured store API servers (repeatable). The scheme may be prefixed with 'dns+' or 'dnssrv+' to detect store API servers through respective DNS lookups. | [] | +| query.serviceDiscoveryFiles | Path to files that contains addresses of store API servers. The path can be a glob pattern (repeatable). | [] | +| query.serviceDiscoveryFileConfigMaps | Names of configmaps that contain addresses of store API servers, used for file service discovery. | [] | +| query.serviceDiscoveryInterval | Refresh interval to re-read file SD files. It is used as a resync fallback. | 5m | +| query.extraEnv | Add extra environment variables | [] | +| query.extraArgs | Add extra arguments | [] | +| query.podDisruptionBudget.enabled | Enabled and config podDisruptionBudget resource for this component | false | +| query.podDisruptionBudget.minAvailable | Minimum number of available query pods for PodDisruptionBudget | 1 | +| query.podDisruptionBudget.maxUnavailable | Maximum number of unavailable query pods for PodDisruptionBudget | [] | +| query.autoscaling.enabled | Enabled and config horizontalPodAutoscaling resource for this component | false | +| query.autoscaling.minReplicas | If autoscaling enabled, this field sets minimum replica count | 2 | +| query.autoscaling.maxReplicas | If autoscaling enabled, this field sets maximum replica count | 3 | +| query.autoscaling.targetCPUUtilizationPercentage | Target CPU utilization percentage to scale | 50 | +| query.autoscaling.targetMemoryUtilizationPercentage | Target memory utilization percentage to scale 50 | +| query.serviceAccount | Name of the Kubernetes service account to use | "" | +| query.serviceAccountAnnotations | Optional annotations to be added to the ServiceAccount | {} | +| query.psp.enabled | Enable pod security policy, it also requires the `query.rbac.enabled` to be set to `true`. | false | +| query.rbac.enabled | Enable RBAC to use the PSP | false | +| query.livenessProbe | Set up liveness probe for query | {} | +| query.readinessProbe | Set up readinessProbe for query | {} | + +## Rule +|Name|Description| Default Value| +|----|-----------|--------------| +| rule.enabled | Enable component | false | +| rule.logLevel | Log level | info | +| rule.logFormat | Log format to use. Possible options: logfmt or json. | logfmt | +| rule.ruleLabels | Labels to be applied to all generated metrics (repeated). Similar to external labels for Prometheus, used to identify ruler and its blocks as unique source. | {} | +| rule.resendDelay | Minimum amount of time to wait before resending an alert to Alertmanager. | "" | +| rule.evalInterval | The default evaluation interval to use. | "" | +| rule.tsdbBlockDuration | Block duration for TSDB block. | "" | +| rule.tsdbRetention | Block retention time on local disk. 
| "" | +| rule.webRoutePrefix |Prefix for API and UI endpoints. This allows thanos UI to be served on a sub-path. This option is analogous to --web.route-prefix of Promethus. | "" | +| rule.webExternalPrefix |Static prefix for all HTML links and redirect URLs in the UI query web interface. Actual endpoints are still served on / or the web.route-prefix. This allows thanos UI to be served behind a reverse proxy that strips a URL sub-path | "" | +| rule.webPrefixHeader | Name of HTTP request header used for dynamic prefixing of UI links and redirects. This option is ignored if web.external-prefix argument is set. Security risk: enable this option only if a reverse proxy in front of thanos is resetting the header. The --web.prefix-header=X-Forwarded-Prefix option can be useful, for example, if Thanos UI is served via Traefik reverse proxy with PathPrefixStrip option enabled, which sends the stripped prefix value in X-Forwarded-Prefix header. This allows thanos UI to be served on a sub-path | "" | +| rule.queryDNSDiscovery | Enable DNS discovery for query insances | true | +| rule.alertmanagers | # Alertmanager replica URLs to push firing alerts. Ruler claims success if push to at least one alertmanager from discovered succeeds. The scheme may be prefixed with 'dns+' or 'dnssrv+' to detect Alertmanager IPs through respective DNS lookups. The port defaults to 9093 or the SRV record's value. The URL path is used as a prefix for the regular Alertmanager API path. | []] | +| rule.alertmanagersSendTimeout | Timeout for sending alerts to alertmanagert | "" | +| rule.alertQueryUrl |The external Thanos Query URL that would be set in all alerts 'Source' field | "" | +| rule.alertLabelDrop | Labels by name to drop before sending to alertmanager. This allows alert to be deduplicated on replica label (repeated). Similar Prometheus alert relabelling | [] | +| rule.ruleOverrideName | Override rules file with custom configmap | "" | +| rule.ruleFiles | See example in values.yaml | {}" | +| rule.persistentVolumeClaim | Create the specified persistentVolumeClaim in case persistentVolumeClaim is used for the dataVolume.backend above and needs to be created. | {} | + +## Compact + +|Name|Description| Default Value| +|----|-----------|--------------| +| compact.enabled | Enable component | true | +| compact.replicaCount | Pod replica count | 1 | +| compact.logLevel | Log level | info | +| compact.logFormat | Log format to use. Possible options: logfmt or json. | logfmt | +| compact.serviceAccount | Name of the Kubernetes service account to use | "" | +| compact.consistencyDelay | Minimum age of fresh (non-compacted) blocks before they are being processed. Malformed blocks older than the maximum of consistency-delay and 30m0s will be removed.| 30m | +| compact.retentionResolutionRaw | How long to retain raw samples in bucket. 0d - disables this retention | 30d | +| compact.retentionResolution5m | How long to retain samples of resolution 1 (5 minutes) in bucket. 0d - disables this retention | 120d | +| compact.retentionResolution1h | How long to retain samples of resolution 2 (1 hour) in bucket. 0d - disables this retention | 1y | +| compact.blockSyncConcurrency | Number of goroutines to use when syncing block metadata from object storage. | 20 | +| compact.compactConcurrency | Number of goroutines to use when compacting groups. | 1 | +| compact.dataVolume.backend | Data volume for the compactor to store temporary data defaults to emptyDir. 
| {} | +| compact.persistentVolumeClaim | Create the specified persistentVolumeClaim in case persistentVolumeClaim is used for the dataVolume.backend above and needs to be created. | {} | + +## Bucket + +|Name|Description| Default Value| +|----|-----------|--------------| +| bucket.enabled | Enable component | true | +| bucket.replicaCount | Pod replica count | 1 | +| bucket.logLevel | Log level | info | +| bucket.logFormat | Log format to use. Possible options: logfmt or json. | logfmt | +| bucket.refresh | Refresh interval to download metadata from remote storage | 30m | +| bucket.timeout | Timeout to download metadata from remote storage | 5m | +| bucket.label | Prometheus label to use as timeline title | "" | +| bucket.http.port | Listening port for bucket web | 8080 | +| bucket.serviceAccount | Name of the Kubernetes service account to use | "" | +| bucket.podDisruptionBudget.enabled | Enabled and config podDisruptionBudget resource for this component | false | +| bucket.podDisruptionBudget.minAvailable | Minimum number of available query pods for PodDisruptionBudget | 1 | +| bucket.podDisruptionBudget.maxUnavailable | Maximum number of unavailable query pods for PodDisruptionBudget | [] | + +## Sidecar + +|Name|Description| Default Value| +|----|-----------|--------------| +| sidecar.enabled | NOTE: This is only the service references for the sidecar. | true | +| sidecar.selector | Pod label selector to match sidecar services on. | `{"app": "prometheus"}` | + +## Query Frontend + +|Name|Description| Default Value| +|----|-----------|--------------| +| queryFrontend.enabled | Enable component | false | +| queryFrontend.replicaCount | Pod replica count | 1 | +| queryFrontend.logLevel | Log level | info | +| queryFrontend.logFormat | Log format to use. Possible options: logfmt or json. | logfmt | +| queryFrontend.downstreamUrl | URL of downstream Prometheus Query compatible API. | | +| queryFrontend.compressResponses | Compress HTTP responses. | `true` | +| queryFrontend.logQueriesLongerThan | Log queries that are slower than the specified duration. | `0` (disabled) | +| queryFrontend.cacheCompressionType | Use compression in results cache. Supported values are: `snappy` and `` (disable compression). | `` | +| queryFrontend.queryRange.alignRangeWithStep | See https://thanos.io/tip/components/query-frontend.md/#flags | `false` | +| queryFrontend.queryRange.splitInterval | See https://thanos.io/tip/components/query-frontend.md/#flags | `24h` | +| queryFrontend.queryRange.maxRetriesPerRequest | See https://thanos.io/tip/components/query-frontend.md/#flags | `5` | +| queryFrontend.queryRange.maxQueryLength | See https://thanos.io/tip/components/query-frontend.md/#flags | `0` | +| queryFrontend.queryRange.maxQueryParallelism | See https://thanos.io/tip/components/query-frontend.md/#flags | `14` | +| queryFrontend.queryRange.responseCacheMaxFreshness | See https://thanos.io/tip/components/query-frontend.md/#flag | `1m` | +| queryFrontend.queryRange.noPartialResponse | See https://thanos.io/tip/components/query-frontend.md/#flags | `false` | +| queryFrontend.cache.inMemory | Use inMemory cache? | `false` | +| queryFrontend.cache.maxSize | Maximum Size of the cache. Use either this or `maxSizeItems`. | `` | +| queryFrontend.cache.maxSizeItems | Maximum number of items in the cache. Use either this or `maxSize`. 
| `` | +| queryFrontend.cache.validity | | `` | +| queryFrontend.log.request.decision | Request Logging for logging the start and end of requests | `LogFinishCall` | +| queryFrontend.serviceAccountAnnotations | Optional annotations to be added to the ServiceAccount | {} | + +## Contributing +Contributions are very welcome! diff --git a/charts/thanos/requirements.yaml b/charts/thanos/requirements.yaml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/charts/thanos/templates/NOTES.txt b/charts/thanos/templates/NOTES.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/charts/thanos/templates/_helpers.tpl b/charts/thanos/templates/_helpers.tpl new file mode 100644 index 0000000000..054178b411 --- /dev/null +++ b/charts/thanos/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "thanos.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "thanos.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "thanos.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + + +{{/* +Create a default fully qualified component name from the full app name and a component name. +We truncate the full name at 63 - 1 (last dash) - len(component name) chars because some Kubernetes name fields are limited to this (by the DNS naming spec) +and we want to make sure that the component is included in the name. +*/}} +{{- define "thanos.componentname" -}} +{{- $global := index . 0 -}} +{{- $component := index . 1 | trimPrefix "-" -}} +{{- printf "%s-%s" (include "thanos.fullname" $global | trunc (sub 62 (len $component) | int) | trimSuffix "-" ) $component | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default name for service account if not provided. 
+*/}} +{{- define "thanos.query.serviceaccount" -}} +{{- if .Values.query.rbac.enabled -}} +{{- default (include "thanos.componentname" (list $ "query")) .Values.query.serviceAccount -}} +{{- else -}} +{{- printf "%s" .Values.query.serviceAccount -}} +{{- end -}} +{{- end -}} +{{- define "thanos.queryFrontend.serviceaccount" -}} +{{- if .Values.queryFrontend.rbac.enabled -}} +{{- default (include "thanos.componentname" (list $ "query-frontend")) .Values.queryFrontend.serviceAccount -}} +{{- else -}} +{{- printf "%s" .Values.queryFrontend.serviceAccount -}} +{{- end -}} +{{- end -}} \ No newline at end of file diff --git a/charts/thanos/templates/bucket-deployment.yaml b/charts/thanos/templates/bucket-deployment.yaml new file mode 100644 index 0000000000..a9f1b4a378 --- /dev/null +++ b/charts/thanos/templates/bucket-deployment.yaml @@ -0,0 +1,98 @@ +{{ if .Values.bucket.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "thanos.componentname" (list $ "bucket") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: bucket +{{ with .Values.bucket.deploymentLabels }}{{ toYaml . | indent 4 }}{{ end -}} + {{- with .Values.bucket.deploymentAnnotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + replicas: {{ .Values.bucket.replicaCount | default 1 }} + {{- with .Values.bucket.strategy }} + strategy: {{ toYaml . | nindent 4 }} + {{- end }} + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: bucket +{{ with .Values.bucket.deploymentMatchLabels }}{{ toYaml . | indent 6 }}{{ end }} + template: + metadata: + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: bucket +{{ with .Values.bucket.labels }}{{ toYaml . | indent 8 }}{{ end }} + {{- with .Values.bucket.annotations }} + annotations: {{ toYaml . | nindent 8 }} + {{- end }} + spec: + containers: + - name: thanos-bucket + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- with .Values.bucket.extraEnv }} + env: {{ toYaml . | nindent 8 }} + {{- end }} + args: + - "tools" + - "bucket" + - "web" + - "--log.level={{ .Values.bucket.logLevel }}" + - "--log.format={{ .Values.bucket.logFormat }}" + - "--http-address=0.0.0.0:{{ .Values.bucket.http.port }}" + - "--objstore.config-file=/etc/config/object-store.yaml" + {{- if .Values.bucket.refresh }} + - "--refresh={{ .Values.bucket.refresh }}" + {{- end }} + {{- if .Values.bucket.timeout }} + - "--timeout={{ .Values.bucket.timeout }}" + {{- end }} + {{- if .Values.bucket.label }} + - "--label={{ .Values.bucket.label }}" + {{- end }} + {{ with .Values.bucket.extraArgs }}{{ toYaml . | nindent 10 }}{{- end }} + ports: + - name: http + containerPort: {{ .Values.bucket.http.port }} + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + resources: {{ toYaml .Values.bucket.resources | nindent 10 }} + volumes: + - name: config-volume + secret: + {{- if .Values.objstoreSecretOverride }} + secretName: "{{ .Values.objstoreSecretOverride }}" + {{- else }} + secretName: {{ include "thanos.fullname" . 
}} + {{- end }} + {{- with .Values.bucket.securityContext }} + securityContext: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.bucket.nodeSelector }} + nodeSelector: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.bucket.affinity }} + affinity: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.bucket.tolerations }} + tolerations: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.bucket.serviceAccount }} + serviceAccountName: "{{ . }}" + {{- end }} + {{- if .Values.priorityClassName }} + priorityClassName: {{ .Values.priorityClassName }} + {{- end }} +{{ end }} diff --git a/charts/thanos/templates/bucket-ingress.yaml b/charts/thanos/templates/bucket-ingress.yaml new file mode 100644 index 0000000000..5bf8d7f6ee --- /dev/null +++ b/charts/thanos/templates/bucket-ingress.yaml @@ -0,0 +1,50 @@ +{{ if and .Values.bucket.enabled .Values.bucket.http.ingress.enabled }} +apiVersion: {{ .Values.bucket.http.ingress.apiVersion }} +kind: Ingress +metadata: + name: {{ include "thanos.componentname" (list $ "bucket") }} + {{- with .Values.bucket.http.ingress.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: bucket + {{- if .Values.bucket.http.ingress.labels }} +{{ toYaml .Values.bucket.http.ingress.labels | indent 4 }} + {{- end }} +spec: + {{- if .Values.bucket.http.ingress.defaultBackend }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "bucket") }} + port: + number: {{ $.Values.bucket.http.port }} + {{- end }} + {{- if .Values.bucket.http.ingress.tls }} + tls: + {{- range .Values.bucket.http.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.bucket.http.ingress.hosts }} + - host: {{ . }} + http: + paths: + - path: {{ $.Values.bucket.http.ingress.path }} + pathType: {{ $.Values.bucket.http.ingress.pathType }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "bucket") }} + port: + number: {{ $.Values.bucket.http.port }} + {{- end }} +{{ end }} diff --git a/charts/thanos/templates/bucket-poddisruptionbudget.yaml b/charts/thanos/templates/bucket-poddisruptionbudget.yaml new file mode 100644 index 0000000000..56e1bf106e --- /dev/null +++ b/charts/thanos/templates/bucket-poddisruptionbudget.yaml @@ -0,0 +1,25 @@ +{{- if and .Values.bucket.enabled .Values.bucket.podDisruptionBudget.enabled }} +apiVersion: policy/v1beta1 +kind: PodDisruptionBudget +metadata: + name: {{ include "thanos.componentname" (list $ "bucket") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: bucket +{{ with .Values.bucket.deploymentLabels }}{{ toYaml . 
| indent 4 }}{{ end }} +spec: + {{- if .Values.bucket.podDisruptionBudget.minAvailable }} + minAvailable: {{ .Values.bucket.podDisruptionBudget.minAvailable }} + {{- end }} + {{- if .Values.bucket.podDisruptionBudget.maxUnavailable }} + maxUnavailable: {{ .Values.bucket.podDisruptionBudget.maxUnavailable }} + {{- end }} + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/component: bucket +{{- end }} \ No newline at end of file diff --git a/charts/thanos/templates/bucket-service.yaml b/charts/thanos/templates/bucket-service.yaml new file mode 100644 index 0000000000..542fade40a --- /dev/null +++ b/charts/thanos/templates/bucket-service.yaml @@ -0,0 +1,28 @@ +{{ if .Values.bucket.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "thanos.componentname" (list $ "bucket") }} + {{- with .Values.bucket.http.service.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: bucket +{{ with .Values.bucket.http.service.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + ports: + - port: {{ .Values.bucket.http.port }} + protocol: TCP + targetPort: http + name: http + selector: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: bucket +{{ with .Values.bucket.http.service.matchLabels }}{{ toYaml . | indent 4 }}{{ end }} +{{ end }} diff --git a/charts/thanos/templates/compact-deployment.yaml b/charts/thanos/templates/compact-deployment.yaml new file mode 100644 index 0000000000..ce86cbf554 --- /dev/null +++ b/charts/thanos/templates/compact-deployment.yaml @@ -0,0 +1,108 @@ +{{ if .Values.compact.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "thanos.componentname" (list $ "compact") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: compact +{{ with .Values.compact.deploymentLabels }}{{ toYaml . | indent 4 }}{{ end -}} + {{- with .Values.compact.deploymentAnnotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + replicas: {{ .Values.compact.replicaCount | default 1 }} + {{- with .Values.compact.strategy }} + strategy: {{ toYaml . | nindent 4 }} + {{- end }} + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: compact +{{ with .Values.compact.deploymentMatchLabels }}{{ toYaml . | indent 6 }}{{ end }} + template: + metadata: + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: compact +{{ with .Values.compact.labels }}{{ toYaml . | indent 8 }}{{ end }} + {{- if or .Values.compact.annotations .Values.compact.metrics.annotations.enabled }} + annotations: + {{- end }} + {{- with .Values.compact.annotations }}{{ toYaml . 
| nindent 8 }}{{- end }} + {{- if .Values.compact.metrics.annotations.enabled }} + prometheus.io/scrape: "true" + prometheus.io/port: "{{ .Values.compact.http.port }}" + {{- end }} + spec: + containers: + - name: thanos-compact + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- with .Values.compact.extraEnv }} + env: {{ toYaml . | nindent 8 }} + {{- end }} + args: + - "compact" + - "--log.level={{ .Values.compact.logLevel }}" + - "--log.format={{ .Values.compact.logFormat }}" + - "--http-address=0.0.0.0:{{ .Values.compact.http.port }}" + - "--objstore.config-file=/etc/config/object-store.yaml" + - "--data-dir=/var/thanos/compact" + - "--consistency-delay={{ .Values.compact.consistencyDelay }}" + - "--retention.resolution-raw={{ .Values.compact.retentionResolutionRaw }}" + - "--retention.resolution-5m={{ .Values.compact.retentionResolution5m }}" + - "--retention.resolution-1h={{ .Values.compact.retentionResolution1h }}" + - "--block-sync-concurrency={{ .Values.compact.blockSyncConcurrency }}" + - "--compact.concurrency={{ .Values.compact.compactConcurrency }}" + - "--wait" +{{ with .Values.compact.extraArgs }}{{ toYaml . | indent 8 }}{{- end }} + ports: + - name: http + containerPort: {{ .Values.compact.http.port }} + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + - name: data-volume + mountPath: /var/thanos/compact + resources: {{ toYaml .Values.compact.resources | nindent 10 }} + volumes: + - name: data-volume + {{- if .Values.compact.dataVolume.backend }} + {{ toYaml .Values.compact.dataVolume.backend | nindent 8 }} + {{- else }} + emptyDir: {} + {{- end }} + - name: config-volume + secret: + {{- if .Values.objstoreSecretOverride }} + secretName: "{{ .Values.objstoreSecretOverride }}" + {{- else }} + secretName: {{ include "thanos.fullname" . }} + {{- end }} + {{- with .Values.compact.securityContext }} + securityContext: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.compact.nodeSelector }} + nodeSelector: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.compact.affinity }} + affinity: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.compact.tolerations }} + tolerations: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.compact.serviceAccount }} + serviceAccountName: "{{ . }}" + {{- end }} + {{- if .Values.priorityClassName }} + priorityClassName: {{ .Values.priorityClassName }} + {{- end }} +{{- end }} diff --git a/charts/thanos/templates/compact-persistentvolumeclaim.yaml b/charts/thanos/templates/compact-persistentvolumeclaim.yaml new file mode 100644 index 0000000000..a0fad02364 --- /dev/null +++ b/charts/thanos/templates/compact-persistentvolumeclaim.yaml @@ -0,0 +1,20 @@ +{{- if and .Values.compact.enabled .Values.compact.persistentVolumeClaim }} +{{- $pvc := .Values.compact.persistentVolumeClaim -}} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ $pvc.name }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: compact +{{ with .Values.compact.deploymentLabels }}{{ toYaml . | indent 4 }}{{ end -}} + {{- with .Values.compact.deploymentAnnotations }} + annotations: {{ toYaml . 
| nindent 4 }} + {{- end }} +spec: + {{- toYaml $pvc.spec | nindent 2 }} +{{- end }} diff --git a/charts/thanos/templates/compact-service.yaml b/charts/thanos/templates/compact-service.yaml new file mode 100644 index 0000000000..bb3d5b8c86 --- /dev/null +++ b/charts/thanos/templates/compact-service.yaml @@ -0,0 +1,28 @@ +{{ if .Values.compact.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "thanos.componentname" (list $ "compact") }} + {{- with .Values.compact.http.service.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: compact +{{ with .Values.compact.http.service.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + ports: + - port: {{ .Values.compact.http.port }} + protocol: TCP + targetPort: http + name: http + selector: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: compact +{{ with .Values.compact.http.service.matchLabels }}{{ toYaml . | indent 4 }}{{ end }} +{{ end}} diff --git a/charts/thanos/templates/compact-servicemonitor.yaml b/charts/thanos/templates/compact-servicemonitor.yaml new file mode 100644 index 0000000000..82248ceb5b --- /dev/null +++ b/charts/thanos/templates/compact-servicemonitor.yaml @@ -0,0 +1,30 @@ +{{- if and .Values.compact.enabled .Values.compact.metrics.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "thanos.componentname" (list $ "compact") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: compact +{{ with .Values.compact.metrics.serviceMonitor.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + jobLabel: thanos-compact + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: compact + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + endpoints: + - port: http + interval: {{ .Values.compact.metrics.serviceMonitor.interval | default "15s" }} + {{- with .Values.compact.metrics.serviceMonitor.relabellings }} + metricRelabelings: {{ toYaml . | nindent 8 }} + {{- end }} +{{- end -}} diff --git a/charts/thanos/templates/query-deployment.yaml b/charts/thanos/templates/query-deployment.yaml new file mode 100644 index 0000000000..2ce69dbb1a --- /dev/null +++ b/charts/thanos/templates/query-deployment.yaml @@ -0,0 +1,159 @@ +{{ if .Values.query.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "thanos.componentname" (list $ "query") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query +{{ with .Values.query.deploymentLabels }}{{ toYaml . 
| indent 4 }}{{ end }} + {{- with .Values.query.deploymentAnnotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if not .Values.query.autoscaling.enabled }} + replicas: {{ .Values.query.replicaCount | default 1 }} + {{- end }} + {{- with .Values.query.strategy }} + strategy: {{ toYaml . | nindent 4 }} + {{- end }} + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: query +{{ with .Values.query.deploymentMatchLabels }}{{ toYaml . | indent 6 }}{{ end }} + template: + metadata: + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: query +{{ with .Values.query.labels }}{{ toYaml . | indent 8 }}{{ end }} + {{- if or .Values.query.annotations .Values.query.metrics.annotations.enabled }} + annotations: + {{- end }} + {{- with .Values.query.annotations }}{{ toYaml . | nindent 8 }}{{- end }} + {{- if .Values.query.metrics.annotations.enabled }} + prometheus.io/scrape: "true" + prometheus.io/port: "{{ .Values.query.http.port }}" + {{- end }} + spec: + containers: + - name: thanos-query + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- with .Values.query.extraEnv }} + env: {{ toYaml . | nindent 8 }} + {{- end }} + args: + - "query" + - "--log.level={{ .Values.query.logLevel }}" + - "--log.format={{ .Values.query.logFormat }}" + - "--grpc-address=0.0.0.0:{{ .Values.query.grpc.port }}" + - "--http-address=0.0.0.0:{{ .Values.query.http.port }}" + {{- range .Values.query.replicaLabels }} + - "--query.replica-label={{ . }}" + {{- end }} + {{- if .Values.query.autoDownsampling }} + - "--query.auto-downsampling" + {{- end }} + {{- if .Values.query.webRoutePrefix }} + - "--web.route-prefix={{ .Values.query.webRoutePrefix }}" + {{- end }} + {{- if .Values.query.webExternalPrefix }} + - "--web.external-prefix={{ .Values.query.webExternalPrefix }}" + {{- end }} + {{- if .Values.query.webPrefixHeader }} + - "--web.prefix-header={{ .Values.query.webPrefixHeader }}" + {{- end }} + {{- if .Values.query.storeDNSResolver }} + - "--store.sd-dns-resolver={{ .Values.query.storeDNSResolver }}" + {{- end }} + {{- if .Values.query.storeDNSDiscovery }} + - "--store=dnssrv+_grpc._tcp.{{ include "thanos.componentname" (list $ "store") }}-grpc.{{ .Release.Namespace }}.svc.cluster.local" + {{- end }} + {{- if .Values.query.sidecarDNSDiscovery }} + - "--store=dnssrv+_grpc._tcp.{{ include "thanos.componentname" (list $ "sidecar") }}-grpc.{{ .Release.Namespace }}.svc.cluster.local" + {{- end }} + {{- if .Values.query.ruleDNSDiscovery }} + - "--store=dnssrv+_grpc._tcp.{{ include "thanos.componentname" (list $ "rule") }}-grpc.{{ .Release.Namespace }}.svc.cluster.local" + {{- end }} + {{- range .Values.query.stores }} + - "--store={{ . }}" + {{- end }} + {{- range .Values.query.serviceDiscoveryFiles }} + - "--store.sd-files={{ . }}" + {{- end }} + {{- range .Values.query.serviceDiscoveryFileConfigMaps }} + - "--store.sd-files=/etc/query/{{ . }}/*.yaml" + - "--store.sd-files=/etc/query/{{ . }}/*.yml" + - "--store.sd-files=/etc/query/{{ . 
}}/*.json" + {{- end }} + {{- if .Values.query.serviceDiscoveryInterval }} + - "--store.sd-interval={{ .Values.query.serviceDiscoveryInterval }}" + {{- end }} + {{- if .Values.query.extraArgs }} + {{ toYaml .Values.query.extraArgs | nindent 8 }} + {{- end }} + ports: + - name: http + containerPort: {{ .Values.query.http.port}} + - name: grpc + containerPort: {{ .Values.query.grpc.port }} + resources: + {{ toYaml .Values.query.resources | nindent 10 }} + volumeMounts: + {{- range .Values.query.serviceDiscoveryFileConfigMaps }} + - mountPath: /etc/query/{{ . }} + name: {{ . }} + {{- end }} + {{- if .Values.query.certSecretName }} + - mountPath: /etc/certs + name: {{ .Values.query.certSecretName }} + readOnly: true + {{- end }} + {{- if .Values.query.livenessProbe }} + livenessProbe: + {{ toYaml .Values.query.livenessProbe | nindent 10 }} + {{- end }} + {{- if .Values.query.readinessProbe }} + readinessProbe: + {{ toYaml .Values.query.readinessProbe | nindent 10 }} + {{- end }} + volumes: + {{- range .Values.query.serviceDiscoveryFileConfigMaps }} + - name: {{ . }} + configMap: + defaultMode: 420 + name: {{ . }} + {{- end }} + {{- if .Values.query.certSecretName }} + - name: {{ .Values.query.certSecretName }} + secret: + defaultMode: 420 + secretName: {{ .Values.query.certSecretName }} + {{- end }} + {{- with .Values.query.securityContext }} + securityContext: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.query.nodeSelector }} + nodeSelector: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.query.affinity }} + affinity: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.query.tolerations }} + tolerations: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with (include "thanos.query.serviceaccount" . ) }} + serviceAccountName: "{{ . }}" + {{- end }} + {{- if .Values.priorityClassName }} + priorityClassName: {{ .Values.priorityClassName }} + {{- end }} +{{ end }} diff --git a/charts/thanos/templates/query-frontend-deployment.yaml b/charts/thanos/templates/query-frontend-deployment.yaml new file mode 100644 index 0000000000..ea311a9592 --- /dev/null +++ b/charts/thanos/templates/query-frontend-deployment.yaml @@ -0,0 +1,186 @@ +{{ if .Values.queryFrontend.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "thanos.componentname" (list $ "query-frontend") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query-frontend +{{ with .Values.queryFrontend.deploymentLabels }}{{ toYaml . | indent 4 }}{{ end }} + {{- with .Values.queryFrontend.deploymentAnnotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if not .Values.queryFrontend.autoscaling.enabled }} + replicas: {{ .Values.queryFrontend.replicaCount | default 1 }} + {{- end }} + {{- with .Values.queryFrontend.strategy }} + strategy: {{ toYaml . | nindent 4 }} + {{- end }} + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: query-frontend +{{ with .Values.queryFrontend.deploymentMatchLabels }}{{ toYaml . | indent 6 }}{{ end }} + template: + metadata: + labels: + app.kubernetes.io/name: {{ include "thanos.name" . 
}} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: query-frontend +{{ with .Values.queryFrontend.labels }}{{ toYaml . | indent 8 }}{{ end }} + {{- if or .Values.queryFrontend.annotations .Values.queryFrontend.metrics.annotations.enabled }} + annotations: + {{- end }} + {{- with .Values.queryFrontend.annotations }}{{ toYaml . | nindent 8 }}{{- end }} + {{- if .Values.queryFrontend.metrics.annotations.enabled }} + prometheus.io/scrape: "true" + prometheus.io/port: "{{ .Values.queryFrontend.http.port }}" + {{- end }} + spec: + containers: + - name: thanos-query-frontend + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- with .Values.queryFrontend.extraEnv }} + env: {{ toYaml . | nindent 8 }} + {{- end }} + args: + - "query-frontend" + - "--log.level={{ .Values.queryFrontend.log.level }}" + - "--log.format={{ .Values.queryFrontend.log.format }}" + {{- if .Values.queryFrontend.log.request.decision }} + - "--log.request.decision={{ .Values.queryFrontend.log.request.decision }}" + {{- end }} + - "--http-address=0.0.0.0:{{ .Values.queryFrontend.http.port }}" + {{- if .Values.queryFrontend.queryRange.splitInterval }} + - "--query-range.split-interval={{ .Values.queryFrontend.queryRange.splitInterval }}" + {{- end }} + {{- if .Values.queryFrontend.queryRange.maxRetriesPerRequest }} + - "--query-range.max-retries-per-request={{ .Values.queryFrontend.queryRange.maxRetriesPerRequest }}" + {{- end }} + {{- if .Values.queryFrontend.queryRange.maxQueryLength }} + - "--query-range.max-query-length={{ .Values.queryFrontend.queryRange.maxQueryLength }}" + {{- end }} + {{- if .Values.queryFrontend.queryRange.maxQueryParallelism }} + - "--query-range.max-query-parallelism={{ .Values.queryFrontend.queryRange.maxQueryParallelism }}" + {{- end }} + {{- if .Values.queryFrontend.queryRange.responseCacheMaxFreshness }} + - "--query-range.response-cache-max-freshness={{ .Values.queryFrontend.queryRange.responseCacheMaxFreshness }}" + {{- end }} + {{- if .Values.queryFrontend.queryRange.noPartialResponse }} + - "--no-query-range.partial-response" + {{- end }} + {{- if .Values.queryFrontend.downstreamUrl }} + - "--query-frontend.downstream-url={{ .Values.queryFrontend.downstreamUrl }}" + {{- else }} + - "--query-frontend.downstream-url=http://{{ include "thanos.componentname" (list $ "query") }}-http:{{ .Values.query.http.port }}" + {{- end }} + {{- if .Values.queryFrontend.compressResponses }} + - "--query-frontend.compress-responses" + {{- end }} + {{- if .Values.queryFrontend.logQueriesLongerThan }} + - "--query-frontend.log-queries-longer-than={{ .Values.queryFrontend.logQueriesLongerThan }}" + {{- end }} + {{- if .Values.queryFrontend.queryRange.cache.inMemory }} + - |- + --query-range.response-cache-config="type": "IN-MEMORY" + "config": + {{- if .Values.queryFrontend.queryRange.cache.maxSize }} + "max_size": "{{ .Values.queryFrontend.queryRange.cache.maxSize }}" + {{- end }} + {{- if .Values.queryFrontend.queryRange.cache.maxSizeItems }} + "max_size_items": {{ .Values.queryFrontend.queryRange.cache.maxSizeItems }} + {{- end }} + {{- if .Values.queryFrontend.queryRange.cache.validity }} + "validity": "{{ .Values.queryFrontend.queryRange.cache.validity }}" + {{- end }} + {{- end }} + {{- if .Values.queryFrontend.qflabels.splitInterval }} + - "--labels.split-interval={{ .Values.queryFrontend.qflabels.splitInterval }}" + {{- end }} + {{- if .Values.queryFrontend.qflabels.maxRetriesPerRequest }} + - 
"--labels.max-retries-per-request={{ .Values.queryFrontend.qflabels.maxRetriesPerRequest }}" + {{- end }} + {{- if .Values.queryFrontend.qflabels.maxQueryParallelism }} + - "--labels.max-query-parallelism={{ .Values.queryFrontend.qflabels.maxQueryParallelism }}" + {{- end }} + {{- if .Values.queryFrontend.qflabels.responseCacheMaxFreshness }} + - "--labels.response-cache-max-freshness={{ .Values.queryFrontend.qflabels.responseCacheMaxFreshness }}" + {{- end }} + {{- if .Values.queryFrontend.qflabels.noPartialResponse }} + - "--no-labels.partial-response" + {{- end }} + {{- if .Values.queryFrontend.qflabels.cache.inMemory }} + - |- + --labels.response-cache-config="type": "IN-MEMORY" + "config": + {{- if .Values.queryFrontend.qflabels.cache.maxSize }} + "max_size": "{{ .Values.queryFrontend.qflabels.cache.maxSize }}" + {{- end }} + {{- if .Values.queryFrontend.qflabels.cache.maxSizeItems }} + "max_size_items": {{ .Values.queryFrontend.qflabels.cache.maxSizeItems }} + {{- end }} + {{- if .Values.queryFrontend.qflabels.cache.validity }} + "validity": "{{ .Values.queryFrontend.qflabels.cache.validity }}" + {{- end }} + {{- end }} + {{- if .Values.queryFrontend.extraArgs }} + {{ toYaml .Values.queryFrontend.extraArgs | nindent 8 }} + {{- end }} + ports: + - name: http + containerPort: {{ .Values.queryFrontend.http.port}} + - name: grpc + containerPort: {{ .Values.queryFrontend.grpc.port }} + resources: + {{ toYaml .Values.queryFrontend.resources | nindent 10 }} + volumeMounts: + {{- range .Values.queryFrontend.serviceDiscoveryFileConfigMaps }} + - mountPath: /etc/query-frontend/{{ . }} + name: {{ . }} + {{- end }} + {{- if .Values.queryFrontend.certSecretName }} + - mountPath: /etc/certs + name: {{ .Values.queryFrontend.certSecretName }} + readOnly: true + {{- end }} + livenessProbe: + httpGet: + path: /-/healthy + port: http + readinessProbe: + httpGet: + path: /-/ready + port: http + volumes: + {{- range .Values.queryFrontend.serviceDiscoveryFileConfigMaps }} + - name: {{ . }} + configMap: + defaultMode: 420 + name: {{ . }} + {{- end }} + {{- if .Values.queryFrontend.certSecretName }} + - name: {{ .Values.queryFrontend.certSecretName }} + secret: + defaultMode: 420 + secretName: {{ .Values.queryFrontend.certSecretName }} + {{- end }} + {{- with .Values.queryFrontend.securityContext }} + securityContext: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.queryFrontend.nodeSelector }} + nodeSelector: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.queryFrontend.affinity }} + affinity: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.queryFrontend.tolerations }} + tolerations: {{ toYaml . | nindent 8 }} + {{- end }} +{{ end }} diff --git a/charts/thanos/templates/query-frontend-horizontalpodautoscaler.yaml b/charts/thanos/templates/query-frontend-horizontalpodautoscaler.yaml new file mode 100644 index 0000000000..82d1ba650e --- /dev/null +++ b/charts/thanos/templates/query-frontend-horizontalpodautoscaler.yaml @@ -0,0 +1,35 @@ +{{- if .Values.queryFrontend.enabled }} +{{- if .Values.queryFrontend.autoscaling.enabled }} +apiVersion: autoscaling/v2beta1 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "thanos.componentname" (list $ "query-frontend") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . 
}} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query-frontend +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "thanos.componentname" (list $ "query-frontend") }} + minReplicas: {{ .Values.queryFrontend.autoscaling.minReplicas }} + maxReplicas: {{ .Values.queryFrontend.autoscaling.maxReplicas }} + metrics: +{{- with .Values.queryFrontend.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + targetAverageUtilization: {{ . }} +{{- end }} +{{- with .Values.queryFrontend.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + targetAverageUtilization: {{ . }} +{{- end }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/thanos/templates/query-frontend-ingress.yml b/charts/thanos/templates/query-frontend-ingress.yml new file mode 100644 index 0000000000..0fb286ce91 --- /dev/null +++ b/charts/thanos/templates/query-frontend-ingress.yml @@ -0,0 +1,107 @@ +--- +{{- if and .Values.queryFrontend.enabled .Values.queryFrontend.http.ingress.enabled }} +apiVersion: {{ .Values.queryFrontend.http.ingress.apiVersion }} +kind: Ingress +metadata: + name: {{ include "thanos.componentname" (list $ "query-frontend") }}-http + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query-frontend +{{- if .Values.queryFrontend.http.ingress.labels }} +{{ toYaml .Values.queryFrontend.http.ingress.labels | indent 4 }} +{{- end }} + {{- with .Values.queryFrontend.http.ingress.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.queryFrontend.http.ingress.defaultBackend }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "query-frontend") }}-http + port: + number: {{ $.Values.queryFrontend.http.port }} + {{- end }} + {{- if .Values.queryFrontend.http.ingress.tls }} + tls: + {{- range .Values.queryFrontend.http.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + {{- if .secretName }} + secretName: {{ .secretName }} + {{- end}} + {{- end }} + {{- end }} + rules: + {{- range .Values.queryFrontend.http.ingress.hosts }} + - host: {{ . }} + http: + paths: + - path: {{ $.Values.queryFrontend.http.ingress.path }} + pathType: {{ $.Values.queryFrontend.http.ingress.pathType }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "query-frontend") }}-http + port: + number: {{ $.Values.queryFrontend.http.port }} + {{- end }} +{{- end }} + +{{- if and .Values.queryFrontend.enabled .Values.queryFrontend.grpc.ingress.enabled }} +--- +apiVersion: {{ .Values.queryFrontend.grpc.ingress.apiVersion }} +kind: Ingress +metadata: + name: {{ include "thanos.componentname" (list $ "query-frontend") }}-grpc + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . 
}} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query-frontend +{{- if .Values.queryFrontend.grpc.ingress.labels }} +{{ toYaml .Values.queryFrontend.grpc.ingress.labels | indent 4 }} +{{- end }} + {{- with .Values.queryFrontend.grpc.ingress.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.queryFrontend.grpc.ingress.defaultBackend }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "query-frontend") }}-grpc + port: + number: {{ $.Values.queryFrontend.grpc.port }} + {{- end }} + {{- if .Values.queryFrontend.grpc.ingress.tls }} + tls: + {{- range .Values.queryFrontend.grpc.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + {{- if .secretName }} + secretName: {{ .secretName }} + {{- end}} + {{- end }} + {{- end }} + rules: + {{- range .Values.queryFrontend.grpc.ingress.hosts }} + - host: {{ . }} + http: + paths: + - path: {{ $.Values.queryFrontend.grpc.ingress.path }} + pathType: {{ $.Values.queryFrontend.grpc.ingress.pathType }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "query-frontend") }}-grpc + port: + number: {{ $.Values.queryFrontend.grpc.port }} + {{- end }} +{{- end }} diff --git a/charts/thanos/templates/query-frontend-poddisruptionbudget.yaml b/charts/thanos/templates/query-frontend-poddisruptionbudget.yaml new file mode 100644 index 0000000000..ed0c1cd9e2 --- /dev/null +++ b/charts/thanos/templates/query-frontend-poddisruptionbudget.yaml @@ -0,0 +1,25 @@ +{{- if and .Values.queryFrontend.enabled .Values.queryFrontend.podDisruptionBudget.enabled }} +apiVersion: policy/v1beta1 +kind: PodDisruptionBudget +metadata: + name: {{ include "thanos.componentname" (list $ "query-frontend") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query-frontend +{{ with .Values.queryFrontend.deploymentLabels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + {{- if .Values.queryFrontend.podDisruptionBudget.minAvailable }} + minAvailable: {{ .Values.queryFrontend.podDisruptionBudget.minAvailable }} + {{- end }} + {{- if .Values.queryFrontend.podDisruptionBudget.maxUnavailable }} + maxUnavailable: {{ .Values.queryFrontend.podDisruptionBudget.maxUnavailable }} + {{- end }} + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/component: query-frontend +{{- end }} \ No newline at end of file diff --git a/charts/thanos/templates/query-frontend-psp.yaml b/charts/thanos/templates/query-frontend-psp.yaml new file mode 100644 index 0000000000..e2c6cca03a --- /dev/null +++ b/charts/thanos/templates/query-frontend-psp.yaml @@ -0,0 +1,27 @@ +--- +{{- if and .Values.queryFrontend.enabled .Values.queryFrontend.psp.enabled }} +apiVersion: policy/v1beta1 +kind: PodSecurityPolicy +metadata: + name: {{ include "thanos.componentname" (list $ "query-frontend") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . 
}} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query-frontend +spec: + privileged: false + allowPrivilegeEscalation: false + fsGroup: + rule: RunAsAny + runAsUser: + rule: RunAsAny + seLinux: + rule: RunAsAny + supplementalGroups: + rule: RunAsAny + volumes: + - secret +{{- end }} diff --git a/charts/thanos/templates/query-frontend-rbac.yaml b/charts/thanos/templates/query-frontend-rbac.yaml new file mode 100644 index 0000000000..e3a71c017b --- /dev/null +++ b/charts/thanos/templates/query-frontend-rbac.yaml @@ -0,0 +1,55 @@ +{{- if and .Values.queryFrontend.enabled .Values.queryFrontend.rbac.enabled }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + {{- with .Values.queryFrontend.serviceAccountAnnotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} + name: {{ include "thanos.componentname" (list $ "query-frontend") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query-frontend +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "thanos.componentname" (list $ "query-frontend") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query-frontend +rules: +- apiGroups: ['policy'] + resources: ['podsecuritypolicies'] + verbs: ['use'] + resourceNames: + - {{ include "thanos.componentname" (list $ "query-frontend") }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "thanos.componentname" (list $ "query-frontend") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query-frontend +roleRef: + kind: ClusterRole + name: {{ include "thanos.componentname" (list $ "query-frontend") }} + apiGroup: rbac.authorization.k8s.io +subjects: +- kind: ServiceAccount + name: {{ include "thanos.queryFrontend.serviceaccount" . }} + namespace: {{ .Release.Namespace }} +{{- end }} diff --git a/charts/thanos/templates/query-frontend-service.yaml b/charts/thanos/templates/query-frontend-service.yaml new file mode 100644 index 0000000000..aa1992597c --- /dev/null +++ b/charts/thanos/templates/query-frontend-service.yaml @@ -0,0 +1,63 @@ +{{- if .Values.queryFrontend.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "thanos.componentname" (list $ "query-frontend") }}-grpc + {{- with .Values.queryFrontend.grpc.service.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . 
}} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query-frontend +{{ with .Values.queryFrontend.grpc.service.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + type: ClusterIP + clusterIP: None + ports: + - port: {{ .Values.queryFrontend.grpc.port }} + targetPort: grpc + protocol: TCP + name: grpc + selector: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: query-frontend +{{ with .Values.queryFrontend.grpc.service.matchLabels }}{{ toYaml . | indent 4 }}{{ end }} + +--- + +apiVersion: v1 +kind: Service +metadata: + name: {{ include "thanos.componentname" (list $ "query-frontend") }}-http + {{- with .Values.queryFrontend.http.service.annotations }} + annotations: {{ toYaml .| nindent 4 }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query-frontend +{{ with .Values.queryFrontend.http.service.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + type: {{ .Values.queryFrontend.http.service.type }} + {{- if .Values.queryFrontend.http.service.externalTrafficPolicy }} + externalTrafficPolicy: {{ .Values.queryFrontend.http.externalTrafficPolicy }} + {{- end }} + ports: + - port: {{ .Values.queryFrontend.http.port }} + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: query-frontend +{{ with .Values.queryFrontend.http.service.matchLabels }}{{ toYaml . | indent 4 }}{{ end }} +{{- end -}} diff --git a/charts/thanos/templates/query-frontend-servicemonitor.yaml b/charts/thanos/templates/query-frontend-servicemonitor.yaml new file mode 100644 index 0000000000..d8accfba45 --- /dev/null +++ b/charts/thanos/templates/query-frontend-servicemonitor.yaml @@ -0,0 +1,30 @@ +{{- if and .Values.queryFrontend.enabled .Values.queryFrontend.metrics.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "thanos.componentname" (list $ "query-frontend") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query-frontend +{{ with .Values.queryFrontend.metrics.serviceMonitor.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + jobLabel: thanos-query-frontend + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: query-frontend + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + endpoints: + - port: http + interval: {{ .Values.queryFrontend.metrics.serviceMonitor.interval | default "15s" }} + {{- with .Values.queryFrontend.metrics.serviceMonitor.relabellings }} + metricRelabelings: {{ toYaml . 
| nindent 8 }} + {{- end }} +{{- end -}} diff --git a/charts/thanos/templates/query-horizontalpodautoscaler.yaml b/charts/thanos/templates/query-horizontalpodautoscaler.yaml new file mode 100644 index 0000000000..aa25b6e089 --- /dev/null +++ b/charts/thanos/templates/query-horizontalpodautoscaler.yaml @@ -0,0 +1,35 @@ +{{- if .Values.query.enabled }} +{{- if .Values.query.autoscaling.enabled }} +apiVersion: autoscaling/v2beta1 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "thanos.componentname" (list $ "query") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "thanos.componentname" (list $ "query") }} + minReplicas: {{ .Values.query.autoscaling.minReplicas }} + maxReplicas: {{ .Values.query.autoscaling.maxReplicas }} + metrics: +{{- with .Values.query.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + targetAverageUtilization: {{ . }} +{{- end }} +{{- with .Values.query.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + targetAverageUtilization: {{ . }} +{{- end }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/thanos/templates/query-ingress.yml b/charts/thanos/templates/query-ingress.yml new file mode 100644 index 0000000000..b82d59149d --- /dev/null +++ b/charts/thanos/templates/query-ingress.yml @@ -0,0 +1,108 @@ +--- +{{- if and .Values.query.enabled .Values.query.http.ingress.enabled }} +apiVersion: {{ .Values.query.http.ingress.apiVersion }} +kind: Ingress +metadata: + name: {{ include "thanos.componentname" (list $ "query") }}-http + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query + {{- if .Values.query.http.ingress.labels }} +{{ toYaml .Values.query.http.ingress.labels | indent 4 }} + {{- end }} + {{- with .Values.query.http.ingress.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.query.http.ingress.defaultBackend }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "query") }}-http + port: + number: {{ $.Values.query.http.port }} + {{- end }} + {{- if .Values.query.http.ingress.tls }} + tls: + {{- range .Values.query.http.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + {{- if .secretName }} + secretName: {{ .secretName }} + {{- end}} + {{- end }} + {{- end }} + rules: + {{- range .Values.query.http.ingress.hosts }} + - host: {{ . 
}} + http: + paths: + - path: {{ $.Values.query.http.ingress.path }} + pathType: {{ $.Values.query.http.ingress.pathType }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "query") }}-http + port: + number: {{ $.Values.query.http.port }} + {{- end }} +{{- end }} + +{{- if and .Values.query.enabled .Values.query.grpc.ingress.enabled }} +--- +apiVersion: {{ .Values.query.grpc.ingress.apiVersion }} +kind: Ingress +metadata: + name: {{ include "thanos.componentname" (list $ "query") }}-grpc + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query + {{- if .Values.query.grpc.ingress.labels }} +{{ toYaml .Values.query.grpc.ingress.labels | indent 4 }} + {{- end }} + {{- with .Values.query.grpc.ingress.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.query.grpc.ingress.defaultBackend }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "query") }}-grpc + port: + number: {{ $.Values.query.grpc.port }} + {{- end }} + {{- if .Values.query.grpc.ingress.tls }} + tls: + {{- range .Values.query.grpc.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + {{- if .secretName }} + secretName: {{ .secretName }} + {{- end}} + {{- end }} + {{- end }} + rules: + {{- range .Values.query.grpc.ingress.hosts }} + - host: {{ . }} + http: + paths: + - path: {{ $.Values.query.grpc.ingress.path }} + pathType: {{ $.Values.query.grpc.ingress.pathType }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "query") }}-grpc + port: + number: {{ $.Values.query.grpc.port }} + {{- end }} +{{- end }} + diff --git a/charts/thanos/templates/query-poddisruptionbudget.yaml b/charts/thanos/templates/query-poddisruptionbudget.yaml new file mode 100644 index 0000000000..46382864d9 --- /dev/null +++ b/charts/thanos/templates/query-poddisruptionbudget.yaml @@ -0,0 +1,26 @@ +{{- if and .Values.query.enabled .Values.query.podDisruptionBudget.enabled }} +apiVersion: policy/v1beta1 +kind: PodDisruptionBudget +metadata: + name: {{ include "thanos.componentname" (list $ "query") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query +{{ with .Values.query.deploymentLabels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + {{- if .Values.query.podDisruptionBudget.minAvailable }} + minAvailable: {{ .Values.query.podDisruptionBudget.minAvailable }} + {{- end }} + {{- if .Values.query.podDisruptionBudget.maxUnavailable }} + maxUnavailable: {{ .Values.query.podDisruptionBudget.maxUnavailable }} + {{- end }} + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" . 
}} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: query +{{- end }} diff --git a/charts/thanos/templates/query-psp.yaml b/charts/thanos/templates/query-psp.yaml new file mode 100644 index 0000000000..2ae9a85ed9 --- /dev/null +++ b/charts/thanos/templates/query-psp.yaml @@ -0,0 +1,27 @@ +--- +{{- if and .Values.query.enabled .Values.query.psp.enabled }} +apiVersion: policy/v1beta1 +kind: PodSecurityPolicy +metadata: + name: {{ include "thanos.componentname" (list $ "query") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query +spec: + privileged: false + allowPrivilegeEscalation: false + fsGroup: + rule: RunAsAny + runAsUser: + rule: RunAsAny + seLinux: + rule: RunAsAny + supplementalGroups: + rule: RunAsAny + volumes: + - secret +{{- end }} diff --git a/charts/thanos/templates/query-rbac.yaml b/charts/thanos/templates/query-rbac.yaml new file mode 100644 index 0000000000..3f0fe76b14 --- /dev/null +++ b/charts/thanos/templates/query-rbac.yaml @@ -0,0 +1,55 @@ +{{- if and .Values.query.enabled .Values.query.rbac.enabled }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + {{- with .Values.query.serviceAccountAnnotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} + name: {{ include "thanos.componentname" (list $ "query") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "thanos.componentname" (list $ "query") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query +rules: +- apiGroups: ['policy'] + resources: ['podsecuritypolicies'] + verbs: ['use'] + resourceNames: + - {{ include "thanos.componentname" (list $ "query") }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "thanos.componentname" (list $ "query") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query +roleRef: + kind: ClusterRole + name: {{ include "thanos.componentname" (list $ "query") }} + apiGroup: rbac.authorization.k8s.io +subjects: +- kind: ServiceAccount + name: {{ include "thanos.query.serviceaccount" . 
}} + namespace: {{ .Release.Namespace }} +{{- end }} diff --git a/charts/thanos/templates/query-service.yaml b/charts/thanos/templates/query-service.yaml new file mode 100644 index 0000000000..27ade19998 --- /dev/null +++ b/charts/thanos/templates/query-service.yaml @@ -0,0 +1,63 @@ +{{- if .Values.query.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "thanos.componentname" (list $ "query") }}-grpc + {{- with .Values.query.grpc.service.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query +{{ with .Values.query.grpc.service.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + type: ClusterIP + clusterIP: None + ports: + - port: {{ .Values.query.grpc.port }} + targetPort: grpc + protocol: TCP + name: grpc + selector: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: query +{{ with .Values.query.grpc.service.matchLabels }}{{ toYaml . | indent 4 }}{{ end }} + +--- + +apiVersion: v1 +kind: Service +metadata: + name: {{ include "thanos.componentname" (list $ "query") }}-http + {{- with .Values.query.http.service.annotations }} + annotations: {{ toYaml .| nindent 4 }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query +{{ with .Values.query.http.service.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + type: {{ .Values.query.http.service.type }} + {{- if .Values.query.http.service.externalTrafficPolicy }} + externalTrafficPolicy: {{ .Values.query.http.externalTrafficPolicy }} + {{- end }} + ports: + - port: {{ .Values.query.http.port }} + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: query +{{ with .Values.query.http.service.matchLabels }}{{ toYaml . | indent 4 }}{{ end }} +{{- end -}} diff --git a/charts/thanos/templates/query-servicemonitor.yaml b/charts/thanos/templates/query-servicemonitor.yaml new file mode 100644 index 0000000000..8a54d6f0f8 --- /dev/null +++ b/charts/thanos/templates/query-servicemonitor.yaml @@ -0,0 +1,30 @@ +{{- if and .Values.query.enabled .Values.query.metrics.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "thanos.componentname" (list $ "query") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: query +{{ with .Values.query.metrics.serviceMonitor.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + jobLabel: thanos-query + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" . 
}} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: query + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + endpoints: + - port: http + interval: {{ .Values.query.metrics.serviceMonitor.interval | default "15s" }} + {{- with .Values.query.metrics.serviceMonitor.relabellings }} + metricRelabelings: {{ toYaml . | nindent 8 }} + {{- end }} +{{- end -}} diff --git a/charts/thanos/templates/rule-configmap.yaml b/charts/thanos/templates/rule-configmap.yaml new file mode 100644 index 0000000000..0e76aa8d7c --- /dev/null +++ b/charts/thanos/templates/rule-configmap.yaml @@ -0,0 +1,19 @@ +{{- if and .Values.rule.enabled (empty .Values.rule.ruleOverrideName) -}} +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: rule + name: {{ include "thanos.fullname" . }}-rules +data: + {{- $root := . -}} + {{- range $key, $value := .Values.rule.ruleFiles }} + {{ $key }}: | +{{ toYaml $value | default "{}" | indent 4 }} + {{- end -}} +{{- end -}} diff --git a/charts/thanos/templates/rule-ingress.yml b/charts/thanos/templates/rule-ingress.yml new file mode 100644 index 0000000000..1a1b510ec7 --- /dev/null +++ b/charts/thanos/templates/rule-ingress.yml @@ -0,0 +1,107 @@ +--- +{{- if and .Values.rule.enabled .Values.rule.http.ingress.enabled }} +apiVersion: {{ .Values.rule.http.ingress.apiVersion }} +kind: Ingress +metadata: + name: {{ include "thanos.componentname" (list $ "rule") }}-http + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: rule +{{- if .Values.rule.http.ingress.labels }} +{{ toYaml .Values.rule.http.ingress.labels | indent 4 }} +{{- end }} + {{- with .Values.rule.http.ingress.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.rule.http.ingress.defaultBackend }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "rule") }}-http + port: + number: {{ $.Values.rule.http.port }} + {{- end }} + {{- if .Values.rule.http.ingress.tls }} + tls: + {{- range .Values.rule.http.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + {{- if .secretName }} + secretName: {{ .secretName }} + {{- end}} + {{- end }} + {{- end }} + rules: + {{- range .Values.rule.http.ingress.hosts }} + - host: {{ . }} + http: + paths: + - path: {{ $.Values.rule.http.ingress.path }} + pathType: {{ $.Values.rule.http.ingress.pathType }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "rule") }}-http + port: + number: {{ $.Values.rule.http.port }} + {{- end }} +{{- end }} + +{{- if and .Values.rule.enabled .Values.rule.grpc.ingress.enabled }} +--- +apiVersion: {{ .Values.rule.grpc.ingress.apiVersion }} +kind: Ingress +metadata: + name: {{ include "thanos.componentname" (list $ "rule") }}-grpc + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . 
}} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: rule +{{- if .Values.rule.grpc.ingress.labels }} +{{ toYaml .Values.rule.grpc.ingress.labels | indent 4 }} +{{- end }} + {{- with .Values.rule.grpc.ingress.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.rule.grpc.ingress.defaultBackend }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "rule") }}-grpc + port: + number: {{ $.Values.rule.grpc.port }} + {{- end }} + {{- if .Values.rule.grpc.ingress.tls }} + tls: + {{- range .Values.rule.grpc.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + {{- if .secretName }} + secretName: {{ .secretName }} + {{- end}} + {{- end }} + {{- end }} + rules: + {{- range .Values.rule.grpc.ingress.hosts }} + - host: {{ . }} + http: + paths: + - path: {{ $.Values.rule.grpc.ingress.path }} + pathType: {{ $.Values.rule.grpc.ingress.pathType }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "rule") }}-grpc + port: + number: {{ $.Values.rule.grpc.port }} + {{- end }} +{{- end }} diff --git a/charts/thanos/templates/rule-service.yaml b/charts/thanos/templates/rule-service.yaml new file mode 100644 index 0000000000..280466d7db --- /dev/null +++ b/charts/thanos/templates/rule-service.yaml @@ -0,0 +1,63 @@ +{{- if .Values.rule.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "thanos.componentname" (list $ "rule") }}-grpc + {{- with .Values.rule.grpc.service.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: rule + {{ with .Values.rule.grpc.service.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + type: ClusterIP + clusterIP: None + ports: + - port: {{ .Values.rule.grpc.port }} + protocol: TCP + targetPort: grpc + name: grpc + selector: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: rule +{{ with .Values.rule.grpc.service.matchLabels }}{{ toYaml . | indent 4 }}{{ end }} + +--- + +apiVersion: v1 +kind: Service +metadata: + name: {{ include "thanos.componentname" (list $ "rule") }}-http + {{- with .Values.rule.http.service.annotations }} + annotations: {{ toYaml .| nindent 4 }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: rule + {{ with .Values.rule.http.service.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + type: {{ .Values.rule.http.service.type }} + {{- if .Values.rule.http.service.externalTrafficPolicy }} + externalTrafficPolicy: {{ .Values.rule.http.externalTrafficPolicy }} + {{- end }} + ports: + - port: {{ .Values.rule.http.port }} + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/name: {{ include "thanos.name" . 
}} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: rule +{{ with .Values.rule.http.service.matchLabels }}{{ toYaml . | indent 4 }}{{ end }} +{{- end }} diff --git a/charts/thanos/templates/rule-servicemonitor.yaml b/charts/thanos/templates/rule-servicemonitor.yaml new file mode 100644 index 0000000000..2a145df497 --- /dev/null +++ b/charts/thanos/templates/rule-servicemonitor.yaml @@ -0,0 +1,30 @@ +{{- if and .Values.rule.enabled .Values.rule.metrics.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "thanos.componentname" (list $ "rule") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: rule +{{ with .Values.rule.metrics.serviceMonitor.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + jobLabel: thanos-rule + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: rule + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + endpoints: + - port: http + interval: {{ .Values.rule.metrics.serviceMonitor.interval | default "15s" }} + {{- with .Values.rule.metrics.serviceMonitor.relabellings }} + metricRelabelings: {{ toYaml . | nindent 8 }} + {{- end }} +{{- end -}} diff --git a/charts/thanos/templates/rule-statefulset.yaml b/charts/thanos/templates/rule-statefulset.yaml new file mode 100644 index 0000000000..df43ce76ab --- /dev/null +++ b/charts/thanos/templates/rule-statefulset.yaml @@ -0,0 +1,183 @@ +{{ if .Values.rule.enabled }} +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{ include "thanos.componentname" (list $ "rule") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: rule +{{ with .Values.rule.statefulsetLabels }}{{ toYaml . | indent 4 }}{{ end -}} + {{- with .Values.rule.statefulsetAnnotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if not .Values.rule.autoscaling.enabled }} + replicas: {{ .Values.rule.replicaCount | default 1 }} + {{- end }} + {{- with .Values.rule.updateStrategy }} + updateStrategy: {{ toYaml . | nindent 4 }} + {{- end }} + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: rule +{{ with .Values.rule.statefulsetMatchLabels }}{{ toYaml . | indent 6 }}{{ end }} + serviceName: {{ include "thanos.name" . }} + template: + metadata: + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: rule +{{ with .Values.rule.labels }}{{ toYaml . | indent 8 }}{{ end }} + {{- with .Values.rule.annotations }} + annotations: {{ toYaml . 
| nindent 8 }} + {{- end }} + spec: + containers: + - name: thanos-rule + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + resources: {{ toYaml .Values.rule.resources | nindent 10 }} + {{- with .Values.rule.extraEnv }} + env: {{ toYaml . | nindent 8 }} + {{- end }} + args: + - "rule" + - "--data-dir=/var/thanos/store" + - "--log.level={{ .Values.rule.logLevel }}" + - "--log.format={{ .Values.rule.logFormat }}" + - "--http-address=0.0.0.0:{{ .Values.rule.http.port }}" + - "--grpc-address=0.0.0.0:{{ .Values.rule.grpc.port }}" + - "--objstore.config-file=/etc/config/object-store.yaml" + - "--rule-file=/etc/rules/*.yaml" + {{- range $key, $val := .Values.rule.ruleLabels }} + - '--label={{ $key }}={{ $val | quote }}' + {{- end }} + {{- if .Values.rule.resendDelay }} + - "--resend-delay={{ .Values.rule.resendDelay }}" + {{- end }} + {{- if .Values.rule.evalInterval }} + - "--eval-interval={{ .Values.rule.evalInterval }}" + {{- end }} + {{- if .Values.rule.tsdbBlockDuration }} + - "--tsdb.block-duration={{ .Values.rule.tsdbBlockDuration }}" + {{- end }} + {{- if .Values.rule.tsdbRetention }} + - "--tsdb.retention={{ .Values.rule.tsdbRetention }}" + {{- end }} + {{- if .Values.rule.webRoutePrefix }} + - "--web.route-prefix={{ .Values.rule.webRoutePrefix }}" + {{- end }} + {{- if .Values.rule.webExternalPrefix }} + - "--web.external-prefix={{ .Values.rule.webExternalPrefix }}" + {{- end }} + {{- if .Values.rule.webPrefixHeader }} + - "--web.prefix-header={{ .Values.rule.webPrefixHeader }}" + {{- end }} + {{- if .Values.rule.queryDNSDiscovery }} + - "--query=dnssrv+_http._tcp.{{ include "thanos.componentname" (list $ "query") }}-http.{{ .Release.Namespace }}.svc.cluster.local" + {{- end }} + {{- range .Values.rule.alertmanagers }} + - "--alertmanagers.url={{ . }}" + {{- end }} + {{- if .Values.rule.alertmanagersSendTimeout }} + - "--alertmanagers.send-timeout={{ .Values.rule.alertmanagersSendTimeout }}" + {{- end }} + {{- if .Values.rule.alertQueryUrl }} + - "--alert.query-url={{ .Values.rule.alertQueryUrl }}" + {{- end }} + {{- range .Values.rule.alertLabelDrop }} + - "--alert.label-drop={{ . }}" + {{- end }} + {{- if .Values.rule.extraArgs }} + {{- toYaml .Values.rule.extraArgs | nindent 8 }} + {{- end }} + ports: + - name: http + containerPort: {{ .Values.rule.http.port }} + - name: grpc + containerPort: {{ .Values.rule.grpc.port }} + volumeMounts: + - name: rule-volume + mountPath: /etc/rules + readOnly: true + - name: config-volume + mountPath: /etc/config + readOnly: true + - name: data + mountPath: /var/thanos/store + {{- if .Values.rule.certSecretName }} + - mountPath: /etc/certs + name: {{ .Values.rule.certSecretName }} + readOnly: true + {{- end }} + volumes: + {{- if not .Values.rule.persistentVolumeClaim }} + - name: data + emptyDir: {} + {{- end }} + - name: config-volume + secret: + {{- if .Values.objstoreSecretOverride }} + secretName: "{{ .Values.objstoreSecretOverride }}" + {{- else }} + secretName: {{ include "thanos.fullname" . }} + {{- end }} + - name: rule-volume + configMap: + {{- if empty .Values.rule.ruleOverrideName }} + name: {{ include "thanos.fullname" . 
}}-rules + {{- else }} + name: {{ .Values.rule.ruleOverrideName }} + {{- end }} + {{- if .Values.rule.certSecretName }} + - name: {{ .Values.rule.certSecretName }} + secret: + defaultMode: 420 + secretName: {{ .Values.rule.certSecretName }} + {{- end }} + {{- if .Values.rule.livenessProbe }} + livenessProbe: {{ toYaml .Values.rule.livenessProbe | nindent 8 }} + {{- end }} + {{- if .Values.rule.readinessProbe }} + readinessProbe: {{ toYaml .Values.rule.readinessProbe | nindent 8 }} + {{- end }} + {{- with .Values.rule.securityContext }} + securityContext: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.rule.nodeSelector }} + nodeSelector: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.rule.affinity }} + affinity: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.rule.tolerations }} + tolerations: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.rule.serviceAccount }} + serviceAccountName: "{{ . }}" + {{- end }} + {{- if .Values.priorityClassName }} + priorityClassName: {{ .Values.priorityClassName }} + {{- end }} + {{- if .Values.rule.persistentVolumeClaim }} + volumeClaimTemplates: + - metadata: + name: {{ .Values.rule.persistentVolumeClaim.name }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: rule + spec: + {{- toYaml .Values.rule.persistentVolumeClaim.spec | nindent 6 }} + {{- end }} +{{- end }} diff --git a/charts/thanos/templates/secret.yaml b/charts/thanos/templates/secret.yaml new file mode 100644 index 0000000000..e65846ab97 --- /dev/null +++ b/charts/thanos/templates/secret.yaml @@ -0,0 +1,20 @@ +{{- if or .Values.bucket.enabled .Values.store.enabled .Values.compact.enabled .Values.sidecar.enabled }} +{{- if eq .Values.objstoreSecretOverride "" }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "thanos.fullname" . }} + labels: + app: {{ include "thanos.name" . }} + chart: {{ include "thanos.chart" . }} + release: "{{ .Release.Name }}" + heritage: "{{ .Release.Service }}" +type: Opaque +data: + {{- if .Values.objstore }} + object-store.yaml: {{ toYaml .Values.objstore | b64enc }} + {{- else }} + object-store.yaml: {{ .Values.objstoreFile | b64enc }} + {{- end }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/thanos/templates/sidecar-ingress.yaml b/charts/thanos/templates/sidecar-ingress.yaml new file mode 100644 index 0000000000..1ec0f2be53 --- /dev/null +++ b/charts/thanos/templates/sidecar-ingress.yaml @@ -0,0 +1,107 @@ +--- +{{- if and .Values.sidecar.enabled .Values.sidecar.http.ingress.enabled }} +apiVersion: {{ .Values.sidecar.http.ingress.apiVersion }} +kind: Ingress +metadata: + name: {{ include "thanos.componentname" (list $ "sidecar") }}-http + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: sidecar +{{- if .Values.sidecar.http.ingress.labels }} +{{ toYaml .Values.sidecar.ingress.http.labels | indent 4 }} +{{- end }} + {{- with .Values.sidecar.http.ingress.annotations }} + annotations: {{ toYaml . 
| nindent 4 }} + {{- end }} +spec: + {{- if .Values.sidecar.http.ingress.defaultBackend }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "sidecar") }}-http + port: + number: {{ $.Values.sidecar.http.port }} + {{- end }} + {{- if .Values.sidecar.http.ingress.tls }} + tls: + {{- range .Values.sidecar.http.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + {{- if .secretName }} + secretName: {{ .secretName }} + {{- end}} + {{- end }} + {{- end }} + rules: + {{- range .Values.sidecar.http.ingress.hosts }} + - host: {{ . }} + http: + paths: + - path: {{ $.Values.sidecar.http.ingress.path }} + pathType: {{ $.Values.sidecar.http.ingress.pathType }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "sidecar") }}-http + port: + number: {{ $.Values.sidecar.http.port }} + {{- end }} +{{- end }} + +{{- if and .Values.sidecar.enabled .Values.sidecar.grpc.ingress.enabled }} +--- +apiVersion: {{ .Values.sidecar.grpc.ingress.apiVersion }} +kind: Ingress +metadata: + name: {{ include "thanos.componentname" (list $ "sidecar") }}-grpc + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: sidecar +{{- if .Values.sidecar.grpc.ingress.labels }} +{{ toYaml .Values.sidecar.grpc.ingress.labels | indent 4 }} +{{- end }} + {{- with .Values.sidecar.grpc.ingress.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.sidecar.grpc.ingress.defaultBackend }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "sidecar") }}-grpc + port: + number: {{ $.Values.sidecar.grpc.port }} + {{- end }} + {{- if .Values.sidecar.grpc.ingress.tls }} + tls: + {{- range .Values.sidecar.grpc.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + {{- if .secretName }} + secretName: {{ .secretName }} + {{- end}} + {{- end }} + {{- end }} + rules: + {{- range .Values.sidecar.grpc.ingress.hosts }} + - host: {{ . }} + http: + paths: + - path: {{ $.Values.sidecar.grpc.ingress.path }} + pathType: {{ $.Values.sidecar.grpc.ingress.pathType }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "sidecar") }}-grpc + port: + number: {{ $.Values.sidecar.grpc.port }} + {{- end }} +{{- end }} diff --git a/charts/thanos/templates/sidecar-service.yaml b/charts/thanos/templates/sidecar-service.yaml new file mode 100644 index 0000000000..89106e0c6b --- /dev/null +++ b/charts/thanos/templates/sidecar-service.yaml @@ -0,0 +1,63 @@ +{{- if .Values.sidecar.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "thanos.componentname" (list $ "sidecar") }}-grpc + {{- with .Values.sidecar.grpc.service.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: sidecar +{{ with .Values.sidecar.grpc.service.labels }}{{ toYaml . 
| indent 4 }}{{ end }} +spec: + type: {{ .Values.sidecar.grpc.service.type }} + externalIPs: {{- toYaml .Values.sidecar.grpc.service.externalIPs | nindent 4 }} + {{- if eq .Values.sidecar.grpc.service.type "ClusterIP" }} + clusterIP: None + {{- end }} + ports: + - port: {{ .Values.sidecar.grpc.port }} + {{- if eq .Values.sidecar.grpc.service.type "NodePort" }} + nodePort: {{ .Values.sidecar.grpc.service.nodePort }} + {{- end }} + protocol: TCP + targetPort: grpc + name: grpc + selector: + {{ toYaml .Values.sidecar.selector | nindent 4 }} + +--- + +apiVersion: v1 +kind: Service +metadata: + name: {{ include "thanos.componentname" (list $ "sidecar") }}-http + {{- with .Values.sidecar.http.service.annotations }} + annotations: {{ toYaml .| nindent 4 }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: sidecar +{{ with .Values.store.http.service.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + type: {{ .Values.sidecar.http.service.type }} + {{- if .Values.sidecar.http.service.externalTrafficPolicy }} + externalTrafficPolicy: {{ .Values.sidecar.http.externalTrafficPolicy }} + {{- end }} + ports: + - port: {{ .Values.sidecar.http.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{ toYaml .Values.sidecar.selector | nindent 4 }} +{{- end }} diff --git a/charts/thanos/templates/sidecar-servicemonitor.yaml b/charts/thanos/templates/sidecar-servicemonitor.yaml new file mode 100644 index 0000000000..8883fc6957 --- /dev/null +++ b/charts/thanos/templates/sidecar-servicemonitor.yaml @@ -0,0 +1,30 @@ +{{- if and .Values.sidecar.enabled .Values.sidecar.metrics.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "thanos.componentname" (list $ "sidecar") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: sidecar +{{ with .Values.sidecar.metrics.serviceMonitor.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + jobLabel: thanos-sidecar + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: sidecar + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + endpoints: + - port: http + interval: {{ .Values.sidecar.metrics.serviceMonitor.interval | default "15s" }} + {{- with .Values.sidecar.metrics.serviceMonitor.relabellings }} + metricRelabelings: {{ toYaml . | nindent 8 }} + {{- end }} +{{- end -}} diff --git a/charts/thanos/templates/store-deployment.yaml b/charts/thanos/templates/store-deployment.yaml new file mode 100644 index 0000000000..0622522c87 --- /dev/null +++ b/charts/thanos/templates/store-deployment.yaml @@ -0,0 +1,193 @@ +{{- $root := . 
}} +{{ if .Values.store.enabled }} + +{{- $shards := int 0 }} +{{- $timePartioning := false }} +{{- $hashPartioning := false }} + +{{- if .Values.store.hashPartioning }} + {{- $shards = int .Values.store.hashPartioning.shards }} + {{- $hashPartioning = true }} +{{- else }} + {{- $shards = len .Values.store.timePartioning }} + {{- $timePartioning = true }} +{{- end }} + +{{- range $index, $_ := until $shards }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "thanos.componentname" (list $ "store") }}-{{ $index }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" $root }} + helm.sh/chart: {{ include "thanos.chart" $root }} + app.kubernetes.io/instance: {{ $.Release.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: store + app.kubernetes.io/partition: "{{ $index }}" +{{ with $root.Values.store.deploymentLabels }}{{ toYaml . | indent 4 }}{{ end }} + {{- with $root.Values.store.deploymentAnnotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + replicas: {{ $root.Values.store.replicaCount | default 1 }} + {{- with $root.Values.store.strategy }} + strategy: {{ toYaml . | nindent 4 }} + {{- end }} + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" $root }} + app.kubernetes.io/instance: {{ $.Release.Name }} + app.kubernetes.io/component: store + app.kubernetes.io/partition: "{{ $index }}" +{{ with $root.Values.store.deploymentMatchLabels }}{{ toYaml . | indent 6 }}{{ end }} + template: + metadata: + labels: +{{ with $root.Values.store.labels }}{{ toYaml . | indent 8 }}{{ end }} + app.kubernetes.io/name: {{ include "thanos.name" $root }} + app.kubernetes.io/instance: {{ $.Release.Name }} + app.kubernetes.io/component: store + app.kubernetes.io/partition: "{{ $index }}" + {{- if or $root.Values.store.annotations $root.Values.store.metrics.annotations.enabled }} + annotations: + {{- end }} + {{- with $root.Values.store.annotations }}{{ toYaml . | nindent 8 }}{{- end }} + {{- if $root.Values.store.metrics.annotations.enabled }} + prometheus.io/scrape: "true" + prometheus.io/port: "{{ $root.Values.store.http.port }}" + {{- end }} + spec: + {{- if $root.Values.store.initContainers }} + initContainers: +{{ toYaml $root.Values.store.initContainers | indent 6 }} + {{- end }} + containers: + - name: thanos-store + image: "{{ $root.Values.image.repository }}:{{ $root.Values.image.tag }}" + imagePullPolicy: {{ $root.Values.image.pullPolicy }} + {{- with $root.Values.store.extraEnv }} + env: {{ toYaml . 
| nindent 8 }} + {{- end }} + args: + - "store" + - "--data-dir=/var/thanos/store" + - "--log.level={{ $root.Values.store.logLevel }}" + - "--log.format={{ $root.Values.store.logFormat }}" + - "--http-address=0.0.0.0:{{ $root.Values.store.http.port }}" + - "--grpc-address=0.0.0.0:{{ $root.Values.store.grpc.port }}" + - "--objstore.config-file=/etc/config/object-store.yaml" + {{- if $root.Values.store.indexCacheSize }} + - "--index-cache-size={{ $root.Values.store.indexCacheSize }}" + {{- end }} + {{- if $root.Values.store.chunkPoolSize }} + - "--chunk-pool-size={{ $root.Values.store.chunkPoolSize }}" + {{- end }} + {{- if $root.Values.store.grpcSeriesSampleLimit }} + - "--store.grpc.series-sample-limit={{ $root.Values.store.grpcSeriesSampleLimit }}" + {{- end }} + {{- if $root.Values.store.grpcSeriesMaxConcurrency }} + - "--store.grpc.series-max-concurrency={{ $root.Values.store.grpcSeriesMaxConcurrency }}" + {{- end }} + {{- if $root.Values.store.syncBlockDuration }} + - "--sync-block-duration={{ $root.Values.store.syncBlockDuration }}" + {{- end }} + {{- if $root.Values.store.blockSyncConcurrency }} + - "--block-sync-concurrency={{ $root.Values.store.blockSyncConcurrency }}" + {{- end }} + {{- if $timePartioning }} + {{- $partion := (slice $root.Values.store.timePartioning $index) | first }} + {{- if $partion.max }} + - "--max-time={{ $partion.max }}" + {{- end}} + {{- if $partion.min }} + - "--min-time={{ $partion.min }}" + {{- end}} + {{- end }} + {{- if $hashPartioning }} + - | + --selector.relabel-config= + - action: hashmod + source_labels: ["__block_id"] + target_label: shard + modulus: {{ $shards }} + - action: keep + source_labels: ["shard"] + regex: {{ $index }} + {{- end }} + {{- if $root.Values.store.extraArgs }} + {{ toYaml $root.Values.store.extraArgs | nindent 8 }} + {{- end }} + ports: + - name: http + containerPort: {{ $root.Values.store.http.port }} + - name: grpc + containerPort: {{ $root.Values.store.grpc.port }} + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + - name: data + mountPath: /var/thanos/store + {{- if $root.Values.store.certSecretName }} + - mountPath: /etc/certs + name: {{ $root.Values.store.certSecretName }} + readOnly: true + {{- end }} + {{- if $root.Values.store.livenessProbe }} + livenessProbe: + {{ toYaml $root.Values.store.livenessProbe | nindent 10 }} + {{- end }} + {{- if $root.Values.store.readinessProbe }} + readinessProbe: + {{ toYaml $root.Values.store.readinessProbe | nindent 10 }} + {{- end }} + resources: + {{ toYaml $root.Values.store.resources | nindent 10 }} + volumes: + - name: data + {{- if $root.Values.store.dataVolume.backend }} + {{- if $root.Values.store.persistentVolumeClaim }} + persistentVolumeClaim: + claimName: {{ $root.Values.store.dataVolume.backend.persistentVolumeClaim.claimName }}-{{ $index }} + {{- else }} + {{ toYaml $root.Values.store.dataVolume.backend | nindent 8 }} + {{- end }} + {{- else }} + emptyDir: {} + {{- end }} + - name: config-volume + secret: + {{- if $root.Values.objstoreSecretOverride }} + secretName: "{{ $root.Values.objstoreSecretOverride }}" + {{- else }} + secretName: {{ include "thanos.fullname" $root }} + {{- end }} + {{- if $root.Values.store.certSecretName }} + - name: {{ $root.Values.store.certSecretName }} + secret: + defaultMode: 420 + secretName: {{ $root.Values.store.certSecretName }} + {{- end }} + {{- with $root.Values.store.securityContext }} + securityContext: {{ toYaml . 
| nindent 8 }} + {{- end }} + {{- with $root.Values.store.nodeSelector }} + nodeSelector: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with $root.Values.store.affinity }} + affinity: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with $root.Values.store.tolerations }} + tolerations: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with $root.Values.store.serviceAccount }} + serviceAccountName: "{{ . }}" + {{- end }} + {{- if $root.Values.priorityClassName }} + priorityClassName: {{ $root.Values.priorityClassName }} + {{- end }} +--- +{{- end }} +{{- end }} diff --git a/charts/thanos/templates/store-ingress.yaml b/charts/thanos/templates/store-ingress.yaml new file mode 100644 index 0000000000..d8822ae874 --- /dev/null +++ b/charts/thanos/templates/store-ingress.yaml @@ -0,0 +1,103 @@ +{{- if and .Values.store.enabled .Values.store.http.ingress.enabled }} +apiVersion: {{ .Values.sidecar.http.ingress.apiVersion }} +kind: Ingress +metadata: + name: {{ include "thanos.componentname" (list $ "store") }}-http + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: store +{{- if .Values.store.http.ingress.labels }} +{{ toYaml .Values.store.http.ingress.labels | indent 4 }} +{{- end }} + {{- with .Values.store.http.ingress.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.store.http.ingress.defaultBackend }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "store") }}-http + port: + number: {{ $.Values.store.http.port }} + {{- end }} + {{- if .Values.store.http.ingress.tls }} + tls: + {{- range .Values.store.http.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.store.http.ingress.hosts }} + - host: {{ . }} + http: + paths: + - path: {{ $.Values.store.http.ingress.path }} + pathType: {{ $.Values.store.http.ingress.pathType }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "store") }}-http + port: + number: {{ $.Values.store.http.port }} + {{- end }} +{{- end }} + +--- + + {{- if and .Values.store.enabled .Values.store.grpc.ingress.enabled }} +apiVersion: {{ .Values.sidecar.grpc.ingress.apiVersion }} +kind: Ingress +metadata: + name: {{ include "thanos.componentname" (list $ "store") }}-grpc + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: store +{{- if .Values.store.grpc.ingress.labels }} +{{ toYaml .Values.store.grpc.ingress.labels | indent 4 }} +{{- end }} + {{- with .Values.store.grpc.ingress.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.store.grpc.ingress.defaultBackend }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "store") }}-grpc + port: + number: {{ $.Values.store.grpc.port }} + {{- end }} + {{- if .Values.store.grpc.ingress.tls }} + tls: + {{- range .Values.store.grpc.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.store.grpc.ingress.hosts }} + - host: {{ . }} + http: + paths: + - path: {{ $.Values.store.grpc.ingress.path }} + pathType: {{ $.Values.store.grpc.ingress.pathType }} + backend: + service: + name: {{ include "thanos.componentname" (list $ "store") }}-grpc + port: + number: {{ $.Values.store.grpc.port }} + {{- end }} +{{- end }} diff --git a/charts/thanos/templates/store-persistentvolumeclaim.yaml b/charts/thanos/templates/store-persistentvolumeclaim.yaml new file mode 100644 index 0000000000..def6d89df1 --- /dev/null +++ b/charts/thanos/templates/store-persistentvolumeclaim.yaml @@ -0,0 +1,31 @@ +{{- $root := . }} +{{- if and .Values.store.enabled .Values.store.persistentVolumeClaim }} +{{- $pvc := .Values.store.persistentVolumeClaim -}} + +{{- $shards := int 0 }} +{{- if .Values.store.hashPartioning }} + {{- $shards = int .Values.store.hashPartioning.shards }} +{{- else }} + {{- $shards = len .Values.store.timePartioning }} +{{- end }} +{{- range $index, $_ := until $shards }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ $pvc.name }}-{{ $index }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" $root }} + helm.sh/chart: {{ include "thanos.chart" $root }} + app.kubernetes.io/instance: {{ $.Release.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: store +{{ with $root.Values.store.deploymentLabels }}{{ toYaml . | indent 4 }}{{ end -}} + {{- with $root.Values.store.deploymentAnnotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + {{- toYaml $pvc.spec | nindent 2 }} +--- +{{- end }} +{{- end }} diff --git a/charts/thanos/templates/store-service.yaml b/charts/thanos/templates/store-service.yaml new file mode 100644 index 0000000000..7d105b9399 --- /dev/null +++ b/charts/thanos/templates/store-service.yaml @@ -0,0 +1,63 @@ +{{- if .Values.store.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "thanos.componentname" (list $ "store") }}-grpc + {{- with .Values.store.grpc.service.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: store +{{ with .Values.store.grpc.service.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + type: ClusterIP + clusterIP: None + ports: + - port: {{ .Values.store.grpc.port }} + targetPort: grpc + protocol: TCP + name: grpc + selector: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: store +{{ with .Values.store.grpc.service.matchLabels }}{{ toYaml . | indent 4 }}{{ end }} + +--- + +apiVersion: v1 +kind: Service +metadata: + name: {{ include "thanos.componentname" (list $ "store") }}-http + {{- with .Values.store.http.service.annotations }} + annotations: {{ toYaml .| nindent 4 }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . 
}} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ $.Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: store +{{ with .Values.store.http.service.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + type: {{ .Values.store.http.service.type }} + {{- if .Values.store.http.service.externalTrafficPolicy }} + externalTrafficPolicy: {{ .Values.store.http.externalTrafficPolicy }} + {{- end }} + ports: + - port: {{ .Values.store.http.port }} + targetPort: {{ .Values.store.http.targetPort }} + protocol: TCP + name: http + selector: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: store +{{ with .Values.store.http.service.matchLabels }}{{ toYaml . | indent 4 }}{{ end }} +{{- end -}} diff --git a/charts/thanos/templates/store-servicemonitor.yaml b/charts/thanos/templates/store-servicemonitor.yaml new file mode 100644 index 0000000000..e6813fcd72 --- /dev/null +++ b/charts/thanos/templates/store-servicemonitor.yaml @@ -0,0 +1,30 @@ +{{- if and .Values.store.enabled .Values.store.metrics.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "thanos.componentname" (list $ "store") }} + labels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + helm.sh/chart: {{ include "thanos.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + app.kubernetes.io/version: {{ .Chart.AppVersion | replace "+" "_" }} + app.kubernetes.io/component: store +{{ with .Values.store.metrics.serviceMonitor.labels }}{{ toYaml . | indent 4 }}{{ end }} +spec: + jobLabel: thanos-store + selector: + matchLabels: + app.kubernetes.io/name: {{ include "thanos.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/component: store + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + endpoints: + - port: http + interval: {{ .Values.store.metrics.serviceMonitor.interval | default "15s" }} + {{- with .Values.store.metrics.serviceMonitor.relabellings }} + metricRelabelings: {{ toYaml . | nindent 8 }} + {{- end }} +{{- end -}} diff --git a/charts/thanos/values.yaml b/charts/thanos/values.yaml new file mode 100644 index 0000000000..886c5c50d9 --- /dev/null +++ b/charts/thanos/values.yaml @@ -0,0 +1,1222 @@ +image: + repository: quay.io/thanos/thanos + tag: v0.17.1 + pullPolicy: IfNotPresent + +store: + enabled: true + # Maximum size of items held in the index cache. + indexCacheSize: 250MB + # Maximum size of concurrently allocatable bytes for chunks. + chunkPoolSize: 2GB + # Maximum amount of samples returned via a single series call. 0 means no limit. + # NOTE: for efficiency we take 120 as the number of samples in chunk (it cannot be bigger than that), + # so the actual number of samples might be lower, even though the maximum could be hit. + grpcSeriesSampleLimit: 0 + # Maximum number of concurrent Series calls. + grpcSeriesMaxConcurrency: 20 + # Repeat interval for syncing the blocks between local and remote view. + syncBlockDuration: 3m + # Number of goroutines to use when syncing blocks from object storage. + blockSyncConcurrency: 20 + # Log filtering level. + logLevel: info + # Log format to use. Possible options: logfmt or json. 
+ logFormat: logfmt + # Add extra environment variables to store + extraEnv: [] + # - name: ENV + # value: value + # + # Add extra arguments to the store service + extraArgs: [] + # - "--extraargs=extravalue" + # + # Number of replicas running from store component + replicaCount: 1 + # Kubernetes deployment strategy object as documented in https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy + strategy: {} + # Data volume for the thanos-store to store temporary data defaults to emptyDir + dataVolume: + backend: {} + # persistentVolumeClaim: + # claimName: store-data-volume + # Create the specified persistentVolumeClaim in case persistentVolumeClaim is + # used for the dataVolume.backend above and needs to be created. + persistentVolumeClaim: {} + # name: store-data-volume + # spec: + # storageClassName: "" + # accessModes: ["ReadWriteOnce"] + # resources: + # requests: + # storage: 100Gi + # selector: {} + # volumeName: "" + # volumeMode: "" + # Extra labels for store pod template + labels: {} + # cluster: example + # + # Extra annotations for store pod template + annotations: {} + # example.com: default + # + # Add extra labels to store deployment + deploymentLabels: {} + # extraLabel: extraLabelValue + # + # Add extra annotations to store deployment + deploymentAnnotations: {} + # extraAnnotation: extraAnnotationValue + # + # Add extra selector matchLabels to store deployment + deploymentMatchLabels: {} + # Enable metrics collecting for store service + metrics: + # This is the Prometheus annotation type scraping configuration + annotations: + enabled: false + # Enable ServiceMonitor https://github.com/coreos/prometheus-operator + serviceMonitor: + enabled: false + # Labels for prometheus-operator to find servicemonitor + labels: {} + # The grpc endpoint to communicate with other components + grpc: + # grpc listen port number + port: 10901 + # Service definition for query grpc service + service: + # Annotations to query grpc service + annotations: {} + # Labels to query grpc service + labels: {} + # Match labels for service selector + matchLabels: {} + # Set up ingress for the grpc service + ingress: + enabled: false + + # Set API version for ingress + apiVersion: networking.k8s.io/v1 + + # Set default backend for ingress + defaultBackend: false + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "/" + pathType: ImplementationSpecific + hosts: + - "/" + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + + # The http endpoint to communicate with other components + http: + # http listen port number + port: 10902 + # Service definition for query http service + service: + type: ClusterIP + # Annotations to query http service + annotations: {} + # Labels to query http service + labels: {} + # Match labels for service selector + matchLabels: {} + # Set up ingress for the http service + ingress: + enabled: false + + # Set API version for ingress + apiVersion: networking.k8s.io/v1 + + # Set default backend for ingress + defaultBackend: false + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "/" + pathType: ImplementationSpecific + hosts: + - "/" + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + # Optional securityContext + securityContext: {} + resources: {} + # limits: + # cpu: 2000m + # memory: 16Gi + # requests: + # cpu: 1000m + # memory: 4Gi + # + # Node tolerations for 
server scheduling to nodes with taints + # Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" + # + # Node labels for store pod assignment + # Ref: https://kubernetes.io/docs/user-guide/node-selection/ + # + nodeSelector: {} + # + # Pod affinity + # Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#node-affinity + affinity: {} + serviceAccount: "" + # set up store readinessProbe & livenessProbe + # Ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ + livenessProbe: {} + readinessProbe: {} + # Setting time timePartioning will create multipale store deployments based on the number of partions + # See https://thanos.io/components/store.md/#time-based-partioning + timePartioning: + - min: "" + max: "" + # Setting hashPartioning will create multiple store deployments based on the number of shards specified using the hashmod of the blocks + # See https://thanos.io/sharding.md/#relabelling + hashPartioning: {} + # shards: + # InitContainers allows injecting specialized containers that run before app containers. This is meant to pre-configure and tune mounted volume permissions. + initContainers: [] +query: + enabled: true + # Labels to treat as a replica indicator along which data is deduplicated. + # Still you will be able to query without deduplication using 'dedup=false' parameter. + replicaLabels: [] + # - replica + # - prometheus_replica + # + # Enable --query.auto-downsampling option for query. + autoDownsampling: true + # Prefix for API and UI endpoints. This allows thanos UI to be served on a sub-path. + # This option is analogous to --web.route-prefix of Promethus. + webRoutePrefix: "" + # Static prefix for all HTML links and redirect + # URLs in the UI query web interface. Actual + # endpoints are still served on / or the + # web.route-prefix. This allows thanos UI to be + # served behind a reverse proxy that strips a URL + # sub-path. + webExternalPrefix: "" + # Name of HTTP request header used for dynamic prefixing of UI links and redirects. + # This option is ignored if web.external-prefix argument is set. Security risk: enable this + # option only if a reverse proxy in front of thanos is resetting the header. The --web.prefix-header=X-Forwarded-Prefix option + # can be useful, for example, if Thanos UI is served via Traefik reverse proxy with PathPrefixStrip option enabled, which sends the + # stripped prefix value in X-Forwarded-Prefix header. This allows thanos UI to be served on a sub-path + webPrefixHeader: "" + # https://github.com/improbable-eng/thanos/issues/1015 + storeDNSResolver: miekgdns + # Enable DNS discovery for stores + storeDNSDiscovery: true + # Enable DNS discovery for sidecars (this is for the chart built-in sidecar service) + sidecarDNSDiscovery: true + # Enable DNS discovery for queries + ruleDNSDiscovery: false + # Addresses of statically configured store API servers (repeatable). + # The scheme may be prefixed with 'dns+' or 'dnssrv+' to detect store API servers through respective DNS lookups. + stores: [] + # - "dnssrv+_grpc._tcp...svc.cluster.local" + # + # Path to files that contains addresses of store API servers. The path can be a glob pattern (repeatable). + serviceDiscoveryFiles: [] + # Names of configmaps that contain addresses of store API servers, used for file service discovery. 
+ serviceDiscoveryFileConfigMaps: [] + # Refresh interval to re-read file SD files. It is used as a resync fallback. + serviceDiscoveryInterval: 5m + # Log filtering level. + logLevel: info + # Log format to use. Possible options: logfmt or json. + logFormat: logfmt + # Add extra environment variables to query + extraEnv: [] + # - name: ENV + # value: value + # + # Add extra arguments to the query service + extraArgs: [] + # - "--extraargs=extravalue" + # + # Number of replicas running from query component + replicaCount: 1 + # Kubernetes deployment strategy object as documented in https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy + strategy: {} + # Enable HPA for query component + autoscaling: + enabled: false + minReplicas: 2 + maxReplicas: 3 + targetCPUUtilizationPercentage: 50 + targetMemoryUtilizationPercentage: 50 + # Enable podDisruptionBudget for query component + podDisruptionBudget: + enabled: false + # minAvailable and maxUnavailable can't be used simultaneous. Choose one. + minAvailable: 1 + # maxUnavailable: 50% + + # The grpc endpoint to communicate with other components + grpc: + # grpc listen port number + port: 10901 + # Service definition for query grpc service + service: + # Annotations to query grpc service + annotations: {} + # Labels to query grpc service + labels: {} + # Match labels for service selector + matchLabels: {} + # Set up ingress for the grpc service + ingress: + enabled: false + + # Set API version for ingress + apiVersion: networking.k8s.io/v1 + + # Set default backend for ingress + defaultBackend: false + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "/" + pathType: ImplementationSpecific + hosts: + - "/" + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + serviceAccount: "" + # Optional annotations to be added to the ServiceAccount + serviceAccountAnnotations: {} + + psp: + enabled: false + rbac: + enabled: false + + # The http endpoint to communicate with other components + http: + # http listen port number + port: 10902 + # Service definition for query http service + service: + type: ClusterIP + # Annotations to query http service + annotations: {} + # Labels to query http service + labels: {} + # Match labels for service selector + matchLabels: {} + # Set up ingress for the http service + ingress: + enabled: false + + # Set API version for ingress + apiVersion: networking.k8s.io/v1 + + # Set default backend for ingress + defaultBackend: false + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "/" + pathType: ImplementationSpecific + hosts: + - "/" + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + + certSecretName: "" + # Extra labels for query pod template + labels: {} + # cluster: example + # + # Extra annotations for query pod template + annotations: {} + # example.com: default + # + # Add extra labels to query deployment + deploymentLabels: {} + # extraLabel: extraLabelValue + # + # Add extra annotations to query deployment + deploymentAnnotations: {} + # extraAnnotation: extraAnnotationValue + # + # Add extra selector matchLabels to query deployment + deploymentMatchLabels: {} + # + # Enable metrics collecting for query service + metrics: + # This is the Prometheus annotation type scraping configuration + annotations: + enabled: false + # Enable ServiceMonitor https://github.com/coreos/prometheus-operator + 
serviceMonitor: + enabled: false + # Labels for prometheus-operator to find servicemonitor + labels: {} + + # Optional securityContext + securityContext: {} + resources: {} + # limits: + # cpu: 2000m + # memory: 16Gi + # requests: + # cpu: 1000m + # memory: 4Gi + # + # Node tolerations for server scheduling to nodes with taints + # Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" + # + # Node labels for query pod assignment + # Ref: https://kubernetes.io/docs/user-guide/node-selection/ + # + nodeSelector: {} + # + # Pod affinity + # Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#node-affinity + affinity: {} + # set up store readinessProbe & livenessProbe + # Ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ + livenessProbe: + httpGet: + path: /-/healthy + port: http + readinessProbe: + httpGet: + path: /-/ready + port: http + +queryFrontend: + enabled: false + # Add extra environment variables to query-frontend + extraEnv: [] + # - name: ENV + # value: value + # + # Add extra arguments to the query-frontend service + extraArgs: [] + # - "--extraargs=extravalue" + # + # Number of replicas running from query-frontend component + replicaCount: 1 + # Kubernetes deployment strategy object as documented in https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy + strategy: {} + # Enable HPA for query-frontend component + autoscaling: + enabled: false + minReplicas: 2 + maxReplicas: 3 + targetCPUUtilizationPercentage: 50 + targetMemoryUtilizationPercentage: 50 + # Enable podDisruptionBudget for query-frontend component + podDisruptionBudget: + enabled: false + # minAvailable and maxUnavailable can't be used simultaneous. Choose one. 
+ minAvailable: 1 + # maxUnavailable: 50% + + # The grpc endpoint to communicate with other components + grpc: + # grpc listen port number + port: 10901 + # Service definition for query-frontend grpc service + service: + # Annotations to query-frontend grpc service + annotations: {} + # Labels to query-frontend grpc service + labels: {} + # Match labels for service selector + matchLabels: {} + # Set up ingress for the grpc service + ingress: + enabled: false + + # Set API version for ingress + apiVersion: networking.k8s.io/v1 + + # Set default backend for ingress + defaultBackend: false + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "/" + pathType: ImplementationSpecific + hosts: + - "/" + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + + psp: + enabled: false + rbac: + enabled: false + + # The http endpoint to communicate with other components + http: + # http listen port number + port: 10902 + # Service definition for query-frontend http service + service: + type: ClusterIP + # Annotations to query-frontend http service + annotations: {} + # Labels to query-frontend http service + labels: {} + # Match labels for service selector + matchLabels: {} + # Set up ingress for the http service + ingress: + enabled: false + + # Set API version for ingress + apiVersion: networking.k8s.io/v1 + + # Set default backend for ingress + defaultBackend: false + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "/" + pathType: ImplementationSpecific + hosts: + - "/" + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + + # URL of downstream Prometheus Query compatible API. + # downstream-url=":" + downstreamUrl: + # Compress HTTP responses. + compressResponses: true + # Log queries that are slower than the specified duration. + # Set to 0 to disable. + # Set to < 0 to enable on all queries. + logQueriesLongerThan: 0 + # Use compression in results cache. Supported + # values are: 'snappy' and ” (disable compression). 
+ cacheCompressionType: "" + + # query-range parameters, see https://thanos.io/tip/components/query-frontend.md/#flags + queryRange: + alignRangeWithStep: false + splitInterval: 24h + maxRetriesPerRequest: 5 + maxQueryLength: 0 + maxQueryParallelism: 14 + responseCacheMaxFreshness: 1m + noPartialResponse: false + cache: + inMemory: false + maxSize: + maxSizeItems: + validity: + + # labels parameters, see https://thanos.io/tip/components/query-frontend.md/#flags + qflabels: + splitInterval: 24h + maxRetriesPerRequest: 5 + maxQueryParallelism: 14 + responseCacheMaxFreshness: 1m + noPartialResponse: false + defaultTimeRange: 24h + cache: + inMemory: false + maxSize: + maxSizeItems: + validity: + + log: + level: info + format: logfmt + request: + decision: LogFinishCall + + certSecretName: "" + # Extra labels for query-frontend pod template + labels: {} + # cluster: example + # + # Extra annotations for query-frontend pod template + annotations: {} + # example.com: default + # + # Add extra labels to query-frontend deployment + deploymentLabels: {} + # extraLabel: extraLabelValue + # + # Add extra annotations to query-frontend deployment + deploymentAnnotations: {} + # extraAnnotation: extraAnnotationValue + # + # Add extra selector matchLabels to query-frontend deployment + deploymentMatchLabels: {} + # + # Enable metrics collecting for query-frontend service + metrics: + # This is the Prometheus annotation type scraping configuration + annotations: + enabled: false + # Enable ServiceMonitor https://github.com/coreos/prometheus-operator + serviceMonitor: + enabled: false + # Labels for prometheus-operator to find servicemonitor + labels: {} + + # Optional securityContext + securityContext: {} + resources: {} + # limits: + # cpu: 2000m + # memory: 16Gi + # requests: + # cpu: 1000m + # memory: 4Gi + # + # Node tolerations for server scheduling to nodes with taints + # Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" + # + # Node labels for query-frontend pod assignment + # Ref: https://kubernetes.io/docs/user-guide/node-selection/ + # + nodeSelector: {} + # + # Pod affinity + # Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#node-affinity + affinity: {} + + # Optional annotations to be added to the ServiceAccount + serviceAccountAnnotations: {} + +compact: + enabled: true + # Minimum age of fresh (non-compacted) blocks before they are being processed. + # Malformed blocks older than the maximum of consistency-delay and 30m0s will be removed. + consistencyDelay: 30m + # How long to retain raw samples in bucket. 0d - disables this retention + retentionResolutionRaw: 30d + # How long to retain samples of resolution 1 (5 minutes) in bucket. 0d - disables this retention + retentionResolution5m: 120d + # How long to retain samples of resolution 2 (1 hour) in bucket. 0d - disables this retention + retentionResolution1h: 1y + # Number of goroutines to use when syncing block metadata from object storage. + blockSyncConcurrency: 20 + # Number of goroutines to use when compacting groups. + compactConcurrency: 1 + # Log filtering level. + logLevel: info + # Log format to use. Possible options: logfmt or json. 
+ logFormat: logfmt + # Compact service listening http port + http: + port: 10902 + service: + labels: {} + # Match labels for service selector + matchLabels: {} + # Add extra environment variables to compact + extraEnv: + # - name: ENV + # value: value + # + # Add extra arguments to the compact service + extraArgs: + # - "--extraargs=extravalue" + # + # Kubernetes deployment strategy object as documented in https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy + strategy: {} + # Data volume for the compactor to store temporary data defaults to emptyDir + dataVolume: + backend: {} + # persistentVolumeClaim: + # claimName: compact-data-volume + # Create the specified persistentVolumeClaim in case persistentVolumeClaim is + # used for the dataVolume.backend above and needs to be created. + persistentVolumeClaim: {} + # name: compact-data-volume + # spec: + # storageClassName: "" + # accessModes: ["ReadWriteOnce"] + # resources: + # requests: + # storage: 100Gi + # selector: {} + # volumeName: "" + # volumeMode: "" + # Extra labels for compact pod template + labels: {} + # cluster: example + # + # Extra annotations for compact pod template + annotations: {} + # example.com: default + # + # Add extra labels to compact deployment + deploymentLabels: {} + # extraLabel: extraLabelValue + # + # Add extra annotations to compact deployment + deploymentAnnotations: {} + # extraAnnotation: extraAnnotationValue + # + # Add extra selector matchLabels to compact deployment + deploymentMatchLabels: {} + # + # Enable metrics collecting for compact service + metrics: + # This is the Prometheus annotation type scraping configuration + annotations: + enabled: false + # Enable ServiceMonitor https://github.com/coreos/prometheus-operator + serviceMonitor: + enabled: false + # Labels for prometheus-operator to find servicemonitor + labels: {} + serviceAccount: "" + + # Optional securityContext + securityContext: {} + resources: {} + # limits: + # cpu: 2000m + # memory: 16Gi + # requests: + # cpu: 1000m + # memory: 4Gi + # + # Node tolerations for server scheduling to nodes with taints + # Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" + # + # Node labels for compact pod assignment + # Ref: https://kubernetes.io/docs/user-guide/node-selection/ + # + nodeSelector: {} + # + # Pod affinity + # Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#node-affinity + affinity: {} + +bucket: + enabled: true + # Number of replicas running from bucket component + replicaCount: 1 + # Log filtering level. + logLevel: info + # Log format to use. Possible options: logfmt or json. 
+ logFormat: logfmt + # Refresh interval to download metadata from remote storage + refresh: 30m + # Timeout to download metadata from remote storage + timeout: 5m + # Prometheus label to use as timeline title + label: "" + # The http endpoint to communicate with other components + http: + # http listen port number + port: 8080 + # Service definition for bucket http service + service: + type: ClusterIP + # Annotations to bucket http service + annotations: {} + # Labels to bucket http service + labels: {} + # Match labels for service selector + matchLabels: {} + # Set up ingress for the http service + ingress: + enabled: false + + # Set API version for ingress + apiVersion: networking.k8s.io/v1 + + # Set default backend for ingress + defaultBackend: false + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "/" + pathType: ImplementationSpecific + hosts: + - "/" + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + # Add extra environment variables to bucket + extraEnv: + # - name: ENV + # value: value + # + # Add extra arguments to the bucket service + extraArgs: + # - "--extraargs=extravalue" + # + # Kubernetes deployment strategy object as documented in https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy + strategy: {} + # Extra labels for bucket pod template + labels: {} + # cluster: example + # + # Extra annotations for bucket pod template + annotations: {} + # example.com: default + # + # Add extra labels to bucket deployment + deploymentLabels: {} + # extraLabel: extraLabelValue + # + # Add extra annotations to bucket deployment + deploymentAnnotations: {} + # + # Add extra selector matchLabels to bucket deployment + deploymentMatchLabels: {} + + # Enable podDisruptionBudget for bucket component + podDisruptionBudget: + enabled: false + # minAvailable and maxUnavailable can't be used simultaneous. Choose one. + minAvailable: 1 + # maxUnavailable: 50% + + # Optional securityContext + securityContext: {} + resources: {} + # limits: + # cpu: 2000m + # memory: 16Gi + # requests: + # cpu: 1000m + # memory: 4Gi + # + # Node tolerations for server scheduling to nodes with taints + # Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" + # + # Node labels for bucket pod assignment + # Ref: https://kubernetes.io/docs/user-guide/node-selection/ + # + nodeSelector: {} + # + # Pod affinity + # Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#node-affinity + affinity: {} + serviceAccount: "" + +rule: + enabled: false + # Labels to be applied to all generated metrics (repeated). Similar to external labels for + # Prometheus, used to identify ruler and its blocks as unique source. + ruleLabels: {} + # Minimum amount of time to wait before resending an alert to Alertmanager. + resendDelay: "" + # The default evaluation interval to use. + evalInterval: "" + # Block duration for TSDB block. + tsdbBlockDuration: "" + # Block retention time on local disk. + tsdbRetention: "" + # Prefix for API and UI endpoints. This allows thanos UI to be served on a sub-path. + # This option is analogous to --web.route-prefix of Promethus. + webRoutePrefix: "" + # Static prefix for all HTML links and redirect + # URLs in the UI web interface. Actual + # endpoints are still served on / or the + # web.route-prefix. 
This allows thanos UI to be + # served behind a reverse proxy that strips a URL sub-path. + webExternalPrefix: "" + # Name of HTTP request header used for dynamic prefixing of UI links and redirects. + # This option is ignored if web.external-prefix argument is set. Security risk: enable this + # option only if a reverse proxy in front of thanos is resetting the header. The --web.prefix-header=X-Forwarded-Prefix option + # can be useful, for example, if Thanos UI is served via Traefik reverse proxy with PathPrefixStrip option enabled, which sends the + # stripped prefix value in X-Forwarded-Prefix header. This allows thanos UI to be served on a sub-path + webPrefixHeader: "" + # Enable DNS discovery for stores + queryDNSDiscovery: true + # Alertmanager replica URLs to push firing alerts. Ruler claims success if push to at + # least one alertmanager from discovered succeeds. The scheme may be prefixed with + # 'dns+' or 'dnssrv+' to detect Alertmanager IPs through respective DNS lookups. The port + # defaults to 9093 or the SRV record's value. The URL path is used as a prefix for the regular + # Alertmanager API path. + alertmanagers: [] + # Timeout for sending alerts to alertmanager + alertmanagersSendTimeout: "" + # The external Thanos Query URL that would be set in all alerts 'Source' field + alertQueryUrl: "" + # Labels by name to drop before sending to alertmanager. This allows alert to be + # deduplicated on replica label (repeated). Similar Prometheus alert relabelling + alertLabelDrop: [] + # Override rules file + ruleOverrideName: "" + # Rule files for rule + ruleFiles: + alerting_rules.yaml: {} + # groups: + # - name: Instances + # rules: + # - alert: InstanceDown + # expr: up == 0 + # for: 5m + # labels: + # severity: page + # annotations: + # description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.' + # summary: 'Instance {{ $labels.instance }} down' + # Log filtering level. + logLevel: info + # Log format to use. Possible options: logfmt or json. + logFormat: logfmt + # Add extra environment variables to rule + extraEnv: [] + # - name: ENV + # value: value + # + # Add extra arguments to the rule service + extraArgs: [] + # - "--extraargs=extravalue" + # + # Number of replicas running from rule component + replicaCount: 1 + # Kubernetes update strategy object as documented in https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#update-strategies + updateStrategy: {} + # Create the specified persistentVolumeClaim in case persistentVolumeClaim is + # used for the dataVolume.backend above and needs to be created. + persistentVolumeClaim: {} + # name: rule-data-volume + # spec: + # storageClassName: "" + # accessModes: ["ReadWriteOnce"] + # resources: + # requests: + # storage: 100Gi + # selector: {} + # volumeName: "" + # volumeMode: "" + # Enable HPA for rule component + autoscaling: + enabled: false + minReplicas: 2 + maxReplicas: 3 + targetCPUUtilizationPercentage: 50 + targetMemoryUtilizationPercentage: 50 + # Enable podDisruptionBudget for rule component + podDisruptionBudget: + enabled: false + # minAvailable and maxUnavailable can't be used simultaneous. Choose one. 
+ minAvailable: 1 + # maxUnavailable: 50% + + # The grpc endpoint to communicate with other components + grpc: + # grpc listen port number + port: 10901 + # Service definition for rule grpc service + service: + # Annotations to rule grpc service + annotations: {} + # labels to rule grpc service + labels: {} + # Match labels for service selector + matchLabels: {} + # Set up ingress for the grpc service + ingress: + enabled: false + + # Set API version for ingress + apiVersion: networking.k8s.io/v1 + + # Set default backend for ingress + defaultBackend: false + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "/" + pathType: ImplementationSpecific + hosts: + - "/" + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + serviceAccount: "" + + # The http endpoint to communicate with other components + http: + # http listen port number + port: 10902 + # Service definition for rule http service + service: + type: ClusterIP + # Annotations to rule http service + annotations: {} + # Labels to rule http service + labels: {} + # Match labels for service selector + matchLabels: {} + # Set up ingress for the http service + ingress: + enabled: false + + # Set API version for ingress + apiVersion: networking.k8s.io/v1 + + # Set default backend for ingress + defaultBackend: false + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "/" + pathType: ImplementationSpecific + hosts: + - "/" + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + + certSecretName: "" + # Extra labels for rule pod template + labels: {} + # cluster: example + # + # Extra annotations for rule pod template + annotations: {} + # example.com: default + # + # Add extra labels to rule deployment + statefulsetLabels: {} + # extraLabel: extraLabelValue + # + # Add extra annotations to rule deployment + statefulsetAnnotations: {} + # extraAnnotation: extraAnnotationValue + # + # + # Add extra selector matchLabels to rule deployment + statefulsetMatchLabels: {} + # Enable metrics collecting for rule service + metrics: + # This is the Prometheus annotation type scraping configuration + annotations: + enabled: false + # Enable ServiceMonitor https://github.com/coreos/prometheus-operator + serviceMonitor: + enabled: false + # Labels for prometheus-operator to find servicemonitor + labels: {} + + # Optional securityContext + securityContext: {} + resources: {} + # limits: + # cpu: 2000m + # memory: 16Gi + # requests: + # cpu: 1000m + # memory: 4Gi + # + # Node tolerations for server scheduling to nodes with taints + # Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ + tolerations: [] + # - key: "key" + # operator: "Equal|Exists" + # value: "value" + # effect: "NoSchedule|PreferNoSchedule|NoExecute(1.6 only)" + # + # Node labels for rule pod assignment + # Ref: https://kubernetes.io/docs/user-guide/node-selection/ + # + nodeSelector: {} + # + # Pod affinity + # Ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#node-affinity + affinity: {} + + +sidecar: + # NOTE: This is only the service references for the sidecar + enabled: true + selector: + app: prometheus + # Enable metrics collecting for sidecar service + metrics: + # Enable ServiceMonitor https://github.com/coreos/prometheus-operator + serviceMonitor: + enabled: false + # Labels for prometheus-operator to find servicemonitor + labels: {} + # The grpc 
endpoint to communicate with other components + grpc: + # grpc listen port number + port: 10901 + # Service definition for sidecar grpc service + service: + type: ClusterIP + # The node port number to use when the service type is set as 'NodePort' + nodePort: 31901 + # External IPs assigned to sidecar grpc service + externalIPs: [] + # Annotations to sidecar grpc service + annotations: {} + # Labels to sidecar grpc service + labels: {} + # Set up ingress for the grpc service + ingress: + enabled: false + + # Set API version for ingress + apiVersion: networking.k8s.io/v1 + + # Set default backend for ingress + defaultBackend: false + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "/" + pathType: ImplementationSpecific + hosts: + - "/" + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + + # The http endpoint to communicate with other components + http: + # http listen port number + port: 10902 + targetPort: 10902 + # Service definition for sidecar http service + service: + type: ClusterIP + # Annotations to sidecar http service + annotations: {} + # Labels to sidecar http service + labels: {} + # Set up ingress for the http service + ingress: + enabled: false + + # Set API version for ingress + apiVersion: networking.k8s.io/v1 + + # Set default backend for ingress + defaultBackend: false + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "/" + pathType: ImplementationSpecific + hosts: + - "/" + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + + +# This is the general backend configuration. Please se the examples below to configurate object store. +# More information can be found at thanos github repositoriy: https://github.com/thanos-io/thanos/blob/master/docs/storage.md +# Existing secret containing the configuration. The key must be `object-store.yaml` +objstoreSecretOverride: "" +# Text representation of the configuration +objstoreFile: "" +# YAML representation of the configuration. It's mutually exclusive with objstoreFile. 
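+# NOTE (illustrative example, not part of the imported upstream chart): `objstoreFile`
+# above and the `objstore` key below are two ways of supplying the same object
+# storage configuration, so only one of them should be set. A minimal sketch,
+# assuming a hypothetical local filesystem bucket path, could look like:
+#
+#   objstoreFile: |-
+#     type: FILESYSTEM
+#     config:
+#       directory: "/var/thanos/objstore"
+#
+# or, equivalently, as structured YAML (mirroring the GCS/S3/AZURE examples below):
+#
+#   objstore:
+#     type: FILESYSTEM
+#     config:
+#       directory: "/var/thanos/objstore"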
+# Priority class name to be used for all the pods +# https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/ +priorityClassName: "" +objstore: {} + # type: GCS + # config: + # bucket: "thanos" + # service_account: |- + # { + # "type": "service_account", + # "project_id": "project", + # "private_key_id": "abcdefghijklmnopqrstuvwxyz12345678906666", + # "private_key": "-----BEGIN PRIVATE KEY-----\...\n-----END PRIVATE KEY-----\n", + # "client_email": "project@thanos.iam.gserviceaccount.com", + # "client_id": "123456789012345678901", + # "auth_uri": "https://accounts.google.com/o/oauth2/auth", + # "token_uri": "https://oauth2.googleapis.com/token", + # "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + # "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/thanos%40gitpods.iam.gserviceaccount.com" + # } + # + # type: S3 + # config: + # bucket: "" + # endpoint: "" + # region: "" + # access_key: "" + # insecure: false + # signature_version2: false + # encrypt_sse: false + # secret_key: "" + # put_user_metadata: {} + # http_config: + # idle_conn_timeout: 0s + # response_header_timeout: 0s + # insecure_skip_verify: false + # trace: + # enable: false + # part_size: 0 + # + # type: AZURE + # config: + # storage_account: "" + # storage_account_key: "" + # container: "" + # endpoint: "" + # max_retries: 0 diff --git a/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls b/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls index 898d62bcab..369e7fc77b 100644 --- a/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls +++ b/salt/metalk8s/addons/prometheus-operator/deployed/chart.sls @@ -20866,7 +20866,7 @@ data: datasources: - name: Prometheus type: prometheus - url: http://prometheus-operator-prometheus:9090/ + url: http://thanos-query-http:10902/ access: proxy isDefault: true jsonData: @@ -57326,6 +57326,33 @@ spec: prometheus: prometheus-operator-prometheus type: ClusterIP --- +apiVersion: v1 +kind: Service +metadata: + labels: + app: prometheus-operator-thanos-discovery + app.kubernetes.io/instance: prometheus-operator + app.kubernetes.io/managed-by: salt + app.kubernetes.io/name: prometheus-operator-thanos-discovery + app.kubernetes.io/part-of: metalk8s + app.kubernetes.io/version: 16.9.1 + chart: kube-prometheus-stack-16.9.1 + heritage: metalk8s + metalk8s.scality.com/monitor: '' + release: prometheus-operator + name: prometheus-operator-thanos-discovery + namespace: metalk8s-monitoring +spec: + clusterIP: None + ports: + - name: grpc + port: 10901 + targetPort: grpc + selector: + app.kubernetes.io/name: prometheus + prometheus: prometheus-operator-prometheus + type: ClusterIP +--- apiVersion: apps/v1 kind: DaemonSet metadata: @@ -57930,6 +57957,8 @@ spec: matchLabels: app.kubernetes.io/name: prometheus-operator-prometheus storageClassName: metalk8s + thanos: + image: {% endraw -%}{{ build_image_name("thanos") }}{%- raw %} tolerations: - effect: NoSchedule key: node-role.kubernetes.io/bootstrap diff --git a/salt/metalk8s/addons/prometheus-operator/deployed/init.sls b/salt/metalk8s/addons/prometheus-operator/deployed/init.sls index bee538fef3..c07737e51f 100644 --- a/salt/metalk8s/addons/prometheus-operator/deployed/init.sls +++ b/salt/metalk8s/addons/prometheus-operator/deployed/init.sls @@ -7,3 +7,4 @@ include: - .service-configuration - .chart - .prometheus-rules + - .thanos-chart diff --git a/salt/metalk8s/addons/prometheus-operator/deployed/thanos-chart.sls 
b/salt/metalk8s/addons/prometheus-operator/deployed/thanos-chart.sls new file mode 100644 index 0000000000..9186a2109f --- /dev/null +++ b/salt/metalk8s/addons/prometheus-operator/deployed/thanos-chart.sls @@ -0,0 +1,131 @@ +#!jinja | metalk8s_kubernetes + +{%- from "metalk8s/repo/macro.sls" import build_image_name with context %} + + + +{% raw %} + +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: query + app.kubernetes.io/instance: thanos + app.kubernetes.io/managed-by: salt + app.kubernetes.io/name: thanos + app.kubernetes.io/part-of: metalk8s + app.kubernetes.io/version: 0.17.1 + helm.sh/chart: thanos-0.4.6 + heritage: metalk8s + name: thanos-query-grpc + namespace: metalk8s-monitoring +spec: + clusterIP: None + ports: + - name: grpc + port: 10901 + protocol: TCP + targetPort: grpc + selector: + app.kubernetes.io/component: query + app.kubernetes.io/instance: thanos + app.kubernetes.io/name: thanos + type: ClusterIP +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: query + app.kubernetes.io/instance: thanos + app.kubernetes.io/managed-by: salt + app.kubernetes.io/name: thanos + app.kubernetes.io/part-of: metalk8s + app.kubernetes.io/version: 0.17.1 + helm.sh/chart: thanos-0.4.6 + heritage: metalk8s + name: thanos-query-http + namespace: metalk8s-monitoring +spec: + ports: + - name: http + port: 10902 + protocol: TCP + targetPort: http + selector: + app.kubernetes.io/component: query + app.kubernetes.io/instance: thanos + app.kubernetes.io/name: thanos + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: query + app.kubernetes.io/instance: thanos + app.kubernetes.io/managed-by: salt + app.kubernetes.io/name: thanos + app.kubernetes.io/part-of: metalk8s + app.kubernetes.io/version: 0.17.1 + helm.sh/chart: thanos-0.4.6 + heritage: metalk8s + name: thanos-query + namespace: metalk8s-monitoring +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: query + app.kubernetes.io/instance: thanos + app.kubernetes.io/name: thanos + template: + metadata: + labels: + app.kubernetes.io/component: query + app.kubernetes.io/instance: thanos + app.kubernetes.io/name: thanos + spec: + containers: + - args: + - query + - --log.level=info + - --log.format=logfmt + - --grpc-address=0.0.0.0:10901 + - --http-address=0.0.0.0:10902 + - --query.replica-label=prometheus_replica + - --query.auto-downsampling + - --store.sd-dns-resolver=miekgdns + - --store=dnssrv+_grpc._tcp.prometheus-operator-thanos-discovery + - --store.sd-interval=5m + image: {% endraw -%}{{ build_image_name("thanos", False) }}{%- raw %}:v0.23.1 + imagePullPolicy: IfNotPresent + livenessProbe: + httpGet: + path: /-/healthy + port: http + name: thanos-query + ports: + - containerPort: 10902 + name: http + - containerPort: 10901 + name: grpc + readinessProbe: + httpGet: + path: /-/ready + port: http + resources: {} + volumeMounts: null + nodeSelector: + node-role.kubernetes.io/infra: '' + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/bootstrap + operator: Exists + - effect: NoSchedule + key: node-role.kubernetes.io/infra + operator: Exists + volumes: null + +{% endraw %} diff --git a/salt/metalk8s/addons/ui/deployed/dependencies.sls b/salt/metalk8s/addons/ui/deployed/dependencies.sls index 20b6012d0c..13dc9647b5 100644 --- a/salt/metalk8s/addons/ui/deployed/dependencies.sls +++ b/salt/metalk8s/addons/ui/deployed/dependencies.sls @@ -39,7 +39,7 @@ spec: kind: Service 
apiVersion: v1 metadata: - name: prometheus-api + name: thanos-api namespace: metalk8s-ui labels: app: metalk8s-ui @@ -49,10 +49,10 @@ metadata: heritage: metalk8s spec: type: ExternalName - externalName: prometheus-operator-prometheus.metalk8s-monitoring.svc.cluster.local + externalName: thanos-query-http.metalk8s-monitoring.svc.cluster.local ports: - name: http - port: 9090 + port: 10902 --- kind: Service apiVersion: v1 diff --git a/salt/metalk8s/addons/ui/deployed/ingress.sls b/salt/metalk8s/addons/ui/deployed/ingress.sls index bacfefec03..01386603e5 100644 --- a/salt/metalk8s/addons/ui/deployed/ingress.sls +++ b/salt/metalk8s/addons/ui/deployed/ingress.sls @@ -74,9 +74,9 @@ spec: pathType: Prefix backend: service: - name: prometheus-api + name: thanos-api port: - number: 9090 + number: 10902 - path: /api/alertmanager(/|$)(.*) pathType: Prefix backend: diff --git a/tests/post/features/sanity.feature b/tests/post/features/sanity.feature index 277400d6d5..8acc302e77 100644 --- a/tests/post/features/sanity.feature +++ b/tests/post/features/sanity.feature @@ -41,6 +41,7 @@ Feature: Cluster Sanity Checks | metalk8s-monitoring | prometheus-operator-grafana | | metalk8s-monitoring | prometheus-operator-kube-state-metrics | | metalk8s-monitoring | prometheus-operator-operator | + | metalk8s-monitoring | thanos-query | | metalk8s-ui | metalk8s-ui | Scenario Outline: DaemonSet has desired Pods ready diff --git a/tools/rule_extractor/alerting_rules.csv b/tools/rule_extractor/alerting_rules.csv index b4f2058dc7..0774f0885e 100644 --- a/tools/rule_extractor/alerting_rules.csv +++ b/tools/rule_extractor/alerting_rules.csv @@ -1,121 +1,148 @@ -AlertmanagerFailedReload,critical,Reloading an Alertmanager configuration has failed. -AlertmanagerMembersInconsistent,critical,A member of an Alertmanager cluster has not found all other cluster members. -AlertmanagerFailedToSendAlerts,warning,An Alertmanager instance failed to send notifications. +AlertingServiceAtRisk,critical,The alerting service is at risk. +ClusterAtRisk,critical,The cluster is at risk. +CoreServicesAtRisk,critical,The Core services are at risk. +KubernetesControlPlaneAtRisk,critical,The Kubernetes control plane is at risk. +MonitoringServiceAtRisk,critical,The monitoring service is at risk. +NodeAtRisk,critical,The node {{ $labels.instance }} is at risk. +ObservabilityServicesAtRisk,critical,The observability services are at risk. +PlatformServicesAtRisk,critical,The Platform services are at risk. +SystemPartitionAtRisk,critical,The system partition {{ $labels.mountpoint }} on node {{ $labels.instance }} is at risk. +VolumeAtRisk,critical,The volume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} on node {{ $labels.instance }} is at risk. +AccessServicesDegraded,warning,The Access services are degraded. +AlertingServiceDegraded,warning,The alerting service is degraded. +AuthenticationServiceDegraded,warning,The Authentication service for K8S API is degraded. +BootstrapServicesDegraded,warning,The MetalK8s Bootstrap services are degraded. +ClusterDegraded,warning,The cluster is degraded. +CoreServicesDegraded,warning,The Core services are degraded. +DashboardingServiceDegraded,warning,The dashboarding service is degraded. +IngressControllerServicesDegraded,warning,The Ingress Controllers for control plane and workload plane are degraded. +KubernetesControlPlaneDegraded,warning,The Kubernetes control plane is degraded. +LoggingServiceDegraded,warning,The logging service is degraded. 
+MonitoringServiceDegraded,warning,The monitoring service is degraded. +NetworkDegraded,warning,The network is degraded. +NodeDegraded,warning,The node {{ $labels.instance }} is degraded. +ObservabilityServicesDegraded,warning,The observability services are degraded. +PlatformServicesDegraded,warning,The Platform services are degraded. +SystemPartitionDegraded,warning,The system partition {{ $labels.mountpoint }} on node {{ $labels.instance }} is degraded. +VolumeDegraded,warning,The volume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} on node {{ $labels.instance }} is degraded. +AlertmanagerClusterCrashlooping,critical,Half or more of the Alertmanager instances within the same cluster are crashlooping. +AlertmanagerClusterDown,critical,Half or more of the Alertmanager instances within the same cluster are down. AlertmanagerClusterFailedToSendAlerts,critical,All Alertmanager instances in a cluster failed to send notifications to a critical integration. AlertmanagerClusterFailedToSendAlerts,warning,All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. AlertmanagerConfigInconsistent,critical,Alertmanager instances within the same cluster have different configurations. -AlertmanagerClusterDown,critical,Half or more of the Alertmanager instances within the same cluster are down. -AlertmanagerClusterCrashlooping,critical,Half or more of the Alertmanager instances within the same cluster are crashlooping. -etcdInsufficientMembers,critical,"etcd cluster ""{{ $labels.job }}"": insufficient members ({{ $value }})." -etcdNoLeader,critical,"etcd cluster ""{{ $labels.job }}"": member {{ $labels.instance }} has no leader." -etcdHighNumberOfLeaderChanges,warning,"etcd cluster ""{{ $labels.job }}"": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour." -etcdHighNumberOfFailedGRPCRequests,warning,"etcd cluster ""{{ $labels.job }}"": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}." -etcdHighNumberOfFailedGRPCRequests,critical,"etcd cluster ""{{ $labels.job }}"": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}." +AlertmanagerFailedReload,critical,Reloading an Alertmanager configuration has failed. +AlertmanagerFailedToSendAlerts,warning,An Alertmanager instance failed to send notifications. +AlertmanagerMembersInconsistent,critical,A member of an Alertmanager cluster has not found all other cluster members. etcdGRPCRequestsSlow,critical,"etcd cluster ""{{ $labels.job }}"": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}." -etcdMemberCommunicationSlow,warning,"etcd cluster ""{{ $labels.job }}"": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}." -etcdHighNumberOfFailedProposals,warning,"etcd cluster ""{{ $labels.job }}"": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}." -etcdHighFsyncDurations,warning,"etcd cluster ""{{ $labels.job }}"": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}." +etcdHTTPRequestsSlow,warning,etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow. etcdHighCommitDurations,warning,"etcd cluster ""{{ $labels.job }}"": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}." 
-etcdHighNumberOfFailedHTTPRequests,warning,{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }} +etcdHighFsyncDurations,warning,"etcd cluster ""{{ $labels.job }}"": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}." +etcdHighNumberOfFailedGRPCRequests,critical,"etcd cluster ""{{ $labels.job }}"": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}." +etcdHighNumberOfFailedGRPCRequests,warning,"etcd cluster ""{{ $labels.job }}"": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}." etcdHighNumberOfFailedHTTPRequests,critical,{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}. -etcdHTTPRequestsSlow,warning,etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow. +etcdHighNumberOfFailedHTTPRequests,warning,{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }} +etcdHighNumberOfFailedProposals,warning,"etcd cluster ""{{ $labels.job }}"": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}." +etcdHighNumberOfLeaderChanges,warning,"etcd cluster ""{{ $labels.job }}"": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour." +etcdInsufficientMembers,critical,"etcd cluster ""{{ $labels.job }}"": insufficient members ({{ $value }})." +etcdMemberCommunicationSlow,warning,"etcd cluster ""{{ $labels.job }}"": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}." +etcdNoLeader,critical,"etcd cluster ""{{ $labels.job }}"": member {{ $labels.instance }} has no leader." TargetDown,warning,One or more targets are unreachable. Watchdog,none,An alert that should always be firing to certify that Alertmanager is working properly. -KubeAPIErrorBudgetBurn,critical,The API server is burning too much error budget. -KubeAPIErrorBudgetBurn,critical,The API server is burning too much error budget. KubeAPIErrorBudgetBurn,warning,The API server is burning too much error budget. +KubeAPIErrorBudgetBurn,critical,The API server is burning too much error budget. KubeAPIErrorBudgetBurn,warning,The API server is burning too much error budget. +KubeAPIErrorBudgetBurn,critical,The API server is burning too much error budget. KubeStateMetricsListErrors,critical,kube-state-metrics is experiencing errors in list operations. -KubeStateMetricsWatchErrors,critical,kube-state-metrics is experiencing errors in watch operations. KubeStateMetricsShardingMismatch,critical,kube-state-metrics sharding is misconfigured. KubeStateMetricsShardsMissing,critical,kube-state-metrics shards are missing. -KubePodCrashLooping,warning,Pod is crash looping. -KubePodNotReady,warning,Pod has been in a non-ready state for more than 15 minutes. -KubeDeploymentGenerationMismatch,warning,Deployment generation mismatch due to possible roll-back -KubeDeploymentReplicasMismatch,warning,Deployment has not matched the expected number of replicas. -KubeStatefulSetReplicasMismatch,warning,Deployment has not matched the expected number of replicas. -KubeStatefulSetGenerationMismatch,warning,StatefulSet generation mismatch due to possible roll-back -KubeStatefulSetUpdateNotRolledOut,warning,StatefulSet update has not been rolled out. -KubeDaemonSetRolloutStuck,warning,DaemonSet rollout is stuck. 
+KubeStateMetricsWatchErrors,critical,kube-state-metrics is experiencing errors in watch operations. KubeContainerWaiting,warning,Pod container waiting longer than 1 hour -KubeDaemonSetNotScheduled,warning,DaemonSet pods are not scheduled. KubeDaemonSetMisScheduled,warning,DaemonSet pods are misscheduled. +KubeDaemonSetNotScheduled,warning,DaemonSet pods are not scheduled. +KubeDaemonSetRolloutStuck,warning,DaemonSet rollout is stuck. +KubeDeploymentGenerationMismatch,warning,Deployment generation mismatch due to possible roll-back +KubeDeploymentReplicasMismatch,warning,Deployment has not matched the expected number of replicas. +KubeHpaMaxedOut,warning,HPA is running at max replicas +KubeHpaReplicasMismatch,warning,HPA has not matched descired number of replicas. KubeJobCompletion,warning,Job did not complete in time KubeJobFailed,warning,Job failed to complete. -KubeHpaReplicasMismatch,warning,HPA has not matched descired number of replicas. -KubeHpaMaxedOut,warning,HPA is running at max replicas +KubePodCrashLooping,warning,Pod is crash looping. +KubePodNotReady,warning,Pod has been in a non-ready state for more than 15 minutes. +KubeStatefulSetGenerationMismatch,warning,StatefulSet generation mismatch due to possible roll-back +KubeStatefulSetReplicasMismatch,warning,Deployment has not matched the expected number of replicas. +KubeStatefulSetUpdateNotRolledOut,warning,StatefulSet update has not been rolled out. +CPUThrottlingHigh,info,Processes experience elevated CPU throttling. KubeCPUOvercommit,warning,Cluster has overcommitted CPU resource requests. -KubeMemoryOvercommit,warning,Cluster has overcommitted memory resource requests. KubeCPUQuotaOvercommit,warning,Cluster has overcommitted CPU resource requests. +KubeMemoryOvercommit,warning,Cluster has overcommitted memory resource requests. KubeMemoryQuotaOvercommit,warning,Cluster has overcommitted memory resource requests. KubeQuotaAlmostFull,info,Namespace quota is going to be full. -KubeQuotaFullyUsed,info,Namespace quota is fully used. KubeQuotaExceeded,warning,Namespace quota has exceeded the limits. -CPUThrottlingHigh,info,Processes experience elevated CPU throttling. +KubeQuotaFullyUsed,info,Namespace quota is fully used. +KubePersistentVolumeErrors,critical,PersistentVolume is having issues with provisioning. KubePersistentVolumeFillingUp,critical,PersistentVolume is filling up. KubePersistentVolumeFillingUp,warning,PersistentVolume is filling up. -KubePersistentVolumeErrors,critical,PersistentVolume is having issues with provisioning. -KubeClientCertificateExpiration,warning,Client certificate is about to expire. -KubeClientCertificateExpiration,critical,Client certificate is about to expire. -AggregatedAPIErrors,warning,An aggregated API has reported errors. AggregatedAPIDown,warning,An aggregated API is down. +AggregatedAPIErrors,warning,An aggregated API has reported errors. KubeAPIDown,critical,Target disappeared from Prometheus target discovery. KubeAPITerminatedRequests,warning,The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. +KubeClientCertificateExpiration,critical,Client certificate is about to expire. +KubeClientCertificateExpiration,warning,Client certificate is about to expire. KubeControllerManagerDown,critical,Target disappeared from Prometheus target discovery. KubeNodeNotReady,warning,Node is not ready. -KubeNodeUnreachable,warning,Node is unreachable. -KubeletTooManyPods,warning,Kubelet is running at capacity. 
KubeNodeReadinessFlapping,warning,Node readiness status is flapping. +KubeNodeUnreachable,warning,Node is unreachable. +KubeletClientCertificateExpiration,critical,Kubelet client certificate is about to expire. +KubeletClientCertificateExpiration,warning,Kubelet client certificate is about to expire. +KubeletClientCertificateRenewalErrors,warning,Kubelet has failed to renew its client certificate. +KubeletDown,critical,Target disappeared from Prometheus target discovery. KubeletPlegDurationHigh,warning,Kubelet Pod Lifecycle Event Generator is taking too long to relist. KubeletPodStartUpLatencyHigh,warning,Kubelet Pod startup latency is too high. -KubeletClientCertificateExpiration,warning,Kubelet client certificate is about to expire. -KubeletClientCertificateExpiration,critical,Kubelet client certificate is about to expire. -KubeletServerCertificateExpiration,warning,Kubelet server certificate is about to expire. KubeletServerCertificateExpiration,critical,Kubelet server certificate is about to expire. -KubeletClientCertificateRenewalErrors,warning,Kubelet has failed to renew its client certificate. +KubeletServerCertificateExpiration,warning,Kubelet server certificate is about to expire. KubeletServerCertificateRenewalErrors,warning,Kubelet has failed to renew its server certificate. -KubeletDown,critical,Target disappeared from Prometheus target discovery. +KubeletTooManyPods,warning,Kubelet is running at capacity. KubeSchedulerDown,critical,Target disappeared from Prometheus target discovery. -KubeVersionMismatch,warning,Different semantic versions of Kubernetes components running. KubeClientErrors,warning,Kubernetes API server client is experiencing errors. -NodeFilesystemSpaceFillingUp,warning,Filesystem is predicted to run out of space within the next 24 hours. -NodeFilesystemSpaceFillingUp,critical,Filesystem is predicted to run out of space within the next 4 hours. -NodeFilesystemAlmostOutOfSpace,warning,Filesystem has less than 20% space left. +KubeVersionMismatch,warning,Different semantic versions of Kubernetes components running. +NodeClockNotSynchronising,warning,Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. +NodeClockSkewDetected,warning,Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. +NodeFilesystemAlmostOutOfFiles,critical,Filesystem has less than 8% inodes left. +NodeFilesystemAlmostOutOfFiles,warning,Filesystem has less than 15% inodes left. NodeFilesystemAlmostOutOfSpace,critical,Filesystem has less than 12% space left. -NodeFilesystemFilesFillingUp,warning,Filesystem is predicted to run out of inodes within the next 24 hours. +NodeFilesystemAlmostOutOfSpace,warning,Filesystem has less than 20% space left. NodeFilesystemFilesFillingUp,critical,Filesystem is predicted to run out of inodes within the next 4 hours. -NodeFilesystemAlmostOutOfFiles,warning,Filesystem has less than 15% inodes left. -NodeFilesystemAlmostOutOfFiles,critical,Filesystem has less than 8% inodes left. +NodeFilesystemFilesFillingUp,warning,Filesystem is predicted to run out of inodes within the next 24 hours. +NodeFilesystemSpaceFillingUp,critical,Filesystem is predicted to run out of space within the next 4 hours. +NodeFilesystemSpaceFillingUp,warning,Filesystem is predicted to run out of space within the next 24 hours. 
+NodeHighNumberConntrackEntriesUsed,warning,Number of conntrack are getting close to the limit NodeNetworkReceiveErrs,warning,Network interface is reporting many receive errors. NodeNetworkTransmitErrs,warning,Network interface is reporting many transmit errors. -NodeHighNumberConntrackEntriesUsed,warning,Number of conntrack are getting close to the limit -NodeClockSkewDetected,warning,Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. -NodeClockNotSynchronising,warning,Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. -NodeTextFileCollectorScrapeError,warning,Node Exporter text file collector failed to scrape. NodeRAIDDegraded,critical,RAID Array is degraded NodeRAIDDiskFailure,warning,Failed device in RAID array +NodeTextFileCollectorScrapeError,warning,Node Exporter text file collector failed to scrape. NodeNetworkInterfaceFlapping,warning,"Network interface ""{{ $labels.device }}"" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}" PrometheusOperatorListErrors,warning,Errors while performing list operations in controller. -PrometheusOperatorWatchErrors,warning,Errors while performing watch operations in controller. -PrometheusOperatorSyncFailed,warning,Last controller reconciliation failed -PrometheusOperatorReconcileErrors,warning,Errors while reconciling controller. PrometheusOperatorNodeLookupErrors,warning,Errors while reconciling Prometheus. PrometheusOperatorNotReady,warning,Prometheus operator not ready +PrometheusOperatorReconcileErrors,warning,Errors while reconciling controller. PrometheusOperatorRejectedResources,warning,Resources rejected by Prometheus operator +PrometheusOperatorSyncFailed,warning,Last controller reconciliation failed +PrometheusOperatorWatchErrors,warning,Errors while performing watch operations in controller. PrometheusBadConfig,critical,Failed Prometheus configuration reload. -PrometheusNotificationQueueRunningFull,warning,Prometheus alert notification queue predicted to run full in less than 30m. +PrometheusDuplicateTimestamps,warning,Prometheus is dropping samples with duplicate timestamps. +PrometheusErrorSendingAlertsToAnyAlertmanager,critical,Prometheus encounters more than 3% errors sending alerts to any Alertmanager. PrometheusErrorSendingAlertsToSomeAlertmanagers,warning,Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. +PrometheusLabelLimitHit,warning,Prometheus has dropped targets because some scrape configs have exceeded the labels limit. +PrometheusMissingRuleEvaluations,warning,Prometheus is missing rule evaluations due to slow rule group evaluation. PrometheusNotConnectedToAlertmanagers,warning,Prometheus is not connected to any Alertmanagers. -PrometheusTSDBReloadsFailing,warning,Prometheus has issues reloading blocks from disk. -PrometheusTSDBCompactionsFailing,warning,Prometheus has issues compacting blocks. PrometheusNotIngestingSamples,warning,Prometheus is not ingesting samples. -PrometheusDuplicateTimestamps,warning,Prometheus is dropping samples with duplicate timestamps. +PrometheusNotificationQueueRunningFull,warning,Prometheus alert notification queue predicted to run full in less than 30m. PrometheusOutOfOrderTimestamps,warning,Prometheus drops samples with out-of-order timestamps. PrometheusRemoteStorageFailures,critical,Prometheus fails to send samples to remote storage. PrometheusRemoteWriteBehind,critical,Prometheus remote write is behind. 
PrometheusRemoteWriteDesiredShards,warning,Prometheus remote write desired shards calculation wants to run more than configured max shards. PrometheusRuleFailures,critical,Prometheus is failing rule evaluations. -PrometheusMissingRuleEvaluations,warning,Prometheus is missing rule evaluations due to slow rule group evaluation. +PrometheusTSDBCompactionsFailing,warning,Prometheus has issues compacting blocks. +PrometheusTSDBReloadsFailing,warning,Prometheus has issues reloading blocks from disk. PrometheusTargetLimitHit,warning,Prometheus has dropped targets because some scrape configs have exceeded the targets limit. -PrometheusLabelLimitHit,warning,Prometheus has dropped targets because some scrape configs have exceeded the labels limit. -PrometheusErrorSendingAlertsToAnyAlertmanager,critical,Prometheus encounters more than 3% errors sending alerts to any Alertmanager. diff --git a/tools/rule_extractor/alerting_rules.json b/tools/rule_extractor/alerting_rules.json index 620b17beaf..2bb7e8d437 100644 --- a/tools/rule_extractor/alerting_rules.json +++ b/tools/rule_extractor/alerting_rules.json @@ -1,890 +1,890 @@ [ - { - "name": "ClusterAtRisk", - "severity": "critical", - "message": "The cluster is at risk.", - "query": "(ALERTS{alertname=\"NodeAtRisk\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PlatformServicesAtRisk\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"VolumeAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1" - }, - { - "name": "NodeAtRisk", - "severity": "critical", - "message": "The node {{ $labels.instance }} is at risk.", - "query": "(ALERTS{alertname=\"KubeletClientCertificateExpiration\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"NodeRAIDDegraded\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"SystemPartitionAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1" - }, - { - "name": "SystemPartitionAtRisk", - "severity": "critical", - "message": "The system partition {{ $labels.mountpoint }} on node {{ $labels.instance }} is at risk.", - "query": "(ALERTS{alertname=\"NodeFilesystemAlmostOutOfSpace\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"NodeFilesystemAlmostOutOfFiles\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"NodeFilesystemFilesFillingUp\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"NodeFilesystemSpaceFillingUp\",alertstate=\"firing\",severity=\"critical\"}) >= 1" - }, - { - "name": "PlatformServicesAtRisk", - "severity": "critical", - "message": "The Platform services are at risk.", - "query": "(ALERTS{alertname=\"CoreServicesAtRisk\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"ObservabilityServicesAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1" - }, - { - "name": "CoreServicesAtRisk", - "severity": "critical", - "message": "The Core services are at risk.", - "query": "(ALERTS{alertname=\"KubernetesControlPlaneAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1" - }, - { - "name": "KubernetesControlPlaneAtRisk", - "severity": "critical", - "message": "The Kubernetes control plane is at risk.", - "query": "(ALERTS{alertname=\"KubeAPIErrorBudgetBurn\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdHighNumberOfFailedGRPCRequests\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdGRPCRequestsSlow\",alertstate=\"firing\",severity=\"critical\"} or 
ALERTS{alertname=\"etcdHighNumberOfFailedHTTPRequests\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdInsufficientMembers\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdMembersDown\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdNoLeader\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeStateMetricsListErrors\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeStateMetricsWatchErrors\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeAPIDown\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeClientCertificateExpiration\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeControllerManagerDown\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeletDown\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeSchedulerDown\",alertstate=\"firing\",severity=\"critical\"}) >= 1" - }, - { - "name": "ObservabilityServicesAtRisk", - "severity": "critical", - "message": "The observability services are at risk.", - "query": "(ALERTS{alertname=\"MonitoringServiceAtRisk\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"AlertingServiceAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1" - }, - { - "name": "MonitoringServiceAtRisk", - "severity": "critical", - "message": "The monitoring service is at risk.", - "query": "(ALERTS{alertname=\"PrometheusRuleFailures\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PrometheusRemoteWriteBehind\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PrometheusRemoteStorageFailures\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PrometheusErrorSendingAlertsToAnyAlertmanager\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PrometheusBadConfig\",alertstate=\"firing\",severity=\"critical\"}) >= 1" - }, - { - "name": "AlertingServiceAtRisk", - "severity": "critical", - "message": "The alerting service is at risk.", - "query": "(ALERTS{alertname=\"AlertmanagerConfigInconsistent\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"AlertmanagerMembersInconsistent\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"AlertmanagerFailedReload\",alertstate=\"firing\",severity=\"critical\"}) >= 1" - }, - { - "name": "VolumeAtRisk", - "severity": "critical", - "message": "The volume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} on node {{ $labels.instance }} is at risk.", - "query": "sum by(persistentvolumeclaim, namespace, instance) (ALERTS{alertname=\"KubePersistentVolumeFillingUp\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubePersistentVolumeErrors\",alertstate=\"firing\",severity=\"critical\"}) >= 1" - }, - { - "name": "ClusterDegraded", - "severity": "warning", - "message": "The cluster is degraded.", - "query": "(ALERTS{alertname=\"NetworkDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PlatformServicesDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"VolumeDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1" - }, - { - "name": "NetworkDegraded", - "severity": "warning", - "message": "The network is degraded.", - "query": "(ALERTS{alertname=\"NodeNetworkReceiveErrs\",alertstate=\"firing\",severity=\"warning\"} or 
ALERTS{alertname=\"NodeHighNumberConntrackEntriesUsed\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeNetworkTransmitErrs\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeNetworkInterfaceFlapping\",alertstate=\"firing\",severity=\"warning\"}) >= 1" - }, - { - "name": "NodeDegraded", - "severity": "warning", - "message": "The node {{ $labels.instance }} is degraded.", - "query": "(ALERTS{alertname=\"KubeNodeNotReady\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeNodeReadinessFlapping\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeNodeUnreachable\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletClientCertificateExpiration\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletClientCertificateRenewalErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletPlegDurationHigh\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletPodStartUpLatencyHigh\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletServerCertificateExpiration\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletServerCertificateRenewalErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletTooManyPods\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeClockNotSynchronising\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeClockSkewDetected\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeRAIDDiskFailure\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeTextFileCollectorScrapeError\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"SystemPartitionDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1" - }, - { - "name": "SystemPartitionDegraded", - "severity": "warning", - "message": "The system partition {{ $labels.mountpoint }} on node {{ $labels.instance }} is degraded.", - "query": "(ALERTS{alertname=\"NodeFilesystemAlmostOutOfSpace\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeFilesystemAlmostOutOfFiles\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeFilesystemFilesFillingUp\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeFilesystemSpaceFillingUp\",alertstate=\"firing\",severity=\"warning\"}) >= 1" - }, - { - "name": "PlatformServicesDegraded", - "severity": "warning", - "message": "The Platform services are degraded.", - "query": "(ALERTS{alertname=\"AccessServicesDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"CoreServicesDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"ObservabilityServicesDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1" - }, - { - "name": "AccessServicesDegraded", - "severity": "warning", - "message": "The Access services are degraded.", - "query": "(ALERTS{alertname=\"AuthenticationServiceDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"IngressControllerServicesDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1" - }, - { - "name": "AuthenticationServiceDegraded", - "severity": "warning", - "message": "The Authentication service for K8S API is degraded.", - "query": "(ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"dex\",namespace=~\"metalk8s-auth\",severity=\"warning\"} or 
ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"dex\",namespace=~\"metalk8s-auth\",severity=\"warning\"}) >= 1" - }, - { - "name": "IngressControllerServicesDegraded", - "severity": "warning", - "message": "The Ingress Controllers for control plane and workload plane are degraded.", - "query": "(ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"ingress-nginx-defaultbackend\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"ingress-nginx-defaultbackend\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetNotScheduled\",alertstate=\"firing\",daemonset=~\"ingress-nginx-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetMisScheduled\",alertstate=\"firing\",daemonset=~\"ingress-nginx-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetRolloutStuck\",alertstate=\"firing\",daemonset=~\"ingress-nginx-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetNotScheduled\",alertstate=\"firing\",daemonset=~\"ingress-nginx-control-plane-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetMisScheduled\",alertstate=\"firing\",daemonset=~\"ingress-nginx-control-plane-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetRolloutStuck\",alertstate=\"firing\",daemonset=~\"ingress-nginx-control-plane-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"}) >= 1" - }, - { - "name": "CoreServicesDegraded", - "severity": "warning", - "message": "The Core services are degraded.", - "query": "(ALERTS{alertname=\"KubernetesControlPlaneDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"BootstrapServicesDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1" - }, - { - "name": "KubernetesControlPlaneDegraded", - "severity": "warning", - "message": "The Kubernetes control plane is degraded.", - "query": "(ALERTS{alertname=\"KubeAPIErrorBudgetBurn\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedGRPCRequests\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHTTPRequestsSlow\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighCommitDurations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighFsyncDurations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedHTTPRequests\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedProposals\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfLeaderChanges\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdMemberCommunicationSlow\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeCPUOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeCPUQuotaOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeMemoryOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeMemoryQuotaOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeClientCertificateExpiration\",alertstate=\"firing\",severity=\"warning\"} or 
ALERTS{alertname=\"KubeClientErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeVersionMismatch\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"coredns\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"coredns\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-adapter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-adapter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-kube-state-metrics\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-kube-state-metrics\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"}) >= 1" - }, - { - "name": "BootstrapServicesDegraded", - "severity": "warning", - "message": "The MetalK8s Bootstrap services are degraded.", - "query": "(ALERTS{alertname=\"KubePodNotReady\",alertstate=\"firing\",namespace=~\"kube-system\",pod=~\"repositories-.*\",severity=\"warning\"} or ALERTS{alertname=\"KubePodNotReady\",alertstate=\"firing\",namespace=~\"kube-system\",pod=~\"salt-master-.*\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"storage-operator\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"storage-operator\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"metalk8s-ui\",namespace=~\"metalk8s-ui\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"metalk8s-ui\",namespace=~\"metalk8s-ui\",severity=\"warning\"}) >= 1" - }, - { - "name": "ObservabilityServicesDegraded", - "severity": "warning", - "message": "The observability services are degraded.", - "query": "(ALERTS{alertname=\"MonitoringServiceDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"AlertingServiceDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"LoggingServiceDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"DashboardingServiceDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1" - }, - { - "name": "MonitoringServiceDegraded", - "severity": "warning", - "message": "The monitoring service is degraded.", - "query": "(ALERTS{alertname=\"PrometheusTargetLimitHit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusTSDBReloadsFailing\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusTSDBCompactionsFailing\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusRemoteWriteDesiredShards\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOutOfOrderTimestamps\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusNotificationQueueRunningFull\",alertstate=\"firing\",severity=\"warning\"} or 
ALERTS{alertname=\"PrometheusNotIngestingSamples\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusNotConnectedToAlertmanagers\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusMissingRuleEvaluations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusErrorSendingAlertsToSomeAlertmanagers\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusDuplicateTimestamps\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorWatchErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorSyncFailed\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorRejectedResources\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorReconcileErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorNotReady\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorNodeLookupErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorListErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeStatefulSetReplicasMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"prometheus-prometheus-operator-prometheus\"} or ALERTS{alertname=\"KubeStatefulSetGenerationMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"prometheus-prometheus-operator-prometheus\"} or ALERTS{alertname=\"KubeStatefulSetUpdateNotRolledOut\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"prometheus-prometheus-operator-prometheus\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-operator\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-operator\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetNotScheduled\",alertstate=\"firing\",daemonset=~\"prometheus-operator-prometheus-node-exporter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetMisScheduled\",alertstate=\"firing\",daemonset=~\"prometheus-operator-prometheus-node-exporter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetRolloutStuck\",alertstate=\"firing\",daemonset=~\"prometheus-operator-prometheus-node-exporter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"}) >= 1" - }, - { - "name": "AlertingServiceDegraded", - "severity": "warning", - "message": "The alerting service is degraded.", - "query": "(ALERTS{alertname=\"AlertmanagerFailedReload\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeStatefulSetReplicasMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"alertmanager-prometheus-operator-alertmanager\"} or ALERTS{alertname=\"KubeStatefulSetGenerationMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"alertmanager-prometheus-operator-alertmanager\"} or ALERTS{alertname=\"KubeStatefulSetUpdateNotRolledOut\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"alertmanager-prometheus-operator-alertmanager\"}) 
>= 1" - }, - { - "name": "LoggingServiceDegraded", - "severity": "warning", - "message": "The logging service is degraded.", - "query": "(ALERTS{alertname=\"KubeStatefulSetReplicasMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-logging\",severity=\"warning\",statefulset=~\"loki\"} or ALERTS{alertname=\"KubeStatefulSetGenerationMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-logging\",severity=\"warning\",statefulset=~\"loki\"} or ALERTS{alertname=\"KubeStatefulSetUpdateNotRolledOut\",alertstate=\"firing\",namespace=~\"metalk8s-logging\",severity=\"warning\",statefulset=~\"loki\"} or ALERTS{alertname=\"KubeDaemonSetNotScheduled\",alertstate=\"firing\",daemonset=~\"fluentbit\",namespace=~\"metalk8s-logging\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetMisScheduled\",alertstate=\"firing\",daemonset=~\"fluentbit\",namespace=~\"metalk8s-logging\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetRolloutStuck\",alertstate=\"firing\",daemonset=~\"fluentbit\",namespace=~\"metalk8s-logging\",severity=\"warning\"}) >= 1" - }, - { - "name": "DashboardingServiceDegraded", - "severity": "warning", - "message": "The dashboarding service is degraded.", - "query": "(ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-grafana\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-grafana\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"}) >= 1" - }, - { - "name": "VolumeDegraded", - "severity": "warning", - "message": "The volume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} on node {{ $labels.instance }} is degraded.", - "query": "sum by(persistentvolumeclaim, namespace, instance) (ALERTS{alertname=\"KubePersistentVolumeFillingUp\",alertstate=\"firing\",severity=\"warning\"}) >= 1" - }, - { - "name": "AlertmanagerFailedReload", - "severity": "critical", - "message": "Reloading an Alertmanager configuration has failed.", - "query": "max_over_time(alertmanager_config_last_reload_successful{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) == 0" - }, - { - "name": "AlertmanagerMembersInconsistent", - "severity": "critical", - "message": "A member of an Alertmanager cluster has not found all other cluster members.", - "query": "max_over_time(alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) < on(namespace, service) group_left() count by(namespace, service) (max_over_time(alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]))" - }, - { - "name": "AlertmanagerFailedToSendAlerts", - "severity": "warning", - "message": "An Alertmanager instance failed to send notifications.", - "query": "(rate(alertmanager_notifications_failed_total{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01" - }, - { - "name": "AlertmanagerClusterFailedToSendAlerts", - "severity": "critical", - "message": "All Alertmanager instances in a cluster failed to send notifications to a critical integration.", - "query": "min by(namespace, service, integration) (rate(alertmanager_notifications_failed_total{integration=~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / 
rate(alertmanager_notifications_total{integration=~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01" - }, - { - "name": "AlertmanagerClusterFailedToSendAlerts", - "severity": "warning", - "message": "All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.", - "query": "min by(namespace, service, integration) (rate(alertmanager_notifications_failed_total{integration!~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{integration!~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01" - }, - { - "name": "AlertmanagerConfigInconsistent", - "severity": "critical", - "message": "Alertmanager instances within the same cluster have different configurations.", - "query": "count by(namespace, service) (count_values by(namespace, service) (\"config_hash\", alertmanager_config_hash{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) != 1" - }, - { - "name": "AlertmanagerClusterDown", - "severity": "critical", - "message": "Half or more of the Alertmanager instances within the same cluster are down.", - "query": "(count by(namespace, service) (avg_over_time(up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) < 0.5) / count by(namespace, service) (up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) >= 0.5" - }, - { - "name": "AlertmanagerClusterCrashlooping", - "severity": "critical", - "message": "Half or more of the Alertmanager instances within the same cluster are crashlooping.", - "query": "(count by(namespace, service) (changes(process_start_time_seconds{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[10m]) > 4) / count by(namespace, service) (up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) >= 0.5" - }, - { - "name": "etcdInsufficientMembers", - "severity": "critical", - "message": "etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value }}).", - "query": "sum by(job) (up{job=~\".*etcd.*\"} == bool 1) < ((count by(job) (up{job=~\".*etcd.*\"}) + 1) / 2)" - }, - { - "name": "etcdNoLeader", - "severity": "critical", - "message": "etcd cluster \"{{ $labels.job }}\": member {{ $labels.instance }} has no leader.", - "query": "etcd_server_has_leader{job=~\".*etcd.*\"} == 0" - }, - { - "name": "etcdHighNumberOfLeaderChanges", - "severity": "warning", - "message": "etcd cluster \"{{ $labels.job }}\": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.", - "query": "rate(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}[15m]) > 3" - }, - { - "name": "etcdHighNumberOfFailedGRPCRequests", - "severity": "warning", - "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.", - "query": "100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!=\"OK\",job=~\".*etcd.*\"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) > 1" - }, - { - "name": "etcdHighNumberOfFailedGRPCRequests", - "severity": "critical", - "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.", - "query": "100 * sum by(job, instance, 
grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!=\"OK\",job=~\".*etcd.*\"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) > 5" - }, - { - "name": "etcdGRPCRequestsSlow", - "severity": "critical", - "message": "etcd cluster \"{{ $labels.job }}\": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.", - "query": "histogram_quantile(0.99, sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type=\"unary\",job=~\".*etcd.*\"}[5m]))) > 0.15" - }, - { - "name": "etcdMemberCommunicationSlow", - "severity": "warning", - "message": "etcd cluster \"{{ $labels.job }}\": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.", - "query": "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.15" - }, - { - "name": "etcdHighNumberOfFailedProposals", - "severity": "warning", - "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.", - "query": "rate(etcd_server_proposals_failed_total{job=~\".*etcd.*\"}[15m]) > 5" - }, - { - "name": "etcdHighFsyncDurations", - "severity": "warning", - "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.", - "query": "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.5" - }, - { - "name": "etcdHighCommitDurations", - "severity": "warning", - "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.", - "query": "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.25" - }, - { - "name": "etcdHighNumberOfFailedHTTPRequests", - "severity": "warning", - "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", - "query": "sum by(method) (rate(etcd_http_failed_total{code!=\"404\",job=~\".*etcd.*\"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m])) > 0.01" - }, - { - "name": "etcdHighNumberOfFailedHTTPRequests", - "severity": "critical", - "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.", - "query": "sum by(method) (rate(etcd_http_failed_total{code!=\"404\",job=~\".*etcd.*\"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m])) > 0.05" - }, - { - "name": "etcdHTTPRequestsSlow", - "severity": "warning", - "message": "etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.", - "query": "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15" - }, - { - "name": "TargetDown", - "severity": "warning", - "message": "One or more targets are unreachable.", - "query": "100 * (count by(job, namespace, service) (up == 0) / count by(job, namespace, service) (up)) > 10" - }, - { - "name": "Watchdog", - "severity": "none", - "message": "An alert that should always be firing to certify that Alertmanager is working properly.", - "query": "vector(1)" - }, - { - "name": "KubeAPIErrorBudgetBurn", - "severity": "critical", - "message": "The API server is burning too much error budget.", - "query": 
"sum(apiserver_request:burnrate1h) > (14.4 * 0.01) and sum(apiserver_request:burnrate5m) > (14.4 * 0.01)" - }, - { - "name": "KubeAPIErrorBudgetBurn", - "severity": "critical", - "message": "The API server is burning too much error budget.", - "query": "sum(apiserver_request:burnrate6h) > (6 * 0.01) and sum(apiserver_request:burnrate30m) > (6 * 0.01)" - }, - { - "name": "KubeAPIErrorBudgetBurn", - "severity": "warning", - "message": "The API server is burning too much error budget.", - "query": "sum(apiserver_request:burnrate1d) > (3 * 0.01) and sum(apiserver_request:burnrate2h) > (3 * 0.01)" - }, - { - "name": "KubeAPIErrorBudgetBurn", - "severity": "warning", - "message": "The API server is burning too much error budget.", - "query": "sum(apiserver_request:burnrate3d) > (1 * 0.01) and sum(apiserver_request:burnrate6h) > (1 * 0.01)" - }, - { - "name": "KubeStateMetricsListErrors", - "severity": "critical", - "message": "kube-state-metrics is experiencing errors in list operations.", - "query": "(sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) / sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"}[5m]))) > 0.01" - }, - { - "name": "KubeStateMetricsWatchErrors", - "severity": "critical", - "message": "kube-state-metrics is experiencing errors in watch operations.", - "query": "(sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) / sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"}[5m]))) > 0.01" - }, - { - "name": "KubeStateMetricsShardingMismatch", - "severity": "critical", - "message": "kube-state-metrics sharding is misconfigured.", - "query": "stdvar(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) != 0" - }, - { - "name": "KubeStateMetricsShardsMissing", - "severity": "critical", - "message": "kube-state-metrics shards are missing.", - "query": "2 ^ max(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) - 1 - sum(2 ^ max by(shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"kube-state-metrics\"})) != 0" - }, - { - "name": "KubePodCrashLooping", - "severity": "warning", - "message": "Pod is crash looping.", - "query": "increase(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) > 0" - }, - { - "name": "KubePodNotReady", - "severity": "warning", - "message": "Pod has been in a non-ready state for more than 15 minutes.", - "query": "sum by(namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\",namespace=~\".*\",phase=~\"Pending|Unknown\"}) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"Job\"}))) > 0" - }, - { - "name": "KubeDeploymentGenerationMismatch", - "severity": "warning", - "message": "Deployment generation mismatch due to possible roll-back", - "query": "kube_deployment_status_observed_generation{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_deployment_metadata_generation{job=\"kube-state-metrics\",namespace=~\".*\"}" - }, - { - "name": "KubeDeploymentReplicasMismatch", - "severity": "warning", - "message": "Deployment has not matched the expected number of replicas.", - "query": "(kube_deployment_spec_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_deployment_status_replicas_available{job=\"kube-state-metrics\",namespace=~\".*\"}) and (changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) == 0)" - }, - { - 
"name": "KubeStatefulSetReplicasMismatch", - "severity": "warning", - "message": "Deployment has not matched the expected number of replicas.", - "query": "(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) == 0)" - }, - { - "name": "KubeStatefulSetGenerationMismatch", - "severity": "warning", - "message": "StatefulSet generation mismatch due to possible roll-back", - "query": "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_metadata_generation{job=\"kube-state-metrics\",namespace=~\".*\"}" - }, - { - "name": "KubeStatefulSetUpdateNotRolledOut", - "severity": "warning", - "message": "StatefulSet update has not been rolled out.", - "query": "(max without(revision) (kube_statefulset_status_current_revision{job=\"kube-state-metrics\",namespace=~\".*\"} unless kube_statefulset_status_update_revision{job=\"kube-state-metrics\",namespace=~\".*\"}) * (kube_statefulset_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"})) and (changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[5m]) == 0)" - }, - { - "name": "KubeDaemonSetRolloutStuck", - "severity": "warning", - "message": "DaemonSet rollout is stuck.", - "query": "((kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}) or (kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != 0) or (kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}) or (kube_daemonset_status_number_available{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"})) and (changes(kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}[5m]) == 0)" - }, - { - "name": "KubeContainerWaiting", - "severity": "warning", - "message": "Pod container waiting longer than 1 hour", - "query": "sum by(namespace, pod, container) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\",namespace=~\".*\"}) > 0" - }, - { - "name": "KubeDaemonSetNotScheduled", - "severity": "warning", - "message": "DaemonSet pods are not scheduled.", - "query": "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} - kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} > 0" - }, - { - "name": "KubeDaemonSetMisScheduled", - "severity": "warning", - "message": "DaemonSet pods are misscheduled.", - "query": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\",namespace=~\".*\"} > 0" - }, - { - "name": "KubeJobCompletion", - "severity": "warning", - "message": "Job did not complete in time", - "query": "kube_job_spec_completions{job=\"kube-state-metrics\",namespace=~\".*\"} - kube_job_status_succeeded{job=\"kube-state-metrics\",namespace=~\".*\"} > 0" - }, - { - "name": "KubeJobFailed", - "severity": "warning", - "message": "Job failed to complete.", - "query": 
"kube_job_failed{job=\"kube-state-metrics\",namespace=~\".*\"} > 0" - }, - { - "name": "KubeHpaReplicasMismatch", - "severity": "warning", - "message": "HPA has not matched descired number of replicas.", - "query": "(kube_hpa_status_desired_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} > kube_hpa_spec_min_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} < kube_hpa_spec_max_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and changes(kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}[15m]) == 0" - }, - { - "name": "KubeHpaMaxedOut", - "severity": "warning", - "message": "HPA is running at max replicas", - "query": "kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} == kube_hpa_spec_max_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}" - }, - { - "name": "KubeCPUOvercommit", - "severity": "warning", - "message": "Cluster has overcommitted CPU resource requests.", - "query": "sum(namespace_cpu:kube_pod_container_resource_requests:sum) / sum(kube_node_status_allocatable{resource=\"cpu\"}) > ((count(kube_node_status_allocatable{resource=\"cpu\"}) > 1) - 1) / count(kube_node_status_allocatable{resource=\"cpu\"})" - }, - { - "name": "KubeMemoryOvercommit", - "severity": "warning", - "message": "Cluster has overcommitted memory resource requests.", - "query": "sum(namespace_memory:kube_pod_container_resource_requests:sum) / sum(kube_node_status_allocatable{resource=\"memory\"}) > ((count(kube_node_status_allocatable{resource=\"memory\"}) > 1) - 1) / count(kube_node_status_allocatable{resource=\"memory\"})" - }, - { - "name": "KubeCPUQuotaOvercommit", - "severity": "warning", - "message": "Cluster has overcommitted CPU resource requests.", - "query": "sum(kube_resourcequota{job=\"kube-state-metrics\",resource=\"cpu\",type=\"hard\"}) / sum(kube_node_status_allocatable{resource=\"cpu\"}) > 1.5" - }, - { - "name": "KubeMemoryQuotaOvercommit", - "severity": "warning", - "message": "Cluster has overcommitted memory resource requests.", - "query": "sum(kube_resourcequota{job=\"kube-state-metrics\",resource=\"memory\",type=\"hard\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"memory\"}) > 1.5" - }, - { - "name": "KubeQuotaAlmostFull", - "severity": "info", - "message": "Namespace quota is going to be full.", - "query": "kube_resourcequota{job=\"kube-state-metrics\",type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\",type=\"hard\"} > 0) > 0.9 < 1" - }, - { - "name": "KubeQuotaFullyUsed", - "severity": "info", - "message": "Namespace quota is fully used.", - "query": "kube_resourcequota{job=\"kube-state-metrics\",type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\",type=\"hard\"} > 0) == 1" - }, - { - "name": "KubeQuotaExceeded", - "severity": "warning", - "message": "Namespace quota has exceeded the limits.", - "query": "kube_resourcequota{job=\"kube-state-metrics\",type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\",type=\"hard\"} > 0) > 1" - }, - { - "name": "CPUThrottlingHigh", - "severity": "info", - "message": "Processes experience elevated CPU throttling.", - "query": "sum by(container, pod, namespace) 
(increase(container_cpu_cfs_throttled_periods_total{container!=\"\"}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > (25 / 100)" - }, - { - "name": "KubePersistentVolumeFillingUp", - "severity": "critical", - "message": "PersistentVolume is filling up.", - "query": "kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} < 0.03" - }, - { - "name": "KubePersistentVolumeFillingUp", - "severity": "warning", - "message": "PersistentVolume is filling up.", - "query": "(kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"}) < 0.15 and predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"}[6h], 4 * 24 * 3600) < 0" - }, - { - "name": "KubePersistentVolumeErrors", - "severity": "critical", - "message": "PersistentVolume is having issues with provisioning.", - "query": "kube_persistentvolume_status_phase{job=\"kube-state-metrics\",phase=~\"Failed|Pending\"} > 0" - }, - { - "name": "KubeClientCertificateExpiration", - "severity": "warning", - "message": "Client certificate is about to expire.", - "query": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800" - }, - { - "name": "KubeClientCertificateExpiration", - "severity": "critical", - "message": "Client certificate is about to expire.", - "query": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 86400" - }, - { - "name": "AggregatedAPIErrors", - "severity": "warning", - "message": "An aggregated API has reported errors.", - "query": "sum by(name, namespace) (increase(aggregator_unavailable_apiservice_total[10m])) > 4" - }, - { - "name": "AggregatedAPIDown", - "severity": "warning", - "message": "An aggregated API is down.", - "query": "(1 - max by(name, namespace) (avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85" - }, - { - "name": "KubeAPIDown", - "severity": "critical", - "message": "Target disappeared from Prometheus target discovery.", - "query": "absent(up{job=\"apiserver\"} == 1)" - }, - { - "name": "KubeAPITerminatedRequests", - "severity": "warning", - "message": "The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.", - "query": "sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m])) / (sum(rate(apiserver_request_total{job=\"apiserver\"}[10m])) + sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m]))) > 0.2" - }, - { - "name": "KubeControllerManagerDown", - "severity": "critical", - "message": "Target disappeared from Prometheus target discovery.", - "query": "absent(up{job=\"kube-controller-manager\"} == 1)" - }, - { - "name": "KubeNodeNotReady", - "severity": "warning", - "message": "Node is not ready.", - "query": "kube_node_status_condition{condition=\"Ready\",job=\"kube-state-metrics\",status=\"true\"} == 0" - }, - { - "name": "KubeNodeUnreachable", - "severity": "warning", - "message": "Node is unreachable.", - "query": 
"(kube_node_spec_taint{effect=\"NoSchedule\",job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\"} unless ignoring(key, value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1" - }, - { - "name": "KubeletTooManyPods", - "severity": "warning", - "message": "Kubelet is running at capacity.", - "query": "count by(node) ((kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance, pod, namespace, cluster) group_left(node) topk by(instance, pod, namespace, cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})) / max by(node) (kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1) > 0.95" - }, - { - "name": "KubeNodeReadinessFlapping", - "severity": "warning", - "message": "Node readiness status is flapping.", - "query": "sum by(node) (changes(kube_node_status_condition{condition=\"Ready\",status=\"true\"}[15m])) > 2" - }, - { - "name": "KubeletPlegDurationHigh", - "severity": "warning", - "message": "Kubelet Pod Lifecycle Event Generator is taking too long to relist.", - "query": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} >= 10" - }, - { - "name": "KubeletPodStartUpLatencyHigh", - "severity": "warning", - "message": "Kubelet Pod startup latency is too high.", - "query": "histogram_quantile(0.99, sum by(instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\",metrics_path=\"/metrics\"}[5m]))) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\",metrics_path=\"/metrics\"} > 60" - }, - { - "name": "KubeletClientCertificateExpiration", - "severity": "warning", - "message": "Kubelet client certificate is about to expire.", - "query": "kubelet_certificate_manager_client_ttl_seconds < 604800" - }, - { - "name": "KubeletClientCertificateExpiration", - "severity": "critical", - "message": "Kubelet client certificate is about to expire.", - "query": "kubelet_certificate_manager_client_ttl_seconds < 86400" - }, - { - "name": "KubeletServerCertificateExpiration", - "severity": "warning", - "message": "Kubelet server certificate is about to expire.", - "query": "kubelet_certificate_manager_server_ttl_seconds < 604800" - }, - { - "name": "KubeletServerCertificateExpiration", - "severity": "critical", - "message": "Kubelet server certificate is about to expire.", - "query": "kubelet_certificate_manager_server_ttl_seconds < 86400" - }, - { - "name": "KubeletClientCertificateRenewalErrors", - "severity": "warning", - "message": "Kubelet has failed to renew its client certificate.", - "query": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0" - }, - { - "name": "KubeletServerCertificateRenewalErrors", - "severity": "warning", - "message": "Kubelet has failed to renew its server certificate.", - "query": "increase(kubelet_server_expiration_renew_errors[5m]) > 0" - }, - { - "name": "KubeletDown", - "severity": "critical", - "message": "Target disappeared from Prometheus target discovery.", - "query": "absent(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1)" - }, - { - "name": "KubeSchedulerDown", - "severity": "critical", - "message": "Target disappeared from Prometheus target discovery.", - "query": "absent(up{job=\"kube-scheduler\"} == 1)" - }, - { - "name": "KubeVersionMismatch", - "severity": "warning", - "message": "Different semantic versions of Kubernetes components running.", - "query": "count(count 
by(git_version) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"}, \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1" - }, - { - "name": "KubeClientErrors", - "severity": "warning", - "message": "Kubernetes API server client is experiencing errors.", - "query": "(sum by(instance, job) (rate(rest_client_requests_total{code=~\"5..\"}[5m])) / sum by(instance, job) (rate(rest_client_requests_total[5m]))) > 0.01" - }, - { - "name": "NodeFilesystemSpaceFillingUp", - "severity": "warning", - "message": "Filesystem is predicted to run out of space within the next 24 hours.", - "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 40 and predict_linear(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)" - }, - { - "name": "NodeFilesystemSpaceFillingUp", - "severity": "critical", - "message": "Filesystem is predicted to run out of space within the next 4 hours.", - "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and predict_linear(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)" - }, - { - "name": "NodeFilesystemAlmostOutOfSpace", - "severity": "warning", - "message": "Filesystem has less than 20% space left.", - "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)" - }, - { - "name": "NodeFilesystemAlmostOutOfSpace", - "severity": "critical", - "message": "Filesystem has less than 12% space left.", - "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 12 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)" - }, - { - "name": "NodeFilesystemFilesFillingUp", - "severity": "warning", - "message": "Filesystem is predicted to run out of inodes within the next 24 hours.", - "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 40 and predict_linear(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)" - }, - { - "name": "NodeFilesystemFilesFillingUp", - "severity": "critical", - "message": "Filesystem is predicted to run out of inodes within the next 4 hours.", - "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and predict_linear(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)" - }, - { - "name": "NodeFilesystemAlmostOutOfFiles", - "severity": "warning", - "message": "Filesystem has less than 15% inodes left.", - "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 15 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)" - }, - { - "name": "NodeFilesystemAlmostOutOfFiles", - "severity": 
"critical", - "message": "Filesystem has less than 8% inodes left.", - "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 8 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)" - }, - { - "name": "NodeNetworkReceiveErrs", - "severity": "warning", - "message": "Network interface is reporting many receive errors.", - "query": "increase(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01" - }, - { - "name": "NodeNetworkTransmitErrs", - "severity": "warning", - "message": "Network interface is reporting many transmit errors.", - "query": "increase(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01" - }, - { - "name": "NodeHighNumberConntrackEntriesUsed", - "severity": "warning", - "message": "Number of conntrack are getting close to the limit", - "query": "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75" - }, - { - "name": "NodeClockSkewDetected", - "severity": "warning", - "message": "Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.", - "query": "(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)" - }, - { - "name": "NodeClockNotSynchronising", - "severity": "warning", - "message": "Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.", - "query": "min_over_time(node_timex_sync_status[5m]) == 0 and node_timex_maxerror_seconds >= 16" - }, - { - "name": "NodeTextFileCollectorScrapeError", - "severity": "warning", - "message": "Node Exporter text file collector failed to scrape.", - "query": "node_textfile_scrape_error{job=\"node-exporter\"} == 1" - }, - { - "name": "NodeRAIDDegraded", - "severity": "critical", - "message": "RAID Array is degraded", - "query": "node_md_disks_required - ignoring(state) (node_md_disks{state=\"active\"}) >= 1" - }, - { - "name": "NodeRAIDDiskFailure", - "severity": "warning", - "message": "Failed device in RAID array", - "query": "node_md_disks{state=\"failed\"} >= 1" - }, - { - "name": "NodeNetworkInterfaceFlapping", - "severity": "warning", - "message": "Network interface \"{{ $labels.device }}\" changing it's up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}", - "query": "changes(node_network_up{device!~\"veth.+\",job=\"node-exporter\"}[2m]) > 2" - }, - { - "name": "PrometheusOperatorListErrors", - "severity": "warning", - "message": "Errors while performing list operations in controller.", - "query": "(sum by(controller, namespace) (rate(prometheus_operator_list_operations_failed_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m])) / sum by(controller, namespace) (rate(prometheus_operator_list_operations_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m]))) > 0.4" - }, - { - "name": "PrometheusOperatorWatchErrors", - "severity": "warning", - "message": "Errors while performing watch operations in controller.", - "query": "(sum by(controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m])) / sum by(controller, namespace) (rate(prometheus_operator_watch_operations_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m]))) > 0.4" 
- }, - { - "name": "PrometheusOperatorSyncFailed", - "severity": "warning", - "message": "Last controller reconciliation failed", - "query": "min_over_time(prometheus_operator_syncs{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\",status=\"failed\"}[5m]) > 0" - }, - { - "name": "PrometheusOperatorReconcileErrors", - "severity": "warning", - "message": "Errors while reconciling controller.", - "query": "(sum by(controller, namespace) (rate(prometheus_operator_reconcile_errors_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]))) / (sum by(controller, namespace) (rate(prometheus_operator_reconcile_operations_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]))) > 0.1" - }, - { - "name": "PrometheusOperatorNodeLookupErrors", - "severity": "warning", - "message": "Errors while reconciling Prometheus.", - "query": "rate(prometheus_operator_node_address_lookup_errors_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]) > 0.1" - }, - { - "name": "PrometheusOperatorNotReady", - "severity": "warning", - "message": "Prometheus operator not ready", - "query": "min by(namespace, controller) (max_over_time(prometheus_operator_ready{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]) == 0)" - }, - { - "name": "PrometheusOperatorRejectedResources", - "severity": "warning", - "message": "Resources rejected by Prometheus operator", - "query": "min_over_time(prometheus_operator_managed_resources{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\",state=\"rejected\"}[5m]) > 0" - }, - { - "name": "PrometheusBadConfig", - "severity": "critical", - "message": "Failed Prometheus configuration reload.", - "query": "max_over_time(prometheus_config_last_reload_successful{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) == 0" - }, - { - "name": "PrometheusNotificationQueueRunningFull", - "severity": "warning", - "message": "Prometheus alert notification queue predicted to run full in less than 30m.", - "query": "(predict_linear(prometheus_notifications_queue_length{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m], 60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]))" - }, - { - "name": "PrometheusErrorSendingAlertsToSomeAlertmanagers", - "severity": "warning", - "message": "Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.", - "query": "(rate(prometheus_notifications_errors_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(prometheus_notifications_sent_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) * 100 > 1" - }, - { - "name": "PrometheusNotConnectedToAlertmanagers", - "severity": "warning", - "message": "Prometheus is not connected to any Alertmanagers.", - "query": "max_over_time(prometheus_notifications_alertmanagers_discovered{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) < 1" - }, - { - "name": "PrometheusTSDBReloadsFailing", - "severity": "warning", - "message": "Prometheus has issues reloading blocks from disk.", - "query": "increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[3h]) > 0" - }, - { - "name": "PrometheusTSDBCompactionsFailing", - "severity": "warning", - "message": 
"Prometheus has issues compacting blocks.", - "query": "increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[3h]) > 0" - }, - { - "name": "PrometheusNotIngestingSamples", - "severity": "warning", - "message": "Prometheus is not ingesting samples.", - "query": "(rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) <= 0 and (sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}) > 0 or sum without(rule_group) (prometheus_rule_group_rules{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}) > 0))" - }, - { - "name": "PrometheusDuplicateTimestamps", - "severity": "warning", - "message": "Prometheus is dropping samples with duplicate timestamps.", - "query": "rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0" - }, - { - "name": "PrometheusOutOfOrderTimestamps", - "severity": "warning", - "message": "Prometheus drops samples with out-of-order timestamps.", - "query": "rate(prometheus_target_scrapes_sample_out_of_order_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0" - }, - { - "name": "PrometheusRemoteStorageFailures", - "severity": "critical", - "message": "Prometheus fails to send samples to remote storage.", - "query": "((rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) / ((rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) + (rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])))) * 100 > 1" - }, - { - "name": "PrometheusRemoteWriteBehind", - "severity": "critical", - "message": "Prometheus remote write is behind.", - "query": "(max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) - ignoring(remote_name, url) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) > 120" - }, - { - "name": "PrometheusRemoteWriteDesiredShards", - "severity": "warning", - "message": "Prometheus remote write desired shards calculation wants to run more than configured max shards.", - "query": "(max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]))" - }, - { - "name": "PrometheusRuleFailures", - "severity": "critical", - "message": "Prometheus is failing rule evaluations.", - "query": 
"increase(prometheus_rule_evaluation_failures_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0" - }, - { - "name": "PrometheusMissingRuleEvaluations", - "severity": "warning", - "message": "Prometheus is missing rule evaluations due to slow rule group evaluation.", - "query": "increase(prometheus_rule_group_iterations_missed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0" - }, - { - "name": "PrometheusTargetLimitHit", - "severity": "warning", - "message": "Prometheus has dropped targets because some scrape configs have exceeded the targets limit.", - "query": "increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0" - }, - { - "name": "PrometheusLabelLimitHit", - "severity": "warning", - "message": "Prometheus has dropped targets because some scrape configs have exceeded the labels limit.", - "query": "increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0" - }, - { - "name": "PrometheusErrorSendingAlertsToAnyAlertmanager", - "severity": "critical", - "message": "Prometheus encounters more than 3% errors sending alerts to any Alertmanager.", - "query": "min without(alertmanager) (rate(prometheus_notifications_errors_total{alertmanager!~\"\",job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(prometheus_notifications_sent_total{alertmanager!~\"\",job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) * 100 > 3" - } -] + { + "message": "The alerting service is at risk.", + "name": "AlertingServiceAtRisk", + "query": "(ALERTS{alertname=\"AlertmanagerConfigInconsistent\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"AlertmanagerMembersInconsistent\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"AlertmanagerFailedReload\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "severity": "critical" + }, + { + "message": "The cluster is at risk.", + "name": "ClusterAtRisk", + "query": "(ALERTS{alertname=\"NodeAtRisk\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PlatformServicesAtRisk\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"VolumeAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "severity": "critical" + }, + { + "message": "The Core services are at risk.", + "name": "CoreServicesAtRisk", + "query": "(ALERTS{alertname=\"KubernetesControlPlaneAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "severity": "critical" + }, + { + "message": "The Kubernetes control plane is at risk.", + "name": "KubernetesControlPlaneAtRisk", + "query": "(ALERTS{alertname=\"KubeAPIErrorBudgetBurn\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdHighNumberOfFailedGRPCRequests\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdGRPCRequestsSlow\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdHighNumberOfFailedHTTPRequests\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdInsufficientMembers\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdMembersDown\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdNoLeader\",alertstate=\"firing\",severity=\"critical\"} or 
ALERTS{alertname=\"KubeStateMetricsListErrors\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeStateMetricsWatchErrors\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeAPIDown\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeClientCertificateExpiration\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeControllerManagerDown\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeletDown\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeSchedulerDown\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "severity": "critical" + }, + { + "message": "The monitoring service is at risk.", + "name": "MonitoringServiceAtRisk", + "query": "(ALERTS{alertname=\"PrometheusRuleFailures\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PrometheusRemoteWriteBehind\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PrometheusRemoteStorageFailures\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PrometheusErrorSendingAlertsToAnyAlertmanager\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PrometheusBadConfig\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "severity": "critical" + }, + { + "message": "The node {{ $labels.instance }} is at risk.", + "name": "NodeAtRisk", + "query": "(ALERTS{alertname=\"KubeletClientCertificateExpiration\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"NodeRAIDDegraded\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"SystemPartitionAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "severity": "critical" + }, + { + "message": "The observability services are at risk.", + "name": "ObservabilityServicesAtRisk", + "query": "(ALERTS{alertname=\"MonitoringServiceAtRisk\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"AlertingServiceAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "severity": "critical" + }, + { + "message": "The Platform services are at risk.", + "name": "PlatformServicesAtRisk", + "query": "(ALERTS{alertname=\"CoreServicesAtRisk\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"ObservabilityServicesAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "severity": "critical" + }, + { + "message": "The system partition {{ $labels.mountpoint }} on node {{ $labels.instance }} is at risk.", + "name": "SystemPartitionAtRisk", + "query": "(ALERTS{alertname=\"NodeFilesystemAlmostOutOfSpace\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"NodeFilesystemAlmostOutOfFiles\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"NodeFilesystemFilesFillingUp\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"NodeFilesystemSpaceFillingUp\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "severity": "critical" + }, + { + "message": "The volume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} on node {{ $labels.instance }} is at risk.", + "name": "VolumeAtRisk", + "query": "sum by(persistentvolumeclaim, namespace, instance) (ALERTS{alertname=\"KubePersistentVolumeFillingUp\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubePersistentVolumeErrors\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "severity": "critical" + }, + { + "message": "The Access services are degraded.", + "name": "AccessServicesDegraded", + "query": 
"(ALERTS{alertname=\"AuthenticationServiceDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"IngressControllerServicesDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The alerting service is degraded.", + "name": "AlertingServiceDegraded", + "query": "(ALERTS{alertname=\"AlertmanagerFailedReload\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeStatefulSetReplicasMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"alertmanager-prometheus-operator-alertmanager\"} or ALERTS{alertname=\"KubeStatefulSetGenerationMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"alertmanager-prometheus-operator-alertmanager\"} or ALERTS{alertname=\"KubeStatefulSetUpdateNotRolledOut\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"alertmanager-prometheus-operator-alertmanager\"}) >= 1", + "severity": "warning" + }, + { + "message": "The Authentication service for K8S API is degraded.", + "name": "AuthenticationServiceDegraded", + "query": "(ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"dex\",namespace=~\"metalk8s-auth\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"dex\",namespace=~\"metalk8s-auth\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The MetalK8s Bootstrap services are degraded.", + "name": "BootstrapServicesDegraded", + "query": "(ALERTS{alertname=\"KubePodNotReady\",alertstate=\"firing\",namespace=~\"kube-system\",pod=~\"repositories-.*\",severity=\"warning\"} or ALERTS{alertname=\"KubePodNotReady\",alertstate=\"firing\",namespace=~\"kube-system\",pod=~\"salt-master-.*\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"storage-operator\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"storage-operator\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"metalk8s-ui\",namespace=~\"metalk8s-ui\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"metalk8s-ui\",namespace=~\"metalk8s-ui\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The cluster is degraded.", + "name": "ClusterDegraded", + "query": "(ALERTS{alertname=\"NetworkDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PlatformServicesDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"VolumeDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The Core services are degraded.", + "name": "CoreServicesDegraded", + "query": "(ALERTS{alertname=\"KubernetesControlPlaneDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"BootstrapServicesDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The dashboarding service is degraded.", + "name": "DashboardingServiceDegraded", + "query": 
"(ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-grafana\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-grafana\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The Ingress Controllers for control plane and workload plane are degraded.", + "name": "IngressControllerServicesDegraded", + "query": "(ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"ingress-nginx-defaultbackend\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"ingress-nginx-defaultbackend\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetNotScheduled\",alertstate=\"firing\",daemonset=~\"ingress-nginx-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetMisScheduled\",alertstate=\"firing\",daemonset=~\"ingress-nginx-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetRolloutStuck\",alertstate=\"firing\",daemonset=~\"ingress-nginx-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetNotScheduled\",alertstate=\"firing\",daemonset=~\"ingress-nginx-control-plane-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetMisScheduled\",alertstate=\"firing\",daemonset=~\"ingress-nginx-control-plane-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetRolloutStuck\",alertstate=\"firing\",daemonset=~\"ingress-nginx-control-plane-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The Kubernetes control plane is degraded.", + "name": "KubernetesControlPlaneDegraded", + "query": "(ALERTS{alertname=\"KubeAPIErrorBudgetBurn\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedGRPCRequests\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHTTPRequestsSlow\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighCommitDurations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighFsyncDurations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedHTTPRequests\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedProposals\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfLeaderChanges\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdMemberCommunicationSlow\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeCPUOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeCPUQuotaOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeMemoryOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeMemoryQuotaOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeClientCertificateExpiration\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeClientErrors\",alertstate=\"firing\",severity=\"warning\"} or 
ALERTS{alertname=\"KubeVersionMismatch\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"coredns\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"coredns\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-adapter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-adapter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-kube-state-metrics\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-kube-state-metrics\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The logging service is degraded.", + "name": "LoggingServiceDegraded", + "query": "(ALERTS{alertname=\"KubeStatefulSetReplicasMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-logging\",severity=\"warning\",statefulset=~\"loki\"} or ALERTS{alertname=\"KubeStatefulSetGenerationMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-logging\",severity=\"warning\",statefulset=~\"loki\"} or ALERTS{alertname=\"KubeStatefulSetUpdateNotRolledOut\",alertstate=\"firing\",namespace=~\"metalk8s-logging\",severity=\"warning\",statefulset=~\"loki\"} or ALERTS{alertname=\"KubeDaemonSetNotScheduled\",alertstate=\"firing\",daemonset=~\"fluentbit\",namespace=~\"metalk8s-logging\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetMisScheduled\",alertstate=\"firing\",daemonset=~\"fluentbit\",namespace=~\"metalk8s-logging\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetRolloutStuck\",alertstate=\"firing\",daemonset=~\"fluentbit\",namespace=~\"metalk8s-logging\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The monitoring service is degraded.", + "name": "MonitoringServiceDegraded", + "query": "(ALERTS{alertname=\"PrometheusTargetLimitHit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusTSDBReloadsFailing\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusTSDBCompactionsFailing\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusRemoteWriteDesiredShards\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOutOfOrderTimestamps\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusNotificationQueueRunningFull\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusNotIngestingSamples\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusNotConnectedToAlertmanagers\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusMissingRuleEvaluations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusErrorSendingAlertsToSomeAlertmanagers\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusDuplicateTimestamps\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorWatchErrors\",alertstate=\"firing\",severity=\"warning\"} or 
ALERTS{alertname=\"PrometheusOperatorSyncFailed\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorRejectedResources\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorReconcileErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorNotReady\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorNodeLookupErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorListErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeStatefulSetReplicasMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"prometheus-prometheus-operator-prometheus\"} or ALERTS{alertname=\"KubeStatefulSetGenerationMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"prometheus-prometheus-operator-prometheus\"} or ALERTS{alertname=\"KubeStatefulSetUpdateNotRolledOut\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"prometheus-prometheus-operator-prometheus\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-operator\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-operator\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetNotScheduled\",alertstate=\"firing\",daemonset=~\"prometheus-operator-prometheus-node-exporter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetMisScheduled\",alertstate=\"firing\",daemonset=~\"prometheus-operator-prometheus-node-exporter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetRolloutStuck\",alertstate=\"firing\",daemonset=~\"prometheus-operator-prometheus-node-exporter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The network is degraded.", + "name": "NetworkDegraded", + "query": "(ALERTS{alertname=\"NodeNetworkReceiveErrs\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeHighNumberConntrackEntriesUsed\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeNetworkTransmitErrs\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeNetworkInterfaceFlapping\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The node {{ $labels.instance }} is degraded.", + "name": "NodeDegraded", + "query": "(ALERTS{alertname=\"KubeNodeNotReady\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeNodeReadinessFlapping\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeNodeUnreachable\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletClientCertificateExpiration\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletClientCertificateRenewalErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletPlegDurationHigh\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletPodStartUpLatencyHigh\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletServerCertificateExpiration\",alertstate=\"firing\",severity=\"warning\"} or 
ALERTS{alertname=\"KubeletServerCertificateRenewalErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletTooManyPods\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeClockNotSynchronising\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeClockSkewDetected\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeRAIDDiskFailure\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeTextFileCollectorScrapeError\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"SystemPartitionDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The observability services are degraded.", + "name": "ObservabilityServicesDegraded", + "query": "(ALERTS{alertname=\"MonitoringServiceDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"AlertingServiceDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"LoggingServiceDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"DashboardingServiceDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The Platform services are degraded.", + "name": "PlatformServicesDegraded", + "query": "(ALERTS{alertname=\"AccessServicesDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"CoreServicesDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"ObservabilityServicesDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The system partition {{ $labels.mountpoint }} on node {{ $labels.instance }} is degraded.", + "name": "SystemPartitionDegraded", + "query": "(ALERTS{alertname=\"NodeFilesystemAlmostOutOfSpace\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeFilesystemAlmostOutOfFiles\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeFilesystemFilesFillingUp\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeFilesystemSpaceFillingUp\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "The volume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} on node {{ $labels.instance }} is degraded.", + "name": "VolumeDegraded", + "query": "sum by(persistentvolumeclaim, namespace, instance) (ALERTS{alertname=\"KubePersistentVolumeFillingUp\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "severity": "warning" + }, + { + "message": "Half or more of the Alertmanager instances within the same cluster are crashlooping.", + "name": "AlertmanagerClusterCrashlooping", + "query": "(count by(namespace, service) (changes(process_start_time_seconds{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[10m]) > 4) / count by(namespace, service) (up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) >= 0.5", + "severity": "critical" + }, + { + "message": "Half or more of the Alertmanager instances within the same cluster are down.", + "name": "AlertmanagerClusterDown", + "query": "(count by(namespace, service) (avg_over_time(up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) < 0.5) / count by(namespace, service) (up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) >= 0.5", + "severity": "critical" + }, + { + "message": "All Alertmanager instances in a cluster 
failed to send notifications to a critical integration.", + "name": "AlertmanagerClusterFailedToSendAlerts", + "query": "min by(namespace, service, integration) (rate(alertmanager_notifications_failed_total{integration=~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{integration=~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01", + "severity": "critical" + }, + { + "message": "All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.", + "name": "AlertmanagerClusterFailedToSendAlerts", + "query": "min by(namespace, service, integration) (rate(alertmanager_notifications_failed_total{integration!~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{integration!~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01", + "severity": "warning" + }, + { + "message": "Alertmanager instances within the same cluster have different configurations.", + "name": "AlertmanagerConfigInconsistent", + "query": "count by(namespace, service) (count_values by(namespace, service) (\"config_hash\", alertmanager_config_hash{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) != 1", + "severity": "critical" + }, + { + "message": "Reloading an Alertmanager configuration has failed.", + "name": "AlertmanagerFailedReload", + "query": "max_over_time(alertmanager_config_last_reload_successful{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) == 0", + "severity": "critical" + }, + { + "message": "An Alertmanager instance failed to send notifications.", + "name": "AlertmanagerFailedToSendAlerts", + "query": "(rate(alertmanager_notifications_failed_total{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01", + "severity": "warning" + }, + { + "message": "A member of an Alertmanager cluster has not found all other cluster members.", + "name": "AlertmanagerMembersInconsistent", + "query": "max_over_time(alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) < on(namespace, service) group_left() count by(namespace, service) (max_over_time(alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]))", + "severity": "critical" + }, + { + "message": "etcd cluster \"{{ $labels.job }}\": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.", + "name": "etcdGRPCRequestsSlow", + "query": "histogram_quantile(0.99, sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type=\"unary\",job=~\".*etcd.*\"}[5m]))) > 0.15", + "severity": "critical" + }, + { + "message": "etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.", + "name": "etcdHTTPRequestsSlow", + "query": "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15", + "severity": "warning" + }, + { + "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.", + "name": "etcdHighCommitDurations", + "query": "histogram_quantile(0.99, 
rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.25", + "severity": "warning" + }, + { + "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.", + "name": "etcdHighFsyncDurations", + "query": "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.5", + "severity": "warning" + }, + { + "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.", + "name": "etcdHighNumberOfFailedGRPCRequests", + "query": "100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!=\"OK\",job=~\".*etcd.*\"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) > 5", + "severity": "critical" + }, + { + "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.", + "name": "etcdHighNumberOfFailedGRPCRequests", + "query": "100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!=\"OK\",job=~\".*etcd.*\"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) > 1", + "severity": "warning" + }, + { + "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.", + "name": "etcdHighNumberOfFailedHTTPRequests", + "query": "sum by(method) (rate(etcd_http_failed_total{code!=\"404\",job=~\".*etcd.*\"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m])) > 0.05", + "severity": "critical" + }, + { + "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + "name": "etcdHighNumberOfFailedHTTPRequests", + "query": "sum by(method) (rate(etcd_http_failed_total{code!=\"404\",job=~\".*etcd.*\"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m])) > 0.01", + "severity": "warning" + }, + { + "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.", + "name": "etcdHighNumberOfFailedProposals", + "query": "rate(etcd_server_proposals_failed_total{job=~\".*etcd.*\"}[15m]) > 5", + "severity": "warning" + }, + { + "message": "etcd cluster \"{{ $labels.job }}\": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.", + "name": "etcdHighNumberOfLeaderChanges", + "query": "rate(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}[15m]) > 3", + "severity": "warning" + }, + { + "message": "etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value }}).", + "name": "etcdInsufficientMembers", + "query": "sum by(job) (up{job=~\".*etcd.*\"} == bool 1) < ((count by(job) (up{job=~\".*etcd.*\"}) + 1) / 2)", + "severity": "critical" + }, + { + "message": "etcd cluster \"{{ $labels.job }}\": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.", + "name": "etcdMemberCommunicationSlow", + "query": "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.15", + "severity": "warning" + }, + { + "message": "etcd cluster \"{{ $labels.job }}\": member {{ $labels.instance }} has no leader.", + "name": "etcdNoLeader", + 
"query": "etcd_server_has_leader{job=~\".*etcd.*\"} == 0", + "severity": "critical" + }, + { + "message": "One or more targets are unreachable.", + "name": "TargetDown", + "query": "100 * (count by(job, namespace, service) (up == 0) / count by(job, namespace, service) (up)) > 10", + "severity": "warning" + }, + { + "message": "An alert that should always be firing to certify that Alertmanager is working properly.", + "name": "Watchdog", + "query": "vector(1)", + "severity": "none" + }, + { + "message": "The API server is burning too much error budget.", + "name": "KubeAPIErrorBudgetBurn", + "query": "sum(apiserver_request:burnrate1d) > (3 * 0.01) and sum(apiserver_request:burnrate2h) > (3 * 0.01)", + "severity": "warning" + }, + { + "message": "The API server is burning too much error budget.", + "name": "KubeAPIErrorBudgetBurn", + "query": "sum(apiserver_request:burnrate1h) > (14.4 * 0.01) and sum(apiserver_request:burnrate5m) > (14.4 * 0.01)", + "severity": "critical" + }, + { + "message": "The API server is burning too much error budget.", + "name": "KubeAPIErrorBudgetBurn", + "query": "sum(apiserver_request:burnrate3d) > (1 * 0.01) and sum(apiserver_request:burnrate6h) > (1 * 0.01)", + "severity": "warning" + }, + { + "message": "The API server is burning too much error budget.", + "name": "KubeAPIErrorBudgetBurn", + "query": "sum(apiserver_request:burnrate6h) > (6 * 0.01) and sum(apiserver_request:burnrate30m) > (6 * 0.01)", + "severity": "critical" + }, + { + "message": "kube-state-metrics is experiencing errors in list operations.", + "name": "KubeStateMetricsListErrors", + "query": "(sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) / sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"}[5m]))) > 0.01", + "severity": "critical" + }, + { + "message": "kube-state-metrics sharding is misconfigured.", + "name": "KubeStateMetricsShardingMismatch", + "query": "stdvar(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) != 0", + "severity": "critical" + }, + { + "message": "kube-state-metrics shards are missing.", + "name": "KubeStateMetricsShardsMissing", + "query": "2 ^ max(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) - 1 - sum(2 ^ max by(shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"kube-state-metrics\"})) != 0", + "severity": "critical" + }, + { + "message": "kube-state-metrics is experiencing errors in watch operations.", + "name": "KubeStateMetricsWatchErrors", + "query": "(sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) / sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"}[5m]))) > 0.01", + "severity": "critical" + }, + { + "message": "Pod container waiting longer than 1 hour", + "name": "KubeContainerWaiting", + "query": "sum by(namespace, pod, container) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\",namespace=~\".*\"}) > 0", + "severity": "warning" + }, + { + "message": "DaemonSet pods are misscheduled.", + "name": "KubeDaemonSetMisScheduled", + "query": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", + "severity": "warning" + }, + { + "message": "DaemonSet pods are not scheduled.", + "name": "KubeDaemonSetNotScheduled", + "query": "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} - kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", + "severity": "warning" + }, + { + "message": 
"DaemonSet rollout is stuck.", + "name": "KubeDaemonSetRolloutStuck", + "query": "((kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}) or (kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != 0) or (kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}) or (kube_daemonset_status_number_available{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"})) and (changes(kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}[5m]) == 0)", + "severity": "warning" + }, + { + "message": "Deployment generation mismatch due to possible roll-back", + "name": "KubeDeploymentGenerationMismatch", + "query": "kube_deployment_status_observed_generation{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_deployment_metadata_generation{job=\"kube-state-metrics\",namespace=~\".*\"}", + "severity": "warning" + }, + { + "message": "Deployment has not matched the expected number of replicas.", + "name": "KubeDeploymentReplicasMismatch", + "query": "(kube_deployment_spec_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_deployment_status_replicas_available{job=\"kube-state-metrics\",namespace=~\".*\"}) and (changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) == 0)", + "severity": "warning" + }, + { + "message": "HPA is running at max replicas", + "name": "KubeHpaMaxedOut", + "query": "kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} == kube_hpa_spec_max_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}", + "severity": "warning" + }, + { + "message": "HPA has not matched descired number of replicas.", + "name": "KubeHpaReplicasMismatch", + "query": "(kube_hpa_status_desired_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} > kube_hpa_spec_min_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} < kube_hpa_spec_max_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and changes(kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}[15m]) == 0", + "severity": "warning" + }, + { + "message": "Job did not complete in time", + "name": "KubeJobCompletion", + "query": "kube_job_spec_completions{job=\"kube-state-metrics\",namespace=~\".*\"} - kube_job_status_succeeded{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", + "severity": "warning" + }, + { + "message": "Job failed to complete.", + "name": "KubeJobFailed", + "query": "kube_job_failed{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", + "severity": "warning" + }, + { + "message": "Pod is crash looping.", + "name": "KubePodCrashLooping", + "query": "increase(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) > 0", + "severity": "warning" + }, + { + "message": "Pod has been in a non-ready state for more than 15 minutes.", + "name": "KubePodNotReady", + "query": "sum by(namespace, pod) (max by(namespace, pod) 
(kube_pod_status_phase{job=\"kube-state-metrics\",namespace=~\".*\",phase=~\"Pending|Unknown\"}) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"Job\"}))) > 0", + "severity": "warning" + }, + { + "message": "StatefulSet generation mismatch due to possible roll-back", + "name": "KubeStatefulSetGenerationMismatch", + "query": "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_metadata_generation{job=\"kube-state-metrics\",namespace=~\".*\"}", + "severity": "warning" + }, + { + "message": "Deployment has not matched the expected number of replicas.", + "name": "KubeStatefulSetReplicasMismatch", + "query": "(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) == 0)", + "severity": "warning" + }, + { + "message": "StatefulSet update has not been rolled out.", + "name": "KubeStatefulSetUpdateNotRolledOut", + "query": "(max without(revision) (kube_statefulset_status_current_revision{job=\"kube-state-metrics\",namespace=~\".*\"} unless kube_statefulset_status_update_revision{job=\"kube-state-metrics\",namespace=~\".*\"}) * (kube_statefulset_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"})) and (changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[5m]) == 0)", + "severity": "warning" + }, + { + "message": "Processes experience elevated CPU throttling.", + "name": "CPUThrottlingHigh", + "query": "sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=\"\"}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > (25 / 100)", + "severity": "info" + }, + { + "message": "Cluster has overcommitted CPU resource requests.", + "name": "KubeCPUOvercommit", + "query": "sum(namespace_cpu:kube_pod_container_resource_requests:sum) / sum(kube_node_status_allocatable{resource=\"cpu\"}) > ((count(kube_node_status_allocatable{resource=\"cpu\"}) > 1) - 1) / count(kube_node_status_allocatable{resource=\"cpu\"})", + "severity": "warning" + }, + { + "message": "Cluster has overcommitted CPU resource requests.", + "name": "KubeCPUQuotaOvercommit", + "query": "sum(kube_resourcequota{job=\"kube-state-metrics\",resource=\"cpu\",type=\"hard\"}) / sum(kube_node_status_allocatable{resource=\"cpu\"}) > 1.5", + "severity": "warning" + }, + { + "message": "Cluster has overcommitted memory resource requests.", + "name": "KubeMemoryOvercommit", + "query": "sum(namespace_memory:kube_pod_container_resource_requests:sum) / sum(kube_node_status_allocatable{resource=\"memory\"}) > ((count(kube_node_status_allocatable{resource=\"memory\"}) > 1) - 1) / count(kube_node_status_allocatable{resource=\"memory\"})", + "severity": "warning" + }, + { + "message": "Cluster has overcommitted memory resource requests.", + "name": "KubeMemoryQuotaOvercommit", + "query": "sum(kube_resourcequota{job=\"kube-state-metrics\",resource=\"memory\",type=\"hard\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"memory\"}) > 1.5", + "severity": "warning" + }, + { + "message": "Namespace quota is going to be full.", + "name": "KubeQuotaAlmostFull", + "query": 
"kube_resourcequota{job=\"kube-state-metrics\",type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\",type=\"hard\"} > 0) > 0.9 < 1", + "severity": "info" + }, + { + "message": "Namespace quota has exceeded the limits.", + "name": "KubeQuotaExceeded", + "query": "kube_resourcequota{job=\"kube-state-metrics\",type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\",type=\"hard\"} > 0) > 1", + "severity": "warning" + }, + { + "message": "Namespace quota is fully used.", + "name": "KubeQuotaFullyUsed", + "query": "kube_resourcequota{job=\"kube-state-metrics\",type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\",type=\"hard\"} > 0) == 1", + "severity": "info" + }, + { + "message": "PersistentVolume is having issues with provisioning.", + "name": "KubePersistentVolumeErrors", + "query": "kube_persistentvolume_status_phase{job=\"kube-state-metrics\",phase=~\"Failed|Pending\"} > 0", + "severity": "critical" + }, + { + "message": "PersistentVolume is filling up.", + "name": "KubePersistentVolumeFillingUp", + "query": "kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} < 0.03", + "severity": "critical" + }, + { + "message": "PersistentVolume is filling up.", + "name": "KubePersistentVolumeFillingUp", + "query": "(kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"}) < 0.15 and predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"}[6h], 4 * 24 * 3600) < 0", + "severity": "warning" + }, + { + "message": "An aggregated API is down.", + "name": "AggregatedAPIDown", + "query": "(1 - max by(name, namespace) (avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85", + "severity": "warning" + }, + { + "message": "An aggregated API has reported errors.", + "name": "AggregatedAPIErrors", + "query": "sum by(name, namespace) (increase(aggregator_unavailable_apiservice_total[10m])) > 4", + "severity": "warning" + }, + { + "message": "Target disappeared from Prometheus target discovery.", + "name": "KubeAPIDown", + "query": "absent(up{job=\"apiserver\"} == 1)", + "severity": "critical" + }, + { + "message": "The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.", + "name": "KubeAPITerminatedRequests", + "query": "sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m])) / (sum(rate(apiserver_request_total{job=\"apiserver\"}[10m])) + sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m]))) > 0.2", + "severity": "warning" + }, + { + "message": "Client certificate is about to expire.", + "name": "KubeClientCertificateExpiration", + "query": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 86400", + "severity": "critical" + }, + { + "message": "Client certificate is about to expire.", + "name": "KubeClientCertificateExpiration", + "query": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) 
(rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800", + "severity": "warning" + }, + { + "message": "Target disappeared from Prometheus target discovery.", + "name": "KubeControllerManagerDown", + "query": "absent(up{job=\"kube-controller-manager\"} == 1)", + "severity": "critical" + }, + { + "message": "Node is not ready.", + "name": "KubeNodeNotReady", + "query": "kube_node_status_condition{condition=\"Ready\",job=\"kube-state-metrics\",status=\"true\"} == 0", + "severity": "warning" + }, + { + "message": "Node readiness status is flapping.", + "name": "KubeNodeReadinessFlapping", + "query": "sum by(node) (changes(kube_node_status_condition{condition=\"Ready\",status=\"true\"}[15m])) > 2", + "severity": "warning" + }, + { + "message": "Node is unreachable.", + "name": "KubeNodeUnreachable", + "query": "(kube_node_spec_taint{effect=\"NoSchedule\",job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\"} unless ignoring(key, value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1", + "severity": "warning" + }, + { + "message": "Kubelet client certificate is about to expire.", + "name": "KubeletClientCertificateExpiration", + "query": "kubelet_certificate_manager_client_ttl_seconds < 86400", + "severity": "critical" + }, + { + "message": "Kubelet client certificate is about to expire.", + "name": "KubeletClientCertificateExpiration", + "query": "kubelet_certificate_manager_client_ttl_seconds < 604800", + "severity": "warning" + }, + { + "message": "Kubelet has failed to renew its client certificate.", + "name": "KubeletClientCertificateRenewalErrors", + "query": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0", + "severity": "warning" + }, + { + "message": "Target disappeared from Prometheus target discovery.", + "name": "KubeletDown", + "query": "absent(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1)", + "severity": "critical" + }, + { + "message": "Kubelet Pod Lifecycle Event Generator is taking too long to relist.", + "name": "KubeletPlegDurationHigh", + "query": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} >= 10", + "severity": "warning" + }, + { + "message": "Kubelet Pod startup latency is too high.", + "name": "KubeletPodStartUpLatencyHigh", + "query": "histogram_quantile(0.99, sum by(instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\",metrics_path=\"/metrics\"}[5m]))) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\",metrics_path=\"/metrics\"} > 60", + "severity": "warning" + }, + { + "message": "Kubelet server certificate is about to expire.", + "name": "KubeletServerCertificateExpiration", + "query": "kubelet_certificate_manager_server_ttl_seconds < 86400", + "severity": "critical" + }, + { + "message": "Kubelet server certificate is about to expire.", + "name": "KubeletServerCertificateExpiration", + "query": "kubelet_certificate_manager_server_ttl_seconds < 604800", + "severity": "warning" + }, + { + "message": "Kubelet has failed to renew its server certificate.", + "name": "KubeletServerCertificateRenewalErrors", + "query": "increase(kubelet_server_expiration_renew_errors[5m]) > 0", + "severity": "warning" + }, + { + "message": "Kubelet is running at capacity.", + "name": "KubeletTooManyPods", + "query": "count by(node) 
((kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance, pod, namespace, cluster) group_left(node) topk by(instance, pod, namespace, cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})) / max by(node) (kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1) > 0.95", + "severity": "warning" + }, + { + "message": "Target disappeared from Prometheus target discovery.", + "name": "KubeSchedulerDown", + "query": "absent(up{job=\"kube-scheduler\"} == 1)", + "severity": "critical" + }, + { + "message": "Kubernetes API server client is experiencing errors.", + "name": "KubeClientErrors", + "query": "(sum by(instance, job) (rate(rest_client_requests_total{code=~\"5..\"}[5m])) / sum by(instance, job) (rate(rest_client_requests_total[5m]))) > 0.01", + "severity": "warning" + }, + { + "message": "Different semantic versions of Kubernetes components running.", + "name": "KubeVersionMismatch", + "query": "count(count by(git_version) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"}, \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", + "severity": "warning" + }, + { + "message": "Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.", + "name": "NodeClockNotSynchronising", + "query": "min_over_time(node_timex_sync_status[5m]) == 0 and node_timex_maxerror_seconds >= 16", + "severity": "warning" + }, + { + "message": "Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.", + "name": "NodeClockSkewDetected", + "query": "(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)", + "severity": "warning" + }, + { + "message": "Filesystem has less than 8% inodes left.", + "name": "NodeFilesystemAlmostOutOfFiles", + "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 8 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", + "severity": "critical" + }, + { + "message": "Filesystem has less than 15% inodes left.", + "name": "NodeFilesystemAlmostOutOfFiles", + "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 15 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", + "severity": "warning" + }, + { + "message": "Filesystem has less than 12% space left.", + "name": "NodeFilesystemAlmostOutOfSpace", + "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 12 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", + "severity": "critical" + }, + { + "message": "Filesystem has less than 20% space left.", + "name": "NodeFilesystemAlmostOutOfSpace", + "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", + "severity": "warning" + }, + { + "message": "Filesystem is predicted to run out of inodes within the next 4 hours.", + "name": "NodeFilesystemFilesFillingUp", + "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and 
predict_linear(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", + "severity": "critical" + }, + { + "message": "Filesystem is predicted to run out of inodes within the next 24 hours.", + "name": "NodeFilesystemFilesFillingUp", + "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 40 and predict_linear(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", + "severity": "warning" + }, + { + "message": "Filesystem is predicted to run out of space within the next 4 hours.", + "name": "NodeFilesystemSpaceFillingUp", + "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and predict_linear(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", + "severity": "critical" + }, + { + "message": "Filesystem is predicted to run out of space within the next 24 hours.", + "name": "NodeFilesystemSpaceFillingUp", + "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 40 and predict_linear(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", + "severity": "warning" + }, + { + "message": "Number of conntrack entries are getting close to the limit", + "name": "NodeHighNumberConntrackEntriesUsed", + "query": "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75", + "severity": "warning" + }, + { + "message": "Network interface is reporting many receive errors.", + "name": "NodeNetworkReceiveErrs", + "query": "increase(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01", + "severity": "warning" + }, + { + "message": "Network interface is reporting many transmit errors.", + "name": "NodeNetworkTransmitErrs", + "query": "increase(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01", + "severity": "warning" + }, + { + "message": "RAID Array is degraded", + "name": "NodeRAIDDegraded", + "query": "node_md_disks_required - ignoring(state) (node_md_disks{state=\"active\"}) >= 1", + "severity": "critical" + }, + { + "message": "Failed device in RAID array", + "name": "NodeRAIDDiskFailure", + "query": "node_md_disks{state=\"failed\"} >= 1", + "severity": "warning" + }, + { + "message": "Node Exporter text file collector failed to scrape.", + "name": "NodeTextFileCollectorScrapeError", + "query": "node_textfile_scrape_error{job=\"node-exporter\"} == 1", + "severity": "warning" + }, + { + "message": "Network interface \"{{ $labels.device }}\" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}", + "name": "NodeNetworkInterfaceFlapping", + "query": "changes(node_network_up{device!~\"veth.+\",job=\"node-exporter\"}[2m]) > 2", + "severity": "warning" + }, + { + "message": "Errors while performing list operations in controller.", + "name": "PrometheusOperatorListErrors", + "query": "(sum by(controller, namespace)
(rate(prometheus_operator_list_operations_failed_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m])) / sum by(controller, namespace) (rate(prometheus_operator_list_operations_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m]))) > 0.4", + "severity": "warning" + }, + { + "message": "Errors while reconciling Prometheus.", + "name": "PrometheusOperatorNodeLookupErrors", + "query": "rate(prometheus_operator_node_address_lookup_errors_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]) > 0.1", + "severity": "warning" + }, + { + "message": "Prometheus operator not ready", + "name": "PrometheusOperatorNotReady", + "query": "min by(namespace, controller) (max_over_time(prometheus_operator_ready{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]) == 0)", + "severity": "warning" + }, + { + "message": "Errors while reconciling controller.", + "name": "PrometheusOperatorReconcileErrors", + "query": "(sum by(controller, namespace) (rate(prometheus_operator_reconcile_errors_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]))) / (sum by(controller, namespace) (rate(prometheus_operator_reconcile_operations_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]))) > 0.1", + "severity": "warning" + }, + { + "message": "Resources rejected by Prometheus operator", + "name": "PrometheusOperatorRejectedResources", + "query": "min_over_time(prometheus_operator_managed_resources{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\",state=\"rejected\"}[5m]) > 0", + "severity": "warning" + }, + { + "message": "Last controller reconciliation failed", + "name": "PrometheusOperatorSyncFailed", + "query": "min_over_time(prometheus_operator_syncs{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\",status=\"failed\"}[5m]) > 0", + "severity": "warning" + }, + { + "message": "Errors while performing watch operations in controller.", + "name": "PrometheusOperatorWatchErrors", + "query": "(sum by(controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m])) / sum by(controller, namespace) (rate(prometheus_operator_watch_operations_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m]))) > 0.4", + "severity": "warning" + }, + { + "message": "Failed Prometheus configuration reload.", + "name": "PrometheusBadConfig", + "query": "max_over_time(prometheus_config_last_reload_successful{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) == 0", + "severity": "critical" + }, + { + "message": "Prometheus is dropping samples with duplicate timestamps.", + "name": "PrometheusDuplicateTimestamps", + "query": "rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", + "severity": "warning" + }, + { + "message": "Prometheus encounters more than 3% errors sending alerts to any Alertmanager.", + "name": "PrometheusErrorSendingAlertsToAnyAlertmanager", + "query": "min without(alertmanager) (rate(prometheus_notifications_errors_total{alertmanager!~\"\",job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(prometheus_notifications_sent_total{alertmanager!~\"\",job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) * 100 > 3", + 
"severity": "critical" + }, + { + "message": "Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.", + "name": "PrometheusErrorSendingAlertsToSomeAlertmanagers", + "query": "(rate(prometheus_notifications_errors_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(prometheus_notifications_sent_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) * 100 > 1", + "severity": "warning" + }, + { + "message": "Prometheus has dropped targets because some scrape configs have exceeded the labels limit.", + "name": "PrometheusLabelLimitHit", + "query": "increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", + "severity": "warning" + }, + { + "message": "Prometheus is missing rule evaluations due to slow rule group evaluation.", + "name": "PrometheusMissingRuleEvaluations", + "query": "increase(prometheus_rule_group_iterations_missed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", + "severity": "warning" + }, + { + "message": "Prometheus is not connected to any Alertmanagers.", + "name": "PrometheusNotConnectedToAlertmanagers", + "query": "max_over_time(prometheus_notifications_alertmanagers_discovered{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) < 1", + "severity": "warning" + }, + { + "message": "Prometheus is not ingesting samples.", + "name": "PrometheusNotIngestingSamples", + "query": "(rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) <= 0 and (sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}) > 0 or sum without(rule_group) (prometheus_rule_group_rules{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}) > 0))", + "severity": "warning" + }, + { + "message": "Prometheus alert notification queue predicted to run full in less than 30m.", + "name": "PrometheusNotificationQueueRunningFull", + "query": "(predict_linear(prometheus_notifications_queue_length{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m], 60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]))", + "severity": "warning" + }, + { + "message": "Prometheus drops samples with out-of-order timestamps.", + "name": "PrometheusOutOfOrderTimestamps", + "query": "rate(prometheus_target_scrapes_sample_out_of_order_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", + "severity": "warning" + }, + { + "message": "Prometheus fails to send samples to remote storage.", + "name": "PrometheusRemoteStorageFailures", + "query": "((rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) / ((rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) + 
(rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])))) * 100 > 1", + "severity": "critical" + }, + { + "message": "Prometheus remote write is behind.", + "name": "PrometheusRemoteWriteBehind", + "query": "(max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) - ignoring(remote_name, url) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) > 120", + "severity": "critical" + }, + { + "message": "Prometheus remote write desired shards calculation wants to run more than configured max shards.", + "name": "PrometheusRemoteWriteDesiredShards", + "query": "(max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]))", + "severity": "warning" + }, + { + "message": "Prometheus is failing rule evaluations.", + "name": "PrometheusRuleFailures", + "query": "increase(prometheus_rule_evaluation_failures_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", + "severity": "critical" + }, + { + "message": "Prometheus has issues compacting blocks.", + "name": "PrometheusTSDBCompactionsFailing", + "query": "increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[3h]) > 0", + "severity": "warning" + }, + { + "message": "Prometheus has issues reloading blocks from disk.", + "name": "PrometheusTSDBReloadsFailing", + "query": "increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[3h]) > 0", + "severity": "warning" + }, + { + "message": "Prometheus has dropped targets because some scrape configs have exceeded the targets limit.", + "name": "PrometheusTargetLimitHit", + "query": "increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", + "severity": "warning" + } +] \ No newline at end of file diff --git a/tools/rule_extractor/rules.json b/tools/rule_extractor/rules.json index 52bc2e3a9d..eac6a7b0b1 100644 --- a/tools/rule_extractor/rules.json +++ b/tools/rule_extractor/rules.json @@ -2,66 +2,673 @@ "data": { "groups": [ { - "evaluationTime": 0.001724433, - "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-alertmanager.rules.yaml", + "evaluationTime": 0.001903288, + "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-alert-tree.rules.yaml", "interval": 30, - "lastEvaluation": "2021-08-05T08:29:30.310652056Z", - "name": "alertmanager.rules", + "lastEvaluation": "2021-10-21T08:32:24.386911896Z", + "name": "cluster-at-risk.rules", + "partialResponseStrategy": "ABORT", "rules": [ { "alerts": [], "annotations": { - "description": "Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.", - "runbook_url": 
"https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerfailedreload", - "summary": "Reloading an Alertmanager configuration has failed." + "children": "AlertmanagerConfigInconsistent{severity='critical'}, AlertmanagerMembersInconsistent{severity='critical'}, AlertmanagerFailedReload{severity='critical'}", + "summary": "The alerting service is at risk." }, - "duration": 600, - "evaluationTime": 0.000325479, + "duration": 60, + "evaluationTime": 0.000131079, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "2021-08-05T08:29:30.310657753Z", - "name": "AlertmanagerFailedReload", - "query": "max_over_time(alertmanager_config_last_reload_successful{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) == 0", + "lastEvaluation": "2021-10-21T08:32:24.388575249Z", + "name": "AlertingServiceAtRisk", + "query": "(ALERTS{alertname=\"AlertmanagerConfigInconsistent\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"AlertmanagerMembersInconsistent\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"AlertmanagerFailedReload\",alertstate=\"firing\",severity=\"critical\"}) >= 1", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagermembersinconsistent", - "summary": "A member of an Alertmanager cluster has not found all other cluster members." + "children": "NodeAtRisk{severity='critical'}, PlatformServicesAtRisk{severity='critical'}, VolumeAtRisk{severity='critical'}", + "summary": "The cluster is at risk." }, - "duration": 600, - "evaluationTime": 0.000241523, + "duration": 60, + "evaluationTime": 0.000289237, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "2021-08-05T08:29:30.310984822Z", - "name": "AlertmanagerMembersInconsistent", - "query": "max_over_time(alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) < on(namespace, service) group_left() count by(namespace, service) (max_over_time(alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]))", + "lastEvaluation": "2021-10-21T08:32:24.38691813Z", + "name": "ClusterAtRisk", + "query": "(ALERTS{alertname=\"NodeAtRisk\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PlatformServicesAtRisk\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"VolumeAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerfailedtosendalerts", - "summary": "An Alertmanager instance failed to send notifications." + "children": "KubernetesControlPlaneAtRisk{severity='critical'}", + "summary": "The Core services are at risk." 
}, - "duration": 300, - "evaluationTime": 0.000139383, + "duration": 60, + "evaluationTime": 8.1825e-05, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" + }, + "lastEvaluation": "2021-10-21T08:32:24.387588746Z", + "name": "CoreServicesAtRisk", + "query": "(ALERTS{alertname=\"KubernetesControlPlaneAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "KubeAPIErrorBudgetBurn{severity='critical'}, etcdHighNumberOfFailedGRPCRequests{severity='critical'}, etcdGRPCRequestsSlow{severity='critical'}, etcdHighNumberOfFailedHTTPRequests{severity='critical'}, etcdInsufficientMembers{severity='critical'}, etcdMembersDown{severity='critical'}, etcdNoLeader{severity='critical'}, KubeStateMetricsListErrors{severity='critical'}, KubeStateMetricsWatchErrors{severity='critical'}, KubeAPIDown{severity='critical'}, KubeClientCertificateExpiration{severity='critical'}, KubeControllerManagerDown{severity='critical'}, KubeletDown{severity='critical'}, KubeSchedulerDown{severity='critical'}", + "summary": "The Kubernetes control plane is at risk." + }, + "duration": 60, + "evaluationTime": 0.000612155, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" + }, + "lastEvaluation": "2021-10-21T08:32:24.387671174Z", + "name": "KubernetesControlPlaneAtRisk", + "query": "(ALERTS{alertname=\"KubeAPIErrorBudgetBurn\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdHighNumberOfFailedGRPCRequests\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdGRPCRequestsSlow\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdHighNumberOfFailedHTTPRequests\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdInsufficientMembers\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdMembersDown\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"etcdNoLeader\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeStateMetricsListErrors\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeStateMetricsWatchErrors\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeAPIDown\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeClientCertificateExpiration\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeControllerManagerDown\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeletDown\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubeSchedulerDown\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "PrometheusRuleFailures{severity='critical'}, PrometheusRemoteWriteBehind{severity='critical'}, PrometheusRemoteStorageFailures{severity='critical'}, PrometheusErrorSendingAlertsToAnyAlertmanager{severity='critical'}, PrometheusBadConfig{severity='critical'}", + "summary": "The monitoring service is at risk." 
+ }, + "duration": 60, + "evaluationTime": 0.000173222, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" + }, + "lastEvaluation": "2021-10-21T08:32:24.388401392Z", + "name": "MonitoringServiceAtRisk", + "query": "(ALERTS{alertname=\"PrometheusRuleFailures\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PrometheusRemoteWriteBehind\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PrometheusRemoteStorageFailures\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PrometheusErrorSendingAlertsToAnyAlertmanager\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"PrometheusBadConfig\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "KubeletClientCertificateExpiration{severity='critical'}, NodeRAIDDegraded{severity='critical'}, SystemPartitionAtRisk{severity='critical'}", + "summary": "The node {{ $labels.instance }} is at risk." + }, + "duration": 60, + "evaluationTime": 0.000145598, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" + }, + "lastEvaluation": "2021-10-21T08:32:24.38720863Z", + "name": "NodeAtRisk", + "query": "(ALERTS{alertname=\"KubeletClientCertificateExpiration\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"NodeRAIDDegraded\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"SystemPartitionAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "MonitoringServiceAtRisk{severity='critical'}, AlertingServiceAtRisk{severity='critical'}", + "summary": "The observability services are at risk." + }, + "duration": 60, + "evaluationTime": 0.000115421, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" + }, + "lastEvaluation": "2021-10-21T08:32:24.388285282Z", + "name": "ObservabilityServicesAtRisk", + "query": "(ALERTS{alertname=\"MonitoringServiceAtRisk\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"AlertingServiceAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "CoreServicesAtRisk{severity='critical'}, ObservabilityServicesAtRisk{severity='critical'}", + "summary": "The Platform services are at risk." + }, + "duration": 60, + "evaluationTime": 8.6209e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" + }, + "lastEvaluation": "2021-10-21T08:32:24.38750192Z", + "name": "PlatformServicesAtRisk", + "query": "(ALERTS{alertname=\"CoreServicesAtRisk\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"ObservabilityServicesAtRisk\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "NodeFilesystemAlmostOutOfSpace{severity='critical'}, NodeFilesystemAlmostOutOfFiles{severity='critical'}, NodeFilesystemFilesFillingUp{severity='critical'}, NodeFilesystemSpaceFillingUp{severity='critical'}", + "summary": "The system partition {{ $labels.mountpoint }} on node {{ $labels.instance }} is at risk." 
+ }, + "duration": 60, + "evaluationTime": 0.000146384, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" + }, + "lastEvaluation": "2021-10-21T08:32:24.387354978Z", + "name": "SystemPartitionAtRisk", + "query": "(ALERTS{alertname=\"NodeFilesystemAlmostOutOfSpace\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"NodeFilesystemAlmostOutOfFiles\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"NodeFilesystemFilesFillingUp\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"NodeFilesystemSpaceFillingUp\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "KubePersistentVolumeFillingUp{severity='critical'}, KubePersistentVolumeErrors{severity='critical'}", + "summary": "The volume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} on node {{ $labels.instance }} is at risk." + }, + "duration": 60, + "evaluationTime": 0.000105942, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" + }, + "lastEvaluation": "2021-10-21T08:32:24.388706983Z", + "name": "VolumeAtRisk", + "query": "sum by(persistentvolumeclaim, namespace, instance) (ALERTS{alertname=\"KubePersistentVolumeFillingUp\",alertstate=\"firing\",severity=\"critical\"} or ALERTS{alertname=\"KubePersistentVolumeErrors\",alertstate=\"firing\",severity=\"critical\"}) >= 1", + "state": "inactive", + "type": "alerting" + } + ] + }, + { + "evaluationTime": 0.006524291, + "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-alert-tree.rules.yaml", + "interval": 30, + "lastEvaluation": "2021-10-21T08:32:42.909738729Z", + "name": "cluster-degraded.rules", + "partialResponseStrategy": "ABORT", + "rules": [ + { + "alerts": [], + "annotations": { + "children": "AuthenticationServiceDegraded{severity='warning'}, IngressControllerServicesDegraded{severity='warning'}", + "summary": "The Access services are degraded." 
+ }, + "duration": 60, + "evaluationTime": 8.2256e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:30.31122765Z", - "name": "AlertmanagerFailedToSendAlerts", - "query": "(rate(alertmanager_notifications_failed_total{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01", + "lastEvaluation": "2021-10-21T08:32:42.91197082Z", + "name": "AccessServicesDegraded", + "query": "(ALERTS{alertname=\"AuthenticationServiceDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"IngressControllerServicesDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "AlertmanagerFailedReload{severity='warning'}, KubeStatefulSetReplicasMismatch{namespace=~'metalk8s-monitoring', severity='warning', statefulset=~'alertmanager-prometheus-operator-alertmanager'}, KubeStatefulSetGenerationMismatch{namespace=~'metalk8s-monitoring', severity='warning', statefulset=~'alertmanager-prometheus-operator-alertmanager'}, KubeStatefulSetUpdateNotRolledOut{namespace=~'metalk8s-monitoring', severity='warning', statefulset=~'alertmanager-prometheus-operator-alertmanager'}", + "summary": "The alerting service is degraded." + }, + "duration": 60, + "evaluationTime": 0.000310153, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.915386132Z", + "name": "AlertingServiceDegraded", + "query": "(ALERTS{alertname=\"AlertmanagerFailedReload\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeStatefulSetReplicasMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"alertmanager-prometheus-operator-alertmanager\"} or ALERTS{alertname=\"KubeStatefulSetGenerationMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"alertmanager-prometheus-operator-alertmanager\"} or ALERTS{alertname=\"KubeStatefulSetUpdateNotRolledOut\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"alertmanager-prometheus-operator-alertmanager\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "KubeDeploymentReplicasMismatch{deployment=~'dex', namespace=~'metalk8s-auth', severity='warning'}, KubeDeploymentGenerationMismatch{deployment=~'dex', namespace=~'metalk8s-auth', severity='warning'}", + "summary": "The Authentication service for K8S API is degraded." 
+ }, + "duration": 60, + "evaluationTime": 0.000197136, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.912053631Z", + "name": "AuthenticationServiceDegraded", + "query": "(ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"dex\",namespace=~\"metalk8s-auth\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"dex\",namespace=~\"metalk8s-auth\",severity=\"warning\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "KubePodNotReady{namespace=~'kube-system', pod=~'repositories-.*', severity='warning'}, KubePodNotReady{namespace=~'kube-system', pod=~'salt-master-.*', severity='warning'}, KubeDeploymentReplicasMismatch{deployment=~'storage-operator', namespace=~'kube-system', severity='warning'}, KubeDeploymentGenerationMismatch{deployment=~'storage-operator', namespace=~'kube-system', severity='warning'}, KubeDeploymentReplicasMismatch{deployment=~'metalk8s-ui', namespace=~'metalk8s-ui', severity='warning'}, KubeDeploymentGenerationMismatch{deployment=~'metalk8s-ui', namespace=~'metalk8s-ui', severity='warning'}", + "summary": "The MetalK8s Bootstrap services are degraded." + }, + "duration": 60, + "evaluationTime": 0.000372059, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.913723347Z", + "name": "BootstrapServicesDegraded", + "query": "(ALERTS{alertname=\"KubePodNotReady\",alertstate=\"firing\",namespace=~\"kube-system\",pod=~\"repositories-.*\",severity=\"warning\"} or ALERTS{alertname=\"KubePodNotReady\",alertstate=\"firing\",namespace=~\"kube-system\",pod=~\"salt-master-.*\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"storage-operator\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"storage-operator\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"metalk8s-ui\",namespace=~\"metalk8s-ui\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"metalk8s-ui\",namespace=~\"metalk8s-ui\",severity=\"warning\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [ + { + "activeAt": "2021-10-21T08:02:42.908069235Z", + "annotations": { + "children": "NetworkDegraded{severity='warning'}, NodeDegraded{severity='warning'}, PlatformServicesDegraded{severity='warning'}, VolumeDegraded{severity='warning'}", + "summary": "The cluster is degraded." 
+ }, + "labels": { + "alertname": "ClusterDegraded", + "alertstate": "firing", + "container": "node-exporter", + "endpoint": "metrics", + "instance": "192.168.1.100:9100", + "job": "node-exporter", + "namespace": "metalk8s-monitoring", + "pod": "prometheus-operator-prometheus-node-exporter-2krc7", + "service": "prometheus-operator-prometheus-node-exporter", + "severity": "warning" + }, + "partialResponseStrategy": "WARN", + "state": "firing", + "value": "1e+00" + }, + { + "activeAt": "2021-10-21T08:02:42.908069235Z", + "annotations": { + "children": "NetworkDegraded{severity='warning'}, NodeDegraded{severity='warning'}, PlatformServicesDegraded{severity='warning'}, VolumeDegraded{severity='warning'}", + "summary": "The cluster is degraded." + }, + "labels": { + "alertname": "ClusterDegraded", + "alertstate": "firing", + "container": "node-exporter", + "endpoint": "metrics", + "instance": "192.168.1.101:9100", + "job": "node-exporter", + "namespace": "metalk8s-monitoring", + "pod": "prometheus-operator-prometheus-node-exporter-hxvmn", + "service": "prometheus-operator-prometheus-node-exporter", + "severity": "warning" + }, + "partialResponseStrategy": "WARN", + "state": "firing", + "value": "1e+00" + } + ], + "annotations": { + "children": "NetworkDegraded{severity='warning'}, NodeDegraded{severity='warning'}, PlatformServicesDegraded{severity='warning'}, VolumeDegraded{severity='warning'}", + "summary": "The cluster is degraded." + }, + "duration": 60, + "evaluationTime": 0.000697751, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.909744506Z", + "name": "ClusterDegraded", + "query": "(ALERTS{alertname=\"NetworkDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PlatformServicesDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"VolumeDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "state": "firing", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "KubernetesControlPlaneDegraded{severity='warning'}, BootstrapServicesDegraded{severity='warning'}", + "summary": "The Core services are degraded." + }, + "duration": 60, + "evaluationTime": 8.494e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.912810496Z", + "name": "CoreServicesDegraded", + "query": "(ALERTS{alertname=\"KubernetesControlPlaneDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"BootstrapServicesDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "KubeDeploymentReplicasMismatch{deployment=~'prometheus-operator-grafana', namespace=~'metalk8s-monitoring', severity='warning'}, KubeDeploymentGenerationMismatch{deployment=~'prometheus-operator-grafana', namespace=~'metalk8s-monitoring', severity='warning'}", + "summary": "The dashboarding service is degraded." 
+ }, + "duration": 60, + "evaluationTime": 0.000163879, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.916025494Z", + "name": "DashboardingServiceDegraded", + "query": "(ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-grafana\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-grafana\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "KubeDeploymentReplicasMismatch{deployment=~'ingress-nginx-defaultbackend', namespace=~'metalk8s-ingress', severity='warning'}, KubeDeploymentGenerationMismatch{deployment=~'ingress-nginx-defaultbackend', namespace=~'metalk8s-ingress', severity='warning'}, KubeDaemonSetNotScheduled{daemonset=~'ingress-nginx-controller', namespace=~'metalk8s-ingress', severity='warning'}, KubeDaemonSetMisScheduled{daemonset=~'ingress-nginx-controller', namespace=~'metalk8s-ingress', severity='warning'}, KubeDaemonSetRolloutStuck{daemonset=~'ingress-nginx-controller', namespace=~'metalk8s-ingress', severity='warning'}, KubeDaemonSetNotScheduled{daemonset=~'ingress-nginx-control-plane-controller', namespace=~'metalk8s-ingress', severity='warning'}, KubeDaemonSetMisScheduled{daemonset=~'ingress-nginx-control-plane-controller', namespace=~'metalk8s-ingress', severity='warning'}, KubeDaemonSetRolloutStuck{daemonset=~'ingress-nginx-control-plane-controller', namespace=~'metalk8s-ingress', severity='warning'}", + "summary": "The Ingress Controllers for control plane and workload plane are degraded." 
+ }, + "duration": 60, + "evaluationTime": 0.000558162, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.912251407Z", + "name": "IngressControllerServicesDegraded", + "query": "(ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"ingress-nginx-defaultbackend\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"ingress-nginx-defaultbackend\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetNotScheduled\",alertstate=\"firing\",daemonset=~\"ingress-nginx-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetMisScheduled\",alertstate=\"firing\",daemonset=~\"ingress-nginx-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetRolloutStuck\",alertstate=\"firing\",daemonset=~\"ingress-nginx-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetNotScheduled\",alertstate=\"firing\",daemonset=~\"ingress-nginx-control-plane-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetMisScheduled\",alertstate=\"firing\",daemonset=~\"ingress-nginx-control-plane-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetRolloutStuck\",alertstate=\"firing\",daemonset=~\"ingress-nginx-control-plane-controller\",namespace=~\"metalk8s-ingress\",severity=\"warning\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "KubeAPIErrorBudgetBurn{severity='warning'}, etcdHighNumberOfFailedGRPCRequests{severity='warning'}, etcdHTTPRequestsSlow{severity='warning'}, etcdHighCommitDurations{severity='warning'}, etcdHighFsyncDurations{severity='warning'}, etcdHighNumberOfFailedHTTPRequests{severity='warning'}, etcdHighNumberOfFailedProposals{severity='warning'}, etcdHighNumberOfLeaderChanges{severity='warning'}, etcdMemberCommunicationSlow{severity='warning'}, KubeCPUOvercommit{severity='warning'}, KubeCPUQuotaOvercommit{severity='warning'}, KubeMemoryOvercommit{severity='warning'}, KubeMemoryQuotaOvercommit{severity='warning'}, KubeClientCertificateExpiration{severity='warning'}, KubeClientErrors{severity='warning'}, KubeVersionMismatch{severity='warning'}, KubeDeploymentReplicasMismatch{deployment=~'coredns', namespace=~'kube-system', severity='warning'}, KubeDeploymentGenerationMismatch{deployment=~'coredns', namespace=~'kube-system', severity='warning'}, KubeDeploymentReplicasMismatch{deployment=~'prometheus-adapter', namespace=~'metalk8s-monitoring', severity='warning'}, KubeDeploymentGenerationMismatch{deployment=~'prometheus-adapter', namespace=~'metalk8s-monitoring', severity='warning'}, KubeDeploymentReplicasMismatch{deployment=~'prometheus-operator-kube-state-metrics', namespace=~'metalk8s-monitoring', severity='warning'}, KubeDeploymentGenerationMismatch{deployment=~'prometheus-operator-kube-state-metrics', namespace=~'metalk8s-monitoring', severity='warning'}", + "summary": "The Kubernetes control plane is degraded." 
+ }, + "duration": 60, + "evaluationTime": 0.000826208, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.912896118Z", + "name": "KubernetesControlPlaneDegraded", + "query": "(ALERTS{alertname=\"KubeAPIErrorBudgetBurn\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedGRPCRequests\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHTTPRequestsSlow\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighCommitDurations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighFsyncDurations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedHTTPRequests\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfFailedProposals\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdHighNumberOfLeaderChanges\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"etcdMemberCommunicationSlow\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeCPUOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeCPUQuotaOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeMemoryOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeMemoryQuotaOvercommit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeClientCertificateExpiration\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeClientErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeVersionMismatch\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"coredns\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"coredns\",namespace=~\"kube-system\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-adapter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-adapter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-kube-state-metrics\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-kube-state-metrics\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "KubeStatefulSetReplicasMismatch{namespace=~'metalk8s-logging', severity='warning', statefulset=~'loki'}, KubeStatefulSetGenerationMismatch{namespace=~'metalk8s-logging', severity='warning', statefulset=~'loki'}, KubeStatefulSetUpdateNotRolledOut{namespace=~'metalk8s-logging', severity='warning', statefulset=~'loki'}, KubeDaemonSetNotScheduled{daemonset=~'fluentbit', namespace=~'metalk8s-logging', severity='warning'}, KubeDaemonSetMisScheduled{daemonset=~'fluentbit', namespace=~'metalk8s-logging', severity='warning'}, KubeDaemonSetRolloutStuck{daemonset=~'fluentbit', namespace=~'metalk8s-logging', severity='warning'}", + 
"summary": "The logging service is degraded." + }, + "duration": 60, + "evaluationTime": 0.00032739, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.915696978Z", + "name": "LoggingServiceDegraded", + "query": "(ALERTS{alertname=\"KubeStatefulSetReplicasMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-logging\",severity=\"warning\",statefulset=~\"loki\"} or ALERTS{alertname=\"KubeStatefulSetGenerationMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-logging\",severity=\"warning\",statefulset=~\"loki\"} or ALERTS{alertname=\"KubeStatefulSetUpdateNotRolledOut\",alertstate=\"firing\",namespace=~\"metalk8s-logging\",severity=\"warning\",statefulset=~\"loki\"} or ALERTS{alertname=\"KubeDaemonSetNotScheduled\",alertstate=\"firing\",daemonset=~\"fluentbit\",namespace=~\"metalk8s-logging\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetMisScheduled\",alertstate=\"firing\",daemonset=~\"fluentbit\",namespace=~\"metalk8s-logging\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetRolloutStuck\",alertstate=\"firing\",daemonset=~\"fluentbit\",namespace=~\"metalk8s-logging\",severity=\"warning\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "PrometheusTargetLimitHit{severity='warning'}, PrometheusTSDBReloadsFailing{severity='warning'}, PrometheusTSDBCompactionsFailing{severity='warning'}, PrometheusRemoteWriteDesiredShards{severity='warning'}, PrometheusOutOfOrderTimestamps{severity='warning'}, PrometheusNotificationQueueRunningFull{severity='warning'}, PrometheusNotIngestingSamples{severity='warning'}, PrometheusNotConnectedToAlertmanagers{severity='warning'}, PrometheusMissingRuleEvaluations{severity='warning'}, PrometheusErrorSendingAlertsToSomeAlertmanagers{severity='warning'}, PrometheusDuplicateTimestamps{severity='warning'}, PrometheusOperatorWatchErrors{severity='warning'}, PrometheusOperatorSyncFailed{severity='warning'}, PrometheusOperatorRejectedResources{severity='warning'}, PrometheusOperatorReconcileErrors{severity='warning'}, PrometheusOperatorNotReady{severity='warning'}, PrometheusOperatorNodeLookupErrors{severity='warning'}, PrometheusOperatorListErrors{severity='warning'}, KubeStatefulSetReplicasMismatch{namespace=~'metalk8s-monitoring', severity='warning', statefulset=~'prometheus-prometheus-operator-prometheus'}, KubeStatefulSetGenerationMismatch{namespace=~'metalk8s-monitoring', severity='warning', statefulset=~'prometheus-prometheus-operator-prometheus'}, KubeStatefulSetUpdateNotRolledOut{namespace=~'metalk8s-monitoring', severity='warning', statefulset=~'prometheus-prometheus-operator-prometheus'}, KubeDeploymentReplicasMismatch{deployment=~'prometheus-operator-operator', namespace=~'metalk8s-monitoring', severity='warning'}, KubeDeploymentGenerationMismatch{deployment=~'prometheus-operator-operator', namespace=~'metalk8s-monitoring', severity='warning'}, KubeDaemonSetNotScheduled{daemonset=~'prometheus-operator-prometheus-node-exporter', namespace=~'metalk8s-monitoring', severity='warning'}, KubeDaemonSetMisScheduled{daemonset=~'prometheus-operator-prometheus-node-exporter', namespace=~'metalk8s-monitoring', severity='warning'}, KubeDaemonSetRolloutStuck{daemonset=~'prometheus-operator-prometheus-node-exporter', namespace=~'metalk8s-monitoring', severity='warning'}", + "summary": "The monitoring service is degraded." 
+ }, + "duration": 60, + "evaluationTime": 0.001147514, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.914237636Z", + "name": "MonitoringServiceDegraded", + "query": "(ALERTS{alertname=\"PrometheusTargetLimitHit\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusTSDBReloadsFailing\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusTSDBCompactionsFailing\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusRemoteWriteDesiredShards\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOutOfOrderTimestamps\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusNotificationQueueRunningFull\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusNotIngestingSamples\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusNotConnectedToAlertmanagers\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusMissingRuleEvaluations\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusErrorSendingAlertsToSomeAlertmanagers\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusDuplicateTimestamps\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorWatchErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorSyncFailed\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorRejectedResources\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorReconcileErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorNotReady\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorNodeLookupErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"PrometheusOperatorListErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeStatefulSetReplicasMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"prometheus-prometheus-operator-prometheus\"} or ALERTS{alertname=\"KubeStatefulSetGenerationMismatch\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"prometheus-prometheus-operator-prometheus\"} or ALERTS{alertname=\"KubeStatefulSetUpdateNotRolledOut\",alertstate=\"firing\",namespace=~\"metalk8s-monitoring\",severity=\"warning\",statefulset=~\"prometheus-prometheus-operator-prometheus\"} or ALERTS{alertname=\"KubeDeploymentReplicasMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-operator\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDeploymentGenerationMismatch\",alertstate=\"firing\",deployment=~\"prometheus-operator-operator\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetNotScheduled\",alertstate=\"firing\",daemonset=~\"prometheus-operator-prometheus-node-exporter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or ALERTS{alertname=\"KubeDaemonSetMisScheduled\",alertstate=\"firing\",daemonset=~\"prometheus-operator-prometheus-node-exporter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"} or 
ALERTS{alertname=\"KubeDaemonSetRolloutStuck\",alertstate=\"firing\",daemonset=~\"prometheus-operator-prometheus-node-exporter\",namespace=~\"metalk8s-monitoring\",severity=\"warning\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "NodeNetworkReceiveErrs{severity='warning'}, NodeHighNumberConntrackEntriesUsed{severity='warning'}, NodeNetworkTransmitErrs{severity='warning'}, NodeNetworkInterfaceFlapping{severity='warning'}", + "summary": "The network is degraded." + }, + "duration": 60, + "evaluationTime": 0.000284721, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.910444137Z", + "name": "NetworkDegraded", + "query": "(ALERTS{alertname=\"NodeNetworkReceiveErrs\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeHighNumberConntrackEntriesUsed\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeNetworkTransmitErrs\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeNetworkInterfaceFlapping\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [ + { + "activeAt": "2021-10-21T08:01:12.908069235Z", + "annotations": { + "children": "KubeNodeNotReady{severity='warning'}, KubeNodeReadinessFlapping{severity='warning'}, KubeNodeUnreachable{severity='warning'}, KubeletClientCertificateExpiration{severity='warning'}, KubeletClientCertificateRenewalErrors{severity='warning'}, KubeletPlegDurationHigh{severity='warning'}, KubeletPodStartUpLatencyHigh{severity='warning'}, KubeletServerCertificateExpiration{severity='warning'}, KubeletServerCertificateRenewalErrors{severity='warning'}, KubeletTooManyPods{severity='warning'}, NodeClockNotSynchronising{severity='warning'}, NodeClockSkewDetected{severity='warning'}, NodeRAIDDiskFailure{severity='warning'}, NodeTextFileCollectorScrapeError{severity='warning'}, SystemPartitionDegraded{severity='warning'}", + "summary": "The node 192.168.1.101:9100 is degraded." + }, + "labels": { + "alertname": "NodeDegraded", + "alertstate": "firing", + "container": "node-exporter", + "endpoint": "metrics", + "instance": "192.168.1.101:9100", + "job": "node-exporter", + "namespace": "metalk8s-monitoring", + "pod": "prometheus-operator-prometheus-node-exporter-hxvmn", + "service": "prometheus-operator-prometheus-node-exporter", + "severity": "warning" + }, + "partialResponseStrategy": "WARN", + "state": "firing", + "value": "1e+00" + }, + { + "activeAt": "2021-10-21T08:01:12.908069235Z", + "annotations": { + "children": "KubeNodeNotReady{severity='warning'}, KubeNodeReadinessFlapping{severity='warning'}, KubeNodeUnreachable{severity='warning'}, KubeletClientCertificateExpiration{severity='warning'}, KubeletClientCertificateRenewalErrors{severity='warning'}, KubeletPlegDurationHigh{severity='warning'}, KubeletPodStartUpLatencyHigh{severity='warning'}, KubeletServerCertificateExpiration{severity='warning'}, KubeletServerCertificateRenewalErrors{severity='warning'}, KubeletTooManyPods{severity='warning'}, NodeClockNotSynchronising{severity='warning'}, NodeClockSkewDetected{severity='warning'}, NodeRAIDDiskFailure{severity='warning'}, NodeTextFileCollectorScrapeError{severity='warning'}, SystemPartitionDegraded{severity='warning'}", + "summary": "The node 192.168.1.100:9100 is degraded." 
+ }, + "labels": { + "alertname": "NodeDegraded", + "alertstate": "firing", + "container": "node-exporter", + "endpoint": "metrics", + "instance": "192.168.1.100:9100", + "job": "node-exporter", + "namespace": "metalk8s-monitoring", + "pod": "prometheus-operator-prometheus-node-exporter-2krc7", + "service": "prometheus-operator-prometheus-node-exporter", + "severity": "warning" + }, + "partialResponseStrategy": "WARN", + "state": "firing", + "value": "1e+00" + } + ], + "annotations": { + "children": "KubeNodeNotReady{severity='warning'}, KubeNodeReadinessFlapping{severity='warning'}, KubeNodeUnreachable{severity='warning'}, KubeletClientCertificateExpiration{severity='warning'}, KubeletClientCertificateRenewalErrors{severity='warning'}, KubeletPlegDurationHigh{severity='warning'}, KubeletPodStartUpLatencyHigh{severity='warning'}, KubeletServerCertificateExpiration{severity='warning'}, KubeletServerCertificateRenewalErrors{severity='warning'}, KubeletTooManyPods{severity='warning'}, NodeClockNotSynchronising{severity='warning'}, NodeClockSkewDetected{severity='warning'}, NodeRAIDDiskFailure{severity='warning'}, NodeTextFileCollectorScrapeError{severity='warning'}, SystemPartitionDegraded{severity='warning'}", + "summary": "The node {{ $labels.instance }} is degraded." + }, + "duration": 60, + "evaluationTime": 0.000945618, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.910730472Z", + "name": "NodeDegraded", + "query": "(ALERTS{alertname=\"KubeNodeNotReady\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeNodeReadinessFlapping\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeNodeUnreachable\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletClientCertificateExpiration\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletClientCertificateRenewalErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletPlegDurationHigh\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletPodStartUpLatencyHigh\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletServerCertificateExpiration\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletServerCertificateRenewalErrors\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"KubeletTooManyPods\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeClockNotSynchronising\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeClockSkewDetected\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeRAIDDiskFailure\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeTextFileCollectorScrapeError\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"SystemPartitionDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "state": "firing", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "MonitoringServiceDegraded{severity='warning'}, AlertingServiceDegraded{severity='warning'}, LoggingServiceDegraded{severity='warning'}, DashboardingServiceDegraded{severity='warning'}", + "summary": "The observability services are degraded." 
+ }, + "duration": 60, + "evaluationTime": 0.000140741, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.91409622Z", + "name": "ObservabilityServicesDegraded", + "query": "(ALERTS{alertname=\"MonitoringServiceDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"AlertingServiceDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"LoggingServiceDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"DashboardingServiceDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "AccessServicesDegraded{severity='warning'}, CoreServicesDegraded{severity='warning'}, ObservabilityServicesDegraded{severity='warning'}", + "summary": "The Platform services are degraded." + }, + "duration": 60, + "evaluationTime": 0.000118082, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.911851981Z", + "name": "PlatformServicesDegraded", + "query": "(ALERTS{alertname=\"AccessServicesDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"CoreServicesDegraded\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"ObservabilityServicesDegraded\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "NodeFilesystemAlmostOutOfSpace{severity='warning'}, NodeFilesystemAlmostOutOfFiles{severity='warning'}, NodeFilesystemFilesFillingUp{severity='warning'}, NodeFilesystemSpaceFillingUp{severity='warning'}", + "summary": "The system partition {{ $labels.mountpoint }} on node {{ $labels.instance }} is degraded." + }, + "duration": 60, + "evaluationTime": 0.000173602, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.911677625Z", + "name": "SystemPartitionDegraded", + "query": "(ALERTS{alertname=\"NodeFilesystemAlmostOutOfSpace\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeFilesystemAlmostOutOfFiles\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeFilesystemFilesFillingUp\",alertstate=\"firing\",severity=\"warning\"} or ALERTS{alertname=\"NodeFilesystemSpaceFillingUp\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "children": "KubePersistentVolumeFillingUp{severity='warning'}", + "summary": "The volume {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} on node {{ $labels.instance }} is degraded." 
+ }, + "duration": 60, + "evaluationTime": 7.1149e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:42.916190096Z", + "name": "VolumeDegraded", + "query": "sum by(persistentvolumeclaim, namespace, instance) (ALERTS{alertname=\"KubePersistentVolumeFillingUp\",alertstate=\"firing\",severity=\"warning\"}) >= 1", + "state": "inactive", + "type": "alerting" + } + ] + }, + { + "evaluationTime": 0.002660388, + "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-alertmanager.rules.yaml", + "interval": 30, + "lastEvaluation": "2021-10-21T08:32:30.311443906Z", + "name": "alertmanager.rules", + "partialResponseStrategy": "ABORT", + "rules": [ + { + "alerts": [], + "annotations": { + "description": "{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerclustercrashlooping", + "summary": "Half or more of the Alertmanager instances within the same cluster are crashlooping." + }, + "duration": 300, + "evaluationTime": 0.000229317, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" + }, + "lastEvaluation": "2021-10-21T08:32:30.313872163Z", + "name": "AlertmanagerClusterCrashlooping", + "query": "(count by(namespace, service) (changes(process_start_time_seconds{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[10m]) > 4) / count by(namespace, service) (up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) >= 0.5", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "description": "{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerclusterdown", + "summary": "Half or more of the Alertmanager instances within the same cluster are down." + }, + "duration": 300, + "evaluationTime": 0.000352742, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" + }, + "lastEvaluation": "2021-10-21T08:32:30.31351824Z", + "name": "AlertmanagerClusterDown", + "query": "(count by(namespace, service) (avg_over_time(up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) < 0.5) / count by(namespace, service) (up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) >= 0.5", "state": "inactive", "type": "alerting" }, @@ -73,12 +680,13 @@ "summary": "All Alertmanager instances in a cluster failed to send notifications to a critical integration." 
}, "duration": 300, - "evaluationTime": 0.000172464, + "evaluationTime": 0.000566057, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "2021-08-05T08:29:30.311367875Z", + "lastEvaluation": "2021-10-21T08:32:30.312468037Z", "name": "AlertmanagerClusterFailedToSendAlerts", "query": "min by(namespace, service, integration) (rate(alertmanager_notifications_failed_total{integration=~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{integration=~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01", "state": "inactive", @@ -92,12 +700,13 @@ "summary": "All Alertmanager instances in a cluster failed to send notifications to a non-critical integration." }, "duration": 300, - "evaluationTime": 0.000153029, + "evaluationTime": 0.00028614, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:30.311541032Z", + "lastEvaluation": "2021-10-21T08:32:30.31303535Z", "name": "AlertmanagerClusterFailedToSendAlerts", "query": "min by(namespace, service, integration) (rate(alertmanager_notifications_failed_total{integration!~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{integration!~\".*\",job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01", "state": "inactive", @@ -111,12 +720,13 @@ "summary": "Alertmanager instances within the same cluster have different configurations." }, "duration": 1200, - "evaluationTime": 9.0759e-05, + "evaluationTime": 0.000194406, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "2021-08-05T08:29:30.311694791Z", + "lastEvaluation": "2021-10-21T08:32:30.31332269Z", "name": "AlertmanagerConfigInconsistent", "query": "count by(namespace, service) (count_values by(namespace, service) (\"config_hash\", alertmanager_config_hash{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) != 1", "state": "inactive", @@ -125,115 +735,142 @@ { "alerts": [], "annotations": { - "description": "{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerclusterdown", - "summary": "Half or more of the Alertmanager instances within the same cluster are down." + "description": "Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerfailedreload", + "summary": "Reloading an Alertmanager configuration has failed." 
}, - "duration": 300, - "evaluationTime": 0.000288587, + "duration": 600, + "evaluationTime": 0.000365137, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "2021-08-05T08:29:30.311786172Z", - "name": "AlertmanagerClusterDown", - "query": "(count by(namespace, service) (avg_over_time(up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) < 0.5) / count by(namespace, service) (up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) >= 0.5", + "lastEvaluation": "2021-10-21T08:32:30.311451302Z", + "name": "AlertmanagerFailedReload", + "query": "max_over_time(alertmanager_config_last_reload_successful{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) == 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerclustercrashlooping", - "summary": "Half or more of the Alertmanager instances within the same cluster are crashlooping." + "description": "Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerfailedtosendalerts", + "summary": "An Alertmanager instance failed to send notifications." }, "duration": 300, - "evaluationTime": 0.0002969, + "evaluationTime": 0.000418496, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:30.312048261Z", + "name": "AlertmanagerFailedToSendAlerts", + "query": "(rate(alertmanager_notifications_failed_total{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(alertmanager_notifications_total{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m])) > 0.01", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "description": "Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagermembersinconsistent", + "summary": "A member of an Alertmanager cluster has not found all other cluster members." 
+ }, + "duration": 600, + "evaluationTime": 0.000228896, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "2021-08-05T08:29:30.312076764Z", - "name": "AlertmanagerClusterCrashlooping", - "query": "(count by(namespace, service) (changes(process_start_time_seconds{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[10m]) > 4) / count by(namespace, service) (up{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"})) >= 0.5", + "lastEvaluation": "2021-10-21T08:32:30.311818244Z", + "name": "AlertmanagerMembersInconsistent", + "query": "max_over_time(alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]) < on(namespace, service) group_left() count by(namespace, service) (max_over_time(alertmanager_cluster_members{job=\"prometheus-operator-alertmanager\",namespace=\"metalk8s-monitoring\"}[5m]))", "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0, + "evaluationTime": 0.044033512, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-etcd.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:45.280957376Z", "name": "etcd", + "partialResponseStrategy": "ABORT", "rules": [ { "alerts": [], "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value }})." + "message": "etcd cluster \"{{ $labels.job }}\": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}." }, - "duration": 180, - "evaluationTime": 0, - "health": "unknown", + "duration": 600, + "evaluationTime": 0.000254057, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "etcdInsufficientMembers", - "query": "sum by(job) (up{job=~\".*etcd.*\"} == bool 1) < ((count by(job) (up{job=~\".*etcd.*\"}) + 1) / 2)", + "lastEvaluation": "2021-10-21T08:32:45.322994591Z", + "name": "etcdGRPCRequestsSlow", + "query": "histogram_quantile(0.99, sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type=\"unary\",job=~\".*etcd.*\"}[5m]))) > 0.15", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": member {{ $labels.instance }} has no leader." + "message": "etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow." }, - "duration": 60, - "evaluationTime": 0, - "health": "unknown", + "duration": 600, + "evaluationTime": 6.8433e-05, + "health": "ok", "labels": { - "severity": "critical" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "etcdNoLeader", - "query": "etcd_server_has_leader{job=~\".*etcd.*\"} == 0", + "lastEvaluation": "2021-10-21T08:32:45.324920681Z", + "name": "etcdHTTPRequestsSlow", + "query": "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour." 
+ "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}." }, - "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "duration": 600, + "evaluationTime": 0.000330653, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "etcdHighNumberOfLeaderChanges", - "query": "rate(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}[15m]) > 3", + "lastEvaluation": "2021-10-21T08:32:45.324246383Z", + "name": "etcdHighCommitDurations", + "query": "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.25", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}." + "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}." }, "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000360373, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "etcdHighNumberOfFailedGRPCRequests", - "query": "100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!=\"OK\",job=~\".*etcd.*\"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) > 1", + "lastEvaluation": "2021-10-21T08:32:45.323885242Z", + "name": "etcdHighFsyncDurations", + "query": "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.5", "state": "inactive", "type": "alerting" }, @@ -243,12 +880,13 @@ "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}." }, "duration": 300, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.020640004, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:45.302352601Z", "name": "etcdHighNumberOfFailedGRPCRequests", "query": "100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!=\"OK\",job=~\".*etcd.*\"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) > 5", "state": "inactive", @@ -257,556 +895,709 @@ { "alerts": [], "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}." + "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}." 
}, "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.020552323, + "health": "ok", "labels": { - "severity": "critical" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "etcdGRPCRequestsSlow", - "query": "histogram_quantile(0.99, sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type=\"unary\",job=~\".*etcd.*\"}[5m]))) > 0.15", + "lastEvaluation": "2021-10-21T08:32:45.281798236Z", + "name": "etcdHighNumberOfFailedGRPCRequests", + "query": "100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!=\"OK\",job=~\".*etcd.*\"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) > 1", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}." + "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}." }, "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000143048, + "health": "ok", "labels": { - "severity": "warning" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "etcdMemberCommunicationSlow", - "query": "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.15", + "lastEvaluation": "2021-10-21T08:32:45.324776932Z", + "name": "etcdHighNumberOfFailedHTTPRequests", + "query": "sum by(method) (rate(etcd_http_failed_total{code!=\"404\",job=~\".*etcd.*\"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m])) > 0.05", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}." + "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}" }, - "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "duration": 600, + "evaluationTime": 0.000198033, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "etcdHighNumberOfFailedProposals", - "query": "rate(etcd_server_proposals_failed_total{job=~\".*etcd.*\"}[15m]) > 5", + "lastEvaluation": "2021-10-21T08:32:45.324578072Z", + "name": "etcdHighNumberOfFailedHTTPRequests", + "query": "sum by(method) (rate(etcd_http_failed_total{code!=\"404\",job=~\".*etcd.*\"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m])) > 0.01", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}." + "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}." 
}, - "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "duration": 900, + "evaluationTime": 0.000122839, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "etcdHighFsyncDurations", - "query": "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.5", + "lastEvaluation": "2021-10-21T08:32:45.323761592Z", + "name": "etcdHighNumberOfFailedProposals", + "query": "rate(etcd_server_proposals_failed_total{job=~\".*etcd.*\"}[15m]) > 5", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}." + "message": "etcd cluster \"{{ $labels.job }}\": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour." }, - "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "duration": 900, + "evaluationTime": 0.000127337, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "etcdHighCommitDurations", - "query": "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.25", + "lastEvaluation": "2021-10-21T08:32:45.281670108Z", + "name": "etcdHighNumberOfLeaderChanges", + "query": "rate(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}[15m]) > 3", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}" + "message": "etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value }})." }, - "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "duration": 180, + "evaluationTime": 0.000552319, + "health": "ok", "labels": { - "severity": "warning" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "etcdHighNumberOfFailedHTTPRequests", - "query": "sum by(method) (rate(etcd_http_failed_total{code!=\"404\",job=~\".*etcd.*\"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m])) > 0.01", + "lastEvaluation": "2021-10-21T08:32:45.28096744Z", + "name": "etcdInsufficientMembers", + "query": "sum by(job) (up{job=~\".*etcd.*\"} == bool 1) < ((count by(job) (up{job=~\".*etcd.*\"}) + 1) / 2)", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}." + "message": "etcd cluster \"{{ $labels.job }}\": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}." 
}, "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000508256, + "health": "ok", "labels": { - "severity": "critical" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "etcdHighNumberOfFailedHTTPRequests", - "query": "sum by(method) (rate(etcd_http_failed_total{code!=\"404\",job=~\".*etcd.*\"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m])) > 0.05", + "lastEvaluation": "2021-10-21T08:32:45.323252077Z", + "name": "etcdMemberCommunicationSlow", + "query": "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~\".*etcd.*\"}[5m])) > 0.15", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "message": "etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow." + "message": "etcd cluster \"{{ $labels.job }}\": member {{ $labels.instance }} has no leader." }, - "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "duration": 60, + "evaluationTime": 0.000147683, + "health": "ok", "labels": { - "severity": "warning" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "etcdHTTPRequestsSlow", - "query": "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15", + "lastEvaluation": "2021-10-21T08:32:45.281521418Z", + "name": "etcdNoLeader", + "query": "etcd_server_has_leader{job=~\".*etcd.*\"} == 0", "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0, + "evaluationTime": 0.001308961, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-general.rules.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:25.521411129Z", "name": "general.rules", + "partialResponseStrategy": "ABORT", "rules": [ { - "alerts": [], + "alerts": [ + { + "activeAt": "2021-10-21T07:50:55.520605063Z", + "annotations": { + "description": "100% of the fluent-bit-headless/fluent-bit-headless targets in metalk8s-logging namespace are down.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-targetdown", + "summary": "One or more targets are unreachable." + }, + "labels": { + "alertname": "TargetDown", + "job": "fluent-bit-headless", + "namespace": "metalk8s-logging", + "service": "fluent-bit-headless", + "severity": "warning" + }, + "partialResponseStrategy": "WARN", + "state": "firing", + "value": "1e+02" + } + ], "annotations": { "description": "{{ printf \"%.4g\" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-targetdown", "summary": "One or more targets are unreachable." 
}, "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.001031664, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:25.521417572Z", "name": "TargetDown", "query": "100 * (count by(job, namespace, service) (up == 0) / count by(job, namespace, service) (up)) > 10", - "state": "inactive", + "state": "firing", "type": "alerting" }, { - "alerts": [], + "alerts": [ + { + "activeAt": "2021-10-21T07:50:55.520605063Z", + "annotations": { + "description": "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-watchdog", + "summary": "An alert that should always be firing to certify that Alertmanager is working properly." + }, + "labels": { + "alertname": "Watchdog", + "severity": "none" + }, + "partialResponseStrategy": "WARN", + "state": "firing", + "value": "1e+00" + } + ], "annotations": { "description": "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-watchdog", "summary": "An alert that should always be firing to certify that Alertmanager is working properly." 
}, "duration": 0, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000265129, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "none" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:25.522451317Z", "name": "Watchdog", "query": "vector(1)", - "state": "inactive", + "state": "firing", "type": "alerting" } ] }, { - "evaluationTime": 0, + "evaluationTime": 0.020044534, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-k8s.rules.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:43.315844659Z", "name": "k8s.rules", + "partialResponseStrategy": "ABORT", "rules": [ { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate", - "query": "sum by(cluster, namespace, pod, container) (irate(container_cpu_usage_seconds_total{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"}[5m])) * on(cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\"}))", + "evaluationTime": 0.001092403, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:43.333401686Z", + "name": "namespace_cpu:kube_pod_container_resource_requests:sum", + "query": "sum by(namespace, cluster) (sum by(namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\"}) * on(namespace, pod, cluster) group_left() max by(namespace, pod) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "node_namespace_pod_container:container_memory_working_set_bytes", - "query": "container_memory_working_set_bytes{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))", + "evaluationTime": 0.001120499, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:43.332280018Z", + "name": "namespace_memory:kube_pod_container_resource_requests:sum", + "query": "sum by(namespace, cluster) (sum by(namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"memory\"}) * on(namespace, pod, cluster) group_left() max by(namespace, pod) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "node_namespace_pod_container:container_memory_rss", - "query": "container_memory_rss{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))", + "evaluationTime": 0.000322395, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "workload_type": "daemonset" + }, + "lastEvaluation": "2021-10-21T08:32:43.33540091Z", + 
"name": "namespace_workload_pod:kube_pod_owner:relabel", + "query": "max by(cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\",owner_kind=\"DaemonSet\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "node_namespace_pod_container:container_memory_cache", - "query": "container_memory_cache{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))", + "evaluationTime": 0.000904229, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "workload_type": "deployment" + }, + "lastEvaluation": "2021-10-21T08:32:43.334495284Z", + "name": "namespace_workload_pod:kube_pod_owner:relabel", + "query": "max by(cluster, namespace, workload, pod) (label_replace(label_replace(kube_pod_owner{job=\"kube-state-metrics\",owner_kind=\"ReplicaSet\"}, \"replicaset\", \"$1\", \"owner_name\", \"(.*)\") * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (1, max by(replicaset, namespace, owner_name) (kube_replicaset_owner{job=\"kube-state-metrics\"})), \"workload\", \"$1\", \"owner_name\", \"(.*)\"))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "node_namespace_pod_container:container_memory_swap", - "query": "container_memory_swap{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))", + "evaluationTime": 0.000162819, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "workload_type": "statefulset" + }, + "lastEvaluation": "2021-10-21T08:32:43.335724361Z", + "name": "namespace_workload_pod:kube_pod_owner:relabel", + "query": "max by(cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\",owner_kind=\"StatefulSet\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "namespace_memory:kube_pod_container_resource_requests:sum", - "query": "sum by(namespace, cluster) (sum by(namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"memory\"}) * on(namespace, pod, cluster) group_left() max by(namespace, pod) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))", + "evaluationTime": 0.003263931, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:43.315851137Z", + "name": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate", + "query": "sum by(cluster, namespace, pod, container) (irate(container_cpu_usage_seconds_total{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"}[5m])) * on(cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\"}))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "namespace_cpu:kube_pod_container_resource_requests:sum", - 
"query": "sum by(namespace, cluster) (sum by(namespace, pod, cluster) (max by(namespace, pod, container, cluster) (kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\"}) * on(namespace, pod, cluster) group_left() max by(namespace, pod) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)))", + "evaluationTime": 0.00327035, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:43.325805722Z", + "name": "node_namespace_pod_container:container_memory_cache", + "query": "container_memory_cache{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.003261854, + "health": "ok", "labels": { - "workload_type": "deployment" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "namespace_workload_pod:kube_pod_owner:relabel", - "query": "max by(cluster, namespace, workload, pod) (label_replace(label_replace(kube_pod_owner{job=\"kube-state-metrics\",owner_kind=\"ReplicaSet\"}, \"replicaset\", \"$1\", \"owner_name\", \"(.*)\") * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (1, max by(replicaset, namespace, owner_name) (kube_replicaset_owner{job=\"kube-state-metrics\"})), \"workload\", \"$1\", \"owner_name\", \"(.*)\"))", + "lastEvaluation": "2021-10-21T08:32:43.322541151Z", + "name": "node_namespace_pod_container:container_memory_rss", + "query": "container_memory_rss{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.003199012, + "health": "ok", "labels": { - "workload_type": "daemonset" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "namespace_workload_pod:kube_pod_owner:relabel", - "query": "max by(cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\",owner_kind=\"DaemonSet\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))", + "lastEvaluation": "2021-10-21T08:32:43.329078689Z", + "name": "node_namespace_pod_container:container_memory_swap", + "query": "container_memory_swap{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.003419938, + "health": "ok", "labels": { - "workload_type": "statefulset" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "namespace_workload_pod:kube_pod_owner:relabel", - "query": "max by(cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job=\"kube-state-metrics\",owner_kind=\"StatefulSet\"}, \"workload\", \"$1\", \"owner_name\", \"(.*)\"))", + "lastEvaluation": "2021-10-21T08:32:43.319118252Z", + "name": "node_namespace_pod_container:container_memory_working_set_bytes", + "query": 
"container_memory_working_set_bytes{image!=\"\",job=\"kubelet\",metrics_path=\"/metrics/cadvisor\"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=\"\"}))", "type": "recording" } ] }, { - "evaluationTime": 0, + "evaluationTime": 0.034840892, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kube-apiserver-availability.rules.yaml", "interval": 180, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:30:48.621117124Z", "name": "kube-apiserver-availability.rules", + "partialResponseStrategy": "ABORT", "rules": [ { - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.012604993, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "verb": "all" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:30:48.621122834Z", "name": "apiserver_request:availability30d", "query": "1 - ((sum by(cluster) (increase(apiserver_request_duration_seconds_count{verb=~\"POST|PUT|PATCH|DELETE\"}[30d])) - sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[30d]))) + (sum by(cluster) (increase(apiserver_request_duration_seconds_count{verb=~\"LIST|GET\"}[30d])) - ((sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[30d])) or vector(0)) + sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[30d])) + sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[30d])))) + sum by(cluster) (code:apiserver_request_total:increase30d{code=~\"5..\"} or vector(0))) / sum by(cluster) (code:apiserver_request_total:increase30d)", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.008342771, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "verb": "read" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:30:48.633729972Z", "name": "apiserver_request:availability30d", "query": "1 - (sum by(cluster) (increase(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30d])) - ((sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[30d])) or vector(0)) + sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[30d])) + sum by(cluster) (increase(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[30d]))) + sum by(cluster) (code:apiserver_request_total:increase30d{code=~\"5..\",verb=\"read\"} or vector(0))) / sum by(cluster) (code:apiserver_request_total:increase30d{verb=\"read\"})", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.004151861, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "verb": "write" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:30:48.642074875Z", "name": "apiserver_request:availability30d", "query": "1 - ((sum by(cluster) (increase(apiserver_request_duration_seconds_count{verb=~\"POST|PUT|PATCH|DELETE\"}[30d])) - sum by(cluster) 
(increase(apiserver_request_duration_seconds_bucket{le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[30d]))) + sum by(cluster) (code:apiserver_request_total:increase30d{code=~\"5..\",verb=\"write\"} or vector(0))) / sum by(cluster) (code:apiserver_request_total:increase30d{verb=\"write\"})", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "code_verb:apiserver_request_total:increase30d", - "query": "avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30", + "evaluationTime": 0.000178681, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "verb": "read" + }, + "lastEvaluation": "2021-10-21T08:30:48.655492031Z", + "name": "code:apiserver_request_total:increase30d", + "query": "sum by(cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~\"LIST|GET\"})", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", + "evaluationTime": 0.000282988, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "verb": "write" + }, + "lastEvaluation": "2021-10-21T08:30:48.655672399Z", + "name": "code:apiserver_request_total:increase30d", + "query": "sum by(cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000321581, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.65292396Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"GET\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.001163888, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.64991836Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"POST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"GET\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.003269497, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.646646718Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - 
"lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000323593, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.652599425Z", "name": "code_verb:apiserver_request_total:increase1h", "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"PATCH\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000681962, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.651083724Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"POST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000831478, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.651766839Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"2..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 9.2043e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.653568713Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"GET\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 6.5792e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.653324897Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"POST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"GET\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 7.7845e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.653246439Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", "type": "recording" }, { - 
"evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 5.7474e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.653510764Z", "name": "code_verb:apiserver_request_total:increase1h", "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"PATCH\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 5.9923e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.653391271Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"POST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 5.859e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.653451706Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"3..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000150259, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.654788896Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"GET\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000485333, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.653740729Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"POST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"GET\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 7.8839e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.653661338Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", + "query": "sum by(cluster, code, verb) 
(increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000130713, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.654657077Z", "name": "code_verb:apiserver_request_total:increase1h", "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"PATCH\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000176573, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.65422703Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"POST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000251733, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.654404476Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"4..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 6.8971e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.655422528Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"GET\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000110311, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.655035833Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"POST\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"GET\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 9.5166e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.654939945Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) 
(increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"LIST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 8.1924e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.655339994Z", "name": "code_verb:apiserver_request_total:increase1h", "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"PATCH\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 9.6851e-05, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:30:48.655146868Z", "name": "code_verb:apiserver_request_total:increase1h", - "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"DELETE\"}[1h]))", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"POST\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 9.4924e-05, + "health": "ok", "labels": { - "verb": "read" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "code:apiserver_request_total:increase30d", - "query": "sum by(cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~\"LIST|GET\"})", + "lastEvaluation": "2021-10-21T08:30:48.655244441Z", + "name": "code_verb:apiserver_request_total:increase1h", + "query": "sum by(cluster, code, verb) (increase(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=\"PUT\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000416218, + "health": "ok", "labels": { - "verb": "write" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "code:apiserver_request_total:increase30d", - "query": "sum by(cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~\"POST|PUT|PATCH|DELETE\"})", + "lastEvaluation": "2021-10-21T08:30:48.646228839Z", + "name": "code_verb:apiserver_request_total:increase30d", + "query": "avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30", "type": "recording" } ] }, { - "evaluationTime": 0.001224051, + "evaluationTime": 0.001048574, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kube-apiserver-slos.yaml", "interval": 30, - "lastEvaluation": "2021-08-05T08:29:32.746513551Z", + "lastEvaluation": "2021-10-21T08:32:32.747227278Z", "name": "kube-apiserver-slos", + "partialResponseStrategy": "ABORT", "rules": [ { "alerts": [], @@ -815,17 +1606,18 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn", "summary": "The API server is burning too much error budget." 
}, - "duration": 120, - "evaluationTime": 0.000481699, + "duration": 3600, + "evaluationTime": 0.000178542, "health": "ok", "labels": { - "long": "1h", - "severity": "critical", - "short": "5m" + "long": "1d", + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning", + "short": "2h" }, - "lastEvaluation": "2021-08-05T08:29:32.746520738Z", + "lastEvaluation": "2021-10-21T08:32:32.747961638Z", "name": "KubeAPIErrorBudgetBurn", - "query": "sum(apiserver_request:burnrate1h) > (14.4 * 0.01) and sum(apiserver_request:burnrate5m) > (14.4 * 0.01)", + "query": "sum(apiserver_request:burnrate1d) > (3 * 0.01) and sum(apiserver_request:burnrate2h) > (3 * 0.01)", "state": "inactive", "type": "alerting" }, @@ -836,17 +1628,18 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn", "summary": "The API server is burning too much error budget." }, - "duration": 900, - "evaluationTime": 0.000154742, + "duration": 120, + "evaluationTime": 0.000442897, "health": "ok", "labels": { - "long": "6h", + "long": "1h", + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical", - "short": "30m" + "short": "5m" }, - "lastEvaluation": "2021-08-05T08:29:32.74700463Z", + "lastEvaluation": "2021-10-21T08:32:32.74723522Z", "name": "KubeAPIErrorBudgetBurn", - "query": "sum(apiserver_request:burnrate6h) > (6 * 0.01) and sum(apiserver_request:burnrate30m) > (6 * 0.01)", + "query": "sum(apiserver_request:burnrate1h) > (14.4 * 0.01) and sum(apiserver_request:burnrate5m) > (14.4 * 0.01)", "state": "inactive", "type": "alerting" }, @@ -857,17 +1650,18 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn", "summary": "The API server is burning too much error budget." }, - "duration": 3600, - "evaluationTime": 0.000265238, + "duration": 10800, + "evaluationTime": 0.00013299, "health": "ok", "labels": { - "long": "1d", + "long": "3d", + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning", - "short": "2h" + "short": "6h" }, - "lastEvaluation": "2021-08-05T08:29:32.747160445Z", + "lastEvaluation": "2021-10-21T08:32:32.748140928Z", "name": "KubeAPIErrorBudgetBurn", - "query": "sum(apiserver_request:burnrate1d) > (3 * 0.01) and sum(apiserver_request:burnrate2h) > (3 * 0.01)", + "query": "sum(apiserver_request:burnrate3d) > (1 * 0.01) and sum(apiserver_request:burnrate6h) > (1 * 0.01)", "state": "inactive", "type": "alerting" }, @@ -878,496 +1672,538 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn", "summary": "The API server is burning too much error budget." 
}, - "duration": 10800, - "evaluationTime": 0.000305324, + "duration": 900, + "evaluationTime": 0.000279703, "health": "ok", "labels": { - "long": "3d", - "severity": "warning", - "short": "6h" + "long": "6h", + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical", + "short": "30m" }, - "lastEvaluation": "2021-08-05T08:29:32.747427599Z", + "lastEvaluation": "2021-10-21T08:32:32.747680462Z", "name": "KubeAPIErrorBudgetBurn", - "query": "sum(apiserver_request:burnrate3d) > (1 * 0.01) and sum(apiserver_request:burnrate6h) > (1 * 0.01)", + "query": "sum(apiserver_request:burnrate6h) > (6 * 0.01) and sum(apiserver_request:burnrate30m) > (6 * 0.01)", "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0.005920899, + "evaluationTime": 0.488652899, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kube-apiserver.rules.yaml", "interval": 30, - "lastEvaluation": "2021-08-05T08:29:31.976279011Z", + "lastEvaluation": "2021-10-21T08:32:31.976537981Z", "name": "kube-apiserver.rules", + "partialResponseStrategy": "ABORT", "rules": [ { - "evaluationTime": 0.001005022, + "evaluationTime": 0.016424681, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "verb": "read" }, - "lastEvaluation": "2021-08-05T08:29:31.976285869Z", + "lastEvaluation": "2021-10-21T08:32:31.976545605Z", "name": "apiserver_request:burnrate1d", "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1d])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[1d])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[1d])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[1d])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1d]))", "type": "recording" }, { - "evaluationTime": 0.00040539, + "evaluationTime": 0.006765203, "health": "ok", "labels": { - "verb": "read" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "verb": "write" }, - "lastEvaluation": "2021-08-05T08:29:31.977293169Z", - "name": "apiserver_request:burnrate1h", - "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1h])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[1h])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[1h])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[1h])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))", + "lastEvaluation": "2021-10-21T08:32:32.058221884Z", + "name": "apiserver_request:burnrate1d", + "query": "((sum by(cluster) 
(rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))", "type": "recording" }, { - "evaluationTime": 0.00049458, + "evaluationTime": 0.012458167, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "verb": "read" }, - "lastEvaluation": "2021-08-05T08:29:31.977699527Z", - "name": "apiserver_request:burnrate2h", - "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[2h])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[2h])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[2h])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[2h])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))", + "lastEvaluation": "2021-10-21T08:32:31.992973219Z", + "name": "apiserver_request:burnrate1h", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[1h])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[1h])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[1h])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[1h])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0.000367437, + "evaluationTime": 0.006202841, "health": "ok", "labels": { - "verb": "read" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "verb": "write" }, - "lastEvaluation": "2021-08-05T08:29:31.978195499Z", - "name": "apiserver_request:burnrate30m", - "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30m])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[30m])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[30m])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[30m])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))", + "lastEvaluation": "2021-10-21T08:32:32.064990451Z", + "name": 
"apiserver_request:burnrate1h", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))", "type": "recording" }, { - "evaluationTime": 0.000358372, + "evaluationTime": 0.012205701, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "verb": "read" }, - "lastEvaluation": "2021-08-05T08:29:31.978563828Z", - "name": "apiserver_request:burnrate3d", - "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[3d])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[3d])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[3d])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[3d])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))", + "lastEvaluation": "2021-10-21T08:32:32.005434243Z", + "name": "apiserver_request:burnrate2h", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[2h])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[2h])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[2h])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[2h])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[2h]))", "type": "recording" }, { - "evaluationTime": 0.000372175, + "evaluationTime": 0.006182311, "health": "ok", "labels": { - "verb": "read" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "verb": "write" }, - "lastEvaluation": "2021-08-05T08:29:31.978923137Z", - "name": "apiserver_request:burnrate5m", - "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[5m])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[5m])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[5m])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[5m])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))", + "lastEvaluation": 
"2021-10-21T08:32:32.071198052Z", + "name": "apiserver_request:burnrate2h", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))", "type": "recording" }, { - "evaluationTime": 0.00035355, + "evaluationTime": 0.008865727, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "verb": "read" }, - "lastEvaluation": "2021-08-05T08:29:31.979296207Z", - "name": "apiserver_request:burnrate6h", - "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[6h])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[6h])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[6h])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[6h])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))", + "lastEvaluation": "2021-10-21T08:32:32.017643365Z", + "name": "apiserver_request:burnrate30m", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[30m])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[30m])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[30m])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[30m])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[30m]))", "type": "recording" }, { - "evaluationTime": 0.000269247, + "evaluationTime": 0.004025447, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "verb": "write" }, - "lastEvaluation": "2021-08-05T08:29:31.979650672Z", - "name": "apiserver_request:burnrate1d", - "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1d]))", + "lastEvaluation": "2021-10-21T08:32:32.077383474Z", + "name": "apiserver_request:burnrate30m", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m])) - sum by(cluster) 
(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))", "type": "recording" }, { - "evaluationTime": 0.000256352, + "evaluationTime": 0.01203926, "health": "ok", "labels": { - "verb": "write" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "verb": "read" }, - "lastEvaluation": "2021-08-05T08:29:31.979920632Z", - "name": "apiserver_request:burnrate1h", - "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[1h]))", + "lastEvaluation": "2021-10-21T08:32:32.026512223Z", + "name": "apiserver_request:burnrate3d", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[3d])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[3d])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[3d])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[3d])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[3d]))", "type": "recording" }, { - "evaluationTime": 0.000266638, + "evaluationTime": 0.005983571, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "verb": "write" }, - "lastEvaluation": "2021-08-05T08:29:31.980177647Z", - "name": "apiserver_request:burnrate2h", - "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[2h]))", + "lastEvaluation": "2021-10-21T08:32:32.08141082Z", + "name": "apiserver_request:burnrate3d", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))", "type": "recording" }, { - "evaluationTime": 0.000322176, + "evaluationTime": 0.007217213, "health": "ok", "labels": { - "verb": "write" + 
"prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "verb": "read" }, - "lastEvaluation": "2021-08-05T08:29:31.980445344Z", - "name": "apiserver_request:burnrate30m", - "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[30m]))", + "lastEvaluation": "2021-10-21T08:32:32.038555352Z", + "name": "apiserver_request:burnrate5m", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[5m])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[5m])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[5m])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[5m])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))", "type": "recording" }, { - "evaluationTime": 0.0002526, + "evaluationTime": 0.003110297, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "verb": "write" }, - "lastEvaluation": "2021-08-05T08:29:31.980768522Z", - "name": "apiserver_request:burnrate3d", - "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[3d]))", + "lastEvaluation": "2021-10-21T08:32:32.087396731Z", + "name": "apiserver_request:burnrate5m", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))", "type": "recording" }, { - "evaluationTime": 0.000264974, + "evaluationTime": 0.012442712, "health": "ok", "labels": { - "verb": "write" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "verb": "read" }, - "lastEvaluation": "2021-08-05T08:29:31.981021778Z", - "name": "apiserver_request:burnrate5m", - "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) + sum by(cluster) 
(rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))", + "lastEvaluation": "2021-10-21T08:32:32.045776223Z", + "name": "apiserver_request:burnrate6h", + "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"LIST|GET\"}[6h])) - ((sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.1\",scope=~\"resource|\",verb=~\"LIST|GET\"}[6h])) or vector(0)) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"0.5\",scope=\"namespace\",verb=~\"LIST|GET\"}[6h])) + sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"5\",scope=\"cluster\",verb=~\"LIST|GET\"}[6h])))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[6h]))", "type": "recording" }, { - "evaluationTime": 0.000234391, + "evaluationTime": 0.005958263, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "verb": "write" }, - "lastEvaluation": "2021-08-05T08:29:31.981287498Z", + "lastEvaluation": "2021-10-21T08:32:32.09050883Z", "name": "apiserver_request:burnrate6h", "query": "((sum by(cluster) (rate(apiserver_request_duration_seconds_count{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h])) - sum by(cluster) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",le=\"1\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))) + sum by(cluster) (rate(apiserver_request_total{code=~\"5..\",job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))) / sum by(cluster) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[6h]))", "type": "recording" }, { - "evaluationTime": 6.3581e-05, + "evaluationTime": 0.080936825, "health": "ok", "labels": { - "verb": "read" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "quantile": "0.5" }, - "lastEvaluation": "2021-08-05T08:29:31.981522665Z", - "name": "code_resource:apiserver_request_total:rate5m", - "query": "sum by(cluster, code, resource) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))", + "lastEvaluation": "2021-10-21T08:32:32.38424814Z", + "name": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile", + "query": "histogram_quantile(0.5, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])))", "type": "recording" }, { - "evaluationTime": 7.4163e-05, + "evaluationTime": 0.080095466, "health": "ok", "labels": { - "verb": "write" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "quantile": "0.9" }, - "lastEvaluation": "2021-08-05T08:29:31.981586778Z", - "name": "code_resource:apiserver_request_total:rate5m", - "query": "sum by(cluster, code, resource) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))", + "lastEvaluation": "2021-10-21T08:32:32.304146772Z", + "name": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile", + "query": "histogram_quantile(0.9, sum without(instance, pod) 
(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])))", "type": "recording" }, { - "evaluationTime": 0.000111271, + "evaluationTime": 0.080209771, "health": "ok", "labels": { - "quantile": "0.99", - "verb": "read" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "quantile": "0.99" }, - "lastEvaluation": "2021-08-05T08:29:31.981661952Z", + "lastEvaluation": "2021-10-21T08:32:32.223931696Z", "name": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.99, sum by(cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))) > 0", + "query": "histogram_quantile(0.99, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])))", "type": "recording" }, { - "evaluationTime": 9.6269e-05, + "evaluationTime": 0.084295986, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "quantile": "0.99", - "verb": "write" + "verb": "read" }, - "lastEvaluation": "2021-08-05T08:29:31.981773897Z", + "lastEvaluation": "2021-10-21T08:32:32.100463508Z", "name": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.99, sum by(cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) > 0", + "query": "histogram_quantile(0.99, sum by(cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))) > 0", "type": "recording" }, { - "evaluationTime": 0.000110487, + "evaluationTime": 0.039166517, "health": "ok", "labels": { - "quantile": "0.99" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "quantile": "0.99", + "verb": "write" }, - "lastEvaluation": "2021-08-05T08:29:31.981870832Z", + "lastEvaluation": "2021-10-21T08:32:32.18476244Z", "name": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.99, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])))", + "query": "histogram_quantile(0.99, sum by(cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))) > 0", "type": "recording" }, { - "evaluationTime": 9.5591e-05, + "evaluationTime": 0.002621087, "health": "ok", "labels": { - "quantile": "0.9" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "verb": "read" }, - "lastEvaluation": "2021-08-05T08:29:31.981982001Z", - "name": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.9, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])))", + "lastEvaluation": "2021-10-21T08:32:32.096469246Z", + "name": "code_resource:apiserver_request_total:rate5m", + "query": "sum by(cluster, code, resource) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"LIST|GET\"}[5m]))", "type": "recording" }, { - "evaluationTime": 0.00011867, + "evaluationTime": 0.001368292, "health": "ok", "labels": { - 
"quantile": "0.5" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "verb": "write" }, - "lastEvaluation": "2021-08-05T08:29:31.982078273Z", - "name": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.5, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])))", + "lastEvaluation": "2021-10-21T08:32:32.099093061Z", + "name": "code_resource:apiserver_request_total:rate5m", + "query": "sum by(cluster, code, resource) (rate(apiserver_request_total{job=\"apiserver\",verb=~\"POST|PUT|PATCH|DELETE\"}[5m]))", "type": "recording" } ] }, { - "evaluationTime": 0.000347602, + "evaluationTime": 0.001000297, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kube-prometheus-general.rules.yaml", "interval": 30, - "lastEvaluation": "2021-08-05T08:29:27.650571925Z", + "lastEvaluation": "2021-10-21T08:32:27.649701944Z", "name": "kube-prometheus-general.rules", + "partialResponseStrategy": "ABORT", "rules": [ { - "evaluationTime": 0.00028237, + "evaluationTime": 0.000378395, "health": "ok", - "lastEvaluation": "2021-08-05T08:29:27.650577515Z", - "name": "count:up1", - "query": "count without(instance, pod, node) (up == 1)", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:27.650319686Z", + "name": "count:up0", + "query": "count without(instance, pod, node) (up == 0)", "type": "recording" }, { - "evaluationTime": 5.5044e-05, + "evaluationTime": 0.000609169, "health": "ok", - "lastEvaluation": "2021-08-05T08:29:27.65086119Z", - "name": "count:up0", - "query": "count without(instance, pod, node) (up == 0)", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:27.649708414Z", + "name": "count:up1", + "query": "count without(instance, pod, node) (up == 1)", "type": "recording" } ] }, { - "evaluationTime": 0, + "evaluationTime": 0.00323884, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kube-prometheus-node-recording.rules.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:43.690168182Z", "name": "kube-prometheus-node-recording.rules", + "partialResponseStrategy": "ABORT", "rules": [ { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "instance:node_cpu:rate:sum", - "query": "sum by(instance) (rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[3m]))", + "evaluationTime": 0.000455514, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:43.692947514Z", + "name": "cluster:node_cpu:ratio", + "query": "cluster:node_cpu_seconds_total:rate5m / count(sum by(instance, cpu) (node_cpu_seconds_total))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "instance:node_network_receive_bytes:rate:sum", - "query": "sum by(instance) (rate(node_network_receive_bytes_total[3m]))", + "evaluationTime": 0.000379758, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": 
"2021-10-21T08:32:43.692566584Z", + "name": "cluster:node_cpu:sum_rate5m", + "query": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "instance:node_network_transmit_bytes:rate:sum", - "query": "sum by(instance) (rate(node_network_transmit_bytes_total[3m]))", + "evaluationTime": 0.000630528, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:43.690174352Z", + "name": "instance:node_cpu:rate:sum", + "query": "sum by(instance) (rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[3m]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.000878671, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:43.691686529Z", "name": "instance:node_cpu:ratio", "query": "sum without(cpu, mode) (rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m])) / on(instance) group_left() count by(instance) (sum by(instance, cpu) (node_cpu_seconds_total))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "cluster:node_cpu:sum_rate5m", - "query": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\",mode!=\"steal\"}[5m]))", + "evaluationTime": 0.000448106, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:43.690806829Z", + "name": "instance:node_network_receive_bytes:rate:sum", + "query": "sum by(instance) (rate(node_network_receive_bytes_total[3m]))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "cluster:node_cpu:ratio", - "query": "cluster:node_cpu_seconds_total:rate5m / count(sum by(instance, cpu) (node_cpu_seconds_total))", + "evaluationTime": 0.000429402, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:43.691256127Z", + "name": "instance:node_network_transmit_bytes:rate:sum", + "query": "sum by(instance) (rate(node_network_transmit_bytes_total[3m]))", "type": "recording" } ] }, { - "evaluationTime": 0.000995042, + "evaluationTime": 0.003021723, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kube-scheduler.rules.yaml", "interval": 30, - "lastEvaluation": "2021-08-05T08:29:30.190309316Z", + "lastEvaluation": "2021-10-21T08:32:30.191811129Z", "name": "kube-scheduler.rules", + "partialResponseStrategy": "ABORT", "rules": [ { - "evaluationTime": 0.000327417, + "evaluationTime": 8.8745e-05, "health": "ok", "labels": { - "quantile": "0.99" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "quantile": "0.5" }, - "lastEvaluation": "2021-08-05T08:29:30.190315048Z", - "name": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", + "lastEvaluation": "2021-10-21T08:32:30.19474187Z", + "name": 
"cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile", + "query": "histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 0.000106285, + "evaluationTime": 8.745e-05, "health": "ok", "labels": { - "quantile": "0.99" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "quantile": "0.9" }, - "lastEvaluation": "2021-08-05T08:29:30.190643886Z", - "name": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", + "lastEvaluation": "2021-10-21T08:32:30.193888021Z", + "name": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile", + "query": "histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 8.8025e-05, + "evaluationTime": 0.000140794, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "quantile": "0.99" }, - "lastEvaluation": "2021-08-05T08:29:30.190751079Z", + "lastEvaluation": "2021-10-21T08:32:30.192885853Z", "name": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 7.8888e-05, + "evaluationTime": 0.000390153, "health": "ok", "labels": { - "quantile": "0.9" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "quantile": "0.5" }, - "lastEvaluation": "2021-08-05T08:29:30.19083978Z", + "lastEvaluation": "2021-10-21T08:32:30.19397613Z", "name": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", + "query": "histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 6.8782e-05, + "evaluationTime": 0.000480169, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "quantile": "0.9" }, - "lastEvaluation": "2021-08-05T08:29:30.190919696Z", - "name": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", + "lastEvaluation": "2021-10-21T08:32:30.193028163Z", + "name": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile", + "query": "histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 7.7415e-05, + "evaluationTime": 0.000647802, "health": "ok", "labels": { - "quantile": "0.9" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "quantile": "0.99" }, - "lastEvaluation": "2021-08-05T08:29:30.190989102Z", - "name": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.9, sum without(instance, 
pod) (rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", + "lastEvaluation": "2021-10-21T08:32:30.191816976Z", + "name": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile", + "query": "histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 9.8382e-05, + "evaluationTime": 0.000341486, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "quantile": "0.5" }, - "lastEvaluation": "2021-08-05T08:29:30.191067364Z", - "name": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", + "lastEvaluation": "2021-10-21T08:32:30.19436733Z", + "name": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile", + "query": "histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 6.5162e-05, + "evaluationTime": 0.000377589, "health": "ok", "labels": { - "quantile": "0.5" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "quantile": "0.9" }, - "lastEvaluation": "2021-08-05T08:29:30.191166534Z", + "lastEvaluation": "2021-10-21T08:32:30.193509442Z", "name": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", + "query": "histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" }, { - "evaluationTime": 6.6836e-05, + "evaluationTime": 0.000417773, "health": "ok", "labels": { - "quantile": "0.5" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "quantile": "0.99" }, - "lastEvaluation": "2021-08-05T08:29:30.191232375Z", - "name": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", + "lastEvaluation": "2021-10-21T08:32:30.192466838Z", + "name": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile", + "query": "histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])))", "type": "recording" } ] }, { - "evaluationTime": 0, + "evaluationTime": 0.000717964, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kube-state-metrics.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:46.305681257Z", "name": "kube-state-metrics", + "partialResponseStrategy": "ABORT", "rules": [ { "alerts": [], - "annotations": { - "description": "kube-state-metrics is experiencing errors at an elevated rate in list operations. 
This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors", - "summary": "kube-state-metrics is experiencing errors in list operations." - }, - "duration": 900, - "evaluationTime": 0, - "health": "unknown", - "labels": { - "severity": "critical" - }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeStateMetricsListErrors", - "query": "(sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) / sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"}[5m]))) > 0.01", - "state": "inactive", - "type": "alerting" - }, - { - "alerts": [], - "annotations": { - "description": "kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors", - "summary": "kube-state-metrics is experiencing errors in watch operations." + "annotations": { + "description": "kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors", + "summary": "kube-state-metrics is experiencing errors in list operations." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.00033623, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeStateMetricsWatchErrors", - "query": "(sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) / sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"}[5m]))) > 0.01", + "lastEvaluation": "2021-10-21T08:32:46.305687065Z", + "name": "KubeStateMetricsListErrors", + "query": "(sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) / sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"}[5m]))) > 0.01", "state": "inactive", "type": "alerting" }, @@ -1379,12 +2215,13 @@ "summary": "kube-state-metrics sharding is misconfigured." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 7.5459e-05, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:46.306164342Z", "name": "KubeStateMetricsShardingMismatch", "query": "stdvar(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) != 0", "state": "inactive", @@ -1398,397 +2235,441 @@ "summary": "kube-state-metrics shards are missing." 
}, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000156931, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:46.306240377Z", "name": "KubeStateMetricsShardsMissing", "query": "2 ^ max(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) - 1 - sum(2 ^ max by(shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"kube-state-metrics\"})) != 0", "state": "inactive", "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "description": "kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors", + "summary": "kube-state-metrics is experiencing errors in watch operations." + }, + "duration": 900, + "evaluationTime": 0.000138389, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" + }, + "lastEvaluation": "2021-10-21T08:32:46.306025287Z", + "name": "KubeStateMetricsWatchErrors", + "query": "(sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) / sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"}[5m]))) > 0.01", + "state": "inactive", + "type": "alerting" } ] }, { - "evaluationTime": 0.000828074, + "evaluationTime": 0.001541132, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubelet.rules.yaml", "interval": 30, - "lastEvaluation": "2021-08-05T08:29:26.101935191Z", + "lastEvaluation": "2021-10-21T08:32:26.102129696Z", "name": "kubelet.rules", + "partialResponseStrategy": "ABORT", "rules": [ { - "evaluationTime": 0.000598713, + "evaluationTime": 0.0004104, "health": "ok", "labels": { - "quantile": "0.99" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "quantile": "0.5" }, - "lastEvaluation": "2021-08-05T08:29:26.101952455Z", + "lastEvaluation": "2021-10-21T08:32:26.103257952Z", "name": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.99, sum by(instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\",metrics_path=\"/metrics\"})", + "query": "histogram_quantile(0.5, sum by(instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\",metrics_path=\"/metrics\"})", "type": "recording" }, { - "evaluationTime": 0.000119529, + "evaluationTime": 0.000428797, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "quantile": "0.9" }, - "lastEvaluation": "2021-08-05T08:29:26.102554756Z", + "lastEvaluation": "2021-10-21T08:32:26.102828219Z", "name": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile", "query": "histogram_quantile(0.9, sum by(instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\",metrics_path=\"/metrics\"})", "type": "recording" }, { - "evaluationTime": 8.4771e-05, + "evaluationTime": 0.000690398, "health": "ok", "labels": { - 
"quantile": "0.5" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "quantile": "0.99" }, - "lastEvaluation": "2021-08-05T08:29:26.102675214Z", + "lastEvaluation": "2021-10-21T08:32:26.102136215Z", "name": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile", - "query": "histogram_quantile(0.5, sum by(instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\",metrics_path=\"/metrics\"})", + "query": "histogram_quantile(0.99, sum by(instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\",metrics_path=\"/metrics\"})", "type": "recording" } ] }, { - "evaluationTime": 0, + "evaluationTime": 0.006543581, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-apps.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:40.004994856Z", "name": "kubernetes-apps", + "partialResponseStrategy": "ABORT", "rules": [ { "alerts": [], "annotations": { - "description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf \"%.2f\" $value }} times / 10 minutes.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping", - "summary": "Pod is crash looping." + "description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting", + "summary": "Pod container waiting longer than 1 hour" }, - "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "duration": 3600, + "evaluationTime": 0.000118489, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubePodCrashLooping", - "query": "increase(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) > 0", + "lastEvaluation": "2021-10-21T08:32:40.010127677Z", + "name": "KubeContainerWaiting", + "query": "sum by(namespace, pod, container) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\",namespace=~\".*\"}) > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready", - "summary": "Pod has been in a non-ready state for more than 15 minutes." + "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled", + "summary": "DaemonSet pods are misscheduled." 
}, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000171467, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubePodNotReady", - "query": "sum by(namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\",namespace=~\".*\",phase=~\"Pending|Unknown\"}) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"Job\"}))) > 0", + "lastEvaluation": "2021-10-21T08:32:40.010558906Z", + "name": "KubeDaemonSetMisScheduled", + "query": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch", - "summary": "Deployment generation mismatch due to possible roll-back" + "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled", + "summary": "DaemonSet pods are not scheduled." }, - "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "duration": 600, + "evaluationTime": 0.000310671, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeDeploymentGenerationMismatch", - "query": "kube_deployment_status_observed_generation{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_deployment_metadata_generation{job=\"kube-state-metrics\",namespace=~\".*\"}", + "lastEvaluation": "2021-10-21T08:32:40.010246948Z", + "name": "KubeDaemonSetNotScheduled", + "query": "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} - kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch", - "summary": "Deployment has not matched the expected number of replicas." + "description": "DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck", + "summary": "DaemonSet rollout is stuck." 
}, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000738933, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeDeploymentReplicasMismatch", - "query": "(kube_deployment_spec_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_deployment_status_replicas_available{job=\"kube-state-metrics\",namespace=~\".*\"}) and (changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) == 0)", + "lastEvaluation": "2021-10-21T08:32:40.009387543Z", + "name": "KubeDaemonSetRolloutStuck", + "query": "((kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}) or (kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != 0) or (kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}) or (kube_daemonset_status_number_available{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"})) and (changes(kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}[5m]) == 0)", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch", - "summary": "Deployment has not matched the expected number of replicas." 
+ "description": "Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch", + "summary": "Deployment generation mismatch due to possible roll-back" }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000383826, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeStatefulSetReplicasMismatch", - "query": "(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) == 0)", + "lastEvaluation": "2021-10-21T08:32:40.007426557Z", + "name": "KubeDeploymentGenerationMismatch", + "query": "kube_deployment_status_observed_generation{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_deployment_metadata_generation{job=\"kube-state-metrics\",namespace=~\".*\"}", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch", - "summary": "StatefulSet generation mismatch due to possible roll-back" + "description": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch", + "summary": "Deployment has not matched the expected number of replicas." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000571319, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeStatefulSetGenerationMismatch", - "query": "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_metadata_generation{job=\"kube-state-metrics\",namespace=~\".*\"}", + "lastEvaluation": "2021-10-21T08:32:40.007811237Z", + "name": "KubeDeploymentReplicasMismatch", + "query": "(kube_deployment_spec_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_deployment_status_replicas_available{job=\"kube-state-metrics\",namespace=~\".*\"}) and (changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) == 0)", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout", - "summary": "StatefulSet update has not been rolled out." 
+ "description": "HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout", + "summary": "HPA is running at max replicas" }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000135086, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeStatefulSetUpdateNotRolledOut", - "query": "(max without(revision) (kube_statefulset_status_current_revision{job=\"kube-state-metrics\",namespace=~\".*\"} unless kube_statefulset_status_update_revision{job=\"kube-state-metrics\",namespace=~\".*\"}) * (kube_statefulset_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"})) and (changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[5m]) == 0)", + "lastEvaluation": "2021-10-21T08:32:40.011400949Z", + "name": "KubeHpaMaxedOut", + "query": "kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} == kube_hpa_spec_max_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck", - "summary": "DaemonSet rollout is stuck." + "description": "HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch", + "summary": "HPA has not matched descired number of replicas." 
}, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000391709, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeDaemonSetRolloutStuck", - "query": "((kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}) or (kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != 0) or (kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}) or (kube_daemonset_status_number_available{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"})) and (changes(kube_daemonset_updated_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"}[5m]) == 0)", + "lastEvaluation": "2021-10-21T08:32:40.011008172Z", + "name": "KubeHpaReplicasMismatch", + "query": "(kube_hpa_status_desired_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} > kube_hpa_spec_min_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} < kube_hpa_spec_max_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and changes(kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}[15m]) == 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting", - "summary": "Pod container waiting longer than 1 hour" + "description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion", + "summary": "Job did not complete in time" }, - "duration": 3600, - "evaluationTime": 0, - "health": "unknown", + "duration": 43200, + "evaluationTime": 0.000173054, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeContainerWaiting", - "query": "sum by(namespace, pod, container) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\",namespace=~\".*\"}) > 0", + "lastEvaluation": "2021-10-21T08:32:40.010731598Z", + "name": "KubeJobCompletion", + "query": "kube_job_spec_completions{job=\"kube-state-metrics\",namespace=~\".*\"} - kube_job_status_succeeded{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.", - "runbook_url": 
"https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled", - "summary": "DaemonSet pods are not scheduled." + "description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed", + "summary": "Job failed to complete." }, - "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "duration": 900, + "evaluationTime": 0.00010127, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeDaemonSetNotScheduled", - "query": "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} - kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", + "lastEvaluation": "2021-10-21T08:32:40.010906065Z", + "name": "KubeJobFailed", + "query": "kube_job_failed{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled", - "summary": "DaemonSet pods are misscheduled." + "description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf \"%.2f\" $value }} times / 10 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping", + "summary": "Pod is crash looping." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000769569, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeDaemonSetMisScheduled", - "query": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", + "lastEvaluation": "2021-10-21T08:32:40.005000709Z", + "name": "KubePodCrashLooping", + "query": "increase(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion", - "summary": "Job did not complete in time" + "description": "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready", + "summary": "Pod has been in a non-ready state for more than 15 minutes." 
}, - "duration": 43200, - "evaluationTime": 0, - "health": "unknown", + "duration": 900, + "evaluationTime": 0.001653835, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeJobCompletion", - "query": "kube_job_spec_completions{job=\"kube-state-metrics\",namespace=~\".*\"} - kube_job_status_succeeded{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", + "lastEvaluation": "2021-10-21T08:32:40.005771549Z", + "name": "KubePodNotReady", + "query": "sum by(namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\",namespace=~\".*\",phase=~\"Pending|Unknown\"}) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"Job\"}))) > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed", - "summary": "Job failed to complete." + "description": "StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch", + "summary": "StatefulSet generation mismatch due to possible roll-back" }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000236357, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeJobFailed", - "query": "kube_job_failed{job=\"kube-state-metrics\",namespace=~\".*\"} > 0", + "lastEvaluation": "2021-10-21T08:32:40.008682227Z", + "name": "KubeStatefulSetGenerationMismatch", + "query": "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_metadata_generation{job=\"kube-state-metrics\",namespace=~\".*\"}", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch", - "summary": "HPA has not matched descired number of replicas." + "description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch", + "summary": "Deployment has not matched the expected number of replicas." 
}, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000297438, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeHpaReplicasMismatch", - "query": "(kube_hpa_status_desired_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} > kube_hpa_spec_min_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} < kube_hpa_spec_max_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and changes(kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}[15m]) == 0", + "lastEvaluation": "2021-10-21T08:32:40.008383407Z", + "name": "KubeStatefulSetReplicasMismatch", + "query": "(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and (changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[10m]) == 0)", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout", - "summary": "HPA is running at max replicas" + "description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout", + "summary": "StatefulSet update has not been rolled out." 
}, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000466204, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeHpaMaxedOut", - "query": "kube_hpa_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} == kube_hpa_spec_max_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}", + "lastEvaluation": "2021-10-21T08:32:40.008920214Z", + "name": "KubeStatefulSetUpdateNotRolledOut", + "query": "(max without(revision) (kube_statefulset_status_current_revision{job=\"kube-state-metrics\",namespace=~\".*\"} unless kube_statefulset_status_update_revision{job=\"kube-state-metrics\",namespace=~\".*\"}) * (kube_statefulset_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} != kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"})) and (changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\",namespace=~\".*\"}[5m]) == 0)", "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0.001995499, + "evaluationTime": 0.001606885, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-resources.yaml", "interval": 30, - "lastEvaluation": "2021-08-05T08:29:32.941518579Z", + "lastEvaluation": "2021-10-21T08:32:32.941996979Z", "name": "kubernetes-resources", + "partialResponseStrategy": "ABORT", "rules": [ { "alerts": [], "annotations": { - "description": "Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit", - "summary": "Cluster has overcommitted CPU resource requests." + "description": "{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh", + "summary": "Processes experience elevated CPU throttling." }, - "duration": 300, - "evaluationTime": 0.000713714, + "duration": 900, + "evaluationTime": 0.000236089, "health": "ok", "labels": { - "severity": "warning" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "info" }, - "lastEvaluation": "2021-08-05T08:29:32.941525512Z", - "name": "KubeCPUOvercommit", - "query": "sum(namespace_cpu:kube_pod_container_resource_requests:sum) / sum(kube_node_status_allocatable{resource=\"cpu\"}) > ((count(kube_node_status_allocatable{resource=\"cpu\"}) > 1) - 1) / count(kube_node_status_allocatable{resource=\"cpu\"})", + "lastEvaluation": "2021-10-21T08:32:32.943365528Z", + "name": "CPUThrottlingHigh", + "query": "sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=\"\"}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > (25 / 100)", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit", - "summary": "Cluster has overcommitted memory resource requests." 
+ "description": "Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit", + "summary": "Cluster has overcommitted CPU resource requests." }, "duration": 300, - "evaluationTime": 0.000239363, + "evaluationTime": 0.000417616, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:32.942241329Z", - "name": "KubeMemoryOvercommit", - "query": "sum(namespace_memory:kube_pod_container_resource_requests:sum) / sum(kube_node_status_allocatable{resource=\"memory\"}) > ((count(kube_node_status_allocatable{resource=\"memory\"}) > 1) - 1) / count(kube_node_status_allocatable{resource=\"memory\"})", + "lastEvaluation": "2021-10-21T08:32:32.942003159Z", + "name": "KubeCPUOvercommit", + "query": "sum(namespace_cpu:kube_pod_container_resource_requests:sum) / sum(kube_node_status_allocatable{resource=\"cpu\"}) > ((count(kube_node_status_allocatable{resource=\"cpu\"}) > 1) - 1) / count(kube_node_status_allocatable{resource=\"cpu\"})", "state": "inactive", "type": "alerting" }, @@ -1800,12 +2681,13 @@ "summary": "Cluster has overcommitted CPU resource requests." }, "duration": 300, - "evaluationTime": 0.0001194, + "evaluationTime": 0.000192032, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:32.942481682Z", + "lastEvaluation": "2021-10-21T08:32:32.942701123Z", "name": "KubeCPUQuotaOvercommit", "query": "sum(kube_resourcequota{job=\"kube-state-metrics\",resource=\"cpu\",type=\"hard\"}) / sum(kube_node_status_allocatable{resource=\"cpu\"}) > 1.5", "state": "inactive", @@ -1814,38 +2696,40 @@ { "alerts": [], "annotations": { - "description": "Cluster has overcommitted memory resource requests for Namespaces.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit", + "description": "Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit", "summary": "Cluster has overcommitted memory resource requests." 
}, "duration": 300, - "evaluationTime": 0.000119575, + "evaluationTime": 0.000277904, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:32.942601996Z", - "name": "KubeMemoryQuotaOvercommit", - "query": "sum(kube_resourcequota{job=\"kube-state-metrics\",resource=\"memory\",type=\"hard\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"memory\"}) > 1.5", + "lastEvaluation": "2021-10-21T08:32:32.942422343Z", + "name": "KubeMemoryOvercommit", + "query": "sum(namespace_memory:kube_pod_container_resource_requests:sum) / sum(kube_node_status_allocatable{resource=\"memory\"}) > ((count(kube_node_status_allocatable{resource=\"memory\"}) > 1) - 1) / count(kube_node_status_allocatable{resource=\"memory\"})", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull", - "summary": "Namespace quota is going to be full." + "description": "Cluster has overcommitted memory resource requests for Namespaces.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit", + "summary": "Cluster has overcommitted memory resource requests." }, - "duration": 900, - "evaluationTime": 0.000178422, + "duration": 300, + "evaluationTime": 0.000172213, "health": "ok", "labels": { - "severity": "info" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:32.942722358Z", - "name": "KubeQuotaAlmostFull", - "query": "kube_resourcequota{job=\"kube-state-metrics\",type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\",type=\"hard\"} > 0) > 0.9 < 1", + "lastEvaluation": "2021-10-21T08:32:32.942894936Z", + "name": "KubeMemoryQuotaOvercommit", + "query": "sum(kube_resourcequota{job=\"kube-state-metrics\",resource=\"memory\",type=\"hard\"}) / sum(kube_node_status_allocatable{job=\"kube-state-metrics\",resource=\"memory\"}) > 1.5", "state": "inactive", "type": "alerting" }, @@ -1853,18 +2737,19 @@ "alerts": [], "annotations": { "description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused", - "summary": "Namespace quota is fully used." + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull", + "summary": "Namespace quota is going to be full." 
}, "duration": 900, - "evaluationTime": 0.000188183, + "evaluationTime": 0.000113732, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "info" }, - "lastEvaluation": "2021-08-05T08:29:32.942901763Z", - "name": "KubeQuotaFullyUsed", - "query": "kube_resourcequota{job=\"kube-state-metrics\",type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\",type=\"hard\"} > 0) == 1", + "lastEvaluation": "2021-10-21T08:32:32.943067915Z", + "name": "KubeQuotaAlmostFull", + "query": "kube_resourcequota{job=\"kube-state-metrics\",type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\",type=\"hard\"} > 0) > 0.9 < 1", "state": "inactive", "type": "alerting" }, @@ -1876,12 +2761,13 @@ "summary": "Namespace quota has exceeded the limits." }, "duration": 900, - "evaluationTime": 0.000230747, + "evaluationTime": 9.0557e-05, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:32.943091768Z", + "lastEvaluation": "2021-10-21T08:32:32.943274416Z", "name": "KubeQuotaExceeded", "query": "kube_resourcequota{job=\"kube-state-metrics\",type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\",type=\"hard\"} > 0) > 1", "state": "inactive", @@ -1890,262 +2776,262 @@ { "alerts": [], "annotations": { - "description": "{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh", - "summary": "Processes experience elevated CPU throttling." + "description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused", + "summary": "Namespace quota is fully used." 
}, "duration": 900, - "evaluationTime": 0.000187565, + "evaluationTime": 9.155e-05, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "info" }, - "lastEvaluation": "2021-08-05T08:29:32.943323354Z", - "name": "CPUThrottlingHigh", - "query": "sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=\"\"}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > (25 / 100)", + "lastEvaluation": "2021-10-21T08:32:32.943182291Z", + "name": "KubeQuotaFullyUsed", + "query": "kube_resourcequota{job=\"kube-state-metrics\",type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\",type=\"hard\"} > 0) == 1", "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0.001022916, + "evaluationTime": 0.00149525, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-storage.yaml", "interval": 30, - "lastEvaluation": "2021-08-05T08:29:26.331490963Z", + "lastEvaluation": "2021-10-21T08:32:26.331929125Z", "name": "kubernetes-storage", + "partialResponseStrategy": "ABORT", "rules": [ { "alerts": [], "annotations": { - "description": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup", - "summary": "PersistentVolume is filling up." + "description": "The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors", + "summary": "PersistentVolume is having issues with provisioning." }, - "duration": 60, - "evaluationTime": 0.000490018, + "duration": 300, + "evaluationTime": 0.000164238, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "2021-08-05T08:29:26.331497357Z", - "name": "KubePersistentVolumeFillingUp", - "query": "kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} < 0.03", + "lastEvaluation": "2021-10-21T08:32:26.333257939Z", + "name": "KubePersistentVolumeErrors", + "query": "kube_persistentvolume_status_phase{job=\"kube-state-metrics\",phase=~\"Failed|Pending\"} > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.", + "description": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup", "summary": "PersistentVolume is filling up." 
}, - "duration": 3600, - "evaluationTime": 0.000388512, + "duration": 60, + "evaluationTime": 0.00076122, "health": "ok", "labels": { - "severity": "warning" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" }, - "lastEvaluation": "2021-08-05T08:29:26.331989558Z", + "lastEvaluation": "2021-10-21T08:32:26.331967731Z", "name": "KubePersistentVolumeFillingUp", - "query": "(kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"}) < 0.15 and predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"}[6h], 4 * 24 * 3600) < 0", + "query": "kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} < 0.03", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors", - "summary": "PersistentVolume is having issues with provisioning." + "description": "Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup", + "summary": "PersistentVolume is filling up." 
}, - "duration": 300, - "evaluationTime": 0.000130252, + "duration": 3600, + "evaluationTime": 0.000525899, "health": "ok", "labels": { - "severity": "critical" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:26.332380443Z", - "name": "KubePersistentVolumeErrors", - "query": "kube_persistentvolume_status_phase{job=\"kube-state-metrics\",phase=~\"Failed|Pending\"} > 0", + "lastEvaluation": "2021-10-21T08:32:26.332730703Z", + "name": "KubePersistentVolumeFillingUp", + "query": "(kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"}) < 0.15 and predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\",namespace=~\".*\"}[6h], 4 * 24 * 3600) < 0", "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0, + "evaluationTime": 0.00630703, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-system-apiserver.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:22.147506303Z", "name": "kubernetes-system-apiserver", + "partialResponseStrategy": "ABORT", "rules": [ { "alerts": [], "annotations": { - "description": "A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration", - "summary": "Client certificate is about to expire." + "description": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown", + "summary": "An aggregated API is down." }, - "duration": 0, - "evaluationTime": 0, - "health": "unknown", + "duration": 300, + "evaluationTime": 0.00080116, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeClientCertificateExpiration", - "query": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800", + "lastEvaluation": "2021-10-21T08:32:22.148586204Z", + "name": "AggregatedAPIDown", + "query": "(1 - max by(name, namespace) (avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration", - "summary": "Client certificate is about to expire." + "description": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. 
It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors", + "summary": "An aggregated API has reported errors." }, "duration": 0, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 8.9418e-05, + "health": "ok", "labels": { - "severity": "critical" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeClientCertificateExpiration", - "query": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 86400", + "lastEvaluation": "2021-10-21T08:32:22.148496072Z", + "name": "AggregatedAPIErrors", + "query": "sum by(name, namespace) (increase(aggregator_unavailable_apiservice_total[10m])) > 4", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors", - "summary": "An aggregated API has reported errors." + "description": "KubeAPI has disappeared from Prometheus target discovery.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown", + "summary": "Target disappeared from Prometheus target discovery." }, - "duration": 0, - "evaluationTime": 0, - "health": "unknown", + "duration": 900, + "evaluationTime": 0.000101287, + "health": "ok", "labels": { - "severity": "warning" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "AggregatedAPIErrors", - "query": "sum by(name, namespace) (increase(aggregator_unavailable_apiservice_total[10m])) > 4", + "lastEvaluation": "2021-10-21T08:32:22.149388252Z", + "name": "KubeAPIDown", + "query": "absent(up{job=\"apiserver\"} == 1)", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown", - "summary": "An aggregated API is down." + "description": "The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapiterminatedrequests", + "summary": "The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests." 
}, "duration": 300, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.004320346, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "AggregatedAPIDown", - "query": "(1 - max by(name, namespace) (avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85", + "lastEvaluation": "2021-10-21T08:32:22.149490253Z", + "name": "KubeAPITerminatedRequests", + "query": "sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m])) / (sum(rate(apiserver_request_total{job=\"apiserver\"}[10m])) + sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m]))) > 0.2", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "KubeAPI has disappeared from Prometheus target discovery.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown", - "summary": "Target disappeared from Prometheus target discovery." + "description": "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration", + "summary": "Client certificate is about to expire." }, - "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "duration": 0, + "evaluationTime": 0.000372017, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeAPIDown", - "query": "absent(up{job=\"apiserver\"} == 1)", + "lastEvaluation": "2021-10-21T08:32:22.148123206Z", + "name": "KubeClientCertificateExpiration", + "query": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 86400", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapiterminatedrequests", - "summary": "The apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests." + "description": "A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration", + "summary": "Client certificate is about to expire." 
}, - "duration": 300, - "evaluationTime": 0, - "health": "unknown", + "duration": 0, + "evaluationTime": 0.000609031, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeAPITerminatedRequests", - "query": "sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m])) / (sum(rate(apiserver_request_total{job=\"apiserver\"}[10m])) + sum(rate(apiserver_request_terminations_total{job=\"apiserver\"}[10m]))) > 0.2", + "lastEvaluation": "2021-10-21T08:32:22.147512831Z", + "name": "KubeClientCertificateExpiration", + "query": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800", "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0.000732412, + "evaluationTime": 0.000275401, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-system-controller-manager.yaml", "interval": 30, - "lastEvaluation": "2021-08-05T08:29:35.675742785Z", + "lastEvaluation": "2021-10-21T08:32:35.675863915Z", "name": "kubernetes-system-controller-manager", + "partialResponseStrategy": "ABORT", "rules": [ { - "alerts": [ - { - "activeAt": "2021-08-05T08:29:35.675120003Z", - "annotations": { - "description": "KubeControllerManager has disappeared from Prometheus target discovery.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown", - "summary": "Target disappeared from Prometheus target discovery." - }, - "labels": { - "alertname": "KubeControllerManagerDown", - "severity": "critical" - }, - "state": "pending", - "value": "1e+00" - } - ], + "alerts": [], "annotations": { "description": "KubeControllerManager has disappeared from Prometheus target discovery.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown", "summary": "Target disappeared from Prometheus target discovery." }, "duration": 900, - "evaluationTime": 0.000720188, + "evaluationTime": 0.00026561, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "2021-08-05T08:29:35.675749807Z", + "lastEvaluation": "2021-10-21T08:32:35.675870703Z", "name": "KubeControllerManagerDown", "query": "absent(up{job=\"kube-controller-manager\"} == 1)", - "state": "pending", + "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0, + "evaluationTime": 0.003535335, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-system-kubelet.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:38.337978538Z", "name": "kubernetes-system-kubelet", + "partialResponseStrategy": "ABORT", "rules": [ { "alerts": [], @@ -2155,12 +3041,13 @@ "summary": "Node is not ready." 
}, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000247693, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:38.337984871Z", "name": "KubeNodeNotReady", "query": "kube_node_status_condition{condition=\"Ready\",job=\"kube-state-metrics\",status=\"true\"} == 0", "state": "inactive", @@ -2169,152 +3056,160 @@ { "alerts": [], "annotations": { - "description": "{{ $labels.node }} is unreachable and some workloads may be rescheduled.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable", - "summary": "Node is unreachable." + "description": "The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping", + "summary": "Node readiness status is flapping." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000137405, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeNodeUnreachable", - "query": "(kube_node_spec_taint{effect=\"NoSchedule\",job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\"} unless ignoring(key, value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1", + "lastEvaluation": "2021-10-21T08:32:38.339963052Z", + "name": "KubeNodeReadinessFlapping", + "query": "sum by(node) (changes(kube_node_status_condition{condition=\"Ready\",status=\"true\"}[15m])) > 2", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods", - "summary": "Kubelet is running at capacity." + "description": "{{ $labels.node }} is unreachable and some workloads may be rescheduled.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable", + "summary": "Node is unreachable." 
}, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.00020408, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeletTooManyPods", - "query": "count by(node) ((kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance, pod, namespace, cluster) group_left(node) topk by(instance, pod, namespace, cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})) / max by(node) (kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1) > 0.95", + "lastEvaluation": "2021-10-21T08:32:38.338234137Z", + "name": "KubeNodeUnreachable", + "query": "(kube_node_spec_taint{effect=\"NoSchedule\",job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\"} unless ignoring(key, value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping", - "summary": "Node readiness status is flapping." + "description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration", + "summary": "Kubelet client certificate is about to expire." }, - "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "duration": 0, + "evaluationTime": 0.000108043, + "health": "ok", "labels": { - "severity": "warning" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeNodeReadinessFlapping", - "query": "sum by(node) (changes(kube_node_status_condition{condition=\"Ready\",status=\"true\"}[15m])) > 2", + "lastEvaluation": "2021-10-21T08:32:38.341020851Z", + "name": "KubeletClientCertificateExpiration", + "query": "kubelet_certificate_manager_client_ttl_seconds < 86400", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh", - "summary": "Kubelet Pod Lifecycle Event Generator is taking too long to relist." + "description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration", + "summary": "Kubelet client certificate is about to expire." 
}, - "duration": 300, - "evaluationTime": 0, - "health": "unknown", + "duration": 0, + "evaluationTime": 7.0061e-05, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeletPlegDurationHigh", - "query": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} >= 10", + "lastEvaluation": "2021-10-21T08:32:38.340950154Z", + "name": "KubeletClientCertificateExpiration", + "query": "kubelet_certificate_manager_client_ttl_seconds < 604800", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh", - "summary": "Kubelet Pod startup latency is too high." + "description": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors", + "summary": "Kubelet has failed to renew its client certificate." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 7.4842e-05, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeletPodStartUpLatencyHigh", - "query": "histogram_quantile(0.99, sum by(instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\",metrics_path=\"/metrics\"}[5m]))) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\",metrics_path=\"/metrics\"} > 60", + "lastEvaluation": "2021-10-21T08:32:38.341259052Z", + "name": "KubeletClientCertificateRenewalErrors", + "query": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration", - "summary": "Kubelet client certificate is about to expire." + "description": "Kubelet has disappeared from Prometheus target discovery.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown", + "summary": "Target disappeared from Prometheus target discovery." 
}, - "duration": 0, - "evaluationTime": 0, - "health": "unknown", + "duration": 900, + "evaluationTime": 0.000101933, + "health": "ok", "labels": { - "severity": "warning" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeletClientCertificateExpiration", - "query": "kubelet_certificate_manager_client_ttl_seconds < 604800", + "lastEvaluation": "2021-10-21T08:32:38.341409519Z", + "name": "KubeletDown", + "query": "absent(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1)", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration", - "summary": "Kubelet client certificate is about to expire." + "description": "The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh", + "summary": "Kubelet Pod Lifecycle Event Generator is taking too long to relist." }, - "duration": 0, - "evaluationTime": 0, - "health": "unknown", + "duration": 300, + "evaluationTime": 6.574e-05, + "health": "ok", "labels": { - "severity": "critical" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeletClientCertificateExpiration", - "query": "kubelet_certificate_manager_client_ttl_seconds < 86400", + "lastEvaluation": "2021-10-21T08:32:38.340101144Z", + "name": "KubeletPlegDurationHigh", + "query": "node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile=\"0.99\"} >= 10", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration", - "summary": "Kubelet server certificate is about to expire." + "description": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh", + "summary": "Kubelet Pod startup latency is too high." 
}, - "duration": 0, - "evaluationTime": 0, - "health": "unknown", + "duration": 900, + "evaluationTime": 0.000781676, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeletServerCertificateExpiration", - "query": "kubelet_certificate_manager_server_ttl_seconds < 604800", + "lastEvaluation": "2021-10-21T08:32:38.34016745Z", + "name": "KubeletPodStartUpLatencyHigh", + "query": "histogram_quantile(0.99, sum by(instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\",metrics_path=\"/metrics\"}[5m]))) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\",metrics_path=\"/metrics\"} > 60", "state": "inactive", "type": "alerting" }, @@ -2326,12 +3221,13 @@ "summary": "Kubelet server certificate is about to expire." }, "duration": 0, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 4.3281e-05, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:38.341215164Z", "name": "KubeletServerCertificateExpiration", "query": "kubelet_certificate_manager_server_ttl_seconds < 86400", "state": "inactive", @@ -2340,19 +3236,20 @@ { "alerts": [], "annotations": { - "description": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors", - "summary": "Kubelet has failed to renew its client certificate." + "description": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration", + "summary": "Kubelet server certificate is about to expire." }, - "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "duration": 0, + "evaluationTime": 8.3805e-05, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeletClientCertificateRenewalErrors", - "query": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0", + "lastEvaluation": "2021-10-21T08:32:38.341130701Z", + "name": "KubeletServerCertificateExpiration", + "query": "kubelet_certificate_manager_server_ttl_seconds < 604800", "state": "inactive", "type": "alerting" }, @@ -2364,12 +3261,13 @@ "summary": "Kubelet has failed to renew its server certificate." 
}, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 7.4541e-05, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:38.341334469Z", "name": "KubeletServerCertificateRenewalErrors", "query": "increase(kubelet_server_expiration_renew_errors[5m]) > 0", "state": "inactive", @@ -2378,30 +3276,32 @@ { "alerts": [], "annotations": { - "description": "Kubelet has disappeared from Prometheus target discovery.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown", - "summary": "Target disappeared from Prometheus target discovery." + "description": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods", + "summary": "Kubelet is running at capacity." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.001523087, + "health": "ok", "labels": { - "severity": "critical" + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeletDown", - "query": "absent(up{job=\"kubelet\",metrics_path=\"/metrics\"} == 1)", + "lastEvaluation": "2021-10-21T08:32:38.33843889Z", + "name": "KubeletTooManyPods", + "query": "count by(node) ((kube_pod_status_phase{job=\"kube-state-metrics\",phase=\"Running\"} == 1) * on(instance, pod, namespace, cluster) group_left(node) topk by(instance, pod, namespace, cluster) (1, kube_pod_info{job=\"kube-state-metrics\"})) / max by(node) (kube_node_status_capacity{job=\"kube-state-metrics\",resource=\"pods\"} != 1) > 0.95", "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0, + "evaluationTime": 0.000470343, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-system-scheduler.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:19.588529821Z", "name": "kubernetes-system-scheduler", + "partialResponseStrategy": "ABORT", "rules": [ { "alerts": [], @@ -2411,12 +3311,13 @@ "summary": "Target disappeared from Prometheus target discovery." 
}, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000457826, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:19.588539111Z", "name": "KubeSchedulerDown", "query": "absent(up{job=\"kube-scheduler\"} == 1)", "state": "inactive", @@ -2425,210 +3326,316 @@ ] }, { - "evaluationTime": 0, + "evaluationTime": 0.001497536, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-kubernetes-system.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:17.922937061Z", "name": "kubernetes-system", + "partialResponseStrategy": "ABORT", "rules": [ { "alerts": [], "annotations": { - "description": "There are {{ $value }} different semantic versions of Kubernetes components running.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch", - "summary": "Different semantic versions of Kubernetes components running." + "description": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors", + "summary": "Kubernetes API server client is experiencing errors." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000912229, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeVersionMismatch", - "query": "count(count by(git_version) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"}, \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", + "lastEvaluation": "2021-10-21T08:32:17.923519789Z", + "name": "KubeClientErrors", + "query": "(sum by(instance, job) (rate(rest_client_requests_total{code=~\"5..\"}[5m])) / sum by(instance, job) (rate(rest_client_requests_total[5m]))) > 0.01", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors", - "summary": "Kubernetes API server client is experiencing errors." + "description": "There are {{ $value }} different semantic versions of Kubernetes components running.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch", + "summary": "Different semantic versions of Kubernetes components running." 
}, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.00057511, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "KubeClientErrors", - "query": "(sum by(instance, job) (rate(rest_client_requests_total{code=~\"5..\"}[5m])) / sum by(instance, job) (rate(rest_client_requests_total[5m]))) > 0.01", + "lastEvaluation": "2021-10-21T08:32:17.922943195Z", + "name": "KubeVersionMismatch", + "query": "count(count by(git_version) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"}, \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0.001619265, + "evaluationTime": 0.003452158, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-node-exporter.rules.yaml", "interval": 30, - "lastEvaluation": "2021-08-05T08:29:27.325028126Z", + "lastEvaluation": "2021-10-21T08:32:27.325839644Z", "name": "node-exporter.rules", + "partialResponseStrategy": "ABORT", "rules": [ { - "evaluationTime": 0.000312794, - "health": "ok", - "lastEvaluation": "2021-08-05T08:29:27.325040219Z", - "name": "instance:node_num_cpu:sum", - "query": "count without(cpu) (count without(mode) (node_cpu_seconds_total{job=\"node-exporter\"}))", - "type": "recording" - }, - { - "evaluationTime": 0.00015216, + "evaluationTime": 0.000263411, "health": "ok", - "lastEvaluation": "2021-08-05T08:29:27.325354973Z", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:27.326540533Z", "name": "instance:node_cpu_utilisation:rate5m", "query": "1 - avg without(cpu, mode) (rate(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}[5m]))", "type": "recording" }, { - "evaluationTime": 0.000143513, + "evaluationTime": 0.000154181, "health": "ok", - "lastEvaluation": "2021-08-05T08:29:27.325507949Z", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:27.326804879Z", "name": "instance:node_load1_per_cpu:ratio", "query": "(node_load1{job=\"node-exporter\"} / instance:node_num_cpu:sum{job=\"node-exporter\"})", "type": "recording" }, { - "evaluationTime": 0.000147387, + "evaluationTime": 0.00014866, "health": "ok", - "lastEvaluation": "2021-08-05T08:29:27.32565247Z", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:27.326959817Z", "name": "instance:node_memory_utilisation:ratio", "query": "1 - (node_memory_MemAvailable_bytes{job=\"node-exporter\"} / node_memory_MemTotal_bytes{job=\"node-exporter\"})", "type": "recording" }, { - "evaluationTime": 9.6468e-05, + "evaluationTime": 0.000458261, "health": "ok", - "lastEvaluation": "2021-08-05T08:29:27.325801962Z", - "name": "instance:node_vmstat_pgmajfault:rate5m", - "query": "rate(node_vmstat_pgmajfault{job=\"node-exporter\"}[5m])", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:27.327654904Z", + "name": "instance:node_network_receive_bytes_excluding_lo:rate5m", + "query": "sum without(device) (rate(node_network_receive_bytes_total{device!=\"lo\",job=\"node-exporter\"}[5m]))", "type": "recording" }, { - "evaluationTime": 0.000128537, + "evaluationTime": 0.000395081, "health": "ok", - 
"lastEvaluation": "2021-08-05T08:29:27.325899322Z", - "name": "instance_device:node_disk_io_time_seconds:rate5m", - "query": "rate(node_disk_io_time_seconds_total{device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\",job=\"node-exporter\"}[5m])", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:27.328529437Z", + "name": "instance:node_network_receive_drop_excluding_lo:rate5m", + "query": "sum without(device) (rate(node_network_receive_drop_total{device!=\"lo\",job=\"node-exporter\"}[5m]))", "type": "recording" }, { - "evaluationTime": 0.00016345, + "evaluationTime": 0.000414391, "health": "ok", - "lastEvaluation": "2021-08-05T08:29:27.326028668Z", - "name": "instance_device:node_disk_io_time_weighted_seconds:rate5m", - "query": "rate(node_disk_io_time_weighted_seconds_total{device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\",job=\"node-exporter\"}[5m])", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:27.328114142Z", + "name": "instance:node_network_transmit_bytes_excluding_lo:rate5m", + "query": "sum without(device) (rate(node_network_transmit_bytes_total{device!=\"lo\",job=\"node-exporter\"}[5m]))", "type": "recording" }, { - "evaluationTime": 0.000151948, + "evaluationTime": 0.000363995, "health": "ok", - "lastEvaluation": "2021-08-05T08:29:27.326193112Z", - "name": "instance:node_network_receive_bytes_excluding_lo:rate5m", - "query": "sum without(device) (rate(node_network_receive_bytes_total{device!=\"lo\",job=\"node-exporter\"}[5m]))", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:27.328925482Z", + "name": "instance:node_network_transmit_drop_excluding_lo:rate5m", + "query": "sum without(device) (rate(node_network_transmit_drop_total{device!=\"lo\",job=\"node-exporter\"}[5m]))", "type": "recording" }, { - "evaluationTime": 0.000151124, + "evaluationTime": 0.000691436, "health": "ok", - "lastEvaluation": "2021-08-05T08:29:27.326346445Z", - "name": "instance:node_network_transmit_bytes_excluding_lo:rate5m", - "query": "sum without(device) (rate(node_network_transmit_bytes_total{device!=\"lo\",job=\"node-exporter\"}[5m]))", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:27.325847228Z", + "name": "instance:node_num_cpu:sum", + "query": "count without(cpu) (count without(mode) (node_cpu_seconds_total{job=\"node-exporter\"}))", "type": "recording" }, { - "evaluationTime": 6.5027e-05, + "evaluationTime": 8.7963e-05, "health": "ok", - "lastEvaluation": "2021-08-05T08:29:27.32649886Z", - "name": "instance:node_network_receive_drop_excluding_lo:rate5m", - "query": "sum without(device) (rate(node_network_receive_drop_total{device!=\"lo\",job=\"node-exporter\"}[5m]))", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:27.327109188Z", + "name": "instance:node_vmstat_pgmajfault:rate5m", + "query": "rate(node_vmstat_pgmajfault{job=\"node-exporter\"}[5m])", "type": "recording" }, { - "evaluationTime": 7.9169e-05, + "evaluationTime": 0.00022782, "health": "ok", - "lastEvaluation": "2021-08-05T08:29:27.326564489Z", - "name": "instance:node_network_transmit_drop_excluding_lo:rate5m", - "query": "sum without(device) 
(rate(node_network_transmit_drop_total{device!=\"lo\",job=\"node-exporter\"}[5m]))", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:27.327197836Z", + "name": "instance_device:node_disk_io_time_seconds:rate5m", + "query": "rate(node_disk_io_time_seconds_total{device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\",job=\"node-exporter\"}[5m])", + "type": "recording" + }, + { + "evaluationTime": 0.000227384, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:27.327426425Z", + "name": "instance_device:node_disk_io_time_weighted_seconds:rate5m", + "query": "rate(node_disk_io_time_weighted_seconds_total{device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\",job=\"node-exporter\"}[5m])", "type": "recording" } ] }, { - "evaluationTime": 0.003346354, + "evaluationTime": 0.043766772, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-node-exporter.yaml", "interval": 30, - "lastEvaluation": "2021-08-05T08:29:35.358810274Z", + "lastEvaluation": "2021-10-21T08:32:35.359876017Z", "name": "node-exporter", + "partialResponseStrategy": "ABORT", "rules": [ + { + "alerts": [ + { + "activeAt": "2021-10-21T07:51:05.358542542Z", + "annotations": { + "message": "Clock on 192.168.1.100:9100 is not synchronising. Ensure NTP is configured on this host.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising", + "summary": "Clock not synchronising." + }, + "labels": { + "alertname": "NodeClockNotSynchronising", + "container": "node-exporter", + "endpoint": "metrics", + "instance": "192.168.1.100:9100", + "job": "node-exporter", + "namespace": "metalk8s-monitoring", + "pod": "prometheus-operator-prometheus-node-exporter-2krc7", + "service": "prometheus-operator-prometheus-node-exporter", + "severity": "warning" + }, + "partialResponseStrategy": "WARN", + "state": "firing", + "value": "0e+00" + }, + { + "activeAt": "2021-10-21T07:51:05.358542542Z", + "annotations": { + "message": "Clock on 192.168.1.101:9100 is not synchronising. Ensure NTP is configured on this host.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising", + "summary": "Clock not synchronising." + }, + "labels": { + "alertname": "NodeClockNotSynchronising", + "container": "node-exporter", + "endpoint": "metrics", + "instance": "192.168.1.101:9100", + "job": "node-exporter", + "namespace": "metalk8s-monitoring", + "pod": "prometheus-operator-prometheus-node-exporter-hxvmn", + "service": "prometheus-operator-prometheus-node-exporter", + "severity": "warning" + }, + "partialResponseStrategy": "WARN", + "state": "firing", + "value": "0e+00" + } + ], + "annotations": { + "message": "Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising", + "summary": "Clock not synchronising." 
+ }, + "duration": 600, + "evaluationTime": 0.001698564, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:35.400992248Z", + "name": "NodeClockNotSynchronising", + "query": "min_over_time(node_timex_sync_status[5m]) == 0 and node_timex_maxerror_seconds >= 16", + "state": "firing", + "type": "alerting" + }, { "alerts": [], "annotations": { - "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup", - "summary": "Filesystem is predicted to run out of space within the next 24 hours." + "message": "Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected", + "summary": "Clock skew detected." }, - "duration": 3600, - "evaluationTime": 0.000881355, + "duration": 600, + "evaluationTime": 0.000786153, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:35.358817782Z", - "name": "NodeFilesystemSpaceFillingUp", - "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 40 and predict_linear(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", + "lastEvaluation": "2021-10-21T08:32:35.400205175Z", + "name": "NodeClockSkewDetected", + "query": "(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up fast.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup", - "summary": "Filesystem is predicted to run out of space within the next 4 hours." + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles", + "summary": "Filesystem has less than 8% inodes left." 
}, "duration": 3600, - "evaluationTime": 0.000264111, + "evaluationTime": 0.007015993, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "2021-08-05T08:29:35.359701487Z", - "name": "NodeFilesystemSpaceFillingUp", - "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and predict_linear(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", + "lastEvaluation": "2021-10-21T08:32:35.387362825Z", + "name": "NodeFilesystemAlmostOutOfFiles", + "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 8 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace", - "summary": "Filesystem has less than 20% space left." + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles", + "summary": "Filesystem has less than 15% inodes left." }, "duration": 3600, - "evaluationTime": 0.000141783, + "evaluationTime": 0.005049341, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:35.359966493Z", - "name": "NodeFilesystemAlmostOutOfSpace", - "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", + "lastEvaluation": "2021-10-21T08:32:35.382312061Z", + "name": "NodeFilesystemAlmostOutOfFiles", + "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 15 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", "type": "alerting" }, @@ -2640,12 +3647,13 @@ "summary": "Filesystem has less than 12% space left." 
}, "duration": 3600, - "evaluationTime": 0.000164005, + "evaluationTime": 0.001789966, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "2021-08-05T08:29:35.360109123Z", + "lastEvaluation": "2021-10-21T08:32:35.368294569Z", "name": "NodeFilesystemAlmostOutOfSpace", "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 12 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", @@ -2654,19 +3662,20 @@ { "alerts": [], "annotations": { - "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup", - "summary": "Filesystem is predicted to run out of inodes within the next 24 hours." + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace", + "summary": "Filesystem has less than 20% space left." }, "duration": 3600, - "evaluationTime": 0.000296165, + "evaluationTime": 0.002011131, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:35.360274144Z", - "name": "NodeFilesystemFilesFillingUp", - "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 40 and predict_linear(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", + "lastEvaluation": "2021-10-21T08:32:35.366282142Z", + "name": "NodeFilesystemAlmostOutOfSpace", + "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", "type": "alerting" }, @@ -2678,12 +3687,13 @@ "summary": "Filesystem is predicted to run out of inodes within the next 4 hours." 
}, "duration": 3600, - "evaluationTime": 0.000384795, + "evaluationTime": 0.009030322, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "2021-08-05T08:29:35.360571987Z", + "lastEvaluation": "2021-10-21T08:32:35.373280092Z", "name": "NodeFilesystemFilesFillingUp", "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and predict_linear(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", @@ -2692,76 +3702,60 @@ { "alerts": [], "annotations": { - "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles", - "summary": "Filesystem has less than 15% inodes left." + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup", + "summary": "Filesystem is predicted to run out of inodes within the next 24 hours." }, "duration": 3600, - "evaluationTime": 0.000189107, + "evaluationTime": 0.003193163, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:35.360958094Z", - "name": "NodeFilesystemAlmostOutOfFiles", - "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 15 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", + "lastEvaluation": "2021-10-21T08:32:35.370085535Z", + "name": "NodeFilesystemFilesFillingUp", + "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 40 and predict_linear(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles", - "summary": "Filesystem has less than 8% inodes left." + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up fast.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup", + "summary": "Filesystem is predicted to run out of space within the next 4 hours." 
}, "duration": 3600, - "evaluationTime": 0.000162797, + "evaluationTime": 0.003011202, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "2021-08-05T08:29:35.361148216Z", - "name": "NodeFilesystemAlmostOutOfFiles", - "query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 8 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", - "state": "inactive", - "type": "alerting" - }, - { - "alerts": [], - "annotations": { - "description": "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs", - "summary": "Network interface is reporting many receive errors." - }, - "duration": 3600, - "evaluationTime": 0.000111343, - "health": "ok", - "labels": { - "severity": "warning" - }, - "lastEvaluation": "2021-08-05T08:29:35.361311978Z", - "name": "NodeNetworkReceiveErrs", - "query": "increase(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01", + "lastEvaluation": "2021-10-21T08:32:35.363269381Z", + "name": "NodeFilesystemSpaceFillingUp", + "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and predict_linear(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs", - "summary": "Network interface is reporting many transmit errors." + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup", + "summary": "Filesystem is predicted to run out of space within the next 24 hours." 
}, "duration": 3600, - "evaluationTime": 0.000104584, + "evaluationTime": 0.003385063, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:35.36142417Z", - "name": "NodeNetworkTransmitErrs", - "query": "increase(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01", + "lastEvaluation": "2021-10-21T08:32:35.359882168Z", + "name": "NodeFilesystemSpaceFillingUp", + "query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 40 and predict_linear(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)", "state": "inactive", "type": "alerting" }, @@ -2773,12 +3767,13 @@ "summary": "Number of conntrack are getting close to the limit" }, "duration": 0, - "evaluationTime": 0.000108123, + "evaluationTime": 0.000425587, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:35.361529541Z", + "lastEvaluation": "2021-10-21T08:32:35.399778712Z", "name": "NodeHighNumberConntrackEntriesUsed", "query": "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75", "state": "inactive", @@ -2787,57 +3782,40 @@ { "alerts": [], "annotations": { - "message": "Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected", - "summary": "Clock skew detected." - }, - "duration": 600, - "evaluationTime": 0.00019926, - "health": "ok", - "labels": { - "severity": "warning" - }, - "lastEvaluation": "2021-08-05T08:29:35.361638553Z", - "name": "NodeClockSkewDetected", - "query": "(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)", - "state": "inactive", - "type": "alerting" - }, - { - "alerts": [], - "annotations": { - "message": "Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising", - "summary": "Clock not synchronising." + "description": "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs", + "summary": "Network interface is reporting many receive errors." 
}, - "duration": 600, - "evaluationTime": 0.000105942, + "duration": 3600, + "evaluationTime": 0.002284779, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:35.361839124Z", - "name": "NodeClockNotSynchronising", - "query": "min_over_time(node_timex_sync_status[5m]) == 0 and node_timex_maxerror_seconds >= 16", + "lastEvaluation": "2021-10-21T08:32:35.394380398Z", + "name": "NodeNetworkReceiveErrs", + "query": "increase(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01", "state": "inactive", "type": "alerting" }, { - "alerts": [], - "annotations": { - "description": "Node Exporter text file collector failed to scrape.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror", - "summary": "Node Exporter text file collector failed to scrape." + "alerts": [], + "annotations": { + "description": "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs", + "summary": "Network interface is reporting many transmit errors." }, - "duration": 0, - "evaluationTime": 5.6446e-05, + "duration": 3600, + "evaluationTime": 0.003110883, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:35.361945817Z", - "name": "NodeTextFileCollectorScrapeError", - "query": "node_textfile_scrape_error{job=\"node-exporter\"} == 1", + "lastEvaluation": "2021-10-21T08:32:35.396666591Z", + "name": "NodeNetworkTransmitErrs", + "query": "increase(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01", "state": "inactive", "type": "alerting" }, @@ -2849,12 +3827,13 @@ "summary": "RAID Array is degraded" }, "duration": 900, - "evaluationTime": 7.7363e-05, + "evaluationTime": 0.000137106, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "2021-08-05T08:29:35.362002951Z", + "lastEvaluation": "2021-10-21T08:32:35.403263734Z", "name": "NodeRAIDDegraded", "query": "node_md_disks_required - ignoring(state) (node_md_disks{state=\"active\"}) >= 1", "state": "inactive", @@ -2868,25 +3847,47 @@ "summary": "Failed device in RAID array" }, "duration": 0, - "evaluationTime": 7.3091e-05, + "evaluationTime": 0.000239199, "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "2021-08-05T08:29:35.362081016Z", + "lastEvaluation": "2021-10-21T08:32:35.403401534Z", "name": "NodeRAIDDiskFailure", "query": "node_md_disks{state=\"failed\"} >= 1", "state": "inactive", "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "description": "Node Exporter text file collector failed to scrape.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror", + "summary": "Node Exporter text file collector failed to scrape." 
+ }, + "duration": 0, + "evaluationTime": 0.000569877, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "warning" + }, + "lastEvaluation": "2021-10-21T08:32:35.402692886Z", + "name": "NodeTextFileCollectorScrapeError", + "query": "node_textfile_scrape_error{job=\"node-exporter\"} == 1", + "state": "inactive", + "type": "alerting" } ] }, { - "evaluationTime": 0, + "evaluationTime": 0.000691183, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-node-network.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:16.988044806Z", "name": "node-network", + "partialResponseStrategy": "ABORT", "rules": [ { "alerts": [], @@ -2895,12 +3896,13 @@ "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkinterfaceflapping" }, "duration": 120, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000680792, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:16.988051892Z", "name": "NodeNetworkInterfaceFlapping", "query": "changes(node_network_up{device!~\"veth.+\",job=\"node-exporter\"}[2m]) > 2", "state": "inactive", @@ -2909,44 +3911,55 @@ ] }, { - "evaluationTime": 0, + "evaluationTime": 0.002934693, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-node.rules.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:18.107472668Z", "name": "node.rules", + "partialResponseStrategy": "ABORT", "rules": [ { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "node_namespace_pod:kube_pod_info:", - "query": "topk by(namespace, pod) (1, max by(node, namespace, pod) (label_replace(kube_pod_info{job=\"kube-state-metrics\",node!=\"\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")))", + "evaluationTime": 0.000347433, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:18.110055637Z", + "name": ":node_memory_MemAvailable_bytes:sum", + "query": "sum by(cluster) (node_memory_MemAvailable_bytes{job=\"node-exporter\"} or (node_memory_Buffers_bytes{job=\"node-exporter\"} + node_memory_Cached_bytes{job=\"node-exporter\"} + node_memory_MemFree_bytes{job=\"node-exporter\"} + node_memory_Slab_bytes{job=\"node-exporter\"}))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", + "evaluationTime": 0.001277199, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:18.108776602Z", "name": "node:node_num_cpu:sum", "query": "count by(cluster, node) (sum by(node, cpu) (node_cpu_seconds_total{job=\"node-exporter\"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)))", "type": "recording" }, { - "evaluationTime": 0, - "health": "unknown", - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": ":node_memory_MemAvailable_bytes:sum", - "query": "sum by(cluster) (node_memory_MemAvailable_bytes{job=\"node-exporter\"} or 
(node_memory_Buffers_bytes{job=\"node-exporter\"} + node_memory_Cached_bytes{job=\"node-exporter\"} + node_memory_MemFree_bytes{job=\"node-exporter\"} + node_memory_Slab_bytes{job=\"node-exporter\"}))", + "evaluationTime": 0.001295777, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus" + }, + "lastEvaluation": "2021-10-21T08:32:18.107477935Z", + "name": "node_namespace_pod:kube_pod_info:", + "query": "topk by(namespace, pod) (1, max by(node, namespace, pod) (label_replace(kube_pod_info{job=\"kube-state-metrics\",node!=\"\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")))", "type": "recording" } ] }, { - "evaluationTime": 0, + "evaluationTime": 0.001308323, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-prometheus-operator.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:46.673895688Z", "name": "prometheus-operator", + "partialResponseStrategy": "ABORT", "rules": [ { "alerts": [], @@ -2956,12 +3969,13 @@ "summary": "Errors while performing list operations in controller." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000448743, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:46.673902073Z", "name": "PrometheusOperatorListErrors", "query": "(sum by(controller, namespace) (rate(prometheus_operator_list_operations_failed_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m])) / sum by(controller, namespace) (rate(prometheus_operator_list_operations_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m]))) > 0.4", "state": "inactive", @@ -2970,38 +3984,40 @@ { "alerts": [], "annotations": { - "description": "Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorwatcherrors", - "summary": "Errors while performing watch operations in controller." + "description": "Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornodelookuperrors", + "summary": "Errors while reconciling Prometheus." 
}, - "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "duration": 600, + "evaluationTime": 7.7556e-05, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "PrometheusOperatorWatchErrors", - "query": "(sum by(controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m])) / sum by(controller, namespace) (rate(prometheus_operator_watch_operations_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m]))) > 0.4", + "lastEvaluation": "2021-10-21T08:32:46.674904428Z", + "name": "PrometheusOperatorNodeLookupErrors", + "query": "rate(prometheus_operator_node_address_lookup_errors_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]) > 0.1", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorsyncfailed", - "summary": "Last controller reconciliation failed" + "description": "Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornotready", + "summary": "Prometheus operator not ready" }, - "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "duration": 300, + "evaluationTime": 0.000102503, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "PrometheusOperatorSyncFailed", - "query": "min_over_time(prometheus_operator_syncs{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\",status=\"failed\"}[5m]) > 0", + "lastEvaluation": "2021-10-21T08:32:46.67498269Z", + "name": "PrometheusOperatorNotReady", + "query": "min by(namespace, controller) (max_over_time(prometheus_operator_ready{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]) == 0)", "state": "inactive", "type": "alerting" }, @@ -3013,12 +4029,13 @@ "summary": "Errors while reconciling controller." 
}, "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000205759, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:46.674697898Z", "name": "PrometheusOperatorReconcileErrors", "query": "(sum by(controller, namespace) (rate(prometheus_operator_reconcile_errors_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]))) / (sum by(controller, namespace) (rate(prometheus_operator_reconcile_operations_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]))) > 0.1", "state": "inactive", @@ -3027,68 +4044,72 @@ { "alerts": [], "annotations": { - "description": "Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornodelookuperrors", - "summary": "Errors while reconciling Prometheus." + "description": "Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf \"%0.0f\" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorrejectedresources", + "summary": "Resources rejected by Prometheus operator" }, - "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "duration": 300, + "evaluationTime": 0.000116252, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "PrometheusOperatorNodeLookupErrors", - "query": "rate(prometheus_operator_node_address_lookup_errors_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]) > 0.1", + "lastEvaluation": "2021-10-21T08:32:46.675085761Z", + "name": "PrometheusOperatorRejectedResources", + "query": "min_over_time(prometheus_operator_managed_resources{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\",state=\"rejected\"}[5m]) > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornotready", - "summary": "Prometheus operator not ready" + "description": "Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorsyncfailed", + "summary": "Last controller reconciliation failed" }, - "duration": 300, - "evaluationTime": 0, - "health": "unknown", + "duration": 600, + "evaluationTime": 0.000140594, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "PrometheusOperatorNotReady", - "query": "min by(namespace, controller) (max_over_time(prometheus_operator_ready{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[5m]) == 0)", + "lastEvaluation": "2021-10-21T08:32:46.674556605Z", + "name": 
"PrometheusOperatorSyncFailed", + "query": "min_over_time(prometheus_operator_syncs{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\",status=\"failed\"}[5m]) > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf \"%0.0f\" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorrejectedresources", - "summary": "Resources rejected by Prometheus operator" + "description": "Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorwatcherrors", + "summary": "Errors while performing watch operations in controller." }, - "duration": 300, - "evaluationTime": 0, - "health": "unknown", + "duration": 900, + "evaluationTime": 0.000203437, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "PrometheusOperatorRejectedResources", - "query": "min_over_time(prometheus_operator_managed_resources{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\",state=\"rejected\"}[5m]) > 0", + "lastEvaluation": "2021-10-21T08:32:46.674352418Z", + "name": "PrometheusOperatorWatchErrors", + "query": "(sum by(controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m])) / sum by(controller, namespace) (rate(prometheus_operator_watch_operations_total{job=\"prometheus-operator-operator\",namespace=\"metalk8s-monitoring\"}[10m]))) > 0.4", "state": "inactive", "type": "alerting" } ] }, { - "evaluationTime": 0, + "evaluationTime": 0.002924499, "file": "/etc/prometheus/rules/prometheus-prometheus-operator-prometheus-rulefiles-0/metalk8s-monitoring-prometheus-operator-prometheus.yaml", "interval": 30, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:24.862577161Z", "name": "prometheus", + "partialResponseStrategy": "ABORT", "rules": [ { "alerts": [], @@ -3098,12 +4119,13 @@ "summary": "Failed Prometheus configuration reload." }, "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000262094, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:24.862584138Z", "name": "PrometheusBadConfig", "query": "max_over_time(prometheus_config_last_reload_successful{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) == 0", "state": "inactive", @@ -3112,19 +4134,40 @@ { "alerts": [], "annotations": { - "description": "Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusnotificationqueuerunningfull", - "summary": "Prometheus alert notification queue predicted to run full in less than 30m." 
+ "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf \"%.4g\" $value }} samples/s with different values but duplicated timestamp.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusduplicatetimestamps", + "summary": "Prometheus is dropping samples with duplicate timestamps." }, - "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "duration": 600, + "evaluationTime": 8.284e-05, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "PrometheusNotificationQueueRunningFull", - "query": "(predict_linear(prometheus_notifications_queue_length{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m], 60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]))", + "lastEvaluation": "2021-10-21T08:32:24.864004446Z", + "name": "PrometheusDuplicateTimestamps", + "query": "rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", + "state": "inactive", + "type": "alerting" + }, + { + "alerts": [], + "annotations": { + "description": "{{ printf \"%.1f\" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheuserrorsendingalertstoanyalertmanager", + "summary": "Prometheus encounters more than 3% errors sending alerts to any Alertmanager." + }, + "duration": 900, + "evaluationTime": 0.00020893, + "health": "ok", + "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", + "severity": "critical" + }, + "lastEvaluation": "2021-10-21T08:32:24.865290647Z", + "name": "PrometheusErrorSendingAlertsToAnyAlertmanager", + "query": "min without(alertmanager) (rate(prometheus_notifications_errors_total{alertmanager!~\"\",job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(prometheus_notifications_sent_total{alertmanager!~\"\",job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) * 100 > 3", "state": "inactive", "type": "alerting" }, @@ -3136,12 +4179,13 @@ "summary": "Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager." 
}, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000165767, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:24.863071897Z", "name": "PrometheusErrorSendingAlertsToSomeAlertmanagers", "query": "(rate(prometheus_notifications_errors_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(prometheus_notifications_sent_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) * 100 > 1", "state": "inactive", @@ -3150,57 +4194,60 @@ { "alerts": [], "annotations": { - "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusnotconnectedtoalertmanagers", - "summary": "Prometheus is not connected to any Alertmanagers." + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf \"%.0f\" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheuslabellimithit", + "summary": "Prometheus has dropped targets because some scrape configs have exceeded the labels limit." }, - "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "duration": 900, + "evaluationTime": 6.4316e-05, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "PrometheusNotConnectedToAlertmanagers", - "query": "max_over_time(prometheus_notifications_alertmanagers_discovered{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) < 1", + "lastEvaluation": "2021-10-21T08:32:24.865225792Z", + "name": "PrometheusLabelLimitHit", + "query": "increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheustsdbreloadsfailing", - "summary": "Prometheus has issues reloading blocks from disk." + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf \"%.0f\" $value }} rule group evaluations in the last 5m.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusmissingruleevaluations", + "summary": "Prometheus is missing rule evaluations due to slow rule group evaluation." 
}, - "duration": 14400, - "evaluationTime": 0, - "health": "unknown", + "duration": 900, + "evaluationTime": 0.000239144, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "PrometheusTSDBReloadsFailing", - "query": "increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[3h]) > 0", + "lastEvaluation": "2021-10-21T08:32:24.864914358Z", + "name": "PrometheusMissingRuleEvaluations", + "query": "increase(prometheus_rule_group_iterations_missed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheustsdbcompactionsfailing", - "summary": "Prometheus has issues compacting blocks." + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusnotconnectedtoalertmanagers", + "summary": "Prometheus is not connected to any Alertmanagers." }, - "duration": 14400, - "evaluationTime": 0, - "health": "unknown", + "duration": 600, + "evaluationTime": 7.4567e-05, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "PrometheusTSDBCompactionsFailing", - "query": "increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[3h]) > 0", + "lastEvaluation": "2021-10-21T08:32:24.863238438Z", + "name": "PrometheusNotConnectedToAlertmanagers", + "query": "max_over_time(prometheus_notifications_alertmanagers_discovered{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) < 1", "state": "inactive", "type": "alerting" }, @@ -3212,12 +4259,13 @@ "summary": "Prometheus is not ingesting samples." 
}, "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000501295, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:24.863502332Z", "name": "PrometheusNotIngestingSamples", "query": "(rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) <= 0 and (sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}) > 0 or sum without(rule_group) (prometheus_rule_group_rules{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}) > 0))", "state": "inactive", @@ -3226,19 +4274,20 @@ { "alerts": [], "annotations": { - "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf \"%.4g\" $value }} samples/s with different values but duplicated timestamp.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusduplicatetimestamps", - "summary": "Prometheus is dropping samples with duplicate timestamps." + "description": "Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusnotificationqueuerunningfull", + "summary": "Prometheus alert notification queue predicted to run full in less than 30m." }, - "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "duration": 900, + "evaluationTime": 0.000223605, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "PrometheusDuplicateTimestamps", - "query": "rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", + "lastEvaluation": "2021-10-21T08:32:24.862847522Z", + "name": "PrometheusNotificationQueueRunningFull", + "query": "(predict_linear(prometheus_notifications_queue_length{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m], 60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]))", "state": "inactive", "type": "alerting" }, @@ -3250,12 +4299,13 @@ "summary": "Prometheus drops samples with out-of-order timestamps." }, "duration": 600, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 6.8377e-05, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:24.864088091Z", "name": "PrometheusOutOfOrderTimestamps", "query": "rate(prometheus_target_scrapes_sample_out_of_order_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", "state": "inactive", @@ -3269,12 +4319,13 @@ "summary": "Prometheus fails to send samples to remote storage." 
}, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000277881, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:24.864157032Z", "name": "PrometheusRemoteStorageFailures", "query": "((rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) / ((rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) + (rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) or rate(prometheus_remote_storage_samples_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])))) * 100 > 1", "state": "inactive", @@ -3288,12 +4339,13 @@ "summary": "Prometheus remote write is behind." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000128792, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:24.864435662Z", "name": "PrometheusRemoteWriteBehind", "query": "(max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) - ignoring(remote_name, url) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) > 120", "state": "inactive", @@ -3307,12 +4359,13 @@ "summary": "Prometheus remote write desired shards calculation wants to run more than configured max shards." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000103187, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:24.864565142Z", "name": "PrometheusRemoteWriteDesiredShards", "query": "(max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]))", "state": "inactive", @@ -3326,12 +4379,13 @@ "summary": "Prometheus is failing rule evaluations." 
}, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 0.000244824, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "critical" }, - "lastEvaluation": "0001-01-01T00:00:00Z", + "lastEvaluation": "2021-10-21T08:32:24.8646689Z", "name": "PrometheusRuleFailures", "query": "increase(prometheus_rule_evaluation_failures_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", "state": "inactive", @@ -3340,76 +4394,60 @@ { "alerts": [], "annotations": { - "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf \"%.0f\" $value }} rule group evaluations in the last 5m.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusmissingruleevaluations", - "summary": "Prometheus is missing rule evaluations due to slow rule group evaluation." + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheustsdbcompactionsfailing", + "summary": "Prometheus has issues compacting blocks." }, - "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "duration": 14400, + "evaluationTime": 7.7206e-05, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "PrometheusMissingRuleEvaluations", - "query": "increase(prometheus_rule_group_iterations_missed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", + "lastEvaluation": "2021-10-21T08:32:24.863424563Z", + "name": "PrometheusTSDBCompactionsFailing", + "query": "increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[3h]) > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf \"%.0f\" $value }} targets because the number of targets exceeded the configured target_limit.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheustargetlimithit", - "summary": "Prometheus has dropped targets because some scrape configs have exceeded the targets limit." + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheustsdbreloadsfailing", + "summary": "Prometheus has issues reloading blocks from disk." 
}, - "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "duration": 14400, + "evaluationTime": 0.000110356, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "PrometheusTargetLimitHit", - "query": "increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", + "lastEvaluation": "2021-10-21T08:32:24.863313573Z", + "name": "PrometheusTSDBReloadsFailing", + "query": "increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[3h]) > 0", "state": "inactive", "type": "alerting" }, { "alerts": [], "annotations": { - "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf \"%.0f\" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheuslabellimithit", - "summary": "Prometheus has dropped targets because some scrape configs have exceeded the labels limit." + "description": "Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf \"%.0f\" $value }} targets because the number of targets exceeded the configured target_limit.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheustargetlimithit", + "summary": "Prometheus has dropped targets because some scrape configs have exceeded the targets limit." }, "duration": 900, - "evaluationTime": 0, - "health": "unknown", + "evaluationTime": 7.1073e-05, + "health": "ok", "labels": { + "prometheus": "metalk8s-monitoring/prometheus-operator-prometheus", "severity": "warning" }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "PrometheusLabelLimitHit", - "query": "increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", - "state": "inactive", - "type": "alerting" - }, - { - "alerts": [], - "annotations": { - "description": "{{ printf \"%.1f\" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.", - "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheuserrorsendingalertstoanyalertmanager", - "summary": "Prometheus encounters more than 3% errors sending alerts to any Alertmanager." - }, - "duration": 900, - "evaluationTime": 0, - "health": "unknown", - "labels": { - "severity": "critical" - }, - "lastEvaluation": "0001-01-01T00:00:00Z", - "name": "PrometheusErrorSendingAlertsToAnyAlertmanager", - "query": "min without(alertmanager) (rate(prometheus_notifications_errors_total{alertmanager!~\"\",job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) / rate(prometheus_notifications_sent_total{alertmanager!~\"\",job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m])) * 100 > 3", + "lastEvaluation": "2021-10-21T08:32:24.865154153Z", + "name": "PrometheusTargetLimitHit", + "query": "increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=\"prometheus-operator-prometheus\",namespace=\"metalk8s-monitoring\"}[5m]) > 0", "state": "inactive", "type": "alerting" }