Skip to content

Commit

Permalink
Merge branch 'aenix-io:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
klinch0 authored Feb 5, 2025
2 parents e5e1472 + 861e6c4 commit f67816e
Show file tree
Hide file tree
Showing 10 changed files with 3,767 additions and 1,609 deletions.
3,611 changes: 3,611 additions & 0 deletions dashboards/control-plane/kube-etcd.json

Large diffs are not rendered by default.

1,602 changes: 0 additions & 1,602 deletions dashboards/control-plane/kube-etcd3.json

This file was deleted.

5 changes: 2 additions & 3 deletions hack/download-dashboards.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ fix_d8() {
}

swap_pvc_overview() {
jq '(.panels[] | select(.title=="PVC Detailed") | .panels[] | select(.title=="Overview")) as $a | del(.panels[] | select(.title=="PVC Detailed").panels[] | select(.title=="Overview")) | ( (.panels[] | select(.title=="PVC Detailed"))) as $b | del( .panels[] | select(.title=="PVC Detailed")) | (.panels[.panels|length]=($a|.gridPos.y=$b.gridPos.y)) | (.panels[.panels|length]=($b|.gridPos.y=$a.gridPos.y))'
jq '(.panels[] | select(.title=="PVC Detailed") | .panels[] | select(.title=="Overview")) as $a | del(.panels[] | select(.title=="PVC Detailed").panels[] | select(.title=="Overview")) | ( (.panels[] | select(.title=="PVC Detailed"))) as $b | del( .panels[] | select(.title=="PVC Detailed")) | (.panels[.panels|length]=($a|.gridPos.y=$b.gridPos.y)) | (.panels[.panels|length]=($b|.gridPos.y=$a.gridPos.y))'
}

deprectaed_remove_faq() {
Expand Down Expand Up @@ -68,7 +68,7 @@ modules/402-ingress-nginx/monitoring/grafana-dashboards/ingress-nginx/namespace/
modules/402-ingress-nginx/monitoring/grafana-dashboards/ingress-nginx/vhost/vhost_detail.json
modules/402-ingress-nginx/monitoring/grafana-dashboards/ingress-nginx/vhost/vhosts.json
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/control-plane-status.json
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/kube-etcd3.json #TODO
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/kube-etcd.json #TODO
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/deprecated-resources.json
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//kubernetes-cluster/nodes/ntp.json #TODO
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//kubernetes-cluster/nodes/nodes.json
Expand Down Expand Up @@ -109,4 +109,3 @@ done <<\EOT
https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-namespaces.json
https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-pods.json
EOT

2 changes: 1 addition & 1 deletion packages/extra/bootbox/images/matchbox.tag
Original file line number Diff line number Diff line change
@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/matchbox:v0.24.1@sha256:26ce2eaae90c82e49e866ae5b18e38d6e3ac1a4b0a3b494ebe2c480a4685f143
ghcr.io/aenix-io/cozystack/matchbox:v0.24.1@sha256:002d540c3a5583bfadbcbd436c2e582e51fc854068a2a7d2dba41072e921ca96
2 changes: 1 addition & 1 deletion packages/extra/etcd/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ name: etcd
description: Storage for Kubernetes clusters
icon: /logos/etcd.svg
type: application
version: 2.4.0
version: 2.5.0
6 changes: 6 additions & 0 deletions packages/extra/etcd/templates/etcd-cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ spec:
labels:
cozystack.io/service: etcd
spec:
containers:
- name: etcd
ports:
- name: metrics
containerPort: 2381
protocol: TCP
topologySpreadConstraints:
- maxSkew: 1
topologyKey: "kubernetes.io/hostname"
Expand Down
11 changes: 11 additions & 0 deletions packages/extra/etcd/templates/podscrape.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# VMPodScrape: asks the VictoriaMetrics operator to scrape the etcd pods.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
  name: etcd-pod-scrape
spec:
  # Pods are picked by this label.
  selector:
    matchLabels:
      app.kubernetes.io/name: etcd
  podMetricsEndpoints:
    # "metrics" is a named container port; it has to be declared on the
    # selected pods for this endpoint to resolve.
    - scheme: http
      port: metrics
132 changes: 132 additions & 0 deletions packages/extra/etcd/templates/prometheus-rules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# PrometheusRule holding the alert rules of the "etcd" group for this chart.
# The alert texts below contain Prometheus placeholders (label and value
# references). Each placeholder is wrapped in a backtick-quoted Helm string
# literal so that Helm prints it verbatim instead of evaluating it.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: etcd-rules
spec:
groups:
- name: etcd
rules:
# Per job: fires when the number of targets that are up is below
# (number of known targets + 1) / 2, held for 3 minutes.
- alert: etcdInsufficientMembers
  expr: |
    sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
  for: 3m
  labels:
    severity: critical
  annotations:
    summary: "etcd cluster '{{`{{ $labels.job }}`}}': insufficient members '{{`{{ $value }}`}}'."

# Fires for every series where etcd_server_has_leader has been 0 for 1 minute.
- alert: etcdNoLeader
  expr: |
    etcd_server_has_leader{job=~".*etcd.*"} == 0
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: "etcd cluster '{{`{{ $labels.job }}`}}': member '{{`{{ $labels.instance }}`}}' has no leader."

# Fires when a member has seen more than 3 leader changes in the last hour.
# increase() over 1h yields the number of changes in that window, which is the
# quantity the summary reports. The previous rate() form returned a per-second
# average, so comparing it with 3 could practically never be true.
- alert: etcdHighNumberOfLeaderChanges
  annotations:
    summary: "etcd cluster '{{`{{ $labels.job }}`}}': instance '{{`{{ $labels.instance }}`}}' has seen '{{`{{ $value }}`}}' leader changes within the last hour."
  expr: |
    increase(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[1h]) > 3
  for: 15m
  labels:
    severity: warning

# Warning tier: per job/instance/service/method, the share (in percent) of
# handled gRPC calls whose grpc_code is not "OK" over 5m stays above 1 for
# 10 minutes.
- alert: etcdHighNumberOfFailedGRPCRequests
  expr: |
    100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
      /
    sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
      > 1
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "etcd cluster '{{`{{ $labels.job }}`}}': '{{`{{ $value }}`}}' of requests for '{{`{{ $labels.grpc_method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."

# Critical tier of the failed-gRPC alert: same ratio as the warning tier, but
# the threshold is 5 (percent) and it only has to hold for 5 minutes.
- alert: etcdHighNumberOfFailedGRPCRequests
  expr: |
    100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
      /
    sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
      > 5
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "etcd cluster '{{`{{ $labels.job }}`}}': '{{`{{ $value }}`}}' of requests for '{{`{{ $labels.grpc_method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."

# Fires when the 99th percentile handling time of unary gRPC calls, computed
# per job/instance/service/method over 5m, stays above 0.15 for 10 minutes.
- alert: etcdGRPCRequestsSlow
  expr: |
    histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
      > 0.15
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: "etcd cluster '{{`{{ $labels.job }}`}}': gRPC requests to '{{`{{ $labels.grpc_method }}`}}' are taking '{{`{{ $value }}`}}' on etcd instance '{{`{{ $labels.instance }}`}}'."

# Fires when the 99th percentile of the peer round-trip-time histogram over 5m
# stays above 0.15 for 10 minutes. The "To" label names the remote peer.
- alert: etcdMemberCommunicationSlow
  expr: |
    histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
      > 0.15
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "etcd cluster '{{`{{ $labels.job }}`}}': member communication with '{{`{{ $labels.To }}`}}' is taking '{{`{{ $value }}`}}' on etcd instance '{{`{{ $labels.instance }}`}}'."

# Fires when a member has recorded more than 5 failed proposals in the last
# hour. increase() over 1h yields the count the summary reports. The previous
# rate() form was a per-second average, so "> 5" meant more than five failures
# every second and would not trigger in practice.
- alert: etcdHighNumberOfFailedProposals
  annotations:
    summary: "etcd cluster '{{`{{ $labels.job }}`}}': '{{`{{ $value }}`}}' proposal failures within the last hour on etcd instance '{{`{{ $labels.instance }}`}}'."
  expr: |
    increase(etcd_server_proposals_failed_total{job=~".*etcd.*"}[1h]) > 5
  for: 15m
  labels:
    severity: warning

# Warning tier: the fraction of HTTP requests that failed (404s excluded) over
# 5m stays above 0.01 for 10 minutes.
# The ratio is grouped by job, instance and method. Grouping by method alone
# discarded the instance label, leaving the instance placeholder in the summary
# empty and mixing all jobs into one ratio.
- alert: etcdHighNumberOfFailedHTTPRequests
  annotations:
    summary: "'{{`{{ $value }}`}}' of requests for '{{`{{ $labels.method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."
  expr: |
    sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (job, instance, method)
      /
    sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (job, instance, method)
      > 0.01
  for: 10m
  labels:
    severity: warning

# Critical tier: the fraction of HTTP requests that failed (404s excluded) over
# 5m stays above 0.05 for 10 minutes.
# The ratio is grouped by job, instance and method. Grouping by method alone
# discarded the instance label, leaving the instance placeholder in the summary
# empty and mixing all jobs into one ratio.
- alert: etcdHighNumberOfFailedHTTPRequests
  annotations:
    summary: "'{{`{{ $value }}`}}' of requests for '{{`{{ $labels.method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."
  expr: |
    sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (job, instance, method)
      /
    sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (job, instance, method)
      > 0.05
  for: 10m
  labels:
    severity: critical

# Fires when the 99th percentile of the successful-HTTP-request duration
# histogram over 5m stays above 0.15 for 10 minutes.
# The job matcher restricts the rule to etcd jobs, like every other rule in
# this group; without it the rule evaluated that metric from any job.
- alert: etcdHTTPRequestsSlow
  annotations:
    summary: "etcd instance '{{`{{ $labels.instance }}`}}' HTTP requests to '{{`{{ $labels.method }}`}}' are slow."
  expr: |
    histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
      > 0.15
  for: 10m
  labels:
    severity: warning

# Fires when, for 10 minutes, either some targets of the job are not up or some
# peers show a send-failure rate above 0.01 over the last 120s.
# The two branches are joined with "or" and reduced with max, dropping the
# endpoint label.
- alert: etcdMembersDown
  expr: |
    max without (endpoint) (
      sum without (instance, pod) (up{job=~".*etcd.*"} == bool 0)
    or
      count without (To) (
        sum without (instance, pod) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
      )
    )
    > 0
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: "etcd cluster '{{`{{ $labels.job }}`}}' members are down."
    description: 'etcd cluster "{{`{{ $labels.job }}`}}": members are down {{`{{ $value }}`}}.'
2 changes: 1 addition & 1 deletion packages/extra/monitoring/dashboards.list
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,5 @@ main/nodes
control-plane/control-plane-status
control-plane/deprecated-resources
control-plane/dns-coredns
control-plane/kube-etcd3
control-plane/kube-etcd
kubevirt/kubevirt-control-plane
3 changes: 2 additions & 1 deletion packages/extra/versions_map
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ etcd 2.0.1 6fc1cc7d
etcd 2.1.0 2b00fcf8
etcd 2.2.0 5ca8823
etcd 2.3.0 b908400d
etcd 2.4.0 HEAD
etcd 2.4.0 cb7b8158
etcd 2.5.0 HEAD
ingress 1.0.0 f642698
ingress 1.1.0 838bee5d
ingress 1.2.0 ced8e5b
Expand Down

0 comments on commit f67816e

Please sign in to comment.