Skip to content

Commit

Permalink
Maximum metrics for CPU/latency in the report profile (#52)
Browse files Browse the repository at this point in the history
Signed-off-by: Jose Castillo Lema <[email protected]>
  • Loading branch information
josecastillolema authored Apr 17, 2024
1 parent 7fe178e commit 78dcd29
Showing 1 changed file with 82 additions and 2 deletions.
84 changes: 82 additions & 2 deletions cmd/config/metrics-report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,29 @@
metricName: avg-ro-apicalls-latency
instant: true

# Peak (max_over_time) of the 99th-percentile kube-apiserver request latency for
# LIST/GET verbs, evaluated as a subquery over the templated elapsed window.
# Buckets are summed by (le, resource, verb, scope); the log/exec/portforward/
# attach/proxy subresources are excluded. The trailing "> 0" keeps only series
# whose resulting value is positive.
- query: max_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb, scope))[{{.elapsed}}:]) > 0
metricName: max-ro-apicalls-latency
instant: true

# Time-average (avg_over_time) of the 99th-percentile kube-apiserver request
# latency for the POST/PUT/DELETE/PATCH verbs, with the same grouping,
# subresource exclusions and "> 0" filter.
- query: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb, scope))[{{.elapsed}}:]) > 0
metricName: avg-mutating-apicalls-latency
instant: true

# Peak counterpart of avg-mutating-apicalls-latency: identical selector and
# grouping, but max_over_time instead of avg_over_time.
- query: max_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb, scope))[{{.elapsed}}:]) > 0
metricName: max-mutating-apicalls-latency
instant: true

# Kubelet & CRI-O

# Average and max of the CPU usage across all workers' kubelets
# Per-series time-average of irate(process_cpu_seconds_total[2m]) for the
# kubelet job over the elapsed window. The "and on (node)" join keeps only
# series whose node label matches a kube_node_role{role="worker"} series, and
# the outer avg() collapses the remaining series into a single value.
- query: avg(avg_over_time(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m])[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
metricName: cpu-kubelet
instant: true

# Same selector and worker-node join as cpu-kubelet, but takes the per-series
# peak (max_over_time) and then the highest value among those series (max).
- query: max(max_over_time(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m])[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
metricName: max-cpu-kubelet
instant: true

# Average of the memory usage across all workers' kubelets
- query: avg(avg_over_time(process_resident_memory_bytes{service="kubelet",job="kubelet"}[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
metricName: memory-kubelet
Expand All @@ -29,11 +41,15 @@
metricName: max-memory-sum-kubelet
instant: true

# Average and max of the CPU usage across all workers' CRI-O processes
# Per-series time-average of irate(process_cpu_seconds_total[2m]) for series
# labelled job="crio" (the selector also requires service="kubelet"), limited
# to worker nodes through the "and on (node)" join with kube_node_role, then
# averaged across the matching series.
- query: avg(avg_over_time(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m])[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
metricName: cpu-crio
instant: true

# Peak counterpart of cpu-crio: per-series max_over_time, then max across the
# worker-node series.
- query: max(max_over_time(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m])[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
metricName: max-cpu-crio
instant: true

# Average of the memory usage across all workers' CRI-O processes
- query: avg(avg_over_time(process_resident_memory_bytes{service="kubelet",job="crio"}[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
metricName: memory-crio
Expand All @@ -54,20 +70,36 @@
metricName: 99thEtcdDiskBackendCommit
instant: true

# Highest value, across series and across the elapsed window, of the
# 99th-percentile etcd backend-commit duration (rate over 2m of the
# etcd_disk_backend_commit_duration_seconds histogram buckets).
- query: max(max_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[{{.elapsed}}:]))
metricName: max-99thEtcdDiskBackendCommit
instant: true

# Average, across series and over the elapsed window, of the 99th-percentile
# etcd WAL fsync duration.
- query: avg(avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[{{.elapsed}}:]))
metricName: 99thEtcdDiskWalFsync
instant: true

# Peak counterpart of 99thEtcdDiskWalFsync (max_over_time, then max).
- query: max(max_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[{{.elapsed}}:]))
metricName: max-99thEtcdDiskWalFsync
instant: true

# Average of the 99th-percentile etcd peer round-trip time. Note this pair of
# queries uses irate() where the two disk-latency queries above use rate().
- query: avg(avg_over_time(histogram_quantile(0.99, irate(etcd_network_peer_round_trip_time_seconds_bucket[2m]))[{{.elapsed}}:]))
metricName: 99thEtcdRoundTripTime
instant: true

# Peak counterpart of 99thEtcdRoundTripTime (max_over_time, then max).
- query: max(max_over_time(histogram_quantile(0.99, irate(etcd_network_peer_round_trip_time_seconds_bucket[2m]))[{{.elapsed}}:]))
metricName: max-99thEtcdRoundTripTime
instant: true

# Control-plane

# CPU of the openshift-kube-controller-manager namespace: container CPU rates
# are summed per pod, topk(1, ...) keeps the highest pod at each subquery step,
# and the result is averaged over the elapsed window.
- query: avg(avg_over_time(topk(1, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-kube-controller-manager"}[2m])) by (pod))[{{.elapsed}}:]))
metricName: cpu-kube-controller-manager
instant: true

# Peak counterpart of cpu-kube-controller-manager (max_over_time, then max).
- query: max(max_over_time(topk(1, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-kube-controller-manager"}[2m])) by (pod))[{{.elapsed}}:]))
metricName: max-cpu-kube-controller-manager
instant: true

# Average over the elapsed window of the per-pod sum of container_memory_rss,
# again keeping only the top pod (topk(1, ...)) at each step.
- query: avg(avg_over_time(topk(1, sum(container_memory_rss{name!="", namespace="openshift-kube-controller-manager"}) by (pod))[{{.elapsed}}:]))
metricName: memory-kube-controller-manager
instant: true
Expand All @@ -84,6 +116,10 @@
metricName: cpu-kube-apiserver
instant: true

# Peak CPU in the openshift-kube-apiserver namespace: per-pod sum of container
# CPU rates, restricted to the three highest pods at each step (topk(3, ...)),
# then the maximum over the elapsed window and across those pods.
- query: max(max_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-kube-apiserver"}[2m])) by (pod))[{{.elapsed}}:]))
metricName: max-cpu-kube-apiserver
instant: true

# Average over the elapsed window of per-pod container_memory_rss for the three
# highest pods in the same namespace.
- query: avg(avg_over_time(topk(3, sum(container_memory_rss{name!="", namespace="openshift-kube-apiserver"}) by (pod))[{{.elapsed}}:]))
metricName: memory-kube-apiserver
instant: true
Expand All @@ -100,6 +136,10 @@
metricName: cpu-openshift-apiserver
instant: true

# Peak CPU in the openshift-apiserver namespace: per-pod sum of container CPU
# rates for the three highest pods at each step, maximised over the elapsed
# window and across pods.
- query: max(max_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-apiserver"}[2m])) by (pod))[{{.elapsed}}:]))
metricName: max-cpu-openshift-apiserver
instant: true

# Average over the elapsed window of per-pod container_memory_rss for the three
# highest pods in the same namespace.
- query: avg(avg_over_time(topk(3, sum(container_memory_rss{name!="", namespace="openshift-apiserver"}) by (pod))[{{.elapsed}}:]))
metricName: memory-openshift-apiserver
instant: true
Expand All @@ -116,6 +156,10 @@
metricName: cpu-etcd
instant: true

# Peak CPU in the openshift-etcd namespace: per-pod sum of container CPU rates
# for the three highest pods at each step, maximised over the elapsed window
# and across pods.
- query: max(max_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-etcd"}[2m])) by (pod))[{{.elapsed}}:]))
metricName: max-cpu-etcd
instant: true

# Average over the elapsed window of per-pod container_memory_rss for the three
# highest pods in the same namespace.
- query: avg(avg_over_time(topk(3,sum(container_memory_rss{name!="", namespace="openshift-etcd"}) by (pod))[{{.elapsed}}:]))
metricName: memory-etcd
instant: true
Expand All @@ -132,6 +176,10 @@
metricName: cpu-openshift-controller-manager
instant: true

# Peak CPU in the openshift-controller-manager namespace: per-pod sum of
# container CPU rates, keeping only the highest pod at each step
# (topk(1, ...)), maximised over the elapsed window.
- query: max(max_over_time(topk(1, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-controller-manager"}[2m])) by (pod))[{{.elapsed}}:]))
metricName: max-cpu-openshift-controller-manager
instant: true

# Average over the elapsed window of per-pod container_memory_rss for the
# highest pod in the same namespace.
- query: avg(avg_over_time(topk(1, sum(container_memory_rss{name!="", namespace="openshift-controller-manager"}) by (pod))[{{.elapsed}}:]))
metricName: memory-openshift-controller-manager
instant: true
Expand All @@ -146,6 +194,10 @@
metricName: cpu-multus
instant: true

# Peak CPU rate of containers in openshift-multus pods whose name matches
# "(multus).+", excluding the container labelled "POD". The outer max() is
# grouped "by (container)", so one value is produced per container name.
- query: max(max_over_time(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-multus", pod=~"(multus).+", container!="POD"}[2m])[{{.elapsed}}:])) by (container)
metricName: max-cpu-multus
instant: true

# Average container_memory_rss over the elapsed window for the same pod and
# container selection, also reported per container name.
- query: avg(avg_over_time(container_memory_rss{name!="", namespace="openshift-multus", pod=~"(multus).+", container!="POD"}[{{.elapsed}}:])) by (container)
metricName: memory-multus
instant: true
Expand All @@ -160,6 +212,10 @@
metricName: cpu-ovn-control-plane
instant: true

# Peak CPU rate of containers in openshift-ovn-kubernetes pods whose name
# starts with ovnkube-master or ovnkube-control-plane (the regex accepts
# either prefix), excluding the "POD" container; one value per container name.
- query: max(max_over_time(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-ovn-kubernetes", pod=~"(ovnkube-master|ovnkube-control-plane).+", container!="POD"}[2m])[{{.elapsed}}:])) by (container)
metricName: max-cpu-ovn-control-plane
instant: true

# Average container_memory_rss over the elapsed window for the same selection,
# reported per container name.
- query: avg(avg_over_time(container_memory_rss{name!="", namespace="openshift-ovn-kubernetes", pod=~"(ovnkube-master|ovnkube-control-plane).+", container!="POD"}[{{.elapsed}}:])) by (container)
metricName: memory-ovn-control-plane
instant: true
Expand All @@ -172,6 +228,10 @@
metricName: cpu-ovnkube-node
instant: true

# Peak CPU rate of containers in openshift-ovn-kubernetes pods matching
# "ovnkube-node.+", excluding the "POD" container; one value per container
# name.
- query: max(max_over_time(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+", container!="POD"}[2m])[{{.elapsed}}:])) by (container)
metricName: max-cpu-ovnkube-node
instant: true

# Average container_memory_rss over the elapsed window for the same selection,
# reported per container name.
- query: avg(avg_over_time(container_memory_rss{name!="", namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+", container!="POD"}[{{.elapsed}}:])) by (container)
metricName: memory-ovnkube-node
instant: true
Expand All @@ -186,6 +246,10 @@
metricName: cpu-masters
instant: true

# Peak node-level CPU on nodes with role "master". node_cpu_seconds_total rates
# for every mode except idle and steal are summed per instance. label_replace
# copies kube_node_role's "node" label into "instance" so the
# "and on (instance)" join can match node-exporter series to master nodes.
- query: max(max_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) by (instance)[{{.elapsed}}:]))
metricName: max-cpu-masters
instant: true

# Average used memory (MemTotal - MemAvailable) over the elapsed window,
# restricted to master nodes with the same label_replace join, then averaged
# across those nodes.
- query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
metricName: memory-masters
instant: true
Expand All @@ -202,6 +266,10 @@
metricName: cpu-workers
instant: true

# Peak node-level CPU on nodes with role "worker". node_cpu_seconds_total rates
# for every mode except idle and steal are summed per instance. label_replace
# copies kube_node_role's "node" label into "instance" so the
# "and on (instance)" join can match node-exporter series to worker nodes.
- query: max(max_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (instance)[{{.elapsed}}:]))
metricName: max-cpu-workers
instant: true

# Average used memory (MemTotal - MemAvailable) over the elapsed window,
# restricted to worker nodes with the same label_replace join, then averaged
# across those nodes.
- query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
metricName: memory-workers
instant: true
Expand All @@ -218,6 +286,10 @@
metricName: cpu-infra
instant: true

# Peak node-level CPU on nodes with role "infra". node_cpu_seconds_total rates
# for every mode except idle and steal are summed per instance. label_replace
# copies kube_node_role's "node" label into "instance" so the
# "and on (instance)" join can match node-exporter series to infra nodes.
- query: max(max_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (instance)[{{.elapsed}}:]))
metricName: max-cpu-infra
instant: true

# Average used memory (MemTotal - MemAvailable) over the elapsed window,
# restricted to infra nodes with the same label_replace join, then averaged
# across those nodes.
- query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
metricName: memory-infra
instant: true
Expand All @@ -236,6 +308,10 @@
metricName: cpu-prometheus
instant: true

# Peak CPU of openshift-monitoring pods matching "prometheus-k8s.+": container
# CPU rates are summed per pod, then maximised over the elapsed window and
# across pods.
- query: max(max_over_time(sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-monitoring", pod=~"prometheus-k8s.+"}[2m])) by (pod)[{{.elapsed}}:]))
metricName: max-cpu-prometheus
instant: true

# Average over the elapsed window of the per-pod sum of container_memory_rss
# for the same pods, averaged across pods.
- query: avg(avg_over_time(sum(container_memory_rss{name!="", namespace="openshift-monitoring", pod=~"prometheus-k8s.+"}) by (pod)[{{.elapsed}}:]))
metricName: memory-prometheus
instant: true
Expand All @@ -248,6 +324,10 @@
metricName: cpu-router
instant: true

# Peak CPU of openshift-ingress pods matching "router-default.+": container CPU
# rates are summed per pod, then maximised over the elapsed window and across
# pods.
- query: max(max_over_time(sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-ingress", pod=~"router-default.+"}[2m])) by (pod)[{{.elapsed}}:]))
metricName: max-cpu-router
instant: true

# Average over the elapsed window of the per-pod sum of container_memory_rss
# for the same pods, averaged across pods.
- query: avg(avg_over_time(sum(container_memory_rss{name!="", namespace="openshift-ingress", pod=~"router-default.+"}) by (pod)[{{.elapsed}}:]))
metricName: memory-router
instant: true
Expand Down

0 comments on commit 78dcd29

Please sign in to comment.