Istio Monitoring Pattern with AMP and AMG #140

Merged: 7 commits, Jan 25, 2024
113 changes: 113 additions & 0 deletions lib/common/resources/amp-config/istio/alerting-rules.yml
@@ -0,0 +1,113 @@
groups:
- name: "istio.basic.alerting-rules"
rules:
- alert: IngressTrafficMissing
annotations:
summary: 'ingress gateway traffic missing'
description: '[Critical]: Ingress gateway traffic is missing; other monitors are likely misleading. Check the client logs.'
expr: >
absent(istio_requests_total{destination_service_namespace=~"service-graph.*",reporter="source",source_workload="istio-ingressgateway"})==1
for: 5m
- alert: IstioMetricsMissing
annotations:
summary: 'Istio metrics missing'
description: '[Critical]: Check the Prometheus deployment and whether the Prometheus metric filters are applied correctly'
expr: >
absent(istio_requests_total)==1 or absent(istio_request_duration_milliseconds_bucket)==1
for: 5m
- name: "istio.workload.alerting-rules"
rules:
- alert: HTTP5xxRateHigh
annotations:
summary: '5xx rate too high'
description: 'The HTTP 5xx error rate has exceeded 0.05 (5% of requests) over the last 5 minutes'
expr: >
sum(irate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(irate(istio_requests_total{reporter="destination"}[5m])) > 0.05
for: 5m
- alert: WorkloadLatencyP99High
expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"svc.*"}[5m])) by (source_workload,namespace, le)) > 160
for: 10m
annotations:
description: 'The workload request latency P99 > 160ms '
message: "Request duration has slowed down for workload: {{`{{$labels.source_workload}}`}} in namespace: {{`{{$labels.namespace}}`}}. Response duration is {{`{{$value}}`}} milliseconds"
- alert: IngressLatencyP99High
expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"istio.*"}[5m])) by (source_workload,namespace, le)) > 250
for: 10m
annotations:
description: 'The ingress latency P99 > 250ms '
message: "Request duration has slowed down for ingress: {{`{{$labels.source_workload}}`}} in namespace: {{`{{$labels.namespace}}`}}. Response duration is {{`{{$value}}`}} milliseconds"
- name: "istio.infra.alerting-rules"
rules:
- alert: ProxyContainerCPUUsageHigh
expr: (sum(rate(container_cpu_usage_seconds_total{namespace!="kube-system", container=~"istio-proxy", namespace!=""}[5m])) BY (namespace, pod, container) * 100) > 80
for: 5m
annotations:
summary: "Proxy Container CPU usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
description: "Proxy Container CPU usage is above 80%"
- alert: ProxyContainerMemoryUsageHigh
expr: (sum(container_memory_working_set_bytes{namespace!="kube-system", container=~"istio-proxy", namespace!=""}) BY (container, pod, namespace) / (sum(container_spec_memory_limit_bytes{namespace!="kube-system", container!="POD"}) BY (container, pod, namespace) > 0)* 100) > 80
for: 5m
annotations:
summary: "Proxy Container Memory usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
description: "Proxy Container Memory usage is above 80%"
- alert: IngressMemoryUsageIncreaseRateHigh
expr: avg(deriv(container_memory_working_set_bytes{container=~"istio-proxy",namespace="istio-system"}[60m])) > 200
for: 180m
annotations:
summary: "Ingress proxy Memory change rate, VALUE = {{ $value }}\n"
description: "Ingress proxy Memory Usage increases more than 200 Bytes/sec"
- alert: IstiodContainerCPUUsageHigh
expr: (sum(rate(container_cpu_usage_seconds_total{namespace="istio-system", container="discovery"}[5m])) BY (pod) * 100) > 80
for: 5m
annotations:
summary: "Istiod Container CPU usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
description: "Isitod Container CPU usage is above 80%"
- alert: IstiodMemoryUsageHigh
expr: (sum(container_memory_working_set_bytes{namespace="istio-system", container="discovery"}) BY (pod) / (sum(container_spec_memory_limit_bytes{namespace="istio-system", container="discovery"}) BY (pod) > 0)* 100) > 80
for: 5m
annotations:
summary: "Istiod Container Memory usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
description: "Istiod Container Memory usage is above 80%"
- alert: IstiodMemoryUsageIncreaseRateHigh
expr: sum(deriv(container_memory_working_set_bytes{namespace="istio-system",pod=~"istiod-.*"}[60m])) > 1000
for: 300m
annotations:
summary: "Istiod Container Memory usage increase rate high, VALUE = {{ $value }}\n"
description: "Istiod Container Memory usage increases more than 1k Bytes/sec"
- name: "istio.controlplane.alerting-rules"
rules:
- alert: IstiodxdsPushErrorsHigh
annotations:
summary: 'istiod push error rate is too high'
description: 'istiod push error rate is higher than 0.05'
expr: >
sum(irate(pilot_xds_push_errors{app="istiod"}[5m])) / sum(irate(pilot_xds_pushes{app="istiod"}[5m])) > 0.05
for: 5m
- alert: IstiodxdsRejectHigh
annotations:
summary: 'istiod reject rate is too high'
description: 'istiod reject rate is higher than 0.05'
expr: >
sum(irate(pilot_total_xds_rejects{app="istiod"}[5m])) / sum(irate(pilot_xds_pushes{app="istiod"}[5m])) > 0.05
for: 5m
- alert: IstiodContainerNotReady
annotations:
summary: 'istiod container not ready'
description: 'The discovery container is not running'
expr: >
kube_pod_container_status_running{namespace="istio-system", container="discovery", component=""} == 0
for: 5m
- alert: IstiodUnavailableReplica
annotations:
summary: 'Istiod replicas unavailable'
description: 'Istiod unavailable replica count > 0'
expr: >
kube_deployment_status_replicas_unavailable{deployment="istiod", component=""} > 0
for: 5m
- alert: Ingress200RateLow
annotations:
summary: 'ingress gateway 200 rate drops'
description: 'The expected rate is 100 requests/sec per namespace; the threshold assumes 15 namespaces'
expr: >
sum(rate(istio_requests_total{reporter="source", source_workload="istio-ingressgateway",response_code="200",destination_service_namespace=~"service-graph.*"}[5m])) < 1490
for: 30m
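
Note: these alert rules are standard Prometheus rule groups; with Amazon Managed Prometheus they would typically be registered as a rule groups namespace rather than loaded by a self-managed Prometheus server. A minimal sketch with boto3, where the workspace ID and file path are placeholders rather than values from this PR:

    import boto3

    # Placeholders; substitute the AMP workspace created by this pattern.
    WORKSPACE_ID = "ws-EXAMPLE"
    RULE_FILE = "lib/common/resources/amp-config/istio/alerting-rules.yml"

    amp = boto3.client("amp")
    with open(RULE_FILE, "rb") as f:
        amp.create_rule_groups_namespace(
            workspaceId=WORKSPACE_ID,
            name="istio-alerting-rules",  # namespace name inside the workspace
            data=f.read(),                # the YAML above, passed as raw bytes
        )

The recording rules file below can be registered the same way under its own namespace name.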
59 changes: 59 additions & 0 deletions lib/common/resources/amp-config/istio/recording-rules.yml
@@ -0,0 +1,59 @@
groups:
- name: "istio.recording-rules"
interval: 5s
rules:
- record: "workload:istio_requests_total"
expr: |
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_requests_total)

- record: "workload:istio_request_duration_milliseconds_count"
expr: |
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_count)

- record: "workload:istio_request_duration_milliseconds_sum"
expr: |
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_sum)

- record: "workload:istio_request_duration_milliseconds_bucket"
expr: |
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_bucket)

- record: "workload:istio_request_bytes_count"
expr: |
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_count)

- record: "workload:istio_request_bytes_sum"
expr: |
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_sum)

- record: "workload:istio_request_bytes_bucket"
expr: |
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_bucket)

- record: "workload:istio_response_bytes_count"
expr: |
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_count)

- record: "workload:istio_response_bytes_sum"
expr: |
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_sum)

- record: "workload:istio_response_bytes_bucket"
expr: |
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_bucket)

- record: "workload:istio_tcp_sent_bytes_total"
expr: |
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_sent_bytes_total)

- record: "workload:istio_tcp_received_bytes_total"
expr: |
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_received_bytes_total)

- record: "workload:istio_tcp_connections_opened_total"
expr: |
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_opened_total)

- record: "workload:istio_tcp_connections_closed_total"
expr: |
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_closed_total)
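
These recording rules pre-aggregate the Istio metrics by summing away the instance, kubernetes_namespace, and kubernetes_pod_name labels, so Amazon Managed Grafana dashboards can query the cheaper workload:* series instead of the raw per-pod metrics. A minimal sketch of such a query against the AMP Prometheus-compatible API with SigV4 signing; the region, workspace ID, and PromQL expression are assumptions, not values from this PR:

    import boto3
    import requests
    from urllib.parse import urlencode
    from botocore.auth import SigV4Auth
    from botocore.awsrequest import AWSRequest

    region = "us-east-1"         # assumption
    workspace_id = "ws-EXAMPLE"  # assumption
    url = (f"https://aps-workspaces.{region}.amazonaws.com"
           f"/workspaces/{workspace_id}/api/v1/query")
    body = urlencode({"query": "sum by (source_workload) (rate(workload:istio_requests_total[5m]))"})
    headers = {"Content-Type": "application/x-www-form-urlencoded"}

    # Sign the request for the "aps" service, then replay it with requests.
    creds = boto3.Session().get_credentials().get_frozen_credentials()
    req = AWSRequest(method="POST", url=url, data=body, headers=headers)
    SigV4Auth(creds, "aps", region).add_auth(req)
    print(requests.post(url, data=body, headers=dict(req.headers)).json())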
107 changes: 105 additions & 2 deletions lib/common/resources/otel-collector-config.yml
@@ -68,7 +68,7 @@ spec:
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/$${1}/proxy/metrics/cadvisor
{{ if enableAPIserverJob }}
{{ start enableAPIserverJob }}
- job_name: 'apiserver'
scheme: https
tls_config:
@@ -94,7 +94,7 @@
regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
replacement: $1
action: drop
{{ end }}
{{ stop enableAPIserverJob }}
- job_name: serviceMonitor/default/kube-prometheus-stack-prometheus-node-exporter/0
honor_timestamps: true
scrape_interval: 30s
@@ -1607,6 +1607,15 @@ spec:
- job_name: 'node-exporter'
kubernetes_sd_configs:
- role: endpoints
ec2_sd_configs:
relabel_configs:
- source_labels: [ __address__ ]
action: keep
regex: '.*:9100$'
- action: replace
source_labels: [__meta_kubernetes_endpoint_node_name]
target_label: nodename
{{ start enableJavaMonJob }}
- job_name: 'kubernetes-java-jmx'
sample_limit: {{javaScrapeSampleLimit}}
metrics_path: {{javaPrometheusMetricsEndpoint}}
@@ -1637,6 +1646,100 @@ spec:
- source_labels: [ __name__ ]
regex: 'jvm_gc_collection_seconds.*'
action: drop
{{ stop enableJavaMonJob }}

{{ start enableNginxMonJob }}
- job_name: 'kubernetes-nginx'
sample_limit: {{nginxScrapeSampleLimit}}
metrics_path: {{nginxPrometheusMetricsEndpoint}}
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [ __address__ ]
action: keep
regex: '.*:10254$'
- source_labels: [__meta_kubernetes_pod_container_name]
target_label: container
action: replace
- source_labels: [__meta_kubernetes_pod_node_name]
target_label: host
action: replace
- source_labels: [__meta_kubernetes_namespace]
target_label: namespace
action: replace
metric_relabel_configs:
- source_labels: [__name__]
regex: 'go_memstats.*'
action: drop
- source_labels: [__name__]
regex: 'go_gc.*'
action: drop
- source_labels: [__name__]
regex: 'go_threads'
action: drop
- regex: exported_host
action: labeldrop
{{ stop enableNginxMonJob }}

{{ start enableIstioMonJob }}
- honor_labels: true
job_name: kubernetes-istio
kubernetes_sd_configs:
- role: pod
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
- action: drop
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
replacement: '[$$2]:$$1'
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_port
- __meta_kubernetes_pod_ip
target_label: __address__
- action: replace
regex: (\d+);((([0-9]+?)(\.|$)){4})
replacement: $$2:$$1
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_port
- __meta_kubernetes_pod_ip
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
replacement: __param_$1
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: pod
- action: keep
source_labels: [ __address__ ]
regex: '.*:15020$$'
- action: drop
regex: Pending|Succeeded|Failed|Completed
source_labels:
- __meta_kubernetes_pod_phase
{{ stop enableIstioMonJob }}
exporters:
prometheusremotewrite:
endpoint: "{{remoteWriteEndpoint}}"
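
The kubernetes-istio scrape job above rewrites __address__ from the prometheus.io/port annotation plus the pod IP, with separate relabel rules for IPv6 and IPv4 pods (the doubled $$ in the replacement is the collector escaping a literal $, so Prometheus itself sees $1/$2). A small Python sketch of what those two rules produce, assuming Prometheus' default ';' separator when joining source labels:

    import re

    # The two __address__ relabel rules from the kubernetes-istio job, with the
    # collector's '$$' escaping undone.
    IPV6_RULE = (r"(\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})", r"[\2]:\1")
    IPV4_RULE = (r"(\d+);((([0-9]+?)(\.|$)){4})", r"\2:\1")

    def rewrite_address(port_annotation: str, pod_ip: str, current: str) -> str:
        # Prometheus joins source_labels with ';' and a 'replace' action only
        # fires when its regex matches the joined value in full.
        joined = f"{port_annotation};{pod_ip}"
        for pattern, replacement in (IPV6_RULE, IPV4_RULE):
            if re.fullmatch(pattern, joined):
                current = re.sub(pattern, replacement, joined)
        return current

    print(rewrite_address("15020", "10.0.12.34", "10.0.12.34:80"))  # 10.0.12.34:15020
    print(rewrite_address("15020", "fd00::1:2", "[fd00::1:2]:80"))  # [fd00::1:2]:15020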