
Commit

update alloy-mixin to 1.4.1
Signed-off-by: Weifeng Wang <[email protected]>

regenerates k8s manifests

Signed-off-by: Weifeng Wang <[email protected]>
qclaogui committed Sep 28, 2024
1 parent 50145c2 commit 57e8a24
Showing 24 changed files with 2,421 additions and 1,365 deletions.
266 changes: 170 additions & 96 deletions kubernetes/common/alloy/manifests/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/microservices-mode/logs/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/microservices-mode/metrics/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/microservices-mode/profiles/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/microservices-mode/traces/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/monolithic-mode/all-in-one/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/monolithic-mode/logs/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/monolithic-mode/metrics/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/monolithic-mode/profiles/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/monolithic-mode/traces/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/read-write-mode/logs/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/read-write-mode/metrics/k8s-all-in-one.yaml
(Large diffs for these regenerated manifests are not rendered.)

66 changes: 48 additions & 18 deletions monitoring-mixins/alloy-mixin/deploy/alloy-mixin-alerts.yaml
@@ -3,62 +3,92 @@ groups:
   rules:
   - alert: ClusterNotConverging
     annotations:
-      message: 'Cluster is not converging: nodes report different number of peers in the cluster.'
-    expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) != 0
+      description: 'Cluster is not converging: nodes report different number of peers in the cluster. Job is {{ $labels.job }}'
+      summary: Cluster is not converging.
+    expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) (cluster_node_peers)) != 0
     for: 10m
     labels:
       severity: warning
   - alert: ClusterNodeCountMismatch
     annotations:
-      message: Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.
+      description: Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state. Job is {{ $labels.job }}
+      summary: Nodes report different number of peers vs. the count of observed Alloy metrics.
     expr: |
       sum without (state) (cluster_node_peers) !=
-      on (cluster, namespace, job) group_left
-      count by (cluster, namespace, job) (cluster_node_info)
+      on (cluster, namespace, job, cluster_name) group_left
+      count by (cluster, namespace, job, cluster_name) (cluster_node_info)
     for: 15m
     labels:
       severity: warning
   - alert: ClusterNodeUnhealthy
     annotations:
-      message: Cluster node is reporting a gossip protocol health score > 0.
+      description: Cluster node is reporting a gossip protocol health score > 0. Job is {{ $labels.job }}
+      summary: Cluster unhealthy.
     expr: |
       cluster_node_gossip_health_score > 0
     for: 10m
     labels:
       severity: warning
   - alert: ClusterNodeNameConflict
     annotations:
-      message: A node tried to join the cluster with a name conflicting with an existing peer.
-    expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0
+      description: A node tried to join the cluster with a name conflicting with an existing peer. Job is {{ $labels.job }}
+      summary: Cluster Node Name Conflict.
+    expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0
     for: 10m
     labels:
       severity: warning
   - alert: ClusterNodeStuckTerminating
     annotations:
-      message: Cluster node stuck in Terminating state.
-    expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) > 0
+      description: There is a node within the cluster that is stuck in Terminating state. Job is {{ $labels.job }}
+      summary: Cluster node stuck in Terminating state.
+    expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0
     for: 10m
     labels:
       severity: warning
   - alert: ClusterConfigurationDrift
     annotations:
-      message: Cluster nodes are not using the same configuration file.
+      description: Cluster nodes are not using the same configuration file. Job is {{ $labels.job }}
+      summary: Cluster configuration drifting.
     expr: |
       count without (sha256) (
-        max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info)
+        max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info)
       ) > 1
     for: 5m
     labels:
       severity: warning
 - name: alloy_controller
   rules:
   - alert: SlowComponentEvaluations
     annotations:
-      message: Component evaluations are taking too long.
+      description: Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.
+      summary: Component evaluations are taking too long.
     expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0
     for: 15m
     labels:
       severity: warning
   - alert: UnhealthyComponents
     annotations:
-      message: Unhealthy components detected.
+      description: Unhealthy components detected under job {{ $labels.job }}
+      summary: Unhealthy components detected.
     expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0
     for: 15m
     labels:
       severity: warning
 - name: alloy_otelcol
   rules:
   - alert: OtelcolReceiverRefusedSpans
     annotations:
-      message: The receiver could not push some spans to the pipeline.
-    expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0
+      description: The receiver could not push some spans to the pipeline under job {{ $labels.job }}. This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter.
+      summary: The receiver could not push some spans to the pipeline.
+    expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0
     for: 5m
     labels:
       severity: warning
   - alert: OtelcolExporterFailedSpans
     annotations:
-      message: The exporter failed to send spans to their destination.
-    expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0
+      description: The exporter failed to send spans to their destination under job {{ $labels.job }}. There could be an issue with the payload or with the destination endpoint.
+      summary: The exporter failed to send spans to their destination.
+    expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0
     for: 5m
     labels:
       severity: warning
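
Taken together, the hunk above makes two changes to every alert: the single `message` annotation is split into `description` (now interpolating `{{ $labels.job }}`) and `summary`, and the clustering expressions aggregate over the additional `cluster_name` label. For reference, this is how the ClusterNotConverging rule reads after the change, assembled from the diff; the enclosing group name is not visible in the hunk and is assumed here:

```yaml
groups:
- name: alloy_clustering  # assumed group name; the hunk starts below it
  rules:
  - alert: ClusterNotConverging
    annotations:
      description: 'Cluster is not converging: nodes report different number of peers in the cluster. Job is {{ $labels.job }}'
      summary: Cluster is not converging.
    # cluster_name is new in 1.4.1, so clusters whose nodes share a job but
    # carry different cluster_name values are evaluated separately.
    expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) (cluster_node_peers)) != 0
    for: 10m
    labels:
      severity: warning
```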
@@ -20,7 +20,7 @@
       "title": "Documentation",
       "tooltip": "Clustering documentation",
       "type": "link",
-      "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode"
+      "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering"
     },
     {
       "asDropdown": true,
@@ -441,11 +441,8 @@
       "type": "query"
     },
     {
-      "allValue": ".*",
       "datasource": "${datasource}",
-      "includeAll": true,
       "label": "job",
-      "multi": true,
       "name": "job",
       "query": {
         "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n",
@@ -20,7 +20,7 @@
       "title": "Documentation",
       "tooltip": "Clustering documentation",
       "type": "link",
-      "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode"
+      "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering"
     },
     {
       "asDropdown": true,
@@ -251,6 +251,32 @@
       ],
       "title": "Convergance state timeline",
       "type": "state-timeline"
+    },
+    {
+      "datasource": "${datasource}",
+      "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "peers"
+        }
+      },
+      "gridPos": {
+        "h": 12,
+        "w": 24,
+        "x": 0,
+        "y": 18
+      },
+      "targets": [
+        {
+          "datasource": "${datasource}",
+          "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n",
+          "instant": false,
+          "legendFormat": "{{instance}}",
+          "range": true
+        }
+      ],
+      "title": "Number of peers seen by each instance",
+      "type": "timeseries"
     }
   ],
   "refresh": "10s",
@@ -301,11 +327,8 @@
       "type": "query"
     },
     {
-      "allValue": ".*",
       "datasource": "${datasource}",
-      "includeAll": true,
       "label": "job",
-      "multi": true,
       "name": "job",
       "query": {
         "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n",
@@ -57,7 +57,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n",
+      "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n",
       "instant": false,
       "legendFormat": "__auto",
       "range": true
@@ -505,11 +505,8 @@
       "type": "query"
     },
     {
-      "allValue": ".*",
       "datasource": "${datasource}",
-      "includeAll": true,
       "label": "job",
-      "multi": true,
       "name": "job",
       "query": {
         "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n",
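
The stat-panel query above now derives the instance count from `alloy_component_controller_running_components` instead of `alloy_component_controller_evaluating`: `group(...) by (instance)` collapses each instance's many per-component series into a single series, and the outer `count()` then counts instances. A minimal sketch of the same idiom as a standalone recording rule; the group and rule names are illustrative assumptions, not part of this commit:

```yaml
# Hypothetical recording rule demonstrating the count(group(...)) idiom used
# by the updated panel. group() deduplicates label combinations per instance,
# so count() returns the number of distinct Alloy instances, not series.
groups:
- name: alloy_examples  # assumed name
  rules:
  - record: alloy:running_instances:count  # assumed name
    expr: |
      count(
        group(alloy_component_controller_running_components) by (instance)
      )
```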
@@ -320,5 +320,5 @@
   },
   "timezone": "utc",
   "title": "Alloy / Logs Overview",
-  "uid": "alloy--logs-overview"
+  "uid": "53c1ecddc3a1d5d4b8d6cd0c23676c31"
 }
@@ -49,7 +49,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
+      "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
       "instant": false,
       "legendFormat": "{{ pod }} / {{ transport }}",
       "range": true
@@ -81,7 +81,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
+      "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
       "instant": false,
       "legendFormat": "{{ pod }} / {{ transport }}",
       "range": true
@@ -192,7 +192,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n",
+      "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n",
       "format": "heatmap",
       "instant": false,
       "legendFormat": "{{le}}",
@@ -214,7 +214,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n",
+      "expr": "otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n",
       "instant": false,
       "legendFormat": "{{ pod }}",
       "range": true
@@ -235,7 +235,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
+      "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
       "instant": false,
       "legendFormat": "{{ pod }}",
       "range": true
@@ -278,7 +278,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
+      "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
       "instant": false,
       "legendFormat": "{{ pod }}",
       "range": true
@@ -310,7 +310,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
+      "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
       "instant": false,
       "legendFormat": "{{ pod }}",
       "range": true
@@ -368,11 +368,8 @@
       "type": "query"
     },
     {
-      "allValue": ".*",
       "datasource": "${datasource}",
-      "includeAll": true,
       "label": "job",
-      "multi": true,
       "name": "job",
       "query": {
         "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n",
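
All the renames above follow one pattern: the dashboard drops the normalized `*_ratio_total` / `*_ratio` metric names in favor of the `otelcol_`-prefixed names that Alloy now exposes. If out-of-tree dashboards or rules still reference the old names, one option during migration is to alias the renamed series with recording rules. A hedged sketch; the group name, and the decision to alias at all, are assumptions and not part of this commit:

```yaml
# Hypothetical compatibility shim: re-expose two of the renamed otelcol_*
# counters under their old normalized names while external consumers migrate.
groups:
- name: otelcol_rename_compat  # assumed name
  rules:
  - record: receiver_refused_spans_ratio_total
    expr: otelcol_receiver_refused_spans_total
  - record: exporter_send_failed_spans_ratio_total
    expr: otelcol_exporter_send_failed_spans_total
```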