
Commit

update alloy-mixin to 1.4.1
Signed-off-by: Weifeng Wang <[email protected]>

regenerates k8s manifests

Signed-off-by: Weifeng Wang <[email protected]>
qclaogui committed Sep 28, 2024
1 parent 50145c2 commit 57e8a24
Showing 24 changed files with 2,421 additions and 1,365 deletions.
266 changes: 170 additions & 96 deletions kubernetes/common/alloy/manifests/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/microservices-mode/logs/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/microservices-mode/metrics/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/microservices-mode/profiles/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/microservices-mode/traces/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/monolithic-mode/all-in-one/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/monolithic-mode/logs/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/monolithic-mode/metrics/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/monolithic-mode/profiles/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/monolithic-mode/traces/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/read-write-mode/logs/k8s-all-in-one.yaml
266 changes: 170 additions & 96 deletions kubernetes/read-write-mode/metrics/k8s-all-in-one.yaml
(Large diffs for these regenerated manifests are not rendered.)

66 changes: 48 additions & 18 deletions monitoring-mixins/alloy-mixin/deploy/alloy-mixin-alerts.yaml
@@ -3,62 +3,92 @@ groups:
   rules:
   - alert: ClusterNotConverging
     annotations:
-      message: 'Cluster is not converging: nodes report different number of peers in the cluster.'
-    expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) != 0
+      description: 'Cluster is not converging: nodes report different number of peers in the cluster. Job is {{ $labels.job }}'
+      summary: Cluster is not converging.
+    expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) (cluster_node_peers)) != 0
     for: 10m
     labels:
       severity: warning
   - alert: ClusterNodeCountMismatch
     annotations:
-      message: Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.
+      description: Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state. Job is {{ $labels.job }}
+      summary: Nodes report different number of peers vs. the count of observed Alloy metrics.
     expr: |
       sum without (state) (cluster_node_peers) !=
-      on (cluster, namespace, job) group_left
-      count by (cluster, namespace, job) (cluster_node_info)
+      on (cluster, namespace, job, cluster_name) group_left
+      count by (cluster, namespace, job, cluster_name) (cluster_node_info)
     for: 15m
     labels:
       severity: warning
   - alert: ClusterNodeUnhealthy
     annotations:
-      message: Cluster node is reporting a gossip protocol health score > 0.
+      description: Cluster node is reporting a gossip protocol health score > 0. Job is {{ $labels.job }}
+      summary: Cluster unhealthy.
     expr: |
       cluster_node_gossip_health_score > 0
     for: 10m
     labels:
       severity: warning
   - alert: ClusterNodeNameConflict
     annotations:
-      message: A node tried to join the cluster with a name conflicting with an existing peer.
-    expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0
+      description: A node tried to join the cluster with a name conflicting with an existing peer. Job is {{ $labels.job }}
+      summary: Cluster Node Name Conflict.
+    expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0
     for: 10m
     labels:
       severity: warning
   - alert: ClusterNodeStuckTerminating
     annotations:
-      message: Cluster node stuck in Terminating state.
-    expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) > 0
+      description: There is a node within the cluster that is stuck in Terminating state. Job is {{ $labels.job }}
+      summary: Cluster node stuck in Terminating state.
+    expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0
     for: 10m
     labels:
       severity: warning
   - alert: ClusterConfigurationDrift
     annotations:
-      message: Cluster nodes are not using the same configuration file.
+      description: Cluster nodes are not using the same configuration file. Job is {{ $labels.job }}
+      summary: Cluster configuration drifting.
     expr: |
       count without (sha256) (
-        max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info)
+        max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info)
       ) > 1
     for: 5m
     labels:
       severity: warning
 - name: alloy_controller
   rules:
   - alert: SlowComponentEvaluations
     annotations:
-      message: Component evaluations are taking too long.
+      description: Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}.
+      summary: Component evaluations are taking too long.
     expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0
     for: 15m
     labels:
       severity: warning
   - alert: UnhealthyComponents
     annotations:
-      message: Unhealthy components detected.
+      description: Unhealthy components detected under job {{ $labels.job }}
+      summary: Unhealthy components detected.
     expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0
     for: 15m
     labels:
       severity: warning
 - name: alloy_otelcol
   rules:
   - alert: OtelcolReceiverRefusedSpans
     annotations:
-      message: The receiver could not push some spans to the pipeline.
-    expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0
+      description: The receiver could not push some spans to the pipeline under job {{ $labels.job }}. This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter.
+      summary: The receiver could not push some spans to the pipeline.
+    expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0
     for: 5m
     labels:
       severity: warning
   - alert: OtelcolExporterFailedSpans
     annotations:
-      message: The exporter failed to send spans to their destination.
-    expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0
+      description: The exporter failed to send spans to their destination under job {{ $labels.job }}. There could be an issue with the payload or with the destination endpoint.
+      summary: The exporter failed to send spans to their destination.
+    expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0
     for: 5m
     labels:
       severity: warning
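
Taken together, the hunk above makes two changes to every alert: the single `message` annotation is split into `description` (now interpolating `{{ $labels.job }}`) and `summary`, and the clustering expressions aggregate over the additional `cluster_name` label. For reference, this is how the ClusterNotConverging rule reads after the change, assembled from the diff; the enclosing group name is not visible in the hunk and is assumed here:

```yaml
groups:
- name: alloy_clustering  # assumed group name; the hunk starts below it
  rules:
  - alert: ClusterNotConverging
    annotations:
      description: 'Cluster is not converging: nodes report different number of peers in the cluster. Job is {{ $labels.job }}'
      summary: Cluster is not converging.
    # cluster_name is new in 1.4.1, so clusters whose nodes share a job but
    # carry different cluster_name values are evaluated separately.
    expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) (cluster_node_peers)) != 0
    for: 10m
    labels:
      severity: warning
```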
@@ -20,7 +20,7 @@
       "title": "Documentation",
       "tooltip": "Clustering documentation",
       "type": "link",
-      "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode"
+      "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering"
     },
     {
       "asDropdown": true,
@@ -441,11 +441,8 @@
       "type": "query"
     },
     {
-      "allValue": ".*",
       "datasource": "${datasource}",
-      "includeAll": true,
       "label": "job",
-      "multi": true,
       "name": "job",
       "query": {
         "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n",
@@ -20,7 +20,7 @@
       "title": "Documentation",
       "tooltip": "Clustering documentation",
       "type": "link",
-      "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode"
+      "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering"
     },
     {
       "asDropdown": true,
@@ -251,6 +251,32 @@
       ],
       "title": "Convergance state timeline",
       "type": "state-timeline"
+    },
+    {
+      "datasource": "${datasource}",
+      "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "peers"
+        }
+      },
+      "gridPos": {
+        "h": 12,
+        "w": 24,
+        "x": 0,
+        "y": 18
+      },
+      "targets": [
+        {
+          "datasource": "${datasource}",
+          "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n",
+          "instant": false,
+          "legendFormat": "{{instance}}",
+          "range": true
+        }
+      ],
+      "title": "Number of peers seen by each instance",
+      "type": "timeseries"
     }
   ],
   "refresh": "10s",
@@ -301,11 +327,8 @@
       "type": "query"
     },
     {
-      "allValue": ".*",
       "datasource": "${datasource}",
-      "includeAll": true,
       "label": "job",
-      "multi": true,
       "name": "job",
       "query": {
         "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n",
@@ -57,7 +57,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n",
+      "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n",
       "instant": false,
       "legendFormat": "__auto",
       "range": true
@@ -505,11 +505,8 @@
       "type": "query"
     },
     {
-      "allValue": ".*",
       "datasource": "${datasource}",
-      "includeAll": true,
       "label": "job",
-      "multi": true,
       "name": "job",
       "query": {
         "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n",
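
The stat-panel query above now derives the instance count from `alloy_component_controller_running_components` instead of `alloy_component_controller_evaluating`: `group(...) by (instance)` collapses each instance's many per-component series into a single series, and the outer `count()` then counts instances. A minimal sketch of the same idiom as a standalone recording rule; the group and rule names are illustrative assumptions, not part of this commit:

```yaml
# Hypothetical recording rule demonstrating the count(group(...)) idiom used
# by the updated panel. group() deduplicates label combinations per instance,
# so count() returns the number of distinct Alloy instances, not series.
groups:
- name: alloy_examples  # assumed name
  rules:
  - record: alloy:running_instances:count  # assumed name
    expr: |
      count(
        group(alloy_component_controller_running_components) by (instance)
      )
```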
@@ -320,5 +320,5 @@
   },
   "timezone": "utc",
   "title": "Alloy / Logs Overview",
-  "uid": "alloy--logs-overview"
+  "uid": "53c1ecddc3a1d5d4b8d6cd0c23676c31"
 }
@@ -49,7 +49,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
+      "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
       "instant": false,
       "legendFormat": "{{ pod }} / {{ transport }}",
       "range": true
@@ -81,7 +81,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
+      "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
       "instant": false,
       "legendFormat": "{{ pod }} / {{ transport }}",
       "range": true
@@ -192,7 +192,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n",
+      "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n",
       "format": "heatmap",
       "instant": false,
       "legendFormat": "{{le}}",
@@ -214,7 +214,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n",
+      "expr": "otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n",
       "instant": false,
       "legendFormat": "{{ pod }}",
       "range": true
@@ -235,7 +235,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
+      "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
       "instant": false,
       "legendFormat": "{{ pod }}",
       "range": true
@@ -278,7 +278,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
+      "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
       "instant": false,
       "legendFormat": "{{ pod }}",
       "range": true
@@ -310,7 +310,7 @@
   "targets": [
     {
       "datasource": "${datasource}",
-      "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
+      "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n",
       "instant": false,
       "legendFormat": "{{ pod }}",
       "range": true
@@ -368,11 +368,8 @@
       "type": "query"
     },
     {
-      "allValue": ".*",
       "datasource": "${datasource}",
-      "includeAll": true,
       "label": "job",
-      "multi": true,
       "name": "job",
       "query": {
         "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n",
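
All the renames above follow one pattern: the dashboard drops the normalized `*_ratio_total` / `*_ratio` metric names in favor of the `otelcol_`-prefixed names that Alloy now exposes. If out-of-tree dashboards or rules still reference the old names, one option during migration is to alias the renamed series with recording rules. A hedged sketch; the group name, and the decision to alias at all, are assumptions and not part of this commit:

```yaml
# Hypothetical compatibility shim: re-expose two of the renamed otelcol_*
# counters under their old normalized names while external consumers migrate.
groups:
- name: otelcol_rename_compat  # assumed name
  rules:
  - record: receiver_refused_spans_ratio_total
    expr: otelcol_receiver_refused_spans_total
  - record: exporter_send_failed_spans_ratio_total
    expr: otelcol_exporter_send_failed_spans_total
```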