From d01e2c02071121d4b7cf68747005578ce4089e4f Mon Sep 17 00:00:00 2001 From: Weifeng Wang Date: Thu, 25 Apr 2024 15:26:56 +0800 Subject: [PATCH] kubernetes jobs Metrics Scrape Signed-off-by: Weifeng Wang --- Makefile | 2 + .../kubernetes/integrations/mysql.alloy | 6 +- alloy-modules/kubernetes/jobs/README.md | 17 + alloy-modules/kubernetes/jobs/apiserver.alloy | 206 + .../kubernetes/jobs/kube-state-metrics.alloy | 153 + alloy-modules/kubernetes/jobs/kubelet.alloy | 292 + .../kubernetes/jobs/node-exporter.alloy | 228 + .../kubernetes/logs/keep-labels.alloy | 9 +- go.mod | 30 +- go.sum | 124 +- .../kubernetes/integrations/mysql.alloy | 6 +- .../alloy/configs/kubernetes/jobs/README.md | 17 + .../configs/kubernetes/jobs/apiserver.alloy | 206 + .../kubernetes/jobs/kube-state-metrics.alloy | 153 + .../configs/kubernetes/jobs/kubelet.alloy | 292 + .../kubernetes/jobs/node-exporter.alloy | 228 + .../configs/kubernetes/logs/keep-labels.alloy | 9 +- kubernetes/common/alloy/kustomization.yaml | 10 + .../alloy/manifests/k8s-all-in-one.yaml | 3232 +- kubernetes/common/alloy/values-k3d-k3s.yaml | 5 + .../grafana-pdc-agent/kustomization.yaml | 30 + .../pdc-agent-deployment.yaml | 45 + kubernetes/common/grafana/configs/grafana.ini | 2 +- .../configs/home-kubernetes-monitoring.json | 676 + kubernetes/common/grafana/kustomization.yaml | 8 + .../grafana/manifests/k8s-all-in-one.yaml | 694 +- .../manifests/k8s-all-in-one.yaml | 450 +- .../kube-prometheus-stack/values-k3d-k3s.yaml | 35 + .../kube-state-metrics/kustomization.yaml | 10 + .../manifests/k8s-all-in-one.yaml | 353 + .../kube-state-metrics/values-k3d-k3s.yaml | 6 + .../common/memcached/kustomization.yaml | 1 + .../memcached/manifests/k8s-all-in-one.yaml | 1325 +- .../common/memcached/values-k3d-k3s.yaml | 3 +- .../mysql/manifests/k8s-all-in-one.yaml | 3 +- kubernetes/common/mysql/values-k3d-k3s.yaml | 3 +- kubernetes/common/opencost/kustomization.yaml | 10 + .../common/opencost/values-k3d-k3s.yaml | 25 + 
.../kustomization.yaml | 10 + .../manifests/k8s-all-in-one.yaml | 152 + .../values-k3d-k3s.yaml | 4 + .../redis/manifests/k8s-all-in-one.yaml | 3 +- kubernetes/common/redis/values-k3d-k3s.yaml | 3 +- .../logs/k8s-all-in-one.yaml | 3272 +- kubernetes/microservices-mode/logs/logs.alloy | 20 +- .../metrics/k8s-all-in-one.yaml | 3401 +- .../metrics/kustomization.yaml | 1 + .../microservices-mode/metrics/metrics.alloy | 30 + .../profiles/k8s-all-in-one.yaml | 3401 +- .../profiles/kustomization.yaml | 1 + .../profiles/profiles.alloy | 30 + .../traces/k8s-all-in-one.yaml | 3401 +- .../traces/kustomization.yaml | 1 + .../microservices-mode/traces/traces.alloy | 30 + .../all-in-one/all-in-one.alloy | 30 +- .../all-in-one/k8s-all-in-one.yaml | 3775 +- .../all-in-one/kustomization.yaml | 12 + .../monolithic-mode/logs/k8s-all-in-one.yaml | 3272 +- kubernetes/monolithic-mode/logs/logs.alloy | 20 +- .../metrics/k8s-all-in-one.yaml | 3401 +- .../metrics/kustomization.yaml | 1 + .../monolithic-mode/metrics/metrics.alloy | 30 + .../profiles/k8s-all-in-one.yaml | 3247 +- .../monolithic-mode/profiles/profiles.alloy | 30 + .../traces/k8s-all-in-one.yaml | 3247 +- .../monolithic-mode/traces/traces.alloy | 30 + .../read-write-mode/logs/k8s-all-in-one.yaml | 3272 +- kubernetes/read-write-mode/logs/logs.alloy | 20 +- .../metrics/k8s-all-in-one.yaml | 3401 +- .../metrics/kustomization.yaml | 1 + .../read-write-mode/metrics/metrics.alloy | 30 + monitoring-mixins/k8s-all-in-one.yaml | 73487 ++++++++++------ .../kubernetes-mixin/deploy.libsonnet | 9 +- .../deploy/dashboards_out/apiserver.json | 6 +- .../deploy/dashboards_out/cluster-total.json | 2145 +- .../dashboards_out/controller-manager.json | 1389 +- .../dashboards_out/k8s-resources-cluster.json | 3884 +- .../k8s-resources-multicluster.json | 1621 +- .../k8s-resources-namespace.json | 3507 +- .../dashboards_out/k8s-resources-node.json | 6 +- .../dashboards_out/k8s-resources-pod.json | 3087 +- .../k8s-resources-workload.json | 2413 +- 
.../k8s-resources-workloads-namespace.json | 6 +- .../deploy/dashboards_out/kubelet.json | 6 +- .../dashboards_out/namespace-by-pod.json | 6 +- .../dashboards_out/namespace-by-workload.json | 6 +- .../persistentvolumesusage.json | 6 +- .../deploy/dashboards_out/pod-total.json | 6 +- .../deploy/dashboards_out/proxy.json | 6 +- .../deploy/dashboards_out/scheduler.json | 6 +- .../deploy/dashboards_out/workload-total.json | 6 +- .../deploy/kubernetes-mixin-rules.yaml | 14 +- .../deploy/kustomization.yaml | 4 + .../deploy/manifests/k8s-all-in-one.yaml | 19795 ++--- .../deploy/prometheus-rules.yaml | 14 +- .../kubernetes-mixin/jsonnetfile.lock.json | 18 +- monitoring-mixins/kustomization.yaml | 4 +- 97 files changed, 106425 insertions(+), 51515 deletions(-) create mode 100644 alloy-modules/kubernetes/jobs/README.md create mode 100644 alloy-modules/kubernetes/jobs/apiserver.alloy create mode 100644 alloy-modules/kubernetes/jobs/kube-state-metrics.alloy create mode 100644 alloy-modules/kubernetes/jobs/kubelet.alloy create mode 100644 alloy-modules/kubernetes/jobs/node-exporter.alloy create mode 100644 kubernetes/common/alloy/configs/kubernetes/jobs/README.md create mode 100644 kubernetes/common/alloy/configs/kubernetes/jobs/apiserver.alloy create mode 100644 kubernetes/common/alloy/configs/kubernetes/jobs/kube-state-metrics.alloy create mode 100644 kubernetes/common/alloy/configs/kubernetes/jobs/kubelet.alloy create mode 100644 kubernetes/common/alloy/configs/kubernetes/jobs/node-exporter.alloy create mode 100644 kubernetes/common/grafana-pdc-agent/kustomization.yaml create mode 100644 kubernetes/common/grafana-pdc-agent/pdc-agent-deployment.yaml create mode 100644 kubernetes/common/grafana/configs/home-kubernetes-monitoring.json create mode 100644 kubernetes/common/kube-state-metrics/kustomization.yaml create mode 100644 kubernetes/common/kube-state-metrics/manifests/k8s-all-in-one.yaml create mode 100644 kubernetes/common/kube-state-metrics/values-k3d-k3s.yaml create 
mode 100644 kubernetes/common/opencost/kustomization.yaml create mode 100644 kubernetes/common/opencost/values-k3d-k3s.yaml create mode 100644 kubernetes/common/prometheus-node-exporter/kustomization.yaml create mode 100644 kubernetes/common/prometheus-node-exporter/manifests/k8s-all-in-one.yaml create mode 100644 kubernetes/common/prometheus-node-exporter/values-k3d-k3s.yaml diff --git a/Makefile b/Makefile index 0cba3482..49522d1b 100644 --- a/Makefile +++ b/Makefile @@ -212,10 +212,12 @@ manifests-common: $(KUSTOMIZE) @$(KUSTOMIZE) build --enable-helm kubernetes/common/gateway > kubernetes/common/gateway/manifests/k8s-all-in-one.yaml @$(KUSTOMIZE) build --enable-helm kubernetes/common/grafana > kubernetes/common/grafana/manifests/k8s-all-in-one.yaml @$(KUSTOMIZE) build --enable-helm kubernetes/common/kube-prometheus-stack > kubernetes/common/kube-prometheus-stack/manifests/k8s-all-in-one.yaml + @$(KUSTOMIZE) build --enable-helm kubernetes/common/kube-state-metrics > kubernetes/common/kube-state-metrics/manifests/k8s-all-in-one.yaml @$(KUSTOMIZE) build --enable-helm kubernetes/common/memcached > kubernetes/common/memcached/manifests/k8s-all-in-one.yaml @$(KUSTOMIZE) build --enable-helm kubernetes/common/minio-operator > kubernetes/common/minio-operator/manifests/k8s-all-in-one.yaml @$(KUSTOMIZE) build --enable-helm kubernetes/common/minio-tenant > kubernetes/common/minio-tenant/manifests/k8s-all-in-one.yaml @$(KUSTOMIZE) build --enable-helm kubernetes/common/mysql > kubernetes/common/mysql/manifests/k8s-all-in-one.yaml + @$(KUSTOMIZE) build --enable-helm kubernetes/common/prometheus-node-exporter > kubernetes/common/prometheus-node-exporter/manifests/k8s-all-in-one.yaml @$(KUSTOMIZE) build --enable-helm kubernetes/common/prometheus-operator-crds > kubernetes/common/prometheus-operator-crds/manifests/k8s-all-in-one.yaml @$(KUSTOMIZE) build --enable-helm kubernetes/common/rancher-pushprox > kubernetes/common/rancher-pushprox/manifests/k8s-all-in-one.yaml 
@$(KUSTOMIZE) build --enable-helm kubernetes/common/redis > kubernetes/common/redis/manifests/k8s-all-in-one.yaml diff --git a/alloy-modules/kubernetes/integrations/mysql.alloy b/alloy-modules/kubernetes/integrations/mysql.alloy index 9a7557b5..142e33e1 100644 --- a/alloy-modules/kubernetes/integrations/mysql.alloy +++ b/alloy-modules/kubernetes/integrations/mysql.alloy @@ -17,7 +17,7 @@ declare "mysql_metrics_scrape" { } argument "namespace" { - comment = "kubernetes secret name (default: monitoring-system)" + comment = "kubernetes secret namespace (default: monitoring-system)" optional = true } @@ -27,8 +27,8 @@ declare "mysql_metrics_scrape" { } argument "keep_metrics" { + comment = "A regex of metrics to keep (default: see below)" optional = true - default = "(up|instance:mysql_heartbeat_lag_seconds|instance:mysql_slave_lag_seconds|mysql_global_status_aborted_clients|mysql_global_status_aborted_connects|mysql_global_status_buffer_pool_pages|mysql_global_status_bytes_received|mysql_global_status_bytes_sent|mysql_global_status_commands_total|mysql_global_status_created_tmp_disk_tables|mysql_global_status_created_tmp_files|mysql_global_status_created_tmp_tables|mysql_global_status_handlers_total|mysql_global_status_innodb_log_waits|mysql_global_status_innodb_mem_adaptive_hash|mysql_global_status_innodb_mem_dictionary|mysql_global_status_innodb_num_open_files|mysql_global_status_innodb_page_size|mysql_global_status_max_used_connections|mysql_global_status_open_files|mysql_global_status_open_table_definitions|mysql_global_status_open_tables|mysql_global_status_opened_files|mysql_global_status_opened_table_definitions|mysql_global_status_opened_tables|mysql_global_status_qcache_free_memory|mysql_global_status_qcache_hits|mysql_global_status_qcache_inserts|mysql_global_status_qcache_lowmem_prunes|mysql_global_status_qcache_not_cached|mysql_global_status_qcache_queries_in_cache|mysql_global_status_queries|mysql_global_status_questions|mysql_global_status_select_full_joi
n|mysql_global_status_select_full_range_join|mysql_global_status_select_range|mysql_global_status_select_range_check|mysql_global_status_select_scan|mysql_global_status_slow_queries|mysql_global_status_sort_merge_passes|mysql_global_status_sort_range|mysql_global_status_sort_rows|mysql_global_status_sort_scan|mysql_global_status_table_locks_immediate|mysql_global_status_table_locks_waited|mysql_global_status_table_open_cache_hits|mysql_global_status_table_open_cache_misses|mysql_global_status_table_open_cache_overflows|mysql_global_status_threads_cached|mysql_global_status_threads_connected|mysql_global_status_threads_created|mysql_global_status_threads_running|mysql_global_status_uptime|mysql_global_status_wsrep_local_recv_queue|mysql_global_status_wsrep_local_state|mysql_global_status_wsrep_ready|mysql_global_variables_innodb_additional_mem_pool_size|mysql_global_variables_innodb_buffer_pool_size|mysql_global_variables_innodb_log_buffer_size|mysql_global_variables_key_buffer_size|mysql_global_variables_max_connections|mysql_global_variables_open_files_limit|mysql_global_variables_query_cache_size|mysql_global_variables_table_definition_cache|mysql_global_variables_table_open_cache|mysql_global_variables_thread_cache_size|mysql_global_variables_tokudb_cache_size|mysql_global_variables_wsrep_desync|mysql_heartbeat_now_timestamp_seconds|mysql_heartbeat_stored_timestamp_seconds|mysql_info_schema_processlist_threads|mysql_slave_status_seconds_behind_master|mysql_slave_status_slave_io_running|mysql_slave_status_slave_sql_running|mysql_slave_status_sql_delay|mysql_up)" } argument "scrape_interval" { @@ -100,7 +100,7 @@ declare "mysql_metrics_scrape" { // keep only metrics that match the keep_metrics regex rule { source_labels = ["__name__"] - regex = argument.keep_metrics.value + regex = coalesce(argument.keep_metrics.value, 
"(up|instance:mysql_heartbeat_lag_seconds|instance:mysql_slave_lag_seconds|mysql_global_status_aborted_clients|mysql_global_status_aborted_connects|mysql_global_status_buffer_pool_pages|mysql_global_status_bytes_received|mysql_global_status_bytes_sent|mysql_global_status_commands_total|mysql_global_status_created_tmp_disk_tables|mysql_global_status_created_tmp_files|mysql_global_status_created_tmp_tables|mysql_global_status_handlers_total|mysql_global_status_innodb_log_waits|mysql_global_status_innodb_mem_adaptive_hash|mysql_global_status_innodb_mem_dictionary|mysql_global_status_innodb_num_open_files|mysql_global_status_innodb_page_size|mysql_global_status_max_used_connections|mysql_global_status_open_files|mysql_global_status_open_table_definitions|mysql_global_status_open_tables|mysql_global_status_opened_files|mysql_global_status_opened_table_definitions|mysql_global_status_opened_tables|mysql_global_status_qcache_free_memory|mysql_global_status_qcache_hits|mysql_global_status_qcache_inserts|mysql_global_status_qcache_lowmem_prunes|mysql_global_status_qcache_not_cached|mysql_global_status_qcache_queries_in_cache|mysql_global_status_queries|mysql_global_status_questions|mysql_global_status_select_full_join|mysql_global_status_select_full_range_join|mysql_global_status_select_range|mysql_global_status_select_range_check|mysql_global_status_select_scan|mysql_global_status_slow_queries|mysql_global_status_sort_merge_passes|mysql_global_status_sort_range|mysql_global_status_sort_rows|mysql_global_status_sort_scan|mysql_global_status_table_locks_immediate|mysql_global_status_table_locks_waited|mysql_global_status_table_open_cache_hits|mysql_global_status_table_open_cache_misses|mysql_global_status_table_open_cache_overflows|mysql_global_status_threads_cached|mysql_global_status_threads_connected|mysql_global_status_threads_created|mysql_global_status_threads_running|mysql_global_status_uptime|mysql_global_status_wsrep_local_recv_queue|mysql_global_status_wsrep_local_s
tate|mysql_global_status_wsrep_ready|mysql_global_variables_innodb_additional_mem_pool_size|mysql_global_variables_innodb_buffer_pool_size|mysql_global_variables_innodb_log_buffer_size|mysql_global_variables_key_buffer_size|mysql_global_variables_max_connections|mysql_global_variables_open_files_limit|mysql_global_variables_query_cache_size|mysql_global_variables_table_definition_cache|mysql_global_variables_table_open_cache|mysql_global_variables_thread_cache_size|mysql_global_variables_tokudb_cache_size|mysql_global_variables_wsrep_desync|mysql_heartbeat_now_timestamp_seconds|mysql_heartbeat_stored_timestamp_seconds|mysql_info_schema_processlist_threads|mysql_slave_status_seconds_behind_master|mysql_slave_status_slave_io_running|mysql_slave_status_slave_sql_running|mysql_slave_status_sql_delay|mysql_up)") action = "keep" } } diff --git a/alloy-modules/kubernetes/jobs/README.md b/alloy-modules/kubernetes/jobs/README.md new file mode 100644 index 00000000..d7ef02dc --- /dev/null +++ b/alloy-modules/kubernetes/jobs/README.md @@ -0,0 +1,17 @@ +# Module Components + +Kubernetes Jobs Module Components + +## Components + +- [`apiserver`](#apiserver_metrics_scrape) + +### `apiserver_metrics_scrape` + +kubernetes Apiserver Metrics Scrape + +***Arguments*** + +***Exports*** + +***Example*** diff --git a/alloy-modules/kubernetes/jobs/apiserver.alloy b/alloy-modules/kubernetes/jobs/apiserver.alloy new file mode 100644 index 00000000..57078202 --- /dev/null +++ b/alloy-modules/kubernetes/jobs/apiserver.alloy @@ -0,0 +1,206 @@ +/* +Module Components: apiserver +Description: kubernetes Apiserver Metrics Scrape + +*/ + +declare "apiserver_metrics_scrape" { + + /******************************************** + * ARGUMENTS + ********************************************/ + argument "forward_to" { + comment = "Must be a list(MetricsReceiver) where collected metrics should be forwarded to" + } + + argument "cluster" { } + + argument "namespaces" { + comment = "The namespaces to look 
for targets in (default: default)" + optional = true + } + + argument "field_selectors" { + // Docs: https://kubernetes.io/docs/concepts/overview/working-with-objects/field-selectors/ + comment = "The field selectors to use to find matching targets (default: [\"metadata.name=kubernetes\"])" + optional = true + } + + argument "label_selectors" { + // Docs: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ + comment = "The label selectors to use to find matching targets (default: [])" + optional = true + } + + argument "port_name" { + comment = "The name of the port to scrape metrics from (default: https)" + optional = true + } + + argument "job_label" { + comment = "The job label to add for all kube-apiserver metrics (default: integrations/kubernetes/apiserver)" + optional = true + } + + argument "keep_metrics" { + comment = "A regex of metrics to keep (default: see below)" + optional = true + } + + // the default drop_metrics and drop_les regexes are taken from kube-prometheus + // https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/kubernetesControlPlane-serviceMonitorApiserver.yaml + argument "drop_metrics" { + comment = "A regular expression of metrics to drop (default: see below)" + optional = true + } + + argument "drop_les" { + comment = "Regular expression of metric name and le label value pairs, joined by ';', to drop (default: see below)" + optional = true + } + + argument "scrape_interval" { + comment = "How often to scrape metrics from the targets (default: 60s)" + optional = true + } + + argument "scrape_timeout" { + comment = "How long before a scrape times out (default: 10s)" + optional = true + } + + argument "max_cache_size" { + comment = "The maximum number of elements to hold in the relabeling cache (default: 100000). This should be at least 2x-5x your largest scrape target or samples appended rate."
+ optional = true + } + + /***************************************************************** + * Targets From Service Discovery + *****************************************************************/ + discovery.kubernetes "apiserver" { + role = "service" + + selectors { + role = "service" + field = join(coalesce(argument.field_selectors.value, ["metadata.name=kubernetes"]), ",") + label = join(coalesce(argument.label_selectors.value, []), ",") + } + + namespaces { + names = coalesce(argument.namespaces.value, ["default"]) + } + } + + /***************************************************************** + * Discovery Relabelings (pre-scrape) + *****************************************************************/ + discovery.relabel "apiserver" { + targets = discovery.kubernetes.apiserver.targets + + // only keep targets with a matching port name + rule { + source_labels = ["__meta_kubernetes_service_port_name"] + regex = coalesce(argument.port_name.value, "https") + action = "keep" + } + + // set the namespace + rule { + action = "replace" + source_labels = ["__meta_kubernetes_namespace"] + target_label = "namespace" + } + + // set the service_name + rule { + action = "replace" + source_labels = ["__meta_kubernetes_service_name"] + target_label = "service" + } + + // set the app name if specified as metadata labels "app:" or "app.kubernetes.io/name:" or "k8s-app:" + rule { + action = "replace" + source_labels = [ + "__meta_kubernetes_service_label_app_kubernetes_io_name", + "__meta_kubernetes_service_label_k8s_app", + "__meta_kubernetes_service_label_app", + ] + separator = ";" + regex = "^(?:;*)?([^;]+).*$" + replacement = "$1" + target_label = "app" + } + + // set the cluster label + rule { + action = "replace" + replacement = argument.cluster.value + target_label = "cluster" + } + + // set a source label + rule { + action = "replace" + replacement = "kubernetes" + target_label = "source" + } + } + + /***************************************************************** + *
Prometheus Scrape Labels Targets + *****************************************************************/ + prometheus.scrape "apiserver" { + targets = discovery.relabel.apiserver.output + + job_name = coalesce(argument.job_label.value, "integrations/kubernetes/apiserver") + scheme = "https" + scrape_interval = coalesce(argument.scrape_interval.value, "60s") + scrape_timeout = coalesce(argument.scrape_timeout.value, "10s") + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + + tls_config { + ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + insecure_skip_verify = false + server_name = "kubernetes" + } + + clustering { + enabled = true + } + + forward_to = [prometheus.relabel.apiserver.receiver] + } + + /******************************************** + * Prometheus Metric Relabelings (post-scrape) + ********************************************/ + prometheus.relabel "apiserver" { + forward_to = argument.forward_to.value + max_cache_size = coalesce(argument.max_cache_size.value, 100000) + + // drop metrics that match the drop_metrics regex + rule { + source_labels = ["__name__"] + regex = coalesce(argument.drop_metrics.value, 
"(((go|process)_.+)|kubelet_node_name|kubelet_(pod_(worker|start)_latency_microseconds|cgroup_manager_latency_microseconds|pleg_relist_(latency|interval)_microseconds|runtime_operations(_latency_microseconds|_errors)?|eviction_stats_age_microseconds|device_plugin_(registration_count|alloc_latency_microseconds)|network_plugin_operations_latency_microseconds)|scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_(predicate|priority|preemption)_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)|apiserver_(request_(count|latencies(_summary)?)|dropped_requests|storage_(data_key_generation|transformation_(failures_total|latencies_microseconds))|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers)|kubelet_docker_(operations(_latency_microseconds|_errors|_timeout)?)|reflector_(items_per_(list|watch)|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)|etcd_(helper_(cache_(hit|miss)_count|cache_entry_count|object_counts)|request_(cache_(get|add)_latencies_summary|latencies_summary)|debugging.*|disk.*|server.*)|transformation_(latencies_microseconds|failures_total)|(admission_quota_controller|APIServiceOpenAPIAggregationControllerQueue1|APIServiceRegistrationController|autoregister|AvailableConditionController|crd_(autoregistration_controller|Establishing|finalizer|naming_condition_controller|openapi_controller)|DiscoveryController|non_structural_schema_condition_controller|kubeproxy_sync_proxy_rules|rest_client_request_latency|storage_operation_(errors_total|status_count))(_.*)|apiserver_admission_(controller_admission|step_admission)_latencies_seconds_.*)") + action = "drop" + } + + // drop metrics whose name and le label match the drop_les regex + rule { + source_labels = [ + "__name__", + "le", + ] + regex = coalesce(argument.drop_les.value, 
"apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)") + action = "drop" + } + + // keep only metrics that match the keep_metrics regex + rule { + source_labels = ["__name__"] + regex = coalesce(argument.keep_metrics.value, "(.+)") + action = "keep" + } + } +} diff --git a/alloy-modules/kubernetes/jobs/kube-state-metrics.alloy b/alloy-modules/kubernetes/jobs/kube-state-metrics.alloy new file mode 100644 index 00000000..df48b727 --- /dev/null +++ b/alloy-modules/kubernetes/jobs/kube-state-metrics.alloy @@ -0,0 +1,153 @@ +/* +Module Components: kube_state_metrics +Description: kubernetes kube_state_metrics Metrics Scrape + +*/ + +declare "kube_state_metrics_scrape" { + + /******************************************** + * ARGUMENTS + ********************************************/ + argument "forward_to" { + comment = "Must be a list(MetricsReceiver) where collected metrics should be forwarded to" + } + + argument "cluster" { } + + argument "namespaces" { + comment = "The namespaces to look for targets in (default: [] is all namespaces)" + optional = true + } + + argument "field_selectors" { + // Docs: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ + comment = "The label selectors to use to find matching targets (default: [])" + optional = true + } + + argument "label_selectors" { + // Docs: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ + comment = "The label selectors to use to find matching targets (default: [\"app.kubernetes.io/name=kube-state-metrics\"])" + optional = true + } + + argument "port_name" { + comment = "The of the port to scrape metrics from (default: http)" + optional = true + } + + argument "job_label" { + comment = "The job label to add for all kube_state_metrics metrics (default: integrations/kubernetes/kube-state-metrics)" + optional = true + } + + argument "keep_metrics" { + comment = "A regex of metrics to keep 
(default: see below)" + optional = true + } + + argument "drop_metrics" { + comment = "A regular expression of metrics to drop (default: see below)" + optional = true + } + + argument "scrape_interval" { + comment = "How often to scrape metrics from the targets (default: 60s)" + optional = true + } + + argument "scrape_timeout" { + comment = "How long before a scrape times out (default: 10s)" + optional = true + } + + argument "max_cache_size" { + comment = "The maximum number of elements to hold in the relabeling cache (default: 100000). This should be at least 2x-5x your largest scrape target or samples appended rate." + optional = true + } + + /***************************************************************** + * Targets From Service Discovery + *****************************************************************/ + discovery.kubernetes "kube_state_metrics" { + role = "service" + + selectors { + role = "service" + field = join(coalesce(argument.field_selectors.value, []), ",") + label = join(coalesce(argument.label_selectors.value, ["app.kubernetes.io/name=kube-state-metrics"]), ",") + } + + namespaces { + names = coalesce(argument.namespaces.value, []) + } + } + + /***************************************************************** + * Discovery Relabelings (pre-scrape) + *****************************************************************/ + discovery.relabel "kube_state_metrics" { + targets = discovery.kubernetes.kube_state_metrics.targets + + // only keep targets with a matching port name + rule { + source_labels = ["__meta_kubernetes_service_port_name"] + regex = coalesce(argument.port_name.value, "http") + action = "keep" + } + + // set the cluster label + rule { + action = "replace" + replacement = argument.cluster.value + target_label = "cluster" + } + + // set a source label + rule { + action = "replace" + replacement = "kubernetes" + target_label = "source" + } + } + + /***************************************************************** + * Prometheus Scrape 
Labels Targets + *****************************************************************/ + prometheus.scrape "kube_state_metrics" { + targets = discovery.relabel.kube_state_metrics.output + + job_name = coalesce(argument.job_label.value, "integrations/kubernetes/kube-state-metrics") + scrape_interval = coalesce(argument.scrape_interval.value, "60s") + scrape_timeout = coalesce(argument.scrape_timeout.value, "10s") + + clustering { + enabled = true + } + + forward_to = [prometheus.relabel.kube_state_metrics.receiver] + } + + /******************************************** + * Prometheus Metric Relabelings (post-scrape) + ********************************************/ + prometheus.relabel "kube_state_metrics" { + forward_to = argument.forward_to.value + max_cache_size = coalesce(argument.max_cache_size.value, 100000) + + // drop metrics that match the drop_metrics regex + rule { + source_labels = ["__name__"] + regex = coalesce(argument.drop_metrics.value, "(^(go|process)_.+$)") + action = "drop" + } + + // keep only metrics that match the keep_metrics regex + rule { + source_labels = ["__name__"] + regex = coalesce(argument.keep_metrics.value, "(up|kube_(daemonset.*|deployment_(metadata_generation|spec_replicas|status_(observed_generation|replicas_(available|updated)))|horizontalpodautoscaler_(spec_(max|min)_replicas|status_(current|desired)_replicas)|job.*|namespace_status_phase|node.*|persistentvolumeclaim_resource_requests_storage_bytes|pod_(container_(info|resource_(limits|requests)|status_(last_terminated_reason|restarts_total|waiting_reason))|info|owner|start_time|status_(phase|reason))|replicaset.*|resourcequota|statefulset.*))") + action = "keep" + } + } +} diff --git a/alloy-modules/kubernetes/jobs/kubelet.alloy b/alloy-modules/kubernetes/jobs/kubelet.alloy new file mode 100644 index 00000000..d3e2763a --- /dev/null +++ b/alloy-modules/kubernetes/jobs/kubelet.alloy @@ -0,0 +1,292 @@ +/* +Module Components: kubelet +Description: kubernetes kubelet Metrics Scrape + 
+*/ + +declare "kubelet_metrics_scrape" { + + /******************************************** + * ARGUMENTS + ********************************************/ + argument "forward_to" { + comment = "Must be a list(MetricsReceiver) where collected metrics should be forwarded to" + } + + argument "cluster" { } + + argument "keep_metrics" { + comment = "A regex of metrics to keep (default: see below)" + optional = true + } + + argument "drop_metrics" { + comment = "A regular expression of metrics to drop (default: see below)" + optional = true + } + + argument "scrape_interval" { + comment = "How often to scrape metrics from the targets (default: 60s)" + optional = true + } + + argument "scrape_timeout" { + comment = "How long before a scrape times out (default: 10s)" + optional = true + } + + argument "max_cache_size" { + comment = "The maximum number of elements to hold in the relabeling cache (default: 100000). This should be at least 2x-5x your largest scrape target or samples appended rate." 
+ optional = true + } + + /***************************************************************** + * Targets From Node Discovery + *****************************************************************/ + discovery.kubernetes "node" { + role = "node" + } + + /***************************************************************** + * Discovery Relabelings (pre-scrape) + *****************************************************************/ + discovery.relabel "node" { + targets = discovery.kubernetes.node.targets + + // set the address to use the kubernetes service dns name + rule { + target_label = "__address__" + replacement = "kubernetes.default.svc.cluster.local:443" + } + + // set the node label + rule { + source_labels = ["__meta_kubernetes_node_name"] + target_label = "node" + } + + // set the app name if specified as metadata labels "app:" or "app.kubernetes.io/name:" or "k8s-app:" + rule { + action = "replace" + source_labels = [ + "__meta_kubernetes_service_label_app_kubernetes_io_name", + "__meta_kubernetes_service_label_k8s_app", + "__meta_kubernetes_service_label_app", + ] + separator = ";" + regex = "^(?:;*)?([^;]+).*$" + replacement = "$1" + target_label = "app" + } + + // set the cluster label + rule { + action = "replace" + replacement = argument.cluster.value + target_label = "cluster" + } + + // set a source label + rule { + action = "replace" + replacement = "kubernetes" + target_label = "source" + } + } + + discovery.relabel "kubelet" { + targets = discovery.relabel.node.output + + rule { + target_label = "job" + replacement = "integrations/kubernetes/kubelet" + } + + // set the metrics path to use the proxy path to the nodes kubelet metrics endpoint + rule { + source_labels = ["__meta_kubernetes_node_name"] + regex = "(.+)" + replacement = "/api/v1/nodes/${1}/proxy/metrics" + target_label = "__metrics_path__" + } + } + + discovery.relabel "resources" { + targets = discovery.relabel.node.output + + rule { + target_label = "job" + replacement =
"integrations/kubernetes/kube-resources" + } + + // set the metrics path to use the proxy path to the nodes kubelet metrics endpoint + rule { + source_labels = ["__meta_kubernetes_node_name"] + regex = "(.+)" + replacement = "/api/v1/nodes/${1}/proxy/metrics/resource" + target_label = "__metrics_path__" + } + } + + discovery.relabel "probes" { + targets = discovery.relabel.node.output + + rule { + target_label = "job" + replacement = "integrations/kubernetes/kube-probes" + } + + // set the metrics path to use the proxy path to the nodes kubelet metrics endpoint + rule { + source_labels = ["__meta_kubernetes_node_name"] + regex = "(.+)" + replacement = "/api/v1/nodes/${1}/proxy/metrics/probes" + target_label = "__metrics_path__" + } + } + + discovery.relabel "cadvisor" { + targets = discovery.relabel.node.output + + rule { + target_label = "job" + replacement = "integrations/kubernetes/cadvisor" + } + + // set the metrics path to use the proxy path to the nodes kubelet metrics endpoint + rule { + source_labels = ["__meta_kubernetes_node_name"] + regex = "(.+)" + replacement = "/api/v1/nodes/${1}/proxy/metrics/cadvisor" + target_label = "__metrics_path__" + } + } + + /***************************************************************** + * Prometheus Scrape Labels Targets + *****************************************************************/ + prometheus.scrape "kubelet" { + targets = concat( + discovery.relabel.kubelet.output, + discovery.relabel.resources.output, + discovery.relabel.probes.output, + discovery.relabel.cadvisor.output, + ) + + scheme = "https" + scrape_interval = coalesce(argument.scrape_interval.value, "60s") + scrape_timeout = coalesce(argument.scrape_timeout.value, "10s") + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + + tls_config { + ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + insecure_skip_verify = false + server_name = "kubernetes" + } + + clustering { + enabled = true + } + + forward_to = 
[prometheus.relabel.kubelet.receiver] + } + + /******************************************** + * Prometheus Metric Relabelings (post-scrape) + ********************************************/ + prometheus.relabel "kubelet" { + forward_to = argument.forward_to.value + max_cache_size = coalesce(argument.max_cache_size.value, 100000) + + // drop metrics that match the drop_metrics regex + rule { + source_labels = ["__name__"] + regex = coalesce(argument.drop_metrics.value, "(^(go|process)_.+$)") + action = "drop" + } + + // keep only metrics that match the keep_metrics regex + rule { + source_labels = ["__name__"] + regex = coalesce(argument.keep_metrics.value, "(.+)") + action = "keep" + } + // Drop empty container labels, addressing https://github.com/google/cadvisor/issues/2688 + rule { + source_labels = ["__name__", "container"] + separator = "@" + regex = "(container_cpu_.*|container_fs_.*|container_memory_.*)@" + action = "drop" + } + + // Drop empty image labels, addressing https://github.com/google/cadvisor/issues/2688 + rule { + source_labels = ["__name__", "image"] + separator = "@" + regex = "(container_cpu_.*|container_fs_.*|container_memory_.*|container_network_.*)@" + action = "drop" + } + + // Normalizing unimportant labels (not deleting to continue satisfying