Skip to content

Commit

Permalink
Merge pull request #119 from qclaogui:k8s-jobs
Browse files Browse the repository at this point in the history
kubernetes jobs Metrics Scrape
  • Loading branch information
qclaogui authored Apr 26, 2024
2 parents 6f3d208 + d01e2c0 commit 42386f3
Show file tree
Hide file tree
Showing 97 changed files with 106,425 additions and 51,515 deletions.
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -212,10 +212,12 @@ manifests-common: $(KUSTOMIZE)
@$(KUSTOMIZE) build --enable-helm kubernetes/common/gateway > kubernetes/common/gateway/manifests/k8s-all-in-one.yaml
@$(KUSTOMIZE) build --enable-helm kubernetes/common/grafana > kubernetes/common/grafana/manifests/k8s-all-in-one.yaml
@$(KUSTOMIZE) build --enable-helm kubernetes/common/kube-prometheus-stack > kubernetes/common/kube-prometheus-stack/manifests/k8s-all-in-one.yaml
@$(KUSTOMIZE) build --enable-helm kubernetes/common/kube-state-metrics > kubernetes/common/kube-state-metrics/manifests/k8s-all-in-one.yaml
@$(KUSTOMIZE) build --enable-helm kubernetes/common/memcached > kubernetes/common/memcached/manifests/k8s-all-in-one.yaml
@$(KUSTOMIZE) build --enable-helm kubernetes/common/minio-operator > kubernetes/common/minio-operator/manifests/k8s-all-in-one.yaml
@$(KUSTOMIZE) build --enable-helm kubernetes/common/minio-tenant > kubernetes/common/minio-tenant/manifests/k8s-all-in-one.yaml
@$(KUSTOMIZE) build --enable-helm kubernetes/common/mysql > kubernetes/common/mysql/manifests/k8s-all-in-one.yaml
@$(KUSTOMIZE) build --enable-helm kubernetes/common/prometheus-node-exporter > kubernetes/common/prometheus-node-exporter/manifests/k8s-all-in-one.yaml
@$(KUSTOMIZE) build --enable-helm kubernetes/common/prometheus-operator-crds > kubernetes/common/prometheus-operator-crds/manifests/k8s-all-in-one.yaml
@$(KUSTOMIZE) build --enable-helm kubernetes/common/rancher-pushprox > kubernetes/common/rancher-pushprox/manifests/k8s-all-in-one.yaml
@$(KUSTOMIZE) build --enable-helm kubernetes/common/redis > kubernetes/common/redis/manifests/k8s-all-in-one.yaml
Expand Down
6 changes: 3 additions & 3 deletions alloy-modules/kubernetes/integrations/mysql.alloy
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ declare "mysql_metrics_scrape" {
}

argument "namespace" {
comment = "kubernetes secret name (default: monitoring-system)"
comment = "kubernetes secret namespace (default: monitoring-system)"
optional = true
}

Expand All @@ -27,8 +27,8 @@ declare "mysql_metrics_scrape" {
}

argument "keep_metrics" {
comment = "A regex of metrics to keep (default: see below)"
optional = true
default = "(up|instance:mysql_heartbeat_lag_seconds|instance:mysql_slave_lag_seconds|mysql_global_status_aborted_clients|mysql_global_status_aborted_connects|mysql_global_status_buffer_pool_pages|mysql_global_status_bytes_received|mysql_global_status_bytes_sent|mysql_global_status_commands_total|mysql_global_status_created_tmp_disk_tables|mysql_global_status_created_tmp_files|mysql_global_status_created_tmp_tables|mysql_global_status_handlers_total|mysql_global_status_innodb_log_waits|mysql_global_status_innodb_mem_adaptive_hash|mysql_global_status_innodb_mem_dictionary|mysql_global_status_innodb_num_open_files|mysql_global_status_innodb_page_size|mysql_global_status_max_used_connections|mysql_global_status_open_files|mysql_global_status_open_table_definitions|mysql_global_status_open_tables|mysql_global_status_opened_files|mysql_global_status_opened_table_definitions|mysql_global_status_opened_tables|mysql_global_status_qcache_free_memory|mysql_global_status_qcache_hits|mysql_global_status_qcache_inserts|mysql_global_status_qcache_lowmem_prunes|mysql_global_status_qcache_not_cached|mysql_global_status_qcache_queries_in_cache|mysql_global_status_queries|mysql_global_status_questions|mysql_global_status_select_full_join|mysql_global_status_select_full_range_join|mysql_global_status_select_range|mysql_global_status_select_range_check|mysql_global_status_select_scan|mysql_global_status_slow_queries|mysql_global_status_sort_merge_passes|mysql_global_status_sort_range|mysql_global_status_sort_rows|mysql_global_status_sort_scan|mysql_global_status_table_locks_immediate|mysql_global_status_table_locks_waited|mysql_global_status_table_open_cache_hits|mysql_global_status_table_open_cache_misses|mysql_global_status_table_open_cache_overflows|mysql_global_status_threads_cached|mysql_global_status_threads_connected|mysql_global_status_threads_created|mysql_global_status_threads_running|mysql_global_status_uptime|mysql_global_status_wsrep_local_recv_queue|mysql_global_status_wsr
ep_local_state|mysql_global_status_wsrep_ready|mysql_global_variables_innodb_additional_mem_pool_size|mysql_global_variables_innodb_buffer_pool_size|mysql_global_variables_innodb_log_buffer_size|mysql_global_variables_key_buffer_size|mysql_global_variables_max_connections|mysql_global_variables_open_files_limit|mysql_global_variables_query_cache_size|mysql_global_variables_table_definition_cache|mysql_global_variables_table_open_cache|mysql_global_variables_thread_cache_size|mysql_global_variables_tokudb_cache_size|mysql_global_variables_wsrep_desync|mysql_heartbeat_now_timestamp_seconds|mysql_heartbeat_stored_timestamp_seconds|mysql_info_schema_processlist_threads|mysql_slave_status_seconds_behind_master|mysql_slave_status_slave_io_running|mysql_slave_status_slave_sql_running|mysql_slave_status_sql_delay|mysql_up)"
}

argument "scrape_interval" {
Expand Down Expand Up @@ -100,7 +100,7 @@ declare "mysql_metrics_scrape" {
// keep only metrics that match the keep_metrics regex
rule {
source_labels = ["__name__"]
regex = argument.keep_metrics.value
regex = coalesce(argument.keep_metrics.value, "(up|instance:mysql_heartbeat_lag_seconds|instance:mysql_slave_lag_seconds|mysql_global_status_aborted_clients|mysql_global_status_aborted_connects|mysql_global_status_buffer_pool_pages|mysql_global_status_bytes_received|mysql_global_status_bytes_sent|mysql_global_status_commands_total|mysql_global_status_created_tmp_disk_tables|mysql_global_status_created_tmp_files|mysql_global_status_created_tmp_tables|mysql_global_status_handlers_total|mysql_global_status_innodb_log_waits|mysql_global_status_innodb_mem_adaptive_hash|mysql_global_status_innodb_mem_dictionary|mysql_global_status_innodb_num_open_files|mysql_global_status_innodb_page_size|mysql_global_status_max_used_connections|mysql_global_status_open_files|mysql_global_status_open_table_definitions|mysql_global_status_open_tables|mysql_global_status_opened_files|mysql_global_status_opened_table_definitions|mysql_global_status_opened_tables|mysql_global_status_qcache_free_memory|mysql_global_status_qcache_hits|mysql_global_status_qcache_inserts|mysql_global_status_qcache_lowmem_prunes|mysql_global_status_qcache_not_cached|mysql_global_status_qcache_queries_in_cache|mysql_global_status_queries|mysql_global_status_questions|mysql_global_status_select_full_join|mysql_global_status_select_full_range_join|mysql_global_status_select_range|mysql_global_status_select_range_check|mysql_global_status_select_scan|mysql_global_status_slow_queries|mysql_global_status_sort_merge_passes|mysql_global_status_sort_range|mysql_global_status_sort_rows|mysql_global_status_sort_scan|mysql_global_status_table_locks_immediate|mysql_global_status_table_locks_waited|mysql_global_status_table_open_cache_hits|mysql_global_status_table_open_cache_misses|mysql_global_status_table_open_cache_overflows|mysql_global_status_threads_cached|mysql_global_status_threads_connected|mysql_global_status_threads_created|mysql_global_status_threads_running|mysql_global_status_uptime|mysql_global_status_wsrep_loca
l_recv_queue|mysql_global_status_wsrep_local_state|mysql_global_status_wsrep_ready|mysql_global_variables_innodb_additional_mem_pool_size|mysql_global_variables_innodb_buffer_pool_size|mysql_global_variables_innodb_log_buffer_size|mysql_global_variables_key_buffer_size|mysql_global_variables_max_connections|mysql_global_variables_open_files_limit|mysql_global_variables_query_cache_size|mysql_global_variables_table_definition_cache|mysql_global_variables_table_open_cache|mysql_global_variables_thread_cache_size|mysql_global_variables_tokudb_cache_size|mysql_global_variables_wsrep_desync|mysql_heartbeat_now_timestamp_seconds|mysql_heartbeat_stored_timestamp_seconds|mysql_info_schema_processlist_threads|mysql_slave_status_seconds_behind_master|mysql_slave_status_slave_io_running|mysql_slave_status_slave_sql_running|mysql_slave_status_sql_delay|mysql_up)")
action = "keep"
}
}
Expand Down
17 changes: 17 additions & 0 deletions alloy-modules/kubernetes/jobs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Module Components

Kubernetes Jobs Module Components

## Components

- [`apiserver`](#apiserver_metrics_scrape)

### `apiserver_metrics_scrape`

Scrapes metrics from the Kubernetes API server.

***Arguments***

***Exports***

***Example***
206 changes: 206 additions & 0 deletions alloy-modules/kubernetes/jobs/apiserver.alloy
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
/*
Module Components: apiserver
Description: Kubernetes API server metrics scrape

*/

// apiserver_metrics_scrape discovers the Kubernetes API server service,
// relabels the discovered targets, scrapes them over HTTPS using the pod's
// service-account credentials, filters the scraped series and forwards the
// result to the receivers given in `forward_to`.
//
// Pipeline:
//   discovery.kubernetes -> discovery.relabel -> prometheus.scrape -> prometheus.relabel -> forward_to
declare "apiserver_metrics_scrape" {

  /********************************************
  * ARGUMENTS
  ********************************************/
  argument "forward_to" {
    comment = "Must be a list(MetricsReceiver) where collected metrics should be forwarded to"
  }

  // Required: used verbatim as the value of the `cluster` target label.
  argument "cluster" {
    comment = "The value of the cluster label to add to all targets"
  }

  argument "namespaces" {
    comment  = "The namespaces to look for targets in (default: [\"default\"])"
    optional = true
  }

  argument "field_selectors" {
    // Docs: https://kubernetes.io/docs/concepts/overview/working-with-objects/field-selectors/
    comment  = "The field selectors to use to find matching targets (default: [\"metadata.name=kubernetes\"])"
    optional = true
  }

  argument "label_selectors" {
    // Docs: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
    comment  = "The label selectors to use to find matching targets (default: [])"
    optional = true
  }

  argument "port_name" {
    comment  = "A regex matched against the service port name; only matching targets are kept (default: https)"
    optional = true
  }

  argument "job_label" {
    comment  = "The job label to add for all kube-apiserver metrics (default: integrations/kubernetes/apiserver)"
    optional = true
  }

  argument "keep_metrics" {
    comment  = "A regex of metrics to keep (default: (.+), which keeps every metric that was not dropped)"
    optional = true
  }

  // drop metrics and les from kube-prometheus
  // https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/kubernetesControlPlane-serviceMonitorApiserver.yaml
  argument "drop_metrics" {
    comment  = "A regular expression of metric names to drop (default: see the prometheus.relabel rules below)"
    optional = true
  }

  argument "drop_les" {
    comment  = "A regular expression matched against \"<metric name>;<le label value>\" of series to drop (default: see the prometheus.relabel rules below)"
    optional = true
  }

  argument "scrape_interval" {
    comment  = "How often to scrape metrics from the targets (default: 60s)"
    optional = true
  }

  argument "scrape_timeout" {
    comment  = "How long before a scrape times out (default: 10s)"
    optional = true
  }

  argument "max_cache_size" {
    comment  = "The maximum number of elements to hold in the relabeling cache (default: 100000). This should be at least 2x-5x your largest scrape target or samples appended rate."
    optional = true
  }

  /*****************************************************************
  * Targets From Kubernetes Discovery
  *****************************************************************/
  discovery.kubernetes "apiserver" {
    role = "service"

    // Multiple selectors are combined into one comma-separated selector string.
    selectors {
      role  = "service"
      field = join(coalesce(argument.field_selectors.value, ["metadata.name=kubernetes"]), ",")
      label = join(coalesce(argument.label_selectors.value, []), ",")
    }

    namespaces {
      names = coalesce(argument.namespaces.value, ["default"])
    }
  }

  /*****************************************************************
  * Discovery Relabelings (pre-scrape)
  *****************************************************************/
  discovery.relabel "apiserver" {
    targets = discovery.kubernetes.apiserver.targets

    // only keep targets with a matching port name
    rule {
      source_labels = ["__meta_kubernetes_service_port_name"]
      regex         = coalesce(argument.port_name.value, "https")
      action        = "keep"
    }

    // set the namespace
    rule {
      action        = "replace"
      source_labels = ["__meta_kubernetes_namespace"]
      target_label  = "namespace"
    }

    // set the service_name
    rule {
      action        = "replace"
      source_labels = ["__meta_kubernetes_service_name"]
      target_label  = "service"
    }

    // set the app name if specified as metadata labels "app:" or "app.kubernetes.io/name:" or "k8s-app:"
    // The three labels are joined with ";" and the regex captures the first
    // non-empty one, so the order of source_labels is the precedence order.
    rule {
      action        = "replace"
      source_labels = [
        "__meta_kubernetes_service_label_app_kubernetes_io_name",
        "__meta_kubernetes_service_label_k8s_app",
        "__meta_kubernetes_service_label_app",
      ]
      separator     = ";"
      regex         = "^(?:;*)?([^;]+).*$"
      replacement   = "$1"
      target_label  = "app"
    }

    // set the cluster label
    rule {
      action       = "replace"
      replacement  = argument.cluster.value
      target_label = "cluster"
    }

    // set a source label
    rule {
      action       = "replace"
      replacement  = "kubernetes"
      target_label = "source"
    }
  }

  /*****************************************************************
  * Prometheus Scrape Labels Targets
  *****************************************************************/
  prometheus.scrape "apiserver" {
    targets = discovery.relabel.apiserver.output

    job_name          = coalesce(argument.job_label.value, "integrations/kubernetes/apiserver")
    scheme            = "https"
    scrape_interval   = coalesce(argument.scrape_interval.value, "60s")
    scrape_timeout    = coalesce(argument.scrape_timeout.value, "10s")
    // Authenticate with the service-account token mounted into the pod.
    bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"

    // Verify the API server certificate against the in-cluster CA; the
    // expected server name is fixed to "kubernetes".
    tls_config {
      ca_file              = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
      insecure_skip_verify = false
      server_name          = "kubernetes"
    }

    clustering {
      enabled = true
    }

    forward_to = [prometheus.relabel.apiserver.receiver]
  }

  /********************************************
  * Prometheus Metric Relabelings (post-scrape)
  ********************************************/
  // Rules run in order: drop by name, drop by (name, le), then keep by name.
  prometheus.relabel "apiserver" {
    forward_to     = argument.forward_to.value
    max_cache_size = coalesce(argument.max_cache_size.value, 100000)

    // drop metrics that match the drop_metrics regex
    rule {
      source_labels = ["__name__"]
      regex         = coalesce(argument.drop_metrics.value, "(((go|process)_.+)|kubelet_node_name|kubelet_(pod_(worker|start)_latency_microseconds|cgroup_manager_latency_microseconds|pleg_relist_(latency|interval)_microseconds|runtime_operations(_latency_microseconds|_errors)?|eviction_stats_age_microseconds|device_plugin_(registration_count|alloc_latency_microseconds)|network_plugin_operations_latency_microseconds)|scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_(predicate|priority|preemption)_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)|apiserver_(request_(count|latencies(_summary)?)|dropped_requests|storage_(data_key_generation|transformation_(failures_total|latencies_microseconds))|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers)|kubelet_docker_(operations(_latency_microseconds|_errors|_timeout)?)|reflector_(items_per_(list|watch)|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)|etcd_(helper_(cache_(hit|miss)_count|cache_entry_count|object_counts)|request_(cache_(get|add)_latencies_summary|latencies_summary)|debugging.*|disk.*|server.*)|transformation_(latencies_microseconds|failures_total)|(admission_quota_controller|APIServiceOpenAPIAggregationControllerQueue1|APIServiceRegistrationController|autoregister|AvailableConditionController|crd_(autoregistration_controller|Establishing|finalizer|naming_condition_controller|openapi_controller)|DiscoveryController|non_structural_schema_condition_controller|kubeproxy_sync_proxy_rules|rest_client_request_latency|storage_operation_(errors_total|status_count))(_.*)|apiserver_admission_(controller_admission|step_admission)_latencies_seconds_.*)")
      action        = "drop"
    }

    // drop metrics whose name and le label match the drop_les regex
    // No separator is set on this rule, and the default regex expects the
    // metric name and le value to be joined by ";".
    rule {
      source_labels = [
        "__name__",
        "le",
      ]
      regex         = coalesce(argument.drop_les.value, "apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)")
      action        = "drop"
    }

    // keep only metrics that match the keep_metrics regex
    rule {
      source_labels = ["__name__"]
      regex         = coalesce(argument.keep_metrics.value, "(.+)")
      action        = "keep"
    }
  }
}
Loading

0 comments on commit 42386f3

Please sign in to comment.