From 99cfbaf0f79975d067647b7f7faa25784cc750fe Mon Sep 17 00:00:00 2001
From: Jonathan Innis
Date: Thu, 7 Dec 2023 18:54:18 -0800
Subject: [PATCH] chore: Update the `build_info` metric label naming and tests (#5275)

---
 charts/karpenter/README.md                  |   2 +-
 go.mod                                      |   2 +-
 hack/docs/metrics_gen_docs.go               |  46 ++--
 pkg/operator/operator.go                    |   2 +-
 pkg/operator/suite_test.go                  |  16 +-
 .../content/en/preview/reference/metrics.md | 205 +++++++++---------
 6 files changed, 147 insertions(+), 126 deletions(-)

diff --git a/charts/karpenter/README.md b/charts/karpenter/README.md
index de421b4243e1..8e7feeffbd1c 100644
--- a/charts/karpenter/README.md
+++ b/charts/karpenter/README.md
@@ -34,7 +34,7 @@ helm upgrade --install --namespace karpenter --create-namespace \
 | controller.envFrom | list | `[]` | |
 | controller.extraVolumeMounts | list | `[]` | Additional volumeMounts for the controller pod. |
 | controller.healthProbe.port | int | `8081` | The container port to use for http health probe. |
-| controller.image.digest | string | `"sha256:afa0d0fd5ac375859dc3d239ec992f197cdf01f6c8e3413e3845a43c2434621e"` | SHA256 digest of the controller image. |
+| controller.image.digest | string | `"sha256:5e5f59f74d86ff7f13d7d80b89afff8c661cb4e3265f2fdda95b76dd9c838cc1"` | SHA256 digest of the controller image. |
 | controller.image.repository | string | `"public.ecr.aws/karpenter/controller"` | Repository path to the controller image. |
 | controller.image.tag | string | `"v0.33.0"` | Tag of the controller image. |
 | controller.metrics.port | int | `8000` | The container port to use for metrics. |
diff --git a/go.mod b/go.mod
index 4f359c3c4cab..77fbedbaf5cd 100644
--- a/go.mod
+++ b/go.mod
@@ -15,6 +15,7 @@ require (
 	github.com/patrickmn/go-cache v2.1.0+incompatible
 	github.com/pelletier/go-toml/v2 v2.1.0
 	github.com/prometheus/client_golang v1.17.0
+	github.com/prometheus/client_model v0.5.0
 	github.com/samber/lo v1.39.0
 	go.uber.org/multierr v1.11.0
 	go.uber.org/zap v1.26.0
@@ -79,7 +80,6 @@ require (
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/olekukonko/tablewriter v0.0.5 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
-	github.com/prometheus/client_model v0.5.0 // indirect
 	github.com/prometheus/common v0.44.0 // indirect
 	github.com/prometheus/procfs v0.12.0 // indirect
 	github.com/prometheus/statsd_exporter v0.24.0 // indirect
diff --git a/hack/docs/metrics_gen_docs.go b/hack/docs/metrics_gen_docs.go
index d01e362f5654..5522034f0001 100644
--- a/hack/docs/metrics_gen_docs.go
+++ b/hack/docs/metrics_gen_docs.go
@@ -56,6 +56,14 @@ func main() {
 		packages := getPackages(flag.Arg(i))
 		allMetrics = append(allMetrics, getMetricsFromPackages(packages...)...)
 	}
+	// Controller Runtime naming is different in that they don't specify a namespace or subsystem
+	// Getting the metrics requires special parsing logic
+	for i := range allMetrics {
+		if allMetrics[i].subsystem == "" && strings.HasPrefix(allMetrics[i].name, "controller_runtime_") {
+			allMetrics[i].subsystem = "controller_runtime"
+			allMetrics[i].name = strings.TrimPrefix(allMetrics[i].name, "controller_runtime_")
+		}
+	}
 	sort.Slice(allMetrics, bySubsystem(allMetrics))
 	outputFileName := flag.Arg(flag.NArg() - 1)
@@ -80,19 +88,15 @@ description: >
 	previousSubsystem := ""
 	for _, metric := range allMetrics {
-		// Controller Runtime naming is different in that they don't specify a namespace or subsystem
-		// Getting the metrics requires special parsing logic
-		if metric.subsystem == "" && strings.HasPrefix(metric.name, "controller_runtime_") {
-			metric.subsystem = "controller_runtime"
-			metric.name = strings.TrimPrefix(metric.name, "controller_runtime_")
-		}
 		if metric.subsystem != previousSubsystem {
-			subsystemTitle := strings.Join(lo.Map(strings.Split(metric.subsystem, "_"), func(s string, _ int) string {
-				return fmt.Sprintf("%s%s", strings.ToTitle(s[0:1]), s[1:])
-			}), " ")
-			fmt.Fprintf(f, "## %s Metrics\n", subsystemTitle)
+			if metric.subsystem != "" {
+				subsystemTitle := strings.Join(lo.Map(strings.Split(metric.subsystem, "_"), func(s string, _ int) string {
+					return fmt.Sprintf("%s%s", strings.ToTitle(s[0:1]), s[1:])
+				}), " ")
+				fmt.Fprintf(f, "## %s Metrics\n", subsystemTitle)
+				fmt.Fprintln(f)
+			}
 			previousSubsystem = metric.subsystem
-			fmt.Fprintln(f)
 		}
 		fmt.Fprintf(f, "### `%s`\n", metric.qualifiedName())
 		fmt.Fprintf(f, "%s\n", metric.help)
@@ -155,19 +159,23 @@ func getMetricsFromPackages(packages ...*ast.Package) []metricInfo {
 }

 func bySubsystem(metrics []metricInfo) func(i int, j int) bool {
-	subSystemSortOrder := map[string]int{}
-	subSystemSortOrder["nodepool"] = 1
-	subSystemSortOrder["nodes"] = 2
-	subSystemSortOrder["pods"] = 3
-	subSystemSortOrder["cloudprovider"] = 4
-	subSystemSortOrder["cloudprovider_batcher"] = 5
+	// Higher ordering comes first. If a value isn't designated here then the subsystem will be given a default of 0.
+	// Metrics without a subsystem come first since there is no designation for the bucket they fall under
+	subSystemSortOrder := map[string]int{
+		"": 100,
+		"nodepool": 10,
+		"nodeclaim": 9,
+		"nodes": 8,
+		"pods": 7,
+	}
+
 	return func(i, j int) bool {
 		lhs := metrics[i]
 		rhs := metrics[j]
 		if subSystemSortOrder[lhs.subsystem] != subSystemSortOrder[rhs.subsystem] {
-			return subSystemSortOrder[lhs.subsystem] < subSystemSortOrder[rhs.subsystem]
+			return subSystemSortOrder[lhs.subsystem] > subSystemSortOrder[rhs.subsystem]
 		}
-		return lhs.qualifiedName() < rhs.qualifiedName()
+		return lhs.qualifiedName() > rhs.qualifiedName()
 	}
 }
diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go
index 30354a86cb36..bc2bdfd3b329 100644
--- a/pkg/operator/operator.go
+++ b/pkg/operator/operator.go
@@ -77,7 +77,7 @@ var BuildInfo = prometheus.NewGaugeVec(
 		Name: "build_info",
 		Help: "A metric with a constant '1' value labeled by version from which karpenter was built.",
 	},
-	[]string{"version", "goversion", "sha"})
+	[]string{"version", "goversion", "commit"})

 func init() {
 	lo.Must0(apis.AddToScheme(scheme.Scheme))
diff --git a/pkg/operator/suite_test.go b/pkg/operator/suite_test.go
index 66d5259a28f0..899bf6bd70b0 100644
--- a/pkg/operator/suite_test.go
+++ b/pkg/operator/suite_test.go
@@ -22,15 +22,16 @@ import (
 	"github.com/aws/aws-sdk-go/service/eks"
 	"github.com/samber/lo"
+	prometheusmodel "github.com/prometheus/client_model/go"
+
+	"sigs.k8s.io/karpenter/pkg/operator/scheme"
+	coretest "sigs.k8s.io/karpenter/pkg/test"
+
 	"github.com/aws/karpenter-provider-aws/pkg/apis"
 	"github.com/aws/karpenter-provider-aws/pkg/fake"
 	awscontext "github.com/aws/karpenter-provider-aws/pkg/operator"
 	"github.com/aws/karpenter-provider-aws/pkg/operator/options"
 	"github.com/aws/karpenter-provider-aws/pkg/test"

-	"sigs.k8s.io/karpenter/pkg/operator/scheme"
-	coretest "sigs.k8s.io/karpenter/pkg/test"
-
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	. "knative.dev/pkg/logging/testing"
@@ -102,4 +103,13 @@ var _ = Describe("Operator", func() {
 		_, err := awscontext.ResolveClusterEndpoint(ctx, fakeEKSAPI)
 		Expect(err).To(HaveOccurred())
 	})
+	It("should fire a metric with the build_info", func() {
+		m, found := FindMetricWithLabelValues("karpenter_build_info", map[string]string{})
+		Expect(found).To(BeTrue())
+
+		for _, label := range []string{"version", "goversion", "commit"} {
+			_, ok := lo.Find(m.GetLabel(), func(l *prometheusmodel.LabelPair) bool { return lo.FromPtr(l.Name) == label })
+			Expect(ok).To(BeTrue())
+		}
+	})
 })
diff --git a/website/content/en/preview/reference/metrics.md b/website/content/en/preview/reference/metrics.md
index cdb7e7fff63c..4c3e2a0fdff3 100644
--- a/website/content/en/preview/reference/metrics.md
+++ b/website/content/en/preview/reference/metrics.md
@@ -8,166 +8,169 @@ description: >
 ---
 Karpenter makes several metrics available in Prometheus format to allow monitoring cluster provisioning status. These metrics are available by default at `karpenter.karpenter.svc.cluster.local:8000/metrics` configurable via the `METRICS_PORT` environment variable documented [here](../settings)
-## Controller Runtime Metrics
+### `karpenter_build_info`
+A metric with a constant '1' value labeled by version from which karpenter was built.
-
-### `controller_runtime_active_workers`
-Number of currently used workers per controller

+## Nodepool Metrics
+
-### `controller_runtime_max_concurrent_reconciles`
-Maximum number of concurrent reconciles per controller
+### `karpenter_nodepool_usage`
+The nodepool usage is the amount of resources that have been provisioned by a particular nodepool. Labeled by nodepool name and resource type.

-### `controller_runtime_reconcile_errors_total`
-Total number of reconciliation errors per controller
+### `karpenter_nodepool_limit`
+The nodepool limits are the limits specified on the nodepool that restrict the quantity of resources provisioned. Labeled by nodepool name and resource type.

-### `controller_runtime_reconcile_time_seconds`
-Length of time per reconciliation per controller
+## Nodes Metrics

-### `controller_runtime_reconcile_total`
-Total number of reconciliations per controller
+### `karpenter_nodes_total_pod_requests`
+Node total pod requests are the resources requested by non-DaemonSet pods bound to nodes.

-## Consistency Metrics
+### `karpenter_nodes_total_pod_limits`
+Node total pod limits are the resources specified by non-DaemonSet pod limits.

-### `karpenter_consistency_errors`
-Number of consistency checks that have failed.
+### `karpenter_nodes_total_daemon_requests`
+Node total daemon requests are the resource requested by DaemonSet pods bound to nodes.

-## Disruption Metrics
+### `karpenter_nodes_total_daemon_limits`
+Node total daemon limits are the resources specified by DaemonSet pod limits.

-### `karpenter_disruption_actions_performed_total`
-Number of disruption actions performed. Labeled by disruption method.
+### `karpenter_nodes_termination_time_seconds`
+The time taken between a node's deletion request and the removal of its finalizer

-### `karpenter_disruption_consolidation_timeouts_total`
-Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type.
+### `karpenter_nodes_terminated`
+Number of nodes terminated in total by Karpenter. Labeled by owning nodepool.

-### `karpenter_disruption_eligible_nodes`
-Number of nodes eligible for disruption by Karpenter. Labeled by disruption method.
+### `karpenter_nodes_system_overhead`
+Node system daemon overhead are the resources reserved for system overhead, the difference between the node's capacity and allocatable values are reported by the status.

-### `karpenter_disruption_evaluation_duration_seconds`
-Duration of the disruption evaluation process in seconds.
+### `karpenter_nodes_leases_deleted`
+Number of deleted leaked leases.

-### `karpenter_disruption_queue_depth`
-The number of commands currently being waited on in the disruption orchestration queue.
+### `karpenter_nodes_created`
+Number of nodes created in total by Karpenter. Labeled by owning nodepool.

-### `karpenter_disruption_replacement_nodeclaim_failures_total`
-The number of times that Karpenter failed to launch a replacement node for disruption. Labeled by disruption method.
+### `karpenter_nodes_allocatable`
+Node allocatable are the resources allocatable by nodes.

-### `karpenter_disruption_replacement_nodeclaim_initialized_seconds`
-Amount of time required for a replacement nodeclaim to become initialized.
+## Pods Metrics

-## Interruption Metrics
+### `karpenter_pods_state`
+Pod state is the current state of pods. This metric can be used several ways as it is labeled by the pod name, namespace, owner, node, nodepool name, zone, architecture, capacity type, instance type and pod phase.
-
-### `karpenter_interruption_actions_performed`
-Number of notification actions performed. Labeled by action

+### `karpenter_pods_startup_time_seconds`
+The time from pod creation until the pod is running.
+
-### `karpenter_interruption_deleted_messages`
-Count of messages deleted from the SQS queue.
+## Provisioner Metrics

-### `karpenter_interruption_message_latency_time_seconds`
-Length of time between message creation in queue and an action taken on the message by the controller.
+### `karpenter_provisioner_scheduling_simulation_duration_seconds`
+Duration of scheduling simulations used for deprovisioning and provisioning in seconds.

-### `karpenter_interruption_received_messages`
-Count of messages received from the SQS queue. Broken down by message type and whether the message was actionable.
+### `karpenter_provisioner_scheduling_duration_seconds`
+Duration of scheduling process in seconds.

 ## Nodeclaims Metrics

-### `karpenter_nodeclaims_created`
-Number of nodeclaims created in total by Karpenter. Labeled by reason the nodeclaim was created and the owning nodepool.
-
-### `karpenter_nodeclaims_disrupted`
-Number of nodeclaims disrupted in total by Karpenter. Labeled by disruption type of the nodeclaim and the owning nodepool.
-
-### `karpenter_nodeclaims_drifted`
-Number of nodeclaims drifted reasons in total by Karpenter. Labeled by drift type of the nodeclaim and the owning nodepool.
-
-### `karpenter_nodeclaims_initialized`
-Number of nodeclaims initialized in total by Karpenter. Labeled by the owning nodepool.
-
-### `karpenter_nodeclaims_launched`
-Number of nodeclaims launched in total by Karpenter. Labeled by the owning nodepool.
+### `karpenter_nodeclaims_terminated`
+Number of nodeclaims terminated in total by Karpenter. Labeled by reason the nodeclaim was terminated and the owning nodepool.

 ### `karpenter_nodeclaims_registered`
 Number of nodeclaims registered in total by Karpenter. Labeled by the owning nodepool.

-### `karpenter_nodeclaims_terminated`
-Number of nodeclaims terminated in total by Karpenter. Labeled by reason the nodeclaim was terminated and the owning nodepool.
-
-## Provisioner Metrics
+### `karpenter_nodeclaims_launched`
+Number of nodeclaims launched in total by Karpenter. Labeled by the owning nodepool.

-### `karpenter_provisioner_scheduling_duration_seconds`
-Duration of scheduling process in seconds.
+### `karpenter_nodeclaims_initialized`
+Number of nodeclaims initialized in total by Karpenter. Labeled by the owning nodepool.

-### `karpenter_provisioner_scheduling_simulation_duration_seconds`
-Duration of scheduling simulations used for deprovisioning and provisioning in seconds.
+### `karpenter_nodeclaims_drifted`
+Number of nodeclaims drifted reasons in total by Karpenter. Labeled by drift type of the nodeclaim and the owning nodepool.

-## Nodepool Metrics
+### `karpenter_nodeclaims_disrupted`
+Number of nodeclaims disrupted in total by Karpenter. Labeled by disruption type of the nodeclaim and the owning nodepool.

-### `karpenter_nodepool_limit`
-The nodepool limits are the limits specified on the nodepool that restrict the quantity of resources provisioned. Labeled by nodepool name and resource type.
+### `karpenter_nodeclaims_created`
+Number of nodeclaims created in total by Karpenter. Labeled by reason the nodeclaim was created and the owning nodepool.

-### `karpenter_nodepool_usage`
-The nodepool usage is the amount of resources that have been provisioned by a particular nodepool. Labeled by nodepool name and resource type.
+## Interruption Metrics

-## Nodes Metrics
+### `karpenter_interruption_received_messages`
+Count of messages received from the SQS queue. Broken down by message type and whether the message was actionable.

-### `karpenter_nodes_allocatable`
-Node allocatable are the resources allocatable by nodes.
+### `karpenter_interruption_message_latency_time_seconds`
+Length of time between message creation in queue and an action taken on the message by the controller.

-### `karpenter_nodes_created`
-Number of nodes created in total by Karpenter. Labeled by owning nodepool.
+### `karpenter_interruption_deleted_messages`
+Count of messages deleted from the SQS queue.

-### `karpenter_nodes_leases_deleted`
-Number of deleted leaked leases.
+### `karpenter_interruption_actions_performed`
+Number of notification actions performed. Labeled by action

-### `karpenter_nodes_system_overhead`
-Node system daemon overhead are the resources reserved for system overhead, the difference between the node's capacity and allocatable values are reported by the status.
+## Disruption Metrics

-### `karpenter_nodes_terminated`
-Number of nodes terminated in total by Karpenter. Labeled by owning nodepool.
+### `karpenter_disruption_replacement_nodeclaim_initialized_seconds`
+Amount of time required for a replacement nodeclaim to become initialized.

-### `karpenter_nodes_termination_time_seconds`
-The time taken between a node's deletion request and the removal of its finalizer
+### `karpenter_disruption_replacement_nodeclaim_failures_total`
+The number of times that Karpenter failed to launch a replacement node for disruption. Labeled by disruption method.

-### `karpenter_nodes_total_daemon_limits`
-Node total daemon limits are the resources specified by DaemonSet pod limits.
+### `karpenter_disruption_queue_depth`
+The number of commands currently being waited on in the disruption orchestration queue.

-### `karpenter_nodes_total_daemon_requests`
-Node total daemon requests are the resource requested by DaemonSet pods bound to nodes.
+### `karpenter_disruption_evaluation_duration_seconds`
+Duration of the disruption evaluation process in seconds.

-### `karpenter_nodes_total_pod_limits`
-Node total pod limits are the resources specified by non-DaemonSet pod limits.
+### `karpenter_disruption_eligible_nodes`
+Number of nodes eligible for disruption by Karpenter. Labeled by disruption method.

-### `karpenter_nodes_total_pod_requests`
-Node total pod requests are the resources requested by non-DaemonSet pods bound to nodes.
+### `karpenter_disruption_consolidation_timeouts_total`
+Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type.

-## Pods Metrics
+### `karpenter_disruption_actions_performed_total`
+Number of disruption actions performed. Labeled by disruption method.

-### `karpenter_pods_startup_time_seconds`
-The time from pod creation until the pod is running.
+## Consistency Metrics

-### `karpenter_pods_state`
-Pod state is the current state of pods. This metric can be used several ways as it is labeled by the pod name, namespace, owner, node, nodepool name, zone, architecture, capacity type, instance type and pod phase.
+### `karpenter_consistency_errors`
+Number of consistency checks that have failed.

 ## Cloudprovider Metrics

-### `karpenter_cloudprovider_duration_seconds`
-Duration of cloud provider method calls. Labeled by the controller, method name and provider.
+### `karpenter_cloudprovider_instance_type_price_estimate`
+Estimated hourly price used when making informed decisions on node cost calculation. This is updated once on startup and then every 12 hours.

-### `karpenter_cloudprovider_errors_total`
-Total number of errors returned from CloudProvider calls.
+### `karpenter_cloudprovider_instance_type_memory_bytes`
+Memory, in bytes, for a given instance type.

 ### `karpenter_cloudprovider_instance_type_cpu_cores`
 VCPUs cores for a given instance type.

-### `karpenter_cloudprovider_instance_type_memory_bytes`
-Memory, in bytes, for a given instance type.
+### `karpenter_cloudprovider_errors_total`
+Total number of errors returned from CloudProvider calls.

-### `karpenter_cloudprovider_instance_type_price_estimate`
-Estimated hourly price used when making informed decisions on node cost calculation. This is updated once on startup and then every 12 hours.
+### `karpenter_cloudprovider_duration_seconds`
+Duration of cloud provider method calls. Labeled by the controller, method name and provider.

 ## Cloudprovider Batcher Metrics

+### `karpenter_cloudprovider_batcher_batch_time_seconds`
+Duration of the batching window per batcher
+
 ### `karpenter_cloudprovider_batcher_batch_size`
 Size of the request batch per batcher

-### `karpenter_cloudprovider_batcher_batch_time_seconds`
-Duration of the batching window per batcher
+## Controller Runtime Metrics
+
+### `controller_runtime_reconcile_total`
+Total number of reconciliations per controller
+
+### `controller_runtime_reconcile_time_seconds`
+Length of time per reconciliation per controller
+
+### `controller_runtime_reconcile_errors_total`
+Total number of reconciliation errors per controller
+
+### `controller_runtime_max_concurrent_reconciles`
+Maximum number of concurrent reconciles per controller
+
+### `controller_runtime_active_workers`
+Number of currently used workers per controller
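
For readers checking the renamed label, below is a minimal standalone sketch (not part of this patch) of a gauge shaped like `karpenter_build_info`: a constant `1` labeled by `version`, `goversion`, and the new `commit` label, registered with the Prometheus Go client and served over HTTP. The namespace string, label values, and port are illustrative assumptions, not the operator's actual wiring.

```go
package main

import (
	"fmt"
	"net/http"
	"runtime"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// buildInfo mirrors the shape of the patched metric: a constant-'1' gauge
// labeled by version, goversion, and commit (previously "sha").
var buildInfo = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: "karpenter", // assumed; the operator derives its namespace from its metrics package
		Name:      "build_info",
		Help:      "A metric with a constant '1' value labeled by version from which karpenter was built.",
	},
	[]string{"version", "goversion", "commit"},
)

func main() {
	prometheus.MustRegister(buildInfo)

	// Example label values only; the operator fills these from its build metadata.
	buildInfo.WithLabelValues("v0.33.0", runtime.Version(), "99cfbaf").Set(1)

	// Expose the registry the same way a /metrics endpoint would.
	http.Handle("/metrics", promhttp.Handler())
	fmt.Println("serving metrics on :8000/metrics")
	_ = http.ListenAndServe(":8000", nil)
}
```

Scraping `:8000/metrics` in this sketch yields a series such as `karpenter_build_info{commit="99cfbaf",goversion="go1.21.x",version="v0.33.0"} 1`, which is the set of label names the new `suite_test.go` assertion verifies on the real metric.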