From 14bf3a180e9ac1608c309d8189e6a8472e372aee Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Tue, 20 Feb 2024 02:06:08 -0800 Subject: [PATCH 1/2] docs: adding cluster_state metrics docs --- website/content/en/docs/reference/metrics.md | 7 +++++++ website/content/en/preview/reference/metrics.md | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/website/content/en/docs/reference/metrics.md b/website/content/en/docs/reference/metrics.md index 7ec8e338711d..010be61ef884 100644 --- a/website/content/en/docs/reference/metrics.md +++ b/website/content/en/docs/reference/metrics.md @@ -177,3 +177,10 @@ Maximum number of concurrent reconciles per controller ### `controller_runtime_active_workers` Number of currently used workers per controller +## Cluster State Metrics + +### `cluster_state_node_count` +Current count of nodes in cluster state + +### `cluster_state_synced` +Returns 1 if cluster state is synced and 0 otherwise. To be synced it checks that nodeclaims and nodes that are stored in the APIServer have the same representation as Karpenter's cluster state. diff --git a/website/content/en/preview/reference/metrics.md b/website/content/en/preview/reference/metrics.md index d50ec4f20d84..5b201b2e4846 100644 --- a/website/content/en/preview/reference/metrics.md +++ b/website/content/en/preview/reference/metrics.md @@ -185,3 +185,10 @@ Maximum number of concurrent reconciles per controller ### `controller_runtime_active_workers` Number of currently used workers per controller +## Cluster State Metrics + +### `cluster_state_node_count` +Current count of nodes in cluster state + +### `cluster_state_synced` +Returns 1 if cluster state is synced and 0 otherwise. To be synced it checks that nodeclaims and nodes that are stored in the APIServer have the same representation as Karpenter's cluster state From 673b5dc87789a605433c702987264b04ec11a06f Mon Sep 17 00:00:00 2001 From: Bryce Soghigian Date: Tue, 20 Feb 2024 17:44:39 -0800 Subject: [PATCH 2/2] chore: bump karpenter to latest commit in core --- go.mod | 10 +++++----- go.sum | 20 +++++++++---------- website/content/en/docs/reference/metrics.md | 8 -------- .../content/en/preview/reference/metrics.md | 19 +++++++++--------- 4 files changed, 24 insertions(+), 33 deletions(-) diff --git a/go.mod b/go.mod index 6ec92aacd7ad..e829999f459c 100644 --- a/go.mod +++ b/go.mod @@ -27,7 +27,7 @@ require ( k8s.io/utils v0.0.0-20230726121419-3b25d923346b knative.dev/pkg v0.0.0-20231010144348-ca8c009405dd sigs.k8s.io/controller-runtime v0.17.2 - sigs.k8s.io/karpenter v0.34.1-0.20240215201436-15bb262fa130 + sigs.k8s.io/karpenter v0.34.1-0.20240220171136-46d3d646ea37 ) require ( @@ -78,7 +78,7 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/olekukonko/tablewriter v0.0.5 // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/prometheus/client_model v0.5.0 // indirect + github.com/prometheus/client_model v0.6.0 // indirect github.com/prometheus/common v0.45.0 // indirect github.com/prometheus/procfs v0.12.0 // indirect github.com/prometheus/statsd_exporter v0.24.0 // indirect @@ -102,13 +102,13 @@ require ( google.golang.org/genproto/googleapis/api v0.0.0-20231009173412-8bfb1ae86b6c // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20231009173412-8bfb1ae86b6c // indirect google.golang.org/grpc v1.58.3 // indirect - google.golang.org/protobuf v1.31.0 // indirect + google.golang.org/protobuf v1.32.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/cloud-provider v0.29.1 // indirect + k8s.io/cloud-provider v0.29.2 // indirect k8s.io/component-base v0.29.2 // indirect - k8s.io/csi-translation-lib v0.29.1 // indirect + k8s.io/csi-translation-lib v0.29.2 // indirect k8s.io/klog/v2 v2.120.1 // indirect k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect diff --git a/go.sum b/go.sum index 0a47393cf516..f14a4a1b0314 100644 --- a/go.sum +++ b/go.sum @@ -300,8 +300,8 @@ github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1: github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw= -github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk5Pm3gvsjB5tr+kI= +github.com/prometheus/client_model v0.6.0 h1:k1v3CzpSRUTrKMppY35TLwPvxHqBu0bYgxZzqGIgaos= +github.com/prometheus/client_model v0.6.0/go.mod h1:NTQHnmxFpouOD0DpvP4XujX3CdOAGQPoaGhyTchlyt8= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc= @@ -701,8 +701,8 @@ google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp0 google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= -google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= -google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.32.0 h1:pPC6BG5ex8PDFnkbrGU3EixyhKcQ2aDuBS36lqK/C7I= +google.golang.org/protobuf v1.32.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -739,12 +739,12 @@ k8s.io/apimachinery v0.29.2 h1:EWGpfJ856oj11C52NRCHuU7rFDwxev48z+6DSlGNsV8= k8s.io/apimachinery v0.29.2/go.mod h1:6HVkd1FwxIagpYrHSwJlQqZI3G9LfYWRPAkUvLnXTKU= k8s.io/client-go v0.29.2 h1:FEg85el1TeZp+/vYJM7hkDlSTFZ+c5nnK44DJ4FyoRg= k8s.io/client-go v0.29.2/go.mod h1:knlvFZE58VpqbQpJNbCbctTVXcd35mMyAAwBdpt4jrA= -k8s.io/cloud-provider v0.29.1 h1:bDLpOSpysWrtU2PCkvyP2sUTwRBa6MGCmxt68CRRW/8= -k8s.io/cloud-provider v0.29.1/go.mod h1:u50Drm6AbuoKpsVbAstNiFHGgbSVHuJV4TWN5imdM2w= +k8s.io/cloud-provider v0.29.2 h1:ghKNXoQmeP8Fj/YTJNR6xQOzNrKXt6YZyy6mOEEa3yg= +k8s.io/cloud-provider v0.29.2/go.mod h1:KAp+07AUGmxcLnoLY5FndU4hj6158KMbiviNgctNRUk= k8s.io/component-base v0.29.2 h1:lpiLyuvPA9yV1aQwGLENYyK7n/8t6l3nn3zAtFTJYe8= k8s.io/component-base v0.29.2/go.mod h1:BfB3SLrefbZXiBfbM+2H1dlat21Uewg/5qtKOl8degM= -k8s.io/csi-translation-lib v0.29.1 h1:b2tYZnnHyrQVHG6GYel7egmVvKeIlX/xbTNm9ynBSUg= -k8s.io/csi-translation-lib v0.29.1/go.mod h1:Zglui6PgFSew8ux50djwZ3PFK6eNrWktid66D7pHDDo= +k8s.io/csi-translation-lib v0.29.2 h1:TJVZTzR7gj6+HSb+jJxLUxnAuwrEy71IxhJ4nmTzyjE= +k8s.io/csi-translation-lib v0.29.2/go.mod h1:vbSYY4c6mVPwTHAvb5V3CHlq/dmQFIZC1SJOsaFiY3I= k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw= k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 h1:aVUu9fTY98ivBPKR9Y5w/AuzbMm96cd3YHRTU83I780= @@ -760,8 +760,8 @@ sigs.k8s.io/controller-runtime v0.17.2 h1:FwHwD1CTUemg0pW2otk7/U5/i5m2ymzvOXdbeG sigs.k8s.io/controller-runtime v0.17.2/go.mod h1:+MngTvIQQQhfXtwfdGw/UOQ/aIaqsYywfCINOtwMO/s= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/karpenter v0.34.1-0.20240215201436-15bb262fa130 h1:v32NPT5Vup3jQOnBV5qbKkH8lmrkPmcu7ES7R+mBd8c= -sigs.k8s.io/karpenter v0.34.1-0.20240215201436-15bb262fa130/go.mod h1:9SKzdbyklbaYTYkylwGQuq0vZoZ2TA9WTG8Ad9UtUyY= +sigs.k8s.io/karpenter v0.34.1-0.20240220171136-46d3d646ea37 h1:9BGNR9+y6wBZCUyNtouAmE6IKZ7OTWp0Pbah4sdQnsc= +sigs.k8s.io/karpenter v0.34.1-0.20240220171136-46d3d646ea37/go.mod h1:xHMNckVQTSXN56es2BHr3s4ehyo39tOkcUU3OeUsK8U= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= diff --git a/website/content/en/docs/reference/metrics.md b/website/content/en/docs/reference/metrics.md index 010be61ef884..0f42331c1b95 100644 --- a/website/content/en/docs/reference/metrics.md +++ b/website/content/en/docs/reference/metrics.md @@ -176,11 +176,3 @@ Maximum number of concurrent reconciles per controller ### `controller_runtime_active_workers` Number of currently used workers per controller - -## Cluster State Metrics - -### `cluster_state_node_count` -Current count of nodes in cluster state - -### `cluster_state_synced` -Returns 1 if cluster state is synced and 0 otherwise. To be synced it checks that nodeclaims and nodes that are stored in the APIServer have the same representation as Karpenter's cluster state. diff --git a/website/content/en/preview/reference/metrics.md b/website/content/en/preview/reference/metrics.md index 5b201b2e4846..b7cf1366f50f 100644 --- a/website/content/en/preview/reference/metrics.md +++ b/website/content/en/preview/reference/metrics.md @@ -115,11 +115,17 @@ The number of times that Karpenter failed to launch a replacement node for disru ### `karpenter_disruption_queue_depth` The number of commands currently being waited on in the disruption orchestration queue. +### `karpenter_disruption_pods_disrupted_total` +Total number of reschedulable pods disrupted on nodes. Labeled by NodePool, disruption action, method, and consolidation type. + +### `karpenter_disruption_nodes_disrupted_total` +Total number of nodes disrupted. Labeled by NodePool, disruption action, method, and consolidation type. + ### `karpenter_disruption_evaluation_duration_seconds` -Duration of the disruption evaluation process in seconds. +Duration of the disruption evaluation process in seconds. Labeled by method and consolidation type. ### `karpenter_disruption_eligible_nodes` -Number of nodes eligible for disruption by Karpenter. Labeled by disruption method. +Number of nodes eligible for disruption by Karpenter. Labeled by disruption method and consolidation type. ### `karpenter_disruption_consolidation_timeouts_total` Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type. @@ -128,7 +134,7 @@ Number of times the Consolidation algorithm has reached a timeout. Labeled by co The number of nodes for a given NodePool that can be disrupted at a point in time. Labeled by NodePool. Note that allowed disruptions can change very rapidly, as new nodes may be created and others may be deleted at any point. ### `karpenter_disruption_actions_performed_total` -Number of disruption actions performed. Labeled by disruption method. +Number of disruption actions performed. Labeled by disruption action, method, and consolidation type. ## Consistency Metrics @@ -185,10 +191,3 @@ Maximum number of concurrent reconciles per controller ### `controller_runtime_active_workers` Number of currently used workers per controller -## Cluster State Metrics - -### `cluster_state_node_count` -Current count of nodes in cluster state - -### `cluster_state_synced` -Returns 1 if cluster state is synced and 0 otherwise. To be synced it checks that nodeclaims and nodes that are stored in the APIServer have the same representation as Karpenter's cluster state