Skip to content

Commit

Permalink
feat: adding karpenter_state_nodes_total metric and karpenter_state_…
Browse files Browse the repository at this point in the history
…synced (kubernetes-sigs#952)

Co-authored-by: Jonathan Innis <[email protected]>
Co-authored-by: Nick Tran <[email protected]>
  • Loading branch information
3 people authored Feb 14, 2024
1 parent 686b75d commit 42398a0
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 3 deletions.
10 changes: 7 additions & 3 deletions pkg/controllers/state/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,9 @@ func (c *Cluster) Synced(ctx context.Context) bool {
// This doesn't ensure that the two states are exactly aligned (the cluster state could still be
// tracking a node that no longer exists in the apiserver) but it ensures that we have a state
// representation for every node/nodeClaim that exists on the apiserver
return stateNodeClaimNames.IsSuperset(nodeClaimNames) &&
stateNodeNames.IsSuperset(nodeNames)
synced := stateNodeClaimNames.IsSuperset(nodeClaimNames) && stateNodeNames.IsSuperset(nodeNames)
clusterStateSynced.Set(lo.Ternary[float64](synced, 1, 0))
return synced
}

// ForPodsWithAntiAffinity calls the supplied function once for each pod with required anti affinity terms that is
Expand Down Expand Up @@ -236,13 +237,15 @@ func (c *Cluster) UpdateNodeClaim(nodeClaim *v1beta1.NodeClaim) {
// If the nodeclaim hasn't launched yet, we want to add it into cluster state to ensure
// that we're not racing with the internal cache for the cluster, assuming the node doesn't exist.
c.nodeClaimNameToProviderID[nodeClaim.Name] = nodeClaim.Status.ProviderID
clusterStateNodesCount.Set(float64(len(c.nodes)))
}

// DeleteNodeClaim handles removal of the NodeClaim identified by name.
// It holds c.mu for the whole call, delegates the actual removal to
// cleanupNodeClaim (defined elsewhere in this type), and then republishes the
// current size of c.nodes through the clusterStateNodesCount gauge.
func (c *Cluster) DeleteNodeClaim(name string) {
c.mu.Lock()
defer c.mu.Unlock()

c.cleanupNodeClaim(name)
// Refresh the gauge while the lock is still held so it reports len(c.nodes) as left by the cleanup.
clusterStateNodesCount.Set(float64(len(c.nodes)))
}

func (c *Cluster) UpdateNode(ctx context.Context, node *v1.Node) error {
Expand All @@ -269,14 +272,15 @@ func (c *Cluster) UpdateNode(ctx context.Context, node *v1.Node) error {
}
c.nodes[node.Spec.ProviderID] = n
c.nodeNameToProviderID[node.Name] = node.Spec.ProviderID
clusterStateNodesCount.Set(float64(len(c.nodes)))
return nil
}

// DeleteNode handles removal of the Node identified by name.
// It holds c.mu for the whole call, delegates the actual removal to
// cleanupNode (defined elsewhere in this type), and then republishes the
// current size of c.nodes through the clusterStateNodesCount gauge.
func (c *Cluster) DeleteNode(name string) {
c.mu.Lock()
defer c.mu.Unlock()

c.cleanupNode(name)
// Refresh the gauge while the lock is still held so it reports len(c.nodes) as left by the cleanup.
clusterStateNodesCount.Set(float64(len(c.nodes)))
}

func (c *Cluster) UpdatePod(ctx context.Context, pod *v1.Pod) error {
Expand Down
52 changes: 52 additions & 0 deletions pkg/controllers/state/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package state

import (
"github.com/prometheus/client_golang/prometheus"
crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"

"sigs.k8s.io/karpenter/pkg/metrics"
)

const (
// stateSubsystem is the Prometheus subsystem shared by the cluster state gauges declared in this file.
stateSubsystem = "cluster_state"
)

var (
// clusterStateNodesCount is an unlabelled gauge holding the number of entries in the
// cluster state's node map; it is set by the Cluster mutators after each change.
// The test suite reads it as "karpenter_cluster_state_node_count".
clusterStateNodesCount = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: metrics.Namespace,
Subsystem: stateSubsystem,
Name: "node_count",
Help: "Current count of nodes in cluster state",
},
)

// clusterStateSynced is an unlabelled gauge that Cluster.Synced sets to 1 when it reports
// the state as synced and to 0 otherwise.
// The test suite reads it as "karpenter_cluster_state_synced".
clusterStateSynced = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: metrics.Namespace,
Subsystem: stateSubsystem,
Name: "synced",
Help: "Returns 1 if cluster state is synced and 0 otherwise. Synced checks that nodeclaims and nodes that are stored in the APIServer have the same representation as Karpenter's cluster state",
},
)
)

// init registers the cluster state gauges with the controller-runtime metrics
// registry at package load time. MustRegister panics if a collector cannot be
// registered (for example, on a duplicate registration).
func init() {
	crmetrics.Registry.MustRegister(clusterStateNodesCount)
	crmetrics.Registry.MustRegister(clusterStateSynced)
}
15 changes: 15 additions & 0 deletions pkg/controllers/state/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1121,17 +1121,25 @@ var _ = Describe("Cluster State Sync", func() {
})
ExpectApplied(ctx, env.Client, node)
ExpectReconcileSucceeded(ctx, nodeController, client.ObjectKeyFromObject(node))
ExpectMetricGaugeValue("karpenter_cluster_state_node_count", float64(i+1), nil)
}

Expect(cluster.Synced(ctx)).To(BeTrue())
ExpectMetricGaugeValue("karpenter_cluster_state_synced", 1.0, nil)
ExpectMetricGaugeValue("karpenter_cluster_state_node_count", 1000.0, nil)
})
It("should consider the cluster state synced when nodes don't have provider id", func() {
// Deploy 1000 nodes and sync them all with the cluster
for i := 0; i < 1000; i++ {
node := test.Node()
ExpectApplied(ctx, env.Client, node)
ExpectReconcileSucceeded(ctx, nodeController, client.ObjectKeyFromObject(node))
ExpectMetricGaugeValue("karpenter_cluster_state_node_count", float64(i+1), nil)
}
Expect(cluster.Synced(ctx)).To(BeTrue())
ExpectMetricGaugeValue("karpenter_cluster_state_synced", 1.0, nil)
ExpectMetricGaugeValue("karpenter_cluster_state_node_count", 1000.0, nil)

})
It("should consider the cluster state synced when nodes register provider id", func() {
// Deploy 1000 nodes and sync them all with the cluster
Expand All @@ -1140,6 +1148,7 @@ var _ = Describe("Cluster State Sync", func() {
nodes = append(nodes, test.Node())
ExpectApplied(ctx, env.Client, nodes[i])
ExpectReconcileSucceeded(ctx, nodeController, client.ObjectKeyFromObject(nodes[i]))
ExpectMetricGaugeValue("karpenter_cluster_state_node_count", float64(i+1), make(map[string]string))
}
Expect(cluster.Synced(ctx)).To(BeTrue())
for i := 0; i < 1000; i++ {
Expand All @@ -1148,6 +1157,8 @@ var _ = Describe("Cluster State Sync", func() {
ExpectReconcileSucceeded(ctx, nodeController, client.ObjectKeyFromObject(nodes[i]))
}
Expect(cluster.Synced(ctx)).To(BeTrue())
ExpectMetricGaugeValue("karpenter_cluster_state_synced", 1.0, nil)
ExpectMetricGaugeValue("karpenter_cluster_state_node_count", 1000.0, nil)
})
It("should consider the cluster state synced when all nodeclaims are tracked", func() {
// Deploy 1000 nodeClaims and sync them all with the cluster
Expand Down Expand Up @@ -1263,6 +1274,7 @@ var _ = Describe("Cluster State Sync", func() {
}
}
Expect(cluster.Synced(ctx)).To(BeFalse())
ExpectMetricGaugeValue("karpenter_cluster_state_synced", 0, nil)
})
It("shouldn't consider the cluster state synced if a nodeclaim is added manually with UpdateNodeClaim", func() {
nodeClaim := test.NodeClaim()
Expand All @@ -1277,14 +1289,17 @@ var _ = Describe("Cluster State Sync", func() {

cluster.UpdateNodeClaim(nodeClaim)
Expect(cluster.Synced(ctx)).To(BeFalse())
ExpectMetricGaugeValue("karpenter_cluster_state_synced", 0, nil)

ExpectApplied(ctx, env.Client, nodeClaim)
ExpectReconcileSucceeded(ctx, nodeClaimController, client.ObjectKeyFromObject(nodeClaim))
Expect(cluster.Synced(ctx)).To(BeFalse())
ExpectMetricGaugeValue("karpenter_cluster_state_synced", 0, nil)

ExpectDeleted(ctx, env.Client, nodeClaim)
ExpectReconcileSucceeded(ctx, nodeClaimController, client.ObjectKeyFromObject(nodeClaim))
Expect(cluster.Synced(ctx)).To(BeTrue())
ExpectMetricGaugeValue("karpenter_cluster_state_synced", 1, nil)
})
})

Expand Down
7 changes: 7 additions & 0 deletions pkg/test/expectations/expectations.go
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,13 @@ func FindMetricWithLabelValues(name string, labelValues map[string]string) (*pro
return nil, false
}

// ExpectMetricGaugeValue asserts that a gauge metric named metricName, matching the given
// labels, is currently exposed and reports expectedValue.
//
// The lookup is delegated to FindMetricWithLabelValues. The expectation fails when no
// matching metric is found, when the matching metric carries no gauge sample (for example
// because it is a counter or histogram), or when the gauge value differs from expectedValue.
func ExpectMetricGaugeValue(metricName string, expectedValue float64, labels map[string]string) {
	GinkgoHelper()
	metric, ok := FindMetricWithLabelValues(metricName, labels)
	Expect(ok).To(BeTrue(), "Metric "+metricName+" should be available")
	// Check the gauge before dereferencing it so that a metric of another type produces a
	// readable assertion failure instead of a nil-pointer panic.
	Expect(metric.Gauge).ToNot(BeNil(), "Metric "+metricName+" should be a gauge")
	Expect(lo.FromPtr(metric.Gauge.Value)).To(Equal(expectedValue), "Metric "+metricName+" should have the expected value")
}

func ExpectManualBinding(ctx context.Context, c client.Client, pod *v1.Pod, node *v1.Node) {
GinkgoHelper()
Expect(c.Create(ctx, &v1.Binding{
Expand Down

0 comments on commit 42398a0

Please sign in to comment.