From 9d34d219ca8b120e5c57d1913bb174021972542c Mon Sep 17 00:00:00 2001 From: Jerad C Date: Wed, 13 Oct 2021 13:55:37 -0500 Subject: [PATCH] add pod metrics controller --- cmd/controller/main.go | 5 +- .../allocation/binpacking/packer.go | 2 +- pkg/controllers/allocation/launcher.go | 2 +- .../allocation/scheduling/scheduler.go | 2 +- pkg/controllers/metrics/common.go | 54 ++++++ pkg/controllers/metrics/controller.go | 166 ++++++++++++++++++ pkg/controllers/metrics/node/controller.go | 121 ------------- .../metrics/{node/counter.go => nodes.go} | 64 ++----- pkg/controllers/metrics/pods.go | 72 ++++++++ pkg/metrics/constants.go | 4 +- 10 files changed, 317 insertions(+), 175 deletions(-) create mode 100644 pkg/controllers/metrics/common.go create mode 100644 pkg/controllers/metrics/controller.go delete mode 100644 pkg/controllers/metrics/node/controller.go rename pkg/controllers/metrics/{node/counter.go => nodes.go} (80%) create mode 100644 pkg/controllers/metrics/pods.go diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 841af4fad559..1aa1648fc93a 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -24,7 +24,7 @@ import ( "github.com/awslabs/karpenter/pkg/cloudprovider/registry" "github.com/awslabs/karpenter/pkg/controllers" "github.com/awslabs/karpenter/pkg/controllers/allocation" - nodemetrics "github.com/awslabs/karpenter/pkg/controllers/metrics/node" + "github.com/awslabs/karpenter/pkg/controllers/metrics" "github.com/awslabs/karpenter/pkg/controllers/node" "github.com/awslabs/karpenter/pkg/controllers/termination" "github.com/awslabs/karpenter/pkg/utils/env" @@ -93,11 +93,12 @@ func main() { MetricsBindAddress: fmt.Sprintf(":%d", options.MetricsPort), HealthProbeBindAddress: fmt.Sprintf(":%d", options.HealthProbePort), }) + if err := manager.RegisterControllers(ctx, allocation.NewController(manager.GetClient(), clientSet.CoreV1(), cloudProvider), termination.NewController(ctx, manager.GetClient(), clientSet.CoreV1(), cloudProvider), node.NewController(manager.GetClient()), - nodemetrics.NewController(manager.GetClient()), + metrics.NewController(manager.GetClient(), cloudProvider), ).Start(ctx); err != nil { panic(fmt.Sprintf("Unable to start manager, %s", err.Error())) } diff --git a/pkg/controllers/allocation/binpacking/packer.go b/pkg/controllers/allocation/binpacking/packer.go index 9af9d091b3fe..eb59a5b3c17f 100644 --- a/pkg/controllers/allocation/binpacking/packer.go +++ b/pkg/controllers/allocation/binpacking/packer.go @@ -40,7 +40,7 @@ var ( packDuration = prometheus.NewHistogram( prometheus.HistogramOpts{ - Namespace: metrics.KarpenterNamespace, + Namespace: metrics.Namespace, Subsystem: "allocation_controller", Name: "binpacking_duration_seconds", Help: "Duration of binpacking process in seconds.", diff --git a/pkg/controllers/allocation/launcher.go b/pkg/controllers/allocation/launcher.go index 623f2b1ddb29..ee9ddaa6880b 100644 --- a/pkg/controllers/allocation/launcher.go +++ b/pkg/controllers/allocation/launcher.go @@ -105,7 +105,7 @@ func (l *Launcher) bind(ctx context.Context, node *v1.Node, pods []*v1.Pod) (err var bindTimeHistogram = prometheus.NewHistogram( prometheus.HistogramOpts{ - Namespace: metrics.KarpenterNamespace, + Namespace: metrics.Namespace, Subsystem: "allocation_controller", Name: "bind_duration_seconds", Help: "Duration of bind process in seconds. Broken down by result.", diff --git a/pkg/controllers/allocation/scheduling/scheduler.go b/pkg/controllers/allocation/scheduling/scheduler.go index 5863d113fe29..3af7c633b130 100644 --- a/pkg/controllers/allocation/scheduling/scheduler.go +++ b/pkg/controllers/allocation/scheduling/scheduler.go @@ -34,7 +34,7 @@ import ( var schedulingDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Namespace: metrics.KarpenterNamespace, + Namespace: metrics.Namespace, Subsystem: "allocation_controller", Name: "scheduling_duration_seconds", Help: "Duration of scheduling process in seconds. Broken down by provisioner and error.", diff --git a/pkg/controllers/metrics/common.go b/pkg/controllers/metrics/common.go new file mode 100644 index 000000000000..d7230fec4163 --- /dev/null +++ b/pkg/controllers/metrics/common.go @@ -0,0 +1,54 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "github.com/awslabs/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/awslabs/karpenter/pkg/metrics" + "github.com/prometheus/client_golang/prometheus" + v1 "k8s.io/api/core/v1" +) + +const ( + controllerName = "Metrics" + + metricSubsystemCapacity = "capacity" + metricSubsystemPods = "pods" + + metricLabelArch = "arch" + metricLabelInstanceType = "instancetype" + metricLabelOS = "os" + metricLabelPhase = "phase" + metricLabelProvisioner = metrics.ProvisionerLabel + metricLabelZone = "zone" + + nodeLabelArch = v1.LabelArchStable + nodeLabelInstanceType = v1.LabelInstanceTypeStable + nodeLabelOS = v1.LabelOSStable + nodeLabelZone = v1.LabelTopologyZone + + nodeConditionTypeReady = v1.NodeReady +) + +var nodeLabelProvisioner = v1alpha5.ProvisionerNameLabelKey + +func publishCount(gaugeVec *prometheus.GaugeVec, labels prometheus.Labels, count int) error { + gauge, err := gaugeVec.GetMetricWith(labels) + if err != nil { + return err + } + gauge.Set(float64(count)) + return nil +} diff --git a/pkg/controllers/metrics/controller.go b/pkg/controllers/metrics/controller.go new file mode 100644 index 000000000000..74159b27e626 --- /dev/null +++ b/pkg/controllers/metrics/controller.go @@ -0,0 +1,166 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/awslabs/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/awslabs/karpenter/pkg/cloudprovider" + "go.uber.org/multierr" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/client-go/util/workqueue" + "knative.dev/pkg/logging" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +type Controller struct { + CloudProvider cloudprovider.CloudProvider + KubeClient client.Client +} + +func NewController(kubeClient client.Client, cloudProvider cloudprovider.CloudProvider) *Controller { + return &Controller{ + CloudProvider: cloudProvider, + KubeClient: kubeClient, + } +} + +func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { + loggerName := fmt.Sprintf("%s.provisioner/%s", strings.ToLower(controllerName), req.Name) + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named(loggerName)) + + // Does the provisioner exist? + provisioner := &v1alpha5.Provisioner{} + if err := c.KubeClient.Get(ctx, req.NamespacedName, provisioner); err != nil { + if !errors.IsNotFound(err) { + // Unable to determine existence of the provisioner, try again later. + return reconcile.Result{}, err + } + + // The provisioner has been deleted. + return reconcile.Result{}, nil + } + + // The provisioner does exist, so update counters. + if err := c.updateCounts(ctx, provisioner); err != nil { + return reconcile.Result{}, err + } + + // Schedule the next run. + return reconcile.Result{RequeueAfter: 10 * time.Second}, nil +} + +func (c *Controller) Register(_ context.Context, m manager.Manager) error { + return controllerruntime. + NewControllerManagedBy(m). + Named(controllerName). + For(&v1alpha5.Provisioner{}). + WithOptions(controller.Options{ + MaxConcurrentReconciles: 10, + }). + Complete(c) +} + +func (c *Controller) updateCounts(ctx context.Context, provisioner *v1alpha5.Provisioner) error { + updateCountFuncs := []func(context.Context, *v1alpha5.Provisioner) error{ + c.updateNodeCounts, + c.updatePodCounts, + } + updateCountFuncsLen := len(updateCountFuncs) + errors := make([]error, updateCountFuncsLen) + workqueue.ParallelizeUntil(ctx, updateCountFuncsLen, updateCountFuncsLen, func(index int) { + errors[index] = updateCountFuncs[index](ctx, provisioner) + }) + + return multierr.Combine(errors...) +} + +func (c *Controller) updateNodeCounts(ctx context.Context, provisioner *v1alpha5.Provisioner) error { + instanceTypes, err := c.CloudProvider.GetInstanceTypes(ctx, &provisioner.Spec.Constraints) + if err != nil { + return err + } + + archValues := sets.NewString() + instanceTypeValues := sets.NewString() + osValues := sets.NewString() + zoneValues := sets.NewString() + for _, instanceType := range instanceTypes { + archValues.Insert(instanceType.Architecture()) + instanceTypeValues.Insert(instanceType.Name()) + osValues.Insert(instanceType.OperatingSystems().UnsortedList()...) + zoneValues.Insert(instanceType.Zones().UnsortedList()...) + } + knownValuesForNodeLabels := map[string]sets.String{ + nodeLabelArch: archValues, + nodeLabelInstanceType: instanceTypeValues, + nodeLabelOS: osValues, + nodeLabelZone: zoneValues, + } + + return publishNodeCounts(provisioner.Name, knownValuesForNodeLabels, func(matchingLabels client.MatchingLabels, consume nodeListConsumerFunc) error { + nodes := v1.NodeList{} + if err := c.KubeClient.List(ctx, &nodes, matchingLabels); err != nil { + return err + } + return consume(nodes.Items) + }) +} + +func (c *Controller) updatePodCounts(ctx context.Context, provisioner *v1alpha5.Provisioner) error { + podsForProvisioner, err := c.podsForProvisioner(ctx, provisioner) + if err != nil { + return err + } + + return publishPodCounts(provisioner.Name, podsForProvisioner) +} + +// podsForProvisioner returns a map of slices containing all pods scheduled to nodes in each zone. +func (c *Controller) podsForProvisioner(ctx context.Context, provisioner *v1alpha5.Provisioner) ([]v1.Pod, error) { + // Karpenter does not apply a label, or other marker, to pods. + + results := []v1.Pod{} + + // 1. Fetch all nodes associated with the provisioner. + nodeList := v1.NodeList{} + withProvisionerName := client.MatchingLabels{nodeLabelProvisioner: provisioner.Name} + if err := c.KubeClient.List(ctx, &nodeList, withProvisionerName); err != nil { + return nil, err + } + + // 2. Get all the pods scheduled to each node. + for _, node := range nodeList.Items { + podList := v1.PodList{} + withNodeName := client.MatchingFields{"spec.nodeName": node.Name} + if err := c.KubeClient.List(ctx, &podList, withNodeName); err != nil { + return nil, err + } + + results = append(results, podList.Items...) + } + + return results, nil +} diff --git a/pkg/controllers/metrics/node/controller.go b/pkg/controllers/metrics/node/controller.go deleted file mode 100644 index 7f1d9b939cfa..000000000000 --- a/pkg/controllers/metrics/node/controller.go +++ /dev/null @@ -1,121 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package node - -import ( - "context" - "time" - - "github.com/awslabs/karpenter/pkg/apis/provisioning/v1alpha5" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - "knative.dev/pkg/logging" - controllerruntime "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/controller-runtime/pkg/event" - "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/predicate" - "sigs.k8s.io/controller-runtime/pkg/reconcile" -) - -const ( - controllerName = "NodeMetrics" - requeueInterval = 10 * time.Second -) - -type Controller struct { - KubeClient client.Client -} - -func NewController(kubeClient client.Client) *Controller { - return &Controller{KubeClient: kubeClient} -} - -func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named(controllerName)) - - provisionerName := req.NamespacedName.Name - - // 1. Has the provisioner been deleted? - if err := c.provisionerExists(ctx, req); err != nil { - if !errors.IsNotFound(err) { - // Unable to determine existence of the provisioner, try again later. - return reconcile.Result{Requeue: true}, err - } - - // The provisioner has been deleted. Reset all the associated counts to zero. - if err := publishNodeCountsForProvisioner(provisionerName, consumeZeroNodes); err != nil { - // One or more metrics were not zeroed. Try again later. - return reconcile.Result{Requeue: true}, err - } - - // Since the provisioner is gone, do not requeue. - return reconcile.Result{}, nil - } - - // 2. Update node counts associated with this provisioner. - if err := publishNodeCountsForProvisioner(provisionerName, c.consumeNodesWith(ctx)); err != nil { - // An updated value for one or more metrics was not published. Try again later. - return reconcile.Result{Requeue: true}, err - } - - // 3. Schedule the next run. - return reconcile.Result{RequeueAfter: requeueInterval}, nil -} - -func (c *Controller) Register(_ context.Context, m manager.Manager) error { - return controllerruntime. - NewControllerManagedBy(m). - Named(controllerName). - For(&v1alpha5.Provisioner{}, builder.WithPredicates( - predicate.Funcs{ - CreateFunc: func(_ event.CreateEvent) bool { return true }, - DeleteFunc: func(_ event.DeleteEvent) bool { return true }, - UpdateFunc: func(_ event.UpdateEvent) bool { return false }, - GenericFunc: func(_ event.GenericEvent) bool { return false }, - }, - )). - WithOptions(controller.Options{ - MaxConcurrentReconciles: 1, - }). - Complete(c) -} - -// provisionerExists simply attempts to retrieve the provisioner from the Controller's Client -// and returns any resulting error. -func (c *Controller) provisionerExists(ctx context.Context, req reconcile.Request) error { - provisioner := v1alpha5.Provisioner{} - return c.KubeClient.Get(ctx, req.NamespacedName, &provisioner) -} - -// consumeNodesWith will retrieve matching nodes from the Controller's Client then -// pass the nodes to `consume` and returns any resulting error. If Client returns an error when -// retrieving nodes then the error is returned without calling `consume`. -func (c *Controller) consumeNodesWith(ctx context.Context) consumeNodesWithFunc { - return func(nodeLabels client.MatchingLabels, consume nodeListConsumerFunc) error { - nodes := v1.NodeList{} - if err := c.KubeClient.List(ctx, &nodes, nodeLabels); err != nil { - return err - } - return consume(nodes.Items) - } -} - -// consumeZeroNodes calls `consume` with an empty slice and returns any resulting error. -func consumeZeroNodes(_ client.MatchingLabels, consume nodeListConsumerFunc) error { - return consume([]v1.Node{}) -} diff --git a/pkg/controllers/metrics/node/counter.go b/pkg/controllers/metrics/nodes.go similarity index 80% rename from pkg/controllers/metrics/node/counter.go rename to pkg/controllers/metrics/nodes.go index 5e779745adad..43e8384c3cd8 100644 --- a/pkg/controllers/metrics/node/counter.go +++ b/pkg/controllers/metrics/nodes.go @@ -12,52 +12,30 @@ See the License for the specific language governing permissions and limitations under the License. */ -package node +package metrics import ( "strings" - "github.com/awslabs/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/awslabs/karpenter/pkg/metrics" "github.com/prometheus/client_golang/prometheus" "go.uber.org/multierr" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" "sigs.k8s.io/controller-runtime/pkg/client" crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" ) -const ( - metricNamespace = metrics.KarpenterNamespace - metricSubsystem = "capacity" - - metricLabelArch = "arch" - metricLabelInstanceType = "instancetype" - metricLabelOS = "os" - metricLabelProvisioner = metrics.ProvisionerLabel - metricLabelZone = "zone" - - nodeLabelArch = v1.LabelArchStable - nodeLabelInstanceType = v1.LabelInstanceTypeStable - nodeLabelOS = v1.LabelOSStable - nodeLabelZone = v1.LabelTopologyZone - - nodeConditionTypeReady = v1.NodeReady -) - type ( nodeListConsumerFunc = func([]v1.Node) error consumeNodesWithFunc = func(client.MatchingLabels, nodeListConsumerFunc) error ) var ( - nodeLabelProvisioner = v1alpha5.ProvisionerNameLabelKey - - knownValuesForNodeLabels = map[string][]string{} - nodeCountByProvisioner = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Namespace: metricNamespace, - Subsystem: metricSubsystem, + Namespace: metrics.Namespace, + Subsystem: metricSubsystemCapacity, Name: "node_count", Help: "Total node count by provisioner.", }, @@ -68,8 +46,8 @@ var ( readyNodeCountByProvisionerZone = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Namespace: metricNamespace, - Subsystem: metricSubsystem, + Namespace: metrics.Namespace, + Subsystem: metricSubsystemCapacity, Name: "ready_node_count", Help: "Count of nodes that are ready by provisioner and zone.", }, @@ -81,8 +59,8 @@ var ( readyNodeCountByArchProvisionerZone = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Namespace: metricNamespace, - Subsystem: metricSubsystem, + Namespace: metrics.Namespace, + Subsystem: metricSubsystemCapacity, Name: "ready_node_arch_count", Help: "Count of nodes that are ready by architecture, provisioner, and zone.", }, @@ -95,8 +73,8 @@ var ( readyNodeCountByInstancetypeProvisionerZone = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Namespace: metricNamespace, - Subsystem: metricSubsystem, + Namespace: metrics.Namespace, + Subsystem: metricSubsystemCapacity, Name: "ready_node_instancetype_count", Help: "Count of nodes that are ready by instance type, provisioner, and zone.", }, @@ -109,8 +87,8 @@ var ( readyNodeCountByOsProvisionerZone = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Namespace: metricNamespace, - Subsystem: metricSubsystem, + Namespace: metrics.Namespace, + Subsystem: metricSubsystemCapacity, Name: "ready_node_os_count", Help: "Count of nodes that are ready by operating system, provisioner, and zone.", }, @@ -130,7 +108,7 @@ func init() { crmetrics.Registry.MustRegister(readyNodeCountByOsProvisionerZone) } -func publishNodeCountsForProvisioner(provisioner string, consumeNodesWith consumeNodesWithFunc) error { +func publishNodeCounts(provisioner string, knownValuesForNodeLabels map[string]sets.String, consumeNodesWith consumeNodesWithFunc) error { archValues := knownValuesForNodeLabels[nodeLabelArch] instanceTypeValues := knownValuesForNodeLabels[nodeLabelInstanceType] osValues := knownValuesForNodeLabels[nodeLabelOS] @@ -143,7 +121,7 @@ func publishNodeCountsForProvisioner(provisioner string, consumeNodesWith consum return publishCount(nodeCountByProvisioner, metricLabelsFrom(nodeLabels), len(nodes)) })) - for _, zone := range zoneValues { + for zone := range zoneValues { nodeLabels = client.MatchingLabels{ nodeLabelProvisioner: provisioner, nodeLabelZone: zone, @@ -152,7 +130,7 @@ func publishNodeCountsForProvisioner(provisioner string, consumeNodesWith consum return publishCount(readyNodeCountByProvisionerZone, metricLabelsFrom(nodeLabels), len(readyNodes)) }))) - for _, arch := range archValues { + for arch := range archValues { nodeLabels := client.MatchingLabels{ nodeLabelArch: arch, nodeLabelProvisioner: provisioner, @@ -163,7 +141,7 @@ func publishNodeCountsForProvisioner(provisioner string, consumeNodesWith consum }))) } - for _, instanceType := range instanceTypeValues { + for instanceType := range instanceTypeValues { nodeLabels := client.MatchingLabels{ nodeLabelInstanceType: instanceType, nodeLabelProvisioner: provisioner, @@ -174,7 +152,7 @@ func publishNodeCountsForProvisioner(provisioner string, consumeNodesWith consum }))) } - for _, os := range osValues { + for os := range osValues { nodeLabels := client.MatchingLabels{ nodeLabelOS: os, nodeLabelProvisioner: provisioner, @@ -225,11 +203,3 @@ func metricLabelsFrom(nodeLabels map[string]string) prometheus.Labels { } return metricLabels } - -func publishCount(gaugeVec *prometheus.GaugeVec, labels prometheus.Labels, count int) error { - gauge, err := gaugeVec.GetMetricWith(labels) - if err == nil { - gauge.Set(float64(count)) - } - return err -} diff --git a/pkg/controllers/metrics/pods.go b/pkg/controllers/metrics/pods.go new file mode 100644 index 000000000000..3312139a2de4 --- /dev/null +++ b/pkg/controllers/metrics/pods.go @@ -0,0 +1,72 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "strings" + + "github.com/awslabs/karpenter/pkg/metrics" + "github.com/prometheus/client_golang/prometheus" + "go.uber.org/multierr" + v1 "k8s.io/api/core/v1" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + phaseValues = []v1.PodPhase{ + v1.PodFailed, + v1.PodPending, + v1.PodRunning, + v1.PodSucceeded, + v1.PodUnknown, + } + + podCountByPhaseProvisioner = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metrics.Namespace, + Subsystem: metricSubsystemPods, + Name: "count", + Help: "Total pod count by phase and provisioner.", + }, + []string{ + metricLabelPhase, + metricLabelProvisioner, + }, + ) +) + +func init() { + crmetrics.Registry.MustRegister(podCountByPhaseProvisioner) +} + +func publishPodCounts(provisioner string, podList []v1.Pod) error { + countByPhase := make(map[v1.PodPhase]int, len(phaseValues)) + + for _, pod := range podList { + countByPhase[pod.Status.Phase]++ + } + + errors := make([]error, 0, len(phaseValues)) + + for _, phase := range phaseValues { + metricLabels := prometheus.Labels{ + metricLabelPhase: strings.ToLower(string(phase)), + metricLabelProvisioner: provisioner, + } + errors = append(errors, publishCount(podCountByPhaseProvisioner, metricLabels, countByPhase[phase])) + } + + return multierr.Combine(errors...) +} diff --git a/pkg/metrics/constants.go b/pkg/metrics/constants.go index 82267cbfb1de..82f1f4bc6019 100644 --- a/pkg/metrics/constants.go +++ b/pkg/metrics/constants.go @@ -21,8 +21,8 @@ import ( ) const ( - // KarpenterNamespace is the common namespace for application metrics - KarpenterNamespace = "karpenter" + // Common namespace for application metrics. + Namespace = "karpenter" ErrorLabel = "error" ProvisionerLabel = "provisioner"