From eee6491397f2acc1bf67afb373ac611853cc0de9 Mon Sep 17 00:00:00 2001 From: "shaofan.hs" Date: Tue, 18 Jun 2024 16:49:25 +0800 Subject: [PATCH] fix and update: multicluster metrics --- multicluster/manager.go | 3 --- multicluster/metrics/metrics.go | 18 +++++++++++++++--- multicluster/multi_cluster_cache.go | 9 ++++++++- multicluster/multi_cluster_client.go | 19 +++++++++++++++++++ 4 files changed, 42 insertions(+), 7 deletions(-) diff --git a/multicluster/manager.go b/multicluster/manager.go index 6630e9d..547ad96 100644 --- a/multicluster/manager.go +++ b/multicluster/manager.go @@ -19,7 +19,6 @@ package multicluster import ( "context" "errors" - "fmt" "os" "strings" "sync" @@ -254,8 +253,6 @@ func getClusterFilter(cfg *ManagerConfig) (func(string) bool, error) { blockSet = sets.NewString(strings.Split(blockList, ",")...) } - fmt.Printf("allowList: %v, blockList: %v, allowSet: %v, blockSet: %v\n", allowList, blockList, allowSet, blockSet) - if allowSet != nil && blockSet != nil { return nil, errors.New("both cluster allow and block lists are set") } diff --git a/multicluster/metrics/metrics.go b/multicluster/metrics/metrics.go index 239bc98..c1b174d 100644 --- a/multicluster/metrics/metrics.go +++ b/multicluster/metrics/metrics.go @@ -29,32 +29,40 @@ const ( CacheCount = "cache_count" ClientCount = "client_count" ClusterEventCount = "cluster_event_count" + InvalidClusterCount = "invalid_cluster_count" ) var ( cacheCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ Subsystem: MultiClusterSubSystem, Name: CacheCount, - Help: "count the number of cache call", + Help: "Number of Cache methods involked", }, []string{"cluster", "method", "code"}) clientCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ Subsystem: MultiClusterSubSystem, Name: ClientCount, - Help: "count the number of client call", + Help: "Number of Client methods involked", }, []string{"cluster", "method", "code"}) clusterEventCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ Subsystem: MultiClusterSubSystem, Name: ClusterEventCount, - Help: "count the number of cluster event", + Help: "Number of cluster events", }, []string{"cluster", "event", "success"}) + + invalidClusterCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Subsystem: MultiClusterSubSystem, + Name: InvalidClusterCount, + Help: "Number of invalid clusters for Client and Cache", + }, []string{"method", "cluster"}) ) func init() { metrics.Registry.MustRegister(cacheCounter) metrics.Registry.MustRegister(clientCounter) metrics.Registry.MustRegister(clusterEventCounter) + metrics.Registry.MustRegister(invalidClusterCounter) } func NewCacheCountMetrics(cluster, method string, err error) prometheus.Counter { @@ -69,6 +77,10 @@ func NewClusterEventCountMetrics(cluster, event, success string) prometheus.Coun return clusterEventCounter.WithLabelValues(cluster, event, success) } +func NewInvalidClusterCounterMetrics(method, cluster string) prometheus.Counter { + return clusterEventCounter.WithLabelValues(method, cluster) +} + func CodeForError(err error) string { if err == nil { return "200" diff --git a/multicluster/multi_cluster_cache.go b/multicluster/multi_cluster_cache.go index 6dd84fe..695f60c 100644 --- a/multicluster/multi_cluster_cache.go +++ b/multicluster/multi_cluster_cache.go @@ -19,6 +19,7 @@ import ( "context" "fmt" "reflect" + "strings" "sync" "time" @@ -299,6 +300,7 @@ func (mcc *multiClusterCache) WaitForCacheSync(ctx context.Context) bool { if len(clusters) == 0 { clusters = []string{clusterinfo.Fed} } else if err != nil { + metrics.NewInvalidClusterCounterMetrics("WaitForCacheSync", strings.Join(clusters, ",")) mcc.log.Error(err, "failed to get clusters") return false } @@ -319,6 +321,7 @@ func (mcc *multiClusterCache) WaitForCacheSync(ctx context.Context) bool { } else { c, ok := clusterToCache[cluster] if !ok { + metrics.NewInvalidClusterCounterMetrics("WaitForCacheSync", cluster) mcc.log.Info("invalid cluster", "cluster", cluster) continue } @@ -379,6 +382,7 @@ func (mcc *multiClusterCache) Get(ctx context.Context, key types.NamespacedName, cluster, err = getCluster(ctx, obj.GetLabels()) if err != nil { + metrics.NewInvalidClusterCounterMetrics("Get", cluster) mcc.log.Error(err, "failed to get cluster") return err } @@ -394,6 +398,7 @@ func (mcc *multiClusterCache) Get(ctx context.Context, key types.NamespacedName, clusterCache, ok := clusterToCache[cluster] if !ok { + metrics.NewInvalidClusterCounterMetrics("Get", cluster) return fmt.Errorf("unable to get: %v because of unknown cluster: %s for the cache", key, cluster) } return clusterCache.Get(ctx, key, obj) @@ -406,6 +411,7 @@ func (mcc *multiClusterCache) List(ctx context.Context, list client.ObjectList, clusters, _, err := mcc.getClusters(ctx) if err != nil { + metrics.NewInvalidClusterCounterMetrics("Get", strings.Join(clusters, ",")) mcc.log.Error(err, "failed to get clusters") return err } @@ -427,13 +433,14 @@ func (mcc *multiClusterCache) List(ctx context.Context, list client.ObjectList, var ok bool c, ok = clusterToCache[cluster] if !ok { + metrics.NewInvalidClusterCounterMetrics("List", cluster) return fmt.Errorf("unable to list because of unknown cluster: %s for the cache", cluster) } } listObj := list.DeepCopyObject().(client.ObjectList) err = c.List(ctx, listObj, opts...) - metrics.NewClientCountMetrics(cluster, "List", err).Inc() + metrics.NewCacheCountMetrics(cluster, "List", err).Inc() if err != nil { return err } diff --git a/multicluster/multi_cluster_client.go b/multicluster/multi_cluster_client.go index 536b57d..c643e15 100644 --- a/multicluster/multi_cluster_client.go +++ b/multicluster/multi_cluster_client.go @@ -19,6 +19,7 @@ package multicluster import ( "context" "fmt" + "strings" "sync" "github.com/go-logr/logr" @@ -142,6 +143,7 @@ func (mcc *multiClusterClient) Create(ctx context.Context, obj client.Object, op // Get cluster info from context or labels, and delete it from labels because we should not write it into apiserver cluster, err = getThenDeleteCluster(ctx, obj.GetLabels()) if err != nil { + metrics.NewInvalidClusterCounterMetrics("Create", cluster) mcc.log.Error(err, "failed to get cluster") return err } @@ -155,6 +157,7 @@ func (mcc *multiClusterClient) Create(ctx context.Context, obj client.Object, op clusterClient, ok := mcc.clusterToClient[cluster] if !ok { + metrics.NewInvalidClusterCounterMetrics("Create", cluster) return fmt.Errorf("unable to create: %v because of unknown cluster: %s for the client", obj, cluster) } return clusterClient.Create(ctx, obj, opts...) @@ -169,6 +172,7 @@ func (mcc *multiClusterClient) Delete(ctx context.Context, obj client.Object, op cluster, err = getCluster(ctx, obj.GetLabels()) if err != nil { + metrics.NewInvalidClusterCounterMetrics("Delete", cluster) mcc.log.Error(err, "failed to get cluster") return err } @@ -182,6 +186,7 @@ func (mcc *multiClusterClient) Delete(ctx context.Context, obj client.Object, op clusterClient, ok := mcc.clusterToClient[cluster] if !ok { + metrics.NewInvalidClusterCounterMetrics("Delete", cluster) return fmt.Errorf("unable to delete: %v because of unknown cluster: %s for the client", obj, cluster) } return clusterClient.Delete(ctx, obj, opts...) @@ -195,6 +200,7 @@ func (mcc *multiClusterClient) DeleteAllOf(ctx context.Context, obj client.Objec cluster, err = getCluster(ctx, obj.GetLabels()) if err != nil { + metrics.NewInvalidClusterCounterMetrics("DeleteAllOf", cluster) mcc.log.Error(err, "failed to get cluster") return err } @@ -208,6 +214,7 @@ func (mcc *multiClusterClient) DeleteAllOf(ctx context.Context, obj client.Objec clusterClient, ok := mcc.clusterToClient[cluster] if !ok { + metrics.NewInvalidClusterCounterMetrics("DeleteAllOf", cluster) err = fmt.Errorf("unable to deleteAllOf: %v because of unknown cluster: %s for the client", obj, cluster) return } @@ -226,6 +233,7 @@ func (mcc *multiClusterClient) Get(ctx context.Context, key types.NamespacedName cluster, err = getCluster(ctx, obj.GetLabels()) if err != nil { + metrics.NewInvalidClusterCounterMetrics("Get", cluster) mcc.log.Error(err, "failed to get cluster") return err } @@ -239,6 +247,7 @@ func (mcc *multiClusterClient) Get(ctx context.Context, key types.NamespacedName clusterClient, ok := mcc.clusterToClient[cluster] if !ok { + metrics.NewInvalidClusterCounterMetrics("Get", cluster) return fmt.Errorf("unable to get: %v because of unknown cluster: %s for the client", obj, cluster) } return clusterClient.Get(ctx, key, obj) @@ -251,6 +260,7 @@ func (mcc *multiClusterClient) List(ctx context.Context, list client.ObjectList, clusters, err := mcc.getClusterNames(ctx) if err != nil { + metrics.NewInvalidClusterCounterMetrics("List", strings.Join(clusters, ",")) mcc.log.Error(err, "failed to get clusters") return err } @@ -273,6 +283,7 @@ func (mcc *multiClusterClient) List(ctx context.Context, list client.ObjectList, var ok bool c, ok = mcc.clusterToClient[cluster] if !ok { + metrics.NewInvalidClusterCounterMetrics("List", cluster) return fmt.Errorf("unable to list because of unknown cluster: %s for the client", cluster) } } @@ -314,6 +325,7 @@ func (mcc *multiClusterClient) Patch(ctx context.Context, obj client.Object, pat // Get cluster info from context or labels, and delete it from labels because we should not write it into apiserver cluster, err = getThenDeleteCluster(ctx, obj.GetLabels()) if err != nil { + metrics.NewInvalidClusterCounterMetrics("Patch", cluster) mcc.log.Error(err, "failed to get cluster") return err } @@ -327,6 +339,7 @@ func (mcc *multiClusterClient) Patch(ctx context.Context, obj client.Object, pat clusterClient, ok := mcc.clusterToClient[cluster] if !ok { + metrics.NewInvalidClusterCounterMetrics("Patch", cluster) return fmt.Errorf("unable to patch: %v because of unknown cluster: %v for the client", obj, cluster) } return clusterClient.Patch(ctx, obj, patch, opts...) @@ -342,6 +355,7 @@ func (mcc *multiClusterClient) Update(ctx context.Context, obj client.Object, op // Get cluster info from context or labels, and delete it from labels because we should not write it into apiserver cluster, err = getThenDeleteCluster(ctx, obj.GetLabels()) if err != nil { + metrics.NewInvalidClusterCounterMetrics("Update", cluster) mcc.log.Error(err, "failed to get cluster") return err } @@ -355,6 +369,7 @@ func (mcc *multiClusterClient) Update(ctx context.Context, obj client.Object, op clusterClient, ok := mcc.clusterToClient[cluster] if !ok { + metrics.NewInvalidClusterCounterMetrics("Update", cluster) err = fmt.Errorf("unable to update: %v because of unknown cluster: %s for the client", obj, cluster) return } @@ -393,6 +408,7 @@ func (sw *statusWriter) Update(ctx context.Context, obj client.Object, opts ...c // Get cluster info from context or labels, and delete it from labels because we should not write it into apiserver cluster, err = getThenDeleteCluster(ctx, obj.GetLabels()) if err != nil { + metrics.NewInvalidClusterCounterMetrics("StatusUpdate", cluster) sw.log.Error(err, "failed to get cluster") return err } @@ -403,6 +419,7 @@ func (sw *statusWriter) Update(ctx context.Context, obj client.Object, opts ...c clusterClient, ok := sw.clusterToClient[cluster] if !ok { + metrics.NewInvalidClusterCounterMetrics("StatusUpdate", cluster) return fmt.Errorf("unable to update: %v because of unknown cluster: %s for the client", obj, cluster) } return clusterClient.Status().Update(ctx, obj, opts...) @@ -418,6 +435,7 @@ func (sw *statusWriter) Patch(ctx context.Context, obj client.Object, patch clie // Get cluster info from context or labels, and delete it from labels because we should not write it into apiserver cluster, err = getThenDeleteCluster(ctx, obj.GetLabels()) if err != nil { + metrics.NewInvalidClusterCounterMetrics("StatusPatch", cluster) sw.log.Error(err, "failed to get cluster") return err } @@ -428,6 +446,7 @@ func (sw *statusWriter) Patch(ctx context.Context, obj client.Object, patch clie clusterClient, ok := sw.clusterToClient[cluster] if !ok { + metrics.NewInvalidClusterCounterMetrics("StatusPatch", cluster) return fmt.Errorf("unable to update: %v because of unknown cluster: %s for the client", obj, cluster) } return clusterClient.Status().Patch(ctx, obj, patch, opts...)