Skip to content

Commit

Permalink
fix and update: multicluster metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
shaofan-hs committed Jun 18, 2024
1 parent 66f7b8b commit eee6491
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 7 deletions.
3 changes: 0 additions & 3 deletions multicluster/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package multicluster
import (
"context"
"errors"
"fmt"
"os"
"strings"
"sync"
Expand Down Expand Up @@ -254,8 +253,6 @@ func getClusterFilter(cfg *ManagerConfig) (func(string) bool, error) {
blockSet = sets.NewString(strings.Split(blockList, ",")...)
}

fmt.Printf("allowList: %v, blockList: %v, allowSet: %v, blockSet: %v\n", allowList, blockList, allowSet, blockSet)

if allowSet != nil && blockSet != nil {
return nil, errors.New("both cluster allow and block lists are set")
}
Expand Down
18 changes: 15 additions & 3 deletions multicluster/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,32 +29,40 @@ const (
CacheCount = "cache_count"
ClientCount = "client_count"
ClusterEventCount = "cluster_event_count"
InvalidClusterCount = "invalid_cluster_count"
)

var (
cacheCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Subsystem: MultiClusterSubSystem,
Name: CacheCount,
Help: "count the number of cache call",
Help: "Number of Cache methods involked",
}, []string{"cluster", "method", "code"})

clientCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Subsystem: MultiClusterSubSystem,
Name: ClientCount,
Help: "count the number of client call",
Help: "Number of Client methods involked",
}, []string{"cluster", "method", "code"})

clusterEventCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Subsystem: MultiClusterSubSystem,
Name: ClusterEventCount,
Help: "count the number of cluster event",
Help: "Number of cluster events",
}, []string{"cluster", "event", "success"})

invalidClusterCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Subsystem: MultiClusterSubSystem,
Name: InvalidClusterCount,
Help: "Number of invalid clusters for Client and Cache",
}, []string{"method", "cluster"})
)

func init() {
metrics.Registry.MustRegister(cacheCounter)
metrics.Registry.MustRegister(clientCounter)
metrics.Registry.MustRegister(clusterEventCounter)
metrics.Registry.MustRegister(invalidClusterCounter)
}

func NewCacheCountMetrics(cluster, method string, err error) prometheus.Counter {
Expand All @@ -69,6 +77,10 @@ func NewClusterEventCountMetrics(cluster, event, success string) prometheus.Coun
return clusterEventCounter.WithLabelValues(cluster, event, success)
}

func NewInvalidClusterCounterMetrics(method, cluster string) prometheus.Counter {
return clusterEventCounter.WithLabelValues(method, cluster)
}

func CodeForError(err error) string {
if err == nil {
return "200"
Expand Down
9 changes: 8 additions & 1 deletion multicluster/multi_cluster_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"context"
"fmt"
"reflect"
"strings"
"sync"
"time"

Expand Down Expand Up @@ -299,6 +300,7 @@ func (mcc *multiClusterCache) WaitForCacheSync(ctx context.Context) bool {
if len(clusters) == 0 {
clusters = []string{clusterinfo.Fed}
} else if err != nil {
metrics.NewInvalidClusterCounterMetrics("WaitForCacheSync", strings.Join(clusters, ","))
mcc.log.Error(err, "failed to get clusters")
return false
}
Expand All @@ -319,6 +321,7 @@ func (mcc *multiClusterCache) WaitForCacheSync(ctx context.Context) bool {
} else {
c, ok := clusterToCache[cluster]
if !ok {
metrics.NewInvalidClusterCounterMetrics("WaitForCacheSync", cluster)
mcc.log.Info("invalid cluster", "cluster", cluster)
continue
}
Expand Down Expand Up @@ -379,6 +382,7 @@ func (mcc *multiClusterCache) Get(ctx context.Context, key types.NamespacedName,

cluster, err = getCluster(ctx, obj.GetLabels())
if err != nil {
metrics.NewInvalidClusterCounterMetrics("Get", cluster)
mcc.log.Error(err, "failed to get cluster")
return err
}
Expand All @@ -394,6 +398,7 @@ func (mcc *multiClusterCache) Get(ctx context.Context, key types.NamespacedName,

clusterCache, ok := clusterToCache[cluster]
if !ok {
metrics.NewInvalidClusterCounterMetrics("Get", cluster)
return fmt.Errorf("unable to get: %v because of unknown cluster: %s for the cache", key, cluster)
}
return clusterCache.Get(ctx, key, obj)
Expand All @@ -406,6 +411,7 @@ func (mcc *multiClusterCache) List(ctx context.Context, list client.ObjectList,

clusters, _, err := mcc.getClusters(ctx)
if err != nil {
metrics.NewInvalidClusterCounterMetrics("Get", strings.Join(clusters, ","))
mcc.log.Error(err, "failed to get clusters")
return err
}
Expand All @@ -427,13 +433,14 @@ func (mcc *multiClusterCache) List(ctx context.Context, list client.ObjectList,
var ok bool
c, ok = clusterToCache[cluster]
if !ok {
metrics.NewInvalidClusterCounterMetrics("List", cluster)
return fmt.Errorf("unable to list because of unknown cluster: %s for the cache", cluster)
}
}

listObj := list.DeepCopyObject().(client.ObjectList)
err = c.List(ctx, listObj, opts...)
metrics.NewClientCountMetrics(cluster, "List", err).Inc()
metrics.NewCacheCountMetrics(cluster, "List", err).Inc()
if err != nil {
return err
}
Expand Down
19 changes: 19 additions & 0 deletions multicluster/multi_cluster_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package multicluster
import (
"context"
"fmt"
"strings"
"sync"

"github.com/go-logr/logr"
Expand Down Expand Up @@ -142,6 +143,7 @@ func (mcc *multiClusterClient) Create(ctx context.Context, obj client.Object, op
// Get cluster info from context or labels, and delete it from labels because we should not write it into apiserver
cluster, err = getThenDeleteCluster(ctx, obj.GetLabels())
if err != nil {
metrics.NewInvalidClusterCounterMetrics("Create", cluster)
mcc.log.Error(err, "failed to get cluster")
return err
}
Expand All @@ -155,6 +157,7 @@ func (mcc *multiClusterClient) Create(ctx context.Context, obj client.Object, op

clusterClient, ok := mcc.clusterToClient[cluster]
if !ok {
metrics.NewInvalidClusterCounterMetrics("Create", cluster)
return fmt.Errorf("unable to create: %v because of unknown cluster: %s for the client", obj, cluster)
}
return clusterClient.Create(ctx, obj, opts...)
Expand All @@ -169,6 +172,7 @@ func (mcc *multiClusterClient) Delete(ctx context.Context, obj client.Object, op

cluster, err = getCluster(ctx, obj.GetLabels())
if err != nil {
metrics.NewInvalidClusterCounterMetrics("Delete", cluster)
mcc.log.Error(err, "failed to get cluster")
return err
}
Expand All @@ -182,6 +186,7 @@ func (mcc *multiClusterClient) Delete(ctx context.Context, obj client.Object, op

clusterClient, ok := mcc.clusterToClient[cluster]
if !ok {
metrics.NewInvalidClusterCounterMetrics("Delete", cluster)
return fmt.Errorf("unable to delete: %v because of unknown cluster: %s for the client", obj, cluster)
}
return clusterClient.Delete(ctx, obj, opts...)
Expand All @@ -195,6 +200,7 @@ func (mcc *multiClusterClient) DeleteAllOf(ctx context.Context, obj client.Objec

cluster, err = getCluster(ctx, obj.GetLabels())
if err != nil {
metrics.NewInvalidClusterCounterMetrics("DeleteAllOf", cluster)
mcc.log.Error(err, "failed to get cluster")
return err
}
Expand All @@ -208,6 +214,7 @@ func (mcc *multiClusterClient) DeleteAllOf(ctx context.Context, obj client.Objec

clusterClient, ok := mcc.clusterToClient[cluster]
if !ok {
metrics.NewInvalidClusterCounterMetrics("DeleteAllOf", cluster)
err = fmt.Errorf("unable to deleteAllOf: %v because of unknown cluster: %s for the client", obj, cluster)
return
}
Expand All @@ -226,6 +233,7 @@ func (mcc *multiClusterClient) Get(ctx context.Context, key types.NamespacedName

cluster, err = getCluster(ctx, obj.GetLabels())
if err != nil {
metrics.NewInvalidClusterCounterMetrics("Get", cluster)
mcc.log.Error(err, "failed to get cluster")
return err
}
Expand All @@ -239,6 +247,7 @@ func (mcc *multiClusterClient) Get(ctx context.Context, key types.NamespacedName

clusterClient, ok := mcc.clusterToClient[cluster]
if !ok {
metrics.NewInvalidClusterCounterMetrics("Get", cluster)
return fmt.Errorf("unable to get: %v because of unknown cluster: %s for the client", obj, cluster)
}
return clusterClient.Get(ctx, key, obj)
Expand All @@ -251,6 +260,7 @@ func (mcc *multiClusterClient) List(ctx context.Context, list client.ObjectList,

clusters, err := mcc.getClusterNames(ctx)
if err != nil {
metrics.NewInvalidClusterCounterMetrics("List", strings.Join(clusters, ","))
mcc.log.Error(err, "failed to get clusters")
return err
}
Expand All @@ -273,6 +283,7 @@ func (mcc *multiClusterClient) List(ctx context.Context, list client.ObjectList,
var ok bool
c, ok = mcc.clusterToClient[cluster]
if !ok {
metrics.NewInvalidClusterCounterMetrics("List", cluster)
return fmt.Errorf("unable to list because of unknown cluster: %s for the client", cluster)
}
}
Expand Down Expand Up @@ -314,6 +325,7 @@ func (mcc *multiClusterClient) Patch(ctx context.Context, obj client.Object, pat
// Get cluster info from context or labels, and delete it from labels because we should not write it into apiserver
cluster, err = getThenDeleteCluster(ctx, obj.GetLabels())
if err != nil {
metrics.NewInvalidClusterCounterMetrics("Patch", cluster)
mcc.log.Error(err, "failed to get cluster")
return err
}
Expand All @@ -327,6 +339,7 @@ func (mcc *multiClusterClient) Patch(ctx context.Context, obj client.Object, pat

clusterClient, ok := mcc.clusterToClient[cluster]
if !ok {
metrics.NewInvalidClusterCounterMetrics("Patch", cluster)
return fmt.Errorf("unable to patch: %v because of unknown cluster: %v for the client", obj, cluster)
}
return clusterClient.Patch(ctx, obj, patch, opts...)
Expand All @@ -342,6 +355,7 @@ func (mcc *multiClusterClient) Update(ctx context.Context, obj client.Object, op
// Get cluster info from context or labels, and delete it from labels because we should not write it into apiserver
cluster, err = getThenDeleteCluster(ctx, obj.GetLabels())
if err != nil {
metrics.NewInvalidClusterCounterMetrics("Update", cluster)
mcc.log.Error(err, "failed to get cluster")
return err
}
Expand All @@ -355,6 +369,7 @@ func (mcc *multiClusterClient) Update(ctx context.Context, obj client.Object, op

clusterClient, ok := mcc.clusterToClient[cluster]
if !ok {
metrics.NewInvalidClusterCounterMetrics("Update", cluster)
err = fmt.Errorf("unable to update: %v because of unknown cluster: %s for the client", obj, cluster)
return
}
Expand Down Expand Up @@ -393,6 +408,7 @@ func (sw *statusWriter) Update(ctx context.Context, obj client.Object, opts ...c
// Get cluster info from context or labels, and delete it from labels because we should not write it into apiserver
cluster, err = getThenDeleteCluster(ctx, obj.GetLabels())
if err != nil {
metrics.NewInvalidClusterCounterMetrics("StatusUpdate", cluster)
sw.log.Error(err, "failed to get cluster")
return err
}
Expand All @@ -403,6 +419,7 @@ func (sw *statusWriter) Update(ctx context.Context, obj client.Object, opts ...c

clusterClient, ok := sw.clusterToClient[cluster]
if !ok {
metrics.NewInvalidClusterCounterMetrics("StatusUpdate", cluster)
return fmt.Errorf("unable to update: %v because of unknown cluster: %s for the client", obj, cluster)
}
return clusterClient.Status().Update(ctx, obj, opts...)
Expand All @@ -418,6 +435,7 @@ func (sw *statusWriter) Patch(ctx context.Context, obj client.Object, patch clie
// Get cluster info from context or labels, and delete it from labels because we should not write it into apiserver
cluster, err = getThenDeleteCluster(ctx, obj.GetLabels())
if err != nil {
metrics.NewInvalidClusterCounterMetrics("StatusPatch", cluster)
sw.log.Error(err, "failed to get cluster")
return err
}
Expand All @@ -428,6 +446,7 @@ func (sw *statusWriter) Patch(ctx context.Context, obj client.Object, patch clie

clusterClient, ok := sw.clusterToClient[cluster]
if !ok {
metrics.NewInvalidClusterCounterMetrics("StatusPatch", cluster)
return fmt.Errorf("unable to update: %v because of unknown cluster: %s for the client", obj, cluster)
}
return clusterClient.Status().Patch(ctx, obj, patch, opts...)
Expand Down

0 comments on commit eee6491

Please sign in to comment.