Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Neg count metrics #2177

Merged
merged 2 commits into from
Jun 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 9 additions & 15 deletions pkg/neg/metrics/metricscollector/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,28 +139,22 @@ var (
},
[]string{"result"},
)

negsManagedCount = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: negControllerSubsystem,
Name: "managed_neg_count",
Help: "Number of NEGs the Neg Controller Manages",
},
[]string{"location", "endpoint_type"},
)
)

type syncerState struct {
lastSyncResult negtypes.Reason
inErrorState bool
}

// listAllSyncerStates lists all possible states for syncers.
func listAllSyncerStates() []syncerState {
var syncerStates []syncerState
// For error state errors, we should expect the syncer also in error state.
for _, state := range negtypes.ListErrorStates() {
syncerStates = append(syncerStates, syncerState{lastSyncResult: state, inErrorState: true})
}

for _, state := range negtypes.ListNonErrorStates() {
syncerStates = append(syncerStates, syncerState{lastSyncResult: state, inErrorState: true})
syncerStates = append(syncerStates, syncerState{lastSyncResult: state, inErrorState: false})
}
return syncerStates
}

type syncerStateCount map[syncerState]int

// LabelPropagationStat contains stats related to label propagation.
Expand Down
52 changes: 45 additions & 7 deletions pkg/neg/metrics/metricscollector/metrics_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ func RegisterMetrics() {
prometheus.MustRegister(DualStackMigrationServiceCount)
prometheus.MustRegister(SyncerCountByEndpointType)
prometheus.MustRegister(syncerSyncResult)
prometheus.MustRegister(negsManagedCount)
})
}

Expand All @@ -53,6 +54,13 @@ type SyncerMetricsCollector interface {
// UpdateSyncerEPMetrics update the endpoint and endpointSlice count for the given syncer
UpdateSyncerEPMetrics(key negtypes.NegSyncerKey, endpointCount, endpointSliceCount negtypes.StateCountMap)
SetLabelPropagationStats(key negtypes.NegSyncerKey, labelstatLabelPropagationStats LabelPropagationStats)
// Updates the number of negs per syncer per zone
UpdateSyncerNegCount(key negtypes.NegSyncerKey, negByLocation map[string]int)
}

type negLocTypeKey struct {
location string
endpointType string
}

type SyncerMetrics struct {
Expand All @@ -76,6 +84,8 @@ type SyncerMetrics struct {
// Stores the count of various kinds of endpoints which each syncer manages.
// Refer neg/metrics.go for the kinds of endpoints.
endpointsCountPerType map[negtypes.NegSyncerKey]map[string]int
//Stores the number of NEGs the NEG controller is managed based on location
syncerNegCount map[negtypes.NegSyncerKey]map[string]int

// logger logs message related to NegMetricsCollector
logger klog.Logger
Expand All @@ -91,6 +101,7 @@ func NewNegMetricsCollector(exportInterval time.Duration, logger klog.Logger) *S
dualStackMigrationStartTime: make(map[negtypes.NegSyncerKey]time.Time),
dualStackMigrationEndTime: make(map[negtypes.NegSyncerKey]time.Time),
endpointsCountPerType: make(map[negtypes.NegSyncerKey]map[string]int),
syncerNegCount: make(map[negtypes.NegSyncerKey]map[string]int),
clock: clock.RealClock{},
metricsInterval: exportInterval,
logger: logger.WithName("NegMetricsCollector"),
Expand Down Expand Up @@ -119,7 +130,11 @@ func (sm *SyncerMetrics) export() {
NumberOfEndpoints.WithLabelValues(epWithAnnotation).Set(float64(lpMetrics.EndpointsWithAnnotation))

stateCount, syncerCount := sm.computeSyncerStateMetrics()
PublishSyncerStateMetrics(stateCount)
//Reset metric so non-existent keys are now 0
SyncerCountBySyncResult.Reset()
for syncerState, count := range stateCount {
SyncerCountBySyncResult.WithLabelValues(string(syncerState.lastSyncResult), strconv.FormatBool(syncerState.inErrorState)).Set(float64(count))
}

epStateCount, epsStateCount, epCount, epsCount := sm.computeEndpointStateMetrics()
for state, count := range epStateCount {
Expand All @@ -129,10 +144,18 @@ func (sm *SyncerMetrics) export() {
syncerEndpointSliceState.WithLabelValues(string(state)).Set(float64(count))
}

negCounts := sm.computeNegCounts()
//Clear existing metrics (ensures that keys that don't exist anymore are reset)
negsManagedCount.Reset()
for key, count := range negCounts {
negsManagedCount.WithLabelValues(key.location, key.endpointType).Set(float64(count))
}

sm.logger.V(3).Info("Exporting syncer related metrics", "Syncer count", syncerCount,
"Network Endpoint Count", lpMetrics.NumberOfEndpoints,
"Endpoint Count From EPS", epCount,
"Endpoint Slice Count", epsCount,
"NEG Count", negCounts,
)

finishedDurations, longestUnfinishedDurations := sm.computeDualStackMigrationDurations()
Expand Down Expand Up @@ -207,6 +230,7 @@ func (sm *SyncerMetrics) DeleteSyncer(key negtypes.NegSyncerKey) {
delete(sm.dualStackMigrationStartTime, key)
delete(sm.dualStackMigrationEndTime, key)
delete(sm.endpointsCountPerType, key)
delete(sm.syncerNegCount, key)
}

// computeLabelMetrics aggregates label propagation metrics.
Expand Down Expand Up @@ -384,11 +408,25 @@ func (sm *SyncerMetrics) computeDualStackMigrationCounts() (map[string]int, int,
return syncerCountByEndpointType, migrationEndpointCount, migrationServices.Len()
}

func PublishSyncerStateMetrics(stateCount syncerStateCount) {
// Iterate to initialize all possible syncer state values.
for _, syncerState := range listAllSyncerStates() {
SyncerCountBySyncResult.WithLabelValues(
string(syncerState.lastSyncResult), strconv.FormatBool(syncerState.inErrorState)).
Set(float64(stateCount[syncerState]))
func (sm *SyncerMetrics) UpdateSyncerNegCount(key negtypes.NegSyncerKey, negsByLocation map[string]int) {
sm.mu.Lock()
defer sm.mu.Unlock()

sm.syncerNegCount[key] = negsByLocation
}

func (sm *SyncerMetrics) computeNegCounts() map[negLocTypeKey]int {
sm.mu.Lock()
defer sm.mu.Unlock()

negCountByLocation := make(map[negLocTypeKey]int)

for syncerKey, syncerNegCount := range sm.syncerNegCount {
for location, count := range syncerNegCount {
key := negLocTypeKey{location: location, endpointType: string(syncerKey.NegType)}
negCountByLocation[key] += count
}
}

return negCountByLocation
}
97 changes: 97 additions & 0 deletions pkg/neg/metrics/metricscollector/metrics_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,103 @@ func TestComputeLabelMetrics(t *testing.T) {
}
}

func TestComputeNegCounts(t *testing.T) {
collector := NewNegMetricsCollector(10*time.Second, klog.TODO())
l7Syncer1 := negtypes.NegSyncerKey{
Namespace: "ns1",
Name: "svc-1",
NegName: "neg-l7-1",
NegType: negtypes.VmIpPortEndpointType,
EpCalculatorMode: negtypes.L7Mode,
}
l7Syncer2 := negtypes.NegSyncerKey{
Namespace: "ns1",
Name: "svc-2",
NegName: "neg-l7-2",
NegType: negtypes.VmIpPortEndpointType,
EpCalculatorMode: negtypes.L7Mode,
}
l4Syncer1 := negtypes.NegSyncerKey{
Namespace: "ns2",
Name: "svc-1",
NegName: "neg-l4-1",
NegType: negtypes.VmIpEndpointType,
EpCalculatorMode: negtypes.L7Mode,
}
l4Syncer2 := negtypes.NegSyncerKey{
Namespace: "ns2",
Name: "svc-2",
NegName: "neg-l4-2",
NegType: negtypes.VmIpEndpointType,
EpCalculatorMode: negtypes.L7Mode,
}
for _, tc := range []struct {
desc string
syncerNegCount map[negtypes.NegSyncerKey]map[string]int
expect map[negLocTypeKey]int
}{
{
desc: "Empty Data",
syncerNegCount: map[negtypes.NegSyncerKey]map[string]int{},
expect: map[negLocTypeKey]int{},
},
{
desc: "Single syncers for each type",
syncerNegCount: map[negtypes.NegSyncerKey]map[string]int{
l7Syncer1: map[string]int{
"zone1": 1,
"zone2": 1,
},
l4Syncer1: map[string]int{
"zone1": 1,
"zone3": 1,
},
},
expect: map[negLocTypeKey]int{
negLocTypeKey{location: "zone1", endpointType: string(negtypes.VmIpPortEndpointType)}: 1,
negLocTypeKey{location: "zone2", endpointType: string(negtypes.VmIpPortEndpointType)}: 1,
negLocTypeKey{location: "zone1", endpointType: string(negtypes.VmIpEndpointType)}: 1,
negLocTypeKey{location: "zone3", endpointType: string(negtypes.VmIpEndpointType)}: 1,
},
},
{
desc: "Multiple syncers per type",
syncerNegCount: map[negtypes.NegSyncerKey]map[string]int{
l7Syncer1: map[string]int{
"zone1": 1,
"zone2": 1,
},
l7Syncer2: map[string]int{
"zone1": 1,
"zone4": 1,
},
l4Syncer1: map[string]int{
"zone1": 1,
"zone3": 1,
},
l4Syncer2: map[string]int{
"zone2": 1,
"zone3": 1,
},
},
expect: map[negLocTypeKey]int{
negLocTypeKey{location: "zone1", endpointType: string(negtypes.VmIpPortEndpointType)}: 2,
negLocTypeKey{location: "zone2", endpointType: string(negtypes.VmIpPortEndpointType)}: 1,
negLocTypeKey{location: "zone4", endpointType: string(negtypes.VmIpPortEndpointType)}: 1,
negLocTypeKey{location: "zone1", endpointType: string(negtypes.VmIpEndpointType)}: 1,
negLocTypeKey{location: "zone2", endpointType: string(negtypes.VmIpEndpointType)}: 1,
negLocTypeKey{location: "zone3", endpointType: string(negtypes.VmIpEndpointType)}: 2,
},
},
} {
collector.syncerNegCount = tc.syncerNegCount
out := collector.computeNegCounts()
if diff := cmp.Diff(out, tc.expect); diff != "" {
t.Errorf("For test case %s, (-got +want):\n%s", tc.desc, diff)
}
}
}

type fakeClock struct {
clock.Clock
curTime time.Time
Expand Down
3 changes: 3 additions & 0 deletions pkg/neg/syncers/transaction.go
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,7 @@ func (s *transactionSyncer) ensureNetworkEndpointGroups() error {

var errList []error
var negObjRefs []negv1beta1.NegObjectReference
negsByLocation := make(map[string]int)
for _, zone := range zones {
var negObj negv1beta1.NegObjectReference
negObj, err = ensureNetworkEndpointGroup(
Expand All @@ -415,10 +416,12 @@ func (s *transactionSyncer) ensureNetworkEndpointGroups() error {

if s.svcNegClient != nil && err == nil {
negObjRefs = append(negObjRefs, negObj)
negsByLocation[zone]++
}
}

s.updateInitStatus(negObjRefs, errList)
s.syncMetricsCollector.UpdateSyncerNegCount(s.NegSyncerKey, negsByLocation)
return utilerrors.NewAggregate(errList)
}

Expand Down
16 changes: 0 additions & 16 deletions pkg/neg/types/sync_errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,22 +189,6 @@ func ClassifyError(err error) NegSyncError {
return syncErrType
}

// ListErrorStates lists all error-state reasons.
func ListErrorStates() []Reason {
return []Reason{ReasonEPCountsDiffer, ReasonEPNodeMissing, ReasonEPNodeNotFound,
ReasonEPNodeTypeAssertionFailed, ReasonEPPodMissing, ReasonEPPodNotFound,
ReasonEPPodTypeAssertionFailed, ReasonEPPodTerminal, ReasonEPZoneMissing,
ReasonEPSEndpointCountZero, ReasonEPCalculationCountZero, ReasonInvalidAPIResponse,
ReasonInvalidEPAttach, ReasonInvalidEPDetach, ReasonEPIPInvalid, ReasonEPIPNotFromPod,
ReasonEPIPOutOfPodCIDR, ReasonEPServiceNotFound, ReasonEPPodLabelMismatch}
}

// ListErrorStates lists all non error-state reasons.
func ListNonErrorStates() []Reason {
return []Reason{ReasonNegNotFound, ReasonCurrentNegEPNotFound,
ReasonEPSNotFound, ReasonOtherError, ReasonSuccess}
}

// StrategyQuotaError indicates that a quota error was a result of a request using throttling.Strategy.
type StrategyQuotaError struct {
Err error
Expand Down