From 582a8a93629fef0058f8af41d5beeb29b88c3e95 Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Fri, 15 Jul 2022 13:47:17 -0500 Subject: [PATCH] metrics: even classless blocked evals get metrics This PR fixes a bug where blocked evaluations with no class set would not have metrics exported at the dc:class scope. Fixes #13759 --- .changelog/13786.txt | 3 ++ nomad/blocked_evals.go | 10 ++--- nomad/blocked_evals_stats.go | 42 +++++++++++---------- nomad/blocked_evals_stats_test.go | 62 +++++++++++++++++++------------ 4 files changed, 70 insertions(+), 47 deletions(-) create mode 100644 .changelog/13786.txt diff --git a/.changelog/13786.txt b/.changelog/13786.txt new file mode 100644 index 00000000000..17d25b90bd1 --- /dev/null +++ b/.changelog/13786.txt @@ -0,0 +1,3 @@ +```release-note:bug +metrics: Fixed a bug where blocked evals with no class produced no dc:class scope metrics +``` diff --git a/nomad/blocked_evals.go b/nomad/blocked_evals.go index 02ce6c18ff4..2026e6826e3 100644 --- a/nomad/blocked_evals.go +++ b/nomad/blocked_evals.go @@ -4,8 +4,8 @@ import ( "sync" "time" - metrics "github.com/armon/go-metrics" - log "github.com/hashicorp/go-hclog" + "github.com/armon/go-metrics" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" ) @@ -31,7 +31,7 @@ const ( // failed allocation becomes available. type BlockedEvals struct { // logger is the logger to use by the blocked eval tracker. - logger log.Logger + logger hclog.Logger evalBroker *EvalBroker enabled bool @@ -96,7 +96,7 @@ type wrappedEval struct { // NewBlockedEvals creates a new blocked eval tracker that will enqueue // unblocked evals into the passed broker. -func NewBlockedEvals(evalBroker *EvalBroker, logger log.Logger) *BlockedEvals { +func NewBlockedEvals(evalBroker *EvalBroker, logger hclog.Logger) *BlockedEvals { return &BlockedEvals{ logger: logger.Named("blocked_evals"), evalBroker: evalBroker, @@ -727,7 +727,7 @@ func (b *BlockedEvals) EmitStats(period time.Duration, stopCh <-chan struct{}) { metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "job", "memory"}, float32(v.MemoryMB), labels) } - for k, v := range stats.BlockedResources.ByNode { + for k, v := range stats.BlockedResources.ByClassInDC { labels := []metrics.Label{ {Name: "datacenter", Value: k.dc}, {Name: "node_class", Value: k.class}, diff --git a/nomad/blocked_evals_stats.go b/nomad/blocked_evals_stats.go index 00755965183..cf90fb5431a 100644 --- a/nomad/blocked_evals_stats.go +++ b/nomad/blocked_evals_stats.go @@ -24,8 +24,8 @@ type BlockedStats struct { BlockedResources *BlockedResourcesStats } -// node stores information related to nodes. -type node struct { +// classInDC is a coordinate of a specific class in a specific datacenter +type classInDC struct { dc string class string } @@ -65,9 +65,9 @@ func (b *BlockedStats) prune(cutoff time.Time) { } } - for k, v := range b.BlockedResources.ByNode { + for k, v := range b.BlockedResources.ByClassInDC { if shouldPrune(v) { - delete(b.BlockedResources.ByNode, k) + delete(b.BlockedResources.ByClassInDC, k) } } } @@ -89,6 +89,10 @@ func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats { for class := range allocMetrics.ClassExhausted { classes[class] = struct{}{} } + if len(allocMetrics.ClassExhausted) == 0 { + // some evaluations have no class + classes[""] = struct{}{} + } for _, r := range allocMetrics.ResourcesExhausted { resources.CPU += r.CPU resources.MemoryMB += r.MemoryMB @@ -99,32 +103,32 @@ func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats { nsID := structs.NewNamespacedID(eval.JobID, eval.Namespace) byJob[nsID] = resources - byNodeInfo := make(map[node]BlockedResourcesSummary) + byClassInDC := make(map[classInDC]BlockedResourcesSummary) for dc := range dcs { for class := range classes { - k := node{dc: dc, class: class} - byNodeInfo[k] = resources + k := classInDC{dc: dc, class: class} + byClassInDC[k] = resources } } return &BlockedResourcesStats{ - ByJob: byJob, - ByNode: byNodeInfo, + ByJob: byJob, + ByClassInDC: byClassInDC, } } // BlockedResourcesStats stores resources requested by blocked evaluations, // tracked both by job and by node. type BlockedResourcesStats struct { - ByJob map[structs.NamespacedID]BlockedResourcesSummary - ByNode map[node]BlockedResourcesSummary + ByJob map[structs.NamespacedID]BlockedResourcesSummary + ByClassInDC map[classInDC]BlockedResourcesSummary } // NewBlockedResourcesStats returns a new BlockedResourcesStats. func NewBlockedResourcesStats() *BlockedResourcesStats { return &BlockedResourcesStats{ - ByJob: make(map[structs.NamespacedID]BlockedResourcesSummary), - ByNode: make(map[node]BlockedResourcesSummary), + ByJob: make(map[structs.NamespacedID]BlockedResourcesSummary), + ByClassInDC: make(map[classInDC]BlockedResourcesSummary), } } @@ -136,8 +140,8 @@ func (b *BlockedResourcesStats) Copy() *BlockedResourcesStats { result.ByJob[k] = v // value copy } - for k, v := range b.ByNode { - result.ByNode[k] = v // value copy + for k, v := range b.ByClassInDC { + result.ByClassInDC[k] = v // value copy } return result @@ -152,8 +156,8 @@ func (b *BlockedResourcesStats) Add(a *BlockedResourcesStats) *BlockedResourcesS result.ByJob[k] = b.ByJob[k].Add(v) } - for k, v := range a.ByNode { - result.ByNode[k] = b.ByNode[k].Add(v) + for k, v := range a.ByClassInDC { + result.ByClassInDC[k] = b.ByClassInDC[k].Add(v) } return result @@ -168,8 +172,8 @@ func (b *BlockedResourcesStats) Subtract(a *BlockedResourcesStats) *BlockedResou result.ByJob[k] = b.ByJob[k].Subtract(v) } - for k, v := range a.ByNode { - result.ByNode[k] = b.ByNode[k].Subtract(v) + for k, v := range a.ByClassInDC { + result.ByClassInDC[k] = b.ByClassInDC[k].Subtract(v) } return result diff --git a/nomad/blocked_evals_stats_test.go b/nomad/blocked_evals_stats_test.go index 3aaa3f45494..ce3fee6c6f2 100644 --- a/nomad/blocked_evals_stats_test.go +++ b/nomad/blocked_evals_stats_test.go @@ -128,8 +128,8 @@ func TestBlockedResourceStats_New(t *testing.T) { a := NewBlockedResourcesStats() require.NotNil(t, a.ByJob) require.Empty(t, a.ByJob) - require.NotNil(t, a.ByNode) - require.Empty(t, a.ByNode) + require.NotNil(t, a.ByClassInDC) + require.Empty(t, a.ByClassInDC) } var ( @@ -143,15 +143,20 @@ var ( Namespace: "two", } - node1 = node{ + node1 = classInDC{ dc: "dc1", class: "alpha", } - node2 = node{ + node2 = classInDC{ dc: "dc1", class: "beta", } + + node3 = classInDC{ + dc: "dc1", + class: "", // not set + } ) func TestBlockedResourceStats_Copy(t *testing.T) { @@ -166,7 +171,7 @@ func TestBlockedResourceStats_Copy(t *testing.T) { MemoryMB: 256, }, } - a.ByNode = map[node]BlockedResourcesSummary{ + a.ByClassInDC = map[classInDC]BlockedResourcesSummary{ node1: { Timestamp: now1, CPU: 300, @@ -180,7 +185,7 @@ func TestBlockedResourceStats_Copy(t *testing.T) { CPU: 888, MemoryMB: 888, } - c.ByNode[node1] = BlockedResourcesSummary{ + c.ByClassInDC[node1] = BlockedResourcesSummary{ Timestamp: now2, CPU: 999, MemoryMB: 999, @@ -188,7 +193,7 @@ func TestBlockedResourceStats_Copy(t *testing.T) { // underlying data should have been deep copied require.Equal(t, 100, a.ByJob[id1].CPU) - require.Equal(t, 300, a.ByNode[node1].CPU) + require.Equal(t, 300, a.ByClassInDC[node1].CPU) } func TestBlockedResourcesStats_Add(t *testing.T) { @@ -196,7 +201,7 @@ func TestBlockedResourcesStats_Add(t *testing.T) { a.ByJob = map[structs.NamespacedID]BlockedResourcesSummary{ id1: {Timestamp: now(1), CPU: 111, MemoryMB: 222}, } - a.ByNode = map[node]BlockedResourcesSummary{ + a.ByClassInDC = map[classInDC]BlockedResourcesSummary{ node1: {Timestamp: now(2), CPU: 333, MemoryMB: 444}, } @@ -205,7 +210,7 @@ func TestBlockedResourcesStats_Add(t *testing.T) { id1: {Timestamp: now(3), CPU: 200, MemoryMB: 300}, id2: {Timestamp: now(4), CPU: 400, MemoryMB: 500}, } - b.ByNode = map[node]BlockedResourcesSummary{ + b.ByClassInDC = map[classInDC]BlockedResourcesSummary{ node1: {Timestamp: now(5), CPU: 600, MemoryMB: 700}, node2: {Timestamp: now(6), CPU: 800, MemoryMB: 900}, } @@ -218,10 +223,10 @@ func TestBlockedResourcesStats_Add(t *testing.T) { id2: {Timestamp: now(4), CPU: 400, MemoryMB: 500}, }, result.ByJob) - require.Equal(t, map[node]BlockedResourcesSummary{ + require.Equal(t, map[classInDC]BlockedResourcesSummary{ node1: {Timestamp: now(5), CPU: 933, MemoryMB: 1144}, node2: {Timestamp: now(6), CPU: 800, MemoryMB: 900}, - }, result.ByNode) + }, result.ByClassInDC) }) // make sure we handle zeros in both directions @@ -233,20 +238,31 @@ func TestBlockedResourcesStats_Add(t *testing.T) { id2: {Timestamp: now(4), CPU: 400, MemoryMB: 500}, }, result.ByJob) - require.Equal(t, map[node]BlockedResourcesSummary{ + require.Equal(t, map[classInDC]BlockedResourcesSummary{ node1: {Timestamp: now(2), CPU: 933, MemoryMB: 1144}, node2: {Timestamp: now(6), CPU: 800, MemoryMB: 900}, - }, result.ByNode) + }, result.ByClassInDC) }) } +func TestBlockedResourcesStats_Add_NoClass(t *testing.T) { + a := NewBlockedResourcesStats() + a.ByClassInDC = map[classInDC]BlockedResourcesSummary{ + node3: {Timestamp: now(1), CPU: 111, MemoryMB: 1111}, + } + result := a.Add(a) + require.Equal(t, map[classInDC]BlockedResourcesSummary{ + node3: {Timestamp: now(1), CPU: 222, MemoryMB: 2222}, + }, result.ByClassInDC) +} + func TestBlockedResourcesStats_Subtract(t *testing.T) { a := NewBlockedResourcesStats() a.ByJob = map[structs.NamespacedID]BlockedResourcesSummary{ id1: {Timestamp: now(1), CPU: 100, MemoryMB: 100}, id2: {Timestamp: now(2), CPU: 200, MemoryMB: 200}, } - a.ByNode = map[node]BlockedResourcesSummary{ + a.ByClassInDC = map[classInDC]BlockedResourcesSummary{ node1: {Timestamp: now(3), CPU: 300, MemoryMB: 300}, node2: {Timestamp: now(4), CPU: 400, MemoryMB: 400}, } @@ -256,7 +272,7 @@ func TestBlockedResourcesStats_Subtract(t *testing.T) { id1: {Timestamp: now(5), CPU: 10, MemoryMB: 11}, id2: {Timestamp: now(6), CPU: 12, MemoryMB: 13}, } - b.ByNode = map[node]BlockedResourcesSummary{ + b.ByClassInDC = map[classInDC]BlockedResourcesSummary{ node1: {Timestamp: now(7), CPU: 14, MemoryMB: 15}, node2: {Timestamp: now(8), CPU: 16, MemoryMB: 17}, } @@ -274,14 +290,14 @@ func TestBlockedResourcesStats_Subtract(t *testing.T) { require.Equal(t, 187, result.ByJob[id2].MemoryMB) // node1 - require.Equal(t, now(7), result.ByNode[node1].Timestamp) - require.Equal(t, 286, result.ByNode[node1].CPU) - require.Equal(t, 285, result.ByNode[node1].MemoryMB) + require.Equal(t, now(7), result.ByClassInDC[node1].Timestamp) + require.Equal(t, 286, result.ByClassInDC[node1].CPU) + require.Equal(t, 285, result.ByClassInDC[node1].MemoryMB) // node2 - require.Equal(t, now(8), result.ByNode[node2].Timestamp) - require.Equal(t, 384, result.ByNode[node2].CPU) - require.Equal(t, 383, result.ByNode[node2].MemoryMB) + require.Equal(t, now(8), result.ByClassInDC[node2].Timestamp) + require.Equal(t, 384, result.ByClassInDC[node2].CPU) + require.Equal(t, 383, result.ByClassInDC[node2].MemoryMB) } // testBlockedEvalsRandomBlockedEval wraps an eval that is randomly generated. @@ -363,9 +379,9 @@ func clearTimestampFromBlockedResourceStats(b *BlockedResourcesStats) { v.Timestamp = time.Time{} b.ByJob[k] = v } - for k, v := range b.ByNode { + for k, v := range b.ByClassInDC { v.Timestamp = time.Time{} - b.ByNode[k] = v + b.ByClassInDC[k] = v } }