-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
client: emit optional telemetry from prerun and prestart hooks.
The Nomad client can now optionally emit telemetry data from the prerun and prestart hooks. This allows operators to monitor and alert on failures and time taken to complete. The new datapoints are: - nomad.client.alloc_hook.prerun.success (counter) - nomad.client.alloc_hook.prerun.failed (counter) - nomad.client.alloc_hook.prerun.elapsed (sample) - nomad.client.task_hook.prestart.success (counter) - nomad.client.task_hook.prestart.failed (counter) - nomad.client.task_hook.prestart.elapsed (sample) The hook execution time is useful to Nomad engineering and will help optimize code where possible and understand job specification impacts on hook performance. Currently only the PreRun and PreStart hooks have telemetry enabled, so we limit the number of new metrics being produced.
- Loading branch information
Showing
20 changed files
with
494 additions
and
57 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
// Copyright (c) HashiCorp, Inc. | ||
// SPDX-License-Identifier: BUSL-1.1 | ||
|
||
package hookstats | ||
|
||
import ( | ||
"time" | ||
|
||
"github.com/armon/go-metrics" | ||
"github.com/hashicorp/nomad/client/allocrunner/interfaces" | ||
) | ||
|
||
// Handler implements interfaces.HookStatsHandler and is used when the operator | ||
// has not disabled hook metrics. | ||
type Handler struct { | ||
baseLabels []metrics.Label | ||
runnerType string | ||
} | ||
|
||
// NewHandler creates a new hook stats handler to be used for emitting hook | ||
// stats for operator alerting and performance identification. The base labels | ||
// should be passed from the client set of labels and the runner type indicates | ||
// if the hooks are run from the alloc or task runner. | ||
func NewHandler(base []metrics.Label, runnerType string) interfaces.HookStatsHandler { | ||
return &Handler{ | ||
baseLabels: base, | ||
runnerType: runnerType, | ||
} | ||
} | ||
|
||
func (h *Handler) Emit(start time.Time, hookName, hookType string, err error) { | ||
|
||
// Add the hook name to the base labels array, so we have a complete set to | ||
// add to the metrics. Operators do not want this as part of the metric | ||
// name due to cardinality control. | ||
labels := h.baseLabels | ||
labels = append(labels, metrics.Label{Name: "hook_name", Value: hookName}) | ||
|
||
metrics.MeasureSinceWithLabels([]string{"client", h.runnerType, hookType, "elapsed"}, start, labels) | ||
if err != nil { | ||
metrics.IncrCounterWithLabels([]string{"client", h.runnerType, hookType, "failed"}, 1, labels) | ||
} else { | ||
metrics.IncrCounterWithLabels([]string{"client", h.runnerType, hookType, "success"}, 1, labels) | ||
} | ||
} | ||
|
||
// NoOpHandler implements interfaces.HookStatsHandler and is used when the | ||
// operator has disabled hook metrics. | ||
type NoOpHandler struct{} | ||
|
||
func NewNoOpHandler() interfaces.HookStatsHandler { return &NoOpHandler{} } | ||
|
||
func (n *NoOpHandler) Emit(_ time.Time, _, _ string, _ error) {} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
// Copyright (c) HashiCorp, Inc. | ||
// SPDX-License-Identifier: BUSL-1.1 | ||
|
||
package hookstats | ||
|
||
import ( | ||
"errors" | ||
"testing" | ||
"time" | ||
|
||
"github.com/armon/go-metrics" | ||
"github.com/hashicorp/nomad/ci" | ||
"github.com/shoenig/test/must" | ||
) | ||
|
||
func TestHandler(t *testing.T) { | ||
ci.Parallel(t) | ||
|
||
// Generate base labels that represent what an operator would see and then | ||
// create out new handler to interact with. | ||
baseLabels := []metrics.Label{ | ||
{Name: "datacenter", Value: "dc1"}, | ||
{Name: "node_class", Value: "none"}, | ||
{Name: "node_pool", Value: "default"}, | ||
{Name: "namespace", Value: "default"}, | ||
{Name: "host", Value: "client-5d3c"}, | ||
{Name: "node_id", Value: "35db24e7-0a55-80d2-2279-e022c37cc591"}, | ||
} | ||
newHandler := NewHandler(baseLabels, "test_hook") | ||
|
||
// The data stored is within the in-memory sink as map entries, so we need | ||
// to know the key names to pull this out correctly. Build those now. | ||
var metricKeySuffix, sampleName, counterSuccessName, counterFailureName string | ||
|
||
for _, label := range baseLabels { | ||
metricKeySuffix += ";" + label.Name + "=" + label.Value | ||
} | ||
|
||
metricKeySuffix += ";" + "hook_name=test_hook_name" | ||
sampleName = "nomad_test.client.test_hook.prerun.elapsed" + metricKeySuffix | ||
counterSuccessName = "nomad_test.client.test_hook.prerun.success" + metricKeySuffix | ||
counterFailureName = "nomad_test.client.test_hook.prerun.failed" + metricKeySuffix | ||
|
||
// Create an in-memory sink and global, so we can actually look at and test | ||
// the metrics we emit. | ||
inMemorySink := metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond) | ||
|
||
_, err := metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink) | ||
must.NoError(t, err) | ||
|
||
// Emit hook related metrics where the supplied error is nil and check that | ||
// the data is as expected. | ||
newHandler.Emit(time.Now(), "test_hook_name", "prerun", nil) | ||
|
||
sinkData := inMemorySink.Data() | ||
must.Len(t, 1, sinkData) | ||
must.MapContainsKey(t, sinkData[0].Counters, counterSuccessName) | ||
must.MapContainsKey(t, sinkData[0].Samples, sampleName) | ||
|
||
successCounter := sinkData[0].Counters[counterSuccessName] | ||
must.Eq(t, 1, successCounter.Count) | ||
must.Eq(t, 1, successCounter.Sum) | ||
|
||
sample1 := sinkData[0].Samples[sampleName] | ||
must.Eq(t, 1, sample1.Count) | ||
must.True(t, sample1.Sum > 0) | ||
|
||
// Create a new in-memory sink and global collector to ensure we don't have | ||
// leftovers from the previous test. | ||
inMemorySink = metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond) | ||
|
||
_, err = metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink) | ||
must.NoError(t, err) | ||
|
||
// Emit a hook related metrics where the supplied error is non-nil and | ||
// check that the data is as expected. | ||
newHandler.Emit(time.Now(), "test_hook_name", "prerun", errors.New("test error")) | ||
|
||
sinkData = inMemorySink.Data() | ||
must.Len(t, 1, sinkData) | ||
must.MapContainsKey(t, sinkData[0].Counters, counterFailureName) | ||
must.MapContainsKey(t, sinkData[0].Samples, sampleName) | ||
|
||
failureCounter := sinkData[0].Counters[counterFailureName] | ||
must.Eq(t, 1, failureCounter.Count) | ||
must.Eq(t, 1, failureCounter.Sum) | ||
|
||
sample2 := sinkData[0].Samples[sampleName] | ||
must.Eq(t, 1, sample2.Count) | ||
must.True(t, sample2.Sum > 0) | ||
} | ||
|
||
func TestNoOpHandler(t *testing.T) { | ||
ci.Parallel(t) | ||
|
||
newHandler := NewNoOpHandler() | ||
|
||
// Create a new in-memory sink and global collector, so we can test that no | ||
// metrics are emitted. | ||
inMemorySink := metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond) | ||
|
||
_, err := metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink) | ||
must.NoError(t, err) | ||
|
||
// Call the function with a non-nil error and check the results of the | ||
// in-memory sink. | ||
newHandler.Emit(time.Now(), "test_hook_name", "prerun", errors.New("test error")) | ||
|
||
sinkData := inMemorySink.Data() | ||
must.Len(t, 1, sinkData) | ||
must.MapLen(t, 0, sinkData[0].Counters) | ||
must.MapLen(t, 0, sinkData[0].Samples) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.