hashicorp · jrasell · Dec 12, 2024 · Nov 27, 2024 · Nov 27, 2024 · Nov 27, 2024
@@ -0,0 +1,3 @@
+```release-note:improvement
+client: Emit telemetry from prerun and prestart hooks for monitoring and alerting
+```
@@ -10,9 +10,11 @@ import (
 	"sync"
 	"time"
 
+	"github.com/armon/go-metrics"
 	log "github.com/hashicorp/go-hclog"
 	multierror "github.com/hashicorp/go-multierror"
 	"github.com/hashicorp/nomad/client/allocdir"
+	"github.com/hashicorp/nomad/client/allocrunner/hookstats"
 	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
 	"github.com/hashicorp/nomad/client/allocrunner/state"
 	"github.com/hashicorp/nomad/client/allocrunner/tasklifecycle"
@@ -44,6 +46,7 @@ import (
 
 // allocRunner is used to run all the tasks in a given allocation
 type allocRunner struct {
+
 	// id is the ID of the allocation. Can be accessed without a lock
 	id string
 
@@ -53,6 +56,15 @@ type allocRunner struct {
 	// clientConfig is the client configuration block.
 	clientConfig *config.Config
 
+	// clientBaseLabels are the base metric labels generated by the client.
+	// These can be used by processes which emit metrics that want to include
+	// these labels that include node_id, node_class, and node_pool.
+	//
+	// They could be generated using the clientConfig, but the kv pairs will
+	// not change unless the client process is fully restarted, so passing them
+	// along avoids extra work.
+	clientBaseLabels []metrics.Label
+
 	// stateUpdater is used to emit updated alloc state
 	stateUpdater cinterfaces.AllocStateHandler
 
@@ -87,6 +99,10 @@ type allocRunner struct {
 	// vaultClientFunc is used to get the client used to manage Vault tokens
 	vaultClientFunc vaultclient.VaultClientFunc
 
+	// hookStatsHandler is used by certain hooks to emit telemetry data, if the
+	// operator has not disabled this functionality.
+	hookStatsHandler interfaces.HookStatsHandler
+
 	// waitCh is closed when the Run loop has exited
 	waitCh chan struct{}
 
@@ -230,6 +246,7 @@ func NewAllocRunner(config *config.AllocRunnerConfig) (interfaces.AllocRunner, e
 		id:                       alloc.ID,
 		alloc:                    alloc,
 		clientConfig:             config.ClientConfig,
+		clientBaseLabels:         config.BaseLabels,
 		consulServicesHandler:    config.ConsulServices,
 		consulProxiesClientFunc:  config.ConsulProxiesFunc,
 		sidsClient:               config.ConsulSI,
@@ -269,6 +286,8 @@ func NewAllocRunner(config *config.AllocRunnerConfig) (interfaces.AllocRunner, e
 	// Create alloc broadcaster
 	ar.allocBroadcaster = cstructs.NewAllocBroadcaster(ar.logger)
 
+	ar.setHookStatsHandler(alloc.Namespace)
+
 	// Create alloc dir
 	ar.allocDir = allocdir.NewAllocDir(
 		ar.logger,
@@ -315,6 +334,7 @@ func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
 		trConfig := &taskrunner.Config{
 			Alloc:               ar.alloc,
 			ClientConfig:        ar.clientConfig,
+			ClientBaseLabels:    ar.clientBaseLabels,
 			Task:                task,
 			TaskDir:             ar.allocDir.NewTaskDir(task),
 			Logger:              ar.logger,
@@ -1549,3 +1569,23 @@ func (ar *allocRunner) GetCSIVolumes() (map[string]*state.CSIVolumeStub, error)
 	}
 	return allocVols.CSIVolumes, nil
 }
+
+// setHookStatsHandler builds the hook stats handler based on whether the
+// operator has disabled this or not.
+//
+// The non-noop implementation is built using the base client labels and
+// the allocation namespace. The caller will add "hook_name". It would be
+// possible to add more labels, however, this would cause the metric
+// cardinality to increase dramatically without much benefit.
+//
+// This could be inline within the NewAllocRunner function, but having a
+// separate function makes testing easier.
+func (ar *allocRunner) setHookStatsHandler(ns string) {
+	if ar.clientConfig.DisableAllocationHookMetrics {
+		ar.hookStatsHandler = hookstats.NewNoOpHandler()
+	} else {
+		labels := ar.clientBaseLabels
+		labels = append(labels, metrics.Label{Name: "namespace", Value: ns})
+		ar.hookStatsHandler = hookstats.NewHandler(labels, "alloc_hook")
+	}
+}
@@ -189,7 +189,17 @@ func (ar *allocRunner) prerun() error {
 			ar.logger.Trace("running pre-run hook", "name", name, "start", start)
 		}
 
-		if err := pre.Prerun(); err != nil {
+		// If the operator has disabled hook metrics, then don't call the time
+		// function to save 30ns per hook.
+		var hookExecutionStart time.Time
+
+		if !ar.clientConfig.DisableAllocationHookMetrics {
+			hookExecutionStart = time.Now()
+		}
+
+		err := pre.Prerun()
+		ar.hookStatsHandler.Emit(hookExecutionStart, name, "prerun", err)
+		if err != nil {
 			return fmt.Errorf("pre-run hook %q failed: %v", name, err)
 		}
 

@@ -12,15 +12,18 @@ import (
 	"testing"
 	"time"
 
+	"github.com/armon/go-metrics"
 	"github.com/hashicorp/consul/api"
 	multierror "github.com/hashicorp/go-multierror"
 	"github.com/hashicorp/nomad/ci"
 	"github.com/hashicorp/nomad/client/allochealth"
+	"github.com/hashicorp/nomad/client/allocrunner/hookstats"
 	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
 	arstate "github.com/hashicorp/nomad/client/allocrunner/state"
 	"github.com/hashicorp/nomad/client/allocrunner/tasklifecycle"
 	"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
 	"github.com/hashicorp/nomad/client/allocwatcher"
+	client "github.com/hashicorp/nomad/client/config"
 	"github.com/hashicorp/nomad/client/lib/proclib"
 	"github.com/hashicorp/nomad/client/serviceregistration"
 	regMock "github.com/hashicorp/nomad/client/serviceregistration/mock"
@@ -2531,3 +2534,31 @@ func TestAllocRunner_GetUpdatePriority(t *testing.T) {
 	calloc = ar.clientAlloc(map[string]*structs.TaskState{})
 	must.Eq(t, cstructs.AllocUpdatePriorityUrgent, ar.GetUpdatePriority(calloc))
 }
+
+func TestAllocRunner_setHookStatsHandler(t *testing.T) {
+	ci.Parallel(t)
+
+	// Create an allocation runner that doesn't have any configuration, which
+	// means the operator has not disabled hook metrics.
+	baseAllocRunner := &allocRunner{
+		clientConfig:     &client.Config{},
+		clientBaseLabels: []metrics.Label{},
+	}
+
+	baseAllocRunner.setHookStatsHandler("platform")
+	handler, ok := baseAllocRunner.hookStatsHandler.(*hookstats.Handler)
+	must.True(t, ok)
+	must.NotNil(t, handler)
+
+	// Create a new allocation runner but explicitly disable hook metrics
+	// collection.
+	baseAllocRunner = &allocRunner{
+		clientConfig:     &client.Config{DisableAllocationHookMetrics: true},
+		clientBaseLabels: []metrics.Label{},
+	}
+
+	baseAllocRunner.setHookStatsHandler("platform")
+	noopHandler, ok := baseAllocRunner.hookStatsHandler.(*hookstats.NoOpHandler)
+	must.True(t, ok)
+	must.NotNil(t, noopHandler)
+}
@@ -0,0 +1,53 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: BUSL-1.1
+
+package hookstats
+
+import (
+	"time"
+
+	"github.com/armon/go-metrics"
+	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
+)
+
+// Handler implements interfaces.HookStatsHandler and is used when the operator
+// has not disabled hook metrics.
+type Handler struct {
+	baseLabels []metrics.Label
+	runnerType string
+}
+
+// NewHandler creates a new hook stats handler to be used for emitting hook
+// stats for operator alerting and performance identification. The base labels
+// should be passed from the client set of labels and the runner type indicates
+// if the hooks are run from the alloc or task runner.
+func NewHandler(base []metrics.Label, runnerType string) interfaces.HookStatsHandler {
+	return &Handler{
+		baseLabels: base,
+		runnerType: runnerType,
+	}
+}
+
+func (h *Handler) Emit(start time.Time, hookName, hookType string, err error) {
+
+	// Add the hook name to the base labels array, so we have a complete set to
+	// add to the metrics. Operators do not want this as part of the metric
+	// name due to cardinality control.
+	labels := h.baseLabels
+	labels = append(labels, metrics.Label{Name: "hook_name", Value: hookName})
+
+	metrics.MeasureSinceWithLabels([]string{"client", h.runnerType, hookType, "elapsed"}, start, labels)
+	if err != nil {
+		metrics.IncrCounterWithLabels([]string{"client", h.runnerType, hookType, "failed"}, 1, labels)
+	} else {
+		metrics.IncrCounterWithLabels([]string{"client", h.runnerType, hookType, "success"}, 1, labels)
+	}
+}
+
+// NoOpHandler implements interfaces.HookStatsHandler and is used when the
+// operator has disabled hook metrics.
+type NoOpHandler struct{}
+
+func NewNoOpHandler() interfaces.HookStatsHandler { return &NoOpHandler{} }
+
+func (n *NoOpHandler) Emit(_ time.Time, _, _ string, _ error) {}
@@ -0,0 +1,113 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: BUSL-1.1
+
+package hookstats
+
+import (
+	"errors"
+	"testing"
+	"time"
+
+	"github.com/armon/go-metrics"
+	"github.com/hashicorp/nomad/ci"
+	"github.com/shoenig/test/must"
+)
+
+func TestHandler(t *testing.T) {
+	ci.Parallel(t)
+
+	// Generate base labels that represent what an operator would see and then
+	// create out new handler to interact with.
+	baseLabels := []metrics.Label{
+		{Name: "datacenter", Value: "dc1"},
+		{Name: "node_class", Value: "none"},
+		{Name: "node_pool", Value: "default"},
+		{Name: "namespace", Value: "default"},
+		{Name: "host", Value: "client-5d3c"},
+		{Name: "node_id", Value: "35db24e7-0a55-80d2-2279-e022c37cc591"},
+	}
+	newHandler := NewHandler(baseLabels, "test_hook")
+
+	// The data stored is within the in-memory sink as map entries, so we need
+	// to know the key names to pull this out correctly. Build those now.
+	var metricKeySuffix, sampleName, counterSuccessName, counterFailureName string
+
+	for _, label := range baseLabels {
+		metricKeySuffix += ";" + label.Name + "=" + label.Value
+	}
+
+	metricKeySuffix += ";" + "hook_name=test_hook_name"
+	sampleName = "nomad_test.client.test_hook.prerun.elapsed" + metricKeySuffix
+	counterSuccessName = "nomad_test.client.test_hook.prerun.success" + metricKeySuffix
+	counterFailureName = "nomad_test.client.test_hook.prerun.failed" + metricKeySuffix
+
+	// Create an in-memory sink and global, so we can actually look at and test
+	// the metrics we emit.
+	inMemorySink := metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond)
+
+	_, err := metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink)
+	must.NoError(t, err)
+
+	// Emit hook related metrics where the supplied error is nil and check that
+	// the data is as expected.
+	newHandler.Emit(time.Now(), "test_hook_name", "prerun", nil)
+
+	sinkData := inMemorySink.Data()
+	must.Len(t, 1, sinkData)
+	must.MapContainsKey(t, sinkData[0].Counters, counterSuccessName)
+	must.MapContainsKey(t, sinkData[0].Samples, sampleName)
+
+	successCounter := sinkData[0].Counters[counterSuccessName]
+	must.Eq(t, 1, successCounter.Count)
+	must.Eq(t, 1, successCounter.Sum)
+
+	sample1 := sinkData[0].Samples[sampleName]
+	must.Eq(t, 1, sample1.Count)
+	must.True(t, sample1.Sum > 0)
+
+	// Create a new in-memory sink and global collector to ensure we don't have
+	// leftovers from the previous test.
+	inMemorySink = metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond)
+
+	_, err = metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink)
+	must.NoError(t, err)
+
+	// Emit a hook related metrics where the supplied error is non-nil and
+	// check that the data is as expected.
+	newHandler.Emit(time.Now(), "test_hook_name", "prerun", errors.New("test error"))
+
+	sinkData = inMemorySink.Data()
+	must.Len(t, 1, sinkData)
+	must.MapContainsKey(t, sinkData[0].Counters, counterFailureName)
+	must.MapContainsKey(t, sinkData[0].Samples, sampleName)
+
+	failureCounter := sinkData[0].Counters[counterFailureName]
+	must.Eq(t, 1, failureCounter.Count)
+	must.Eq(t, 1, failureCounter.Sum)
+
+	sample2 := sinkData[0].Samples[sampleName]
+	must.Eq(t, 1, sample2.Count)
+	must.True(t, sample2.Sum > 0)
+}
+
+func TestNoOpHandler(t *testing.T) {
+	ci.Parallel(t)
+
+	newHandler := NewNoOpHandler()
+
+	// Create a new in-memory sink and global collector, so we can test that no
+	// metrics are emitted.
+	inMemorySink := metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond)
+
+	_, err := metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink)
+	must.NoError(t, err)
+
+	// Call the function with a non-nil error and check the results of the
+	// in-memory sink.
+	newHandler.Emit(time.Now(), "test_hook_name", "prerun", errors.New("test error"))
+
+	sinkData := inMemorySink.Data()
+	must.Len(t, 1, sinkData)
+	must.MapLen(t, 0, sinkData[0].Counters)
+	must.MapLen(t, 0, sinkData[0].Samples)
+}
@@ -4,6 +4,8 @@
 package interfaces
 
 import (
+	"time"
+
 	"github.com/hashicorp/nomad/client/allocdir"
 	"github.com/hashicorp/nomad/client/allocrunner/state"
 	"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
@@ -71,3 +73,25 @@ type HookResourceSetter interface {
 	SetCSIMounts(map[string]*csimanager.MountInfo)
 	GetCSIMounts(map[string]*csimanager.MountInfo)
 }
+
+// HookStatsHandler defines the interface used to emit metrics for the alloc
+// and task runner hooks.
+type HookStatsHandler interface {
+
+	// Emit is called once the hook has run, indicating the desired metrics
+	// should be emitted, if configured.
+	//
+	// Args:
+	//  start: The time logged as the hook was triggered. This is used for the
+	//    elapsed time metric.
+	//
+	//  hookName: The name of the hook that was run, as returned typically by
+	//    the Name() hook function.
+	//
+	//  hookType: The type of hook such as "prerun" or "postrun".
+	//
+	//  err: The error returned from the hook execution. The caller should not
+	//    need to check whether this is nil or not before called this function.
+	//
+	Emit(start time.Time, hookName, hookType string, err error)
+}