diff --git a/CHANGELOG.md b/CHANGELOG.md index 0dce8cf2c5f..3f195857cd8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,9 +2,11 @@ IMPROVEMENTS: * api: Allocations now track and return modify time in addition to create time. - * cli: Allocation create and modify times are displayed in a human readable relative format like `6 h ago`. + * cli: Allocation create and modify times are displayed in a human readable + relative format like `6 h ago`. * core: Allow agents to be run in `rpc_upgrade_mode` when migrating a cluster to TLS rather than changing `heartbeat_grace`. + * client: Added metrics to track state transitions of allocations [GH-3061] BUG FIXES: diff --git a/client/alloc_runner.go b/client/alloc_runner.go index 1a396ac058e..326a39472c6 100644 --- a/client/alloc_runner.go +++ b/client/alloc_runner.go @@ -9,6 +9,7 @@ import ( "sync" "time" + metrics "github.com/armon/go-metrics" "github.com/boltdb/bolt" "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/client/allocdir" @@ -101,6 +102,10 @@ type AllocRunner struct { // can lower write volume by not re-writing these values immutablePersisted bool allocDirPersisted bool + + // baseLabels are used when emitting tagged metrics. All alloc runner metrics + // will have these tags, and optionally more. 
+ baseLabels []metrics.Label } // COMPAT: Remove in 0.7.0 @@ -173,6 +178,22 @@ func NewAllocRunner(logger *log.Logger, config *config.Config, stateDB *bolt.DB, // TODO Should be passed a context ar.ctx, ar.exitFn = context.WithCancel(context.TODO()) + + ar.baseLabels = []metrics.Label{ + { + Name: "job", + Value: alloc.Job.Name, + }, + { + Name: "task_group", + Value: alloc.TaskGroup, + }, + { + Name: "node_id", + Value: ar.config.Node.ID, + }, + } + return ar } @@ -645,6 +666,13 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv taskState.Failed = true } if event.Type == structs.TaskRestarting { + if !r.config.DisableTaggedMetrics { + metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"}, + 1, r.baseLabels) + } + if r.config.BackwardsCompatibleMetrics { + metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1) + } taskState.Restarts++ taskState.LastRestart = time.Unix(0, event.Time) } @@ -668,6 +696,13 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv // Capture the start time if it is just starting if taskState.State != structs.TaskStateRunning { taskState.StartedAt = time.Now().UTC() + if !r.config.DisableTaggedMetrics { + metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"}, + 1, r.baseLabels) + } + if r.config.BackwardsCompatibleMetrics { + metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1) + } } case structs.TaskStateDead: // Capture the finished time. 
If it has never started there is no finish @@ -690,6 +725,24 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv } } + // Emitting metrics to indicate task complete and failures + if taskState.Failed { + if !r.config.DisableTaggedMetrics { + metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"}, + 1, r.baseLabels) + } + if r.config.BackwardsCompatibleMetrics { + metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1) + } + } else { + if !r.config.DisableTaggedMetrics { + metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"}, + 1, r.baseLabels) + } + if r.config.BackwardsCompatibleMetrics { + metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1) + } + } // If the task failed, we should kill all the other tasks in the task group. if taskState.Failed { for _, tr := range otherTaskRunners { @@ -794,6 +847,15 @@ func (r *AllocRunner) Run() { return } + // Increment alloc runner start counter. Incr'd even when restoring existing tasks so 1 start != 1 task execution + if !r.config.DisableTaggedMetrics { + metrics.IncrCounterWithLabels([]string{"client", "allocs", "start"}, + 1, r.baseLabels) + } + if r.config.BackwardsCompatibleMetrics { + metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1) + } + // Start the watcher wCtx, watcherCancel := context.WithCancel(r.ctx) go r.watchHealth(wCtx) @@ -922,6 +984,15 @@ func (r *AllocRunner) handleDestroy() { // state as we wait for a destroy. alloc := r.Alloc() + // Increment the destroy count for this alloc runner since this allocation is being removed from this client. 
+ if !r.config.DisableTaggedMetrics { + metrics.IncrCounterWithLabels([]string{"client", "allocs", "destroy"}, + 1, r.baseLabels) + } + if r.config.BackwardsCompatibleMetrics { + metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1) + } + // Broadcast and persist state synchronously r.sendBroadcast(alloc) if err := r.saveAllocRunnerState(); err != nil { diff --git a/website/source/docs/agent/telemetry.html.md b/website/source/docs/agent/telemetry.html.md index 0f808bae06b..865b947d97a 100644 --- a/website/source/docs/agent/telemetry.html.md +++ b/website/source/docs/agent/telemetry.html.md @@ -419,6 +419,48 @@ Starting in version 0.7, Nomad will emit tagged metrics, in the below format: Gauge node_id, datacenter, disk + + `nomad.client.allocs.start` + Number of allocation runners started (also incremented when restoring existing tasks) + Integer + Counter + node_id, job, task_group + + + `nomad.client.allocs.running` + Number of tasks entering the running state + Integer + Counter + node_id, job, task_group + + + `nomad.client.allocs.failed` + Number of tasks failing + Integer + Counter + node_id, job, task_group + + + `nomad.client.allocs.restart` + Number of task restarts + Integer + Counter + node_id, job, task_group + + + `nomad.client.allocs.complete` + Number of tasks completing + Integer + Counter + node_id, job, task_group + + + `nomad.client.allocs.destroy` + Number of allocations being destroyed + Integer + Counter + node_id, job, task_group + ## Host Metrics (deprecated post Nomad 0.7)