From 45583d757eb5115e0ccbbfb7e6f34337535c2475 Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Sat, 19 Aug 2017 01:28:48 -0700
Subject: [PATCH 1/7] Added metrics to track task/alloc start/restarts/dead events

---
 client/alloc_runner.go | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index 1a396ac058e..ddbd2ca0077 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -9,6 +9,7 @@ import (
 	"sync"
 	"time"

+	metrics "github.com/armon/go-metrics"
 	"github.com/boltdb/bolt"
 	"github.com/hashicorp/go-multierror"
 	"github.com/hashicorp/nomad/client/allocdir"
@@ -645,6 +646,7 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 		taskState.Failed = true
 	}
 	if event.Type == structs.TaskRestarting {
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
 		taskState.Restarts++
 		taskState.LastRestart = time.Unix(0, event.Time)
 	}
@@ -668,12 +670,14 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 		// Capture the start time if it is just starting
 		if taskState.State != structs.TaskStateRunning {
 			taskState.StartedAt = time.Now().UTC()
+			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
 		}
 	case structs.TaskStateDead:
 		// Capture the finished time. If it has never started there is no finish
 		// time
 		if !taskState.StartedAt.IsZero() {
 			taskState.FinishedAt = time.Now().UTC()
+			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "dead"}, 1)
 		}

 		// Find all tasks that are not the one that is dead and check if the one
@@ -740,6 +744,9 @@ func (r *AllocRunner) Run() {
 	defer close(r.waitCh)
 	go r.dirtySyncState()

+	// Incr alloc runner start counter
+	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
+
 	// Find the task group to run in the allocation
 	alloc := r.Alloc()
 	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
@@ -922,6 +929,14 @@ func (r *AllocRunner) handleDestroy() {
 	// state as we wait for a destroy.
 	alloc := r.Alloc()

+	// Incr the alloc destroy counter
+	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1)
+
+	//TODO(schmichael) updater can cause a GC which can block on this alloc
+	// runner shutting down. Since handleDestroy can be called by Run() we
+	// can't block shutdown here as it would cause a deadlock.
+	go r.updater(alloc)
+
 	// Broadcast and persist state synchronously
 	r.sendBroadcast(alloc)
 	if err := r.saveAllocRunnerState(); err != nil {
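For reference, the counters added above go through go-metrics' global registry, which joins the key segments into a single dotted metric name. A minimal, self-contained sketch of that API (not Nomad code; the in-memory sink and the job, task group, and task names are illustrative assumptions):

```go
package main

import (
	"fmt"
	"time"

	metrics "github.com/armon/go-metrics"
)

func main() {
	// In-memory sink so the emitted keys can be inspected; the Nomad agent
	// wires up its sinks from the telemetry config instead.
	sink := metrics.NewInmemSink(10*time.Second, time.Minute)
	metrics.NewGlobal(metrics.DefaultConfig("nomad"), sink)

	// The key segments become a dotted name, so the restart counter in the
	// patch is emitted as nomad.client.allocs.<job>.<task_group>.<task>.restart.
	metrics.IncrCounter([]string{"client", "allocs", "example", "cache", "redis", "restart"}, 1)

	// Dump what was recorded in the current interval.
	for _, interval := range sink.Data() {
		for name, sample := range interval.Counters {
			fmt.Println(name, sample.Count)
		}
	}
}
```

Because the job, task group, and task names are baked into the key, every job produces a distinct metric name; the later patches in this series move those values into tags instead.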
From 0bade76fd5308c078726a951f2e24bf20ffa53a0 Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Tue, 29 Aug 2017 22:36:50 -0700
Subject: [PATCH 2/7] Recording counter for dead allocs properly

---
 client/alloc_runner.go | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index ddbd2ca0077..260e15e4e40 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -675,9 +675,9 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 	case structs.TaskStateDead:
 		// Capture the finished time. If it has never started there is no finish
 		// time
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "dead"}, 1)
 		if !taskState.StartedAt.IsZero() {
 			taskState.FinishedAt = time.Now().UTC()
-			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "dead"}, 1)
 		}

 		// Find all tasks that are not the one that is dead and check if the one
@@ -694,6 +694,12 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 		}
 	}

+	// Emitting metrics to indicate task complete and failures
+	if taskState.Failed {
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
+	} else {
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
+	}
 	// If the task failed, we should kill all the other tasks in the task group.
 	if taskState.Failed {
 		for _, tr := range otherTaskRunners {
@@ -744,7 +750,7 @@ func (r *AllocRunner) Run() {
 	defer close(r.waitCh)
 	go r.dirtySyncState()

-	// Incr alloc runner start counter
+	// Increment alloc runner start counter. Incr'd even when restoring existing tasks so 1 start != 1 task execution
 	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)

 	// Find the task group to run in the allocation
@@ -929,7 +935,7 @@ func (r *AllocRunner) handleDestroy() {
 	// state as we wait for a destroy.
 	alloc := r.Alloc()

-	// Incr the alloc destroy counter
+	// Increment the destroy count for this alloc runner since this allocation is being removed from this client.
 	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1)

 	//TODO(schmichael) updater can cause a GC which can block on this alloc

From 9593e12672eaa74f29340d704458113e6be5ca08 Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Wed, 1 Nov 2017 15:52:55 -0700
Subject: [PATCH 3/7] Incrementing the start counter when we are actually starting a container

---
 client/alloc_runner.go | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index 260e15e4e40..312b3c5512d 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -675,7 +675,6 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 	case structs.TaskStateDead:
 		// Capture the finished time. If it has never started there is no finish
 		// time
-		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "dead"}, 1)
 		if !taskState.StartedAt.IsZero() {
 			taskState.FinishedAt = time.Now().UTC()
 		}
@@ -750,9 +749,6 @@ func (r *AllocRunner) Run() {
 	defer close(r.waitCh)
 	go r.dirtySyncState()

-	// Increment alloc runner start counter. Incr'd even when restoring existing tasks so 1 start != 1 task execution
-	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
-
 	// Find the task group to run in the allocation
 	alloc := r.Alloc()
 	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
@@ -807,6 +803,9 @@ func (r *AllocRunner) Run() {
 		return
 	}

+	// Increment alloc runner start counter. Incr'd even when restoring existing tasks so 1 start != 1 task execution
+	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
+
 	// Start the watcher
 	wCtx, watcherCancel := context.WithCancel(r.ctx)
 	go r.watchHealth(wCtx)
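After these two patches the per-task counters fire as follows: "running" each time a task moves into the running state, "restart" on a restart event, and on death exactly one of "failed" or "complete" (the interim "dead" counter is dropped); the per-allocation "start" counter now fires only once Run() gets past its early-exit paths and actually proceeds to start the task group. A small helper condensing that mapping, as an illustration of the scheme rather than Nomad code (it ignores the surrounding event plumbing):

```go
package main

import "fmt"

// transitionCounters returns the counter suffixes emitted for a task state
// transition under the scheme described above.
func transitionCounters(prevState, newState string, restarting, failed bool) []string {
	var suffixes []string
	if restarting {
		suffixes = append(suffixes, "restart")
	}
	switch newState {
	case "running":
		// Counted on each transition into running from another state.
		if prevState != "running" {
			suffixes = append(suffixes, "running")
		}
	case "dead":
		// A dead task is accounted as either failed or complete, never both.
		if failed {
			suffixes = append(suffixes, "failed")
		} else {
			suffixes = append(suffixes, "complete")
		}
	}
	return suffixes
}

func main() {
	fmt.Println(transitionCounters("running", "dead", false, true))    // [failed]
	fmt.Println(transitionCounters("pending", "running", false, false)) // [running]
}
```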
From 103ff5526e1f36544c7f5ce07cfdfb780ddadcea Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Thu, 2 Nov 2017 10:05:38 -0700
Subject: [PATCH 4/7] Added support for tagged metrics

---
 client/alloc_runner.go | 69 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 58 insertions(+), 11 deletions(-)

diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index 312b3c5512d..d37cf50c44f 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -102,6 +102,10 @@ type AllocRunner struct {
 	// can lower write volume by not re-writing these values
 	immutablePersisted bool
 	allocDirPersisted  bool
+
+	// baseLabels are used when emitting tagged metrics. All alloc runner metrics
+	// will have these tags, and optionally more.
+	baseLabels []metrics.Label
 }

 // COMPAT: Remove in 0.7.0
@@ -174,6 +178,18 @@ func NewAllocRunner(logger *log.Logger, config *config.Config, stateDB *bolt.DB,
 	// TODO Should be passed a context
 	ar.ctx, ar.exitFn = context.WithCancel(context.TODO())

+	ar.baseLabels = []metrics.Label{
+		{
+			Name:  "job",
+			Value: alloc.Job.Name,
+		},
+		{
+			Name:  "task_group",
+			Value: alloc.TaskGroup,
+		},
+	}
+
 	return ar
 }
@@ -646,7 +662,13 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 		taskState.Failed = true
 	}
 	if event.Type == structs.TaskRestarting {
-		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
+		if !r.config.DisableTaggedMetrics {
+			metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"},
+				1, r.baseLabels)
+		}
+		if r.config.BackwardsCompatibleMetrics {
+			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
+		}
 		taskState.Restarts++
 		taskState.LastRestart = time.Unix(0, event.Time)
 	}
@@ -670,7 +692,13 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 		// Capture the start time if it is just starting
 		if taskState.State != structs.TaskStateRunning {
 			taskState.StartedAt = time.Now().UTC()
-			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
+			if !r.config.DisableTaggedMetrics {
+				metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"},
+					1, r.baseLabels)
+			}
+			if r.config.BackwardsCompatibleMetrics {
+				metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
+			}
 		}
 	case structs.TaskStateDead:
 		// Capture the finished time. If it has never started there is no finish
 		// time
@@ -695,9 +723,21 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 	// Emitting metrics to indicate task complete and failures
 	if taskState.Failed {
-		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
+		if !r.config.DisableTaggedMetrics {
+			metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"},
+				1, r.baseLabels)
+		}
+		if r.config.BackwardsCompatibleMetrics {
+			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
+		}
 	} else {
-		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
+		if !r.config.DisableTaggedMetrics {
+			metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"},
+				1, r.baseLabels)
+		}
+		if r.config.BackwardsCompatibleMetrics {
+			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
+		}
 	}
 	// If the task failed, we should kill all the other tasks in the task group.
 	if taskState.Failed {
 		for _, tr := range otherTaskRunners {
@@ -804,7 +844,13 @@ func (r *AllocRunner) Run() {
 	}
 	// Increment alloc runner start counter. Incr'd even when restoring existing tasks so 1 start != 1 task execution
-	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
+	if !r.config.DisableTaggedMetrics {
+		metrics.IncrCounterWithLabels([]string{"client", "allocs", "start"},
+			1, r.baseLabels)
+	}
+	if r.config.BackwardsCompatibleMetrics {
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
+	}

 	// Start the watcher
 	wCtx, watcherCancel := context.WithCancel(r.ctx)
 	go r.watchHealth(wCtx)
@@ -935,12 +981,13 @@ func (r *AllocRunner) handleDestroy() {
 	alloc := r.Alloc()

 	// Increment the destroy count for this alloc runner since this allocation is being removed from this client.
-	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1)
-
-	//TODO(schmichael) updater can cause a GC which can block on this alloc
-	// runner shutting down. Since handleDestroy can be called by Run() we
-	// can't block shutdown here as it would cause a deadlock.
-	go r.updater(alloc)
+	if !r.config.DisableTaggedMetrics {
+		metrics.IncrCounterWithLabels([]string{"client", "allocs", "destroy"},
+			1, r.baseLabels)
+	}
+	if r.config.BackwardsCompatibleMetrics {
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1)
+	}

 	// Broadcast and persist state synchronously
 	r.sendBroadcast(alloc)
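The pattern introduced in this patch emits each counter in up to two forms: a fixed, low-cardinality key plus labels (unless tagged metrics are disabled), and the old key-embedded form (only when backwards-compatible metrics are enabled). A condensed sketch of that gate, not Nomad code; the struct below merely stands in for the client config fields referenced in the diff, and the sink, job, and task group values are assumptions:

```go
package main

import (
	"time"

	metrics "github.com/armon/go-metrics"
)

// telemetryFlags mirrors the two client config fields used in the diff.
type telemetryFlags struct {
	DisableTaggedMetrics       bool
	BackwardsCompatibleMetrics bool
}

// emitAllocCounter emits one alloc-runner counter in the tagged and/or the
// legacy dotted form, depending on the flags.
func emitAllocCounter(flags telemetryFlags, suffix, job, taskGroup string, base []metrics.Label) {
	if !flags.DisableTaggedMetrics {
		metrics.IncrCounterWithLabels([]string{"client", "allocs", suffix}, 1, base)
	}
	if flags.BackwardsCompatibleMetrics {
		metrics.IncrCounter([]string{"client", "allocs", job, taskGroup, suffix}, 1)
	}
}

func main() {
	sink := metrics.NewInmemSink(10*time.Second, time.Minute)
	metrics.NewGlobal(metrics.DefaultConfig("nomad"), sink)

	base := []metrics.Label{
		{Name: "job", Value: "example"},
		{Name: "task_group", Value: "cache"},
	}
	// With zero-value flags this emits only nomad.client.allocs.start,
	// carrying job and task_group as labels.
	emitAllocCounter(telemetryFlags{}, "start", "example", "cache", base)
}
```

Keeping the key constant and pushing the variable parts into labels bounds the number of distinct metric names, while the opt-in backwards-compatible path keeps existing dashboards working during the transition.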
From 1081d5050fdae5fe70502e1cb6593c4476ac7975 Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Thu, 2 Nov 2017 13:26:46 -0700
Subject: [PATCH 5/7] Added docs

---
 website/source/docs/agent/telemetry.html.md | 42 +++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/website/source/docs/agent/telemetry.html.md b/website/source/docs/agent/telemetry.html.md
index 0f808bae06b..1c6532743fd 100644
--- a/website/source/docs/agent/telemetry.html.md
+++ b/website/source/docs/agent/telemetry.html.md
@@ -419,6 +419,48 @@ Starting in version 0.7, Nomad will emit tagged metrics, in the below format:
     <td>Gauge</td>
     <td>node_id, datacenter, disk</td>
   </tr>
+  <tr>
+    <td>`nomad.client.allocs.start`</td>
+    <td>Number of allocations starting</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.running`</td>
+    <td>Number of allocations starting to run</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.failed`</td>
+    <td>Number of allocations failing</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.restart`</td>
+    <td>Number of allocations restarting</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.complete`</td>
+    <td>Number of allocations completing</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.destroy`</td>
+    <td>Number of allocations being destroyed</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>job, task_group</td>
+  </tr>
 </table>

 ## Host Metrics (deprecated post Nomad 0.7)
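One practical consequence of the labels documented above is that a consumer can aggregate a counter across any single dimension instead of pattern-matching dotted names. A rough sketch using go-metrics' in-memory sink, not Nomad code; it assumes a go-metrics version with label support in which the sink retains each sample's labels, and the job and task group values are hypothetical:

```go
package main

import (
	"fmt"
	"time"

	metrics "github.com/armon/go-metrics"
)

func main() {
	sink := metrics.NewInmemSink(10*time.Second, time.Minute)
	metrics.NewGlobal(metrics.DefaultConfig("nomad"), sink)

	// Two task groups of the same job each record an allocation start.
	for _, tg := range []string{"cache", "frontend"} {
		metrics.IncrCounterWithLabels([]string{"client", "allocs", "start"}, 1,
			[]metrics.Label{
				{Name: "job", Value: "example"},
				{Name: "task_group", Value: tg},
			})
	}

	// Sum nomad.client.allocs.start per job, ignoring task_group.
	perJob := map[string]int{}
	for _, interval := range sink.Data() {
		for _, sample := range interval.Counters {
			for _, label := range sample.Labels {
				if label.Name == "job" {
					perJob[label.Value] += sample.Count
				}
			}
		}
	}
	fmt.Println(perJob) // map[example:2]
}
```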
From 5d364084755e6be0b47b749c9566cbd81b26283b Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Thu, 2 Nov 2017 13:29:10 -0700
Subject: [PATCH 6/7] Added the node_id as a tag

---
 client/alloc_runner.go                      |  4 ++++
 website/source/docs/agent/telemetry.html.md | 12 ++++++------
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index d37cf50c44f..326a39472c6 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -188,6 +188,10 @@ func NewAllocRunner(logger *log.Logger, config *config.Config, stateDB *bolt.DB,
 			Name:  "task_group",
 			Value: alloc.TaskGroup,
 		},
+		{
+			Name:  "node_id",
+			Value: ar.config.Node.ID,
+		},
 	}

 	return ar

diff --git a/website/source/docs/agent/telemetry.html.md b/website/source/docs/agent/telemetry.html.md
index 1c6532743fd..865b947d97a 100644
--- a/website/source/docs/agent/telemetry.html.md
+++ b/website/source/docs/agent/telemetry.html.md
@@ -424,42 +424,42 @@ Starting in version 0.7, Nomad will emit tagged metrics, in the below format:
     <td>Number of allocations starting</td>
     <td>Integer</td>
     <td>Counter</td>
-    <td>job, task_group</td>
+    <td>node_id, job, task_group</td>
   </tr>
   <tr>
     <td>`nomad.client.allocs.running`</td>
     <td>Number of allocations starting to run</td>
     <td>Integer</td>
     <td>Counter</td>
-    <td>job, task_group</td>
+    <td>node_id, job, task_group</td>
   </tr>
   <tr>
     <td>`nomad.client.allocs.failed`</td>
     <td>Number of allocations failing</td>
     <td>Integer</td>
     <td>Counter</td>
-    <td>job, task_group</td>
+    <td>node_id, job, task_group</td>
   </tr>
   <tr>
     <td>`nomad.client.allocs.restart`</td>
     <td>Number of allocations restarting</td>
     <td>Integer</td>
     <td>Counter</td>
-    <td>job, task_group</td>
+    <td>node_id, job, task_group</td>
   </tr>
   <tr>
     <td>`nomad.client.allocs.complete`</td>
     <td>Number of allocations completing</td>
     <td>Integer</td>
     <td>Counter</td>
-    <td>job, task_group</td>
+    <td>node_id, job, task_group</td>
   </tr>
   <tr>
     <td>`nomad.client.allocs.destroy`</td>
     <td>Number of allocations being destroyed</td>
     <td>Integer</td>
     <td>Counter</td>
-    <td>job, task_group</td>
+    <td>node_id, job, task_group</td>
   </tr>
 </table>

From 4e37ac1e890bbdf33f6c27e4abd0f6c953a8a140 Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Thu, 2 Nov 2017 13:44:13 -0700
Subject: [PATCH 7/7] Updated changelog

---
 CHANGELOG.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 013f0150559..c4c34272d40 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,9 +2,11 @@

 IMPROVEMENTS:

  * api: Allocations now track and return modify time in addition to create time.
- * cli: Allocation create and modify times are displayed in a human readable relative format like `6 h ago`.
+ * cli: Allocation create and modify times are displayed in a human readable
+   relative format like `6 h ago`.
  * core: Allow agents to be run in `rpc_upgrade_mode` when migrating a cluster
    to TLS rather than changing `heartbeat_grace`.
+ * client: Added metrics to track state transitions of allocations [GH-3061]

 BUG FIXES: