From 45583d757eb5115e0ccbbfb7e6f34337535c2475 Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Sat, 19 Aug 2017 01:28:48 -0700
Subject: [PATCH 1/7] Added metrics to track task/alloc start/restarts/dead events

---
 client/alloc_runner.go | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index 1a396ac058e..ddbd2ca0077 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -9,6 +9,7 @@ import (
 	"sync"
 	"time"

+	metrics "github.com/armon/go-metrics"
 	"github.com/boltdb/bolt"
 	"github.com/hashicorp/go-multierror"
 	"github.com/hashicorp/nomad/client/allocdir"
@@ -645,6 +646,7 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 		taskState.Failed = true
 	}
 	if event.Type == structs.TaskRestarting {
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
 		taskState.Restarts++
 		taskState.LastRestart = time.Unix(0, event.Time)
 	}
@@ -668,12 +670,14 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 		// Capture the start time if it is just starting
 		if taskState.State != structs.TaskStateRunning {
 			taskState.StartedAt = time.Now().UTC()
+			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
 		}
 	case structs.TaskStateDead:
 		// Capture the finished time. If it has never started there is no finish
 		// time
 		if !taskState.StartedAt.IsZero() {
 			taskState.FinishedAt = time.Now().UTC()
+			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "dead"}, 1)
 		}

 		// Find all tasks that are not the one that is dead and check if the one
@@ -740,6 +744,9 @@ func (r *AllocRunner) Run() {
 	defer close(r.waitCh)
 	go r.dirtySyncState()

+	// Incr alloc runner start counter
+	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
+
 	// Find the task group to run in the allocation
 	alloc := r.Alloc()
 	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
@@ -922,6 +929,14 @@ func (r *AllocRunner) handleDestroy() {
 	// state as we wait for a destroy.
 	alloc := r.Alloc()

+	// Incr the alloc destroy counter
+	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1)
+
+	//TODO(schmichael) updater can cause a GC which can block on this alloc
+	// runner shutting down. Since handleDestroy can be called by Run() we
+	// can't block shutdown here as it would cause a deadlock.
+	go r.updater(alloc)
+
 	// Broadcast and persist state synchronously
 	r.sendBroadcast(alloc)
 	if err := r.saveAllocRunnerState(); err != nil {
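For reference, the counters added above go through go-metrics' global registry, which joins the key segments into a single dotted metric name. A minimal, self-contained sketch of that API (not Nomad code; the in-memory sink and the job, task group, and task names are illustrative assumptions):

```go
package main

import (
	"fmt"
	"time"

	metrics "github.com/armon/go-metrics"
)

func main() {
	// In-memory sink so the emitted keys can be inspected; the Nomad agent
	// wires up its sinks from the telemetry config instead.
	sink := metrics.NewInmemSink(10*time.Second, time.Minute)
	metrics.NewGlobal(metrics.DefaultConfig("nomad"), sink)

	// The key segments become a dotted name, so the restart counter in the
	// patch is emitted as nomad.client.allocs.<job>.<task_group>.<task>.restart.
	metrics.IncrCounter([]string{"client", "allocs", "example", "cache", "redis", "restart"}, 1)

	// Dump what was recorded in the current interval.
	for _, interval := range sink.Data() {
		for name, sample := range interval.Counters {
			fmt.Println(name, sample.Count)
		}
	}
}
```

Because the job, task group, and task names are baked into the key, every job produces a distinct metric name; the later patches in this series move those values into tags instead.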
From 0bade76fd5308c078726a951f2e24bf20ffa53a0 Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Tue, 29 Aug 2017 22:36:50 -0700
Subject: [PATCH 2/7] Recording counter for dead allocs properly

---
 client/alloc_runner.go | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index ddbd2ca0077..260e15e4e40 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -675,9 +675,9 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 	case structs.TaskStateDead:
 		// Capture the finished time. If it has never started there is no finish
 		// time
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "dead"}, 1)
 		if !taskState.StartedAt.IsZero() {
 			taskState.FinishedAt = time.Now().UTC()
-			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "dead"}, 1)
 		}

 		// Find all tasks that are not the one that is dead and check if the one
@@ -694,6 +694,12 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 		}
 	}

+	// Emitting metrics to indicate task complete and failures
+	if taskState.Failed {
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
+	} else {
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
+	}
 	// If the task failed, we should kill all the other tasks in the task group.
 	if taskState.Failed {
 		for _, tr := range otherTaskRunners {
@@ -744,7 +750,7 @@ func (r *AllocRunner) Run() {
 	defer close(r.waitCh)
 	go r.dirtySyncState()

-	// Incr alloc runner start counter
+	// Increment alloc runner start counter. Incr'd even when restoring existing tasks so 1 start != 1 task execution
 	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)

 	// Find the task group to run in the allocation
@@ -929,7 +935,7 @@ func (r *AllocRunner) handleDestroy() {
 	// state as we wait for a destroy.
 	alloc := r.Alloc()

-	// Incr the alloc destroy counter
+	// Increment the destroy count for this alloc runner since this allocation is being removed from this client.
 	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1)

 	//TODO(schmichael) updater can cause a GC which can block on this alloc

From 9593e12672eaa74f29340d704458113e6be5ca08 Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Wed, 1 Nov 2017 15:52:55 -0700
Subject: [PATCH 3/7] Incrementing the start counter when we are actually starting a container

---
 client/alloc_runner.go | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index 260e15e4e40..312b3c5512d 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -675,7 +675,6 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 	case structs.TaskStateDead:
 		// Capture the finished time. If it has never started there is no finish
 		// time
-		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "dead"}, 1)
 		if !taskState.StartedAt.IsZero() {
 			taskState.FinishedAt = time.Now().UTC()
 		}
@@ -750,9 +749,6 @@ func (r *AllocRunner) Run() {
 	defer close(r.waitCh)
 	go r.dirtySyncState()

-	// Increment alloc runner start counter. Incr'd even when restoring existing tasks so 1 start != 1 task execution
-	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
-
 	// Find the task group to run in the allocation
 	alloc := r.Alloc()
 	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
@@ -807,6 +803,9 @@ func (r *AllocRunner) Run() {
 		return
 	}

+	// Increment alloc runner start counter. Incr'd even when restoring existing tasks so 1 start != 1 task execution
+	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
+
 	// Start the watcher
 	wCtx, watcherCancel := context.WithCancel(r.ctx)
 	go r.watchHealth(wCtx)
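After these two patches the per-task counters fire as follows: "running" each time a task moves into the running state, "restart" on a restart event, and on death exactly one of "failed" or "complete" (the interim "dead" counter is dropped); the per-allocation "start" counter now fires only once Run() gets past its early-exit paths and actually proceeds to start the task group. A small helper condensing that mapping, as an illustration of the scheme rather than Nomad code (it ignores the surrounding event plumbing):

```go
package main

import "fmt"

// transitionCounters returns the counter suffixes emitted for a task state
// transition under the scheme described above.
func transitionCounters(prevState, newState string, restarting, failed bool) []string {
	var suffixes []string
	if restarting {
		suffixes = append(suffixes, "restart")
	}
	switch newState {
	case "running":
		// Counted on each transition into running from another state.
		if prevState != "running" {
			suffixes = append(suffixes, "running")
		}
	case "dead":
		// A dead task is accounted as either failed or complete, never both.
		if failed {
			suffixes = append(suffixes, "failed")
		} else {
			suffixes = append(suffixes, "complete")
		}
	}
	return suffixes
}

func main() {
	fmt.Println(transitionCounters("running", "dead", false, true))    // [failed]
	fmt.Println(transitionCounters("pending", "running", false, false)) // [running]
}
```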
From 103ff5526e1f36544c7f5ce07cfdfb780ddadcea Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Thu, 2 Nov 2017 10:05:38 -0700
Subject: [PATCH 4/7] Added support for tagged metrics

---
 client/alloc_runner.go | 69 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 58 insertions(+), 11 deletions(-)

diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index 312b3c5512d..d37cf50c44f 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -102,6 +102,10 @@ type AllocRunner struct {
 	// can lower write volume by not re-writing these values
 	immutablePersisted bool
 	allocDirPersisted  bool
+
+	// baseLabels are used when emitting tagged metrics. All alloc runner metrics
+	// will have these tags, and optionally more.
+	baseLabels []metrics.Label
 }

 // COMPAT: Remove in 0.7.0
@@ -174,6 +178,18 @@ func NewAllocRunner(logger *log.Logger, config *config.Config, stateDB *bolt.DB,
 	// TODO Should be passed a context
 	ar.ctx, ar.exitFn = context.WithCancel(context.TODO())

+	ar.baseLabels = []metrics.Label{
+		{
+			Name:  "job",
+			Value: alloc.Job.Name,
+		},
+		{
+			Name:  "task_group",
+			Value: alloc.TaskGroup,
+		},
+	}
+
 	return ar
 }
@@ -646,7 +662,13 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 		taskState.Failed = true
 	}
 	if event.Type == structs.TaskRestarting {
-		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
+		if !r.config.DisableTaggedMetrics {
+			metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"},
+				1, r.baseLabels)
+		}
+		if r.config.BackwardsCompatibleMetrics {
+			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
+		}
 		taskState.Restarts++
 		taskState.LastRestart = time.Unix(0, event.Time)
 	}
@@ -670,7 +692,13 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 		// Capture the start time if it is just starting
 		if taskState.State != structs.TaskStateRunning {
 			taskState.StartedAt = time.Now().UTC()
-			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
+			if !r.config.DisableTaggedMetrics {
+				metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"},
+					1, r.baseLabels)
+			}
+			if r.config.BackwardsCompatibleMetrics {
+				metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
+			}
 		}
 	case structs.TaskStateDead:
 		// Capture the finished time. If it has never started there is no finish
 		// time
@@ -695,9 +723,21 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 	// Emitting metrics to indicate task complete and failures
 	if taskState.Failed {
-		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
+		if !r.config.DisableTaggedMetrics {
+			metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"},
+				1, r.baseLabels)
+		}
+		if r.config.BackwardsCompatibleMetrics {
+			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
+		}
 	} else {
-		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
+		if !r.config.DisableTaggedMetrics {
+			metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"},
+				1, r.baseLabels)
+		}
+		if r.config.BackwardsCompatibleMetrics {
+			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
+		}
 	}
 	// If the task failed, we should kill all the other tasks in the task group.
 	if taskState.Failed {
 		for _, tr := range otherTaskRunners {
@@ -804,7 +844,13 @@ func (r *AllocRunner) Run() {
 	}
 	// Increment alloc runner start counter. Incr'd even when restoring existing tasks so 1 start != 1 task execution
-	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
+	if !r.config.DisableTaggedMetrics {
+		metrics.IncrCounterWithLabels([]string{"client", "allocs", "start"},
+			1, r.baseLabels)
+	}
+	if r.config.BackwardsCompatibleMetrics {
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
+	}

 	// Start the watcher
 	wCtx, watcherCancel := context.WithCancel(r.ctx)
 	go r.watchHealth(wCtx)
@@ -935,12 +981,13 @@ func (r *AllocRunner) handleDestroy() {
 	alloc := r.Alloc()

 	// Increment the destroy count for this alloc runner since this allocation is being removed from this client.
-	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1)
-
-	//TODO(schmichael) updater can cause a GC which can block on this alloc
-	// runner shutting down. Since handleDestroy can be called by Run() we
-	// can't block shutdown here as it would cause a deadlock.
-	go r.updater(alloc)
+	if !r.config.DisableTaggedMetrics {
+		metrics.IncrCounterWithLabels([]string{"client", "allocs", "destroy"},
+			1, r.baseLabels)
+	}
+	if r.config.BackwardsCompatibleMetrics {
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1)
+	}

 	// Broadcast and persist state synchronously
 	r.sendBroadcast(alloc)
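The pattern introduced in this patch emits each counter in up to two forms: a fixed, low-cardinality key plus labels (unless tagged metrics are disabled), and the old key-embedded form (only when backwards-compatible metrics are enabled). A condensed sketch of that gate, not Nomad code; the struct below merely stands in for the client config fields referenced in the diff, and the sink, job, and task group values are assumptions:

```go
package main

import (
	"time"

	metrics "github.com/armon/go-metrics"
)

// telemetryFlags mirrors the two client config fields used in the diff.
type telemetryFlags struct {
	DisableTaggedMetrics       bool
	BackwardsCompatibleMetrics bool
}

// emitAllocCounter emits one alloc-runner counter in the tagged and/or the
// legacy dotted form, depending on the flags.
func emitAllocCounter(flags telemetryFlags, suffix, job, taskGroup string, base []metrics.Label) {
	if !flags.DisableTaggedMetrics {
		metrics.IncrCounterWithLabels([]string{"client", "allocs", suffix}, 1, base)
	}
	if flags.BackwardsCompatibleMetrics {
		metrics.IncrCounter([]string{"client", "allocs", job, taskGroup, suffix}, 1)
	}
}

func main() {
	sink := metrics.NewInmemSink(10*time.Second, time.Minute)
	metrics.NewGlobal(metrics.DefaultConfig("nomad"), sink)

	base := []metrics.Label{
		{Name: "job", Value: "example"},
		{Name: "task_group", Value: "cache"},
	}
	// With zero-value flags this emits only nomad.client.allocs.start,
	// carrying job and task_group as labels.
	emitAllocCounter(telemetryFlags{}, "start", "example", "cache", base)
}
```

Keeping the key constant and pushing the variable parts into labels bounds the number of distinct metric names, while the opt-in backwards-compatible path keeps existing dashboards working during the transition.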
From 1081d5050fdae5fe70502e1cb6593c4476ac7975 Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Thu, 2 Nov 2017 13:26:46 -0700
Subject: [PATCH 5/7] Added docs

---
 website/source/docs/agent/telemetry.html.md | 42 +++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/website/source/docs/agent/telemetry.html.md b/website/source/docs/agent/telemetry.html.md
index 0f808bae06b..1c6532743fd 100644
--- a/website/source/docs/agent/telemetry.html.md
+++ b/website/source/docs/agent/telemetry.html.md
@@ -419,6 +419,48 @@ Starting in version 0.7, Nomad will emit tagged metrics, in the below format:
     <td>Gauge</td>
     <td>node_id, datacenter, disk</td>
   </tr>
+  <tr>
+    <td>`nomad.client.allocs.start`</td>
+    <td>Number of allocations starting</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.running`</td>
+    <td>Number of allocations starting to run</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.failed`</td>
+    <td>Number of allocations failing</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.restart`</td>
+    <td>Number of allocations restarting</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.complete`</td>
+    <td>Number of allocations completing</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>job, task_group</td>
+  </tr>
+  <tr>
+    <td>`nomad.client.allocs.destroy`</td>
+    <td>Number of allocations being destroyed</td>
+    <td>Integer</td>
+    <td>Counter</td>
+    <td>job, task_group</td>
+  </tr>
 </table>

 ## Host Metrics (deprecated post Nomad 0.7)
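One practical consequence of the labels documented above is that a consumer can aggregate a counter across any single dimension instead of pattern-matching dotted names. A rough sketch using go-metrics' in-memory sink, not Nomad code; it assumes a go-metrics version with label support in which the sink retains each sample's labels, and the job and task group values are hypothetical:

```go
package main

import (
	"fmt"
	"time"

	metrics "github.com/armon/go-metrics"
)

func main() {
	sink := metrics.NewInmemSink(10*time.Second, time.Minute)
	metrics.NewGlobal(metrics.DefaultConfig("nomad"), sink)

	// Two task groups of the same job each record an allocation start.
	for _, tg := range []string{"cache", "frontend"} {
		metrics.IncrCounterWithLabels([]string{"client", "allocs", "start"}, 1,
			[]metrics.Label{
				{Name: "job", Value: "example"},
				{Name: "task_group", Value: tg},
			})
	}

	// Sum nomad.client.allocs.start per job, ignoring task_group.
	perJob := map[string]int{}
	for _, interval := range sink.Data() {
		for _, sample := range interval.Counters {
			for _, label := range sample.Labels {
				if label.Name == "job" {
					perJob[label.Value] += sample.Count
				}
			}
		}
	}
	fmt.Println(perJob) // map[example:2]
}
```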
From 5d364084755e6be0b47b749c9566cbd81b26283b Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Thu, 2 Nov 2017 13:29:10 -0700
Subject: [PATCH 6/7] Added the node_id as a tag

---
 client/alloc_runner.go                      |  4 ++++
 website/source/docs/agent/telemetry.html.md | 12 ++++++------
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index d37cf50c44f..326a39472c6 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -188,6 +188,10 @@ func NewAllocRunner(logger *log.Logger, config *config.Config, stateDB *bolt.DB,
 			Name:  "task_group",
 			Value: alloc.TaskGroup,
 		},
+		{
+			Name:  "node_id",
+			Value: ar.config.Node.ID,
+		},
 	}

 	return ar

diff --git a/website/source/docs/agent/telemetry.html.md b/website/source/docs/agent/telemetry.html.md
index 1c6532743fd..865b947d97a 100644
--- a/website/source/docs/agent/telemetry.html.md
+++ b/website/source/docs/agent/telemetry.html.md
@@ -424,42 +424,42 @@ Starting in version 0.7, Nomad will emit tagged metrics, in the below format:
     <td>Number of allocations starting</td>
     <td>Integer</td>
     <td>Counter</td>
-    <td>job, task_group</td>
+    <td>node_id, job, task_group</td>
   </tr>
   <tr>
     <td>`nomad.client.allocs.running`</td>
     <td>Number of allocations starting to run</td>
     <td>Integer</td>
     <td>Counter</td>
-    <td>job, task_group</td>
+    <td>node_id, job, task_group</td>
   </tr>
   <tr>
     <td>`nomad.client.allocs.failed`</td>
     <td>Number of allocations failing</td>
     <td>Integer</td>
     <td>Counter</td>
-    <td>job, task_group</td>
+    <td>node_id, job, task_group</td>
   </tr>
   <tr>
     <td>`nomad.client.allocs.restart`</td>
     <td>Number of allocations restarting</td>
     <td>Integer</td>
     <td>Counter</td>
-    <td>job, task_group</td>
+    <td>node_id, job, task_group</td>
   </tr>
   <tr>
     <td>`nomad.client.allocs.complete`</td>
     <td>Number of allocations completing</td>
     <td>Integer</td>
     <td>Counter</td>
-    <td>job, task_group</td>
+    <td>node_id, job, task_group</td>
   </tr>
   <tr>
     <td>`nomad.client.allocs.destroy`</td>
     <td>Number of allocations being destroyed</td>
     <td>Integer</td>
     <td>Counter</td>
-    <td>job, task_group</td>
+    <td>node_id, job, task_group</td>
   </tr>
 </table>

From 4e37ac1e890bbdf33f6c27e4abd0f6c953a8a140 Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Thu, 2 Nov 2017 13:44:13 -0700
Subject: [PATCH 7/7] Updated changelog

---
 CHANGELOG.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 013f0150559..c4c34272d40 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,9 +2,11 @@

 IMPROVEMENTS:

  * api: Allocations now track and return modify time in addition to create time.
- * cli: Allocation create and modify times are displayed in a human readable relative format like `6 h ago`.
+ * cli: Allocation create and modify times are displayed in a human readable
+   relative format like `6 h ago`.
  * core: Allow agents to be run in `rpc_upgrade_mode` when migrating a cluster
    to TLS rather than changing `heartbeat_grace`.
+ * client: Added metrics to track state transitions of allocations [GH-3061]

 BUG FIXES: