Skip to content

Commit

Permalink
Merge pull request #3061 from hashicorp/f-add-client-metrics
Browse files Browse the repository at this point in the history
Added metrics to track task/alloc start/restarts/dead events
  • Loading branch information
diptanu authored Nov 2, 2017
2 parents ab9de63 + 4e37ac1 commit 533a0f1
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 1 deletion.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

IMPROVEMENTS:
* api: Allocations now track and return modify time in addition to create time.
* cli: Allocation create and modify times are displayed in a human readable relative format like `6 h ago`.
* cli: Allocation create and modify times are displayed in a human readable
relative format like `6 h ago`.
* core: Allow agents to be run in `rpc_upgrade_mode` when migrating a cluster
to TLS rather than changing `heartbeat_grace`.
* client: Added metrics to track state transitions of allocations [GH-3061]

BUG FIXES:

Expand Down
71 changes: 71 additions & 0 deletions client/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"sync"
"time"

metrics "github.com/armon/go-metrics"
"github.com/boltdb/bolt"
"github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/client/allocdir"
Expand Down Expand Up @@ -101,6 +102,10 @@ type AllocRunner struct {
// can lower write volume by not re-writing these values
immutablePersisted bool
allocDirPersisted bool

// baseLabels are used when emitting tagged metrics. All alloc runner metrics
// will have these tags, and optionally more.
baseLabels []metrics.Label
}

// COMPAT: Remove in 0.7.0
Expand Down Expand Up @@ -173,6 +178,22 @@ func NewAllocRunner(logger *log.Logger, config *config.Config, stateDB *bolt.DB,

// TODO Should be passed a context
ar.ctx, ar.exitFn = context.WithCancel(context.TODO())

ar.baseLabels = []metrics.Label{
{
Name: "job",
Value: alloc.Job.Name,
},
{
Name: "task_group",
Value: alloc.TaskGroup,
},
{
Name: "node_id",
Value: ar.config.Node.ID,
},
}

return ar
}

Expand Down Expand Up @@ -645,6 +666,13 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
taskState.Failed = true
}
if event.Type == structs.TaskRestarting {
if !r.config.DisableTaggedMetrics {
metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"},
1, r.baseLabels)
}
if r.config.BackwardsCompatibleMetrics {
metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
}
taskState.Restarts++
taskState.LastRestart = time.Unix(0, event.Time)
}
Expand All @@ -668,6 +696,13 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
// Capture the start time if it is just starting
if taskState.State != structs.TaskStateRunning {
taskState.StartedAt = time.Now().UTC()
if !r.config.DisableTaggedMetrics {
metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"},
1, r.baseLabels)
}
if r.config.BackwardsCompatibleMetrics {
metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
}
}
case structs.TaskStateDead:
// Capture the finished time. If it has never started there is no finish
Expand All @@ -690,6 +725,24 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
}
}

// Emit metrics indicating whether the task completed or failed
if taskState.Failed {
if !r.config.DisableTaggedMetrics {
metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"},
1, r.baseLabels)
}
if r.config.BackwardsCompatibleMetrics {
metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
}
} else {
if !r.config.DisableTaggedMetrics {
metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"},
1, r.baseLabels)
}
if r.config.BackwardsCompatibleMetrics {
metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
}
}
// If the task failed, we should kill all the other tasks in the task group.
if taskState.Failed {
for _, tr := range otherTaskRunners {
Expand Down Expand Up @@ -794,6 +847,15 @@ func (r *AllocRunner) Run() {
return
}

// Increment the alloc runner start counter. It is incremented even when
// restoring existing tasks, so one start does not imply one task execution.
if !r.config.DisableTaggedMetrics {
metrics.IncrCounterWithLabels([]string{"client", "allocs", "start"},
1, r.baseLabels)
}
if r.config.BackwardsCompatibleMetrics {
metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
}

// Start the watcher
wCtx, watcherCancel := context.WithCancel(r.ctx)
go r.watchHealth(wCtx)
Expand Down Expand Up @@ -922,6 +984,15 @@ func (r *AllocRunner) handleDestroy() {
// state as we wait for a destroy.
alloc := r.Alloc()

// Increment the destroy count for this alloc runner since this allocation is being removed from this client.
if !r.config.DisableTaggedMetrics {
metrics.IncrCounterWithLabels([]string{"client", "allocs", "destroy"},
1, r.baseLabels)
}
if r.config.BackwardsCompatibleMetrics {
metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1)
}

// Broadcast and persist state synchronously
r.sendBroadcast(alloc)
if err := r.saveAllocRunnerState(); err != nil {
Expand Down
42 changes: 42 additions & 0 deletions website/source/docs/agent/telemetry.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,48 @@ Starting in version 0.7, Nomad will emit tagged metrics, in the below format:
<td>Gauge</td>
<td>node_id, datacenter, disk</td>
</tr>
<tr>
<td>`nomad.client.allocs.start`</td>
<td>Number of allocations starting</td>
<td>Integer</td>
<td>Counter</td>
<td>node_id, job, task_group</td>
</tr>
<tr>
<td>`nomad.client.allocs.running`</td>
<td>Number of tasks (across allocations) transitioning to the running state</td>
<td>Integer</td>
<td>Counter</td>
<td>node_id, job, task_group</td>
</tr>
<tr>
<td>`nomad.client.allocs.failed`</td>
<td>Number of tasks (across allocations) that died in a failed state</td>
<td>Integer</td>
<td>Counter</td>
<td>node_id, job, task_group</td>
</tr>
<tr>
<td>`nomad.client.allocs.restart`</td>
<td>Number of task restarts (across allocations)</td>
<td>Integer</td>
<td>Counter</td>
<td>node_id, job, task_group</td>
</tr>
<tr>
<td>`nomad.client.allocs.complete`</td>
<td>Number of tasks (across allocations) that died without failing</td>
<td>Integer</td>
<td>Counter</td>
<td>node_id, job, task_group</td>
</tr>
<tr>
<td>`nomad.client.allocs.destroy`</td>
<td>Number of allocations being destroyed</td>
<td>Integer</td>
<td>Counter</td>
<td>node_id, job, task_group</td>
</tr>
</table>

## Host Metrics (deprecated post Nomad 0.7)
Expand Down

0 comments on commit 533a0f1

Please sign in to comment.