Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added metrics to track task/alloc start/restarts/dead events #3061

Merged
merged 7 commits into from
Nov 2, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

IMPROVEMENTS:
* api: Allocations now track and return modify time in addition to create time.
* cli: Allocation create and modify times are displayed in a human readable relative format like `6 h ago`.
* cli: Allocation create and modify times are displayed in a human readable
relative format like `6 h ago`.
* core: Allow agents to be run in `rpc_upgrade_mode` when migrating a cluster
to TLS rather than changing `heartbeat_grace`.
* client: Added metrics to track state transitions of allocations [GH-3061]

BUG FIXES:

Expand Down
71 changes: 71 additions & 0 deletions client/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"sync"
"time"

metrics "github.com/armon/go-metrics"
"github.com/boltdb/bolt"
"github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/client/allocdir"
Expand Down Expand Up @@ -101,6 +102,10 @@ type AllocRunner struct {
// can lower write volume by not re-writing these values
immutablePersisted bool
allocDirPersisted bool

// baseLabels are used when emitting tagged metrics. All alloc runner metrics
// will have these tags, and optionally more.
baseLabels []metrics.Label
}

// COMPAT: Remove in 0.7.0
Expand Down Expand Up @@ -173,6 +178,22 @@ func NewAllocRunner(logger *log.Logger, config *config.Config, stateDB *bolt.DB,

// TODO Should be passed a context
ar.ctx, ar.exitFn = context.WithCancel(context.TODO())

ar.baseLabels = []metrics.Label{
{
Name: "job",
Value: alloc.Job.Name,
},
{
Name: "task_group",
Value: alloc.TaskGroup,
},
{
Name: "node_id",
Value: ar.config.Node.ID,
},
}

return ar
}

Expand Down Expand Up @@ -645,6 +666,13 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
taskState.Failed = true
}
if event.Type == structs.TaskRestarting {
if !r.config.DisableTaggedMetrics {
metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"},
1, r.baseLabels)
}
if r.config.BackwardsCompatibleMetrics {
metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
}
taskState.Restarts++
taskState.LastRestart = time.Unix(0, event.Time)
}
Expand All @@ -668,6 +696,13 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
// Capture the start time if it is just starting
if taskState.State != structs.TaskStateRunning {
taskState.StartedAt = time.Now().UTC()
if !r.config.DisableTaggedMetrics {
metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"},
1, r.baseLabels)
}
if r.config.BackwardsCompatibleMetrics {
metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
}
}
case structs.TaskStateDead:
// Capture the finished time. If it has never started there is no finish
Expand All @@ -690,6 +725,24 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
}
}

// Emit metrics indicating whether the task completed or failed
if taskState.Failed {
if !r.config.DisableTaggedMetrics {
metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"},
1, r.baseLabels)
}
if r.config.BackwardsCompatibleMetrics {
metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
}
} else {
if !r.config.DisableTaggedMetrics {
metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"},
1, r.baseLabels)
}
if r.config.BackwardsCompatibleMetrics {
metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
}
}
// If the task failed, we should kill all the other tasks in the task group.
if taskState.Failed {
for _, tr := range otherTaskRunners {
Expand Down Expand Up @@ -794,6 +847,15 @@ func (r *AllocRunner) Run() {
return
}

// Increment the alloc runner start counter. It is incremented even when restoring existing tasks, so one start does not necessarily correspond to one task execution.
if !r.config.DisableTaggedMetrics {
metrics.IncrCounterWithLabels([]string{"client", "allocs", "start"},
1, r.baseLabels)
}
if r.config.BackwardsCompatibleMetrics {
metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
}

// Start the watcher
wCtx, watcherCancel := context.WithCancel(r.ctx)
go r.watchHealth(wCtx)
Expand Down Expand Up @@ -922,6 +984,15 @@ func (r *AllocRunner) handleDestroy() {
// state as we wait for a destroy.
alloc := r.Alloc()

// Increment the destroy count for this alloc runner since this allocation is being removed from this client.
if !r.config.DisableTaggedMetrics {
metrics.IncrCounterWithLabels([]string{"client", "allocs", "destroy"},
1, r.baseLabels)
}
if r.config.BackwardsCompatibleMetrics {
metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1)
}

// Broadcast and persist state synchronously
r.sendBroadcast(alloc)
if err := r.saveAllocRunnerState(); err != nil {
Expand Down
42 changes: 42 additions & 0 deletions website/source/docs/agent/telemetry.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,48 @@ Starting in version 0.7, Nomad will emit tagged metrics, in the below format:
<td>Gauge</td>
<td>node_id, datacenter, disk</td>
</tr>
<tr>
<td>`nomad.client.allocs.start`</td>
<td>Number of allocations starting</td>
<td>Integer</td>
<td>Counter</td>
<td>node_id, job, task_group</td>
</tr>
<tr>
<td>`nomad.client.allocs.running`</td>
<td>Number of allocations starting to run</td>
<td>Integer</td>
<td>Counter</td>
<td>node_id, job, task_group</td>
</tr>
<tr>
<td>`nomad.client.allocs.failed`</td>
<td>Number of allocations failing</td>
<td>Integer</td>
<td>Counter</td>
<td>node_id, job, task_group</td>
</tr>
<tr>
<td>`nomad.client.allocs.restart`</td>
<td>Number of allocations restarting</td>
<td>Integer</td>
<td>Counter</td>
<td>node_id, job, task_group</td>
</tr>
<tr>
<td>`nomad.client.allocs.complete`</td>
<td>Number of allocations completing</td>
<td>Integer</td>
<td>Counter</td>
<td>node_id, job, task_group</td>
</tr>
<tr>
<td>`nomad.client.allocs.destroy`</td>
<td>Number of allocations being destroyed</td>
<td>Integer</td>
<td>Counter</td>
<td>node_id, job, task_group</td>
</tr>
</table>

## Host Metrics (deprecated post Nomad 0.7)
Expand Down