Skip to content

Commit

Permalink
Add job status metrics
Browse files Browse the repository at this point in the history
This avoids having to write services to repeatedly hit the jobs API
  • Loading branch information
pete-woods committed Jul 26, 2019
1 parent beeb31e commit d07ea34
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 2 deletions.
59 changes: 59 additions & 0 deletions nomad/leader.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,9 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error {
// Periodically publish job summary metrics
go s.publishJobSummaryMetrics(stopCh)

// Periodically publish job status metrics
go s.publishJobStatusMetrics(stopCh)

// Setup the heartbeat timers. This is done both when starting up or when
// a leader fail over happens. Since the timers are maintained by the leader
// node, effectively this means all the timers are renewed at the time of failover.
Expand Down Expand Up @@ -700,6 +703,62 @@ func (s *Server) iterateJobSummaryMetrics(summary *structs.JobSummary) {
}
}

// publishJobStatusMetrics periodically emits gauges describing how many jobs
// are in each status. The first collection fires immediately (the timer is
// created with a zero duration); every later one is scheduled
// s.config.StatsCollectionInterval after the previous tick. The loop exits
// once a receive on stopCh succeeds.
func (s *Server) publishJobStatusMetrics(stopCh chan struct{}) {
	tick := time.NewTimer(0)
	defer tick.Stop()

	// collect takes a state snapshot, walks the jobs table, and hands the
	// iterator to iterateJobStatusMetrics. Failures are logged and the
	// current round is abandoned; the next tick tries again.
	collect := func() {
		snap, err := s.State().Snapshot()
		if err != nil {
			s.logger.Error("failed to get state", "error", err)
			return
		}

		jobsIter, err := snap.Jobs(memdb.NewWatchSet())
		if err != nil {
			s.logger.Error("failed to get job statuses", "error", err)
			return
		}

		s.iterateJobStatusMetrics(&jobsIter)
	}

	for {
		select {
		case <-stopCh:
			return
		case <-tick.C:
			// Re-arm before doing the work so that the interval is measured
			// from the tick rather than from the end of the collection.
			tick.Reset(s.config.StatsCollectionInterval)
			collect()
		}
	}
}

// iterateJobStatusMetrics drains the given job iterator, tallies the jobs by
// status, and publishes one gauge per status under nomad.job_status.*.
// Only the pending, running, and dead statuses are counted; a job with any
// other status is ignored. Every item yielded by the iterator is asserted to
// be a *structs.Job.
func (s *Server) iterateJobStatusMetrics(jobs *memdb.ResultIterator) {
	var (
		numPending int64 // jobs whose status is structs.JobStatusPending
		numRunning int64 // jobs whose status is structs.JobStatusRunning
		numDead    int64 // jobs whose status is structs.JobStatusDead
	)

	// Next returns nil once the iterator is exhausted.
	for item := (*jobs).Next(); item != nil; item = (*jobs).Next() {
		switch item.(*structs.Job).Status {
		case structs.JobStatusPending:
			numPending++
		case structs.JobStatusRunning:
			numRunning++
		case structs.JobStatusDead:
			numDead++
		}
	}

	// publish emits a single gauge named nomad.job_status.<status>.
	publish := func(status string, count int64) {
		metrics.SetGauge([]string{"nomad", "job_status", status}, float32(count))
	}
	publish("pending", numPending)
	publish("running", numRunning)
	publish("dead", numDead)
}

// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
Expand Down
35 changes: 33 additions & 2 deletions website/source/docs/telemetry/metrics.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -705,9 +705,9 @@ detailed above) but any new metrics will only be available in the new format.
</tr>
</table>

## Job Metrics
## Job Summary Metrics

Job metrics are emitted by the Nomad leader server.
Job summary metrics are emitted by the Nomad leader server.

<table class="table table-bordered table-striped">
<tr>
Expand Down Expand Up @@ -761,6 +761,37 @@ Job metrics are emitted by the Nomad leader server.
</tr>
</table>

## Job Status Metrics

Job status metrics are emitted by the Nomad leader server.

<table class="table table-bordered table-striped">
<tr>
<th>Metric</th>
<th>Description</th>
<th>Unit</th>
<th>Type</th>
</tr>
<tr>
<td>`nomad.job_status.pending`</td>
<td>Number of jobs pending</td>
<td>Integer</td>
<td>Gauge</td>
</tr>
<tr>
<td>`nomad.job_status.running`</td>
<td>Number of jobs running</td>
<td>Integer</td>
<td>Gauge</td>
</tr>
<tr>
<td>`nomad.job_status.dead`</td>
<td>Number of dead jobs</td>
<td>Integer</td>
<td>Gauge</td>
</tr>
</table>

## Metric Types

<table class="table table-bordered table-striped">
Expand Down

0 comments on commit d07ea34

Please sign in to comment.