Skip to content

Commit

Permalink
Add ping metrics
Browse files Browse the repository at this point in the history
This enables fetching device statistics from UNMS, extracting
the ping RTT values from them, and exporting them back to Prometheus.

Since fetching statistics requires talking to another, device-specific
endpoint, the context timeout is increased from 5 to 30s. A future
change should pass the request context from the promhttp endpoint
down to the fetchDeviceData() method, to make the timeout depend on
Prometheus' scrape request life cycle.
  • Loading branch information
dmke committed Apr 9, 2022
1 parent 28f163f commit 8dfe88a
Show file tree
Hide file tree
Showing 5 changed files with 255 additions and 4 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,11 @@ If an interface is marked as the WAN interface, these metrics are populated.
- `wan_tx_bytes`: Bytes transmitted since last reset
- `wan_rx_rate`: Bytes received rate (momentarily)
- `wan_tx_rate`: Bytes transmitted rate (momentarily)

### Ping Metrics

- `ping_loss_ratio`: Packet loss ratio (range 0-1, with 0.33 meaning 33% packet loss)
- `ping_rtt_best_seconds`: Best round trip time, in seconds
- `ping_rtt_mean_seconds`: Mean round trip time, in seconds
- `ping_rtt_worst_seconds`: Worst round trip time, in seconds
- `ping_rtt_std_deviation_seconds`: Standard deviation for round trip time, in seconds
46 changes: 42 additions & 4 deletions exporter/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,13 @@ import (
"github.com/ffddorf/unms-exporter/models"
)

func (e *Exporter) fetchDeviceData() ([]*models.DeviceStatusOverview, error) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
// Device bundles a device's status overview with its (optional)
// statistics. The overview is embedded, so its fields are accessible
// directly on Device.
type Device struct {
// Statistics may be nil; PingMetrics treats a nil value as "no data".
Statistics *models.DeviceStatistics
*models.DeviceStatusOverview
}

func (e *Exporter) fetchDeviceData() ([]Device, error) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()

params := &devices.GetDevicesParams{
Expand All @@ -21,13 +26,46 @@ func (e *Exporter) fetchDeviceData() ([]*models.DeviceStatusOverview, error) {
return nil, err
}

data := make([]*models.DeviceStatusOverview, 0, len(devicesResponse.Payload))
data := make([]Device, 0, len(devicesResponse.Payload))
for _, overview := range devicesResponse.Payload {
if overview.Identification == nil {
continue
}
data = append(data, overview)
dev := Device{nil, overview}

if id := derefOrEmpty(overview.Identification.ID); id != "" {
params := &devices.GetDevicesIDStatisticsParams{
ID: id,
Interval: "hour", // smallest interval possible
Context: ctx,
}
statisticsResponse, err := e.api.Devices.GetDevicesIDStatistics(params)
if err != nil {
return nil, err
}
dev.Statistics = statisticsResponse.Payload
}
data = append(data, dev)
}

return data, nil
}

// PingMetrics aggregates the ping samples stored in the device statistics
// into a single PingMetrics data point.
//
// It returns nil when the device carries no statistics or the statistics
// contain no ping samples. A nil sample is recorded as a lost packet; for
// every other sample the Y value is interpreted as a round-trip time in
// milliseconds.
func (dev *Device) PingMetrics() *PingMetrics {
	stats := dev.Statistics
	if stats == nil {
		return nil
	}

	samples := stats.Ping
	if len(samples) == 0 {
		return nil
	}

	history := NewHistory(len(samples))
	for _, point := range samples {
		lost := point == nil

		// Lost packets carry no round-trip time; keep the zero duration.
		var rtt time.Duration
		if !lost {
			rtt = time.Duration(point.Y * float64(time.Millisecond))
		}
		history.Add(rtt, lost)
	}

	return history.Compute()
}
91 changes: 91 additions & 0 deletions exporter/device_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
package exporter

import (
"testing"
"time"

"github.com/ffddorf/unms-exporter/models"
)

// Duration shorthands that keep the expectations in the tests below short.
const (
ms = time.Millisecond
µs = time.Microsecond //nolint:asciicheck
)

// metricExpectation maps a human-readable field name to the value observed
// for that field and to whether that value met the test's expectation.
type metricExpectation map[string]struct {
actual interface{} // observed value, printed when the expectation is not satisfied
satisfied bool // result of the check, evaluated by the caller
}

// comparePingMetrics reports an error for every expectation that is not
// satisfied and, if at least one was reported, stops the calling test
// immediately afterwards. The actual parameter is currently not inspected;
// the observed values are carried inside the expectations themselves.
func comparePingMetrics(t *testing.T, expectations metricExpectation, actual *PingMetrics) {
	t.Helper()

	failures := 0
	for name, exp := range expectations {
		if exp.satisfied {
			continue
		}
		failures++
		t.Errorf("unexpected value for field %q: %v", name, exp.actual)
	}

	if failures == 0 {
		return
	}
	t.FailNow()
}

// TestDevice_PingMetrics_connected checks the aggregated metrics for a
// device whose ping history contains no lost packets.
func TestDevice_PingMetrics_connected(t *testing.T) {
	t.Parallel()

	subject := Device{
		Statistics: &models.DeviceStatistics{
			Ping: models.ListOfCoordinates{{Y: 5}, {Y: 10}, {Y: 25}, {Y: 15}, {Y: 1}}, // x values are ignored
		},
	}

	actual := subject.PingMetrics()
	if actual == nil {
		// Fatal, not Error: the expectations below dereference actual and
		// would otherwise panic with a nil pointer dereference.
		t.Fatal("expected PingMetrics() to return somthing, got nil")
	}

	comparePingMetrics(t, metricExpectation{
		"packets sent": {actual.PacketsSent, actual.PacketsSent == 5},
		"packets lost": {actual.PacketsLost, actual.PacketsLost == 0},
		"rtt best":     {actual.Best, actual.Best == 1*ms},
		"rtt worst":    {actual.Worst, actual.Worst == 25*ms},
		"rtt median":   {actual.Median, actual.Median == 10*ms},
		"rtt meain":    {actual.Mean, actual.Mean == 11200*µs},                              // 11.2ms
		"rtt std dev":  {actual.StdDev, 8350*µs < actual.StdDev && actual.StdDev < 8360*µs}, // ~8.352245ms
	}, actual)
}

// TestDevice_PingMetrics_missingPackets checks the aggregated metrics for a
// device whose ping history contains nil samples, which count as lost
// packets.
func TestDevice_PingMetrics_missingPackets(t *testing.T) {
	t.Parallel()

	subject := Device{
		Statistics: &models.DeviceStatistics{
			Ping: models.ListOfCoordinates{nil, {Y: 100}, {Y: 250}, nil, {Y: 120}},
		},
	}

	actual := subject.PingMetrics()
	if actual == nil {
		// Fatal, not Error: the expectations below dereference actual and
		// would otherwise panic with a nil pointer dereference.
		t.Fatal("expected PingMetrics() to return somthing, got nil")
	}

	comparePingMetrics(t, metricExpectation{
		"packets sent": {actual.PacketsSent, actual.PacketsSent == 5},
		"packets lost": {actual.PacketsLost, actual.PacketsLost == 2},
		"rtt best":     {actual.Best, actual.Best == 100*ms},
		"rtt worst":    {actual.Worst, actual.Worst == 250*ms},
		"rtt median":   {actual.Median, actual.Median == 120*ms},
		"rtt meain":    {actual.Mean, 156666*µs < actual.Mean && actual.Mean < 156667*µs},     // 156.66666ms
		"rtt std dev":  {actual.StdDev, 66499*µs < actual.StdDev && actual.StdDev < 66500*µs}, // ~66.499791ms
	}, actual)
}

// TestDevice_PingMetrics_disconnected checks that a device without any
// statistics yields no ping metrics.
func TestDevice_PingMetrics_disconnected(t *testing.T) {
	t.Parallel()

	var subject Device
	actual := subject.PingMetrics()
	if actual == nil {
		return
	}
	t.Errorf("expected PingMetrics() to return nil, got %+v", actual)
}
19 changes: 19 additions & 0 deletions exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ var metricSpecs = map[string]metricSpec{
"wan_tx_bytes": newSpec("Bytes sent on WAN interface", nil),
"wan_rx_rate": newSpec("Receive rate on WAN interface", nil),
"wan_tx_rate": newSpec("Transmit rate on WAN interface", nil),

"ping_loss_ratio": newSpec("Ping packet loss ratio", nil),
"ping_rtt_best_seconds": newSpec("Best ping round trip time in seconds", nil),
"ping_rtt_mean_seconds": newSpec("Mean ping round trip time in seconds", nil),
"ping_rtt_worst_seconds": newSpec("Worst ping round trip time in seconds", nil),
"ping_rtt_std_deviation_seconds": newSpec("Standard deviation for ping round trip time in seconds", nil),
}

type Exporter struct {
Expand Down Expand Up @@ -244,6 +250,19 @@ func (e *Exporter) collectImpl(out chan<- prom.Metric) error {
out <- e.newMetric("wan_rx_rate", prom.GaugeValue, wanIF.Statistics.Rxrate, deviceLabels...)
out <- e.newMetric("wan_tx_rate", prom.GaugeValue, wanIF.Statistics.Txrate, deviceLabels...)
}

// Ping metrics
ratio := 1.0
if ping := device.PingMetrics(); ping != nil {
if ping.PacketsSent > 0 {
ratio = float64(ping.PacketsLost) / float64(ping.PacketsSent)
}
out <- e.newMetric("ping_rtt_best_seconds", prom.GaugeValue, ping.Best.Seconds(), deviceLabels...)
out <- e.newMetric("ping_rtt_mean_seconds", prom.GaugeValue, ping.Mean.Seconds(), deviceLabels...)
out <- e.newMetric("ping_rtt_worst_seconds", prom.GaugeValue, ping.Worst.Seconds(), deviceLabels...)
out <- e.newMetric("ping_rtt_std_deviation_seconds", prom.GaugeValue, ping.StdDev.Seconds(), deviceLabels...)
}
out <- e.newMetric("ping_loss_ratio", prom.GaugeValue, ratio, deviceLabels...)
}

return nil
Expand Down
95 changes: 95 additions & 0 deletions exporter/ping_rtt.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package exporter

import (
"math"
"sort"
"time"
)

// PingMetrics is a plain data point aggregated from a list of PingResults.
//
// When every packet of the underlying history was lost there are no
// round-trip times to aggregate; all duration fields are zero in that case.
type PingMetrics struct {
	PacketsSent int           // number of packets sent
	PacketsLost int           // number of packets lost
	Best        time.Duration // best (lowest) RTT
	Worst       time.Duration // worst (highest) RTT
	Median      time.Duration // median RTT
	Mean        time.Duration // mean RTT
	StdDev      time.Duration // RTT standard deviation (population, i.e. divided by N)
}

// PingResult stores the information about a single ping, in particular
// the round-trip time or whether the packet was lost.
type PingResult struct {
	RTT  time.Duration
	Lost bool
}

// PingHistory represents the ping history for a single node/device.
type PingHistory []PingResult

// NewHistory creates a new, empty PingHistory with the given capacity.
func NewHistory(capacity int) PingHistory {
	return make(PingHistory, 0, capacity)
}

// Add appends a ping result to the history. The rtt value is ignored by
// Compute when lost is true.
func (h *PingHistory) Add(rtt time.Duration, lost bool) {
	*h = append(*h, PingResult{RTT: rtt, Lost: lost})
}

// Compute aggregates the result history into a single data point.
//
// It returns nil for an empty history. Lost packets only contribute to the
// PacketsSent and PacketsLost counters. If all packets were lost, the
// returned duration fields are zero: there is nothing to aggregate, and
// dividing by the number of received packets would produce NaN, whose
// conversion to time.Duration is implementation-dependent.
func (h PingHistory) Compute() *PingMetrics {
	numTotal := len(h)
	if numTotal == 0 {
		return nil
	}

	// Collect the RTTs (in seconds) of all received packets.
	data := make([]float64, 0, numTotal)
	var best, worst, total float64
	for _, curr := range h {
		if curr.Lost {
			continue
		}

		rtt := curr.RTT.Seconds()
		if len(data) == 0 || rtt < best {
			best = rtt
		}
		if len(data) == 0 || rtt > worst {
			worst = rtt
		}
		data = append(data, rtt)
		total += rtt
	}

	metrics := &PingMetrics{
		PacketsSent: numTotal,
		PacketsLost: numTotal - len(data),
	}
	if len(data) == 0 {
		// Every packet was lost: leave all durations at zero.
		return metrics
	}

	size := float64(len(data))
	mean := total / size

	var sumSquares float64
	for _, rtt := range data {
		sumSquares += math.Pow(rtt-mean, 2)
	}
	stddev := math.Sqrt(sumSquares / size)

	// The median needs sorted data; best/worst/mean were captured above,
	// so reordering the slice in place is safe.
	sort.Float64s(data)
	mid := len(data) / 2
	median := data[mid]
	if len(data)%2 == 0 {
		median = (data[mid-1] + data[mid]) / 2
	}

	toDuration := func(seconds float64) time.Duration {
		return time.Duration(seconds * float64(time.Second))
	}
	metrics.Best = toDuration(best)
	metrics.Worst = toDuration(worst)
	metrics.Median = toDuration(median)
	metrics.Mean = toDuration(mean)
	metrics.StdDev = toDuration(stddev)

	return metrics
}

0 comments on commit 8dfe88a

Please sign in to comment.