Skip to content

Commit

Permalink
Linux CPU: Cache CPU metrics
Browse files Browse the repository at this point in the history
Cache CPU metrics to avoid counters (e.g. iowait) jumping backwards.

Fixes: #1686

Signed-off-by: Ben Kochie <[email protected]>
  • Loading branch information
SuperQ committed May 24, 2020
1 parent b8847b5 commit 3565316
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
steps:
- checkout
- run: sudo pip install codespell
- run: codespell --skip=".git,./vendor,ttar,go.mod,go.sum,*pem" -L uint,packages\',uptodate
- run: codespell --skip=".git,./vendor,ttar,go.mod,go.sum,*pem,./collector/fixtures" -L uint,packages\',uptodate

build:
machine:
Expand Down
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* [CHANGE]
* [FEATURE]
* [ENHANCEMENT]
* [BUGFIX]
* [BUGFIX] Linux CPU: Cache CPU metrics to make them monotonically increasing #1711

## 1.0.0-rc.1 / 2020-05-14

Expand Down
85 changes: 84 additions & 1 deletion collector/cpu_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"fmt"
"path/filepath"
"strconv"
"sync"

"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
Expand All @@ -35,6 +36,8 @@ type cpuCollector struct {
cpuCoreThrottle *prometheus.Desc
cpuPackageThrottle *prometheus.Desc
logger log.Logger
cpuStats []procfs.CPUStat
cpuStatsMutex sync.Mutex
}

var (
Expand Down Expand Up @@ -203,7 +206,12 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {
return err
}

for cpuID, cpuStat := range stats.CPU {
c.updateCPUStats(stats.CPU)

// Acquire a lock to read the stats.
c.cpuStatsMutex.Lock()
defer c.cpuStatsMutex.Unlock()
for cpuID, cpuStat := range c.cpuStats {
cpuNum := strconv.Itoa(cpuID)
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.User, cpuNum, "user")
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.Nice, cpuNum, "nice")
Expand All @@ -221,3 +229,78 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {

return nil
}

// updateCPUStats merges a fresh set of per-CPU readings into the collector's
// cached stats while holding cpuStatsMutex.
//
// The cache is reallocated (and therefore zeroed) whenever the number of
// entries in newStats differs from the cached length. For each CPU, an Idle
// value lower than the cached one is treated as a possible hotplug event and
// clears that CPU's cached entry before the new Idle value is stored. Every
// other field is overwritten only when the new value is greater than or equal
// to the cached one; otherwise the cached value is kept and a warning is
// logged.
func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
	c.cpuStatsMutex.Lock()
	defer c.cpuStatsMutex.Unlock()

	// A different CPU count invalidates the whole cache.
	if len(newStats) != len(c.cpuStats) {
		c.cpuStats = make([]procfs.CPUStat, len(newStats))
	}

	// advance stores val into *dst unless that would move the cached value
	// backwards, in which case the cached value is kept and msg is logged.
	advance := func(cpu int, dst *float64, val float64, msg string) {
		if val >= *dst {
			*dst = val
			return
		}
		level.Warn(c.logger).Log("msg", msg, "cpu", cpu, "old_value", *dst, "new_value", val)
	}

	for cpu := range newStats {
		fresh := newStats[cpu]
		cached := &c.cpuStats[cpu]

		// Idle going backwards is taken as a sign the CPU was re-plugged:
		// drop everything cached for it so the checks below start from zero.
		if fresh.Idle < cached.Idle {
			level.Warn(c.logger).Log("msg", "CPU Idle counter jumped backwards, possible hotplug event, resetting CPU stats", "cpu", cpu, "old_value", cached.Idle, "new_value", fresh.Idle)
			*cached = procfs.CPUStat{}
		}
		cached.Idle = fresh.Idle

		advance(cpu, &cached.User, fresh.User, "CPU User counter jumped backwards")
		advance(cpu, &cached.Nice, fresh.Nice, "CPU Nice counter jumped backwards")
		advance(cpu, &cached.System, fresh.System, "CPU System counter jumped backwards")
		advance(cpu, &cached.Iowait, fresh.Iowait, "CPU Iowait counter jumped backwards")
		advance(cpu, &cached.IRQ, fresh.IRQ, "CPU IRQ counter jumped backwards")
		advance(cpu, &cached.SoftIRQ, fresh.SoftIRQ, "CPU SoftIRQ counter jumped backwards")
		advance(cpu, &cached.Steal, fresh.Steal, "CPU Steal counter jumped backwards")
		advance(cpu, &cached.Guest, fresh.Guest, "CPU Guest counter jumped backwards")
		advance(cpu, &cached.GuestNice, fresh.GuestNice, "CPU GuestNice counter jumped backwards")
	}
}

0 comments on commit 3565316

Please sign in to comment.