From 6df1a3c944908d93366780e9baa34b67c3d4dff1 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Mon, 23 Oct 2023 10:32:38 -0400 Subject: [PATCH] metrics: prevent negative counter from iowait decrease The iowait metric obtained from `/proc/stat` can under some circumstances decrease. The relevant condition is when an interrupt arrives on a different core than the one that gets woken up for the IO, and a particular counter in the kernel for that core gets interrupted. This is documented in the man page for the `proc(5)` pseudo-filesystem, and considered an unfortunate behavior that can't be changed for the sake of ABI compatibility. In Nomad, we get the current "busy" time (everything except for idle) and compare it to the previous busy time to get the counter incremeent. If the iowait counter decreases and the idle counter increases more than the increase in the total busy time, we can get a negative total. This previously caused a panic in our metrics collection (see #15861) but that is being prevented by reporting an error message. Fix the bug by putting a zero floor on the values we return from the host CPU stats calculator. Fixes: #15861 Fixes: #18804 --- .changelog/18835.txt | 3 +++ client/hoststats/host.go | 10 ++++----- client/hoststats/host_test.go | 39 ++++++++++++++++++++++++++--------- 3 files changed, 37 insertions(+), 15 deletions(-) create mode 100644 .changelog/18835.txt diff --git a/.changelog/18835.txt b/.changelog/18835.txt new file mode 100644 index 00000000000..99d4ae17e71 --- /dev/null +++ b/.changelog/18835.txt @@ -0,0 +1,3 @@ +```release-note:bug +metrics: Fixed a bug where CPU counters could report errors for negative values +``` diff --git a/client/hoststats/host.go b/client/hoststats/host.go index dc508939ab4..70468b12145 100644 --- a/client/hoststats/host.go +++ b/client/hoststats/host.go @@ -273,7 +273,7 @@ func (h *HostCpuStatsCalculator) Calculate(times cpu.TimesStat) (idle float64, u currentIdle := times.Idle currentUser := times.User currentSystem := times.System - currentTotal := times.Total() + currentTotal := times.Total() // this is Idle + currentBusy currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq + times.Softirq + times.Steal + times.Guest + times.GuestNice @@ -284,16 +284,16 @@ func (h *HostCpuStatsCalculator) Calculate(times cpu.TimesStat) (idle float64, u total = ((currentBusy - h.prevBusy) / deltaTotal) * 100 // Protect against any invalid values - if math.IsNaN(idle) || math.IsInf(idle, 0) { + if math.IsNaN(idle) || math.IsInf(idle, 0) || idle < 0.0 { idle = 100.0 } - if math.IsNaN(user) || math.IsInf(user, 0) { + if math.IsNaN(user) || math.IsInf(user, 0) || user < 0.0 { user = 0.0 } - if math.IsNaN(system) || math.IsInf(system, 0) { + if math.IsNaN(system) || math.IsInf(system, 0) || system < 0.0 { system = 0.0 } - if math.IsNaN(total) || math.IsInf(total, 0) { + if math.IsNaN(total) || math.IsInf(total, 0) || total < 0.0 { total = 0.0 } diff --git a/client/hoststats/host_test.go b/client/hoststats/host_test.go index 9eaa9287023..0a8a7f2c0df 100644 --- a/client/hoststats/host_test.go +++ b/client/hoststats/host_test.go @@ -7,6 +7,7 @@ import ( "testing" "github.com/shirou/gopsutil/v3/cpu" + "github.com/shoenig/test/must" ) func TestHostCpuStatsCalculator_Nan(t *testing.T) { @@ -20,16 +21,34 @@ func TestHostCpuStatsCalculator_Nan(t *testing.T) { calculator.Calculate(times) idle, user, system, total := calculator.Calculate(times) - if idle != 100.0 { - t.Errorf("idle: Expected: %f, Got %f", 100.0, idle) - } - if user != 0.0 { - t.Errorf("user: Expected: %f, Got %f", 0.0, user) - } - if system != 0.0 { - t.Errorf("system: Expected: %f, Got %f", 0.0, system) + must.Eq(t, 100.0, idle, must.Sprint("unexpected idle stats")) + must.Eq(t, 0.0, user, must.Sprint("unexpected user stats")) + must.Eq(t, 0.0, system, must.Sprint("unexpected system stats")) + must.Eq(t, 0.0, total, must.Sprint("unexpected total stats")) +} + +func TestHostCpuStatsCalculator_DecreasedIOWait(t *testing.T) { + times := cpu.TimesStat{ + CPU: "cpu0", + User: 20000, + Nice: 100, + System: 9000, + Idle: 370000, + Iowait: 700, } - if total != 0.0 { - t.Errorf("total: Expected: %f, Got %f", 0.0, total) + + calculator := NewHostCpuStatsCalculator() + calculator.Calculate(times) + + times = cpu.TimesStat{ + CPU: "cpu0", + User: 20000, + Nice: 100, + System: 9000, + Idle: 380000, + Iowait: 600, } + + _, _, _, total := calculator.Calculate(times) + must.GreaterEq(t, 0.0, total, must.Sprint("total must never be negative")) }