From dfbb1f63f73fc74d737e35f58f61af7c35369295 Mon Sep 17 00:00:00 2001 From: Daniel Liu Date: Tue, 10 Dec 2024 16:08:02 +0800 Subject: [PATCH] metrics: improve accuracy of CPU gauges (#26793) This PR changes metrics collection to actually measure the time interval between collections, rather than assume 3 seconds. I did some ad hoc profiling, and on slower hardware (eg, my Raspberry Pi 4) I routinely saw intervals between 3.3 - 3.5 seconds, with some being as high as 4.5 seconds. This will generally cause the CPU gauge readings to be too high, and in some cases can cause impossibly large values for the CPU load metrics (eg. greater than 400 for a 4 core CPU). --------- Co-authored-by: Felix Lange --- metrics/cpu.go | 7 ++++--- metrics/cpu_enabled.go | 4 ++-- metrics/cputime_nop.go | 2 +- metrics/cputime_unix.go | 4 ++-- metrics/metrics.go | 21 +++++++++++++++------ 5 files changed, 24 insertions(+), 14 deletions(-) diff --git a/metrics/cpu.go b/metrics/cpu.go index 72ece16e0768d..3a49cd42493a1 100644 --- a/metrics/cpu.go +++ b/metrics/cpu.go @@ -17,8 +17,9 @@ package metrics // CPUStats is the system and process CPU stats. +// All values are in seconds. type CPUStats struct { - GlobalTime int64 // Time spent by the CPU working on all processes - GlobalWait int64 // Time spent by waiting on disk for all processes - LocalTime int64 // Time spent by the CPU working on this process + GlobalTime float64 // Time spent by the CPU working on all processes + GlobalWait float64 // Time spent by waiting on disk for all processes + LocalTime float64 // Time spent by the CPU working on this process } diff --git a/metrics/cpu_enabled.go b/metrics/cpu_enabled.go index 5d0e8485d7fa4..efb2234c9990c 100644 --- a/metrics/cpu_enabled.go +++ b/metrics/cpu_enabled.go @@ -38,7 +38,7 @@ func ReadCPUStats(stats *CPUStats) { } // requesting all cpu times will always return an array with only one time stats entry timeStat := timeStats[0] - stats.GlobalTime = int64((timeStat.User + timeStat.Nice + timeStat.System) * cpu.ClocksPerSec) - stats.GlobalWait = int64((timeStat.Iowait) * cpu.ClocksPerSec) + stats.GlobalTime = timeStat.User + timeStat.Nice + timeStat.System + stats.GlobalWait = timeStat.Iowait stats.LocalTime = getProcessCPUTime() } diff --git a/metrics/cputime_nop.go b/metrics/cputime_nop.go index 0188735a78339..465d88c4d2320 100644 --- a/metrics/cputime_nop.go +++ b/metrics/cputime_nop.go @@ -21,6 +21,6 @@ package metrics // getProcessCPUTime returns 0 on Windows as there is no system call to resolve // the actual process' CPU time. -func getProcessCPUTime() int64 { +func getProcessCPUTime() float64 { return 0 } diff --git a/metrics/cputime_unix.go b/metrics/cputime_unix.go index 50bb95b31e93a..a5b50ebaa27f7 100644 --- a/metrics/cputime_unix.go +++ b/metrics/cputime_unix.go @@ -26,11 +26,11 @@ import ( ) // getProcessCPUTime retrieves the process' CPU time since program startup. -func getProcessCPUTime() int64 { +func getProcessCPUTime() float64 { var usage syscall.Rusage if err := syscall.Getrusage(syscall.RUSAGE_SELF, &usage); err != nil { log.Warn("Failed to retrieve CPU time", "err", err) return 0 } - return int64(usage.Utime.Sec+usage.Stime.Sec)*100 + int64(usage.Utime.Usec+usage.Stime.Usec)/10000 //nolint:unconvert + return float64(usage.Utime.Sec+usage.Stime.Sec) + float64(usage.Utime.Usec+usage.Stime.Usec)/1000000 //nolint:unconvert } diff --git a/metrics/metrics.go b/metrics/metrics.go index 2b8bad8bee36f..00f46e2a7c0a5 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -127,8 +127,6 @@ func CollectProcessMetrics(refresh time.Duration) { return } - refreshFreq := int64(refresh / time.Second) - // Create the various data collectors var ( cpustats = make([]CPUStats, 2) @@ -163,14 +161,25 @@ func CollectProcessMetrics(refresh time.Duration) { diskWriteBytesCounter = GetOrRegisterCounter("system/disk/writebytes", DefaultRegistry) ) + var lastCollectTime time.Time + // Iterate loading the different stats and updating the meters. now, prev := 0, 1 for ; ; now, prev = prev, now { - // CPU + // Gather CPU times. ReadCPUStats(&cpustats[now]) - cpuSysLoad.Update((cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq) - cpuSysWait.Update((cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / refreshFreq) - cpuProcLoad.Update((cpustats[now].LocalTime - cpustats[prev].LocalTime) / refreshFreq) + collectTime := time.Now() + secondsSinceLastCollect := collectTime.Sub(lastCollectTime).Seconds() + lastCollectTime = collectTime + if secondsSinceLastCollect > 0 { + sysLoad := (cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / secondsSinceLastCollect + sysWait := (cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / secondsSinceLastCollect + procLoad := (cpustats[now].LocalTime - cpustats[prev].LocalTime) / secondsSinceLastCollect + // Convert to integer percentage. + cpuSysLoad.Update(int64(sysLoad * 100)) + cpuSysWait.Update(int64(sysWait * 100)) + cpuProcLoad.Update(int64(procLoad * 100)) + } // Threads cpuThreads.Update(int64(threadCreateProfile.Count()))