From b17a8728b6d37bd14ba5275bb29f3908d3acf53c Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Mon, 14 Oct 2019 14:33:48 -0700 Subject: [PATCH 1/5] add a map of profilers to CPUids `runtime.NumCPU()` returns the number of CPUs that the process can run on. This number does not necessarily correlate to CPU ids if the affinity mask of the process is set. This change maintains the current behavior as default, but also allows the user to specify a range of CPUids to use instead. The CPU id is stored as the value of a map keyed on the profiler object's address. Signed-off-by: Joe Damato --- CHANGELOG.md | 1 + README.md | 7 ++++ collector/perf_linux.go | 80 +++++++++++++++++++++++++++++++++-------- 3 files changed, 74 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8eaf77306a..e07e9f5a9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ ### Changes +* [ENHANCEMENT] Add `--collector.perf.cpus` to allow setting the CPU list for perf stats. * [CHANGE] Add `--collector.netdev.device-whitelist`. #1279 * [CHANGE] Refactor mdadm collector #1403 * [CHANGE] Add `mountaddr` label to NFS metrics. #1417 diff --git a/README.md b/README.md index 7a4b9f407f..ff54260448 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,13 @@ Depending on the configured value different metrics will be available, for most cases `0` will provide the most complete set. For more information see [`man 2 perf_event_open`](http://man7.org/linux/man-pages/man2/perf_event_open.2.html). +By default, the perf collector will only collect metrics of the CPUs that +`node_exporter` can run on. If this is insufficient (e.g. if you run `node_exporter` with +its CPU affinity set to specific CPUs) You can specify a list of alternate CPUs by using the +`--collector.perf.cpus` flag. For example, to collect metrics on CPUs 2-6, you +would specify: `--collector.perf --collector.perf.cpus=2-6`. + + Name | Description | OS ---------|-------------|---- buddyinfo | Exposes statistics of memory fragments as reported by /proc/buddyinfo. | Linux diff --git a/collector/perf_linux.go b/collector/perf_linux.go index 0ab7b84f98..b6eae0207f 100644 --- a/collector/perf_linux.go +++ b/collector/perf_linux.go @@ -15,16 +15,25 @@ package collector import ( "fmt" - "runtime" - perf "github.com/hodgesds/perf-utils" "github.com/prometheus/client_golang/prometheus" + kingpin "gopkg.in/alecthomas/kingpin.v2" + "runtime" + "strconv" + "strings" ) const ( perfSubsystem = "perf" ) +var ( + cpus = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String() + hwProfilerCpuMap = make(map[*perf.HardwareProfiler]int) + swProfilerCpuMap = make(map[*perf.SoftwareProfiler]int) + cacheProfilerCpuMap = make(map[*perf.CacheProfiler]int) +) + func init() { registerCollector(perfSubsystem, defaultDisabled, NewPerfCollector) } @@ -41,6 +50,14 @@ type perfCollector struct { desc map[string]*prometheus.Desc } +func isValidCPUString(cpus *string) bool { + if cpus == nil || *cpus == "" || !strings.Contains(*cpus, "-") || strings.Count(*cpus, "-") != 1 { + return false + } + + return true +} + // NewPerfCollector returns a new perf based collector, it creates a profiler // per CPU. 
func NewPerfCollector() (Collector, error) { @@ -49,23 +66,55 @@ func NewPerfCollector() (Collector, error) { perfSwProfilers: map[int]perf.SoftwareProfiler{}, perfCacheProfilers: map[int]perf.CacheProfiler{}, } - ncpus := runtime.NumCPU() - for i := 0; i < ncpus; i++ { + + start := 0 + ncpus := 0 + var err error + + if !isValidCPUString(cpus) { + start = 0 + ncpus = runtime.NumCPU() - 1 + } else { + cpu_range := strings.Split(*cpus, "-") + start, err = strconv.Atoi(cpu_range[0]) + if err != nil { + start = 0 + } + + ncpus, err = strconv.Atoi(cpu_range[1]) + if err != nil { + ncpus = runtime.NumCPU() - 1 + } + } + + for i, idx := start, 0; i <= ncpus; i, idx = i+1, idx+1 { // Use -1 to profile all processes on the CPU, see: // man perf_event_open - collector.perfHwProfilers[i] = perf.NewHardwareProfiler(-1, i) - if err := collector.perfHwProfilers[i].Start(); err != nil { + p := perf.NewHardwareProfiler(-1, i) + collector.perfHwProfilers[idx] = p + if err := collector.perfHwProfilers[idx].Start(); err != nil { return collector, err + } else { + hwProfilerCpuMap[&p] = i } - collector.perfSwProfilers[i] = perf.NewSoftwareProfiler(-1, i) + + p2 := perf.NewSoftwareProfiler(-1, i) + collector.perfSwProfilers[i] = p2 if err := collector.perfSwProfilers[i].Start(); err != nil { return collector, err + } else { + swProfilerCpuMap[&p2] = i } - collector.perfCacheProfilers[i] = perf.NewCacheProfiler(-1, i) + + p3 := perf.NewCacheProfiler(-1, i) + collector.perfCacheProfilers[i] = p3 if err := collector.perfCacheProfilers[i].Start(); err != nil { return collector, err + } else { + cacheProfilerCpuMap[&p3] = i } } + collector.desc = map[string]*prometheus.Desc{ "cpucycles_total": prometheus.NewDesc( prometheus.BuildFQName( @@ -330,8 +379,9 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error { } func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { - for cpu, profiler := range c.perfHwProfilers { - cpuStr := fmt.Sprintf("%d", cpu) + for _, profiler := range c.perfHwProfilers { + cpuid := hwProfilerCpuMap[&profiler] + cpuStr := fmt.Sprintf("%d", cpuid) hwProfile, err := profiler.Profile() if err != nil { return err @@ -401,8 +451,9 @@ func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { } func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { - for cpu, profiler := range c.perfSwProfilers { - cpuStr := fmt.Sprintf("%d", cpu) + for _, profiler := range c.perfSwProfilers { + cpuid := swProfilerCpuMap[&profiler] + cpuStr := fmt.Sprintf("%d", cpuid) swProfile, err := profiler.Profile() if err != nil { return err @@ -456,8 +507,9 @@ func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { } func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error { - for cpu, profiler := range c.perfCacheProfilers { - cpuStr := fmt.Sprintf("%d", cpu) + for _, profiler := range c.perfCacheProfilers { + cpuid := cacheProfilerCpuMap[&profiler] + cpuStr := fmt.Sprintf("%d", cpuid) cacheProfile, err := profiler.Profile() if err != nil { return err From 27381c4a2e909a86a5ae2b76d54f3b8a8637902d Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 16 Oct 2019 11:31:23 -0700 Subject: [PATCH 2/5] Fix scoping on CPU id maps and style cleanup Signed-off-by: Joe Damato --- collector/perf_linux.go | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/collector/perf_linux.go b/collector/perf_linux.go index b6eae0207f..a4db7f90d7 100644 --- 
a/collector/perf_linux.go +++ b/collector/perf_linux.go @@ -14,6 +14,7 @@ package collector import ( + "errors" "fmt" perf "github.com/hodgesds/perf-utils" "github.com/prometheus/client_golang/prometheus" @@ -28,10 +29,7 @@ const ( ) var ( - cpus = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String() - hwProfilerCpuMap = make(map[*perf.HardwareProfiler]int) - swProfilerCpuMap = make(map[*perf.SoftwareProfiler]int) - cacheProfilerCpuMap = make(map[*perf.CacheProfiler]int) + perfCpusFlag = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String() ) func init() { @@ -44,14 +42,17 @@ func init() { // settings not all profiler values may be exposed on the target system at any // given time. type perfCollector struct { - perfHwProfilers map[int]perf.HardwareProfiler - perfSwProfilers map[int]perf.SoftwareProfiler - perfCacheProfilers map[int]perf.CacheProfiler - desc map[string]*prometheus.Desc + hwProfilerCpuMap map[*perf.HardwareProfiler]int + swProfilerCpuMap map[*perf.SoftwareProfiler]int + cacheProfilerCpuMap map[*perf.CacheProfiler]int + perfHwProfilers map[int]perf.HardwareProfiler + perfSwProfilers map[int]perf.SoftwareProfiler + perfCacheProfilers map[int]perf.CacheProfiler + desc map[string]*prometheus.Desc } func isValidCPUString(cpus *string) bool { - if cpus == nil || *cpus == "" || !strings.Contains(*cpus, "-") || strings.Count(*cpus, "-") != 1 { + if !strings.Contains(*cpus, "-") || strings.Count(*cpus, "-") != 1 { return false } @@ -71,17 +72,19 @@ func NewPerfCollector() (Collector, error) { ncpus := 0 var err error - if !isValidCPUString(cpus) { + if perfCpusFlag == nil || *perfCpusFlag == "" { start = 0 ncpus = runtime.NumCPU() - 1 + } else if !isValidCPUString(perfCpusFlag) { + return nil, errors.New("--collector.perf.cpus flag value is invalid, it must be a range (e.g. 
2-6)") } else { - cpu_range := strings.Split(*cpus, "-") - start, err = strconv.Atoi(cpu_range[0]) + cpuRange := strings.Split(*perfCpusFlag, "-") + start, err = strconv.Atoi(cpuRange[0]) if err != nil { start = 0 } - ncpus, err = strconv.Atoi(cpu_range[1]) + ncpus, err = strconv.Atoi(cpuRange[1]) if err != nil { ncpus = runtime.NumCPU() - 1 } @@ -95,7 +98,7 @@ func NewPerfCollector() (Collector, error) { if err := collector.perfHwProfilers[idx].Start(); err != nil { return collector, err } else { - hwProfilerCpuMap[&p] = i + collector.hwProfilerCpuMap[&p] = i } p2 := perf.NewSoftwareProfiler(-1, i) @@ -103,7 +106,7 @@ func NewPerfCollector() (Collector, error) { if err := collector.perfSwProfilers[i].Start(); err != nil { return collector, err } else { - swProfilerCpuMap[&p2] = i + collector.swProfilerCpuMap[&p2] = i } p3 := perf.NewCacheProfiler(-1, i) @@ -111,7 +114,7 @@ func NewPerfCollector() (Collector, error) { if err := collector.perfCacheProfilers[i].Start(); err != nil { return collector, err } else { - cacheProfilerCpuMap[&p3] = i + collector.cacheProfilerCpuMap[&p3] = i } } @@ -380,7 +383,7 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfHwProfilers { - cpuid := hwProfilerCpuMap[&profiler] + cpuid := c.hwProfilerCpuMap[&profiler] cpuStr := fmt.Sprintf("%d", cpuid) hwProfile, err := profiler.Profile() if err != nil { @@ -452,7 +455,7 @@ func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfSwProfilers { - cpuid := swProfilerCpuMap[&profiler] + cpuid := c.swProfilerCpuMap[&profiler] cpuStr := fmt.Sprintf("%d", cpuid) swProfile, err := profiler.Profile() if err != nil { @@ -508,7 +511,7 @@ func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfCacheProfilers { - cpuid := cacheProfilerCpuMap[&profiler] + cpuid := c.cacheProfilerCpuMap[&profiler] cpuStr := fmt.Sprintf("%d", cpuid) cacheProfile, err := profiler.Profile() if err != nil { From 341322ef552935ee4ab56e3d1913bf993a7fea0f Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 16 Oct 2019 13:09:07 -0700 Subject: [PATCH 3/5] Initialize CPU maps Signed-off-by: Joe Damato --- collector/perf_linux.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/collector/perf_linux.go b/collector/perf_linux.go index a4db7f90d7..bde584458f 100644 --- a/collector/perf_linux.go +++ b/collector/perf_linux.go @@ -63,9 +63,12 @@ func isValidCPUString(cpus *string) bool { // per CPU. 
func NewPerfCollector() (Collector, error) { collector := &perfCollector{ - perfHwProfilers: map[int]perf.HardwareProfiler{}, - perfSwProfilers: map[int]perf.SoftwareProfiler{}, - perfCacheProfilers: map[int]perf.CacheProfiler{}, + perfHwProfilers: map[int]perf.HardwareProfiler{}, + perfSwProfilers: map[int]perf.SoftwareProfiler{}, + perfCacheProfilers: map[int]perf.CacheProfiler{}, + hwProfilerCpuMap: map[*perf.HardwareProfiler]int{}, + swProfilerCpuMap: map[*perf.SoftwareProfiler]int{}, + cacheProfilerCpuMap: map[*perf.CacheProfiler]int{}, } start := 0 From 573cf020fe6fa3a195ce322ec7dbed487769701c Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 16 Oct 2019 13:35:30 -0700 Subject: [PATCH 4/5] Use perf Profiler pointers instead Signed-off-by: Joe Damato --- collector/perf_linux.go | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/collector/perf_linux.go b/collector/perf_linux.go index bde584458f..348a1b51fd 100644 --- a/collector/perf_linux.go +++ b/collector/perf_linux.go @@ -45,9 +45,9 @@ type perfCollector struct { hwProfilerCpuMap map[*perf.HardwareProfiler]int swProfilerCpuMap map[*perf.SoftwareProfiler]int cacheProfilerCpuMap map[*perf.CacheProfiler]int - perfHwProfilers map[int]perf.HardwareProfiler - perfSwProfilers map[int]perf.SoftwareProfiler - perfCacheProfilers map[int]perf.CacheProfiler + perfHwProfilers map[int]*perf.HardwareProfiler + perfSwProfilers map[int]*perf.SoftwareProfiler + perfCacheProfilers map[int]*perf.CacheProfiler desc map[string]*prometheus.Desc } @@ -63,9 +63,9 @@ func isValidCPUString(cpus *string) bool { // per CPU. func NewPerfCollector() (Collector, error) { collector := &perfCollector{ - perfHwProfilers: map[int]perf.HardwareProfiler{}, - perfSwProfilers: map[int]perf.SoftwareProfiler{}, - perfCacheProfilers: map[int]perf.CacheProfiler{}, + perfHwProfilers: map[int]*perf.HardwareProfiler{}, + perfSwProfilers: map[int]*perf.SoftwareProfiler{}, + perfCacheProfilers: map[int]*perf.CacheProfiler{}, hwProfilerCpuMap: map[*perf.HardwareProfiler]int{}, swProfilerCpuMap: map[*perf.SoftwareProfiler]int{}, cacheProfilerCpuMap: map[*perf.CacheProfiler]int{}, @@ -97,24 +97,24 @@ func NewPerfCollector() (Collector, error) { // Use -1 to profile all processes on the CPU, see: // man perf_event_open p := perf.NewHardwareProfiler(-1, i) - collector.perfHwProfilers[idx] = p - if err := collector.perfHwProfilers[idx].Start(); err != nil { + collector.perfHwProfilers[idx] = &p + if err := p.Start(); err != nil { return collector, err } else { collector.hwProfilerCpuMap[&p] = i } p2 := perf.NewSoftwareProfiler(-1, i) - collector.perfSwProfilers[i] = p2 - if err := collector.perfSwProfilers[i].Start(); err != nil { + collector.perfSwProfilers[i] = &p2 + if err := p2.Start(); err != nil { return collector, err } else { collector.swProfilerCpuMap[&p2] = i } p3 := perf.NewCacheProfiler(-1, i) - collector.perfCacheProfilers[i] = p3 - if err := collector.perfCacheProfilers[i].Start(); err != nil { + collector.perfCacheProfilers[i] = &p3 + if err := p3.Start(); err != nil { return collector, err } else { collector.cacheProfilerCpuMap[&p3] = i @@ -386,9 +386,9 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfHwProfilers { - cpuid := c.hwProfilerCpuMap[&profiler] + cpuid := c.hwProfilerCpuMap[profiler] cpuStr := fmt.Sprintf("%d", cpuid) - hwProfile, err := profiler.Profile() + hwProfile, 
err := (*profiler).Profile() if err != nil { return err } @@ -458,9 +458,9 @@ func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfSwProfilers { - cpuid := c.swProfilerCpuMap[&profiler] + cpuid := c.swProfilerCpuMap[profiler] cpuStr := fmt.Sprintf("%d", cpuid) - swProfile, err := profiler.Profile() + swProfile, err := (*profiler).Profile() if err != nil { return err } @@ -514,9 +514,9 @@ func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error { for _, profiler := range c.perfCacheProfilers { - cpuid := c.cacheProfilerCpuMap[&profiler] + cpuid := c.cacheProfilerCpuMap[profiler] cpuStr := fmt.Sprintf("%d", cpuid) - cacheProfile, err := profiler.Profile() + cacheProfile, err := (*profiler).Profile() if err != nil { return err } From b813bf1ebac5bdf12fecc06b6df765636f5f38ce Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 16 Oct 2019 16:04:06 -0700 Subject: [PATCH 5/5] Update docs to include 0-indexed CPU ids Signed-off-by: Joe Damato --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ff54260448..7bd8d0a0a4 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,8 @@ By default, the perf collector will only collect metrics of the CPUs that `node_exporter` can run on. If this is insufficient (e.g. if you run `node_exporter` with its CPU affinity set to specific CPUs) You can specify a list of alternate CPUs by using the `--collector.perf.cpus` flag. For example, to collect metrics on CPUs 2-6, you -would specify: `--collector.perf --collector.perf.cpus=2-6`. +would specify: `--collector.perf --collector.perf.cpus=2-6`. The CPU ids start +at 0. Name | Description | OS
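The behavior that motivates the series is easy to reproduce outside of node_exporter: on Linux, `runtime.NumCPU()` reflects the process affinity mask, so a pinned process sees a CPU count that no longer lines up with the CPU ids its perf profilers should be opened on. A minimal sketch, independent of the patches (the binary name in the comment is illustrative):

```go
package main

import (
	"fmt"
	"runtime"
)

func main() {
	// Under `taskset -c 2-6 ./numcpu`, runtime.NumCPU() reports 5 on Linux,
	// because it reflects the process affinity mask rather than the machine size.
	n := runtime.NumCPU()
	fmt.Println("runnable CPUs:", n)

	// Iterating 0..NumCPU()-1 would then open perf profilers for CPU ids 0-4
	// even though the process is pinned to CPUs 2-6; that mismatch is what the
	// --collector.perf.cpus flag lets the operator correct.
	for i := 0; i < n; i++ {
		fmt.Println("would profile CPU id", i)
	}
}
```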
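For reference, a self-contained sketch of the `start-end` form the flag accepts (inclusive range, 0-indexed CPU ids, per PATCH 5); the helper name `parseCPURange` and its error handling are illustrative rather than taken from the patches:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseCPURange parses the "start-end" form accepted by --collector.perf.cpus
// (e.g. "2-6"). The range is inclusive and CPU ids start at 0.
func parseCPURange(s string) (int, int, error) {
	parts := strings.Split(s, "-")
	if len(parts) != 2 {
		return 0, 0, fmt.Errorf("invalid CPU range %q: expected start-end", s)
	}
	start, err := strconv.Atoi(parts[0])
	if err != nil {
		return 0, 0, fmt.Errorf("invalid start CPU in %q: %v", s, err)
	}
	end, err := strconv.Atoi(parts[1])
	if err != nil {
		return 0, 0, fmt.Errorf("invalid end CPU in %q: %v", s, err)
	}
	if start < 0 || end < start {
		return 0, 0, fmt.Errorf("invalid CPU range %q: want 0 <= start <= end", s)
	}
	return start, end, nil
}

func main() {
	// `--collector.perf --collector.perf.cpus=2-6` corresponds to start=2, end=6.
	start, end, err := parseCPURange("2-6")
	if err != nil {
		panic(err)
	}
	// Mirrors the constructor's loop bounds: profilers for CPUs start..end inclusive.
	for cpu := start; cpu <= end; cpu++ {
		fmt.Println("would start hw/sw/cache profilers for CPU", cpu)
	}
}
```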
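One subtlety about the CPU-id maps: in patches 1 through 3 the map is keyed on `&p`, the address of a variable local to `NewPerfCollector`, while the update paths look up `&profiler`, the address of a `range` loop variable; those addresses never match, so the lookup yields the zero value. Storing the profiler pointer itself in both maps, as PATCH 4 does, lets both sides share one address. A standalone sketch of the pitfall, with plain strings standing in for the profiler values:

```go
package main

import "fmt"

func main() {
	// Keying a map on the address of a constructor-local variable, in the
	// style of hwProfilerCpuMap[&p] = i from patches 1-3.
	profilers := map[int]string{}
	cpuByAddr := map[*string]int{}
	for cpu := 2; cpu <= 4; cpu++ {
		p := fmt.Sprintf("profiler-for-cpu-%d", cpu)
		profilers[cpu-2] = p // stores a copy of p
		cpuByAddr[&p] = cpu  // keys on the address of the local p
	}

	// The update path ranges over the stored copies; &p below is the address
	// of the loop variable, never one of the keys recorded above, so every
	// lookup misses and falls back to the zero value 0.
	for _, p := range profilers {
		fmt.Println(p, "-> cpu", cpuByAddr[&p])
	}

	// Storing pointers in the by-index map and keying the CPU map on that
	// same pointer (the PATCH 4 approach) makes both sides agree.
	ptrs := map[int]*string{}
	cpuByPtr := map[*string]int{}
	for cpu := 2; cpu <= 4; cpu++ {
		p := fmt.Sprintf("profiler-for-cpu-%d", cpu)
		ptrs[cpu-2] = &p
		cpuByPtr[&p] = cpu
	}
	for _, p := range ptrs {
		fmt.Println(*p, "-> cpu", cpuByPtr[p]) // correct CPU ids 2, 3, 4
	}
}
```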