From 49a9ead8668f7e9e4e472eeed46944a92e69b87d Mon Sep 17 00:00:00 2001 From: Richard Elling Date: Sun, 3 Mar 2019 15:27:56 -0800 Subject: [PATCH 1/3] add spurious_interrupt measurement --- plugins/inputs/interrupts/README.md | 130 ++++++++++++------- plugins/inputs/interrupts/interrupts.go | 99 ++++++++++++-- plugins/inputs/interrupts/interrupts_test.go | 23 +++- 3 files changed, 194 insertions(+), 58 deletions(-) diff --git a/plugins/inputs/interrupts/README.md b/plugins/inputs/interrupts/README.md index 5da647f47793f..97c295d5a51db 100644 --- a/plugins/inputs/interrupts/README.md +++ b/plugins/inputs/interrupts/README.md @@ -1,6 +1,7 @@ # Interrupts Input Plugin -The interrupts plugin gathers metrics about IRQs from `/proc/interrupts` and `/proc/softirqs`. +The interrupts plugin gathers metrics about IRQs from `/proc/interrupts`, `/proc/softirqs`, +and `/proc/irq/IRQ_NUMBER/spurious` ### Configuration ```toml @@ -13,65 +14,99 @@ The interrupts plugin gathers metrics about IRQs from `/proc/interrupts` and `/p ## deployments. # cpu_as_tag = false + ## spurious interrupt counters can be collected + # spurious = false + ## To filter which IRQs to collect, make use of tagpass / tagdrop, i.e. # [inputs.interrupts.tagdrop] # irq = [ "NET_RX", "TASKLET" ] ``` -### Metrics +### Measurements -There are two styles depending on the value of `cpu_as_tag`. +There are two styles of `interrupts` and `soft_interrupts` depending on the value of `cpu_as_tag`. +When `cpu_as_tag` is `false` the per-CPU count is in a field. When `true` the CPU is a tag and +there is one point per interrupt per CPU. Having the CPU as a tag easily allows queries by-CPU at +the cost of requiring greater cardinality. -With `cpu_as_tag = false`: +For `spurious_interrupts` there is only a per-interrupt counter available; no per-CPU info is +available. 
-- interrupts - - tags: - - irq (IRQ name) - - type - - device (name of the device that is located at the IRQ) - - cpu - - fields: - - cpu (int, number of interrupts per cpu) - - total (int, total number of interrupts) - -- soft_interrupts - - tags: - - irq (IRQ name) - - type - - device (name of the device that is located at the IRQ) - - cpu - - fields: - - cpu (int, number of interrupts per cpu) - - total (int, total number of interrupts) +#### interrupts +The `interrupts` measurement reports the hard interrupt data collected from the +`/proc/interrupts` file in Linux. +These interrupts are typically from hardware devices, but can also be generated by +software, such as timers and interprocessor interrupts. -With `cpu_as_tag = true`: +##### interrupts tags +| cpu_as_tag | tag | description | +|:---:|:---:|---| +| - | device | description of device | +| - | irq | IRQ name (ephemeral, could be a number) | +| - | type | interrupt type | +| true | cpu | CPU number | + +##### interrupts fields +| cpu_as_tag | field | type | units | description | +|:---:|:---:|:---:|:---:|---| +| true | count | counter | events | number of times the interrupt has been handled by the CPU in the tag | +| false | cpu# | counter | events | number of times the interrupt has been handled by CPU _#_ | +| false | total | counter | events | total number of times the interrupt has been handled by all CPUs | + +#### soft_interrupts +The `soft_interrupts` measurement reports the soft interrupt data collected from the +`/proc/softirqs` file in Linux. + +Note: for some Linux systems there can be a fixed, large number of CPUs reported in +the `/proc/softirqs` file. This number could be much larger than the actual number of +CPUs in the system. The fields for these phantom CPUs contain zeroes. The approach +taken to remove these phantom CPUs is to remove the columns containing all zeros +to the right (higher CPU numbers).
For systems where CPUs are dynamically enabled, +this can lead to CPUs not being reported until enabled. However, this is preferable +to collecting metrics for tens or hundreds of phantom CPUs. For queries with fixed +numbers of CPUs, consider using `fill(0)` rather than `fill(null)`. + +##### soft_interrupts tags +| cpu_as_tag | tag | description | +|:---:|:---:|---| +| - | irq | IRQ name | +| true | cpu | CPU number | -- interrupts - - tags: - - irq (IRQ name) - - type - - device (name of the device that is located at the IRQ) - - cpu - - fields: - - count (int, number of interrupts) - -- soft_interrupts - - tags: - - irq (IRQ name) - - type - - device (name of the device that is located at the IRQ) - - cpu - - fields: - - count (int, number of interrupts) +##### soft_interrupts fields +| cpu_as_tag | field | type | units | description | +|:---:|:---:|:---:|:---:|---| +| false | cpu# | counter | events | number of times the interrupt has been handled by CPU _#_ | +| false | total | counter | events | total number of times the interrupt has been handled by all CPUs | +| true | count | counter | events | number of times the interrupt has been handled by the CPU in the tag | + + +#### spurious_interrupts +The `spurious_interrupts` measurement reports the number of spurious interrupts triggered +and unhandled for IRQs. This data is collected from the `/proc/irq/IRQ_NUMBER/spurious` file. +This data is identified by IRQ and not per-CPU.
+ +##### spurious_interrupts tags +| tag | description | +|:---:|---| +| device | description of device | +| irq | IRQ name | +| type | interrupt type | + +##### spurious_interrupts fields +| field | type | units | description | +|:---:|:---:|:---:|---| +| count | counter | events | number of times the interrupt has been handled (modulo 100,000) | +| total | counter | events | total number of times the interrupt has been handled | +| unhandled | counter | events | number of times an interrupt was not handled | ### Example Output With `cpu_as_tag = false`: ``` -interrupts,irq=0,type=IO-APIC,device=2-edge\ timer,cpu=cpu0 count=23i 1489346531000000000 -interrupts,irq=1,type=IO-APIC,device=1-edge\ i8042,cpu=cpu0 count=9i 1489346531000000000 -interrupts,irq=30,type=PCI-MSI,device=65537-edge\ virtio1-input.0,cpu=cpu1 count=1i 1489346531000000000 -soft_interrupts,irq=NET_RX,cpu=cpu0 count=280879i 1489346531000000000 +interrupts,irq=0,type=IO-APIC,device=2-edge\ timer cpu0=23i,cpu1=0i,total=23i 1489346531000000000 +interrupts,irq=1,type=IO-APIC,device=1-edge\ i8042 cpu0=4i,cpu1=5i,total=9i 1489346531000000000 +interrupts,irq=30,type=PCI-MSI,device=65537-edge\ virtio1-input.0 cpu0=2i,cpu1=40i,total=42i 1489346531000000000 +soft_interrupts,irq=NET_RX cpu0=140412i,cpu1=140467i,total=280879i 1489346531000000000 ``` With `cpu_as_tag = true`: @@ -81,3 +116,8 @@ interrupts,cpu=cpu7,irq=PIW,type=Posted-interrupt\ wakeup\ event count=0i 154353 soft_interrupts,cpu=cpu0,irq=HI count=246441i 1543539773000000000 soft_interrupts,cpu=cpu1,irq=HI count=159154i 1543539773000000000 ``` + +With `spurious = true` add to the above: +``` +spurious_interrupts,device=17-fasteoi\ ioc0,irq=17,type=IO-APIC count=27836i,total=327836i,unhandled=0i 1551582077000000000 +``` \ No newline at end of file diff --git a/plugins/inputs/interrupts/interrupts.go b/plugins/inputs/interrupts/interrupts.go index 5b0ca374cb907..e5db8353a6a49 100644 --- a/plugins/inputs/interrupts/interrupts.go +++
b/plugins/inputs/interrupts/interrupts.go @@ -5,6 +5,7 @@ import ( "fmt" "io" "os" + "path/filepath" "strconv" "strings" @@ -14,14 +15,18 @@ import ( type Interrupts struct { CpuAsTag bool `toml:"cpu_as_tag"` + Spurious bool `toml:"spurious"` } type IRQ struct { - ID string - Type string - Device string - Total int64 - Cpus []int64 + ID string + Type string + Device string + Total int64 + Cpus []int64 + HasSpurious bool + SpuriousCount uint64 + SpuriousUnhandled uint64 } func NewIRQ(id string) *IRQ { @@ -37,6 +42,9 @@ const sampleConfig = ` ## deployments. # cpu_as_tag = false + ## spurious interrupt counters can be collected + # spurious = false + ## To filter which IRQs to collect, make use of tagpass / tagdrop, i.e. # [inputs.interrupts.tagdrop] # irq = [ "NET_RX", "TASKLET" ] @@ -50,14 +58,14 @@ func (s *Interrupts) SampleConfig() string { return sampleConfig } -func parseInterrupts(r io.Reader) ([]IRQ, error) { +func parseInterrupts(r io.Reader, spurious bool) ([]IRQ, error) { var irqs []IRQ var cpucount int scanner := bufio.NewScanner(r) if scanner.Scan() { cpus := strings.Fields(scanner.Text()) if cpus[0] != "CPU0" { - return nil, fmt.Errorf("Expected first line to start with CPU0, but was %s", scanner.Text()) + return nil, fmt.Errorf("expected first line to start with CPU0, but was %s", scanner.Text()) } cpucount = len(cpus) } @@ -90,14 +98,63 @@ scan: } else if len(fields) > cpucount { irq.Type = strings.Join(fields[cpucount+1:], " ") } + if spurious { + file := filepath.Join("/proc/irq", irq.ID, "spurious") + f, err := os.Open(file) + if err == nil { + irq.HasSpurious, irq.SpuriousCount, irq.SpuriousUnhandled = parseSpurious(f) + _ = f.Close() + } + } irqs = append(irqs, *irq) } if scanner.Err() != nil { - return nil, fmt.Errorf("Error scanning file: %s", scanner.Err()) + return nil, fmt.Errorf("error scanning file: %s", scanner.Err()) + } + + // determine the rightmost CPU column with non-zero data + validCpuIndex := 0 + for _, irq := range irqs { + var 
i int + for i = len(irq.Cpus) - 1; i > validCpuIndex && irq.Cpus[i] == 0; i-- { + } + if i > validCpuIndex { + validCpuIndex = i + } + } + // remove data for any CPUs above the validCpuIndex + validCpuCount := validCpuIndex + 1 + for i := 0; i < len(irqs); i++ { + if len(irqs[i].Cpus) > validCpuCount { + irqs[i].Cpus = append(irqs[i].Cpus[:validCpuCount]) + } } + return irqs, nil } +func parseSpurious(r io.Reader) (bool, uint64, uint64) { + count := uint64(0) + unhandled := uint64(0) + foundData := false + scanner := bufio.NewScanner(r) + for scanner.Scan() { + s := strings.Fields(scanner.Text()) + if len(s) < 2 { + continue + } + switch s[0] { + case "count": + count, _ = strconv.ParseUint(s[1], 10, 64) + foundData = true + case "unhandled": + unhandled, _ = strconv.ParseUint(s[1], 10, 64) + foundData = true + } + } + return foundData, count, unhandled +} + func gatherTagsFields(irq IRQ) (map[string]string, map[string]interface{}) { tags := map[string]string{"irq": irq.ID, "type": irq.Type, "device": irq.Device} fields := map[string]interface{}{"total": irq.Total} @@ -112,16 +169,19 @@ func (s *Interrupts) Gather(acc telegraf.Accumulator) error { for measurement, file := range map[string]string{"interrupts": "/proc/interrupts", "soft_interrupts": "/proc/softirqs"} { f, err := os.Open(file) if err != nil { - acc.AddError(fmt.Errorf("Could not open file: %s", file)) + acc.AddError(fmt.Errorf("could not open file: %s", file)) continue } - defer f.Close() - irqs, err := parseInterrupts(f) + irqs, err := parseInterrupts(f, s.Spurious) + _ = f.Close() if err != nil { - acc.AddError(fmt.Errorf("Parsing %s: %s", file, err)) + acc.AddError(fmt.Errorf("parsing %s: %s", file, err)) continue } reportMetrics(measurement, irqs, acc, s.CpuAsTag) + if s.Spurious { + reportSpuriousMetrics(irqs, acc) + } } return nil } @@ -143,6 +203,21 @@ func reportMetrics(measurement string, irqs []IRQ, acc telegraf.Accumulator, cpu } } +func reportSpuriousMetrics(irqs []IRQ, acc 
telegraf.Accumulator, ) { + for _, irq := range irqs { + if !irq.HasSpurious { + continue + } + tags, _ := gatherTagsFields(irq) + spuriousFields := map[string]interface{}{ + "count": irq.SpuriousCount, + "unhandled": irq.SpuriousUnhandled, + "total": irq.Total, + } + acc.AddFields("spurious_interrupts", spuriousFields, tags) + } +} + func init() { inputs.Add("interrupts", func() telegraf.Input { return &Interrupts{} diff --git a/plugins/inputs/interrupts/interrupts_test.go b/plugins/inputs/interrupts/interrupts_test.go index 2579d926d20c6..a18e353bd2368 100644 --- a/plugins/inputs/interrupts/interrupts_test.go +++ b/plugins/inputs/interrupts/interrupts_test.go @@ -33,7 +33,7 @@ func expectCpuAsFields(m *testutil.Accumulator, t *testing.T, measurement string func setup(t *testing.T, irqString string, cpuAsTags bool) (*testutil.Accumulator, []IRQ) { f := bytes.NewBufferString(irqString) - irqs, err := parseInterrupts(f) + irqs, err := parseInterrupts(f, false) require.Equal(t, nil, err) require.NotEqual(t, 0, len(irqs)) @@ -154,3 +154,24 @@ func TestCpuAsFieldsHwIrqs(t *testing.T) { expectCpuAsFields(acc, t, "interrupts", irq) } } + +// ===================================================================================== +// spurious interrupts +// +// Note: the spurious interrupt ID is gathered from /proc/interrupts as part of the +// hardware interrupts, so its test is not recreated here. 
+// ===================================================================================== + +const spuriousIrqsString = ` +count 12345 +unhandled 89 +last_unhandled 6677 +` + +func TestSpuriousParser(t *testing.T) { + f := bytes.NewBufferString(spuriousIrqsString) + hasSpurious, count, unhandled := parseSpurious(f) + require.True(t, hasSpurious, "spurious data found") + require.Equal(t, uint64(12345), count, "incorrect parsed count") + require.Equal(t, uint64(89), unhandled, "incorrect parsed unhandled") +} From f7374f4f38925a71dab22597d923924c86586694 Mon Sep 17 00:00:00 2001 From: Richard Elling Date: Sun, 3 Mar 2019 15:56:57 -0800 Subject: [PATCH 2/3] go fmt finds things goland doesn't --- plugins/inputs/interrupts/interrupts.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/inputs/interrupts/interrupts.go b/plugins/inputs/interrupts/interrupts.go index e5db8353a6a49..61cbd54aaf27b 100644 --- a/plugins/inputs/interrupts/interrupts.go +++ b/plugins/inputs/interrupts/interrupts.go @@ -203,7 +203,7 @@ func reportMetrics(measurement string, irqs []IRQ, acc telegraf.Accumulator, cpu } } -func reportSpuriousMetrics(irqs []IRQ, acc telegraf.Accumulator, ) { +func reportSpuriousMetrics(irqs []IRQ, acc telegraf.Accumulator) { for _, irq := range irqs { if !irq.HasSpurious { continue From b90f3295c20384afd3610d73563eca82b74b4a2f Mon Sep 17 00:00:00 2001 From: Richard Elling Date: Sun, 26 May 2019 12:31:40 -0700 Subject: [PATCH 3/3] WIP --- plugins/inputs/interrupts/interrupts.go | 48 +++++++++++-------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/plugins/inputs/interrupts/interrupts.go b/plugins/inputs/interrupts/interrupts.go index 61cbd54aaf27b..60a1fa9a1a896 100644 --- a/plugins/inputs/interrupts/interrupts.go +++ b/plugins/inputs/interrupts/interrupts.go @@ -15,7 +15,6 @@ import ( type Interrupts struct { CpuAsTag bool `toml:"cpu_as_tag"` - Spurious bool `toml:"spurious"` } type IRQ struct { @@ -24,7 +23,6 @@ type 
IRQ struct { Device string Total int64 Cpus []int64 - HasSpurious bool SpuriousCount uint64 SpuriousUnhandled uint64 } @@ -42,9 +40,6 @@ const sampleConfig = ` ## deployments. # cpu_as_tag = false - ## spurious interrupt counters can be collected - # spurious = false - ## To filter which IRQs to collect, make use of tagpass / tagdrop, i.e. # [inputs.interrupts.tagdrop] # irq = [ "NET_RX", "TASKLET" ] @@ -58,7 +53,7 @@ func (s *Interrupts) SampleConfig() string { return sampleConfig } -func parseInterrupts(r io.Reader, spurious bool) ([]IRQ, error) { +func parseInterrupts(r io.Reader) ([]IRQ, error) { var irqs []IRQ var cpucount int scanner := bufio.NewScanner(r) @@ -98,13 +93,12 @@ scan: } else if len(fields) > cpucount { irq.Type = strings.Join(fields[cpucount+1:], " ") } - if spurious { - file := filepath.Join("/proc/irq", irq.ID, "spurious") - f, err := os.Open(file) - if err == nil { - irq.HasSpurious, irq.SpuriousCount, irq.SpuriousUnhandled = parseSpurious(f) - _ = f.Close() - } + + // collect spurious interrupt data for this irq.ID + file := filepath.Join("/proc/irq", irq.ID, "spurious") + f, err := os.Open(file) + if err == nil { + irq.SpuriousCount, irq.SpuriousUnhandled = parseSpurious(f) } irqs = append(irqs, *irq) } @@ -112,7 +106,15 @@ scan: return nil, fmt.Errorf("error scanning file: %s", scanner.Err()) } - // determine the rightmost CPU column with non-zero data + // For some Linux systems there can be fixed, large number of CPUs reported in + // the `/proc/softirqs` file. This number could be much larger than the actual number of + // CPUs in the system. The fields for these phantom CPUs contain zeroes. The approach + // taken to remove these phantom CPUs is to remove the columns containing all zeros + // to the right (higher CPU numbers). For systems where CPUs are dynamically enabled, + // this can lead to CPUs not being reported until enabled. However, this is preferable + // to collecting metrics for tens or hundreds of phantom CPUs. 
+ + // First, determine the rightmost CPU column with non-zero data validCpuIndex := 0 for _, irq := range irqs { var i int @@ -122,7 +124,7 @@ scan: validCpuIndex = i } } - // remove data for any CPUs above the validCpuIndex + // Secondly, remove data for any CPUs above the validCpuIndex validCpuCount := validCpuIndex + 1 for i := 0; i < len(irqs); i++ { if len(irqs[i].Cpus) > validCpuCount { @@ -133,10 +135,9 @@ scan: return irqs, nil } -func parseSpurious(r io.Reader) (bool, uint64, uint64) { +func parseSpurious(r io.Reader) (uint64, uint64) { count := uint64(0) unhandled := uint64(0) - foundData := false scanner := bufio.NewScanner(r) for scanner.Scan() { s := strings.Fields(scanner.Text()) @@ -146,13 +147,11 @@ func parseSpurious(r io.Reader) (bool, uint64, uint64) { switch s[0] { case "count": count, _ = strconv.ParseUint(s[1], 10, 64) - foundData = true case "unhandled": unhandled, _ = strconv.ParseUint(s[1], 10, 64) - foundData = true } } - return foundData, count, unhandled + return count, unhandled } func gatherTagsFields(irq IRQ) (map[string]string, map[string]interface{}) { @@ -172,16 +171,14 @@ func (s *Interrupts) Gather(acc telegraf.Accumulator) error { acc.AddError(fmt.Errorf("could not open file: %s", file)) continue } - irqs, err := parseInterrupts(f, s.Spurious) + irqs, err := parseInterrupts(f) _ = f.Close() if err != nil { acc.AddError(fmt.Errorf("parsing %s: %s", file, err)) continue } reportMetrics(measurement, irqs, acc, s.CpuAsTag) - if s.Spurious { - reportSpuriousMetrics(irqs, acc) - } + reportSpuriousMetrics(irqs, acc) } return nil } @@ -205,9 +202,6 @@ func reportMetrics(measurement string, irqs []IRQ, acc telegraf.Accumulator, cpu func reportSpuriousMetrics(irqs []IRQ, acc telegraf.Accumulator) { for _, irq := range irqs { - if !irq.HasSpurious { - continue - } tags, _ := gatherTagsFields(irq) spuriousFields := map[string]interface{}{ "count": irq.SpuriousCount,