influxdata · richardelling · Mar 3, 2019 · Mar 3, 2019 · May 26, 2019 · May 26, 2019
diff --git a/plugins/inputs/interrupts/README.md b/plugins/inputs/interrupts/README.md
@@ -1,6 +1,7 @@
 # Interrupts Input Plugin
 
-The interrupts plugin gathers metrics about IRQs from `/proc/interrupts` and `/proc/softirqs`.
+The interrupts plugin gathers metrics about IRQs from `/proc/interrupts`, `/proc/softirqs`,
+and `/proc/irq/IRQ_NUMBER/spurious`.
 
 ### Configuration
 ```toml
@@ -13,65 +14,99 @@ The interrupts plugin gathers metrics about IRQs from `/proc/interrupts` and `/p
   ## deployments.
   # cpu_as_tag = false
 
+  ## spurious interrupt counters can be collected
+  # spurious = false
+
   ## To filter which IRQs to collect, make use of tagpass / tagdrop, i.e.
   # [inputs.interrupts.tagdrop]
   #   irq = [ "NET_RX", "TASKLET" ]
 ```
 
-### Metrics
+### Measurements
 
-There are two styles depending on the value of `cpu_as_tag`.
+There are two styles of `interrupts` and `soft_interrupts` depending on the value of `cpu_as_tag`.
+When `cpu_as_tag` is `false` the per-CPU counts are fields on a single point per interrupt. When
+`true` the CPU is a tag and there is one point per interrupt per CPU. Having the CPU as a tag
+makes per-CPU queries easy, at the cost of higher series cardinality.
 
-With `cpu_as_tag = false`:
+For `spurious_interrupts` only per-interrupt counters are available; there is no per-CPU
+breakdown.
 
-- interrupts
-  - tags:
-    - irq (IRQ name)
-    - type
-    - device (name of the device that is located at the IRQ)
-    - cpu
-  - fields:
-    - cpu (int, number of interrupts per cpu)
-    - total (int, total number of interrupts)
-
-- soft_interrupts
-  - tags:
-    - irq (IRQ name)
-    - type
-    - device (name of the device that is located at the IRQ)
-    - cpu
-  - fields:
-    - cpu (int, number of interrupts per cpu)
-    - total (int, total number of interrupts)
+#### interrupts
+The `interrupts` measurement reports the hard interrupt data collected from the 
+`/proc/interrupts` file in Linux.
+These interrupts are typically from hardware devices, but can also be generated by
+software, such as timers and interprocessor interrupts.
 
-With `cpu_as_tag = true`:
+##### interrupts tags
+| cpu_as_tag | tag | description |
+|:---:|:---:|---|
+| - | device | description of device |
+| - | irq | IRQ name (ephemeral, could be a number) |
+| - | type | interrupt type |
+| true | cpu | CPU number |
+
+##### interrupts fields 
+| cpu_as_tag | field | type | units | description |
+|:---:|:---:|:---:|:---:|---|
+| true | count | counter | events | number of times the interrupt has been handled by CPU in the tag |
+| false | cpu# | counter | events | number of times the interrupt has been handled by CPU _#_ |
+| false | total | counter | events | total number of times the interrupt has been handled by all CPUs |
+
+#### soft_interrupts
+The `soft_interrupts` measurement reports the soft interrupt data collected from the 
+`/proc/softirqs` file in Linux.
+
+Note: on some Linux systems a fixed, large number of CPUs can be reported in
+the `/proc/softirqs` file. This number could be much larger than the actual number of
+CPUs in the system. The fields for these phantom CPUs contain zeros. The approach
+taken to remove these phantom CPUs is to drop the trailing columns (the highest CPU
+numbers) that contain only zeros. For systems where CPUs are dynamically enabled,
+this can lead to CPUs not being reported until enabled. However, this is preferable
+to collecting metrics for tens or hundreds of phantom CPUs. For queries with fixed
+numbers of CPUs, consider using `fill(0)` rather than `fill(null)`.
+
+##### soft_interrupts tags
+| cpu_as_tag | tag | description |
+|:---:|:---:|---|
+| - | irq | IRQ name |
+| true | cpu | CPU number |
 
-- interrupts
-  - tags:
-    - irq (IRQ name)
-    - type
-    - device (name of the device that is located at the IRQ)
-    - cpu
-  - fields:
-    - count (int, number of interrupts)
-
-- soft_interrupts
-  - tags:
-    - irq (IRQ name)
-    - type
-    - device (name of the device that is located at the IRQ)
-    - cpu
-  - fields:
-    - count (int, number of interrupts)
+##### soft_interrupts fields 
+| cpu_as_tag | field | type | units | description |
+|:---:|:---:|:---:|:---:|---|
+| false | cpu# | counter | events | number of times the interrupt has been handled by CPU _#_ |
+| false | total | counter | events | total number of times the interrupt has been handled by all CPUs |
+| true | count | counter | events | number of times the interrupt has been handled by CPU in the tag |
+
+
+#### spurious_interrupts
+The `spurious_interrupts` measurement reports the number of spurious interrupts triggered
+and unhandled for IRQs. This data is collected from the `/proc/irq/IRQ_NUMBER/spurious` file.
+This data is identified by IRQ and not per-CPU.
+
+##### spurious_interrupts tags
+| tag | description |
+|:---:|---|
+| device | description of device |
+| irq | IRQ name |
+| type | interrupt type |
+
+##### spurious_interrupts fields 
+| field | type | units | description |
+|:---:|:---:|:---:|---|
+| count | counter | events | number of times the interrupt has been handled (modulo 100,000) |
+| total | counter | events | total number of times the interrupt has been handled |
+| unhandled | counter | events | number of times an interrupt was not handled |
 
 ### Example Output
 
 With `cpu_as_tag = false`:
 ```
-interrupts,irq=0,type=IO-APIC,device=2-edge\ timer,cpu=cpu0 count=23i 1489346531000000000
-interrupts,irq=1,type=IO-APIC,device=1-edge\ i8042,cpu=cpu0 count=9i 1489346531000000000
-interrupts,irq=30,type=PCI-MSI,device=65537-edge\ virtio1-input.0,cpu=cpu1 count=1i 1489346531000000000
-soft_interrupts,irq=NET_RX,cpu=cpu0 count=280879i 1489346531000000000
+interrupts,irq=0,type=IO-APIC,device=2-edge\ timer cpu0=23i,cpu1=0i,total=23i 1489346531000000000
+interrupts,irq=1,type=IO-APIC,device=1-edge\ i8042 cpu0=4i,cpu1=5i,total=9i 1489346531000000000
+interrupts,irq=30,type=PCI-MSI,device=65537-edge\ virtio1-input.0 cpu0=2i,cpu1=40i,total=42i 1489346531000000000
+soft_interrupts,irq=NET_RX cpu0=140412i,cpu1=140467i,total=280879i 1489346531000000000
 ```
 
 With `cpu_as_tag = true`:
@@ -81,3 +116,8 @@ interrupts,cpu=cpu7,irq=PIW,type=Posted-interrupt\ wakeup\ event count=0i 154353
 soft_interrupts,cpu=cpu0,irq=HI count=246441i 1543539773000000000
 soft_interrupts,cpu=cpu1,irq=HI count=159154i 1543539773000000000
 ```
+
+With `spurious = true`, the following is added to the above:
+```
+spurious_interrupts,device=17-fasteoi\ ioc0,irq=17,type=IO-APIC count=27836i,total=327836i,unhandled=0i 1551582077000000000
+```
diff --git a/plugins/inputs/interrupts/interrupts.go b/plugins/inputs/interrupts/interrupts.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"path/filepath"
 	"strconv"
 	"strings"
 
@@ -17,11 +18,13 @@ type Interrupts struct {
 }
 
 type IRQ struct {
-	ID     string
-	Type   string
-	Device string
-	Total  int64
-	Cpus   []int64
+	ID                string
+	Type              string
+	Device            string
+	Total             int64
+	Cpus              []int64
+	SpuriousCount     uint64
+	SpuriousUnhandled uint64
 }
 
 func NewIRQ(id string) *IRQ {
@@ -57,7 +60,7 @@ func parseInterrupts(r io.Reader) ([]IRQ, error) {
 	if scanner.Scan() {
 		cpus := strings.Fields(scanner.Text())
 		if cpus[0] != "CPU0" {
-			return nil, fmt.Errorf("Expected first line to start with CPU0, but was %s", scanner.Text())
+			return nil, fmt.Errorf("expected first line to start with CPU0, but was %s", scanner.Text())
 		}
 		cpucount = len(cpus)
 	}
@@ -90,14 +93,67 @@ scan:
 		} else if len(fields) > cpucount {
 			irq.Type = strings.Join(fields[cpucount+1:], " ")
 		}
+
+		// collect spurious interrupt data for this irq.ID
+		file := filepath.Join("/proc/irq", irq.ID, "spurious")
+		f, err := os.Open(file)
+		if err == nil {
+			irq.SpuriousCount, irq.SpuriousUnhandled = parseSpurious(f)
+		}
 		irqs = append(irqs, *irq)
 	}
 	if scanner.Err() != nil {
-		return nil, fmt.Errorf("Error scanning file: %s", scanner.Err())
+		return nil, fmt.Errorf("error scanning file: %s", scanner.Err())
 	}
+
+	// For some Linux systems there can be a fixed, large number of CPUs reported in
+	// the `/proc/softirqs` file. This number could be much larger than the actual number of
+	// CPUs in the system. The fields for these phantom CPUs contain zeroes. The approach
+	// taken to remove these phantom CPUs is to remove the columns containing all zeros
+	// to the right (higher CPU numbers). For systems where CPUs are dynamically enabled,
+	// this can lead to CPUs not being reported until enabled. However, this is preferable
+	// to collecting metrics for tens or hundreds of phantom CPUs.
+
+	// First, determine the rightmost CPU column with non-zero data
+	validCpuIndex := 0
+	for _, irq := range irqs {
+		var i int
+		for i = len(irq.Cpus) - 1; i > validCpuIndex && irq.Cpus[i] == 0; i-- {
+		}
+		if i > validCpuIndex {
+			validCpuIndex = i
+		}
+	}
+	// Secondly, remove data for any CPUs above the validCpuIndex
+	validCpuCount := validCpuIndex + 1
+	for i := 0; i < len(irqs); i++ {
+		if len(irqs[i].Cpus) > validCpuCount {
+			irqs[i].Cpus = append(irqs[i].Cpus[:validCpuCount])
+		}
+	}
+
 	return irqs, nil
 }
 
+func parseSpurious(r io.Reader) (uint64, uint64) {
+	count := uint64(0)
+	unhandled := uint64(0)
+	scanner := bufio.NewScanner(r)
+	for scanner.Scan() {
+		s := strings.Fields(scanner.Text())
+		if len(s) < 2 {
+			continue
+		}
+		switch s[0] {
+		case "count":
+			count, _ = strconv.ParseUint(s[1], 10, 64)
+		case "unhandled":
+			unhandled, _ = strconv.ParseUint(s[1], 10, 64)
+		}
+	}
+	return count, unhandled
+}
+
 func gatherTagsFields(irq IRQ) (map[string]string, map[string]interface{}) {
 	tags := map[string]string{"irq": irq.ID, "type": irq.Type, "device": irq.Device}
 	fields := map[string]interface{}{"total": irq.Total}
@@ -112,16 +168,17 @@ func (s *Interrupts) Gather(acc telegraf.Accumulator) error {
 	for measurement, file := range map[string]string{"interrupts": "/proc/interrupts", "soft_interrupts": "/proc/softirqs"} {
 		f, err := os.Open(file)
 		if err != nil {
-			acc.AddError(fmt.Errorf("Could not open file: %s", file))
+			acc.AddError(fmt.Errorf("could not open file: %s", file))
 			continue
 		}
-		defer f.Close()
 		irqs, err := parseInterrupts(f)
+		_ = f.Close()
 		if err != nil {
-			acc.AddError(fmt.Errorf("Parsing %s: %s", file, err))
+			acc.AddError(fmt.Errorf("parsing %s: %s", file, err))
 			continue
 		}
 		reportMetrics(measurement, irqs, acc, s.CpuAsTag)
+		reportSpuriousMetrics(irqs, acc)
 	}
 	return nil
 }
@@ -143,6 +200,18 @@ func reportMetrics(measurement string, irqs []IRQ, acc telegraf.Accumulator, cpu
 	}
 }
 
+func reportSpuriousMetrics(irqs []IRQ, acc telegraf.Accumulator) {
+	for _, irq := range irqs {
+		tags, _ := gatherTagsFields(irq)
+		spuriousFields := map[string]interface{}{
+			"count":     irq.SpuriousCount,
+			"unhandled": irq.SpuriousUnhandled,
+			"total":     irq.Total,
+		}
+		acc.AddFields("spurious_interrupts", spuriousFields, tags)
+	}
+}
+
 func init() {
 	inputs.Add("interrupts", func() telegraf.Input {
 		return &Interrupts{}

diff --git a/plugins/inputs/interrupts/interrupts_test.go b/plugins/inputs/interrupts/interrupts_test.go
@@ -33,7 +33,7 @@ func expectCpuAsFields(m *testutil.Accumulator, t *testing.T, measurement string
 
 func setup(t *testing.T, irqString string, cpuAsTags bool) (*testutil.Accumulator, []IRQ) {
 	f := bytes.NewBufferString(irqString)
-	irqs, err := parseInterrupts(f)
+	irqs, err := parseInterrupts(f, false)
 	require.Equal(t, nil, err)
 	require.NotEqual(t, 0, len(irqs))
 
@@ -154,3 +154,24 @@ func TestCpuAsFieldsHwIrqs(t *testing.T) {
 		expectCpuAsFields(acc, t, "interrupts", irq)
 	}
 }
+
+// =====================================================================================
+//	spurious interrupts
+//
+// Note: the spurious interrupt ID is gathered from /proc/interrupts as part of the
+// hardware interrupts, so its test is not recreated here.
+// =====================================================================================
+
+const spuriousIrqsString = `
+count 12345
+unhandled 89
+last_unhandled 6677
+`
+
+func TestSpuriousParser(t *testing.T) {
+	f := bytes.NewBufferString(spuriousIrqsString)
+	hasSpurious, count, unhandled := parseSpurious(f)
+	require.True(t, hasSpurious, "spurious data found")
+	require.Equal(t, uint64(12345), count, "incorrect parsed count")
+	require.Equal(t, uint64(89), unhandled, "incorrect parsed unhandled")
+}