Add file input plugin and grok parser (#4332)

influxdata · Jul 14, 2018 · 774a9f0 · 774a9f0
1 parent 3f87e5b
commit 774a9f0
Show file tree

Hide file tree

Showing 24 changed files with 558 additions and 154 deletions.
diff --git a/README.md b/README.md
@@ -153,6 +153,7 @@ configuration options.
 * [exec](./plugins/inputs/exec) (generic executable plugin, support JSON, influx, graphite and nagios)
 * [fail2ban](./plugins/inputs/fail2ban)
 * [fibaro](./plugins/inputs/fibaro)
+* [file](./plugins/inputs/file)
 * [filestat](./plugins/inputs/filestat)
 * [fluentd](./plugins/inputs/fluentd)
 * [graylog](./plugins/inputs/graylog)

diff --git a/docs/DATA_FORMATS_INPUT.md b/docs/DATA_FORMATS_INPUT.md
@@ -9,6 +9,7 @@ Telegraf is able to parse the following input data formats into metrics:
 1. [Nagios](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md#nagios) (exec input only)
 1. [Collectd](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md#collectd)
 1. [Dropwizard](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md#dropwizard)
+1. [Grok](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md#grok)
 
 Telegraf metrics, like InfluxDB
 [points](https://docs.influxdata.com/influxdb/v0.10/write_protocols/line/),
@@ -657,5 +658,107 @@ For more information about the dropwizard json format see
   # [inputs.exec.dropwizard_tag_paths]
   #   tag1 = "tags.tag1"
   #   tag2 = "tags.tag2"
+```
 
+#### Grok
+Parse logstash-style "grok" patterns. The patterns to match are listed in `grok_patterns`; custom patterns can be defined inline in `grok_custom_patterns` or read from the files listed in `grok_custom_pattern_files`.
+
+# View logstash grok pattern docs here:
+#   https://www.elastic.co/guide/en/logstash/current/plugins-filters-grok.html
+# All default logstash patterns are supported, these can be viewed here:
+#   https://github.com/logstash-plugins/logstash-patterns-core/blob/master/patterns/grok-patterns
+
+# Available modifiers:
+#   string   (default if nothing is specified)
+#   int
+#   float
+#   duration (ie, 5.23ms gets converted to int nanoseconds)
+#   tag      (converts the field into a tag)
+#   drop     (drops the field completely)
+# Timestamp modifiers:
+#   ts-ansic         ("Mon Jan _2 15:04:05 2006")
+#   ts-unix          ("Mon Jan _2 15:04:05 MST 2006")
+#   ts-ruby          ("Mon Jan 02 15:04:05 -0700 2006")
+#   ts-rfc822        ("02 Jan 06 15:04 MST")
+#   ts-rfc822z       ("02 Jan 06 15:04 -0700")
+#   ts-rfc850        ("Monday, 02-Jan-06 15:04:05 MST")
+#   ts-rfc1123       ("Mon, 02 Jan 2006 15:04:05 MST")
+#   ts-rfc1123z      ("Mon, 02 Jan 2006 15:04:05 -0700")
+#   ts-rfc3339       ("2006-01-02T15:04:05Z07:00")
+#   ts-rfc3339nano   ("2006-01-02T15:04:05.999999999Z07:00")
+#   ts-httpd         ("02/Jan/2006:15:04:05 -0700")
+#   ts-epoch         (seconds since unix epoch)
+#   ts-epochnano     (nanoseconds since unix epoch)
+#   ts-"CUSTOM"
+# CUSTOM time layouts must be within quotes and be the representation of the
+# "reference time", which is Mon Jan 2 15:04:05 -0700 MST 2006
+# See https://golang.org/pkg/time/#Parse for more details.
+
+# Example log file pattern, example log looks like this:
+#   [04/Jun/2016:12:41:45 +0100] 1.25 200 192.168.1.1 5.432µs
+# Breakdown of the DURATION pattern below:
+#   NUMBER  is a builtin logstash grok pattern matching float & int numbers.
+#   [nuµm]? is a regex specifying 0 or 1 of the characters within brackets.
+#   s       is also regex, this pattern must end in "s".
+# so DURATION will match something like '5.324ms' or '6.1µs' or '10s'
+DURATION %{NUMBER}[nuµm]?s
+RESPONSE_CODE %{NUMBER:response_code:tag}
+RESPONSE_TIME %{DURATION:response_time_ns:duration}
+EXAMPLE_LOG \[%{HTTPDATE:ts:ts-httpd}\] %{NUMBER:myfloat:float} %{RESPONSE_CODE} %{IPORHOST:clientip} %{RESPONSE_TIME}
+
+# Wider-ranging username matching vs. logstash built-in %{USER}
+NGUSERNAME [a-zA-Z0-9\.\@\-\+_%]+
+NGUSER %{NGUSERNAME}
+# Wider-ranging client IP matching
+CLIENT (?:%{IPORHOST}|%{HOSTPORT}|::1)
+
+##
+## COMMON LOG PATTERNS
+##
+
+# apache & nginx logs, this is also known as the "common log format"
+#   see https://en.wikipedia.org/wiki/Common_Log_Format
+COMMON_LOG_FORMAT %{CLIENT:client_ip} %{NOTSPACE:ident} %{NOTSPACE:auth} \[%{HTTPDATE:ts:ts-httpd}\] "(?:%{WORD:verb:tag} %{NOTSPACE:request}(?: HTTP/%{NUMBER:http_version:float})?|%{DATA})" %{NUMBER:resp_code:tag} (?:%{NUMBER:resp_bytes:int}|-)
+
+# Combined log format is the same as the common log format but with the addition
+# of two quoted strings at the end for "referrer" and "agent"
+#   See Examples at http://httpd.apache.org/docs/current/mod/mod_log_config.html
+COMBINED_LOG_FORMAT %{COMMON_LOG_FORMAT} %{QS:referrer} %{QS:agent}
+
+# HTTPD log formats
+HTTPD20_ERRORLOG \[%{HTTPDERROR_DATE:timestamp}\] \[%{LOGLEVEL:loglevel:tag}\] (?:\[client %{IPORHOST:clientip}\] ){0,1}%{GREEDYDATA:errormsg}
+HTTPD24_ERRORLOG \[%{HTTPDERROR_DATE:timestamp}\] \[%{WORD:module}:%{LOGLEVEL:loglevel:tag}\] \[pid %{POSINT:pid:int}:tid %{NUMBER:tid:int}\]( \(%{POSINT:proxy_errorcode:int}\)%{DATA:proxy_errormessage}:)?( \[client %{IPORHOST:client}:%{POSINT:clientport}\])? %{DATA:errorcode}: %{GREEDYDATA:message}
+HTTPD_ERRORLOG %{HTTPD20_ERRORLOG}|%{HTTPD24_ERRORLOG}
+
+#### Grok Configuration:
+```toml
+[[inputs.file]]
+  ## This is a list of patterns to check the given log file(s) for.
+  ## Note that adding patterns here increases processing time. The most
+  ## efficient configuration is to have one pattern per input.
+  ## Other common built-in patterns are:
+  ##   %{COMMON_LOG_FORMAT}   (plain apache & nginx access logs)
+  ##   %{COMBINED_LOG_FORMAT} (access logs + referrer & agent)
+  grok_patterns = ["%{COMBINED_LOG_FORMAT}"]
+
+  ## Name of the output measurement.
+  grok_name_override = "apache_access_log"
+
+  ## Full path(s) to custom pattern files.
+  grok_custom_pattern_files = []
+
+  ## Custom patterns can also be defined here. Put one pattern per line.
+  grok_custom_patterns = '''
+  '''
+
+  ## Timezone allows you to provide an override for timestamps that
+  ## don't already include an offset
+  ## e.g. 04/06/2016 12:41:45 data one two 5.43µs
+  ##
+  ## Default: "" which renders UTC
+  ## Options are as follows:
+  ##   1. Local             -- interpret based on machine localtime
+  ##   2. "Canada/Eastern"  -- Unix TZ values like those found in https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
+  ##   3. UTC               -- or blank/unspecified, will return timestamp in UTC
+  grok_timezone = "Canada/Eastern"
 ```
diff --git a/internal/config/config.go b/internal/config/config.go
@@ -1346,6 +1346,59 @@ func buildParser(name string, tbl *ast.Table) (parsers.Parser, error) {
 		}
 	}
 
+	//for grok data_format
+	if node, ok := tbl.Fields["grok_named_patterns"]; ok {
+		if kv, ok := node.(*ast.KeyValue); ok {
+			if ary, ok := kv.Value.(*ast.Array); ok {
+				for _, elem := range ary.Value {
+					if str, ok := elem.(*ast.String); ok {
+						c.GrokNamedPatterns = append(c.GrokNamedPatterns, str.Value)
+					}
+				}
+			}
+		}
+	}
+
+	if node, ok := tbl.Fields["grok_patterns"]; ok {
+		if kv, ok := node.(*ast.KeyValue); ok {
+			if ary, ok := kv.Value.(*ast.Array); ok {
+				for _, elem := range ary.Value {
+					if str, ok := elem.(*ast.String); ok {
+						c.GrokPatterns = append(c.GrokPatterns, str.Value)
+					}
+				}
+			}
+		}
+	}
+
+	if node, ok := tbl.Fields["grok_custom_patterns"]; ok {
+		if kv, ok := node.(*ast.KeyValue); ok {
+			if str, ok := kv.Value.(*ast.String); ok {
+				c.GrokCustomPatterns = str.Value
+			}
+		}
+	}
+
+	if node, ok := tbl.Fields["grok_custom_pattern_files"]; ok {
+		if kv, ok := node.(*ast.KeyValue); ok {
+			if ary, ok := kv.Value.(*ast.Array); ok {
+				for _, elem := range ary.Value {
+					if str, ok := elem.(*ast.String); ok {
+						c.GrokCustomPatternFiles = append(c.GrokCustomPatternFiles, str.Value)
+					}
+				}
+			}
+		}
+	}
+
+	if node, ok := tbl.Fields["grok_timezone"]; ok {
+		if kv, ok := node.(*ast.KeyValue); ok {
+			if str, ok := kv.Value.(*ast.String); ok {
+				c.GrokTimeZone = str.Value
+			}
+		}
+	}
+
 	c.MetricName = name
 
 	delete(tbl.Fields, "data_format")
@@ -1362,6 +1415,11 @@ func buildParser(name string, tbl *ast.Table) (parsers.Parser, error) {
 	delete(tbl.Fields, "dropwizard_time_format")
 	delete(tbl.Fields, "dropwizard_tags_path")
 	delete(tbl.Fields, "dropwizard_tag_paths")
+	delete(tbl.Fields, "grok_named_patterns")
+	delete(tbl.Fields, "grok_patterns")
+	delete(tbl.Fields, "grok_custom_patterns")
+	delete(tbl.Fields, "grok_custom_pattern_files")
+	delete(tbl.Fields, "grok_timezone")
 
 	return parsers.NewParser(c)
 }

diff --git a/plugins/inputs/all/all.go b/plugins/inputs/all/all.go
@@ -30,6 +30,7 @@ import (
 	_ "github.com/influxdata/telegraf/plugins/inputs/exec"
 	_ "github.com/influxdata/telegraf/plugins/inputs/fail2ban"
 	_ "github.com/influxdata/telegraf/plugins/inputs/fibaro"
+	_ "github.com/influxdata/telegraf/plugins/inputs/file"
 	_ "github.com/influxdata/telegraf/plugins/inputs/filestat"
 	_ "github.com/influxdata/telegraf/plugins/inputs/fluentd"
 	_ "github.com/influxdata/telegraf/plugins/inputs/graylog"

diff --git a/plugins/inputs/file/README.md b/plugins/inputs/file/README.md
@@ -0,0 +1,25 @@
+# File Input Plugin
+
+The file plugin updates a list of files every interval and parses the contents
+using the selected [input data format](/docs/DATA_FORMATS_INPUT.md).
+
+Files will always be read in their entirety. If you wish to tail/follow a file,
+use the [tail input plugin](/plugins/inputs/tail) instead.
+
+### Configuration:
+```toml
+[[inputs.file]]
+  ## Files to parse each interval.
+  ## These accept standard unix glob matching rules, but with the addition of
+  ## ** as a "super asterisk". ie:
+  ##   /var/log/**.log     -> recursively find all .log files in /var/log
+  ##   /var/log/*/*.log    -> find all .log files with a parent dir in /var/log
+  ##   /var/log/apache.log -> only read the apache log file
+  files = ["/var/log/apache/access.log"]
+
+  ## Data format to consume.
+  ## Each data format has its own unique set of configuration options, read
+  ## more about them here:
+  ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
+  data_format = "influx"
+```
diff --git a/plugins/inputs/file/dev/docker-compose.yml b/plugins/inputs/file/dev/docker-compose.yml
@@ -0,0 +1,13 @@
+version: '3'
+
+services:
+  telegraf:
+      image: glinton/scratch
+      volumes:
+        - ./telegraf.conf:/telegraf.conf
+        - ../../../../telegraf:/telegraf
+        - ./json_a.log:/var/log/test.log
+      entrypoint:
+        - /telegraf
+        - --config
+        - /telegraf.conf
diff --git a/plugins/inputs/file/dev/json_a.log b/plugins/inputs/file/dev/json_a.log
@@ -0,0 +1,14 @@
+{
+"parent": {
+	"child": 3.0,
+	"ignored_child": "hi"
+},
+"ignored_null": null,
+"integer": 4,
+"list": [3, 4],
+"ignored_parent": {
+	"another_ignored_null": null,
+	"ignored_string": "hello, world!"
+},
+"another_list": [4]
+}
diff --git a/plugins/inputs/file/dev/telegraf.conf b/plugins/inputs/file/dev/telegraf.conf
@@ -0,0 +1,7 @@
+[[inputs.file]]
+  files = ["/var/log/test.log"]
+  data_format = "json"
+  name_override = "json_file"
+
+[[outputs.file]]
+  files = ["stdout"]
diff --git a/plugins/inputs/file/file.go b/plugins/inputs/file/file.go
@@ -0,0 +1,102 @@
+package file
+
+import (
+	"fmt"
+	"io/ioutil"
+
+	"github.com/influxdata/telegraf"
+	"github.com/influxdata/telegraf/internal/globpath"
+	"github.com/influxdata/telegraf/plugins/inputs"
+	"github.com/influxdata/telegraf/plugins/parsers"
+)
+
+// File is an input plugin that, on each Gather, resolves the configured
+// globs to file names and parses every matched file in full with the
+// configured parser.
+type File struct {
+	// Files holds the paths or glob patterns of the files to read.
+	Files         []string `toml:"files"`
+	// NOTE(review): FromBeginning is not read by any code visible in this
+	// file — confirm whether it is still needed.
+	FromBeginning bool
+	// parser turns raw file contents into metrics; it is set via SetParser.
+	parser        parsers.Parser
+
+	// filenames is the list of matched files, rebuilt by refreshFilePaths.
+	filenames []string
+}
+
+// sampleConfig is the example configuration returned by SampleConfig.
+// The plugin reads each matched file in full on every interval, so the glob
+// examples describe reading, not tailing.
+const sampleConfig = `
+  ## Files to parse each interval.
+  ## These accept standard unix glob matching rules, but with the addition of
+  ## ** as a "super asterisk". ie:
+  ##   /var/log/**.log     -> recursively find all .log files in /var/log
+  ##   /var/log/*/*.log    -> find all .log files with a parent dir in /var/log
+  ##   /var/log/apache.log -> only read the apache log file
+  files = ["/var/log/apache/access.log"]
+
+  ## The dataformat to be read from files
+  ## Each data format has its own unique set of configuration options, read
+  ## more about them here:
+  ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
+  data_format = "influx"
+`
+
+// SampleConfig returns the plugin's example configuration, i.e. the
+// sampleConfig constant.
+func (f *File) SampleConfig() string {
+	return sampleConfig
+}
+
+// Description returns a one-line summary of what the plugin does.
+func (f *File) Description() string {
+	return "reload and gather from file[s] on telegraf's interval"
+}
+
+// Gather refreshes the list of files matched by the configured globs, parses
+// every matched file and adds each resulting metric to acc. It stops at, and
+// returns, the first error encountered.
+func (f *File) Gather(acc telegraf.Accumulator) error {
+	if err := f.refreshFilePaths(); err != nil {
+		return err
+	}
+
+	for _, filename := range f.filenames {
+		parsed, err := f.readMetric(filename)
+		if err != nil {
+			return err
+		}
+		for i := range parsed {
+			m := parsed[i]
+			acc.AddFields(m.Name(), m.Fields(), m.Tags(), m.Time())
+		}
+	}
+
+	return nil
+}
+
+// SetParser stores p as the parser used by readMetric to turn file contents
+// into metrics.
+func (f *File) SetParser(p parsers.Parser) {
+	f.parser = p
+}
+
+// refreshFilePaths compiles every entry of f.Files as a glob, collects each
+// name yielded by the match and stores the combined list in f.filenames.
+// It returns an error, leaving f.filenames untouched, when a glob fails to
+// compile or matches nothing.
+func (f *File) refreshFilePaths() error {
+	var matched []string
+	for _, pattern := range f.Files {
+		glob, err := globpath.Compile(pattern)
+		if err != nil {
+			return fmt.Errorf("could not compile glob %v: %v", pattern, err)
+		}
+
+		found := glob.Match()
+		if len(found) == 0 {
+			return fmt.Errorf("could not find file: %v", pattern)
+		}
+		for name := range found {
+			matched = append(matched, name)
+		}
+	}
+
+	f.filenames = matched
+	return nil
+}
+
+// readMetric reads the whole of filename and parses its contents with the
+// configured parser, returning the resulting metrics.
+//
+// A read failure is returned as a plain lowercase error with no log-level
+// prefix; the caller decides how the error is reported.
+func (f *File) readMetric(filename string) ([]telegraf.Metric, error) {
+	contents, err := ioutil.ReadFile(filename)
+	if err != nil {
+		return nil, fmt.Errorf("could not read file %v: %v", filename, err)
+	}
+	return f.parser.Parse(contents)
+}
+
+// init registers the plugin under the name "file" with a factory that
+// returns an empty File.
+func init() {
+	inputs.Add("file", func() telegraf.Input {
+		return &File{}
+	})
+}