Skip to content

Commit

Permalink
Add file input plugin and grok parser (#4332)
Browse files Browse the repository at this point in the history
  • Loading branch information
maxunt authored and danielnelson committed Jul 14, 2018
1 parent 3f87e5b commit 774a9f0
Show file tree
Hide file tree
Showing 24 changed files with 558 additions and 154 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ configuration options.
* [exec](./plugins/inputs/exec) (generic executable plugin, support JSON, influx, graphite and nagios)
* [fail2ban](./plugins/inputs/fail2ban)
* [fibaro](./plugins/inputs/fibaro)
* [file](./plugins/inputs/file)
* [filestat](./plugins/inputs/filestat)
* [fluentd](./plugins/inputs/fluentd)
* [graylog](./plugins/inputs/graylog)
Expand Down
103 changes: 103 additions & 0 deletions docs/DATA_FORMATS_INPUT.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Telegraf is able to parse the following input data formats into metrics:
1. [Nagios](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md#nagios) (exec input only)
1. [Collectd](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md#collectd)
1. [Dropwizard](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md#dropwizard)
1. [Grok](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md#grok)

Telegraf metrics, like InfluxDB
[points](https://docs.influxdata.com/influxdb/v0.10/write_protocols/line/),
Expand Down Expand Up @@ -657,5 +658,107 @@ For more information about the dropwizard json format see
# [inputs.exec.dropwizard_tag_paths]
# tag1 = "tags.tag1"
# tag2 = "tags.tag2"
```

#### Grok
Parse logstash-style "grok" patterns. Patterns can be added to patterns, or custom patterns read from custom_pattern_files.

# View logstash grok pattern docs here:
# https://www.elastic.co/guide/en/logstash/current/plugins-filters-grok.html
# All default logstash patterns are supported, these can be viewed here:
# https://github.com/logstash-plugins/logstash-patterns-core/blob/master/patterns/grok-patterns

# Available modifiers:
# string (default if nothing is specified)
# int
# float
# duration (ie, 5.23ms gets converted to int nanoseconds)
# tag (converts the field into a tag)
# drop (drops the field completely)
# Timestamp modifiers:
# ts-ansic ("Mon Jan _2 15:04:05 2006")
# ts-unix ("Mon Jan _2 15:04:05 MST 2006")
# ts-ruby ("Mon Jan 02 15:04:05 -0700 2006")
# ts-rfc822 ("02 Jan 06 15:04 MST")
# ts-rfc822z ("02 Jan 06 15:04 -0700")
# ts-rfc850 ("Monday, 02-Jan-06 15:04:05 MST")
# ts-rfc1123 ("Mon, 02 Jan 2006 15:04:05 MST")
# ts-rfc1123z ("Mon, 02 Jan 2006 15:04:05 -0700")
# ts-rfc3339 ("2006-01-02T15:04:05Z07:00")
# ts-rfc3339nano ("2006-01-02T15:04:05.999999999Z07:00")
# ts-httpd ("02/Jan/2006:15:04:05 -0700")
# ts-epoch (seconds since unix epoch)
# ts-epochnano (nanoseconds since unix epoch)
# ts-"CUSTOM"
# CUSTOM time layouts must be within quotes and be the representation of the
# "reference time", which is Mon Jan 2 15:04:05 -0700 MST 2006
# See https://golang.org/pkg/time/#Parse for more details.

# Example log file pattern, example log looks like this:
# [04/Jun/2016:12:41:45 +0100] 1.25 200 192.168.1.1 5.432µs
# Breakdown of the DURATION pattern below:
# NUMBER is a builtin logstash grok pattern matching float & int numbers.
# [nuµm]? is a regex specifying 0 or 1 of the characters within brackets.
# s is also regex, this pattern must end in "s".
# so DURATION will match something like '5.324ms' or '6.1µs' or '10s'
DURATION %{NUMBER}[nuµm]?s
RESPONSE_CODE %{NUMBER:response_code:tag}
RESPONSE_TIME %{DURATION:response_time_ns:duration}
EXAMPLE_LOG \[%{HTTPDATE:ts:ts-httpd}\] %{NUMBER:myfloat:float} %{RESPONSE_CODE} %{IPORHOST:clientip} %{RESPONSE_TIME}

# Wider-ranging username matching vs. logstash built-in %{USER}
NGUSERNAME [a-zA-Z0-9\.\@\-\+_%]+
NGUSER %{NGUSERNAME}
# Wider-ranging client IP matching
CLIENT (?:%{IPORHOST}|%{HOSTPORT}|::1)

##
## COMMON LOG PATTERNS
##

# apache & nginx logs, this is also known as the "common log format"
# see https://en.wikipedia.org/wiki/Common_Log_Format
COMMON_LOG_FORMAT %{CLIENT:client_ip} %{NOTSPACE:ident} %{NOTSPACE:auth} \[%{HTTPDATE:ts:ts-httpd}\] "(?:%{WORD:verb:tag} %{NOTSPACE:request}(?: HTTP/%{NUMBER:http_version:float})?|%{DATA})" %{NUMBER:resp_code:tag} (?:%{NUMBER:resp_bytes:int}|-)

# Combined log format is the same as the common log format but with the addition
# of two quoted strings at the end for "referrer" and "agent"
# See Examples at http://httpd.apache.org/docs/current/mod/mod_log_config.html
COMBINED_LOG_FORMAT %{COMMON_LOG_FORMAT} %{QS:referrer} %{QS:agent}

# HTTPD log formats
HTTPD20_ERRORLOG \[%{HTTPDERROR_DATE:timestamp}\] \[%{LOGLEVEL:loglevel:tag}\] (?:\[client %{IPORHOST:clientip}\] ){0,1}%{GREEDYDATA:errormsg}
HTTPD24_ERRORLOG \[%{HTTPDERROR_DATE:timestamp}\] \[%{WORD:module}:%{LOGLEVEL:loglevel:tag}\] \[pid %{POSINT:pid:int}:tid %{NUMBER:tid:int}\]( \(%{POSINT:proxy_errorcode:int}\)%{DATA:proxy_errormessage}:)?( \[client %{IPORHOST:client}:%{POSINT:clientport}\])? %{DATA:errorcode}: %{GREEDYDATA:message}
HTTPD_ERRORLOG %{HTTPD20_ERRORLOG}|%{HTTPD24_ERRORLOG}

#### Grok Configuration:
```toml
[[inputs.reader]]
## This is a list of patterns to check the given log file(s) for.
## Note that adding patterns here increases processing time. The most
## efficient configuration is to have one pattern per logparser.
## Other common built-in patterns are:
## %{COMMON_LOG_FORMAT} (plain apache & nginx access logs)
## %{COMBINED_LOG_FORMAT} (access logs + referrer & agent)
grok_patterns = ["%{COMBINED_LOG_FORMAT}"]

## Name of the outputted measurement name.
grok_name_override = "apache_access_log"

## Full path(s) to custom pattern files.
grok_custom_pattern_files = []

## Custom patterns can also be defined here. Put one pattern per line.
grok_custom_patterns = '''
'''

## Timezone allows you to provide an override for timestamps that
## don't already include an offset
## e.g. 04/06/2016 12:41:45 data one two 5.43µs
##
## Default: "" which renders UTC
## Options are as follows:
## 1. Local -- interpret based on machine localtime
## 2. "Canada/Eastern" -- Unix TZ values like those found in https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
## 3. UTC -- or blank/unspecified, will return timestamp in UTC
grok_timezone = "Canada/Eastern"
```
58 changes: 58 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -1346,6 +1346,59 @@ func buildParser(name string, tbl *ast.Table) (parsers.Parser, error) {
}
}

//for grok data_format
if node, ok := tbl.Fields["grok_named_patterns"]; ok {
if kv, ok := node.(*ast.KeyValue); ok {
if ary, ok := kv.Value.(*ast.Array); ok {
for _, elem := range ary.Value {
if str, ok := elem.(*ast.String); ok {
c.GrokNamedPatterns = append(c.GrokNamedPatterns, str.Value)
}
}
}
}
}

if node, ok := tbl.Fields["grok_patterns"]; ok {
if kv, ok := node.(*ast.KeyValue); ok {
if ary, ok := kv.Value.(*ast.Array); ok {
for _, elem := range ary.Value {
if str, ok := elem.(*ast.String); ok {
c.GrokPatterns = append(c.GrokPatterns, str.Value)
}
}
}
}
}

if node, ok := tbl.Fields["grok_custom_patterns"]; ok {
if kv, ok := node.(*ast.KeyValue); ok {
if str, ok := kv.Value.(*ast.String); ok {
c.GrokCustomPatterns = str.Value
}
}
}

if node, ok := tbl.Fields["grok_custom_pattern_files"]; ok {
if kv, ok := node.(*ast.KeyValue); ok {
if ary, ok := kv.Value.(*ast.Array); ok {
for _, elem := range ary.Value {
if str, ok := elem.(*ast.String); ok {
c.GrokCustomPatternFiles = append(c.GrokCustomPatternFiles, str.Value)
}
}
}
}
}

if node, ok := tbl.Fields["grok_timezone"]; ok {
if kv, ok := node.(*ast.KeyValue); ok {
if str, ok := kv.Value.(*ast.String); ok {
c.GrokTimeZone = str.Value
}
}
}

c.MetricName = name

delete(tbl.Fields, "data_format")
Expand All @@ -1362,6 +1415,11 @@ func buildParser(name string, tbl *ast.Table) (parsers.Parser, error) {
delete(tbl.Fields, "dropwizard_time_format")
delete(tbl.Fields, "dropwizard_tags_path")
delete(tbl.Fields, "dropwizard_tag_paths")
delete(tbl.Fields, "grok_named_patterns")
delete(tbl.Fields, "grok_patterns")
delete(tbl.Fields, "grok_custom_patterns")
delete(tbl.Fields, "grok_custom_pattern_files")
delete(tbl.Fields, "grok_timezone")

return parsers.NewParser(c)
}
Expand Down
1 change: 1 addition & 0 deletions plugins/inputs/all/all.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
_ "github.com/influxdata/telegraf/plugins/inputs/exec"
_ "github.com/influxdata/telegraf/plugins/inputs/fail2ban"
_ "github.com/influxdata/telegraf/plugins/inputs/fibaro"
_ "github.com/influxdata/telegraf/plugins/inputs/file"
_ "github.com/influxdata/telegraf/plugins/inputs/filestat"
_ "github.com/influxdata/telegraf/plugins/inputs/fluentd"
_ "github.com/influxdata/telegraf/plugins/inputs/graylog"
Expand Down
25 changes: 25 additions & 0 deletions plugins/inputs/file/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# File Input Plugin

The file plugin updates a list of files every interval and parses the contents
using the selected [input data format](/docs/DATA_FORMATS_INPUT.md).

Files will always be read in their entirety, if you wish to tail/follow a file
use the [tail input plugin](/plugins/inputs/tail) instead.

### Configuration:
```toml
[[inputs.file]]
## Files to parse each interval.
## These accept standard unix glob matching rules, but with the addition of
## ** as a "super asterisk". ie:
## /var/log/**.log -> recursively find all .log files in /var/log
## /var/log/*/*.log -> find all .log files with a parent dir in /var/log
## /var/log/apache.log -> only tail the apache log file
files = ["/var/log/apache/access.log"]

## Data format to consume.
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
data_format = "influx"
```
13 changes: 13 additions & 0 deletions plugins/inputs/file/dev/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
version: '3'

services:
telegraf:
image: glinton/scratch
volumes:
- ./telegraf.conf:/telegraf.conf
- ../../../../telegraf:/telegraf
- ./json_a.log:/var/log/test.log
entrypoint:
- /telegraf
- --config
- /telegraf.conf
14 changes: 14 additions & 0 deletions plugins/inputs/file/dev/json_a.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"parent": {
"child": 3.0,
"ignored_child": "hi"
},
"ignored_null": null,
"integer": 4,
"list": [3, 4],
"ignored_parent": {
"another_ignored_null": null,
"ignored_string": "hello, world!"
},
"another_list": [4]
}
7 changes: 7 additions & 0 deletions plugins/inputs/file/dev/telegraf.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[[inputs.file]]
files = ["/var/log/test.log"]
data_format = "json"
name_override = "json_file"

[[outputs.file]]
files = ["stdout"]
102 changes: 102 additions & 0 deletions plugins/inputs/file/file.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package file

import (
"fmt"
"io/ioutil"

"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal/globpath"
"github.com/influxdata/telegraf/plugins/inputs"
"github.com/influxdata/telegraf/plugins/parsers"
)

type File struct {
Files []string `toml:"files"`
FromBeginning bool
parser parsers.Parser

filenames []string
}

const sampleConfig = `
## Files to parse each interval.
## These accept standard unix glob matching rules, but with the addition of
## ** as a "super asterisk". ie:
## /var/log/**.log -> recursively find all .log files in /var/log
## /var/log/*/*.log -> find all .log files with a parent dir in /var/log
## /var/log/apache.log -> only tail the apache log file
files = ["/var/log/apache/access.log"]
## The dataformat to be read from files
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
data_format = "influx"
`

// SampleConfig returns the default configuration of the Input
func (f *File) SampleConfig() string {
return sampleConfig
}

func (f *File) Description() string {
return "reload and gather from file[s] on telegraf's interval"
}

func (f *File) Gather(acc telegraf.Accumulator) error {
err := f.refreshFilePaths()
if err != nil {
return err
}
for _, k := range f.filenames {
metrics, err := f.readMetric(k)
if err != nil {
return err
}

for _, m := range metrics {
acc.AddFields(m.Name(), m.Fields(), m.Tags(), m.Time())
}
}
return nil
}

func (f *File) SetParser(p parsers.Parser) {
f.parser = p
}

func (f *File) refreshFilePaths() error {
var allFiles []string
for _, file := range f.Files {
g, err := globpath.Compile(file)
if err != nil {
return fmt.Errorf("could not compile glob %v: %v", file, err)
}
files := g.Match()
if len(files) <= 0 {
return fmt.Errorf("could not find file: %v", file)
}

for k := range files {
allFiles = append(allFiles, k)
}
}

f.filenames = allFiles
return nil
}

func (f *File) readMetric(filename string) ([]telegraf.Metric, error) {
fileContents, err := ioutil.ReadFile(filename)
if err != nil {
return nil, fmt.Errorf("E! Error file: %v could not be read, %s", filename, err)
}
return f.parser.Parse(fileContents)

}

func init() {
inputs.Add("file", func() telegraf.Input {
return &File{}
})
}
Loading

0 comments on commit 774a9f0

Please sign in to comment.