From 9e0eb0c0e0cdd2a45a7629b494b914a59bf51b82 Mon Sep 17 00:00:00 2001 From: maxunt Date: Fri, 17 Aug 2018 13:45:22 -0700 Subject: [PATCH] Add ability to set measurement from matched text in grok parser (#4433) --- docs/DATA_FORMATS_INPUT.md | 91 ++++++++++--------- plugins/inputs/file/README.md | 2 +- plugins/inputs/file/dev/docker-compose.yml | 2 +- plugins/inputs/file/dev/json_a.log | 14 --- .../file/{ => dev}/testfiles/grok_a.log | 0 .../file/{ => dev}/testfiles/json_a.log | 0 plugins/inputs/file/file.go | 9 +- plugins/inputs/file/file_test.go | 12 +-- plugins/parsers/grok/parser.go | 6 +- plugins/parsers/grok/parser_test.go | 50 ++++++++++ 10 files changed, 112 insertions(+), 74 deletions(-) delete mode 100644 plugins/inputs/file/dev/json_a.log rename plugins/inputs/file/{ => dev}/testfiles/grok_a.log (100%) rename plugins/inputs/file/{ => dev}/testfiles/json_a.log (100%) diff --git a/docs/DATA_FORMATS_INPUT.md b/docs/DATA_FORMATS_INPUT.md index 753523843f249..ded0170ec80d2 100644 --- a/docs/DATA_FORMATS_INPUT.md +++ b/docs/DATA_FORMATS_INPUT.md @@ -670,50 +670,6 @@ The best way to get acquainted with grok patterns is to read the logstash docs, which are available here: https://www.elastic.co/guide/en/logstash/current/plugins-filters-grok.html -#### Grok Configuration: -```toml -[[inputs.file]] - ## Files to parse each interval. - ## These accept standard unix glob matching rules, but with the addition of - ## ** as a "super asterisk". 
ie: - ## /var/log/**.log -> recursively find all .log files in /var/log - ## /var/log/*/*.log -> find all .log files with a parent dir in /var/log - ## /var/log/apache.log -> only tail the apache log file - files = ["/var/log/apache/access.log"] - - ## The dataformat to be read from files - ## Each data format has its own unique set of configuration options, read - ## more about them here: - ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md - data_format = "grok" - - ## This is a list of patterns to check the given log file(s) for. - ## Note that adding patterns here increases processing time. The most - ## efficient configuration is to have one pattern. - ## Other common built-in patterns are: - ## %{COMMON_LOG_FORMAT} (plain apache & nginx access logs) - ## %{COMBINED_LOG_FORMAT} (access logs + referrer & agent) - grok_patterns = ["%{COMBINED_LOG_FORMAT}"] - - ## Full path(s) to custom pattern files. - grok_custom_pattern_files = [] - - ## Custom patterns can also be defined here. Put one pattern per line. - grok_custom_patterns = ''' - ''' - - ## Timezone allows you to provide an override for timestamps that - ## don't already include an offset - ## e.g. 04/06/2016 12:41:45 data one two 5.43µs - ## - ## Default: "" which renders UTC - ## Options are as follows: - ## 1. Local -- interpret based on machine localtime - ## 2. "Canada/Eastern" -- Unix TZ values like those found in https://en.wikipedia.org/wiki/List_of_tz_database_time_zones - ## 3. UTC -- or blank/unspecified, will return timestamp in UTC - grok_timezone = "Canada/Eastern" -``` - The grok parser uses a slightly modified version of logstash "grok" patterns, with the format: @@ -740,6 +696,7 @@ You must capture at least one field per line. 
- duration (ie, 5.23ms gets converted to int nanoseconds) - tag (converts the field into a tag) - drop (drops the field completely) + - measurement (use the matched text as the measurement name) - Timestamp modifiers: - ts (This will auto-learn the timestamp format) - ts-ansic ("Mon Jan _2 15:04:05 2006") @@ -759,7 +716,7 @@ You must capture at least one field per line. - ts-"CUSTOM" CUSTOM time layouts must be within quotes and be the representation of the -"reference time", which is `Mon Jan 2 15:04:05 -0700 MST 2006`. +"reference time", which is `Mon Jan 2 15:04:05 -0700 MST 2006`. To match a comma decimal point you can use a period. For example `%{TIMESTAMP:timestamp:ts-"2006-01-02 15:04:05.000"}` can be used to match `"2018-01-02 15:04:05,000"` To match a comma decimal point you can use a period in the pattern string. See https://golang.org/pkg/time/#Parse for more details. @@ -773,6 +730,50 @@ logstash patterns that depend on these are not supported._ If you need help building patterns to match your logs, you will find the https://grokdebug.herokuapp.com application quite useful! +#### Grok Configuration: +```toml +[[inputs.file]] + ## Files to parse each interval. + ## These accept standard unix glob matching rules, but with the addition of + ## ** as a "super asterisk". ie: + ## /var/log/**.log -> recursively find all .log files in /var/log + ## /var/log/*/*.log -> find all .log files with a parent dir in /var/log + ## /var/log/apache.log -> only tail the apache log file + files = ["/var/log/apache/access.log"] + + ## The dataformat to be read from files + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "grok" + + ## This is a list of patterns to check the given log file(s) for. + ## Note that adding patterns here increases processing time. The most + ## efficient configuration is to have one pattern. 
+ ## Other common built-in patterns are: + ## %{COMMON_LOG_FORMAT} (plain apache & nginx access logs) + ## %{COMBINED_LOG_FORMAT} (access logs + referrer & agent) + grok_patterns = ["%{COMBINED_LOG_FORMAT}"] + + ## Full path(s) to custom pattern files. + grok_custom_pattern_files = [] + + ## Custom patterns can also be defined here. Put one pattern per line. + grok_custom_patterns = ''' + ''' + + ## Timezone allows you to provide an override for timestamps that + ## don't already include an offset + ## e.g. 04/06/2016 12:41:45 data one two 5.43µs + ## + ## Default: "" which renders UTC + ## Options are as follows: + ## 1. Local -- interpret based on machine localtime + ## 2. "Canada/Eastern" -- Unix TZ values like those found in https://en.wikipedia.org/wiki/List_of_tz_database_time_zones + ## 3. UTC -- or blank/unspecified, will return timestamp in UTC + grok_timezone = "Canada/Eastern" +``` + #### Timestamp Examples This example input and config parses a file using a custom timestamp conversion: diff --git a/plugins/inputs/file/README.md b/plugins/inputs/file/README.md index 73a3a2362e0f0..4358b67ad2668 100644 --- a/plugins/inputs/file/README.md +++ b/plugins/inputs/file/README.md @@ -14,7 +14,7 @@ use the [tail input plugin](/plugins/inputs/tail) instead. ## ** as a "super asterisk". ie: ## /var/log/**.log -> recursively find all .log files in /var/log ## /var/log/*/*.log -> find all .log files with a parent dir in /var/log - ## /var/log/apache.log -> only tail the apache log file + ## /var/log/apache.log -> only read the apache log file files = ["/var/log/apache/access.log"] ## Data format to consume. 
diff --git a/plugins/inputs/file/dev/docker-compose.yml b/plugins/inputs/file/dev/docker-compose.yml index 3c16fca909ebd..efce389f78424 100644 --- a/plugins/inputs/file/dev/docker-compose.yml +++ b/plugins/inputs/file/dev/docker-compose.yml @@ -6,7 +6,7 @@ services: volumes: - ./telegraf.conf:/telegraf.conf - ../../../../telegraf:/telegraf - - ./json_a.log:/var/log/test.log + - ./testfiles/json_a.log:/var/log/test.log entrypoint: - /telegraf - --config diff --git a/plugins/inputs/file/dev/json_a.log b/plugins/inputs/file/dev/json_a.log deleted file mode 100644 index 0f52e9d1e3b57..0000000000000 --- a/plugins/inputs/file/dev/json_a.log +++ /dev/null @@ -1,14 +0,0 @@ -{ -"parent": { - "child": 3.0, - "ignored_child": "hi" -}, -"ignored_null": null, -"integer": 4, -"list": [3, 4], -"ignored_parent": { - "another_ignored_null": null, - "ignored_string": "hello, world!" -}, -"another_list": [4] -} diff --git a/plugins/inputs/file/testfiles/grok_a.log b/plugins/inputs/file/dev/testfiles/grok_a.log similarity index 100% rename from plugins/inputs/file/testfiles/grok_a.log rename to plugins/inputs/file/dev/testfiles/grok_a.log diff --git a/plugins/inputs/file/testfiles/json_a.log b/plugins/inputs/file/dev/testfiles/json_a.log similarity index 100% rename from plugins/inputs/file/testfiles/json_a.log rename to plugins/inputs/file/dev/testfiles/json_a.log diff --git a/plugins/inputs/file/file.go b/plugins/inputs/file/file.go index 2779561fc2ffb..d6714301eaed2 100644 --- a/plugins/inputs/file/file.go +++ b/plugins/inputs/file/file.go @@ -11,9 +11,8 @@ import ( ) type File struct { - Files []string `toml:"files"` - FromBeginning bool - parser parsers.Parser + Files []string `toml:"files"` + parser parsers.Parser filenames []string } @@ -24,7 +23,7 @@ const sampleConfig = ` ## ** as a "super asterisk".
ie: ## /var/log/**.log -> recursively find all .log files in /var/log ## /var/log/*/*.log -> find all .log files with a parent dir in /var/log - ## /var/log/apache.log -> only tail the apache log file + ## /var/log/apache.log -> only read the apache log file files = ["/var/log/apache/access.log"] ## The dataformat to be read from files @@ -40,7 +39,7 @@ func (f *File) SampleConfig() string { } func (f *File) Description() string { - return "reload and gather from file[s] on telegraf's interval" + return "Reload and gather from file[s] on telegraf's interval." } func (f *File) Gather(acc telegraf.Accumulator) error { diff --git a/plugins/inputs/file/file_test.go b/plugins/inputs/file/file_test.go index 28105664615a1..43322c2e84cf9 100644 --- a/plugins/inputs/file/file_test.go +++ b/plugins/inputs/file/file_test.go @@ -14,26 +14,26 @@ import ( func TestRefreshFilePaths(t *testing.T) { wd, err := os.Getwd() r := File{ - Files: []string{filepath.Join(wd, "testfiles/**.log")}, + Files: []string{filepath.Join(wd, "dev/testfiles/**.log")}, } err = r.refreshFilePaths() require.NoError(t, err) - assert.Equal(t, len(r.filenames), 2) + assert.Equal(t, 2, len(r.filenames)) } func TestJSONParserCompile(t *testing.T) { var acc testutil.Accumulator wd, _ := os.Getwd() r := File{ - Files: []string{filepath.Join(wd, "testfiles/json_a.log")}, + Files: []string{filepath.Join(wd, "dev/testfiles/json_a.log")}, } parserConfig := parsers.Config{ DataFormat: "json", TagKeys: []string{"parent_ignored_child"}, } nParser, err := parsers.NewParser(&parserConfig) - r.parser = nParser assert.NoError(t, err) + r.parser = nParser r.Gather(&acc) assert.Equal(t, map[string]string{"parent_ignored_child": "hi"}, acc.Metrics[0].Tags) @@ -44,7 +44,7 @@ func TestGrokParser(t *testing.T) { wd, _ := os.Getwd() var acc testutil.Accumulator r := File{ - Files: []string{filepath.Join(wd, "testfiles/grok_a.log")}, + Files: []string{filepath.Join(wd, "dev/testfiles/grok_a.log")}, } parserConfig := 
parsers.Config{ @@ -57,5 +57,5 @@ func TestGrokParser(t *testing.T) { assert.NoError(t, err) err = r.Gather(&acc) - assert.Equal(t, 2, len(acc.Metrics)) + assert.Equal(t, len(acc.Metrics), 2) } diff --git a/plugins/parsers/grok/parser.go b/plugins/parsers/grok/parser.go index 096cb8ed830e6..bc65588eb9841 100644 --- a/plugins/parsers/grok/parser.go +++ b/plugins/parsers/grok/parser.go @@ -38,6 +38,7 @@ var timeLayouts = map[string]string{ } const ( + MEASUREMENT = "measurement" INT = "int" TAG = "tag" FLOAT = "float" @@ -217,7 +218,6 @@ func (p *Parser) ParseLine(line string) (telegraf.Metric, error) { if k == "" || v == "" { continue } - // t is the modifier of the field var t string // check if pattern has some modifiers @@ -239,6 +239,8 @@ func (p *Parser) ParseLine(line string) (telegraf.Metric, error) { } switch t { + case MEASUREMENT: + p.Measurement = v case INT: iv, err := strconv.ParseInt(v, 10, 64) if err != nil { @@ -350,7 +352,7 @@ func (p *Parser) ParseLine(line string) (telegraf.Metric, error) { } if len(fields) == 0 { - return nil, fmt.Errorf("logparser_grok: must have one or more fields") + return nil, fmt.Errorf("grok: must have one or more fields") } return metric.New(p.Measurement, tags, fields, p.tsModder.tsMod(timestamp)) diff --git a/plugins/parsers/grok/parser_test.go b/plugins/parsers/grok/parser_test.go index 09f8fa16d89b5..8133d30212156 100644 --- a/plugins/parsers/grok/parser_test.go +++ b/plugins/parsers/grok/parser_test.go @@ -1,6 +1,7 @@ package grok import ( + "log" "testing" "time" @@ -959,3 +960,52 @@ func TestReplaceTimestampComma(t *testing.T) { //Convert Nanosecond to milisecond for compare require.Equal(t, 555, m.Time().Nanosecond()/1000000) } + +func TestDynamicMeasurementModifier(t *testing.T) { + p := &Parser{ + Patterns: []string{"%{TEST}"}, + CustomPatterns: "TEST %{NUMBER:var1:tag} %{NUMBER:var2:float} %{WORD:test:measurement}", + } + + require.NoError(t, p.Compile()) + m, err := p.ParseLine("4 5 hello") + 
require.NoError(t, err) + require.Equal(t, "hello", m.Name()) +} + +func TestStaticMeasurementModifier(t *testing.T) { + p := &Parser{ + Patterns: []string{"%{WORD:hi:measurement} %{NUMBER:num:string}"}, + } + + require.NoError(t, p.Compile()) + m, err := p.ParseLine("test_name 42") + log.Printf("%v", m) + require.NoError(t, err) + require.Equal(t, "test_name", m.Name()) +} + +// TestTwoMeasurementModifier checks that the top-level measurement name is used. +func TestTwoMeasurementModifier(t *testing.T) { + p := &Parser{ + Patterns: []string{"%{TEST:test_name:measurement}"}, + CustomPatterns: "TEST %{NUMBER:var1:tag} %{NUMBER:var2:measurement} %{WORD:var3:measurement}", + } + + require.NoError(t, p.Compile()) + m, err := p.ParseLine("4 5 hello") + require.NoError(t, err) + require.Equal(t, "4 5 hello", m.Name()) +} + +func TestMeasurementModifierNoName(t *testing.T) { + p := &Parser{ + Patterns: []string{"%{TEST}"}, + CustomPatterns: "TEST %{NUMBER:var1:tag} %{NUMBER:var2:float} %{WORD:hi:measurement}", + } + + require.NoError(t, p.Compile()) + m, err := p.ParseLine("4 5 hello") + require.NoError(t, err) + require.Equal(t, "hello", m.Name()) +}