-
Notifications
You must be signed in to change notification settings - Fork 70
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
RFC: Add support for complex capture group names in parse_regex. #505
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,7 @@ package confgenerator | |
import ( | ||
"fmt" | ||
"log" | ||
"regexp" | ||
"strings" | ||
|
||
"github.com/GoogleCloudPlatform/ops-agent/confgenerator/filter" | ||
|
@@ -62,6 +63,52 @@ func init() { | |
LoggingProcessorTypes.RegisterType(func() Component { return &LoggingProcessorParseJson{} }) | ||
} | ||
|
||
// rewriteComplexCaptures translates disallowed capture group names into placeholders and a transformation to rename them back in the log record. | ||
func rewriteComplexCaptures(regex, tag string) (string, []fluentbit.Component) { | ||
// Short-circuit regexes that don't have disallowed capture group names | ||
disallowed := regexp.MustCompile(`\(\?P<(?:[A-Za-z0-9_]*[^A-Za-z0-9_>])+(?:[A-Za-z0-9_]*)>`) | ||
if !disallowed.MatchString(regex) { | ||
return regex, nil | ||
} | ||
// Maintain a list of rewritten capture group names | ||
var rewrites []string | ||
captureGroup := regexp.MustCompile(`\(\?P<((?:[^>\\]|\\.)*)>`) | ||
// Can't use ReplaceAllStringFunc, since it doesn't support replacing only captured values | ||
groupIndexes := captureGroup.FindAllStringSubmatchIndex(regex, -1) | ||
l := 0 | ||
var r []string | ||
for _, i := range groupIndexes { | ||
g := regex[i[0]:i[1]] // Full match | ||
s := regex[i[2]:i[3]] // First capture group | ||
r = append(r, regex[l:i[2]]) | ||
// Also replace any capture group whose name starts with "__" | ||
if !disallowed.MatchString(g) && !strings.HasPrefix(s, "__") { | ||
r = append(r, s) | ||
} else { | ||
rewrites = append(rewrites, s) | ||
r = append(r, fmt.Sprintf("__%d", len(rewrites))) | ||
} | ||
l = i[3] | ||
} | ||
r = append(r, regex[l:]) | ||
// Reconstruct the regex | ||
regex = strings.Join(r, "") | ||
// Rename all captured fields | ||
oc := make([][2]string, len(rewrites)) | ||
for i, g := range rewrites { | ||
oc = append(oc, [2]string{"Rename", fmt.Sprintf("__%d %q", i+1, g)}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This doesn't allow users to set complicated fields they might want to set such as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, it doesn't, but this wasn't the intended goal. This was supposed to match the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should not match Either way, you're still not giving people a way to set nested fields like this. e.g. you could give {
"logging.googleapis.com/labels": {
"hello": "world"
}
} which in OA config would be called There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about:
The second form is basically the filter syntax, except it uses a non-LogEntry structure which would be confusing... But if |
||
} | ||
rename := fluentbit.Component{ | ||
Kind: "FILTER", | ||
Config: map[string]string{ | ||
"Match": tag, | ||
"Name": "modify", | ||
}, | ||
OrderedConfig: oc, | ||
} | ||
return regex, []fluentbit.Component{rename} | ||
} | ||
|
||
// A LoggingProcessorParseRegex applies a regex to the specified field, storing the named capture groups as keys in the log record. | ||
// This was maintained in addition to the parse_regex_complex to ensure backward compatibility with any existing configurations | ||
type LoggingProcessorParseRegex struct { | ||
|
@@ -77,14 +124,16 @@ func (r LoggingProcessorParseRegex) Type() string { | |
} | ||
|
||
func (p LoggingProcessorParseRegex) Components(tag, uid string) []fluentbit.Component { | ||
regex, transforms := rewriteComplexCaptures(p.Regex, tag) | ||
|
||
parser, parserName := p.ParserShared.Component(tag, uid) | ||
parser.Config["Format"] = "regex" | ||
parser.Config["Regex"] = p.Regex | ||
parser.Config["Regex"] = regex | ||
|
||
return []fluentbit.Component{ | ||
return append([]fluentbit.Component{ | ||
parser, | ||
fluentbit.ParserFilterComponent(tag, p.Field, []string{parserName}), | ||
} | ||
}, transforms...) | ||
} | ||
|
||
type RegexParser struct { | ||
|
@@ -103,10 +152,13 @@ func (p LoggingProcessorParseRegexComplex) Components(tag, uid string) []fluentb | |
parserNames := []string{} | ||
|
||
for idx, parserConfig := range p.Parsers { | ||
regex, transforms := rewriteComplexCaptures(parserConfig.Regex, tag) | ||
|
||
parser, parserName := parserConfig.Parser.Component(tag, fmt.Sprintf("%s.%d", uid, idx)) | ||
parser.Config["Format"] = "regex" | ||
parser.Config["Regex"] = parserConfig.Regex | ||
parser.Config["Regex"] = regex | ||
components = append(components, parser) | ||
components = append(components, transforms...) | ||
parserNames = append(parserNames, parserName) | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
@SET buffers_dir=/var/lib/google-cloud-ops-agent/fluent-bit/buffers | ||
@SET logs_dir=/var/log/google-cloud-ops-agent/subagents | ||
|
||
[SERVICE] | ||
Daemon off | ||
Flush 1 | ||
Log_Level info | ||
dns.resolver legacy | ||
storage.backlog.mem_limit 50M | ||
storage.checksum on | ||
storage.max_chunks_up 128 | ||
storage.metrics on | ||
storage.sync normal | ||
|
||
[INPUT] | ||
Name fluentbit_metrics | ||
Scrape_Interval 60 | ||
Scrape_On_Start True | ||
|
||
[INPUT] | ||
Buffer_Chunk_Size 512k | ||
Buffer_Max_Size 5M | ||
DB ${buffers_dir}/default_pipeline_syslog | ||
Key message | ||
Mem_Buf_Limit 10M | ||
Name tail | ||
Path /var/log/messages,/var/log/syslog | ||
Read_from_Head True | ||
Rotate_Wait 30 | ||
Skip_Long_Lines On | ||
Tag default_pipeline.syslog | ||
storage.type filesystem | ||
|
||
[INPUT] | ||
Buffer_Chunk_Size 512k | ||
Buffer_Max_Size 5M | ||
DB ${buffers_dir}/ops-agent-fluent-bit | ||
Key message | ||
Mem_Buf_Limit 10M | ||
Name tail | ||
Path ${logs_dir}/logging-module.log | ||
Read_from_Head True | ||
Rotate_Wait 30 | ||
Skip_Long_Lines On | ||
Tag ops-agent-fluent-bit | ||
storage.type filesystem | ||
|
||
[FILTER] | ||
Key_Name key_1 | ||
Match default_pipeline.syslog | ||
Name parser | ||
Parser default_pipeline.syslog.0 | ||
|
||
[FILTER] | ||
Match default_pipeline.syslog | ||
Name modify | ||
|
||
|
||
|
||
Rename __1 "logging.googleapis.com/severity" | ||
Rename __2 "z*%\\>\\\\!" | ||
Rename __3 "__6" | ||
|
||
[FILTER] | ||
Add logging.googleapis.com/logName syslog | ||
Match default_pipeline.syslog | ||
Name modify | ||
|
||
[OUTPUT] | ||
Match_Regex ^(default_pipeline\.syslog)$ | ||
Name stackdriver | ||
Retry_Limit 3 | ||
net.connect_timeout_log_error False | ||
resource gce_instance | ||
stackdriver_agent Google-Cloud-Ops-Agent-Logging/latest (BuildDistro=build_distro;Platform=linux;ShortName=linux_platform;ShortVersion=linux_platform_version) | ||
tls On | ||
tls.verify Off | ||
workers 8 | ||
|
||
[OUTPUT] | ||
Match_Regex ^(ops-agent-fluent-bit)$ | ||
Name stackdriver | ||
Retry_Limit 3 | ||
net.connect_timeout_log_error False | ||
resource gce_instance | ||
stackdriver_agent Google-Cloud-Ops-Agent-Logging/latest (BuildDistro=build_distro;Platform=linux;ShortName=linux_platform;ShortVersion=linux_platform_version) | ||
tls On | ||
tls.verify Off | ||
workers 8 | ||
|
||
[OUTPUT] | ||
Match * | ||
Name prometheus_exporter | ||
host 0.0.0.0 | ||
port 20202 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
[PARSER] | ||
Format regex | ||
Name default_pipeline.syslog.0 | ||
Regex ^(?P<__1>[EWID]) (?P<__2>.*) (?P<__3>.)$ | ||
Time_Format time_format_1 | ||
Time_Key time_key_1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think it's possible to parse a regex with a regex, because of the nesting parentheses problem. You would need a smarter parser/lexer. We may yet have to go down that road to adopt Stanza, but I really don't want to if we can avoid it...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In general, you're correct. Luckily, this isn't actually parsing a regex — it's only looking for the named capturing groups, which have a much simpler syntax.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm pretty sure I could construct a regex that this code does not parse correctly. e.g. I don't think it would correctly handle
which would match the string
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's fair. It's not really a nesting problem, as much as an escaping problem, and a potential limitation of Go's regexp engine (which, e.g., does not allow lookbehind). It may be possible to slurp up escaped backslashes prefixed by the start anchor or a non-backslash character, which will take care of that. Let me fix the regex (and I can add your example to the test to make sure it stays untouched).