diff --git a/.chloggen/ottl-parse-simple-xml.yaml b/.chloggen/ottl-parse-simple-xml.yaml new file mode 100644 index 000000000000..7bd4d39a6c5c --- /dev/null +++ b/.chloggen/ottl-parse-simple-xml.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: pkg/ottl + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Add ParseSimplifiedXML Converter + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [35421] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [] diff --git a/pkg/ottl/e2e/e2e_test.go b/pkg/ottl/e2e/e2e_test.go index dcf89bd0c5e2..25d95cbe5db3 100644 --- a/pkg/ottl/e2e/e2e_test.go +++ b/pkg/ottl/e2e/e2e_test.go @@ -663,6 +663,15 @@ func Test_e2e_converters(t *testing.T) { tCtx.GetLogRecord().Attributes().PutStr("test", "k1=v1 k2=\"v2=v3\"") }, }, + { + statement: `set(attributes["test"], ParseSimplifiedXML("1This is a log message!"))`, + want: func(tCtx ottllog.TransformContext) { + attr := tCtx.GetLogRecord().Attributes().PutEmptyMap("test") + log := attr.PutEmptyMap("Log") + log.PutStr("id", "1") + log.PutStr("Message", "This is a log message!") + }, + }, { statement: `set(attributes["test"], ParseXML("This is a log message!"))`, want: func(tCtx ottllog.TransformContext) { diff --git a/pkg/ottl/ottlfuncs/README.md b/pkg/ottl/ottlfuncs/README.md index 6df4154bd9a9..66e9c8a2930e 100644 --- a/pkg/ottl/ottlfuncs/README.md +++ b/pkg/ottl/ottlfuncs/README.md @@ -449,6 +449,7 @@ Available Converters: - [ParseCSV](#parsecsv) - [ParseJSON](#parsejson) - [ParseKeyValue](#parsekeyvalue) +- [ParseSimplifiedXML](#parsesimplifiedxml) - [ParseXML](#parsexml) - [RemoveXML](#removexml) - [Seconds](#seconds) @@ -1335,6 +1336,132 @@ Examples: - `ParseKeyValue("k1!v1_k2!v2_k3!v3", "!", "_")` - `ParseKeyValue(attributes["pairs"])` +### ParseSimplifiedXML + +`ParseSimplifiedXML(target)` + +The `ParseSimplifiedXML` Converter returns a `pcommon.Map` struct that is the result of parsing the target string without preservation of attributes or extraneous text content. + +The goal of this Converter is to produce a more user-friendly representation of XML data than the `ParseXML` Converter. +This Converter should be preferred over `ParseXML` when minor semantic details (e.g. order of elements) are not critically important, when subsequent processing or querying of the result is expected, or when human-readability is a concern. + +This Converter disregards certain aspects of XML, specifically attributes and extraneous text content, in order to produce +a direct representation of XML data. Users are encouraged to simplify their XML documents prior to using `ParseSimplifiedXML`. + +See other functions which may be useful for preparing XML documents: + +- `ConvertAttributesToElementsXML` +- `ConvertTextToElementsXML` +- `RemoveXML` +- `InsertXML` +- `GetXML` + +#### Formal Definitions + +A "Simplified XML" document contains no attributes and no extraneous text content. + +An element has "extraneous text content" when it contains both text and element content. e.g. + +```xml + + bar + world + +``` + +#### Parsing logic + +1. Declaration elements, attributes, comments, and extraneous text content are ignored. +2. Elements which contain a value are converted into key/value pairs. + e.g. `bar` becomes `"foo": "bar"` +3. Elements which contain child elements are converted into a key/value pair where the value is a map. + e.g. ` baz ` becomes `"foo": { "bar": "baz" }` +4. Sibling elements that share the same tag will be combined into a slice. + e.g. ` 1 2 3 ` becomes `"a": { "b": "1", "c": [ "2", "3" ] }`. +5. Empty elements are dropped, but they can determine whether a value should be a slice or map. + e.g. ` 1 ` becomes `"a": { "b": [ "1" ] }` instead of `"a": { "b": "1" }` + +#### Examples + +Parse a Simplified XML document from the body: + +```xml + + 1 + jane +
+ + Something happened + unknown +
+
+``` + +```json +{ + "event": { + "id": 1, + "user": "jane", + "details": { + "time": "2021-10-01T12:00:00Z", + "description": "Something happened", + "cause": "unknown" + } + } +} +``` + +Parse a Simplified XML document with unique child elements: + +```xml + + 1 + 2 + +``` + +```json +{ + "x": { + "y": "1", + "z": "2" + } +} +``` + +Parse a Simplified XML document with multiple elements of the same tag: + +```xml + + 1 + 2 + +``` + +```json +{ + "a": { + "b": ["1", "2"] + } +} +``` + +Parse a Simplified XML document with CDATA element: + +```xml + + 1 + + +``` + +```json +{ + "a": { + "b": ["1", "2"] + } +} +``` ### ParseXML diff --git a/pkg/ottl/ottlfuncs/func_parse_simplified_xml.go b/pkg/ottl/ottlfuncs/func_parse_simplified_xml.go new file mode 100644 index 000000000000..7e4f1e2753fa --- /dev/null +++ b/pkg/ottl/ottlfuncs/func_parse_simplified_xml.go @@ -0,0 +1,134 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package ottlfuncs // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl/ottlfuncs" + +import ( + "context" + "fmt" + + "github.com/antchfx/xmlquery" + "go.opentelemetry.io/collector/pdata/pcommon" + + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl" +) + +type ParseSimplifiedXMLArguments[K any] struct { + Target ottl.StringGetter[K] +} + +func NewParseSimplifiedXMLFactory[K any]() ottl.Factory[K] { + return ottl.NewFactory("ParseSimplifiedXML", &ParseSimplifiedXMLArguments[K]{}, createParseSimplifiedXMLFunction[K]) +} + +func createParseSimplifiedXMLFunction[K any](_ ottl.FunctionContext, oArgs ottl.Arguments) (ottl.ExprFunc[K], error) { + args, ok := oArgs.(*ParseSimplifiedXMLArguments[K]) + + if !ok { + return nil, fmt.Errorf("ParseSimplifiedXML args must be of type *ParseSimplifiedXMLAguments[K]") + } + + return parseSimplifiedXML(args.Target), nil +} + +// The `ParseSimplifiedXML` Converter returns a `pcommon.Map` struct that is the result of parsing the target +// string without preservation of attributes or extraneous text content. +func parseSimplifiedXML[K any](target ottl.StringGetter[K]) ottl.ExprFunc[K] { + return func(ctx context.Context, tCtx K) (any, error) { + var doc *xmlquery.Node + if targetVal, err := target.Get(ctx, tCtx); err != nil { + return nil, err + } else if doc, err = parseNodesXML(targetVal); err != nil { + return nil, err + } + + docMap := pcommon.NewMap() + parseElement(doc, &docMap) + return docMap, nil + } +} + +func parseElement(parent *xmlquery.Node, parentMap *pcommon.Map) { + // Count the number of each element tag so we know whether it will be a member of a slice or not + childTags := make(map[string]int) + for child := parent.FirstChild; child != nil; child = child.NextSibling { + if child.Type != xmlquery.ElementNode { + continue + } + childTags[child.Data]++ + } + if len(childTags) == 0 { + return + } + + // Convert the children, now knowing whether they will be a member of a slice or not + for child := parent.FirstChild; child != nil; child = child.NextSibling { + if child.Type != xmlquery.ElementNode || child.FirstChild == nil { + continue + } + + leafValue := leafValueFromElement(child) + + // Slice of the same element + if childTags[child.Data] > 1 { + // Get or create the slice of children + var childrenSlice pcommon.Slice + childrenValue, ok := parentMap.Get(child.Data) + if ok { + childrenSlice = childrenValue.Slice() + } else { + childrenSlice = parentMap.PutEmptySlice(child.Data) + } + + // Add the child's text content to the slice + if leafValue != "" { + childrenSlice.AppendEmpty().SetStr(leafValue) + continue + } + + // Parse the child to make sure there's something to add + childMap := pcommon.NewMap() + parseElement(child, &childMap) + if childMap.Len() == 0 { + continue + } + + sliceValue := childrenSlice.AppendEmpty() + sliceMap := sliceValue.SetEmptyMap() + childMap.CopyTo(sliceMap) + continue + } + + if leafValue != "" { + parentMap.PutStr(child.Data, leafValue) + continue + } + + // Child will be a map + childMap := pcommon.NewMap() + parseElement(child, &childMap) + if childMap.Len() == 0 { + continue + } + + childMap.CopyTo(parentMap.PutEmptyMap(child.Data)) + } +} + +func leafValueFromElement(node *xmlquery.Node) string { + // First check if there are any child elements. If there are, ignore any extraneous text. + for child := node.FirstChild; child != nil; child = child.NextSibling { + if child.Type == xmlquery.ElementNode { + return "" + } + } + + // No child elements, so return the first text or CDATA content + for child := node.FirstChild; child != nil; child = child.NextSibling { + switch child.Type { + case xmlquery.TextNode, xmlquery.CharDataNode: + return child.Data + } + } + return "" +} diff --git a/pkg/ottl/ottlfuncs/func_parse_simplified_xml_test.go b/pkg/ottl/ottlfuncs/func_parse_simplified_xml_test.go new file mode 100644 index 000000000000..20f3abd6f1d2 --- /dev/null +++ b/pkg/ottl/ottlfuncs/func_parse_simplified_xml_test.go @@ -0,0 +1,278 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package ottlfuncs // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl/ottlfuncs" + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "go.opentelemetry.io/collector/pdata/pcommon" + + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl" +) + +func Test_ParseSimplifiedXML(t *testing.T) { + tests := []struct { + name string + document string + want pcommon.Map + }{ + { + name: "single leaf", + document: `b`, + want: func() pcommon.Map { + m := pcommon.NewMap() + m.PutStr("a", "b") + return m + }(), + }, + { + name: "double leaf", + document: `bc`, + want: func() pcommon.Map { + m := pcommon.NewMap() + b := m.PutEmptySlice("a") + b.AppendEmpty().SetStr("b") + b.AppendEmpty().SetStr("c") + return m + }(), + }, + { + name: "nested maps", + document: `1`, + want: func() pcommon.Map { + m := pcommon.NewMap() + a := m.PutEmptyMap("a") + a.PutStr("b", "1") + return m + }(), + }, + { + name: "mixed slice", + document: `13`, + want: func() pcommon.Map { + m := pcommon.NewMap() + a := m.PutEmptySlice("a") + a.AppendEmpty().SetStr("1") + a.AppendEmpty().SetStr("2") + b := a.AppendEmpty().SetEmptyMap() + b.PutStr("b", "3") + return m + }(), + }, + { + name: "char data leaf", + document: ``, + want: func() pcommon.Map { + m := pcommon.NewMap() + m.PutStr("a", "b") + return m + }(), + }, + { + name: "ignore attributes", + document: `c`, + want: func() pcommon.Map { + m := pcommon.NewMap() + a := m.PutEmptyMap("a") + a.PutStr("b", "c") + return m + }(), + }, + { + name: "ignore declaration", + document: `b`, + want: func() pcommon.Map { + m := pcommon.NewMap() + m.PutStr("a", "b") + return m + }(), + }, + { + name: "ignore comments", + document: `b`, + want: func() pcommon.Map { + m := pcommon.NewMap() + m.PutStr("a", "b") + return m + }(), + }, + { + name: "ignore empty other than comment", + document: `2`, + want: func() pcommon.Map { + m := pcommon.NewMap() + a := m.PutEmptyMap("a") + a.PutStr("b", "2") + return m + }(), + }, + { + name: "empty other than comment forces slice", + document: `24`, + want: func() pcommon.Map { + m := pcommon.NewMap() + a := m.PutEmptyMap("a") + a.PutStr("b", "2") + c := a.PutEmptySlice("c") + c.AppendEmpty().SetStr("4") + return m + }(), + }, + { + name: "ignore extraneous text", + document: `extra13extra2`, + want: func() pcommon.Map { + m := pcommon.NewMap() + a := m.PutEmptyMap("a") + a.PutStr("b", "3") + return m + }(), + }, + { + name: "ignore extraneous CDATA", + document: `3`, + want: func() pcommon.Map { + m := pcommon.NewMap() + a := m.PutEmptyMap("a") + a.PutStr("b", "3") + return m + }(), + }, + { + name: "ignore single empty element", + document: `3`, + want: func() pcommon.Map { + m := pcommon.NewMap() + a := m.PutEmptyMap("a") + a.PutStr("b", "3") + return m + }(), + }, + { + name: "empty element cascade", + document: `2`, + want: func() pcommon.Map { + m := pcommon.NewMap() + a := m.PutEmptyMap("a") + a.PutStr("d", "2") + return m + }(), + }, + { + name: "empty element forces slice", + document: `3`, + want: func() pcommon.Map { + m := pcommon.NewMap() + a := m.PutEmptyMap("a") + b := a.PutEmptySlice("b") + b.AppendEmpty().SetStr("3") + return m + }(), + }, + { + // ParseSimplifiedXML(ConvertAttributesToElementsXML(ConvertTextToElementsXML("..."))) + name: "Simplified WEL", + document: ` + http://schemas.microsoft.com/win/2004/08/events/event + + Microsoft-Windows-Security-Auditing{54849625-5478-4994-a5ba-3e3b0328c30d} + 4625 + 0 + 0 + 12544 + 0 + 0x8010000000000000 + 2024-09-04T08:38:09.7477579Z + 1361885 + {b67ee0c2-a671-0001-5f6b-82e8c1eeda01} + 6562276 + Security + samuel-vahala + + + + SubjectUserSidS-1-0-0 + TargetUserSidS-1-0-0 + Status0xc000006d + WorkstationNameD-508 + +`, + want: func() pcommon.Map { + result := pcommon.NewMap() + event := result.PutEmptyMap("Event") + event.PutStr("xmlns", "http://schemas.microsoft.com/win/2004/08/events/event") + system := event.PutEmptyMap("System") + provider := system.PutEmptyMap("Provider") + provider.PutStr("Name", "Microsoft-Windows-Security-Auditing") + provider.PutStr("Guid", "{54849625-5478-4994-a5ba-3e3b0328c30d}") + system.PutStr("EventID", "4625") + system.PutStr("Version", "0") + system.PutStr("Level", "0") + system.PutStr("Task", "12544") + system.PutStr("Opcode", "0") + system.PutStr("Keywords", "0x8010000000000000") + timeCreated := system.PutEmptyMap("TimeCreated") + timeCreated.PutStr("SystemTime", "2024-09-04T08:38:09.7477579Z") + system.PutStr("EventRecordID", "1361885") + correlation := system.PutEmptyMap("Correlation") + correlation.PutStr("ActivityID", "{b67ee0c2-a671-0001-5f6b-82e8c1eeda01}") + execution := system.PutEmptyMap("Execution") + execution.PutStr("ProcessID", "656") + execution.PutStr("ThreadID", "2276") + system.PutStr("Channel", "Security") + system.PutStr("Computer", "samuel-vahala") + eventData := event.PutEmptyMap("EventData") + data := eventData.PutEmptySlice("Data") + data1 := data.AppendEmpty().SetEmptyMap() + data1.PutStr("Name", "SubjectUserSid") + data1.PutStr("value", "S-1-0-0") + data2 := data.AppendEmpty().SetEmptyMap() + data2.PutStr("Name", "TargetUserSid") + data2.PutStr("value", "S-1-0-0") + data3 := data.AppendEmpty().SetEmptyMap() + data3.PutStr("Name", "Status") + data3.PutStr("value", "0xc000006d") + data4 := data.AppendEmpty().SetEmptyMap() + data4.PutStr("Name", "WorkstationName") + data4.PutStr("value", "D-508") + return result + }(), + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + target := ottl.StandardStringGetter[any]{ + Getter: func(_ context.Context, _ any) (any, error) { + return tt.document, nil + }, + } + exprFunc := parseSimplifiedXML(target) + result, err := exprFunc(context.Background(), nil) + assert.NoError(t, err) + assert.Equal(t, tt.want, result) + }) + } +} + +func TestCreateParseSimplifiedXMLFunc(t *testing.T) { + factory := NewParseSimplifiedXMLFactory[any]() + fCtx := ottl.FunctionContext{} + + // Invalid arg type + exprFunc, err := factory.CreateFunction(fCtx, nil) + assert.Error(t, err) + assert.Nil(t, exprFunc) + + // Invalid XML should error on function execution + exprFunc, err = factory.CreateFunction( + fCtx, &ParseSimplifiedXMLArguments[any]{ + Target: invalidXMLGetter(), + }) + assert.NoError(t, err) + assert.NotNil(t, exprFunc) + _, err = exprFunc(context.Background(), nil) + assert.Error(t, err) +} diff --git a/pkg/ottl/ottlfuncs/functions.go b/pkg/ottl/ottlfuncs/functions.go index 1e5086be59a8..9979c1800d3d 100644 --- a/pkg/ottl/ottlfuncs/functions.go +++ b/pkg/ottl/ottlfuncs/functions.go @@ -73,6 +73,7 @@ func converters[K any]() []ottl.Factory[K] { NewParseCSVFactory[K](), NewParseJSONFactory[K](), NewParseKeyValueFactory[K](), + NewParseSimplifiedXMLFactory[K](), NewParseXMLFactory[K](), NewRemoveXMLFactory[K](), NewSecondsFactory[K](),