[pkg/stanza][receiver/filelog] Introduce active file grouping mechani…

…sm (open-telemetry#36518)  #### Description - introduce active file grouping according to the description in ticket - unit tests  #### Link to tracking issue Fixes open-telemetry#23787 --------- Signed-off-by: odubajDT <[email protected]>
shivanthzen · Dec 4, 2024 · 3d4eea2 · 3d4eea2
1 parent b216526
commit 3d4eea2
Show file tree

Hide file tree

Showing 6 changed files with 159 additions and 12 deletions.
diff --git a/.chloggen/file-grouping.yaml b/.chloggen/file-grouping.yaml
@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: enhancement
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: pkg/stanza
+
+# A brief description of the change.  Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: "Introduce active file grouping mechanism."
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [23787]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext:
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: []
diff --git a/pkg/stanza/fileconsumer/config_test.go b/pkg/stanza/fileconsumer/config_test.go
@@ -219,6 +219,23 @@ func TestUnmarshal(t *testing.T) {
 					return newMockOperatorConfig(cfg)
 				}(),
 			},
+			{
+				Name: "sort_by_group_by",
+				Expect: func() *mockOperatorConfig {
+					cfg := NewConfig()
+					cfg.OrderingCriteria = matcher.OrderingCriteria{
+						Regex:   `err\.(?P<file_num>[a-zA-Z])\.\d+\.\d{10}\.log`,
+						GroupBy: `err\.(?P<value>[a-z]+).[0-9]*.*log`,
+						SortBy: []matcher.Sort{
+							{
+								SortType: "numeric",
+								RegexKey: "file_num",
+							},
+						},
+					}
+					return newMockOperatorConfig(cfg)
+				}(),
+			},
 			{
 				Name: "poll_interval_no_units",
 				Expect: func() *mockOperatorConfig {

diff --git a/pkg/stanza/fileconsumer/matcher/matcher.go b/pkg/stanza/fileconsumer/matcher/matcher.go
@@ -44,9 +44,10 @@ type Criteria struct {
 }
 
 type OrderingCriteria struct {
-	Regex  string `mapstructure:"regex,omitempty"`
-	TopN   int    `mapstructure:"top_n,omitempty"`
-	SortBy []Sort `mapstructure:"sort_by,omitempty"`
+	Regex   string `mapstructure:"regex,omitempty"`
+	TopN    int    `mapstructure:"top_n,omitempty"`
+	SortBy  []Sort `mapstructure:"sort_by,omitempty"`
+	GroupBy string `mapstructure:"group_by,omitempty"`
 }
 
 type Sort struct {
@@ -80,6 +81,14 @@ func New(c Criteria) (*Matcher, error) {
 		m.filterOpts = append(m.filterOpts, filter.ExcludeOlderThan(c.ExcludeOlderThan))
 	}
 
+	if c.OrderingCriteria.GroupBy != "" {
+		r, err := regexp.Compile(c.OrderingCriteria.GroupBy)
+		if err != nil {
+			return nil, fmt.Errorf("compile group_by regex: %w", err)
+		}
+		m.groupBy = r
+	}
+
 	if len(c.OrderingCriteria.SortBy) == 0 {
 		return m, nil
 	}
@@ -92,14 +101,13 @@ func New(c Criteria) (*Matcher, error) {
 		c.OrderingCriteria.TopN = defaultOrderingCriteriaTopN
 	}
 
-	var regex *regexp.Regexp
 	if orderingCriteriaNeedsRegex(c.OrderingCriteria.SortBy) {
 		if c.OrderingCriteria.Regex == "" {
 			return nil, fmt.Errorf("'regex' must be specified when 'sort_by' is specified")
 		}
 
 		var err error
-		regex, err = regexp.Compile(c.OrderingCriteria.Regex)
+		regex, err := regexp.Compile(c.OrderingCriteria.Regex)
 		if err != nil {
 			return nil, fmt.Errorf("compile regex: %w", err)
 		}
@@ -158,6 +166,7 @@ type Matcher struct {
 	exclude    []string
 	regex      *regexp.Regexp
 	filterOpts []filter.Option
+	groupBy    *regexp.Regexp
 }
 
 // MatchFiles gets a list of paths given an array of glob patterns to include and exclude
@@ -174,9 +183,27 @@ func (m Matcher) MatchFiles() ([]string, error) {
 		return files, errs
 	}
 
-	result, err := filter.Filter(files, m.regex, m.filterOpts...)
-	if len(result) == 0 {
-		return result, errors.Join(err, errs)
+	groups := make(map[string][]string)
+	if m.groupBy != nil {
+		for _, f := range files {
+			matches := m.groupBy.FindStringSubmatch(f)
+			if len(matches) > 1 {
+				group := matches[1]
+				groups[group] = append(groups[group], f)
+			}
+		}
+	} else {
+		groups["1"] = files
+	}
+
+	var result []string
+	for _, groupedFiles := range groups {
+		groupResult, err := filter.Filter(groupedFiles, m.regex, m.filterOpts...)
+		if len(groupResult) == 0 {
+			return groupResult, errors.Join(err, errs)
+		}
+		result = append(result, groupResult...)
 	}
+
 	return result, errs
 }
diff --git a/pkg/stanza/fileconsumer/matcher/matcher_test.go b/pkg/stanza/fileconsumer/matcher/matcher_test.go
@@ -69,6 +69,15 @@ func TestNew(t *testing.T) {
 			},
 			expectedErr: "exclude: parse glob: syntax error in pattern",
 		},
+		{
+			name: "GroupBy",
+			criteria: Criteria{
+				Include: []string{"*.log"},
+				OrderingCriteria: OrderingCriteria{
+					GroupBy: "[a-z]",
+				},
+			},
+		},
 		{
 			name: "RegexEmpty",
 			criteria: Criteria{
@@ -118,6 +127,16 @@ func TestNew(t *testing.T) {
 			},
 			expectedErr: "'top_n' must be a positive integer",
 		},
+		{
+			name: "GroupBy error",
+			criteria: Criteria{
+				Include: []string{"*.log"},
+				OrderingCriteria: OrderingCriteria{
+					GroupBy: "[a-z",
+				},
+			},
+			expectedErr: "compile group_by regex: error parsing regexp: missing closing ]: `[a-z`",
+		},
 		{
 			name: "SortTypeEmpty",
 			criteria: Criteria{
@@ -384,6 +403,54 @@ func TestMatcher(t *testing.T) {
 			},
 			expected: []string{"err.123456789.log"},
 		},
+		{
+			name:    "Numeric Sorting",
+			files:   []string{"err.a.123456788.log", "err.a.123456789.log", "err.a.123456787.log", "err.a.123456786.log", "err.b.123456788.log", "err.b.123456789.log"},
+			include: []string{"err.*.log"},
+			exclude: []string{},
+			filterCriteria: OrderingCriteria{
+				TopN:  6,
+				Regex: `err\.[a-z]\.(?P<value>\d+).*log`,
+				SortBy: []Sort{
+					{
+						SortType:  sortTypeNumeric,
+						RegexKey:  "value",
+						Ascending: false,
+					},
+				},
+			},
+			expected: []string{"err.a.123456789.log", "err.b.123456789.log", "err.a.123456788.log", "err.b.123456788.log", "err.a.123456787.log", "err.a.123456786.log"},
+		},
+		{
+			name:    "Numeric Sorting with grouping",
+			files:   []string{"err.a.123456788.log", "err.a.123456789.log", "err.a.123456787.log", "err.a.123456786.log", "err.b.123456788.log", "err.b.123456789.log"},
+			include: []string{"err.*.log"},
+			exclude: []string{},
+			filterCriteria: OrderingCriteria{
+				TopN:    6,
+				GroupBy: `err\.(?P<value>[a-z]+).[0-9]*.*log`,
+				Regex:   `err\.[a-z]\.(?P<value>\d+).*log`,
+				SortBy: []Sort{
+					{
+						SortType:  sortTypeNumeric,
+						RegexKey:  "value",
+						Ascending: false,
+					},
+				},
+			},
+			expected: []string{"err.a.123456789.log", "err.a.123456788.log", "err.a.123456787.log", "err.a.123456786.log", "err.b.123456789.log", "err.b.123456788.log"},
+		},
+		{
+			name:    "Grouping",
+			files:   []string{"err.a.123456788.log", "err.a.123456789.log", "err.a.123456787.log", "err.b.123456788.log", "err.a.123456786.log", "err.b.123456789.log"},
+			include: []string{"err.*.log"},
+			exclude: []string{},
+			filterCriteria: OrderingCriteria{
+				TopN:    6,
+				GroupBy: `err\.(?P<value>[a-z]+).[0-9]*.*log`,
+			},
+			expected: []string{"err.a.123456786.log", "err.a.123456787.log", "err.a.123456788.log", "err.a.123456789.log", "err.b.123456788.log", "err.b.123456789.log"},
+		},
 		{
 			name:    "Numeric Sorting Ascending",
 			files:   []string{"err.123456789.log", "err.123456788.log", "err.123456786.log", "err.123456787.log"},
@@ -786,7 +853,7 @@ func TestMatcher(t *testing.T) {
 			} else {
 				assert.NoError(t, err)
 			}
-			assert.ElementsMatch(t, tc.expected, files)
+			assert.Equal(t, tc.expected, files)
 		})
 	}
 }

diff --git a/pkg/stanza/fileconsumer/testdata/config.yaml b/pkg/stanza/fileconsumer/testdata/config.yaml
@@ -51,6 +51,14 @@ exclude_one:
     - "*.log"
   exclude:
     - one.log
+sort_by_group_by:
+  type: mock
+  ordering_criteria:
+    regex: 'err\.(?P<file_num>[a-zA-Z])\.\d+\.\d{10}\.log'
+    group_by: 'err\.(?P<value>[a-z]+).[0-9]*.*log'
+    sort_by:
+      - regex_key: file_num
+        sort_type: numeric
 sort_by_numeric:
   type: mock
   ordering_criteria:

diff --git a/receiver/filelogreceiver/README.md b/receiver/filelogreceiver/README.md
@@ -34,7 +34,7 @@ Tails and parses logs from files.
 | `include_file_path_resolved`          | `false`                              | Whether to add the file path after symlinks resolution as the attribute `log.file.path_resolved`.                                                                                                                                                               |
 | `include_file_owner_name`             | `false`                              | Whether to add the file owner name as the attribute `log.file.owner.name`. Not supported for windows.                                                                                                                                                           |
 | `include_file_owner_group_name`       | `false`                              | Whether to add the file group name as the attribute `log.file.owner.group.name`. Not supported for windows.                                                                                                                                                     |
-| `include_file_record_number`            | `false`                              | Whether to add the record number in the file as the attribute `log.file.record_number`.                                                                                                                                                                    |
+| `include_file_record_number`          | `false`                              | Whether to add the record number in the file as the attribute `log.file.record_number`.                                                                                                                                                                         |
 | `poll_interval`                       | 200ms                                | The [duration](#time-parameters) between filesystem polls.                                                                                                                                                                                                      |
 | `fingerprint_size`                    | `1kb`                                | The number of bytes with which to identify a file. The first bytes in the file are used as the fingerprint. Decreasing this value at any point will cause existing fingerprints to forgotten, meaning that all files will be read from the beginning (one time) |
 | `max_log_size`                        | `1MiB`                               | The maximum size of a log entry to read. A log entry will be truncated if it is larger than `max_log_size`. Protects against reading large amounts of data into memory.                                                                                         |
@@ -52,9 +52,10 @@ Tails and parses logs from files.
 | `retry_on_failure.enabled`            | `false`                              | If `true`, the receiver will pause reading a file and attempt to resend the current batch of logs if it encounters an error from downstream components.                                                                                                         |
 | `retry_on_failure.initial_interval`   | `1s`                                 | [Time](#time-parameters) to wait after the first failure before retrying.                                                                                                                                                                                       |
 | `retry_on_failure.max_interval`       | `30s`                                | Upper bound on retry backoff [interval](#time-parameters). Once this value is reached the delay between consecutive retries will remain constant at the specified value.                                                                                        |
-| `retry_on_failure.max_elapsed_time`   | `5m`                                 | Maximum amount of [time](#time-parameters) (including retries) spent trying to send a logs batch to a downstream consumer. Once this value is reached, the data is discarded. Retrying never stops if set to `0`.                                               
+| `retry_on_failure.max_elapsed_time`   | `5m`                                 | Maximum amount of [time](#time-parameters) (including retries) spent trying to send a logs batch to a downstream consumer. Once this value is reached, the data is discarded. Retrying never stops if set to `0`.                                               |
 | `ordering_criteria.regex`             |                                      | Regular expression used for sorting, should contain a named capture groups that are to be used in `regex_key`.                                                                                                                                                  |
-| `ordering_criteria.top_n`             | 1 | The number of files to track when using file ordering. The top N files are tracked after applying the ordering criteria.                                                                                                                                        |
+| `ordering_criteria.gropup_by`         |                                      | Regular expression used for grouping, which is done pre-sorting. Should contain a named capture groups.                                                                                                                                                         |
+| `ordering_criteria.top_n`             | 1                                    | The number of files to track when using file ordering. The top N files are tracked after applying the ordering criteria.                                                                                                                                        |
 | `ordering_criteria.sort_by.sort_type` |                                      | Type of sorting to be performed (e.g., `numeric`, `alphabetical`, `timestamp`, `mtime`)                                                                                                                                                                         |
 | `ordering_criteria.sort_by.location`  |                                      | Relevant if `sort_type` is set to `timestamp`. Defines the location of the timestamp of the file.                                                                                                                                                               |
 | `ordering_criteria.sort_by.format`    |                                      | Relevant if `sort_type` is set to `timestamp`. Defines the strptime format of the timestamp being sorted.                                                                                                                                                       |