From eabe0f8c233e8766f4177b04c6ad7cdb9e69c55d Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 21 Nov 2024 13:06:34 -0500 Subject: [PATCH 01/28] [Filebeat/Filestream] Fix `sourceStore.UpdateIdentifiers` The `sourceStore.UpdateIdentifiers` has always been part of the fileProspector.Init, its purpose is to update the identifiers in the registry if the file identity has changed, however it was generating the wrong key and not updating the in-memory registry (store.ephemeralStore). This commit fixes it and also removes `sourceStore.FixUpIdentifiers` because it is just a working version of `sourceStore.UpdateIdentifiers`. Now there is a single method to manipulate identifiers in the `sourceStore`. --- .../config/filebeat.global.reference.yml.tmpl | 2 + .../internal/input-logfile/prospector.go | 4 -- .../internal/input-logfile/store.go | 54 +++++-------------- filebeat/input/filestream/prospector.go | 7 ++- 4 files changed, 19 insertions(+), 48 deletions(-) diff --git a/filebeat/_meta/config/filebeat.global.reference.yml.tmpl b/filebeat/_meta/config/filebeat.global.reference.yml.tmpl index 0287fb3f9f57..9d0a3c23974e 100644 --- a/filebeat/_meta/config/filebeat.global.reference.yml.tmpl +++ b/filebeat/_meta/config/filebeat.global.reference.yml.tmpl @@ -15,6 +15,8 @@ # batch of events has been published successfully. The default value is 1s. #filebeat.registry.flush: 1s +# The interval which to run the registry clean up +#filebeat.registry.cleanup_interval: 5m # Starting with Filebeat 7.0, the registry uses a new directory format to store # Filebeat state. 
After you upgrade, Filebeat will automatically migrate a 6.x diff --git a/filebeat/input/filestream/internal/input-logfile/prospector.go b/filebeat/input/filestream/internal/input-logfile/prospector.go index 733e55fe26ef..4fc383f315d6 100644 --- a/filebeat/input/filestream/internal/input-logfile/prospector.go +++ b/filebeat/input/filestream/internal/input-logfile/prospector.go @@ -56,10 +56,6 @@ type ProspectorCleaner interface { // The function passed to UpdateIdentifiers must return an empty string if the key // remains the same. UpdateIdentifiers(func(v Value) (string, interface{})) - - // FixUpIdentifiers migrates IDs in the registry from inputs - // that used the deprecated `.global` ID. - FixUpIdentifiers(func(v Value) (string, interface{})) } // Value contains the cursor metadata. diff --git a/filebeat/input/filestream/internal/input-logfile/store.go b/filebeat/input/filestream/internal/input-logfile/store.go index 024ca5c9bfdd..726f4e0c5939 100644 --- a/filebeat/input/filestream/internal/input-logfile/store.go +++ b/filebeat/input/filestream/internal/input-logfile/store.go @@ -212,13 +212,21 @@ func (s *sourceStore) CleanIf(pred func(v Value) bool) { } } -// FixUpIdentifiers copies an existing resource to a new ID and marks the previous one +// UpdateIdentifiers copies an existing resource to a new ID and marks the previous one // for removal. 
-func (s *sourceStore) FixUpIdentifiers(getNewID func(v Value) (string, interface{})) { +func (s *sourceStore) UpdateIdentifiers(getNewID func(v Value) (string, interface{})) { s.store.ephemeralStore.mu.Lock() defer s.store.ephemeralStore.mu.Unlock() for key, res := range s.store.ephemeralStore.table { + // - res.internalState.TTL == 0 is a deleted entry + // - res.internalState.TTL > 0 is an entry that will be removed once the TTL + // is reached + // - res.internalState.TTL == -1 is an entry that will never be removed + if res.internalState.TTL == 0 { + continue + } + if !s.identifier.MatchesInput(key) { continue } @@ -229,7 +237,7 @@ func (s *sourceStore) FixUpIdentifiers(getNewID func(v Value) (string, interface } newKey, updatedMeta := getNewID(res) - if len(newKey) > 0 && res.internalState.TTL > 0 { + if len(newKey) > 0 { if _, ok := s.store.ephemeralStore.table[newKey]; ok { res.lock.Unlock() continue @@ -249,48 +257,10 @@ func (s *sourceStore) FixUpIdentifiers(getNewID func(v Value) (string, interface // Add the new resource to the ephemeralStore so the rest of the // codebase can have access to the new value s.store.ephemeralStore.table[newKey] = r - // Remove the old key from the store s.store.UpdateTTL(res, 0) // aka delete. See store.remove for details s.store.log.Infof("migrated entry in registry from '%s' to '%s'", key, newKey) - } - - res.lock.Unlock() - } -} - -// UpdateIdentifiers copies an existing resource to a new ID and marks the previous one -// for removal. 
-func (s *sourceStore) UpdateIdentifiers(getNewID func(v Value) (string, interface{})) { - s.store.ephemeralStore.mu.Lock() - defer s.store.ephemeralStore.mu.Unlock() - - for key, res := range s.store.ephemeralStore.table { - if !s.identifier.MatchesInput(key) { - continue - } - - if !res.lock.TryLock() { - continue - } - - newKey, updatedMeta := getNewID(res) - if len(newKey) > 0 && res.internalState.TTL > 0 { - if _, ok := s.store.ephemeralStore.table[newKey]; ok { - res.lock.Unlock() - continue - } - - // Pending updates due to events that have not yet been ACKed - // are not included in the copy. Collection on - // the copy start from the last known ACKed position. - // This might lead to data duplication because the harvester - // will pickup from the last ACKed position using the new key - // and the pending updates will affect the entry with the oldKey. - r := res.copyWithNewKey(newKey) - r.cursorMeta = updatedMeta - r.stored = false - s.store.writeState(r) + s.store.log.Infof("migrated entry in registry from '%s' to '%s'. Cursor: %v", key, newKey, r.cursor) } res.lock.Unlock() diff --git a/filebeat/input/filestream/prospector.go b/filebeat/input/filestream/prospector.go index 2bf737a86fd9..5c7402f64dc5 100644 --- a/filebeat/input/filestream/prospector.go +++ b/filebeat/input/filestream/prospector.go @@ -70,7 +70,7 @@ func (p *fileProspector) Init( // If this fileProspector belongs to an input that did not have an ID // this will find its files in the registry and update them to use the // new ID. - globalCleaner.FixUpIdentifiers(func(v loginp.Value) (id string, val interface{}) { + globalCleaner.UpdateIdentifiers(func(v loginp.Value) (id string, val interface{}) { var fm fileMeta err := v.UnpackCursorMeta(&fm) if err != nil { @@ -101,6 +101,9 @@ func (p *fileProspector) Init( } identifierName := p.identifier.Name() + + // If the file identity has changed, update the registry keys so we can keep + // the state. 
cleaner.UpdateIdentifiers(func(v loginp.Value) (string, interface{}) { var fm fileMeta err := v.UnpackCursorMeta(&fm) @@ -114,7 +117,7 @@ func (p *fileProspector) Init( } if fm.IdentifierName != identifierName { - newKey := p.identifier.GetSource(loginp.FSEvent{NewPath: fm.Source, Descriptor: fd}).Name() + newKey := newID(p.identifier.GetSource(loginp.FSEvent{NewPath: fm.Source, Descriptor: fd})) fm.IdentifierName = identifierName return newKey, fm } From a2798fe313db19d5dfc5052d29c59ec00d07f3ea Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 21 Nov 2024 13:59:45 -0500 Subject: [PATCH 02/28] Fix tests --- .../internal/input-logfile/store_test.go | 23 +++++++++++-------- filebeat/input/filestream/prospector_test.go | 6 ++++- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/filebeat/input/filestream/internal/input-logfile/store_test.go b/filebeat/input/filestream/internal/input-logfile/store_test.go index 6f19e1afad7b..2d4f98b5d29b 100644 --- a/filebeat/input/filestream/internal/input-logfile/store_test.go +++ b/filebeat/input/filestream/internal/input-logfile/store_test.go @@ -347,11 +347,11 @@ type testMeta struct { func TestSourceStore_UpdateIdentifiers(t *testing.T) { t.Run("update identifiers when TTL is bigger than zero", func(t *testing.T) { backend := createSampleStore(t, map[string]state{ - "test::key1": { + "test::key1": { // Active resource TTL: 60 * time.Second, Meta: testMeta{IdentifierName: "method"}, }, - "test::key2": { + "test::key2": { // Deleted resource TTL: 0 * time.Second, Meta: testMeta{IdentifierName: "method"}, }, @@ -372,22 +372,25 @@ func TestSourceStore_UpdateIdentifiers(t *testing.T) { return "", nil }) - var newState state - s.persistentStore.Get("test::key1::updated", &newState) + // The persistentStore is a mock that does not consider if a state has + // been removed before returning it, thus allowing us to get Updated + // timestamp from when the resource was deleted. 
+ var deletedState state + s.persistentStore.Get("test::key1", &deletedState) want := map[string]state{ - "test::key1": { - Updated: s.Get("test::key1").internalState.Updated, - TTL: 60 * time.Second, + "test::key1": { // old resource is deleted, TTL must be zero + Updated: deletedState.Updated, + TTL: 0 * time.Second, Meta: map[string]interface{}{"identifiername": "method"}, }, - "test::key2": { + "test::key2": { // Unchanged Updated: s.Get("test::key2").internalState.Updated, TTL: 0 * time.Second, Meta: map[string]interface{}{"identifiername": "method"}, }, - "test::key1::updated": { - Updated: newState.Updated, + "test::key1::updated": { // Updated resource + Updated: s.Get("test::key1::updated").internalState.Updated, TTL: 60 * time.Second, Meta: map[string]interface{}{"identifiername": "something"}, }, diff --git a/filebeat/input/filestream/prospector_test.go b/filebeat/input/filestream/prospector_test.go index 552b4218c784..36a6b2fb1762 100644 --- a/filebeat/input/filestream/prospector_test.go +++ b/filebeat/input/filestream/prospector_test.go @@ -112,11 +112,13 @@ func TestProspector_InitUpdateIdentifiers(t *testing.T) { entries map[string]loginp.Value filesOnDisk map[string]loginp.FileDescriptor expectedUpdatedKeys map[string]string + newKey string }{ "prospector init does not update keys if there are no entries": { entries: nil, filesOnDisk: nil, expectedUpdatedKeys: map[string]string{}, + newKey: "foo", // it isn't used but it must not be empty }, "prospector init does not update keys of not existing files": { entries: map[string]loginp.Value{ @@ -129,6 +131,7 @@ func TestProspector_InitUpdateIdentifiers(t *testing.T) { }, filesOnDisk: nil, expectedUpdatedKeys: map[string]string{}, + newKey: "foo", // it isn't used but it must not be empty }, "prospector init updates keys of existing files": { entries: map[string]loginp.Value{ @@ -143,6 +146,7 @@ func TestProspector_InitUpdateIdentifiers(t *testing.T) { tmpFileName: {Info: file.ExtendFileInfo(fi)}, }, 
expectedUpdatedKeys: map[string]string{"not_path::key1": "path::" + tmpFileName}, + newKey: "path::" + tmpFileName, }, } @@ -155,7 +159,7 @@ func TestProspector_InitUpdateIdentifiers(t *testing.T) { identifier: mustPathIdentifier(false), filewatcher: newMockFileWatcherWithFiles(testCase.filesOnDisk), } - p.Init(testStore, newMockProspectorCleaner(nil), func(loginp.Source) string { return "" }) + p.Init(testStore, newMockProspectorCleaner(nil), func(loginp.Source) string { return testCase.newKey }) assert.EqualValues(t, testCase.expectedUpdatedKeys, testStore.updatedKeys) }) From a4ff07a5668feebd3cd687576b73afa7edeb45b7 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Fri, 22 Nov 2024 18:16:38 -0500 Subject: [PATCH 03/28] Check if source matches the real file This commit checks if 'source' matches the real file by calculating the registry key using the old identifier, if they match, then update the registry. --- .../internal/input-logfile/prospector.go | 3 + .../internal/input-logfile/store.go | 27 +++++--- filebeat/input/filestream/prospector.go | 69 +++++++++++++++++-- .../input/filestream/prospector_creator.go | 6 +- 4 files changed, 86 insertions(+), 19 deletions(-) diff --git a/filebeat/input/filestream/internal/input-logfile/prospector.go b/filebeat/input/filestream/internal/input-logfile/prospector.go index 4fc383f315d6..2f90d440e36d 100644 --- a/filebeat/input/filestream/internal/input-logfile/prospector.go +++ b/filebeat/input/filestream/internal/input-logfile/prospector.go @@ -62,4 +62,7 @@ type ProspectorCleaner interface { type Value interface { // UnpackCursorMeta returns the cursor metadata required by the prospector. 
UnpackCursorMeta(to interface{}) error + + // Key return the registry's key for this resource + Key() string } diff --git a/filebeat/input/filestream/internal/input-logfile/store.go b/filebeat/input/filestream/internal/input-logfile/store.go index 726f4e0c5939..c0031af35cb8 100644 --- a/filebeat/input/filestream/internal/input-logfile/store.go +++ b/filebeat/input/filestream/internal/input-logfile/store.go @@ -219,10 +219,17 @@ func (s *sourceStore) UpdateIdentifiers(getNewID func(v Value) (string, interfac defer s.store.ephemeralStore.mu.Unlock() for key, res := range s.store.ephemeralStore.table { - // - res.internalState.TTL == 0 is a deleted entry - // - res.internalState.TTL > 0 is an entry that will be removed once the TTL + // Entries in the registry are soft deleted, once the gcStore runs, + // they're actually removed from the in-memory registry (ephemeralStore) + // and marked as removed in the registry operations log. So we need + // to skip all entries that were soft deleted. + // + // - res.internalState.TTL == 0: entry has been deleted + // - res.internalState.TTL == -1: entry will never be removed by TTL + // - res.internalState.TTL > 0: entry will be removed once its TTL // is reached - // - res.internalState.TTL == -1 is an entry that will never be removed + // + // If the entry has been deleted, skip it if res.internalState.TTL == 0 { continue } @@ -243,23 +250,17 @@ func (s *sourceStore) UpdateIdentifiers(getNewID func(v Value) (string, interfac continue } - // Pending updates due to events that have not yet been ACKed - // are not included in the copy. Collection on - // the copy start from the last known ACKed position. - // This might lead to data duplication because the harvester - // will pickup from the last ACKed position using the new key - // and the pending updates will affect the entry with the oldKey. 
r := res.copyWithNewKey(newKey) r.cursorMeta = updatedMeta r.stored = false - s.store.writeState(r) + s.store.writeState(r) // writeState only writes to the log file // Add the new resource to the ephemeralStore so the rest of the // codebase can have access to the new value s.store.ephemeralStore.table[newKey] = r + // Remove the old key from the store s.store.UpdateTTL(res, 0) // aka delete. See store.remove for details - s.store.log.Infof("migrated entry in registry from '%s' to '%s'", key, newKey) s.store.log.Infof("migrated entry in registry from '%s' to '%s'. Cursor: %v", key, newKey, r.cursor) } @@ -456,6 +457,10 @@ func (r *resource) UnpackCursorMeta(to interface{}) error { return typeconv.Convert(to, r.cursorMeta) } +func (r *resource) Key() string { + return r.key +} + // syncStateSnapshot returns the current insync state based on already ACKed update operations. func (r *resource) inSyncStateSnapshot() state { return state{ diff --git a/filebeat/input/filestream/prospector.go b/filebeat/input/filestream/prospector.go index 5c7402f64dc5..2ca1a4159599 100644 --- a/filebeat/input/filestream/prospector.go +++ b/filebeat/input/filestream/prospector.go @@ -47,11 +47,29 @@ var ignoreInactiveSettings = map[string]ignoreInactiveType{ ignoreInactiveSinceFirstStartStr: IgnoreInactiveSinceFirstStart, } +var identifiersMap = map[string]fileIdentifier{} + +func init() { + // Initialise a default identifier + for name, factory := range identifierFactories { + if name == inodeMarkerName { + // inode marker requries an specific config we cannot infer. + continue + } + var err error + identifiersMap[name], err = factory(nil) + if err != nil { + panic(fmt.Errorf("cannot create identifier '%s': %w", name, err)) + } + } +} + // fileProspector implements the Prospector interface. // It contains a file scanner which returns file system events. // The FS events then trigger either new Harvester runs or updates // the statestore. 
type fileProspector struct { + logger *logp.Logger filewatcher loginp.FSWatcher identifier fileIdentifier ignoreOlder time.Duration @@ -116,12 +134,53 @@ func (p *fileProspector) Init( return "", fm } - if fm.IdentifierName != identifierName { - newKey := newID(p.identifier.GetSource(loginp.FSEvent{NewPath: fm.Source, Descriptor: fd})) - fm.IdentifierName = identifierName - return newKey, fm + // Return early if: + // - The identifiers are the same + // - The old identifier is fingerprint + // - The old identifier is inode marker + oldIdentifierName := fm.IdentifierName + if oldIdentifierName == identifierName || oldIdentifierName == fingerprintName { + return "", nil } - return "", fm + + // Our current file (source) is in the registry, now we need to ensure + // this registry entry (resource) actually refers to our file. Sources + // are identified by path, however as log files rotate the same path + // can point to different files. + // + // So to ensure we're dealing with the resource from our current file, + // we use the old identifier to generate a registry key for the current + // file we're trying to migrate, if this key matches with the key in the + // registry, then we proceed to update the registry. + registryKey := v.Key() + oldIdentifier, ok := identifiersMap[oldIdentifierName] + if !ok { + // This should never happen, but just in case we properly handle it + // If we cannot find the identifier, move on to the next entry + // some identifiers cannot be migrated + p.logger.Errorf("cannot migrate registry entry from '%s', if the file still exists, it will be re-ingested", oldIdentifierName) + return "", nil + } + // TODO: fix case when old identifier is fingerprint, or just do not handle it + previousIdentifierKey := newID(oldIdentifier.GetSource(loginp.FSEvent{NewPath: fm.Source, Descriptor: fd})) + + // If the registry key and the key generated by the old identifier + // do not match, log it at debug level and do nothing. 
+ if previousIdentifierKey != registryKey { + p.logger.Debugf("registry key: '%s' and previous file identity key: '%s', differ, will not migrate. Source: '%s'", + registryKey, previousIdentifierKey, fm.Source) + // fmt.Printf("registry key: '%s' and previous file identity key: '%s', differ, will not migrate\n", registryKey, previousIdentifierKey) + return "", fm + } + + // The resource matches the file we found in the file system, generate + // a new registry key and return it alongside the updated meta. + newKey := newID(p.identifier.GetSource(loginp.FSEvent{NewPath: fm.Source, Descriptor: fd})) + fm.IdentifierName = identifierName + p.logger.Debugf("registry key: '%s' and previous file identity key: '%s', are the same migrating. Source: '%s'", + registryKey, previousIdentifierKey, fm.Source) + + return newKey, fm }) return nil diff --git a/filebeat/input/filestream/prospector_creator.go b/filebeat/input/filestream/prospector_creator.go index 5142704a614d..91a5e0b30d35 100644 --- a/filebeat/input/filestream/prospector_creator.go +++ b/filebeat/input/filestream/prospector_creator.go @@ -53,9 +53,8 @@ func newProspector(config config) (loginp.Prospector, error) { return nil, fmt.Errorf("error while creating file identifier: %w", err) } - logp.L(). - With("filestream_id", config.ID). 
- Debugf("file identity is set to %s", identifier.Name()) + logger := logp.L().Named("input.filestream").With("filestream_id", config.ID) + logger.Debugf("file identity is set to %s", identifier.Name()) fileprospector := fileProspector{ filewatcher: filewatcher, @@ -64,6 +63,7 @@ func newProspector(config config) (loginp.Prospector, error) { ignoreInactiveSince: config.IgnoreInactive, cleanRemoved: config.CleanRemoved, stateChangeCloser: config.Close.OnStateChange, + logger: logger.Named("prospector"), } if config.Rotation == nil { return &fileprospector, nil From 3ee0e788bae13a26486e80b8c9ad3b78b307e659 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Fri, 6 Dec 2024 17:29:05 -0500 Subject: [PATCH 04/28] Improve conditions to update registry and comments --- .../internal/input-logfile/store.go | 9 +- filebeat/input/filestream/prospector.go | 126 ++++++++++-------- 2 files changed, 77 insertions(+), 58 deletions(-) diff --git a/filebeat/input/filestream/internal/input-logfile/store.go b/filebeat/input/filestream/internal/input-logfile/store.go index c0031af35cb8..a380d15cc245 100644 --- a/filebeat/input/filestream/internal/input-logfile/store.go +++ b/filebeat/input/filestream/internal/input-logfile/store.go @@ -253,14 +253,19 @@ func (s *sourceStore) UpdateIdentifiers(getNewID func(v Value) (string, interfac r := res.copyWithNewKey(newKey) r.cursorMeta = updatedMeta r.stored = false - s.store.writeState(r) // writeState only writes to the log file + // writeState only writes to the log file (disk) + // the write is synchronous + s.store.writeState(r) // Add the new resource to the ephemeralStore so the rest of the // codebase can have access to the new value s.store.ephemeralStore.table[newKey] = r // Remove the old key from the store - s.store.UpdateTTL(res, 0) // aka delete. See store.remove for details + // aka delete. This is also synchronously + // written to the disk. 
+ // See store.remove for details + s.store.UpdateTTL(res, 0) s.store.log.Infof("migrated entry in registry from '%s' to '%s'. Cursor: %v", key, newKey, r.cursor) } diff --git a/filebeat/input/filestream/prospector.go b/filebeat/input/filestream/prospector.go index 2ca1a4159599..168e8e2ae3e8 100644 --- a/filebeat/input/filestream/prospector.go +++ b/filebeat/input/filestream/prospector.go @@ -120,68 +120,82 @@ func (p *fileProspector) Init( identifierName := p.identifier.Name() - // If the file identity has changed, update the registry keys so we can keep - // the state. - cleaner.UpdateIdentifiers(func(v loginp.Value) (string, interface{}) { - var fm fileMeta - err := v.UnpackCursorMeta(&fm) - if err != nil { - return "", nil - } + // If the file identity has changed to fingerprint, update the registry + // keys so we can keep the state. This is only supported from file + // identities that do not require configuration: + // - native (inode + device ID) + // - path + if identifierName == fingerprintName { + cleaner.UpdateIdentifiers(func(v loginp.Value) (string, interface{}) { + var fm fileMeta + err := v.UnpackCursorMeta(&fm) + if err != nil { + return "", nil + } - fd, ok := files[fm.Source] - if !ok { - return "", fm - } + fd, ok := files[fm.Source] + if !ok { + return "", fm + } - // Return early if: - // - The identifiers are the same - // - The old identifier is fingerprint - // - The old identifier is inode marker - oldIdentifierName := fm.IdentifierName - if oldIdentifierName == identifierName || oldIdentifierName == fingerprintName { - return "", nil - } + // Return early (do nothing) if: + // - The identifiers are the same + // - The old identifier is fingerprint + // - The old identifier is inode marker + oldIdentifierName := fm.IdentifierName + if oldIdentifierName == identifierName || + oldIdentifierName == fingerprintName || + oldIdentifierName == inodeMarkerName { + return "", nil + } - // Our current file (source) is in the registry, now we need to 
ensure - // this registry entry (resource) actually refers to our file. Sources - // are identified by path, however as log files rotate the same path - // can point to different files. - // - // So to ensure we're dealing with the resource from our current file, - // we use the old identifier to generate a registry key for the current - // file we're trying to migrate, if this key matches with the key in the - // registry, then we proceed to update the registry. - registryKey := v.Key() - oldIdentifier, ok := identifiersMap[oldIdentifierName] - if !ok { - // This should never happen, but just in case we properly handle it - // If we cannot find the identifier, move on to the next entry - // some identifiers cannot be migrated - p.logger.Errorf("cannot migrate registry entry from '%s', if the file still exists, it will be re-ingested", oldIdentifierName) - return "", nil - } - // TODO: fix case when old identifier is fingerprint, or just do not handle it - previousIdentifierKey := newID(oldIdentifier.GetSource(loginp.FSEvent{NewPath: fm.Source, Descriptor: fd})) + // Our current file (source) is in the registry, now we need to ensure + // this registry entry (resource) actually refers to our file. Sources + // are identified by path, however as log files rotate the same path + // can point to different files. + // + // So to ensure we're dealing with the resource from our current file, + // we use the old identifier to generate a registry key for the current + // file we're trying to migrate, if this key matches with the key in the + // registry, then we proceed to update the registry. + registryKey := v.Key() + oldIdentifier, ok := identifiersMap[oldIdentifierName] + if !ok { + // This should never happen, but just in case we properly handle it. + // If we cannot find the identifier, move on to the next entry + // some identifiers cannot be migrated + p.logger.Errorf( + "old file identity '%s' not found while migrating entry to"+ + "new file identity '%s'. 
If the file still exists, it will be re-ingested", + oldIdentifierName, + identifierName, + ) + return "", nil + } + previousIdentifierKey := newID(oldIdentifier.GetSource( + loginp.FSEvent{ + NewPath: fm.Source, + Descriptor: fd, + })) + + // If the registry key and the key generated by the old identifier + // do not match, log it at debug level and do nothing. + if previousIdentifierKey != registryKey { + p.logger.Debugf("registry key: '%s' and previous file identity key: '%s', differ, will not migrate. Source: '%s'", + registryKey, previousIdentifierKey, fm.Source) + return "", fm + } - // If the registry key and the key generated by the old identifier - // do not match, log it at debug level and do nothing. - if previousIdentifierKey != registryKey { - p.logger.Debugf("registry key: '%s' and previous file identity key: '%s', differ, will not migrate. Source: '%s'", + // The resource matches the file we found in the file system, generate + // a new registry key and return it alongside the updated meta. + newKey := newID(p.identifier.GetSource(loginp.FSEvent{NewPath: fm.Source, Descriptor: fd})) + fm.IdentifierName = identifierName + p.logger.Infof("registry key: '%s' and previous file identity key: '%s', are the same, migrating. Source: '%s'", registryKey, previousIdentifierKey, fm.Source) - // fmt.Printf("registry key: '%s' and previous file identity key: '%s', differ, will not migrate\n", registryKey, previousIdentifierKey) - return "", fm - } - - // The resource matches the file we found in the file system, generate - // a new registry key and return it alongside the updated meta. - newKey := newID(p.identifier.GetSource(loginp.FSEvent{NewPath: fm.Source, Descriptor: fd})) - fm.IdentifierName = identifierName - p.logger.Debugf("registry key: '%s' and previous file identity key: '%s', are the same migrating. 
Source: '%s'", - registryKey, previousIdentifierKey, fm.Source) - return newKey, fm - }) + return newKey, fm + }) + } return nil } From 4bcebe766308ab5a7191535238c64a3ac6852ca8 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Fri, 6 Dec 2024 17:29:55 -0500 Subject: [PATCH 05/28] Fix exiting tests --- filebeat/input/filestream/prospector_test.go | 27 ++++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/filebeat/input/filestream/prospector_test.go b/filebeat/input/filestream/prospector_test.go index 36a6b2fb1762..b40e2374ed22 100644 --- a/filebeat/input/filestream/prospector_test.go +++ b/filebeat/input/filestream/prospector_test.go @@ -22,7 +22,6 @@ import ( "context" "fmt" "io/fs" - "io/ioutil" "os" "sync" "testing" @@ -54,7 +53,8 @@ func TestProspector_InitCleanIfRemoved(t *testing.T) { "prospector init with clean_removed disabled with entries": { entries: map[string]loginp.Value{ "key1": &mockUnpackValue{ - fileMeta{ + key: "key1", + fileMeta: fileMeta{ Source: "/no/such/path", IdentifierName: "path", }, @@ -67,7 +67,8 @@ func TestProspector_InitCleanIfRemoved(t *testing.T) { "prospector init with clean_removed enabled with entries": { entries: map[string]loginp.Value{ "key1": &mockUnpackValue{ - fileMeta{ + key: "key1", + fileMeta: fileMeta{ Source: "/no/such/path", IdentifierName: "path", }, @@ -97,7 +98,7 @@ func TestProspector_InitCleanIfRemoved(t *testing.T) { } func TestProspector_InitUpdateIdentifiers(t *testing.T) { - f, err := ioutil.TempFile("", "existing_file") + f, err := os.CreateTemp("", "existing_file") if err != nil { t.Fatalf("cannot create temp file") } @@ -118,12 +119,12 @@ func TestProspector_InitUpdateIdentifiers(t *testing.T) { entries: nil, filesOnDisk: nil, expectedUpdatedKeys: map[string]string{}, - newKey: "foo", // it isn't used but it must not be empty }, "prospector init does not update keys of not existing files": { entries: map[string]loginp.Value{ "not_path::key1": &mockUnpackValue{ - fileMeta{ + 
key: "not_path::key1", + fileMeta: fileMeta{ Source: "/no/such/path", IdentifierName: "not_path", }, @@ -131,12 +132,12 @@ func TestProspector_InitUpdateIdentifiers(t *testing.T) { }, filesOnDisk: nil, expectedUpdatedKeys: map[string]string{}, - newKey: "foo", // it isn't used but it must not be empty }, - "prospector init updates keys of existing files": { + "prospector init does not update keys if new file identity is not fingerprint": { entries: map[string]loginp.Value{ "not_path::key1": &mockUnpackValue{ - fileMeta{ + key: "not_path::key1", + fileMeta: fileMeta{ Source: tmpFileName, IdentifierName: "not_path", }, @@ -145,8 +146,7 @@ func TestProspector_InitUpdateIdentifiers(t *testing.T) { filesOnDisk: map[string]loginp.FileDescriptor{ tmpFileName: {Info: file.ExtendFileInfo(fi)}, }, - expectedUpdatedKeys: map[string]string{"not_path::key1": "path::" + tmpFileName}, - newKey: "path::" + tmpFileName, + expectedUpdatedKeys: map[string]string{}, }, } @@ -604,12 +604,17 @@ func (mu *mockMetadataUpdater) Remove(s loginp.Source) error { type mockUnpackValue struct { fileMeta + key string } func (u *mockUnpackValue) UnpackCursorMeta(to interface{}) error { return typeconv.Convert(to, u.fileMeta) } +func (u *mockUnpackValue) Key() string { + return u.key +} + type mockProspectorCleaner struct { available map[string]loginp.Value cleanedKeys []string From 12ac2f350a62c63296d7706b072bc3c56c453f79 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Fri, 6 Dec 2024 18:41:46 -0500 Subject: [PATCH 06/28] Working test A working test that migrated the file identity from inode to fingerprint. 
--- filebeat/input/filestream/prospector_test.go | 78 +++++++++++++++++++- 1 file changed, 76 insertions(+), 2 deletions(-) diff --git a/filebeat/input/filestream/prospector_test.go b/filebeat/input/filestream/prospector_test.go index b40e2374ed22..9610fba59c24 100644 --- a/filebeat/input/filestream/prospector_test.go +++ b/filebeat/input/filestream/prospector_test.go @@ -23,11 +23,13 @@ import ( "fmt" "io/fs" "os" + "path/filepath" "sync" "testing" "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" loginp "github.com/elastic/beats/v7/filebeat/input/filestream/internal/input-logfile" input "github.com/elastic/beats/v7/filebeat/input/v2" @@ -159,13 +161,85 @@ func TestProspector_InitUpdateIdentifiers(t *testing.T) { identifier: mustPathIdentifier(false), filewatcher: newMockFileWatcherWithFiles(testCase.filesOnDisk), } - p.Init(testStore, newMockProspectorCleaner(nil), func(loginp.Source) string { return testCase.newKey }) - + err := p.Init(testStore, newMockProspectorCleaner(nil), func(loginp.Source) string { return testCase.newKey }) + require.NoError(t, err, "prospector Init must succeed") assert.EqualValues(t, testCase.expectedUpdatedKeys, testStore.updatedKeys) }) } } +func TestMigrateRegistryToFingerprint(t *testing.T) { + fullPath, err := filepath.Abs(filepath.Join("testdata", "log.log")) + if err != nil { + t.Fatalf("cannot get absolute path from test file: %s", err) + } + f, err := os.Open(fullPath) + if err != nil { + t.Fatalf("cannot open test file") + } + defer f.Close() + tmpFileName := f.Name() + fi, err := f.Stat() + + fd := loginp.FileDescriptor{ + Filename: tmpFileName, + Info: file.ExtendFileInfo(fi), + Fingerprint: "the fingerprint from this file", + } + + inodeIdentifier, _ := newINodeDeviceIdentifier(nil) + fingerprintIdentifier, _ := newFingerprintIdentifier(nil) + newIDFunc := func(s loginp.Source) string { + return "test-input-" + s.Name() + } + fsEvent := loginp.FSEvent{ + OldPath: fullPath, + NewPath: 
fullPath, + Op: loginp.OpCreate, + Descriptor: fd, + } + + registryKey := newIDFunc(inodeIdentifier.GetSource(fsEvent)) + expectedKey := newIDFunc(fingerprintIdentifier.GetSource(fsEvent)) + + entries := map[string]loginp.Value{ + registryKey: &mockUnpackValue{ + key: registryKey, + fileMeta: fileMeta{ + Source: fullPath, + IdentifierName: nativeName, + }, + }, + } + + testStore := newMockProspectorCleaner(entries) + + filesOnDisk := map[string]loginp.FileDescriptor{ + tmpFileName: fd, + } + + p := fileProspector{ + logger: logp.L(), + identifier: fingerprintIdentifier, + filewatcher: newMockFileWatcherWithFiles(filesOnDisk), + } + + err = p.Init( + testStore, + newMockProspectorCleaner(nil), + newIDFunc, + ) + require.NoError(t, err, "prospector Init must succeed") + // testStore.updatedKeys is in the format + // oldKey -> newKey + assert.Equal( + t, + map[string]string{ + registryKey: "test-input-fingerprint::the fingerprint from this file"}, + testStore.updatedKeys, + expectedKey) +} + func TestProspectorNewAndUpdatedFiles(t *testing.T) { minuteAgo := time.Now().Add(-1 * time.Minute) From 57e61299ae09a7ea1796869ab5d9066ae8a1521e Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Mon, 9 Dec 2024 15:46:49 -0500 Subject: [PATCH 07/28] Run mage check and add all generated files --- filebeat/filebeat.reference.yml | 2 ++ filebeat/include/list.go | 1 + 2 files changed, 3 insertions(+) diff --git a/filebeat/filebeat.reference.yml b/filebeat/filebeat.reference.yml index 14e9f276fb49..0d894c921e94 100644 --- a/filebeat/filebeat.reference.yml +++ b/filebeat/filebeat.reference.yml @@ -1260,6 +1260,8 @@ filebeat.inputs: # batch of events has been published successfully. The default value is 1s. #filebeat.registry.flush: 1s +# The interval which to run the registry clean up +#filebeat.registry.cleanup_interval: 5m # Starting with Filebeat 7.0, the registry uses a new directory format to store # Filebeat state. 
After you upgrade, Filebeat will automatically migrate a 6.x diff --git a/filebeat/include/list.go b/filebeat/include/list.go index e2a656a2a856..ae05c332eaa4 100644 --- a/filebeat/include/list.go +++ b/filebeat/include/list.go @@ -28,6 +28,7 @@ import ( // Import packages that perform 'func init()'. _ "github.com/elastic/beats/v7/filebeat/input" _ "github.com/elastic/beats/v7/filebeat/input/container" + _ "github.com/elastic/beats/v7/filebeat/input/filestream" _ "github.com/elastic/beats/v7/filebeat/input/log" _ "github.com/elastic/beats/v7/filebeat/input/mqtt" _ "github.com/elastic/beats/v7/filebeat/input/redis" From 2de77ca56a783b5455faa562d7aa5200e0adde4a Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Mon, 9 Dec 2024 17:01:13 -0500 Subject: [PATCH 08/28] Add unit tests for all common cases --- filebeat/input/filestream/prospector_test.go | 151 ++++++++++++++----- 1 file changed, 112 insertions(+), 39 deletions(-) diff --git a/filebeat/input/filestream/prospector_test.go b/filebeat/input/filestream/prospector_test.go index 9610fba59c24..abbf35284146 100644 --- a/filebeat/input/filestream/prospector_test.go +++ b/filebeat/input/filestream/prospector_test.go @@ -35,6 +35,7 @@ import ( input "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/common/file" "github.com/elastic/beats/v7/libbeat/common/transform/typeconv" + conf "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/go-concert/unison" ) @@ -169,11 +170,23 @@ func TestProspector_InitUpdateIdentifiers(t *testing.T) { } func TestMigrateRegistryToFingerprint(t *testing.T) { - fullPath, err := filepath.Abs(filepath.Join("testdata", "log.log")) + const mockFingerprint = "the fingerprint from this file" + const mockInputPrefix = "test-input" + + // We need an empty file as inode marker for the + // 'inode marker' file identity + inodeMarkerFile, err := os.CreateTemp(t.TempDir(), "test-inode-marker") + if err != nil { 
+ t.Fatalf("cannot create inode marker: '%s'", err) + } + inodeMarkerPath := inodeMarkerFile.Name() + inodeMarkerFile.Close() + + logFileFullPath, err := filepath.Abs(filepath.Join("testdata", "log.log")) if err != nil { t.Fatalf("cannot get absolute path from test file: %s", err) } - f, err := os.Open(fullPath) + f, err := os.Open(logFileFullPath) if err != nil { t.Fatalf("cannot open test file") } @@ -184,60 +197,120 @@ func TestMigrateRegistryToFingerprint(t *testing.T) { fd := loginp.FileDescriptor{ Filename: tmpFileName, Info: file.ExtendFileInfo(fi), - Fingerprint: "the fingerprint from this file", + Fingerprint: mockFingerprint, } - inodeIdentifier, _ := newINodeDeviceIdentifier(nil) fingerprintIdentifier, _ := newFingerprintIdentifier(nil) + nativeIdentifier, _ := newINodeDeviceIdentifier(nil) + pathIdentifier, _ := newPathIdentifier(nil) + inodeIdentifier, err := newINodeMarkerIdentifier( + conf.MustNewConfigFrom(map[string]any{ + "path": inodeMarkerPath, + }), + ) + newIDFunc := func(s loginp.Source) string { - return "test-input-" + s.Name() + return mockInputPrefix + "-" + s.Name() } + fsEvent := loginp.FSEvent{ - OldPath: fullPath, - NewPath: fullPath, + OldPath: logFileFullPath, + NewPath: logFileFullPath, Op: loginp.OpCreate, Descriptor: fd, } - registryKey := newIDFunc(inodeIdentifier.GetSource(fsEvent)) - expectedKey := newIDFunc(fingerprintIdentifier.GetSource(fsEvent)) + expectedNewKey := newIDFunc(fingerprintIdentifier.GetSource(fsEvent)) - entries := map[string]loginp.Value{ - registryKey: &mockUnpackValue{ - key: registryKey, - fileMeta: fileMeta{ - Source: fullPath, - IdentifierName: nativeName, - }, - }, + if err != nil { + t.Fatalf("cannot create inodeMarkerIdentifier: %s", err) } - testStore := newMockProspectorCleaner(entries) + testCases := map[string]struct { + oldIdentifier fileIdentifier + newIdentifier fileIdentifier + expectRegistryMigration bool + }{ + "inode to fingerprint succeeds": { + oldIdentifier: nativeIdentifier, + 
newIdentifier: fingerprintIdentifier, + expectRegistryMigration: true, + }, + "path to fingerprint succeeds": { + oldIdentifier: pathIdentifier, + newIdentifier: fingerprintIdentifier, + expectRegistryMigration: true, + }, + "inode marker to fingerprint fails": { + oldIdentifier: inodeIdentifier, + newIdentifier: fingerprintIdentifier, + }, + "fingerprint to fingerprint fails": { + oldIdentifier: fingerprintIdentifier, + newIdentifier: fingerprintIdentifier, + }, - filesOnDisk := map[string]loginp.FileDescriptor{ - tmpFileName: fd, + // If the new identifier is not fingerprint, it will always fail. + // So we only test a couple of combinations + "fingerprint to native fails": { + oldIdentifier: fingerprintIdentifier, + newIdentifier: nativeIdentifier, + }, + "path to native fails": { + oldIdentifier: pathIdentifier, + newIdentifier: nativeIdentifier, + }, } - p := fileProspector{ - logger: logp.L(), - identifier: fingerprintIdentifier, - filewatcher: newMockFileWatcherWithFiles(filesOnDisk), - } + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + oldKey := newIDFunc(tc.oldIdentifier.GetSource(fsEvent)) + entries := map[string]loginp.Value{ + oldKey: &mockUnpackValue{ + key: oldKey, + fileMeta: fileMeta{ + Source: logFileFullPath, + IdentifierName: tc.oldIdentifier.Name(), + }, + }, + } - err = p.Init( - testStore, - newMockProspectorCleaner(nil), - newIDFunc, - ) - require.NoError(t, err, "prospector Init must succeed") - // testStore.updatedKeys is in the format - // oldKey -> newKey - assert.Equal( - t, - map[string]string{ - registryKey: "test-input-fingerprint::the fingerprint from this file"}, - testStore.updatedKeys, - expectedKey) + testStore := newMockProspectorCleaner(entries) + filesOnDisk := map[string]loginp.FileDescriptor{ + tmpFileName: fd, + } + + p := fileProspector{ + logger: logp.L(), + identifier: tc.newIdentifier, + filewatcher: newMockFileWatcherWithFiles(filesOnDisk), + } + + err = p.Init( + testStore, + 
newMockProspectorCleaner(nil), + newIDFunc, + ) + require.NoError(t, err, "prospector Init must succeed") + // testStore.updatedKeys is in the format + // oldKey -> newKey + + if tc.expectRegistryMigration { + assert.Equal( + t, + map[string]string{ + oldKey: expectedNewKey}, + testStore.updatedKeys, + "the registry entries were not correctly migrated") + } else { + assert.Equal( + t, + map[string]string{}, + testStore.updatedKeys, + "expecting no migration") + } + }) + } } func TestProspectorNewAndUpdatedFiles(t *testing.T) { From c1915a4ab7a411931aeabb90bb66261a858ca289 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Tue, 10 Dec 2024 13:00:49 -0500 Subject: [PATCH 09/28] Add integration tests --- filebeat/input/filestream/prospector.go | 134 +++++++------- filebeat/input/filestream/testdata/log.log | 10 + filebeat/tests/integration/filestream_test.go | 175 ++++++++++++++++++ .../tests/integration/testdata/inodeMarker | 1 + libbeat/tests/integration/framework.go | 5 + 5 files changed, 259 insertions(+), 66 deletions(-) create mode 100644 filebeat/input/filestream/testdata/log.log create mode 100644 filebeat/tests/integration/testdata/inodeMarker diff --git a/filebeat/input/filestream/prospector.go b/filebeat/input/filestream/prospector.go index 168e8e2ae3e8..cc6bf84537a7 100644 --- a/filebeat/input/filestream/prospector.go +++ b/filebeat/input/filestream/prospector.go @@ -125,77 +125,79 @@ func (p *fileProspector) Init( // identities that do not require configuration: // - native (inode + device ID) // - path - if identifierName == fingerprintName { - cleaner.UpdateIdentifiers(func(v loginp.Value) (string, interface{}) { - var fm fileMeta - err := v.UnpackCursorMeta(&fm) - if err != nil { - return "", nil - } - - fd, ok := files[fm.Source] - if !ok { - return "", fm - } + if identifierName != fingerprintName { + p.logger.Debugf("file identity is '%s', will not migrate registry", identifierName) + return nil + } + cleaner.UpdateIdentifiers(func(v loginp.Value) 
(string, interface{}) { + var fm fileMeta + err := v.UnpackCursorMeta(&fm) + if err != nil { + return "", nil + } - // Return early (do nothing) if: - // - The identifiers are the same - // - The old identifier is fingerprint - // - The old identifier is inode marker - oldIdentifierName := fm.IdentifierName - if oldIdentifierName == identifierName || - oldIdentifierName == fingerprintName || - oldIdentifierName == inodeMarkerName { - return "", nil - } + fd, ok := files[fm.Source] + if !ok { + return "", fm + } - // Our current file (source) is in the registry, now we need to ensure - // this registry entry (resource) actually refers to our file. Sources - // are identified by path, however as log files rotate the same path - // can point to different files. - // - // So to ensure we're dealing with the resource from our current file, - // we use the old identifier to generate a registry key for the current - // file we're trying to migrate, if this key matches with the key in the - // registry, then we proceed to update the registry. - registryKey := v.Key() - oldIdentifier, ok := identifiersMap[oldIdentifierName] - if !ok { - // This should never happen, but just in case we properly handle it. - // If we cannot find the identifier, move on to the next entry - // some identifiers cannot be migrated - p.logger.Errorf( - "old file identity '%s' not found while migrating entry to"+ - "new file identity '%s'. If the file still exists, it will be re-ingested", - oldIdentifierName, - identifierName, - ) - return "", nil - } - previousIdentifierKey := newID(oldIdentifier.GetSource( - loginp.FSEvent{ - NewPath: fm.Source, - Descriptor: fd, - })) - - // If the registry key and the key generated by the old identifier - // do not match, log it at debug level and do nothing. - if previousIdentifierKey != registryKey { - p.logger.Debugf("registry key: '%s' and previous file identity key: '%s', differ, will not migrate. 
Source: '%s'", - registryKey, previousIdentifierKey, fm.Source) - return "", fm - } + // Return early (do nothing) if: + // - The identifiers are the same + // - The old identifier is fingerprint + // - The old identifier is inode marker + oldIdentifierName := fm.IdentifierName + if oldIdentifierName == identifierName || + oldIdentifierName == fingerprintName || + oldIdentifierName == inodeMarkerName { + return "", nil + } - // The resource matches the file we found in the file system, generate - // a new registry key and return it alongside the updated meta. - newKey := newID(p.identifier.GetSource(loginp.FSEvent{NewPath: fm.Source, Descriptor: fd})) - fm.IdentifierName = identifierName - p.logger.Infof("registry key: '%s' and previous file identity key: '%s', are the same, migrating. Source: '%s'", + // Our current file (source) is in the registry, now we need to ensure + // this registry entry (resource) actually refers to our file. Sources + // are identified by path, however as log files rotate the same path + // can point to different files. + // + // So to ensure we're dealing with the resource from our current file, + // we use the old identifier to generate a registry key for the current + // file we're trying to migrate, if this key matches with the key in the + // registry, then we proceed to update the registry. + registryKey := v.Key() + oldIdentifier, ok := identifiersMap[oldIdentifierName] + if !ok { + // This should never happen, but just in case we properly handle it. + // If we cannot find the identifier, move on to the next entry + // some identifiers cannot be migrated + p.logger.Errorf( + "old file identity '%s' not found while migrating entry to"+ + "new file identity '%s'. 
If the file still exists, it will be re-ingested", + oldIdentifierName, + identifierName, + ) + return "", nil + } + previousIdentifierKey := newID(oldIdentifier.GetSource( + loginp.FSEvent{ + NewPath: fm.Source, + Descriptor: fd, + })) + + // If the registry key and the key generated by the old identifier + // do not match, log it at debug level and do nothing. + if previousIdentifierKey != registryKey { + p.logger.Debugf("registry key: '%s' and previous file identity key: '%s', differ, will not migrate. Source: '%s'", registryKey, previousIdentifierKey, fm.Source) + return "", fm + } - return newKey, fm - }) - } + // The resource matches the file we found in the file system, generate + // a new registry key and return it alongside the updated meta. + newKey := newID(p.identifier.GetSource(loginp.FSEvent{NewPath: fm.Source, Descriptor: fd})) + fm.IdentifierName = identifierName + p.logger.Infof("registry key: '%s' and previous file identity key: '%s', are the same, migrating. Source: '%s'", + registryKey, previousIdentifierKey, fm.Source) + + return newKey, fm + }) return nil } diff --git a/filebeat/input/filestream/testdata/log.log b/filebeat/input/filestream/testdata/log.log new file mode 100644 index 000000000000..733afc5a1aaa --- /dev/null +++ b/filebeat/input/filestream/testdata/log.log @@ -0,0 +1,10 @@ +51.157.82.254 - collins3480 [06/Dec/2024:17:03:34 -0500] "GET /enable/transparent HTTP/2.0" 503 29836 +128.72.132.219 - - [06/Dec/2024:17:03:34 -0500] "PATCH /redefine/paradigms/front-end/synergies HTTP/2.0" 200 2307 +153.167.184.78 - - [06/Dec/2024:17:03:34 -0500] "HEAD /leading-edge/interactive/interactive/one-to-one HTTP/2.0" 204 18593 +175.195.94.204 - - [06/Dec/2024:17:03:34 -0500] "PUT /incentivize HTTP/2.0" 301 3998 +235.228.211.66 - hoppe3344 [06/Dec/2024:17:03:34 -0500] "DELETE /proactive/customized/action-items/killer HTTP/2.0" 203 24605 +6.175.232.33 - - [06/Dec/2024:17:03:34 -0500] "HEAD /extensible/productize/b2b HTTP/1.0" 503 15893 
+146.190.210.171 - - [06/Dec/2024:17:03:34 -0500] "HEAD /architect/embrace/evolve HTTP/1.0" 502 9833 +224.125.203.225 - - [06/Dec/2024:17:03:34 -0500] "DELETE /turn-key/infrastructures/vortals HTTP/1.0" 100 17062 +194.157.121.128 - nicolas3550 [06/Dec/2024:17:03:34 -0500] "PATCH /vortals/scalable/experiences/deploy HTTP/1.1" 503 8034 +88.58.87.19 - - [06/Dec/2024:17:03:34 -0500] "GET /vertical/schemas HTTP/2.0" 405 2034 diff --git a/filebeat/tests/integration/filestream_test.go b/filebeat/tests/integration/filestream_test.go index 45cf99fbfb61..048abd6a7e03 100644 --- a/filebeat/tests/integration/filestream_test.go +++ b/filebeat/tests/integration/filestream_test.go @@ -21,6 +21,7 @@ package integration import ( "fmt" + "os" "path" "path/filepath" "testing" @@ -271,3 +272,177 @@ logging: 10*time.Second, "Filebeat did log a validation error") } + +func TestFilestreamCanMigrateIdentity(t *testing.T) { + cfgTemplate := ` +filebeat.inputs: + - type: filestream + id: "test-migrate-ID" + paths: + - %s +%s + +filebeat.registry: + flush: 0s + +queue.mem: + flush.timeout: 0s + +path.home: %s + +output.file: + path: ${path.home} + filename: "output-file" + rotate_on_startup: false + +logging: + level: debug + selectors: + - input + - input.filestream + - input.filestream.prospector + metrics: + enabled: false +` + + nativeCfg := ` + file_identity.native: ~ +` + pathCfg := ` + file_identity.path: ~ +` + fingerprintCfg := ` + file_identity.fingerprint: ~ + prospector: + scanner: + fingerprint.enabled: true + check_interval: 0.1s +` + inodeMarkerPath, err := filepath.Abs(filepath.Join("testdata", "inodeMarker")) + if err != nil { + t.Fatalf("cannot get absolute path from inode marker: %s", err) + } + inodeMarkerCfg := " file_identity.inode_marker.path: " + inodeMarkerPath + "\n" + + testCases := map[string]struct { + oldIdentityCfg string + newIdentityCfg string + notMigrateMsg string + expectMigration bool + }{ + "native to fingerprint": { + oldIdentityCfg: nativeCfg, + 
newIdentityCfg: fingerprintCfg, + expectMigration: true, + }, + + "path to fingerprint": { + oldIdentityCfg: pathCfg, + newIdentityCfg: fingerprintCfg, + expectMigration: true, + }, + + "inode marker to fingerprint": { + oldIdentityCfg: inodeMarkerCfg, + newIdentityCfg: fingerprintCfg, + expectMigration: false, + }, + + "path to native": { + oldIdentityCfg: pathCfg, + newIdentityCfg: nativeCfg, + expectMigration: false, + notMigrateMsg: "file identity is 'native', will not migrate registry", + }, + } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + + filebeat := integration.NewBeat( + t, + "filebeat", + "../../filebeat.test", + ) + workDir := filebeat.TempDir() + + logFilepath := filepath.Join(workDir, "log.log") + integration.GenerateLogFile(t, logFilepath, 25, false) + + cfgYAML := fmt.Sprintf(cfgTemplate, logFilepath, tc.oldIdentityCfg, workDir) + filebeat.WriteConfigFile(cfgYAML) + filebeat.Start() + + // Wait for the file to be fully ingested + eofMsg := fmt.Sprintf("End of file reached: %s; Backoff now.", logFilepath) + filebeat.WaitForLogs(eofMsg, time.Second*10, "EOF was not reached") + publishedEvents := filebeat.CountFileLines(filepath.Join(workDir, "output-file*")) + if publishedEvents != 25 { + t.Fatalf("expecting 25 published events, got %d instead", publishedEvents) + } + filebeat.Stop() + + if err := os.Truncate(filebeat.ConfigFilePath(), 0); err != nil { + t.Fatalf("cannot truncate Filebeat's configuration file: %s", err) + } + + newCfg := fmt.Sprintf(cfgTemplate, logFilepath, tc.newIdentityCfg, workDir) + if err := os.WriteFile(filebeat.ConfigFilePath(), []byte(newCfg), 0o644); err != nil { + t.Fatalf("cannot write new configuration file: %s", err) + } + + filebeat.Start() + + // The happy path is to migrate keys, so we assert it first + if tc.expectMigration { + // Test the case where the registry migration happens + migratingMsg := fmt.Sprintf("are the same, migrating. 
Source: '%s'", logFilepath) + filebeat.WaitForLogs(migratingMsg, time.Second*5, "prospector did not migrate registry entry") + filebeat.WaitForLogs("migrated entry in registry from", time.Second*10, "store did not update registry key") + filebeat.WaitForLogs(eofMsg, time.Second*10, "EOF was not reached the second time") + if publishedEvents != 25 { + t.Fatalf("expecting 25 published events after file migration, got %d instead", publishedEvents) + } + + // Ingest more data to ensure the offset was migrated + integration.GenerateLogFile(t, logFilepath, 17, true) + filebeat.WaitForLogs(eofMsg, time.Second*5, "EOF was not reached the third time") + + publishedEvents = filebeat.CountFileLines(filepath.Join(workDir, "output-file*")) + if publishedEvents != 42 { + t.Fatalf("expecting 42 published events after file migration, got %d instead", publishedEvents) + } + + return + } + + // Another option is for no keys to be migrated because the current + // file identity is not fingerprint + if tc.notMigrateMsg != "" { + filebeat.WaitForLogs(tc.notMigrateMsg, time.Second*5, "the registry should not have been migrated") + } + + // The last thing to test when there is no migration is to assert + // the file has been fully re-ingested because the file identity + // changed + filebeat.WaitForLogs(eofMsg, time.Second*10, "EOF was not reached the second time") + publishedEvents = filebeat.CountFileLines(filepath.Join(workDir, "output-file*")) + if publishedEvents != 50 { + t.Fatalf("expecting 50 published when there was no migration, got %d instead", publishedEvents) + } + + // Ingest more data to ensure the offset is correctly tracked + integration.GenerateLogFile(t, logFilepath, 10, true) + filebeat.WaitForLogs(eofMsg, time.Second*5, "EOF was not reached the third time") + + publishedEvents = filebeat.CountFileLines(filepath.Join(workDir, "output-file*")) + if publishedEvents != 60 { + t.Fatalf( + "expecting 60 published events after re-ingestion and more"+ + " data being added, 
got %d instead", + publishedEvents, + ) + } + }) + } +} diff --git a/filebeat/tests/integration/testdata/inodeMarker b/filebeat/tests/integration/testdata/inodeMarker new file mode 100644 index 000000000000..302beb58d917 --- /dev/null +++ b/filebeat/tests/integration/testdata/inodeMarker @@ -0,0 +1 @@ +Inode marker is any existing file for the 'inode_marker' file identity. \ No newline at end of file diff --git a/libbeat/tests/integration/framework.go b/libbeat/tests/integration/framework.go index 904fc1e302a9..186d8483f9ff 100644 --- a/libbeat/tests/integration/framework.go +++ b/libbeat/tests/integration/framework.go @@ -994,3 +994,8 @@ func (b *BeatProc) CountFileLines(glob string) int { return bytes.Count(data, []byte{'\n'}) } + +// ConfigFilePath returns the config file path +func (b *BeatProc) ConfigFilePath() string { + return b.configFile +} From 6f33fabd03f1c5bc90be279d2e60e2f9cfeb5efa Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Tue, 10 Dec 2024 14:49:25 -0500 Subject: [PATCH 10/28] Clean up test config --- filebeat/tests/integration/filestream_test.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/filebeat/tests/integration/filestream_test.go b/filebeat/tests/integration/filestream_test.go index 048abd6a7e03..1bb9d630c632 100644 --- a/filebeat/tests/integration/filestream_test.go +++ b/filebeat/tests/integration/filestream_test.go @@ -282,9 +282,6 @@ filebeat.inputs: - %s %s -filebeat.registry: - flush: 0s - queue.mem: flush.timeout: 0s From 9bd1bf67657b3ac4cf4296860bd455003f459c05 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Tue, 10 Dec 2024 14:53:57 -0500 Subject: [PATCH 11/28] fix existing tests --- filebeat/input/filestream/prospector_test.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/filebeat/input/filestream/prospector_test.go b/filebeat/input/filestream/prospector_test.go index abbf35284146..681c7b74c8d0 100644 --- a/filebeat/input/filestream/prospector_test.go +++ b/filebeat/input/filestream/prospector_test.go @@
-89,6 +89,7 @@ func TestProspector_InitCleanIfRemoved(t *testing.T) { t.Run(name, func(t *testing.T) { testStore := newMockProspectorCleaner(testCase.entries) p := fileProspector{ + logger: logp.L(), identifier: mustPathIdentifier(false), cleanRemoved: testCase.cleanRemoved, filewatcher: newMockFileWatcherWithFiles(testCase.filesOnDisk), @@ -159,6 +160,7 @@ func TestProspector_InitUpdateIdentifiers(t *testing.T) { t.Run(name, func(t *testing.T) { testStore := newMockProspectorCleaner(testCase.entries) p := fileProspector{ + logger: logp.L(), identifier: mustPathIdentifier(false), filewatcher: newMockFileWatcherWithFiles(testCase.filesOnDisk), } @@ -397,6 +399,7 @@ func TestProspectorNewAndUpdatedFiles(t *testing.T) { t.Run(name, func(t *testing.T) { p := fileProspector{ + logger: logp.L(), filewatcher: newMockFileWatcher(test.events, len(test.events)), identifier: mustPathIdentifier(false), ignoreOlder: test.ignoreOlder, @@ -434,6 +437,7 @@ func TestProspectorHarvesterUpdateIgnoredFiles(t *testing.T) { filewatcher := newMockFileWatcher([]loginp.FSEvent{eventCreate}, 2) p := fileProspector{ + logger: logp.L(), filewatcher: filewatcher, identifier: mustPathIdentifier(false), ignoreOlder: 10 * time.Second, @@ -498,6 +502,7 @@ func TestProspectorDeletedFile(t *testing.T) { t.Run(name, func(t *testing.T) { p := fileProspector{ + logger: logp.L(), filewatcher: newMockFileWatcher(test.events, len(test.events)), identifier: mustPathIdentifier(false), cleanRemoved: test.cleanRemoved, @@ -579,6 +584,7 @@ func TestProspectorRenamedFile(t *testing.T) { t.Run(name, func(t *testing.T) { p := fileProspector{ + logger: logp.L(), filewatcher: newMockFileWatcher(test.events, len(test.events)), identifier: mustPathIdentifier(test.trackRename), stateChangeCloser: stateChangeCloserConfig{Renamed: test.closeRenamed}, @@ -851,6 +857,7 @@ func TestOnRenameFileIdentity(t *testing.T) { for k, tc := range testCases { t.Run(k, func(t *testing.T) { p := fileProspector{ + logger: logp.L(), 
filewatcher: newMockFileWatcher(tc.events, len(tc.events)), identifier: mustPathIdentifier(true), stateChangeCloser: stateChangeCloserConfig{Renamed: true}, From 937e6719078386948b972340fdd0bddd0979e9d1 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Tue, 10 Dec 2024 16:55:18 -0500 Subject: [PATCH 12/28] Add test for corner case This commit adds a test to validate the case when there are multiple registry entries from different files but with the same path. That's the case when there is log rotation. --- filebeat/tests/integration/filestream_test.go | 124 +++++++++++++++++- 1 file changed, 123 insertions(+), 1 deletion(-) diff --git a/filebeat/tests/integration/filestream_test.go b/filebeat/tests/integration/filestream_test.go index 1bb9d630c632..0e9c7baf5516 100644 --- a/filebeat/tests/integration/filestream_test.go +++ b/filebeat/tests/integration/filestream_test.go @@ -20,6 +20,7 @@ package integration import ( + "errors" "fmt" "os" "path" @@ -301,7 +302,6 @@ logging: metrics: enabled: false ` - nativeCfg := ` file_identity.native: ~ ` @@ -443,3 +443,125 @@ logging: }) } } + +func TestFilestreamMigrateIdentityCornerCases(t *testing.T) { + cfgTemplate := ` +filebeat.inputs: + - type: filestream + id: "test-migrate-ID" + paths: + - %s +%s + +queue.mem: + flush.timeout: 0s + +path.home: %s + +output.file: + path: ${path.home} + filename: "output-file" + rotate_on_startup: false + +logging: + level: debug + selectors: + - input + - input.filestream + - input.filestream.prospector + metrics: + enabled: false +` + nativeCfg := ` + file_identity.native: ~ + prospector: + scanner: + check_interval: 0.1s +` + fingerprintCfg := ` + file_identity.fingerprint: ~ + prospector: + scanner: + fingerprint.enabled: true + check_interval: 0.1s +` + + filebeat := integration.NewBeat( + t, + "filebeat", + "../../filebeat.test", + ) + workDir := filebeat.TempDir() + + logFilepath := filepath.Join(workDir, "log.log") + outputFile := filepath.Join(workDir, "output-file*") + + 
cfgYAML := fmt.Sprintf(cfgTemplate, logFilepath, nativeCfg, workDir) + filebeat.WriteConfigFile(cfgYAML) + filebeat.Start() + + // Create and ingest 4 different files, all with the same path + // to simulate log rotation + createFileAndWaitIngestion(t, logFilepath, outputFile, filebeat, 50, 50) + createFileAndWaitIngestion(t, logFilepath, outputFile, filebeat, 50, 100) + createFileAndWaitIngestion(t, logFilepath, outputFile, filebeat, 50, 150) + createFileAndWaitIngestion(t, logFilepath, outputFile, filebeat, 50, 200) + + filebeat.Stop() + cfgYAML = fmt.Sprintf(cfgTemplate, logFilepath, fingerprintCfg, workDir) + if err := os.WriteFile(filebeat.ConfigFilePath(), []byte(cfgYAML), 0666); err != nil { + t.Fatalf("cannot write config file: %s", err) + } + + filebeat.Start() + + migratingMsg := fmt.Sprintf("are the same, migrating. Source: '%s'", logFilepath) + eofMsg := fmt.Sprintf("End of file reached: %s; Backoff now.", logFilepath) + + filebeat.WaitForLogs(migratingMsg, time.Second*10, "prospector did not migrate registry entry") + filebeat.WaitForLogs("migrated entry in registry from", time.Second*10, "store did not update registry key") + filebeat.WaitForLogs(eofMsg, time.Second*10, "EOF was not reached the second time") + + assertPublishedEvents(t, filebeat, 200, outputFile) + // Ingest more data to ensure the offset was migrated + integration.GenerateLogFile(t, logFilepath, 20, true) + filebeat.WaitForLogs(eofMsg, time.Second*5, "EOF was not reached the third time") + + assertPublishedEvents(t, filebeat, 220, outputFile) +} + +func assertPublishedEvents( + t *testing.T, + filebeat *integration.BeatProc, + expected int, + outputFile string) { + + publishedEvents := filebeat.CountFileLines(outputFile) + if publishedEvents != expected { + t.Fatalf("expecting %d published events after file migration, got %d instead", expected, publishedEvents) + } +} + +func createFileAndWaitIngestion( + t *testing.T, + logFilepath, outputFilepath string, + fb *integration.BeatProc, 
+ n, outputTotal int) { + + _, err := os.Stat(logFilepath) + if err != nil && !errors.Is(err, os.ErrNotExist) { + t.Fatalf("cannot stat log file: %s", err) + } + // Remove the file if it exists + if err == nil { + if err := os.Remove(logFilepath); err != nil { + t.Fatalf("cannot remove log file: %s", err) + } + } + + integration.GenerateLogFile(t, logFilepath, n, false) + + eofMsg := fmt.Sprintf("End of file reached: %s; Backoff now.", logFilepath) + fb.WaitForLogs(eofMsg, time.Second*10, "EOF was not reached") + assertPublishedEvents(t, fb, outputTotal, outputFilepath) +} From fd8872af226b9a6353bf80af9855625d7e279d8e Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Tue, 10 Dec 2024 17:02:57 -0500 Subject: [PATCH 13/28] Update tests to use require function --- filebeat/tests/integration/filestream_test.go | 41 ++++++------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/filebeat/tests/integration/filestream_test.go b/filebeat/tests/integration/filestream_test.go index 0e9c7baf5516..51c6873c909a 100644 --- a/filebeat/tests/integration/filestream_test.go +++ b/filebeat/tests/integration/filestream_test.go @@ -362,7 +362,7 @@ logging: "../../filebeat.test", ) workDir := filebeat.TempDir() - + outputFile := filepath.Join(workDir, "output-file*") logFilepath := filepath.Join(workDir, "log.log") integration.GenerateLogFile(t, logFilepath, 25, false) @@ -373,10 +373,7 @@ logging: // Wait for the file to be fully ingested eofMsg := fmt.Sprintf("End of file reached: %s; Backoff now.", logFilepath) filebeat.WaitForLogs(eofMsg, time.Second*10, "EOF was not reached") - publishedEvents := filebeat.CountFileLines(filepath.Join(workDir, "output-file*")) - if publishedEvents != 25 { - t.Fatalf("expecting 25 published events, got %d instead", publishedEvents) - } + requirePublishedEvents(t, filebeat, 25, outputFile) filebeat.Stop() if err := os.Truncate(filebeat.ConfigFilePath(), 0); err != nil { @@ -397,19 +394,13 @@ logging: 
filebeat.WaitForLogs(migratingMsg, time.Second*5, "prospector did not migrate registry entry") filebeat.WaitForLogs("migrated entry in registry from", time.Second*10, "store did not update registry key") filebeat.WaitForLogs(eofMsg, time.Second*10, "EOF was not reached the second time") - if publishedEvents != 25 { - t.Fatalf("expecting 25 published events after file migration, got %d instead", publishedEvents) - } + requirePublishedEvents(t, filebeat, 25, outputFile) // Ingest more data to ensure the offset was migrated integration.GenerateLogFile(t, logFilepath, 17, true) filebeat.WaitForLogs(eofMsg, time.Second*5, "EOF was not reached the third time") - publishedEvents = filebeat.CountFileLines(filepath.Join(workDir, "output-file*")) - if publishedEvents != 42 { - t.Fatalf("expecting 42 published events after file migration, got %d instead", publishedEvents) - } - + requirePublishedEvents(t, filebeat, 42, outputFile) return } @@ -423,23 +414,13 @@ logging: // the file has been fully re-ingested because the file identity // changed filebeat.WaitForLogs(eofMsg, time.Second*10, "EOF was not reached the second time") - publishedEvents = filebeat.CountFileLines(filepath.Join(workDir, "output-file*")) - if publishedEvents != 50 { - t.Fatalf("expecting 50 published when there was no migration, got %d instead", publishedEvents) - } + requirePublishedEvents(t, filebeat, 50, outputFile) // Ingest more data to ensure the offset is correctly tracked integration.GenerateLogFile(t, logFilepath, 10, true) filebeat.WaitForLogs(eofMsg, time.Second*5, "EOF was not reached the third time") - publishedEvents = filebeat.CountFileLines(filepath.Join(workDir, "output-file*")) - if publishedEvents != 60 { - t.Fatalf( - "expecting 60 published events after re-ingestion and more"+ - " data being added, got %d instead", - publishedEvents, - ) - } + requirePublishedEvents(t, filebeat, 60, outputFile) }) } } @@ -522,20 +503,21 @@ logging: filebeat.WaitForLogs("migrated entry in registry 
from", time.Second*10, "store did not update registry key") filebeat.WaitForLogs(eofMsg, time.Second*10, "EOF was not reached the second time") - assertPublishedEvents(t, filebeat, 200, outputFile) + requirePublishedEvents(t, filebeat, 200, outputFile) // Ingest more data to ensure the offset was migrated integration.GenerateLogFile(t, logFilepath, 20, true) filebeat.WaitForLogs(eofMsg, time.Second*5, "EOF was not reached the third time") - assertPublishedEvents(t, filebeat, 220, outputFile) + requirePublishedEvents(t, filebeat, 220, outputFile) } -func assertPublishedEvents( +func requirePublishedEvents( t *testing.T, filebeat *integration.BeatProc, expected int, outputFile string) { + t.Helper() publishedEvents := filebeat.CountFileLines(outputFile) if publishedEvents != expected { t.Fatalf("expecting %d published events after file migration, got %d instead", expected, publishedEvents) @@ -548,6 +530,7 @@ func createFileAndWaitIngestion( fb *integration.BeatProc, n, outputTotal int) { + t.Helper() _, err := os.Stat(logFilepath) if err != nil && !errors.Is(err, os.ErrNotExist) { t.Fatalf("cannot stat log file: %s", err) @@ -563,5 +546,5 @@ func createFileAndWaitIngestion( eofMsg := fmt.Sprintf("End of file reached: %s; Backoff now.", logFilepath) fb.WaitForLogs(eofMsg, time.Second*10, "EOF was not reached") - assertPublishedEvents(t, fb, outputTotal, outputFilepath) + requirePublishedEvents(t, fb, outputTotal, outputFilepath) } From 2af67ec386814f1cf26591eab316a0c8d35fecff Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Tue, 10 Dec 2024 17:26:49 -0500 Subject: [PATCH 14/28] Ensure old entries are removed from the registry --- filebeat/tests/integration/filestream_test.go | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/filebeat/tests/integration/filestream_test.go b/filebeat/tests/integration/filestream_test.go index 51c6873c909a..5a13822ef4eb 100644 --- a/filebeat/tests/integration/filestream_test.go +++ 
b/filebeat/tests/integration/filestream_test.go @@ -25,6 +25,7 @@ import ( "os" "path" "path/filepath" + "strings" "testing" "time" @@ -323,24 +324,28 @@ logging: testCases := map[string]struct { oldIdentityCfg string + oldIdentityName string newIdentityCfg string notMigrateMsg string expectMigration bool }{ "native to fingerprint": { oldIdentityCfg: nativeCfg, + oldIdentityName: "native", newIdentityCfg: fingerprintCfg, expectMigration: true, }, "path to fingerprint": { oldIdentityCfg: pathCfg, + oldIdentityName: "path", newIdentityCfg: fingerprintCfg, expectMigration: true, }, "inode marker to fingerprint": { oldIdentityCfg: inodeMarkerCfg, + oldIdentityName: "inode_marker", newIdentityCfg: fingerprintCfg, expectMigration: false, }, @@ -348,6 +353,7 @@ logging: "path to native": { oldIdentityCfg: pathCfg, newIdentityCfg: nativeCfg, + oldIdentityName: "path", expectMigration: false, notMigrateMsg: "file identity is 'native', will not migrate registry", }, @@ -401,6 +407,7 @@ logging: filebeat.WaitForLogs(eofMsg, time.Second*5, "EOF was not reached the third time") requirePublishedEvents(t, filebeat, 42, outputFile) + requireNativeEntryRemoved(t, workDir, tc.oldIdentityName) return } @@ -419,7 +426,6 @@ logging: // Ingest more data to ensure the offset is correctly tracked integration.GenerateLogFile(t, logFilepath, 10, true) filebeat.WaitForLogs(eofMsg, time.Second*5, "EOF was not reached the third time") - requirePublishedEvents(t, filebeat, 60, outputFile) }) } @@ -509,6 +515,25 @@ logging: filebeat.WaitForLogs(eofMsg, time.Second*5, "EOF was not reached the third time") requirePublishedEvents(t, filebeat, 220, outputFile) + requireNativeEntryRemoved(t, workDir, "native") +} + +func requireNativeEntryRemoved(t *testing.T, workDir, identity string) { + t.Helper() + + registryLogFile := filepath.Join(workDir, "data", "registry", "filebeat", "log.json") + entries := readFilestreamRegistryLog(t, registryLogFile) + nativeEntries := []registryEntry{} + for _, 
currentEntry := range entries { + if strings.Contains(currentEntry.Key, identity) { + nativeEntries = append(nativeEntries, currentEntry) + } + } + + lastNativeEntry := nativeEntries[len(nativeEntries)-1] + if lastNativeEntry.TTL != 0 { + t.Errorf("'%s' has not been removed from the registry", lastNativeEntry.Key) + } } func requirePublishedEvents( From d8404b46234f0aaf41f19f38093315f7626b55bf Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Wed, 11 Dec 2024 13:11:39 -0500 Subject: [PATCH 15/28] Update docs, changelog and fix lint warnings --- CHANGELOG.next.asciidoc | 1 + .../inputs/input-filestream-file-options.asciidoc | 7 +++++-- filebeat/docs/inputs/input-filestream.asciidoc | 11 ++++++++--- filebeat/input/filestream/prospector_test.go | 2 +- x-pack/filebeat/filebeat.reference.yml | 2 ++ 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index bb6c4df0f627..f3b13c3d5f2d 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -362,6 +362,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Improve S3 polling mode states registry when using list prefix option. {pull}41869[41869] - AWS S3 input registry cleanup for untracked s3 objects. {pull}41694[41694] - The environment variable `BEATS_AZURE_EVENTHUB_INPUT_TRACING_ENABLED: true` enables internal logs tracer for the azure-eventhub input. 
{issue}41931[41931] {pull}41932[41932] +- The Filestream input now supports changing the file identity from `native` or `path` to `fingerprint` keeping the state (no data re-ingestion) {issue}40197[40197] {pull}41762[41762] *Auditbeat* diff --git a/filebeat/docs/inputs/input-filestream-file-options.asciidoc b/filebeat/docs/inputs/input-filestream-file-options.asciidoc index 5436d3863dc2..b9badd23b47a 100644 --- a/filebeat/docs/inputs/input-filestream-file-options.asciidoc +++ b/filebeat/docs/inputs/input-filestream-file-options.asciidoc @@ -547,8 +547,11 @@ limit of harvesters. Different `file_identity` methods can be configured to suit the environment where you are collecting log messages. -WARNING: Changing `file_identity` methods between runs may result in -duplicated events in the output. +IMPORTANT: Changing `file_identity` is only supported from if +migrating from `native` or `path` to `fingerprint`. + +WARNING: Any unsupported change in `file_identity` methods between +runs may result in duplicated events in the output. *`native`*:: The default behaviour of {beatname_uc} is to differentiate between files using their inodes and device ids. diff --git a/filebeat/docs/inputs/input-filestream.asciidoc b/filebeat/docs/inputs/input-filestream.asciidoc index 54283d6cce79..a700f22af15d 100644 --- a/filebeat/docs/inputs/input-filestream.asciidoc +++ b/filebeat/docs/inputs/input-filestream.asciidoc @@ -86,7 +86,9 @@ multiple input sections: [[filestream-file-identity]] ==== Reading files on network shares and cloud providers -WARNING: Filebeat does not support reading from network shares and cloud providers. +WARNING: Some file identity methods do not support reading from +network shares and cloud providers, to avoid duplicating events, use +`fingerprint` when reading from network shares or cloud providers. However, one of the limitations of these data sources can be mitigated if you configure Filebeat adequately. 
@@ -98,8 +100,11 @@ values might change during the lifetime of the file. If this happens of the file. To solve this problem you can configure the `file_identity` option. Possible values besides the default `inode_deviceid` are `path`, `inode_marker` and `fingerprint`. -WARNING: Changing `file_identity` methods between runs may result in -duplicated events in the output. +IMPORTANT: Changing `file_identity` is only supported from if +migrating from `native` or `path` to `fingerprint`. + +WARNING: Any unsupported change in `file_identity` methods between +runs may result in duplicated events in the output. Selecting `path` instructs {beatname_uc} to identify files based on their paths. This is a quick way to avoid rereading files if inode and device ids diff --git a/filebeat/input/filestream/prospector_test.go b/filebeat/input/filestream/prospector_test.go index 681c7b74c8d0..2ebd6b75ffd2 100644 --- a/filebeat/input/filestream/prospector_test.go +++ b/filebeat/input/filestream/prospector_test.go @@ -108,7 +108,7 @@ func TestProspector_InitUpdateIdentifiers(t *testing.T) { } defer f.Close() tmpFileName := f.Name() - fi, err := f.Stat() + fi, err := f.Stat() // nolint:typecheck // It is used on L151 if err != nil { t.Fatalf("cannot stat test file: %v", err) } diff --git a/x-pack/filebeat/filebeat.reference.yml b/x-pack/filebeat/filebeat.reference.yml index c5c04232cd3f..141ba8f5a864 100644 --- a/x-pack/filebeat/filebeat.reference.yml +++ b/x-pack/filebeat/filebeat.reference.yml @@ -3185,6 +3185,8 @@ filebeat.inputs: # batch of events has been published successfully. The default value is 1s. #filebeat.registry.flush: 1s +# The interval which to run the registry clean up +#filebeat.registry.cleanup_interval: 5m # Starting with Filebeat 7.0, the registry uses a new directory format to store # Filebeat state. 
After you upgrade, Filebeat will automatically migrate a 6.x From b4f1f202809d098210b855a0e9e8b637bf0a8507 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Wed, 11 Dec 2024 17:21:03 -0500 Subject: [PATCH 16/28] Update docs Add more details about the migration to fingerprint and better present fingerprint as an option to overcome normal issues with the native file identity. --- filebeat/docs/faq.asciidoc | 11 +++++ .../input-filestream-file-options.asciidoc | 38 +++++++++++------- .../docs/inputs/input-filestream.asciidoc | 40 +++++++++++++------ 3 files changed, 62 insertions(+), 27 deletions(-) diff --git a/filebeat/docs/faq.asciidoc b/filebeat/docs/faq.asciidoc index ddcdb6a8898f..ee7ceeabad89 100644 --- a/filebeat/docs/faq.asciidoc +++ b/filebeat/docs/faq.asciidoc @@ -19,6 +19,10 @@ We do not recommend reading log files from network volumes. Whenever possible, i send the log files directly from there. Reading files from network volumes (especially on Windows) can have unexpected side effects. For example, changed file identifiers may result in {beatname_uc} reading a log file from scratch again. +If it is not possible to read from the host, then using the +<> +file identity is the next best option. + [[filebeat-not-collecting-lines]] === {beatname_uc} isn't collecting lines from a file @@ -71,6 +75,13 @@ By default states are never removed from the registry file. To resolve the inode You can use <<{beatname_lc}-input-log-clean-removed,`clean_removed`>> for files that are removed from disk. Be aware that `clean_removed` cleans the file state from the registry whenever a file cannot be found during a scan. If the file shows up again later, it will be sent again from scratch. +Aside from that you should also change the +<> to +<>. If you were using `native` (the default) or `path`, +the state of the files will be automatically migrated to +`fingerprint`. 
+ include::filebeat-log-rotation.asciidoc[] [[windows-file-rotation]] diff --git a/filebeat/docs/inputs/input-filestream-file-options.asciidoc b/filebeat/docs/inputs/input-filestream-file-options.asciidoc index b9badd23b47a..8b9bbc1b3aa8 100644 --- a/filebeat/docs/inputs/input-filestream-file-options.asciidoc +++ b/filebeat/docs/inputs/input-filestream-file-options.asciidoc @@ -542,13 +542,15 @@ indirectly set higher priorities on certain inputs by assigning a higher limit of harvesters. [float] +[id="{beatname_lc}-input-{type}-file-identity"] ===== `file_identity` Different `file_identity` methods can be configured to suit the environment where you are collecting log messages. -IMPORTANT: Changing `file_identity` is only supported from if -migrating from `native` or `path` to `fingerprint`. +IMPORTANT: Changing `file_identity` is only supported from `native` or +`path` to `fingerprint`. On those cases {beatname_uc} will +automatically migrate the state of the file when {type} starts. WARNING: Any unsupported change in `file_identity` methods between runs may result in duplicated events in the output. @@ -557,7 +559,9 @@ runs may result in duplicated events in the output. between files using their inodes and device ids. + In some cases these values can change during the lifetime of a file. -For example, when using the Linux link:https://en.wikipedia.org/wiki/Logical_Volume_Manager_%28Linux%29[LVM] (Logical Volume Manager), device numbers are allocated dynamically at module load (refer to link:https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/logical_volume_manager_administration/lv#persistent_numbers[Persistent Device Numbers] in the Red Hat Enterprise Linux documentation). To avoid the possibility of data duplication in this case, you can set `file_identity` to `path` rather than `native`. 
+For example, when using the Linux link:https://en.wikipedia.org/wiki/Logical_Volume_Manager_%28Linux%29[LVM] (Logical Volume Manager), device numbers are allocated dynamically at module load (refer to link:https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/logical_volume_manager_administration/lv#persistent_numbers[Persistent Device Numbers] in the Red Hat Enterprise Linux documentation). To avoid the possibility of data duplication in this case, you can set `file_identity` to `fingerprint` rather than `native`. ++ +The states of files generated by `native` file identity can be migrated to `fingerprint`. [source,yaml] ---- @@ -565,30 +569,23 @@ file_identity.native: ~ ---- *`path`*:: To identify files based on their paths use this strategy. - ++ WARNING: Only use this strategy if your log files are rotated to a folder outside of the scope of your input or not at all. Otherwise you end up with duplicated events. - ++ WARNING: This strategy does not support renaming files. If an input file is renamed, {beatname_uc} will read it again if the new path matches the settings of the input. ++ +The states of files generated by `path` file identity can be migrated to `fingerprint`. [source,yaml] ---- file_identity.path: ~ ---- -*`inode_marker`*:: If the device id changes from time to time, you must use -this method to distinguish files. This option is not supported on Windows. - -Set the location of the marker file the following way: - -[source,yaml] ----- -file_identity.inode_marker.path: /logs/.filebeat-marker ----- - +[id="{beatname_lc}-input-{type}-file-identity-fingerprint"] *`fingerprint`*:: To identify files based on their content byte range. WARNING: In order to use this file identity option, you must enable the <<{beatname_lc}-input-filestream-scan-fingerprint,fingerprint option in the scanner>>. 
Once this file identity is enabled, changing the fingerprint configuration (offset, length, or other settings) will lead to a global re-ingestion of all files that match the paths configuration of the input. @@ -600,6 +597,16 @@ Please refer to the <<{beatname_lc}-input-filestream-scan-fingerprint,fingerprin file_identity.fingerprint: ~ ---- +*`inode_marker`*:: If the device id changes from time to time, you must use +this method to distinguish files. This option is not supported on Windows. ++ +Set the location of the marker file the following way: + +[source,yaml] +---- +file_identity.inode_marker.path: /logs/.filebeat-marker +---- + [[filestream-log-rotation-support]] [float] === Log rotation @@ -612,6 +619,7 @@ When reading from rotating files make sure the paths configuration includes both the active file and all rotated files. By default, {beatname_uc} is able to track files correctly in the following strategies: + * create: new active file with a unique name is created on rotation * rename: rotated files are renamed diff --git a/filebeat/docs/inputs/input-filestream.asciidoc b/filebeat/docs/inputs/input-filestream.asciidoc index a700f22af15d..a02f8e2f931a 100644 --- a/filebeat/docs/inputs/input-filestream.asciidoc +++ b/filebeat/docs/inputs/input-filestream.asciidoc @@ -34,6 +34,12 @@ The `log` writes the complete file state. 7. Stale entries can be removed from the registry, even if there is no active input. +8. The input can identify files based on their contents when using the +<> +<> instead +of the default inode and device ID. This solves data duplication +caused by inode reuse. + To configure this input, specify a list of glob-based <> that must be crawled to locate and fetch the log lines. @@ -90,22 +96,39 @@ WARNING: Some file identity methods do not support reading from network shares and cloud providers, to avoid duplicating events, use `fingerprint` when reading from network shares or cloud providers. 
-However, one of the limitations of these data sources can be mitigated -if you configure Filebeat adequately. - By default, {beatname_uc} identifies files based on their inodes and device IDs. However, on network shares and cloud providers these values might change during the lifetime of the file. If this happens {beatname_uc} thinks that file is new and resends the whole content of the file. To solve this problem you can configure the `file_identity` option. Possible -values besides the default `inode_deviceid` are `path`, `inode_marker` and `fingerprint`. +values besides the default `native` (inode + device ID) are +`fingerprint`, `path` and `inode_marker`. -IMPORTANT: Changing `file_identity` is only supported from if +IMPORTANT: Changing `file_identity` is only supported when migrating from `native` or `path` to `fingerprint`. WARNING: Any unsupported change in `file_identity` methods between runs may result in duplicated events in the output. +`fingerprint` is the recommended file identity because it does not +rely on the file system/OS, it generates a hash from a portion of the +file (the first 1024 bytes, by default) and uses that to identify the +file. This works well with log rotation strategies that move/rename +the file and on Windows as file identifiers might be more +volatile. The downside is that {beatname_uc} will wait until the file +reaches 1024 bytes before starting to ingest it. + +WARNING: In order to use this file identity option, one must enable +the <<{beatname_lc}-input-filestream-scan-fingerprint,fingerprint +option in the scanner>>. Once this file identity is enabled, changing +the fingerprint configuration (offset, length, etc) will lead to a +global re-ingestion of all files that match the paths configuration of +the input. + +Please refer to the +<<{beatname_lc}-input-filestream-scan-fingerprint,fingerprint +configuration for details>>. + Selecting `path` instructs {beatname_uc} to identify files based on their paths. 
This is a quick way to avoid rereading files if inode and device ids might change. However, keep in mind if the files are rotated (renamed), they @@ -122,13 +145,6 @@ example oneliner generates a hidden marker file for the selected mountpoint `/lo Please note that you should not use this option on Windows as file identifiers might be more volatile. -Selecting `fingerprint` instructs {beatname_uc} to identify files based on their -content byte range. - -WARNING: In order to use this file identity option, one must enable the <<{beatname_lc}-input-filestream-scan-fingerprint,fingerprint option in the scanner>>. Once this file identity is enabled, changing the fingerprint configuration (offset, length, etc) will lead to a global re-ingestion of all files that match the paths configuration of the input. - -Please refer to the <<{beatname_lc}-input-filestream-scan-fingerprint,fingerprint configuration for details>>. - ["source","sh",subs="attributes"] ---- $ lsblk -o MOUNTPOINT,UUID | grep /logs | awk '{print $2}' >> /logs/.filebeat-marker From 3d6022bf2234fdb81c2312354b1c7f91df90b0e4 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Wed, 11 Dec 2024 17:43:49 -0500 Subject: [PATCH 17/28] Remove inode marker from tests Remove inode marker from tests and update init function not to panic. --- filebeat/input/filestream/prospector.go | 9 +++++--- filebeat/input/filestream/prospector_test.go | 24 -------------------- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/filebeat/input/filestream/prospector.go b/filebeat/input/filestream/prospector.go index cc6bf84537a7..1b9676a95817 100644 --- a/filebeat/input/filestream/prospector.go +++ b/filebeat/input/filestream/prospector.go @@ -56,11 +56,14 @@ func init() { // inode marker requries an specific config we cannot infer. 
continue } - var err error - identifiersMap[name], err = factory(nil) + + identifier, err := factory(nil) if err != nil { - panic(fmt.Errorf("cannot create identifier '%s': %w", name, err)) + // Skip identifiers we cannot create. E.g: inode_marker is not + // supported on Windows + continue } + identifiersMap[name] = identifier } } diff --git a/filebeat/input/filestream/prospector_test.go b/filebeat/input/filestream/prospector_test.go index 2ebd6b75ffd2..5ba9ede9cd56 100644 --- a/filebeat/input/filestream/prospector_test.go +++ b/filebeat/input/filestream/prospector_test.go @@ -35,7 +35,6 @@ import ( input "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/common/file" "github.com/elastic/beats/v7/libbeat/common/transform/typeconv" - conf "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/go-concert/unison" ) @@ -175,15 +174,6 @@ func TestMigrateRegistryToFingerprint(t *testing.T) { const mockFingerprint = "the fingerprint from this file" const mockInputPrefix = "test-input" - // We need an empty file as inode marker for the - // 'inode marker' file identity - inodeMarkerFile, err := os.CreateTemp(t.TempDir(), "test-inode-marker") - if err != nil { - t.Fatalf("cannot create inode marker: '%s'", err) - } - inodeMarkerPath := inodeMarkerFile.Name() - inodeMarkerFile.Close() - logFileFullPath, err := filepath.Abs(filepath.Join("testdata", "log.log")) if err != nil { t.Fatalf("cannot get absolute path from test file: %s", err) @@ -205,12 +195,6 @@ func TestMigrateRegistryToFingerprint(t *testing.T) { fingerprintIdentifier, _ := newFingerprintIdentifier(nil) nativeIdentifier, _ := newINodeDeviceIdentifier(nil) pathIdentifier, _ := newPathIdentifier(nil) - inodeIdentifier, err := newINodeMarkerIdentifier( - conf.MustNewConfigFrom(map[string]any{ - "path": inodeMarkerPath, - }), - ) - newIDFunc := func(s loginp.Source) string { return mockInputPrefix + "-" + s.Name() } @@ 
-224,10 +208,6 @@ func TestMigrateRegistryToFingerprint(t *testing.T) { expectedNewKey := newIDFunc(fingerprintIdentifier.GetSource(fsEvent)) - if err != nil { - t.Fatalf("cannot create inodeMarkerIdentifier: %s", err) - } - testCases := map[string]struct { oldIdentifier fileIdentifier newIdentifier fileIdentifier @@ -243,10 +223,6 @@ func TestMigrateRegistryToFingerprint(t *testing.T) { newIdentifier: fingerprintIdentifier, expectRegistryMigration: true, }, - "inode marker to fingerprint fails": { - oldIdentifier: inodeIdentifier, - newIdentifier: fingerprintIdentifier, - }, "fingerprint to fingerprint fails": { oldIdentifier: fingerprintIdentifier, newIdentifier: fingerprintIdentifier, From 0cff3ccfd1cdad610de44d46d236694958499557 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Wed, 11 Dec 2024 17:45:41 -0500 Subject: [PATCH 18/28] Fix lint warnings --- filebeat/input/filestream/prospector.go | 2 +- filebeat/input/filestream/prospector_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/filebeat/input/filestream/prospector.go b/filebeat/input/filestream/prospector.go index 1b9676a95817..51c9b93d2a7f 100644 --- a/filebeat/input/filestream/prospector.go +++ b/filebeat/input/filestream/prospector.go @@ -53,7 +53,7 @@ func init() { // Initialise a default identifier for name, factory := range identifierFactories { if name == inodeMarkerName { - // inode marker requries an specific config we cannot infer. + // inode marker requires an specific config we cannot infer. 
continue } diff --git a/filebeat/input/filestream/prospector_test.go b/filebeat/input/filestream/prospector_test.go index 5ba9ede9cd56..8182577e1f29 100644 --- a/filebeat/input/filestream/prospector_test.go +++ b/filebeat/input/filestream/prospector_test.go @@ -107,7 +107,7 @@ func TestProspector_InitUpdateIdentifiers(t *testing.T) { } defer f.Close() tmpFileName := f.Name() - fi, err := f.Stat() // nolint:typecheck // It is used on L151 + fi, err := f.Stat() //nolint:typecheck // It is used on L151 if err != nil { t.Fatalf("cannot stat test file: %v", err) } From 4e73c1e05c75eeeebf6bdb0a5559ad293501a614 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Wed, 11 Dec 2024 18:21:49 -0500 Subject: [PATCH 19/28] Remove inode_marker from tests and small improvements inode_marker is not supported on Windows, so remove it from all tests. Small improvements are done to the code and documentation. --- .../input-filestream-file-options.asciidoc | 9 ++++- .../internal/input-logfile/store.go | 11 +++--- filebeat/input/filestream/prospector.go | 9 ++--- filebeat/input/filestream/prospector_test.go | 5 +-- filebeat/tests/integration/filestream_test.go | 34 +++++-------------- .../tests/integration/testdata/inodeMarker | 1 - 6 files changed, 29 insertions(+), 40 deletions(-) delete mode 100644 filebeat/tests/integration/testdata/inodeMarker diff --git a/filebeat/docs/inputs/input-filestream-file-options.asciidoc b/filebeat/docs/inputs/input-filestream-file-options.asciidoc index 8b9bbc1b3aa8..bb07cd33f749 100644 --- a/filebeat/docs/inputs/input-filestream-file-options.asciidoc +++ b/filebeat/docs/inputs/input-filestream-file-options.asciidoc @@ -559,7 +559,14 @@ runs may result in duplicated events in the output. between files using their inodes and device ids. + In some cases these values can change during the lifetime of a file. 
-For example, when using the Linux link:https://en.wikipedia.org/wiki/Logical_Volume_Manager_%28Linux%29[LVM] (Logical Volume Manager), device numbers are allocated dynamically at module load (refer to link:https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/logical_volume_manager_administration/lv#persistent_numbers[Persistent Device Numbers] in the Red Hat Enterprise Linux documentation). To avoid the possibility of data duplication in this case, you can set `file_identity` to `fingerprint` rather than `native`. +For example, when using the Linux +link:https://en.wikipedia.org/wiki/Logical_Volume_Manager_%28Linux%29[LVM] +(Logical Volume Manager), device numbers are allocated dynamically at +module load (refer to +link:https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/logical_volume_manager_administration/lv#persistent_numbers[Persistent +Device Numbers] in the Red Hat Enterprise Linux documentation). To +avoid the possibility of data duplication in this case, you can set +`file_identity` to `fingerprint` rather than the default `native`. + The states of files generated by `native` file identity can be migrated to `fingerprint`. diff --git a/filebeat/input/filestream/internal/input-logfile/store.go b/filebeat/input/filestream/internal/input-logfile/store.go index 884c2a186ff4..85f40d1f3a33 100644 --- a/filebeat/input/filestream/internal/input-logfile/store.go +++ b/filebeat/input/filestream/internal/input-logfile/store.go @@ -261,10 +261,11 @@ func (s *sourceStore) UpdateIdentifiers(getNewID func(v Value) (string, interfac // codebase can have access to the new value s.store.ephemeralStore.table[newKey] = r - // Remove the old key from the store - // aka delete. This is also synchronously - // written to the disk. - // See store.remove for details + // Remove the old key from the store aka delete. This is also + // synchronously written to the disk. 
+ // We cannot use store.remove because it will + // acquire the same lock we hold, causing a deadlock. + // See store.remove for details. s.store.UpdateTTL(res, 0) s.store.log.Infof("migrated entry in registry from '%s' to '%s'. Cursor: %v", key, newKey, r.cursor) } @@ -458,10 +459,12 @@ func (r *resource) UnpackCursor(to interface{}) error { return typeconv.Convert(to, r.activeCursor()) } +// UnpackCursorMeta unpacks the cursor metadata's into the provided struct. func (r *resource) UnpackCursorMeta(to interface{}) error { return typeconv.Convert(to, r.cursorMeta) } +// Key returns the resource's key func (r *resource) Key() string { return r.key } diff --git a/filebeat/input/filestream/prospector.go b/filebeat/input/filestream/prospector.go index 51c9b93d2a7f..1e3b7cb6c692 100644 --- a/filebeat/input/filestream/prospector.go +++ b/filebeat/input/filestream/prospector.go @@ -50,7 +50,6 @@ var ignoreInactiveSettings = map[string]ignoreInactiveType{ var identifiersMap = map[string]fileIdentifier{} func init() { - // Initialise a default identifier for name, factory := range identifierFactories { if name == inodeMarkerName { // inode marker requires an specific config we cannot infer. @@ -146,12 +145,10 @@ func (p *fileProspector) Init( // Return early (do nothing) if: // - The identifiers are the same - // - The old identifier is fingerprint - // - The old identifier is inode marker + // - The old identifier is neither native nor path oldIdentifierName := fm.IdentifierName if oldIdentifierName == identifierName || - oldIdentifierName == fingerprintName || - oldIdentifierName == inodeMarkerName { + !(oldIdentifierName == nativeName || oldIdentifierName == pathName) { return "", nil } @@ -187,8 +184,6 @@ func (p *fileProspector) Init( // If the registry key and the key generated by the old identifier // do not match, log it at debug level and do nothing. 
if previousIdentifierKey != registryKey { - p.logger.Debugf("registry key: '%s' and previous file identity key: '%s', differ, will not migrate. Source: '%s'", - registryKey, previousIdentifierKey, fm.Source) return "", fm } diff --git a/filebeat/input/filestream/prospector_test.go b/filebeat/input/filestream/prospector_test.go index 8182577e1f29..79a9a90df208 100644 --- a/filebeat/input/filestream/prospector_test.go +++ b/filebeat/input/filestream/prospector_test.go @@ -270,14 +270,15 @@ func TestMigrateRegistryToFingerprint(t *testing.T) { newIDFunc, ) require.NoError(t, err, "prospector Init must succeed") + // testStore.updatedKeys is in the format // oldKey -> newKey - if tc.expectRegistryMigration { assert.Equal( t, map[string]string{ - oldKey: expectedNewKey}, + oldKey: expectedNewKey, + }, testStore.updatedKeys, "the registry entries were not correctly migrated") } else { diff --git a/filebeat/tests/integration/filestream_test.go b/filebeat/tests/integration/filestream_test.go index 5a13822ef4eb..029842c78f68 100644 --- a/filebeat/tests/integration/filestream_test.go +++ b/filebeat/tests/integration/filestream_test.go @@ -316,11 +316,6 @@ logging: fingerprint.enabled: true check_interval: 0.1s ` - inodeMarkerPath, err := filepath.Abs(filepath.Join("testdata", "inodeMarker")) - if err != nil { - t.Fatalf("cannot get absolute path from inode marker: %s", err) - } - inodeMarkerCfg := " file_identity.inode_marker.path: " + inodeMarkerPath + "\n" testCases := map[string]struct { oldIdentityCfg string @@ -343,13 +338,6 @@ logging: expectMigration: true, }, - "inode marker to fingerprint": { - oldIdentityCfg: inodeMarkerCfg, - oldIdentityName: "inode_marker", - newIdentityCfg: fingerprintCfg, - expectMigration: false, - }, - "path to native": { oldIdentityCfg: pathCfg, newIdentityCfg: nativeCfg, @@ -361,7 +349,6 @@ logging: for name, tc := range testCases { t.Run(name, func(t *testing.T) { - filebeat := integration.NewBeat( t, "filebeat", @@ -382,10 +369,6 @@ 
logging: requirePublishedEvents(t, filebeat, 25, outputFile) filebeat.Stop() - if err := os.Truncate(filebeat.ConfigFilePath(), 0); err != nil { - t.Fatalf("cannot truncate Filebeat's configuration file: %s", err) - } - newCfg := fmt.Sprintf(cfgTemplate, logFilepath, tc.newIdentityCfg, workDir) if err := os.WriteFile(filebeat.ConfigFilePath(), []byte(newCfg), 0o644); err != nil { t.Fatalf("cannot write new configuration file: %s", err) @@ -407,7 +390,7 @@ logging: filebeat.WaitForLogs(eofMsg, time.Second*5, "EOF was not reached the third time") requirePublishedEvents(t, filebeat, 42, outputFile) - requireNativeEntryRemoved(t, workDir, tc.oldIdentityName) + requireRegistryEntryRemoved(t, workDir, tc.oldIdentityName) return } @@ -507,30 +490,31 @@ logging: filebeat.WaitForLogs(migratingMsg, time.Second*10, "prospector did not migrate registry entry") filebeat.WaitForLogs("migrated entry in registry from", time.Second*10, "store did not update registry key") - filebeat.WaitForLogs(eofMsg, time.Second*10, "EOF was not reached the second time") + // Filebeat logs the EOF message when it starts and the file had already been fully ingested. 
+ filebeat.WaitForLogs(eofMsg, time.Second*10, "EOF was not reached after restart") requirePublishedEvents(t, filebeat, 200, outputFile) // Ingest more data to ensure the offset was migrated integration.GenerateLogFile(t, logFilepath, 20, true) - filebeat.WaitForLogs(eofMsg, time.Second*5, "EOF was not reached the third time") + filebeat.WaitForLogs(eofMsg, time.Second*5, "EOF was not reached after adding data") requirePublishedEvents(t, filebeat, 220, outputFile) - requireNativeEntryRemoved(t, workDir, "native") + requireRegistryEntryRemoved(t, workDir, "native") } -func requireNativeEntryRemoved(t *testing.T, workDir, identity string) { +func requireRegistryEntryRemoved(t *testing.T, workDir, identity string) { t.Helper() registryLogFile := filepath.Join(workDir, "data", "registry", "filebeat", "log.json") entries := readFilestreamRegistryLog(t, registryLogFile) - nativeEntries := []registryEntry{} + inputEntries := []registryEntry{} for _, currentEntry := range entries { if strings.Contains(currentEntry.Key, identity) { - nativeEntries = append(nativeEntries, currentEntry) + inputEntries = append(inputEntries, currentEntry) } } - lastNativeEntry := nativeEntries[len(nativeEntries)-1] + lastNativeEntry := inputEntries[len(inputEntries)-1] if lastNativeEntry.TTL != 0 { t.Errorf("'%s' has not been removed from the registry", lastNativeEntry.Key) } diff --git a/filebeat/tests/integration/testdata/inodeMarker b/filebeat/tests/integration/testdata/inodeMarker deleted file mode 100644 index 302beb58d917..000000000000 --- a/filebeat/tests/integration/testdata/inodeMarker +++ /dev/null @@ -1 +0,0 @@ -Inode marker is any existing file for the 'inode_marker' file identity. 
\ No newline at end of file From 7c8a3aea2ca4d8c6d388916a1215c800f3fceb21 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Wed, 11 Dec 2024 19:08:57 -0500 Subject: [PATCH 20/28] Make fingerprint the default file identity --- CHANGELOG.next.asciidoc | 4 +- .../config/filebeat.inputs.reference.yml.tmpl | 4 +- .../input-filestream-file-options.asciidoc | 43 +++++++++++-------- .../docs/inputs/input-filestream.asciidoc | 27 ++++-------- filebeat/filebeat.reference.yml | 4 +- filebeat/input/filestream/fswatch.go | 2 +- filebeat/input/filestream/identifier.go | 2 +- 7 files changed, 42 insertions(+), 44 deletions(-) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index ed7c76cfb774..81af096f4cb4 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -52,7 +52,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Fixes filestream logging the error "filestream input with ID 'ID' already exists, this will lead to data duplication[...]" on Kubernetes when using autodiscover. {pull}41585[41585] - Add kafka compression support for ZSTD. - Filebeat fails to start if there is any input with a duplicated ID. It logs the duplicated IDs and the offending inputs configurations. {pull}41731[41731] - +- The Filestream input only starts to ingest a file when it is >= 1024 bytes in size. This happens because the fingerprint` is the default file identity now. To restore the previous behaviour, set `file_identity.native: ~` and prospector.scanner.fingerprint.enabled: false` {issue}40197[40197] {pull}41762[41762] *Heartbeat* @@ -364,7 +364,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Add support for SSL and Proxy configurations for websoket type in streaming input. {pull}41934[41934] - AWS S3 input registry cleanup for untracked s3 objects. 
{pull}41694[41694] - The environment variable `BEATS_AZURE_EVENTHUB_INPUT_TRACING_ENABLED: true` enables internal logs tracer for the azure-eventhub input. {issue}41931[41931] {pull}41932[41932] -- The Filestream input now supports changing the file identity from `native` or `path` to `fingerprint` keeping the state (no data re-ingestion) {issue}40197[40197] {pull}41762[41762] +- The Filestream input now uses the `fingerprint` file identity by default. The state of files is automatically migrated if the previous file identity was `native` (the default) or `path`. If the `file_identity` is explicitly set, there is no change in behaviour. {issue}40197[40197] {pull}41762[41762] *Auditbeat* diff --git a/filebeat/_meta/config/filebeat.inputs.reference.yml.tmpl b/filebeat/_meta/config/filebeat.inputs.reference.yml.tmpl index ba6588195823..cf6efb550d17 100644 --- a/filebeat/_meta/config/filebeat.inputs.reference.yml.tmpl +++ b/filebeat/_meta/config/filebeat.inputs.reference.yml.tmpl @@ -303,7 +303,7 @@ filebeat.inputs: # If enabled, instead of relying on the device ID and inode values when comparing files, # compare hashes of the given byte ranges in files. A file becomes an ingest target # when its size grows larger than offset+length (see below). Until then it's ignored. - #prospector.scanner.fingerprint.enabled: false + #prospector.scanner.fingerprint.enabled: true # If fingerprint mode is enabled, sets the offset from the beginning of the file # for the byte range used for computing the fingerprint value. @@ -439,7 +439,7 @@ filebeat.inputs: # Method to determine if two files are the same or not. By default # the Beat considers two files the same if their inode and device id are the same. - #file_identity.native: ~ + #file_identity.fingerprint: ~ # Optional additional fields. 
These fields can be freely picked # to add additional information to the crawled log files for filtering diff --git a/filebeat/docs/inputs/input-filestream-file-options.asciidoc b/filebeat/docs/inputs/input-filestream-file-options.asciidoc index bb07cd33f749..b87d9e67af6e 100644 --- a/filebeat/docs/inputs/input-filestream-file-options.asciidoc +++ b/filebeat/docs/inputs/input-filestream-file-options.asciidoc @@ -150,9 +150,9 @@ The default setting is 10s. [id="{beatname_lc}-input-{type}-scan-fingerprint"] ===== `prospector.scanner.fingerprint` -Instead of relying on the device ID and inode values when comparing files, compare hashes of the given byte ranges of files. - -Enable this option if you're experiencing data loss or data duplication due to unstable file identifiers provided by the file system. +Instead of relying on the device ID and inode values when comparing +files, compare hashes of the given byte ranges of files. This is the +default behaviour for {beatname_uc}. Following are some scenarios where this can happen: @@ -555,8 +555,29 @@ automatically migrate the state of the file when {type} starts. WARNING: Any unsupported change in `file_identity` methods between runs may result in duplicated events in the output. -*`native`*:: The default behaviour of {beatname_uc} is to differentiate -between files using their inodes and device ids. +[id="{beatname_lc}-input-{type}-file-identity-fingerprint"] +*`fingerprint`*:: The default behaviour of {beatname_uc} is to +identify files based on content by hashing a specific range (0 to 1024 +bytes by default). + +WARNING: In order to use this file identity option, you must enable +the <<{beatname_lc}-input-filestream-scan-fingerprint,fingerprint +option in the scanner>>. Once this file identity is enabled, changing +the fingerprint configuration (offset, length, or other settings) will +lead to a global re-ingestion of all files that match the paths +configuration of the input. 
+ +Please refer to the +<<{beatname_lc}-input-filestream-scan-fingerprint,fingerprint +configuration for details>>. + +[source,yaml] +---- +file_identity.fingerprint: ~ +---- + +*`native`*:: Differentiates between files using their inodes and +device ids. + In some cases these values can change during the lifetime of a file. For example, when using the Linux @@ -592,18 +613,6 @@ The states of files generated by `path` file identity can be migrated to `finger file_identity.path: ~ ---- -[id="{beatname_lc}-input-{type}-file-identity-fingerprint"] -*`fingerprint`*:: To identify files based on their content byte range. - -WARNING: In order to use this file identity option, you must enable the <<{beatname_lc}-input-filestream-scan-fingerprint,fingerprint option in the scanner>>. Once this file identity is enabled, changing the fingerprint configuration (offset, length, or other settings) will lead to a global re-ingestion of all files that match the paths configuration of the input. - -Please refer to the <<{beatname_lc}-input-filestream-scan-fingerprint,fingerprint configuration for details>>. - -[source,yaml] ----- -file_identity.fingerprint: ~ ----- - *`inode_marker`*:: If the device id changes from time to time, you must use this method to distinguish files. This option is not supported on Windows. + diff --git a/filebeat/docs/inputs/input-filestream.asciidoc b/filebeat/docs/inputs/input-filestream.asciidoc index a02f8e2f931a..74b7514b91a2 100644 --- a/filebeat/docs/inputs/input-filestream.asciidoc +++ b/filebeat/docs/inputs/input-filestream.asciidoc @@ -34,11 +34,10 @@ The `log` writes the complete file state. 7. Stale entries can be removed from the registry, even if there is no active input. -8. The input can identify files based on their contents when using the -<> -<> instead -of the default inode and device ID. This solves data duplication -caused by inode reuse. +8. 
The default behaviour is to identify files based on their contents +using the <> <> This solves data duplication caused by inode reuse. To configure this input, specify a list of glob-based <> that must be crawled to locate and fetch the log lines. @@ -94,15 +93,7 @@ multiple input sections: WARNING: Some file identity methods do not support reading from network shares and cloud providers, to avoid duplicating events, use -`fingerprint` when reading from network shares or cloud providers. - -By default, {beatname_uc} identifies files based on their inodes and -device IDs. However, on network shares and cloud providers these -values might change during the lifetime of the file. If this happens -{beatname_uc} thinks that file is new and resends the whole content -of the file. To solve this problem you can configure the `file_identity` option. Possible -values besides the default `native` (inode + device ID) are -`fingerprint`, `path` and `inode_marker`. +the default `file_identity`: `fingerprint`. IMPORTANT: Changing `file_identity` is only supported when migrating from `native` or `path` to `fingerprint`. @@ -110,17 +101,15 @@ migrating from `native` or `path` to `fingerprint`. WARNING: Any unsupported change in `file_identity` methods between runs may result in duplicated events in the output. -`fingerprint` is the recommended file identity because it does not +`fingerprint` is the default and recommended file identity because it does not rely on the file system/OS, it generates a hash from a portion of the file (the first 1024 bytes, by default) and uses that to identify the file. This works well with log rotation strategies that move/rename the file and on Windows as file identifiers might be more volatile. The downside is that {beatname_uc} will wait until the file -reaches 1024 bytes before start ingesting it. +reaches 1024 bytes before it starts ingesting any file. 
-WARNING: In order to use this file identity option, one must enable -the <<{beatname_lc}-input-filestream-scan-fingerprint,fingerprint -option in the scanner>>. Once this file identity is enabled, changing +WARNING: Once this file identity is enabled, changing the fingerprint configuration (offset, length, etc) will lead to a global re-ingestion of all files that match the paths configuration of the input. diff --git a/filebeat/filebeat.reference.yml b/filebeat/filebeat.reference.yml index 88aa79379e4c..6759a9d614b8 100644 --- a/filebeat/filebeat.reference.yml +++ b/filebeat/filebeat.reference.yml @@ -716,7 +716,7 @@ filebeat.inputs: # If enabled, instead of relying on the device ID and inode values when comparing files, # compare hashes of the given byte ranges in files. A file becomes an ingest target # when its size grows larger than offset+length (see below). Until then it's ignored. - #prospector.scanner.fingerprint.enabled: false + #prospector.scanner.fingerprint.enabled: true # If fingerprint mode is enabled, sets the offset from the beginning of the file # for the byte range used for computing the fingerprint value. @@ -852,7 +852,7 @@ filebeat.inputs: # Method to determine if two files are the same or not. By default # the Beat considers two files the same if their inode and device id are the same. - #file_identity.native: ~ + #file_identity.fingerprint: ~ # Optional additional fields. 
These fields can be freely picked # to add additional information to the crawled log files for filtering diff --git a/filebeat/input/filestream/fswatch.go b/filebeat/input/filestream/fswatch.go index c51d850bbd2c..00d84ed9ab4e 100644 --- a/filebeat/input/filestream/fswatch.go +++ b/filebeat/input/filestream/fswatch.go @@ -278,7 +278,7 @@ func defaultFileScannerConfig() fileScannerConfig { Symlinks: false, RecursiveGlob: true, Fingerprint: fingerprintConfig{ - Enabled: false, + Enabled: true, Offset: 0, Length: DefaultFingerprintSize, }, diff --git a/filebeat/input/filestream/identifier.go b/filebeat/input/filestream/identifier.go index a0cd7903e7ac..08bb0c5f071c 100644 --- a/filebeat/input/filestream/identifier.go +++ b/filebeat/input/filestream/identifier.go @@ -76,7 +76,7 @@ func (f fileSource) Name() string { // newFileIdentifier creates a new state identifier for a log input. func newFileIdentifier(ns *conf.Namespace, suffix string) (fileIdentifier, error) { if ns == nil { - i, err := newINodeDeviceIdentifier(nil) + i, err := newFingerprintIdentifier(nil) if err != nil { return nil, err } From 0feb3bbb53766e0a9f1828eb30f36141a0a20de1 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Wed, 11 Dec 2024 19:31:18 -0500 Subject: [PATCH 21/28] Update old tests to use the old file identity --- CHANGELOG.next.asciidoc | 2 +- filebeat/tests/integration/event_log_file_test.go | 2 ++ filebeat/tests/integration/filestream_test.go | 3 +++ filebeat/tests/integration/filestream_truncation_test.go | 2 ++ filebeat/tests/integration/store_test.go | 2 ++ filebeat/tests/integration/translate_ldap_attribute_test.go | 2 ++ filebeat/tests/system/config/filestream-fixup-id.yml.j2 | 2 ++ filebeat/tests/system/test_reload_inputs.py | 2 ++ 8 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index 81af096f4cb4..efc411d803fc 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -52,7 +52,7 @@ 
https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Fixes filestream logging the error "filestream input with ID 'ID' already exists, this will lead to data duplication[...]" on Kubernetes when using autodiscover. {pull}41585[41585] - Add kafka compression support for ZSTD. - Filebeat fails to start if there is any input with a duplicated ID. It logs the duplicated IDs and the offending inputs configurations. {pull}41731[41731] -- The Filestream input only starts to ingest a file when it is >= 1024 bytes in size. This happens because the fingerprint` is the default file identity now. To restore the previous behaviour, set `file_identity.native: ~` and prospector.scanner.fingerprint.enabled: false` {issue}40197[40197] {pull}41762[41762] +- The Filestream input only starts to ingest a file when it is >= 1024 bytes in size. This happens because `fingerprint` is the default file identity now. To restore the previous behaviour, set `file_identity.native: ~` and `prospector.scanner.fingerprint.enabled: false` {issue}40197[40197] {pull}41762[41762] *Heartbeat* diff --git a/filebeat/tests/integration/event_log_file_test.go b/filebeat/tests/integration/event_log_file_test.go index fce7672199f1..793a3386af7f 100644 --- a/filebeat/tests/integration/event_log_file_test.go +++ b/filebeat/tests/integration/event_log_file_test.go @@ -37,6 +37,8 @@ filebeat.inputs: - type: filestream id: filestream-input-id enabled: true + file_identity.native: ~ + prospector.scanner.fingerprint.enabled: false parsers: - ndjson: target: "" diff --git a/filebeat/tests/integration/filestream_test.go b/filebeat/tests/integration/filestream_test.go index 029842c78f68..24125469dd89 100644 --- a/filebeat/tests/integration/filestream_test.go +++ b/filebeat/tests/integration/filestream_test.go @@ -42,6 +42,8 @@ filebeat.inputs: paths: - %s + file_identity.native: ~ + prospector.scanner.fingerprint.enabled: false clean_inactive: 3s ignore_older: 2s 
close.on_state_change.inactive: 1s @@ -446,6 +448,7 @@ logging: file_identity.native: ~ prospector: scanner: + fingerprint.enabled: false check_interval: 0.1s ` fingerprintCfg := ` diff --git a/filebeat/tests/integration/filestream_truncation_test.go b/filebeat/tests/integration/filestream_truncation_test.go index 98db9a6ad23b..f495c72f1411 100644 --- a/filebeat/tests/integration/filestream_truncation_test.go +++ b/filebeat/tests/integration/filestream_truncation_test.go @@ -38,6 +38,8 @@ filebeat.inputs: id: a-unique-filestream-input-id enabled: true prospector.scanner.check_interval: 30s + file_identity.native: ~ + prospector.scanner.fingerprint.enabled: false paths: - %s output: diff --git a/filebeat/tests/integration/store_test.go b/filebeat/tests/integration/store_test.go index d4ee36298d51..e187c6826769 100644 --- a/filebeat/tests/integration/store_test.go +++ b/filebeat/tests/integration/store_test.go @@ -41,6 +41,8 @@ filebeat.inputs: close.on_state_change.inactive: 8s ignore_older: 9s prospector.scanner.check_interval: 1s + file_identity.native: ~ + prospector.scanner.fingerprint.enabled: false paths: - %s diff --git a/filebeat/tests/integration/translate_ldap_attribute_test.go b/filebeat/tests/integration/translate_ldap_attribute_test.go index 376be5e36a23..d7c4f129593e 100644 --- a/filebeat/tests/integration/translate_ldap_attribute_test.go +++ b/filebeat/tests/integration/translate_ldap_attribute_test.go @@ -45,6 +45,8 @@ const translateguidCfg = ` filebeat.inputs: - type: filestream id: "test-translateguidCfg" + file_identity.native: ~ + prospector.scanner.fingerprint.enabled: false paths: - %s diff --git a/filebeat/tests/system/config/filestream-fixup-id.yml.j2 b/filebeat/tests/system/config/filestream-fixup-id.yml.j2 index 7617429286de..446b7db9723d 100644 --- a/filebeat/tests/system/config/filestream-fixup-id.yml.j2 +++ b/filebeat/tests/system/config/filestream-fixup-id.yml.j2 @@ -1,6 +1,8 @@ filebeat.inputs: - type: filestream id: 
test-fix-global-id + file_identity.native: ~ + prospector.scanner.fingerprint.enabled: false enabled: true paths: - {{path}} diff --git a/filebeat/tests/system/test_reload_inputs.py b/filebeat/tests/system/test_reload_inputs.py index dd81a60ffe83..cf58557f3ac3 100644 --- a/filebeat/tests/system/test_reload_inputs.py +++ b/filebeat/tests/system/test_reload_inputs.py @@ -49,6 +49,8 @@ def test_filestream_reload_not_duplicate_id(self): input_config_template = """ - type: filestream id: my-unique-id + file_identity.native: ~ + prospector.scanner.fingerprint.enabled: false paths: - {} """ From 6730cb761d6841c59b7a05cdfa911017647796cb Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Wed, 11 Dec 2024 20:08:31 -0500 Subject: [PATCH 22/28] update reference --- x-pack/filebeat/filebeat.reference.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/x-pack/filebeat/filebeat.reference.yml b/x-pack/filebeat/filebeat.reference.yml index 141ba8f5a864..338909669a4b 100644 --- a/x-pack/filebeat/filebeat.reference.yml +++ b/x-pack/filebeat/filebeat.reference.yml @@ -2400,7 +2400,7 @@ filebeat.inputs: # If enabled, instead of relying on the device ID and inode values when comparing files, # compare hashes of the given byte ranges in files. A file becomes an ingest target # when its size grows larger than offset+length (see below). Until then it's ignored. - #prospector.scanner.fingerprint.enabled: false + #prospector.scanner.fingerprint.enabled: true # If fingerprint mode is enabled, sets the offset from the beginning of the file # for the byte range used for computing the fingerprint value. @@ -2536,7 +2536,7 @@ filebeat.inputs: # Method to determine if two files are the same or not. By default # the Beat considers two files the same if their inode and device id are the same. - #file_identity.native: ~ + #file_identity.fingerprint: ~ # Optional additional fields. 
These fields can be freely picked # to add additional information to the crawled log files for filtering From c1693f23ea3822274da01fd2ae442543add45aa6 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 12 Dec 2024 13:05:45 -0500 Subject: [PATCH 23/28] Fix Filestream tests --- filebeat/input/filestream/fswatch_test.go | 6 +++++ filebeat/input/filestream/identifier_test.go | 23 ++++++++++++++------ filebeat/input/filestream/input_test.go | 3 +++ 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/filebeat/input/filestream/fswatch_test.go b/filebeat/input/filestream/fswatch_test.go index 9fae0481ca6a..03674772e0d4 100644 --- a/filebeat/input/filestream/fswatch_test.go +++ b/filebeat/input/filestream/fswatch_test.go @@ -222,6 +222,7 @@ scanner: paths := []string{filepath.Join(dir, "*.log")} cfgStr := ` scanner: + fingerprint.enabled: false check_interval: 10ms ` @@ -260,6 +261,7 @@ scanner: paths := []string{filepath.Join(dir, "*.log")} cfgStr := ` scanner: + fingerprint.enabled: false check_interval: 50ms ` @@ -370,6 +372,7 @@ scanner: } cfgStr := ` scanner: + fingerprint.enabled: false check_interval: 100ms ` @@ -615,6 +618,7 @@ scanner: name: "returns no symlink if the original file is excluded", cfgStr: ` scanner: + fingerprint.enabled: false exclude_files: ['.*exclude.*', '.*traveler.*'] symlinks: true `, @@ -661,6 +665,7 @@ scanner: name: "returns no included symlink if the original file is not included", cfgStr: ` scanner: + fingerprint.enabled: false include_files: ['.*include.*', '.*portal.*'] symlinks: true `, @@ -678,6 +683,7 @@ scanner: name: "returns an included symlink if the original file is included", cfgStr: ` scanner: + fingerprint.enabled: false include_files: ['.*include.*', '.*portal.*', '.*traveler.*'] symlinks: true `, diff --git a/filebeat/input/filestream/identifier_test.go b/filebeat/input/filestream/identifier_test.go index 1fcd4d73efa2..f2cd01028230 100644 --- a/filebeat/input/filestream/identifier_test.go +++ 
b/filebeat/input/filestream/identifier_test.go @@ -18,7 +18,6 @@ package filestream import ( - "io/ioutil" "os" "testing" @@ -35,12 +34,17 @@ type testFileIdentifierConfig struct { } func TestFileIdentifier(t *testing.T) { - t.Run("default file identifier", func(t *testing.T) { - identifier, err := newFileIdentifier(nil, "") + t.Run("native file identifier", func(t *testing.T) { + cfg := conf.MustNewConfigFrom(`native: ~`) + ns := conf.Namespace{} + if err := cfg.Unpack(&ns); err != nil { + t.Fatalf("cannot unpack config into conf.Namespace: %s", err) + } + identifier, err := newFileIdentifier(&ns, "") require.NoError(t, err) assert.Equal(t, DefaultIdentifierName, identifier.Name()) - tmpFile, err := ioutil.TempFile("", "test_file_identifier_native") + tmpFile, err := os.CreateTemp("", "test_file_identifier_native") if err != nil { t.Fatalf("cannot create temporary file for test: %v", err) } @@ -59,12 +63,17 @@ func TestFileIdentifier(t *testing.T) { assert.Equal(t, identifier.Name()+"::"+file.GetOSState(fi).String(), src.Name()) }) - t.Run("default file identifier with suffix", func(t *testing.T) { - identifier, err := newFileIdentifier(nil, "my-suffix") + t.Run("native file identifier with suffix", func(t *testing.T) { + cfg := conf.MustNewConfigFrom(`native: ~`) + ns := conf.Namespace{} + if err := cfg.Unpack(&ns); err != nil { + t.Fatalf("cannot unpack config into conf.Namespace: %s", err) + } + identifier, err := newFileIdentifier(&ns, "my-suffix") require.NoError(t, err) assert.Equal(t, DefaultIdentifierName, identifier.Name()) - tmpFile, err := ioutil.TempFile("", "test_file_identifier_native") + tmpFile, err := os.CreateTemp("", "test_file_identifier_native") if err != nil { t.Fatalf("cannot create temporary file for test: %v", err) } diff --git a/filebeat/input/filestream/input_test.go b/filebeat/input/filestream/input_test.go index 3dfe176ac017..735ea0d0ffe7 100644 --- a/filebeat/input/filestream/input_test.go +++ b/filebeat/input/filestream/input_test.go 
@@ -50,6 +50,7 @@ func BenchmarkFilestream(b *testing.B) { cfg := ` type: filestream prospector.scanner.check_interval: 1s +prospector.scanner.fingerprint.enabled: false paths: - ` + filename + ` ` @@ -91,6 +92,7 @@ paths: cfg := ` type: filestream prospector.scanner.check_interval: 1s +prospector.scanner.fingerprint.enabled: false paths: - ` + ingestPath + ` ` @@ -146,6 +148,7 @@ func TestTakeOverTags(t *testing.T) { cfg := fmt.Sprintf(` type: filestream prospector.scanner.check_interval: 1s +prospector.scanner.fingerprint.enabled: false take_over: %t paths: - %s`, testCase.takeOver, filename) From 09002a16ae43c498c3834e250e3024aa8e3d51c7 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 12 Dec 2024 15:00:34 -0500 Subject: [PATCH 24/28] Fix filestream integration tests --- .../filestream/input_integration_test.go | 206 +++++++++++------- 1 file changed, 128 insertions(+), 78 deletions(-) diff --git a/filebeat/input/filestream/input_integration_test.go b/filebeat/input/filestream/input_integration_test.go index 80327d8bcf2c..3d468bb23c04 100644 --- a/filebeat/input/filestream/input_integration_test.go +++ b/filebeat/input/filestream/input_integration_test.go @@ -52,11 +52,13 @@ func TestFilestreamCloseRenamed(t *testing.T) { // the output to receive the event and then close the source file. 
id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - "paths": []string{env.abspath(testlogName) + "*"}, - "prospector.scanner.check_interval": "10ms", - "close.on_state_change.check_interval": "1ms", - "close.on_state_change.renamed": "true", + "id": id, + "paths": []string{env.abspath(testlogName) + "*"}, + "prospector.scanner.check_interval": "10ms", + "prospector.scanner.fingerprint.enabled": false, + "close.on_state_change.check_interval": "1ms", + "close.on_state_change.renamed": "true", + "file_identity.native": map[string]any{}, }) testlines := []byte("first log line\n") @@ -94,9 +96,11 @@ func TestFilestreamMetadataUpdatedOnRename(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - "paths": []string{env.abspath(testlogName) + "*"}, - "prospector.scanner.check_interval": "1ms", + "id": id, + "paths": []string{env.abspath(testlogName) + "*"}, + "prospector.scanner.check_interval": "1ms", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) testline := []byte("log line\n") @@ -132,11 +136,13 @@ func TestFilestreamCloseRemoved(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - "paths": []string{env.abspath(testlogName) + "*"}, - "prospector.scanner.check_interval": "24h", - "close.on_state_change.check_interval": "1ms", - "close.on_state_change.removed": "true", + "id": id, + "paths": []string{env.abspath(testlogName) + "*"}, + "prospector.scanner.check_interval": "24h", + "close.on_state_change.check_interval": "1ms", + "close.on_state_change.removed": "true", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) testlines := []byte("first log line\n") @@ -209,9 +215,11 @@ func 
TestFilestreamEmptyLine(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", + "id": id, + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) ctx, cancelInput := context.WithCancel(context.Background()) @@ -248,9 +256,11 @@ func TestFilestreamEmptyLinesOnly(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", + "id": id, + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) ctx, cancelInput := context.WithCancel(context.Background()) @@ -272,8 +282,10 @@ func TestFilestreamBOMUTF8(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - "paths": []string{env.abspath(testlogName)}, + "id": id, + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) // BOM: 0xEF,0xBB,0xBF @@ -315,9 +327,11 @@ func TestFilestreamUTF16BOMs(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - "paths": []string{env.abspath(testlogName)}, - "encoding": name, + "id": id, + "paths": []string{env.abspath(testlogName)}, + "encoding": name, + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) line := 
[]byte("first line\n") @@ -348,11 +362,13 @@ func TestFilestreamCloseTimeout(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "24h", - "close.on_state_change.check_interval": "100ms", - "close.reader.after_interval": "500ms", + "id": id, + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "24h", + "close.on_state_change.check_interval": "100ms", + "close.reader.after_interval": "500ms", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) testlines := []byte("first line\n") @@ -382,11 +398,13 @@ func TestFilestreamCloseAfterInterval(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "24h", - "close.on_state_change.check_interval": "100ms", - "close.on_state_change.inactive": "2s", + "id": id, + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "24h", + "close.on_state_change.check_interval": "100ms", + "close.on_state_change.inactive": "2s", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) testlines := []byte("first line\nsecond line\nthird line\n") @@ -417,7 +435,9 @@ func TestFilestreamCloseAfterIntervalRemoved(t *testing.T) { "close.on_state_change.inactive": "100ms", // reader is not stopped when file is removed to see if the reader can still detect // if the file has been inactive even if it have been removed in the meantime - "close.on_state_change.removed": "false", + "close.on_state_change.removed": "false", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) testlines := 
[]byte("first line\nsecond line\nthird line\n") @@ -450,7 +470,9 @@ func TestFilestreamCloseAfterIntervalRenamed(t *testing.T) { "close.on_state_change.inactive": "100ms", // reader is not stopped when file is removed to see if the reader can still detect // if the file has been inactive even if it have been removed in the meantime - "close.on_state_change.removed": "false", + "close.on_state_change.removed": "false", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) testlines := []byte("first line\nsecond line\nthird line\n") @@ -485,7 +507,9 @@ func TestFilestreamCloseAfterIntervalRotatedAndRemoved(t *testing.T) { "close.on_state_change.inactive": "100ms", // reader is not stopped when file is removed to see if the reader can still detect // if the file has been inactive even if it have been removed in the meantime - "close.on_state_change.removed": "false", + "close.on_state_change.removed": "false", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) testlines := []byte("first line\nsecond line\nthird line\n") @@ -558,10 +582,12 @@ func TestFilestreamTruncatedFileOpen(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", - "prospector.scanner.resend_on_touch": "true", + "id": id, + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "prospector.scanner.resend_on_touch": "true", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) ctx, cancelInput := context.WithCancel(context.Background()) @@ -592,11 +618,13 @@ func TestFilestreamTruncatedFileClosed(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ 
- "id": id, - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", - "prospector.scanner.resend_on_touch": "true", - "close.reader.on_eof": "true", + "id": id, + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "prospector.scanner.resend_on_touch": "true", + "close.reader.on_eof": "true", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) ctx, cancelInput := context.WithCancel(context.Background()) @@ -635,9 +663,11 @@ func TestFilestreamTruncateWithSymlink(t *testing.T) { env.abspath(testlogName), env.abspath(symlinkName), }, - "prospector.scanner.check_interval": "1ms", - "prospector.scanner.resend_on_touch": "true", - "prospector.scanner.symlinks": "true", + "prospector.scanner.check_interval": "1ms", + "prospector.scanner.resend_on_touch": "true", + "prospector.scanner.symlinks": "true", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) lines := []byte("first line\nsecond line\nthird line\n") @@ -707,10 +737,12 @@ func TestFilestreamTruncateCheckOffset(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", - "prospector.scanner.resend_on_touch": "true", + "id": id, + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "prospector.scanner.resend_on_touch": "true", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) ctx, cancelInput := context.WithCancel(context.Background()) @@ -737,9 +769,11 @@ func TestFilestreamTruncateBlockedOutput(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - "paths": 
[]string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "200ms", + "id": id, + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "200ms", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) testlines := []byte("first line\nsecond line\n") @@ -792,7 +826,9 @@ func TestFilestreamSymlinksEnabled(t *testing.T) { "paths": []string{ env.abspath(symlinkName), }, - "prospector.scanner.symlinks": "true", + "prospector.scanner.symlinks": "true", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) testlines := []byte("first line\n") @@ -824,10 +860,12 @@ func TestFilestreamSymlinkRotated(t *testing.T) { "paths": []string{ env.abspath(symlinkName), }, - "prospector.scanner.check_interval": "1ms", - "prospector.scanner.symlinks": "true", - "close.on_state_change.removed": "false", - "clean_removed": "false", + "prospector.scanner.check_interval": "1ms", + "prospector.scanner.symlinks": "true", + "close.on_state_change.removed": "false", + "clean_removed": "false", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) commonLine := "first line in file " @@ -874,10 +912,12 @@ func TestFilestreamSymlinkRemoved(t *testing.T) { "paths": []string{ env.abspath(symlinkName), }, - "prospector.scanner.check_interval": "1ms", - "prospector.scanner.symlinks": "true", - "close.on_state_change.removed": "false", - "clean_removed": "false", + "prospector.scanner.check_interval": "1ms", + "prospector.scanner.symlinks": "true", + "close.on_state_change.removed": "false", + "clean_removed": "false", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) line := []byte("first line\n") @@ -918,9 +958,11 @@ func TestFilestreamTruncate(t *testing.T) { "paths": []string{ env.abspath("*"), }, - "prospector.scanner.check_interval": "1ms", - 
"prospector.scanner.resend_on_touch": "true", - "prospector.scanner.symlinks": "true", + "prospector.scanner.check_interval": "1ms", + "prospector.scanner.resend_on_touch": "true", + "prospector.scanner.symlinks": "true", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) lines := []byte("first line\nsecond line\nthird line\n") @@ -978,6 +1020,8 @@ func TestFilestreamHarvestAllFilesWhenHarvesterLimitExceeded(t *testing.T) { "paths": []string{ env.abspath(logFiles[0].path), env.abspath(logFiles[1].path)}, + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) @@ -994,8 +1038,10 @@ func TestGlobalIDCannotBeUsed(t *testing.T) { env := newInputTestingEnvironment(t) testlogName := "test.log" _, err := env.createInput(map[string]interface{}{ - "id": ".global", - "paths": []string{env.abspath(testlogName) + "*"}, + "id": ".global", + "paths": []string{env.abspath(testlogName) + "*"}, + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) if err == nil { t.Fatal("expecting an error because '.global' cannot be used as input ID") @@ -1013,10 +1059,12 @@ func TestRotatingCloseInactiveLargerWriteRate(t *testing.T) { "paths": []string{ env.abspath("*"), }, - "prospector.scanner.check_interval": "100ms", - "close.on_state_change.check_interval": "1s", - "close.on_state_change.inactive": "5s", - "ignore_older": "10s", + "prospector.scanner.check_interval": "100ms", + "close.on_state_change.check_interval": "1s", + "close.on_state_change.inactive": "5s", + "ignore_older": "10s", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) ctx, cancelInput := context.WithCancel(context.Background()) @@ -1060,10 +1108,12 @@ func TestRotatingCloseInactiveLowWriteRate(t *testing.T) { "paths": []string{ env.abspath("*"), }, - 
"prospector.scanner.check_interval": "1ms", - "close.on_state_change.check_interval": "1ms", - "close.on_state_change.inactive": "1s", - "ignore_older": "10s", + "prospector.scanner.check_interval": "1ms", + "close.on_state_change.check_interval": "1ms", + "close.on_state_change.inactive": "1s", + "ignore_older": "10s", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) ctx, cancelInput := context.WithCancel(context.Background()) From 9758447b09134d9959f387fb3fae5ffb375f0d44 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 12 Dec 2024 16:55:08 -0500 Subject: [PATCH 25/28] Fix more tests --- filebeat/input/filestream/environment_test.go | 11 ++++++++--- filebeat/input/filestream/input_integration_test.go | 10 ++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/filebeat/input/filestream/environment_test.go b/filebeat/input/filestream/environment_test.go index f9804bb16f32..58e048b4b40f 100644 --- a/filebeat/input/filestream/environment_test.go +++ b/filebeat/input/filestream/environment_test.go @@ -448,9 +448,14 @@ func (e *inputTestingEnvironment) waitUntilAtLeastEventCount(count int) { // waitUntilHarvesterIsDone detects Harvester stop by checking if the last client has been closed // as when a Harvester stops the client is closed. func (e *inputTestingEnvironment) waitUntilHarvesterIsDone() { - for !e.pipeline.clients[len(e.pipeline.clients)-1].closed { - time.Sleep(10 * time.Millisecond) - } + require.Eventually( + e.t, + func() bool { + return e.pipeline.clients[len(e.pipeline.clients)-1].closed + }, + time.Second*10, + time.Millisecond*10, + "The last connected client has not closed it's connection") } // requireEventsReceived requires that the list of messages has made it into the output. 
diff --git a/filebeat/input/filestream/input_integration_test.go b/filebeat/input/filestream/input_integration_test.go index 3d468bb23c04..a8a950e377f4 100644 --- a/filebeat/input/filestream/input_integration_test.go +++ b/filebeat/input/filestream/input_integration_test.go @@ -179,10 +179,12 @@ func TestFilestreamCloseEOF(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "24h", - "close.reader.on_eof": "true", + "id": id, + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "24h", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, + "close.reader.on_eof": "true", }) testlines := []byte("first log line\n") From 68c4a642a7e9e38232999f8ac17f77c92ec40929 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Fri, 13 Dec 2024 18:44:04 -0500 Subject: [PATCH 26/28] Fix more tests --- filebeat/input/filestream/environment_test.go | 1 + .../filestream/input_integration_test.go | 24 ++-- .../legacy_metrics_integration_test.go | 8 +- .../filestream/metrics_integration_test.go | 12 +- .../filestream/parsers_integration_test.go | 110 +++++++++++------- 5 files changed, 97 insertions(+), 58 deletions(-) diff --git a/filebeat/input/filestream/environment_test.go b/filebeat/input/filestream/environment_test.go index 58e048b4b40f..80460d6b3b4a 100644 --- a/filebeat/input/filestream/environment_test.go +++ b/filebeat/input/filestream/environment_test.go @@ -386,6 +386,7 @@ func getIDFromPath(filepath, inputID string, fi os.FileInfo) string { // waitUntilEventCount waits until total count events arrive to the client. 
func (e *inputTestingEnvironment) waitUntilEventCount(count int) { + e.t.Helper() msg := &strings.Builder{} require.Eventuallyf(e.t, func() bool { msg.Reset() diff --git a/filebeat/input/filestream/input_integration_test.go b/filebeat/input/filestream/input_integration_test.go index a8a950e377f4..5c063481dd53 100644 --- a/filebeat/input/filestream/input_integration_test.go +++ b/filebeat/input/filestream/input_integration_test.go @@ -55,9 +55,9 @@ func TestFilestreamCloseRenamed(t *testing.T) { "id": id, "paths": []string{env.abspath(testlogName) + "*"}, "prospector.scanner.check_interval": "10ms", - "prospector.scanner.fingerprint.enabled": false, "close.on_state_change.check_interval": "1ms", "close.on_state_change.renamed": "true", + "prospector.scanner.fingerprint.enabled": false, "file_identity.native": map[string]any{}, }) @@ -540,11 +540,13 @@ func TestFilestreamCloseAfterIntervalRotatedAndNewRemoved(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", - "close.on_state_change.check_interval": "10ms", - "close.on_state_change.inactive": "100ms", + "id": id, + "paths": []string{env.abspath(testlogName)}, + "file_identity.native": map[string]any{}, + "prospector.scanner.fingerprint.enabled": "false", + "prospector.scanner.check_interval": "1ms", + "close.on_state_change.check_interval": "10ms", + "close.on_state_change.inactive": "100ms", // reader is not stopped when file is removed to see if the reader can still detect // if the file has been inactive even if it have been removed in the meantime "close.on_state_change.removed": "false", @@ -707,10 +709,12 @@ func TestFilestreamTruncateBigScannerInterval(t *testing.T) { testlogName := "test.log" id := "fake-ID-" + uuid.Must(uuid.NewV4()).String() inp := env.mustCreateInput(map[string]interface{}{ - "id": id, - 
"paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "5s", - "prospector.scanner.resend_on_touch": "true", + "id": id, + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "5s", + "prospector.scanner.resend_on_touch": "true", + "file_identity.native": map[string]any{}, + "prospector.scanner.fingerprint.enabled": false, }) ctx, cancelInput := context.WithCancel(context.Background()) diff --git a/filebeat/input/filestream/legacy_metrics_integration_test.go b/filebeat/input/filestream/legacy_metrics_integration_test.go index 649ede41f3e2..ec2e18a3706b 100644 --- a/filebeat/input/filestream/legacy_metrics_integration_test.go +++ b/filebeat/input/filestream/legacy_metrics_integration_test.go @@ -41,6 +41,8 @@ filebeat.inputs: enabled: true close.reader.after_interval: 1s prospector.scanner.check_interval: 500ms + file_identity.native: ~ + prospector.scanner.fingerprint.enabled: false paths: - %s/*.filestream - type: log @@ -48,6 +50,8 @@ filebeat.inputs: enabled: true close_timeout: 1s scan_frequency: 500ms + file_identity.native: ~ + prospector.scanner.fingerprint.enabled: false paths: - %s/*.log @@ -71,7 +75,9 @@ func TestLegacyMetrics(t *testing.T) { filebeat.WriteConfigFile(cfg) filebeat.Start() - filebeat.WaitForLogs("Metrics endpoint listening on:", 10*time.Second) + filebeat.WaitForLogs("Metrics endpoint listening on:", + 10*time.Second, + "metrics endpoint did not start") // After starting Filebeat all counters must be zero waitForMetrics(t, diff --git a/filebeat/input/filestream/metrics_integration_test.go b/filebeat/input/filestream/metrics_integration_test.go index 3671f076d0ed..b551b2321b76 100644 --- a/filebeat/input/filestream/metrics_integration_test.go +++ b/filebeat/input/filestream/metrics_integration_test.go @@ -33,11 +33,13 @@ func TestFilestreamMetrics(t *testing.T) { testlogName := "test.log" inp := env.mustCreateInput(map[string]interface{}{ - "id": "fake-ID", - "paths": 
[]string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "24h", - "close.on_state_change.check_interval": "100ms", - "close.on_state_change.inactive": "2s", + "id": "fake-ID", + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "24h", + "close.on_state_change.check_interval": "100ms", + "close.on_state_change.inactive": "2s", + "prospector.scanner.fingerprint.enabled": false, + "file_identity.native": map[string]any{}, }) testlines := []byte("first line\nsecond line\nthird line\n") diff --git a/filebeat/input/filestream/parsers_integration_test.go b/filebeat/input/filestream/parsers_integration_test.go index 619d39f05126..858f4e6d1ce3 100644 --- a/filebeat/input/filestream/parsers_integration_test.go +++ b/filebeat/input/filestream/parsers_integration_test.go @@ -29,9 +29,11 @@ func TestParsersAgentLogs(t *testing.T) { testlogName := "test.log" inp := env.mustCreateInput(map[string]interface{}{ - "id": "fake-ID", - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", + "id": "fake-ID", + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "file_identity.native": map[string]any{}, + "prospector.scanner.fingerprint.enabled": false, "parsers": []map[string]interface{}{ { "ndjson": map[string]interface{}{ @@ -65,9 +67,11 @@ func TestParsersIncludeMessage(t *testing.T) { testlogName := "test.log" readLine := "include this" inp := env.mustCreateInput(map[string]interface{}{ - "id": "fake-ID", - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "100ms", + "id": "fake-ID", + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "100ms", + "file_identity.native": map[string]any{}, + "prospector.scanner.fingerprint.enabled": false, "parsers": []map[string]interface{}{ { "include_message": map[string]interface{}{ @@ -98,9 +102,11 @@ func TestParsersDockerLogsFiltering(t *testing.T) { 
testlogName := "test.log" inp := env.mustCreateInput(map[string]interface{}{ - "id": "fake-ID", - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", + "id": "fake-ID", + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "file_identity.native": map[string]any{}, + "prospector.scanner.fingerprint.enabled": false, "parsers": []map[string]interface{}{ { "ndjson": map[string]interface{}{ @@ -137,9 +143,11 @@ func TestParsersSimpleJSONOverwrite(t *testing.T) { testlogName := "test.log" inp := env.mustCreateInput(map[string]interface{}{ - "id": "fake-ID", - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", + "id": "fake-ID", + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "file_identity.native": map[string]any{}, + "prospector.scanner.fingerprint.enabled": false, "parsers": []map[string]interface{}{ { "ndjson": map[string]interface{}{ @@ -173,9 +181,11 @@ func TestParsersTimestampInJSONMessage(t *testing.T) { testlogName := "test.log" inp := env.mustCreateInput(map[string]interface{}{ - "id": "fake-ID", - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", + "id": "fake-ID", + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "file_identity.native": map[string]any{}, + "prospector.scanner.fingerprint.enabled": false, "parsers": []map[string]interface{}{ { "ndjson": map[string]interface{}{ @@ -214,9 +224,11 @@ func TestParsersJavaElasticsearchLogs(t *testing.T) { testlogName := "test.log" inp := env.mustCreateInput(map[string]interface{}{ - "id": "fake-ID", - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", + "id": "fake-ID", + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "file_identity.native": map[string]any{}, + 
"prospector.scanner.fingerprint.enabled": false, "parsers": []map[string]interface{}{ { "multiline": map[string]interface{}{ @@ -249,9 +261,11 @@ func TestParsersCStyleLog(t *testing.T) { testlogName := "test.log" inp := env.mustCreateInput(map[string]interface{}{ - "id": "fake-ID", - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", + "id": "fake-ID", + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "file_identity.native": map[string]any{}, + "prospector.scanner.fingerprint.enabled": false, "parsers": []map[string]interface{}{ { "multiline": map[string]interface{}{ @@ -290,9 +304,11 @@ func TestParsersRabbitMQMultilineLog(t *testing.T) { testlogName := "test.log" inp := env.mustCreateInput(map[string]interface{}{ - "id": "fake-ID", - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", + "id": "fake-ID", + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "file_identity.native": map[string]any{}, + "prospector.scanner.fingerprint.enabled": false, "parsers": []map[string]interface{}{ { "multiline": map[string]interface{}{ @@ -335,9 +351,11 @@ func TestParsersMultilineMaxLines(t *testing.T) { testlogName := "test.log" inp := env.mustCreateInput(map[string]interface{}{ - "id": "fake-ID", - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", + "id": "fake-ID", + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "file_identity.native": map[string]any{}, + "prospector.scanner.fingerprint.enabled": false, "parsers": []map[string]interface{}{ { "multiline": map[string]interface{}{ @@ -379,9 +397,11 @@ func TestParsersMultilineTimeout(t *testing.T) { testlogName := "test.log" inp := env.mustCreateInput(map[string]interface{}{ - "id": "fake-ID", - "paths": []string{env.abspath(testlogName)}, - 
"prospector.scanner.check_interval": "1ms", + "id": "fake-ID", + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "file_identity.native": map[string]any{}, + "prospector.scanner.fingerprint.enabled": false, "parsers": []map[string]interface{}{ { "multiline": map[string]interface{}{ @@ -444,10 +464,12 @@ func TestParsersMultilineMaxBytes(t *testing.T) { testlogName := "test.log" inp := env.mustCreateInput(map[string]interface{}{ - "id": "fake-ID", - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", - "message_max_bytes": 50, + "id": "fake-ID", + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "message_max_bytes": 50, + "file_identity.native": map[string]any{}, + "prospector.scanner.fingerprint.enabled": false, "parsers": []map[string]interface{}{ { "multiline": map[string]interface{}{ @@ -486,10 +508,12 @@ func TestParsersCloseTimeoutWithMultiline(t *testing.T) { testlogName := "test.log" inp := env.mustCreateInput(map[string]interface{}{ - "id": "fake-ID", - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", - "close.reader.after_interval": "1s", + "id": "fake-ID", + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + "close.reader.after_interval": "1s", + "file_identity.native": map[string]any{}, + "prospector.scanner.fingerprint.enabled": false, "parsers": []map[string]interface{}{ { "multiline": map[string]interface{}{ @@ -551,10 +575,12 @@ func TestParsersConsecutiveNewline(t *testing.T) { testlogName := "test.log" inp := env.mustCreateInput(map[string]interface{}{ - "id": "fake-ID", - "paths": []string{env.abspath(testlogName)}, - "prospector.scanner.check_interval": "1ms", - "close.reader.after_interval": "1s", + "id": "fake-ID", + "paths": []string{env.abspath(testlogName)}, + "prospector.scanner.check_interval": "1ms", + 
"close.reader.after_interval": "1s", + "file_identity.native": map[string]any{}, + "prospector.scanner.fingerprint.enabled": false, "parsers": []map[string]interface{}{ { "multiline": map[string]interface{}{ From 889302967ca2d2c3bea25adf4395b085452ca723 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 19 Dec 2024 12:07:38 -0500 Subject: [PATCH 27/28] implement review suggestions --- filebeat/_meta/config/filebeat.inputs.reference.yml.tmpl | 3 ++- filebeat/input/filestream/prospector_test.go | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/filebeat/_meta/config/filebeat.inputs.reference.yml.tmpl b/filebeat/_meta/config/filebeat.inputs.reference.yml.tmpl index cf6efb550d17..5e44bcdb09e5 100644 --- a/filebeat/_meta/config/filebeat.inputs.reference.yml.tmpl +++ b/filebeat/_meta/config/filebeat.inputs.reference.yml.tmpl @@ -438,7 +438,8 @@ filebeat.inputs: #clean_removed: true # Method to determine if two files are the same or not. By default - # the Beat considers two files the same if their inode and device id are the same. + # a fingerprint is generated using the first 1024 bytes of the file, + # if the fingerprints match, then the files are considered equal. #file_identity.fingerprint: ~ # Optional additional fields. 
These fields can be freely picked diff --git a/filebeat/input/filestream/prospector_test.go b/filebeat/input/filestream/prospector_test.go index 79a9a90df208..c1e806e3948d 100644 --- a/filebeat/input/filestream/prospector_test.go +++ b/filebeat/input/filestream/prospector_test.go @@ -101,7 +101,7 @@ func TestProspector_InitCleanIfRemoved(t *testing.T) { } func TestProspector_InitUpdateIdentifiers(t *testing.T) { - f, err := os.CreateTemp("", "existing_file") + f, err := os.CreateTemp(t.TempDir(), "existing_file") if err != nil { t.Fatalf("cannot create temp file") } From d516a86268a09c67a078529a90b840f36bab1010 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 19 Dec 2024 12:30:42 -0500 Subject: [PATCH 28/28] update generated files --- filebeat/filebeat.reference.yml | 3 ++- x-pack/filebeat/filebeat.reference.yml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/filebeat/filebeat.reference.yml b/filebeat/filebeat.reference.yml index 6759a9d614b8..54e3cd433b3c 100644 --- a/filebeat/filebeat.reference.yml +++ b/filebeat/filebeat.reference.yml @@ -851,7 +851,8 @@ filebeat.inputs: #clean_removed: true # Method to determine if two files are the same or not. By default - # the Beat considers two files the same if their inode and device id are the same. + # a fingerprint is generated using the first 1024 bytes of the file, + # if the fingerprints match, then the files are considered equal. #file_identity.fingerprint: ~ # Optional additional fields. These fields can be freely picked diff --git a/x-pack/filebeat/filebeat.reference.yml b/x-pack/filebeat/filebeat.reference.yml index 338909669a4b..3e831d99fa28 100644 --- a/x-pack/filebeat/filebeat.reference.yml +++ b/x-pack/filebeat/filebeat.reference.yml @@ -2535,7 +2535,8 @@ filebeat.inputs: #clean_removed: true # Method to determine if two files are the same or not. By default - # the Beat considers two files the same if their inode and device id are the same. 
+ # a fingerprint is generated using the first 1024 bytes of the file, + # if the fingerprints match, then the files are considered equal. #file_identity.fingerprint: ~ # Optional additional fields. These fields can be freely picked