From f2f13b2dc0d00e886393c89d0610f3ccf6b31daa Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Sun, 14 Jan 2024 20:52:02 -0800 Subject: [PATCH 01/29] Write large diffs to tmp files --- pkg/gitparse/gitparse.go | 180 ++++++++++++++++++++++++++++++++------- pkg/sources/git/git.go | 94 ++++++++++++++------ 2 files changed, 217 insertions(+), 57 deletions(-) diff --git a/pkg/gitparse/gitparse.go b/pkg/gitparse/gitparse.go index dcd544c85531..5b3049248030 100644 --- a/pkg/gitparse/gitparse.go +++ b/pkg/gitparse/gitparse.go @@ -4,7 +4,6 @@ import ( "bufio" "bytes" "fmt" - "github.com/go-logr/logr" "io" "os" "os/exec" @@ -13,6 +12,9 @@ import ( "strings" "time" + "github.com/go-logr/logr" + + "github.com/trufflesecurity/trufflehog/v3/pkg/cleantemp" "github.com/trufflesecurity/trufflehog/v3/pkg/common" "github.com/trufflesecurity/trufflehog/v3/pkg/context" ) @@ -40,10 +42,86 @@ type Commit struct { // Diff contains the info about a file diff in a commit. type Diff struct { - PathB string - LineStart int - Content bytes.Buffer - IsBinary bool + PathB string + LineStart int + Content bytes.Buffer // Keep in-memory buffer for smaller diffs + streamDestination *os.File // File destination for larger diffs + IsBinary bool + threshold int // Size threshold to switch to file +} + +type diffOption func(*Diff) + +// withPathB sets the PathB option. +func withPathB(pathB string) diffOption { return func(d *Diff) { d.PathB = pathB } } + +// withThreshold sets the threshold option. +// TODO: Leverage this option in the future. +func withThreshold(threshold int) diffOption { return func(d *Diff) { d.threshold = threshold } } + +// NewDiff creates a new Diff with a threshold. +func NewDiff(opts ...diffOption) *Diff { + const defaultThreshold = 20 * 1024 * 1024 // 20MB + d := &Diff{threshold: defaultThreshold} + for _, opt := range opts { + opt(d) + } + + return d +} + +// write handles writing diff data to either an in-memory buffer or a file, depending on the size. +func (d *Diff) write(ctx context.Context, p []byte) error { + if d.Content.Len()+len(p) <= d.threshold { + // If the total size is within the threshold, write to the buffer. + ctx.Logger().V(4).Info( + "writing to buffer", + "data_size", len(p), + "content_size", d.Content.Len(), + ) + _, err := d.Content.Write(p) + return err + } + // Switch to file writing if threshold is exceeded. + // This helps in managing memory efficiently for large diffs. + if d.streamDestination == nil { + var err error + d.streamDestination, err = os.CreateTemp(os.TempDir(), cleantemp.MkFilename()) + if err != nil { + return err + } + + // Transfer existing data in buffer to the file, then clear the buffer. + // This ensures all the diff data is in one place - either entirely in the buffer or the file. + if d.Content.Len() > 0 { + ctx.Logger().V(4).Info("writing buffer to file", "content_size", d.Content.Len()) + if _, err := d.streamDestination.Write(d.Content.Bytes()); err != nil { + return err + } + // Replace the buffer with a new one to free up memory. + d.Content = bytes.Buffer{} + } + } + ctx.Logger().V(4).Info("writing to file", "data_size", len(p)) + + _, err := d.streamDestination.Write(p) + return err +} + +// finalize ensures proper closure of resources associated with the Diff. +// handle the final flush in the finalize method, in case there's data remaining in the buffer. +// This method should be called to release resources, especially when writing to a file. +func (d *Diff) finalize() error { + if d.streamDestination == nil { + return nil + } + + if d.Content.Len() > 0 { + if _, err := d.streamDestination.Write(d.Content.Bytes()); err != nil { + return err + } + } + return d.streamDestination.Close() } // Parser sets values used in GitParse. @@ -53,6 +131,51 @@ type Parser struct { dateFormat string } +// noOpCloser wraps an io.Reader to add a no-op Close method, forming an io.ReadCloser. +type noOpCloser struct{ io.Reader } + +// Close performs no operation (no-op) and returns nil. +// It's used to fulfill the io.Closer interface. +func (noc *noOpCloser) Close() error { return nil } + +// DiffContentReadCloser returns an io.ReadCloser for reading the content of a Diff. +// If the diff content size exceeds a predefined threshold, it is stored in a temporary file, +// and the function returns an auto-deleting file reader (newAutoDeletingFileReader) to read from this file. +// For smaller diffs that fit within the threshold, the content is kept in memory, +// and the function returns a no-op closer wrapper (noOpCloser) around a bytes.Reader. +// The caller is responsible for calling Close on the returned io.ReadCloser in both cases. +func DiffContentReadCloser(d *Diff) (io.ReadCloser, error) { + if d.streamDestination != nil { + // Data is in a file, read from the file. + file, err := os.Open(d.streamDestination.Name()) + if err != nil { + return nil, err + } + return newAutoDeletingFileReader(file), nil + } + // Data is in memory. + return &noOpCloser{Reader: bytes.NewReader(d.Content.Bytes())}, nil +} + +// autoDeletingFileReader wraps an *os.File and deletes the file on Close +type autoDeletingFileReader struct{ file *os.File } + +// newAutoDeletingFileReader creates a new autoDeletingFileReader +func newAutoDeletingFileReader(file *os.File) *autoDeletingFileReader { + return &autoDeletingFileReader{file: file} +} + +// Read implements the io.Reader interface +func (r *autoDeletingFileReader) Read(p []byte) (int, error) { + return r.file.Read(p) +} + +// Close implements the io.Closer interface, deletes the file after closing +func (r *autoDeletingFileReader) Close() error { + defer os.Remove(r.file.Name()) // Delete the file after closing + return r.file.Close() +} + type ParseState int const ( @@ -254,11 +377,11 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch outReader := bufio.NewReader(stdOut) var ( currentCommit *Commit - currentDiff Diff totalLogSize int ) var latestState = Initial + currentDiff := NewDiff() defer common.RecoverWithExit(ctx) defer close(commitChan) @@ -278,19 +401,20 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch // If there is a currentDiff, add it to currentCommit. if currentDiff.Content.Len() > 0 || currentDiff.IsBinary { - currentCommit.Diffs = append(currentCommit.Diffs, currentDiff) + currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff) currentCommit.Size += currentDiff.Content.Len() } // If there is a currentCommit, send it to the channel. if currentCommit != nil { + if err := currentDiff.finalize(); err != nil { + ctx.Logger().Error(err, "failed to finalize diff") + } commitChan <- *currentCommit totalLogSize += currentCommit.Size } // Create a new currentDiff and currentCommit - currentDiff = Diff{} - currentCommit = &Commit{ - Message: strings.Builder{}, - } + currentDiff = NewDiff() + currentCommit = &Commit{Message: strings.Builder{}} // Check that the commit line contains a hash and set it. if len(line) >= 47 { currentCommit.Hash = string(line[7:47]) @@ -327,7 +451,10 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch currentCommit = &Commit{} } if currentDiff.Content.Len() > 0 || currentDiff.IsBinary { - currentCommit.Diffs = append(currentCommit.Diffs, currentDiff) + currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff) + if err := currentDiff.finalize(); err != nil { + ctx.Logger().Error(err, "failed to finalize diff") + } // If the currentDiff is over 1GB, drop it into the channel so it isn't held in memory waiting for more commits. totalSize := 0 for _, diff := range currentCommit.Diffs { @@ -348,7 +475,7 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch currentCommit.Message.WriteString(oldCommit.Message.String()) } } - currentDiff = Diff{} + currentDiff = NewDiff() case isModeLine(isStaged, latestState, line): latestState = ModeLine // NoOp @@ -376,11 +503,9 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch latestState = HunkLineNumberLine if currentDiff.Content.Len() > 0 || currentDiff.IsBinary { - currentCommit.Diffs = append(currentCommit.Diffs, currentDiff) - } - currentDiff = Diff{ - PathB: currentDiff.PathB, + currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff) } + currentDiff = NewDiff(withPathB(currentDiff.PathB)) words := bytes.Split(line, []byte(" ")) if len(words) >= 3 { @@ -395,24 +520,21 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch latestState = HunkContentLine } // TODO: Why do we care about this? It creates empty lines in the diff. If there are no plusLines, it's just newlines. - currentDiff.Content.Write([]byte("\n")) + if err := currentDiff.write(ctx, []byte("\n")); err != nil { + ctx.Logger().Error(err, "failed to write to diff") + } case isHunkPlusLine(isStaged, latestState, line): if latestState != HunkContentLine { latestState = HunkContentLine } - currentDiff.Content.Write(line[1:]) - case isHunkMinusLine(isStaged, latestState, line): - if latestState != HunkContentLine { - latestState = HunkContentLine + if err := currentDiff.write(ctx, line[1:]); err != nil { + ctx.Logger().Error(err, "failed to write to diff") } // NoOp. We only care about additions. - case isHunkNewlineWarningLine(isStaged, latestState, line): - if latestState != HunkContentLine { - latestState = HunkContentLine - } - // NoOp - case isHunkEmptyLine(isStaged, latestState, line): + case isHunkMinusLine(isStaged, latestState, line), + isHunkNewlineWarningLine(isStaged, latestState, line), + isHunkEmptyLine(isStaged, latestState, line): if latestState != HunkContentLine { latestState = HunkContentLine } @@ -446,7 +568,7 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch break } } - cleanupParse(currentCommit, ¤tDiff, commitChan, &totalLogSize) + cleanupParse(currentCommit, currentDiff, commitChan, &totalLogSize) ctx.Logger().V(2).Info("finished parsing git log.", "total_log_size", totalLogSize) } diff --git a/pkg/sources/git/git.go b/pkg/sources/git/git.go index 1237c904b251..81f96d68d6ab 100644 --- a/pkg/sources/git/git.go +++ b/pkg/sources/git/git.go @@ -18,13 +18,12 @@ import ( "github.com/go-git/go-git/v5/plumbing" "github.com/go-git/go-git/v5/plumbing/object" "github.com/google/go-github/v42/github" + diskbufferreader "github.com/trufflesecurity/disk-buffer-reader" "golang.org/x/oauth2" "golang.org/x/sync/semaphore" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" - diskbufferreader "github.com/trufflesecurity/disk-buffer-reader" - "github.com/trufflesecurity/trufflehog/v3/pkg/cleantemp" "github.com/trufflesecurity/trufflehog/v3/pkg/common" "github.com/trufflesecurity/trufflehog/v3/pkg/context" @@ -500,6 +499,7 @@ func (s *Git) ScanCommits(ctx context.Context, repo *git.Repository, path string atomic.AddUint64(&s.metrics.commitsScanned, 1) logger.V(5).Info("scanning commit", "commit", commit.Hash) for _, diff := range commit.Diffs { + diff := diff if !scanOptions.Filter.Pass(diff.PathB) { continue } @@ -532,20 +532,37 @@ func (s *Git) ScanCommits(ctx context.Context, repo *git.Repository, path string } if diff.Content.Len() > sources.ChunkSize+sources.PeekSize { - s.gitChunk(ctx, diff, fileName, email, hash, when, remoteURL, reporter) + s.gitChunk(ctx, &diff, fileName, email, hash, when, remoteURL, reporter) continue } - metadata := s.sourceMetadataFunc(fileName, email, hash, when, remoteURL, int64(diff.LineStart)) - chunk := sources.Chunk{ - SourceName: s.sourceName, - SourceID: s.sourceID, - JobID: s.jobID, - SourceType: s.sourceType, - SourceMetadata: metadata, - Data: diff.Content.Bytes(), - Verify: s.verify, + + chunkData := func(d *gitparse.Diff) error { + metadata := s.sourceMetadataFunc(fileName, email, hash, when, remoteURL, int64(diff.LineStart)) + + reader, err := gitparse.DiffContentReadCloser(d) + if err != nil { + ctx.Logger().Error(err, "error creating reader for commits", "filename", fileName, "commit", hash, "file", diff.PathB) + return nil + } + defer reader.Close() + + data := make([]byte, diff.Content.Len()) + if _, err := reader.Read(data); err != nil { + ctx.Logger().Error(err, "error reading diff content for commit", "filename", fileName, "commit", hash, "file", diff.PathB) + return nil + } + chunk := sources.Chunk{ + SourceName: s.sourceName, + SourceID: s.sourceID, + JobID: s.jobID, + SourceType: s.sourceType, + SourceMetadata: metadata, + Data: data, + Verify: s.verify, + } + return reporter.ChunkOk(ctx, chunk) } - if err := reporter.ChunkOk(ctx, chunk); err != nil { + if err := chunkData(&diff); err != nil { return err } } @@ -553,8 +570,15 @@ func (s *Git) ScanCommits(ctx context.Context, repo *git.Repository, path string return nil } -func (s *Git) gitChunk(ctx context.Context, diff gitparse.Diff, fileName, email, hash, when, urlMetadata string, reporter sources.ChunkReporter) { - originalChunk := bufio.NewScanner(&diff.Content) +func (s *Git) gitChunk(ctx context.Context, diff *gitparse.Diff, fileName, email, hash, when, urlMetadata string, reporter sources.ChunkReporter) { + reader, err := gitparse.DiffContentReadCloser(diff) + if err != nil { + ctx.Logger().Error(err, "error creating reader for chunk", "filename", fileName, "commit", hash, "file", diff.PathB) + return + } + defer reader.Close() + + originalChunk := bufio.NewScanner(reader) newChunkBuffer := bytes.Buffer{} lastOffset := 0 for offset := 0; originalChunk.Scan(); offset++ { @@ -646,6 +670,7 @@ func (s *Git) ScanStaged(ctx context.Context, repo *git.Repository, path string, ctx.Logger().V(1).Info("scanning staged changes", "path", path) for commit := range commitChan { for _, diff := range commit.Diffs { + diff := diff logger := ctx.Logger().WithValues("filename", diff.PathB, "commit", commit.Hash, "file", diff.PathB) logger.V(2).Info("scanning staged changes from git") @@ -695,17 +720,33 @@ func (s *Git) ScanStaged(ctx context.Context, repo *git.Repository, path string, continue } - metadata := s.sourceMetadataFunc(fileName, email, "Staged", when, urlMetadata, int64(diff.LineStart)) - chunk := sources.Chunk{ - SourceName: s.sourceName, - SourceID: s.sourceID, - JobID: s.jobID, - SourceType: s.sourceType, - SourceMetadata: metadata, - Data: diff.Content.Bytes(), - Verify: s.verify, + chunkData := func(d *gitparse.Diff) error { + metadata := s.sourceMetadataFunc(fileName, email, "Staged", when, urlMetadata, int64(diff.LineStart)) + + reader, err := gitparse.DiffContentReadCloser(d) + if err != nil { + ctx.Logger().Error(err, "error creating reader for staged", "filename", fileName, "commit", hash, "file", diff.PathB) + return nil + } + defer reader.Close() + + data := make([]byte, diff.Content.Len()) + if _, err := reader.Read(data); err != nil { + ctx.Logger().Error(err, "error reading diff content for staged", "filename", fileName, "commit", hash, "file", diff.PathB) + return nil + } + chunk := sources.Chunk{ + SourceName: s.sourceName, + SourceID: s.sourceID, + JobID: s.jobID, + SourceType: s.sourceType, + SourceMetadata: metadata, + Data: data, + Verify: s.verify, + } + return reporter.ChunkOk(ctx, chunk) } - if err := reporter.ChunkOk(ctx, chunk); err != nil { + if err := chunkData(&diff); err != nil { return err } } @@ -1040,9 +1081,6 @@ func (s *Git) handleBinary(ctx context.Context, gitDir string, reporter sources. return err } defer func() { - if err := fileReader.Close(); err != nil { - ctx.Logger().Error(err, "error closing fileReader") - } if err := cmd.Wait(); err != nil { ctx.Logger().Error( err, "error waiting for command", From 7d2452a582e46f5d112ae074f89397fe43209636 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Tue, 16 Jan 2024 11:04:51 -0800 Subject: [PATCH 02/29] address comments --- pkg/gitparse/gitparse.go | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/pkg/gitparse/gitparse.go b/pkg/gitparse/gitparse.go index 5b3049248030..d971a3e2ebe2 100644 --- a/pkg/gitparse/gitparse.go +++ b/pkg/gitparse/gitparse.go @@ -131,13 +131,6 @@ type Parser struct { dateFormat string } -// noOpCloser wraps an io.Reader to add a no-op Close method, forming an io.ReadCloser. -type noOpCloser struct{ io.Reader } - -// Close performs no operation (no-op) and returns nil. -// It's used to fulfill the io.Closer interface. -func (noc *noOpCloser) Close() error { return nil } - // DiffContentReadCloser returns an io.ReadCloser for reading the content of a Diff. // If the diff content size exceeds a predefined threshold, it is stored in a temporary file, // and the function returns an auto-deleting file reader (newAutoDeletingFileReader) to read from this file. @@ -154,26 +147,21 @@ func DiffContentReadCloser(d *Diff) (io.ReadCloser, error) { return newAutoDeletingFileReader(file), nil } // Data is in memory. - return &noOpCloser{Reader: bytes.NewReader(d.Content.Bytes())}, nil + return io.NopCloser(bytes.NewReader(d.Content.Bytes())), nil } // autoDeletingFileReader wraps an *os.File and deletes the file on Close -type autoDeletingFileReader struct{ file *os.File } +type autoDeletingFileReader struct{ *os.File } // newAutoDeletingFileReader creates a new autoDeletingFileReader func newAutoDeletingFileReader(file *os.File) *autoDeletingFileReader { - return &autoDeletingFileReader{file: file} -} - -// Read implements the io.Reader interface -func (r *autoDeletingFileReader) Read(p []byte) (int, error) { - return r.file.Read(p) + return &autoDeletingFileReader{File: file} } // Close implements the io.Closer interface, deletes the file after closing func (r *autoDeletingFileReader) Close() error { - defer os.Remove(r.file.Name()) // Delete the file after closing - return r.file.Close() + defer os.Remove(r.Name()) // Delete the file after closing + return r.File.Close() } type ParseState int From 0802a62c0e30ccffe3a5364212569948364f5731 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Wed, 17 Jan 2024 15:11:00 -0800 Subject: [PATCH 03/29] Move bufferedfilewriter to own pkg --- pkg/gitparse/gitparse.go | 122 ++------- pkg/gitparse/gitparse_test.go | 22 +- pkg/sources/git/git.go | 12 +- .../bufferedfilewriter.go | 124 +++++++++ .../bufferedfilewriter_test.go | 242 ++++++++++++++++++ 5 files changed, 407 insertions(+), 115 deletions(-) create mode 100644 pkg/writers/buffered_file_writer/bufferedfilewriter.go create mode 100644 pkg/writers/buffered_file_writer/bufferedfilewriter_test.go diff --git a/pkg/gitparse/gitparse.go b/pkg/gitparse/gitparse.go index d971a3e2ebe2..350ea4209a7f 100644 --- a/pkg/gitparse/gitparse.go +++ b/pkg/gitparse/gitparse.go @@ -14,9 +14,9 @@ import ( "github.com/go-logr/logr" - "github.com/trufflesecurity/trufflehog/v3/pkg/cleantemp" "github.com/trufflesecurity/trufflehog/v3/pkg/common" "github.com/trufflesecurity/trufflehog/v3/pkg/context" + "github.com/trufflesecurity/trufflehog/v3/pkg/writers/buffered_file_writer" ) const ( @@ -42,12 +42,11 @@ type Commit struct { // Diff contains the info about a file diff in a commit. type Diff struct { - PathB string - LineStart int - Content bytes.Buffer // Keep in-memory buffer for smaller diffs - streamDestination *os.File // File destination for larger diffs - IsBinary bool - threshold int // Size threshold to switch to file + PathB string + LineStart int + // Used to keep small diffs in memory and larger diffs in a file. + contentWriter *bufferedfilewriter.BufferedFileWriter + IsBinary bool } type diffOption func(*Diff) @@ -55,74 +54,34 @@ type diffOption func(*Diff) // withPathB sets the PathB option. func withPathB(pathB string) diffOption { return func(d *Diff) { d.PathB = pathB } } -// withThreshold sets the threshold option. -// TODO: Leverage this option in the future. -func withThreshold(threshold int) diffOption { return func(d *Diff) { d.threshold = threshold } } - // NewDiff creates a new Diff with a threshold. func NewDiff(opts ...diffOption) *Diff { const defaultThreshold = 20 * 1024 * 1024 // 20MB - d := &Diff{threshold: defaultThreshold} + d := new(Diff) for _, opt := range opts { opt(d) } + d.contentWriter = bufferedfilewriter.New(bufferedfilewriter.WithThreshold(defaultThreshold)) return d } -// write handles writing diff data to either an in-memory buffer or a file, depending on the size. -func (d *Diff) write(ctx context.Context, p []byte) error { - if d.Content.Len()+len(p) <= d.threshold { - // If the total size is within the threshold, write to the buffer. - ctx.Logger().V(4).Info( - "writing to buffer", - "data_size", len(p), - "content_size", d.Content.Len(), - ) - _, err := d.Content.Write(p) - return err - } - // Switch to file writing if threshold is exceeded. - // This helps in managing memory efficiently for large diffs. - if d.streamDestination == nil { - var err error - d.streamDestination, err = os.CreateTemp(os.TempDir(), cleantemp.MkFilename()) - if err != nil { - return err - } +// Len returns the length of the storage. +func (d *Diff) Len() int { return d.contentWriter.Len() } - // Transfer existing data in buffer to the file, then clear the buffer. - // This ensures all the diff data is in one place - either entirely in the buffer or the file. - if d.Content.Len() > 0 { - ctx.Logger().V(4).Info("writing buffer to file", "content_size", d.Content.Len()) - if _, err := d.streamDestination.Write(d.Content.Bytes()); err != nil { - return err - } - // Replace the buffer with a new one to free up memory. - d.Content = bytes.Buffer{} - } - } - ctx.Logger().V(4).Info("writing to file", "data_size", len(p)) +// ReadCloser returns a ReadCloser for the contentWriter. +func (d *Diff) ReadCloser() (io.ReadCloser, error) { return d.contentWriter.ReadCloser() } - _, err := d.streamDestination.Write(p) +// write delegates to the contentWriter. +func (d *Diff) write(ctx context.Context, p []byte) error { + _, err := d.contentWriter.Write(ctx, p) return err } // finalize ensures proper closure of resources associated with the Diff. // handle the final flush in the finalize method, in case there's data remaining in the buffer. // This method should be called to release resources, especially when writing to a file. -func (d *Diff) finalize() error { - if d.streamDestination == nil { - return nil - } - - if d.Content.Len() > 0 { - if _, err := d.streamDestination.Write(d.Content.Bytes()); err != nil { - return err - } - } - return d.streamDestination.Close() -} +func (d *Diff) finalize() error { return d.contentWriter.Close() } // Parser sets values used in GitParse. type Parser struct { @@ -131,39 +90,6 @@ type Parser struct { dateFormat string } -// DiffContentReadCloser returns an io.ReadCloser for reading the content of a Diff. -// If the diff content size exceeds a predefined threshold, it is stored in a temporary file, -// and the function returns an auto-deleting file reader (newAutoDeletingFileReader) to read from this file. -// For smaller diffs that fit within the threshold, the content is kept in memory, -// and the function returns a no-op closer wrapper (noOpCloser) around a bytes.Reader. -// The caller is responsible for calling Close on the returned io.ReadCloser in both cases. -func DiffContentReadCloser(d *Diff) (io.ReadCloser, error) { - if d.streamDestination != nil { - // Data is in a file, read from the file. - file, err := os.Open(d.streamDestination.Name()) - if err != nil { - return nil, err - } - return newAutoDeletingFileReader(file), nil - } - // Data is in memory. - return io.NopCloser(bytes.NewReader(d.Content.Bytes())), nil -} - -// autoDeletingFileReader wraps an *os.File and deletes the file on Close -type autoDeletingFileReader struct{ *os.File } - -// newAutoDeletingFileReader creates a new autoDeletingFileReader -func newAutoDeletingFileReader(file *os.File) *autoDeletingFileReader { - return &autoDeletingFileReader{File: file} -} - -// Close implements the io.Closer interface, deletes the file after closing -func (r *autoDeletingFileReader) Close() error { - defer os.Remove(r.Name()) // Delete the file after closing - return r.File.Close() -} - type ParseState int const ( @@ -263,7 +189,7 @@ func (c1 *Commit) Equal(c2 *Commit) bool { return false case d1.LineStart != d2.LineStart: return false - case d1.Content.String() != d2.Content.String(): + case d1.contentWriter.String() != d2.contentWriter.String(): return false case d1.IsBinary != d2.IsBinary: return false @@ -388,9 +314,9 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch latestState = CommitLine // If there is a currentDiff, add it to currentCommit. - if currentDiff.Content.Len() > 0 || currentDiff.IsBinary { + if currentDiff.Len() > 0 || currentDiff.IsBinary { currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff) - currentCommit.Size += currentDiff.Content.Len() + currentCommit.Size += currentDiff.Len() } // If there is a currentCommit, send it to the channel. if currentCommit != nil { @@ -438,7 +364,7 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch if currentCommit == nil { currentCommit = &Commit{} } - if currentDiff.Content.Len() > 0 || currentDiff.IsBinary { + if currentDiff.Len() > 0 || currentDiff.IsBinary { currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff) if err := currentDiff.finalize(); err != nil { ctx.Logger().Error(err, "failed to finalize diff") @@ -446,7 +372,7 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch // If the currentDiff is over 1GB, drop it into the channel so it isn't held in memory waiting for more commits. totalSize := 0 for _, diff := range currentCommit.Diffs { - totalSize += diff.Content.Len() + totalSize += diff.Len() } if totalSize > c.maxCommitSize { oldCommit := currentCommit @@ -490,7 +416,7 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch case isHunkLineNumberLine(isStaged, latestState, line): latestState = HunkLineNumberLine - if currentDiff.Content.Len() > 0 || currentDiff.IsBinary { + if currentDiff.Len() > 0 || currentDiff.IsBinary { currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff) } currentDiff = NewDiff(withPathB(currentDiff.PathB)) @@ -549,7 +475,7 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch latestState = ParseFailure } - if currentDiff.Content.Len() > c.maxDiffSize { + if currentDiff.Len() > c.maxDiffSize { ctx.Logger().V(2).Info(fmt.Sprintf( "Diff for %s exceeded MaxDiffSize(%d)", currentDiff.PathB, c.maxDiffSize, )) @@ -828,7 +754,7 @@ func isCommitSeparatorLine(isStaged bool, latestState ParseState, line []byte) b func cleanupParse(currentCommit *Commit, currentDiff *Diff, commitChan chan Commit, totalLogSize *int) { // Ignore empty or binary diffs (this condition may be redundant). - if currentDiff != nil && (currentDiff.Content.Len() > 0 || currentDiff.IsBinary) { + if currentDiff != nil && (currentDiff.Len() > 0 || currentDiff.IsBinary) { currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff) } if currentCommit != nil { diff --git a/pkg/gitparse/gitparse_test.go b/pkg/gitparse/gitparse_test.go index f343aead0011..5956c57667f4 100644 --- a/pkg/gitparse/gitparse_test.go +++ b/pkg/gitparse/gitparse_test.go @@ -603,7 +603,7 @@ index 090c6ba6..38d67dd2 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -165,7 +165,7 @@ func Start(ctx context.Context, options ...EngineOption) *Engine { - seenDetectors := make(map[config.DetectorID]struct{}, len(dets)) + seenDetectors := make(map[config.DetectorID]struct{}, Len(dets)) for _, det := range dets { id := config.GetDetectorID(det) - if _, ok := seenDetectors[id]; ok { @@ -741,16 +741,16 @@ func TestIndividualCommitParsing(t *testing.T) { } j++ } - //for _, pass := range test.passes { + // for _, pass := range test.passes { // if !test.function(false, pass.latestState, pass.line) { // t.Errorf("%s: Parser did not recognize correct line. (%s)", name, string(pass.line)) // } - //} - //for _, fail := range test.fails { + // } + // for _, fail := range test.fails { // if test.function(false, fail.latestState, fail.line) { // t.Errorf("%s: Parser did not recognize incorrect line. (%s)", name, string(fail.line)) // } - //} + // } } } @@ -802,24 +802,24 @@ func TestStagedDiffParsing(t *testing.T) { Content: *bytes.NewBuffer([]byte("The Nameless is the origin of Heaven and Earth;\nThe named is the mother of all things.\n\nTherefore let there always be non-being,\n so we may see their subtlety,\nAnd let there always be being,\n so we may see their outcome.\nThe two are the same,\nBut after they are produced,\n they have different names.\nThey both may be called deep and profound.\nDeeper and more profound,\nThe door of all subtleties!\n")), IsBinary: false, }, - //{ + // { // PathB: "", // LineStart: 0, // Content: *bytes.NewBuffer([]byte("\n")), // IsBinary: false, - //}, - //{ + // }, + // { // PathB: "", // LineStart: 0, // Content: *bytes.NewBuffer([]byte("\n")), // IsBinary: false, - //}, - //{ + // }, + // { // PathB: "", // LineStart: 0, // Content: *bytes.NewBuffer([]byte("\n")), // IsBinary: false, - //}, + // }, }, }, } diff --git a/pkg/sources/git/git.go b/pkg/sources/git/git.go index 81f96d68d6ab..ded49a6f13f6 100644 --- a/pkg/sources/git/git.go +++ b/pkg/sources/git/git.go @@ -531,7 +531,7 @@ func (s *Git) ScanCommits(ctx context.Context, repo *git.Repository, path string continue } - if diff.Content.Len() > sources.ChunkSize+sources.PeekSize { + if diff.Len() > sources.ChunkSize+sources.PeekSize { s.gitChunk(ctx, &diff, fileName, email, hash, when, remoteURL, reporter) continue } @@ -539,14 +539,14 @@ func (s *Git) ScanCommits(ctx context.Context, repo *git.Repository, path string chunkData := func(d *gitparse.Diff) error { metadata := s.sourceMetadataFunc(fileName, email, hash, when, remoteURL, int64(diff.LineStart)) - reader, err := gitparse.DiffContentReadCloser(d) + reader, err := diff.ReadCloser() if err != nil { ctx.Logger().Error(err, "error creating reader for commits", "filename", fileName, "commit", hash, "file", diff.PathB) return nil } defer reader.Close() - data := make([]byte, diff.Content.Len()) + data := make([]byte, diff.Len()) if _, err := reader.Read(data); err != nil { ctx.Logger().Error(err, "error reading diff content for commit", "filename", fileName, "commit", hash, "file", diff.PathB) return nil @@ -571,7 +571,7 @@ func (s *Git) ScanCommits(ctx context.Context, repo *git.Repository, path string } func (s *Git) gitChunk(ctx context.Context, diff *gitparse.Diff, fileName, email, hash, when, urlMetadata string, reporter sources.ChunkReporter) { - reader, err := gitparse.DiffContentReadCloser(diff) + reader, err := diff.ReadCloser() if err != nil { ctx.Logger().Error(err, "error creating reader for chunk", "filename", fileName, "commit", hash, "file", diff.PathB) return @@ -723,14 +723,14 @@ func (s *Git) ScanStaged(ctx context.Context, repo *git.Repository, path string, chunkData := func(d *gitparse.Diff) error { metadata := s.sourceMetadataFunc(fileName, email, "Staged", when, urlMetadata, int64(diff.LineStart)) - reader, err := gitparse.DiffContentReadCloser(d) + reader, err := diff.ReadCloser() if err != nil { ctx.Logger().Error(err, "error creating reader for staged", "filename", fileName, "commit", hash, "file", diff.PathB) return nil } defer reader.Close() - data := make([]byte, diff.Content.Len()) + data := make([]byte, diff.Len()) if _, err := reader.Read(data); err != nil { ctx.Logger().Error(err, "error reading diff content for staged", "filename", fileName, "commit", hash, "file", diff.PathB) return nil diff --git a/pkg/writers/buffered_file_writer/bufferedfilewriter.go b/pkg/writers/buffered_file_writer/bufferedfilewriter.go new file mode 100644 index 000000000000..8239bcbd51c0 --- /dev/null +++ b/pkg/writers/buffered_file_writer/bufferedfilewriter.go @@ -0,0 +1,124 @@ +package bufferedfilewriter + +import ( + "bytes" + "io" + "os" + + "github.com/trufflesecurity/trufflehog/v3/pkg/cleantemp" + "github.com/trufflesecurity/trufflehog/v3/pkg/context" +) + +// BufferedFileWriter manages a buffer for writing data, flushing to a file when a threshold is exceeded. +type BufferedFileWriter struct { + threshold uint64 + buf bytes.Buffer + file *os.File +} + +// Option is a function that modifies a BufferedFileWriter. +type Option func(*BufferedFileWriter) + +// WithThreshold sets the threshold for switching to file writing. +func WithThreshold(threshold uint64) Option { + return func(w *BufferedFileWriter) { w.threshold = threshold } +} + +// New creates a new BufferedFileWriter with the given options. +func New(opts ...Option) *BufferedFileWriter { + const defaultThreshold = 20 * 1024 * 1024 // 20MB + w := &BufferedFileWriter{threshold: defaultThreshold} + for _, opt := range opts { + opt(w) + } + return w +} + +// Len returns the number of bytes in the buffer. +func (w *BufferedFileWriter) Len() int { return w.buf.Len() } + +// String returns the contents of the buffer as a string. +func (w *BufferedFileWriter) String() string { return w.buf.String() } + +// Write writes data to the buffer or a file, depending on the size. +func (w *BufferedFileWriter) Write(ctx context.Context, p []byte) (int, error) { + if uint64(w.buf.Len()+len(p)) <= w.threshold { + // If the total size is within the threshold, write to the buffer. + ctx.Logger().V(4).Info( + "writing to buffer", + "data_size", len(p), + "content_size", w.buf.Len(), + ) + return w.buf.Write(p) + } + + // Switch to file writing if threshold is exceeded. + // This helps in managing memory efficiently for large diffs. + if w.file == nil { + var err error + w.file, err = os.CreateTemp(os.TempDir(), cleantemp.MkFilename()) + if err != nil { + return 0, err + } + + // Transfer existing data in buffer to the file, then clear the buffer. + // This ensures all the diff data is in one place - either entirely in the buffer or the file. + if w.buf.Len() > 0 { + ctx.Logger().V(4).Info("writing buffer to file", "content_size", w.buf.Len()) + if _, err := w.file.Write(w.buf.Bytes()); err != nil { + return 0, err + } + // Replace the buffer with a new one to free up memory. + w.buf = bytes.Buffer{} + } + } + ctx.Logger().V(4).Info("writing to file", "data_size", len(p)) + + return w.file.Write(p) +} + +// Close flushes any remaining data in the buffer to the file and closes the file if it was created. +func (w *BufferedFileWriter) Close() error { + if w.file == nil { + return nil + } + + if w.buf.Len() > 0 { + _, err := w.file.Write(w.buf.Bytes()) + if err != nil { + return err + } + } + return w.file.Close() +} + +// ReadCloser returns an io.ReadCloser to read the written content. If the total content size exceeds the +// predefined threshold, it is stored in a temporary file and a file reader is returned. +// For content under the threshold, it is kept in memory and a bytes reader on the buffer is returned. +// The caller should call Close() on the returned io.Reader when done to ensure files are cleaned up. +func (w *BufferedFileWriter) ReadCloser() (io.ReadCloser, error) { + if w.file != nil { + // Data is in a file, read from the file. + file, err := os.Open(w.file.Name()) + if err != nil { + return nil, err + } + return newAutoDeletingFileReader(file), nil + } + // Data is in memory. + return io.NopCloser(bytes.NewReader(w.buf.Bytes())), nil +} + +// autoDeletingFileReader wraps an *os.File and deletes the file on Close. +type autoDeletingFileReader struct{ *os.File } + +// newAutoDeletingFileReader creates a new autoDeletingFileReader. +func newAutoDeletingFileReader(file *os.File) *autoDeletingFileReader { + return &autoDeletingFileReader{File: file} +} + +// Close implements the io.Closer interface, deletes the file after closing. +func (r *autoDeletingFileReader) Close() error { + defer os.Remove(r.Name()) // Delete the file after closing + return r.File.Close() +} diff --git a/pkg/writers/buffered_file_writer/bufferedfilewriter_test.go b/pkg/writers/buffered_file_writer/bufferedfilewriter_test.go new file mode 100644 index 000000000000..abd4744f2782 --- /dev/null +++ b/pkg/writers/buffered_file_writer/bufferedfilewriter_test.go @@ -0,0 +1,242 @@ +package bufferedfilewriter + +import ( + "os" + "testing" + "time" + + "github.com/stretchr/testify/assert" + + "github.com/trufflesecurity/trufflehog/v3/pkg/context" +) + +func TestBufferedFileWriterNewThreshold(t *testing.T) { + t.Parallel() + + const ( + defaultThreshold = 20 * 1024 * 1024 // 20MB + customThreshold = 10 * 1024 * 1024 // 10MB + ) + + tests := []struct { + name string + options []Option + expectedThreshold uint64 + }{ + {name: "Default Threshold", expectedThreshold: defaultThreshold}, + {name: "Custom Threshold", options: []Option{WithThreshold(customThreshold)}, expectedThreshold: customThreshold}, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + writer := New(tc.options...) + assert.Equal(t, tc.expectedThreshold, writer.threshold) + }) + } +} + +func TestBufferedFileWriterString(t *testing.T) { + t.Parallel() + tests := []struct { + name string + input []byte + expectedStr string + }{ + {name: "Empty", input: []byte(""), expectedStr: ""}, + {name: "Nil", input: nil, expectedStr: ""}, + {name: "Small content", input: []byte("hello"), expectedStr: "hello"}, + {name: "Large content", input: []byte("longer string with more characters"), expectedStr: "longer string with more characters"}, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + writer := New() + _, err := writer.Write(context.Background(), tc.input) + assert.NoError(t, err) + + str := writer.String() + assert.Equal(t, tc.expectedStr, str, "String content mismatch") + }) + } +} + +func TestBufferedFileWriterLen(t *testing.T) { + t.Parallel() + tests := []struct { + name string + input []byte + expectedLen int + }{ + {name: "Empty", input: []byte(""), expectedLen: 0}, + {name: "Nil", input: nil, expectedLen: 0}, + {name: "Small content", input: []byte("hello"), expectedLen: 5}, + {name: "Large content", input: []byte("longer string with more characters"), expectedLen: 34}, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + writer := New() + _, err := writer.Write(context.Background(), tc.input) + assert.NoError(t, err) + + length := writer.Len() + assert.Equal(t, tc.expectedLen, length) + }) + } +} + +// TestBufferedFileWriterWriteWithinThreshold tests that data is written to the buffer when the threshold +// is not exceeded. +func TestBufferedFileWriterWriteWithinThreshold(t *testing.T) { + t.Parallel() + + ctx := context.Background() + data := []byte("hello world") + + writer := New(WithThreshold(64)) + _, err := writer.Write(ctx, data) + assert.NoError(t, err) + + assert.Equal(t, data, writer.buf.Bytes()) +} + +// TestBufferedFileWriterWriteExceedsThreshold tests that data is written to a file when the threshold +// is exceeded. +func TestBufferedFileWriterWriteExceedsThreshold(t *testing.T) { + t.Parallel() + + ctx := context.Background() + data := []byte("hello world") + + writer := New(WithThreshold(5)) + _, err := writer.Write(ctx, data) + assert.NoError(t, err) + + defer writer.Close() + + assert.NotNil(t, writer.file) + assert.Len(t, writer.buf.Bytes(), 0) + fileContents, err := os.ReadFile(writer.file.Name()) + assert.NoError(t, err) + assert.Equal(t, data, fileContents) +} + +// TestBufferedFileWriterWriteAfterFlush tests that data is written to a file when the threshold +// is exceeded, and subsequent writes are to the buffer until the threshold is exceeded again. +func TestBufferedFileWriterWriteAfterFlush(t *testing.T) { + t.Parallel() + + ctx := context.Background() + initialData := []byte("initial data is longer than subsequent data") + subsequentData := []byte("subsequent data") + + // Initialize writer with a threshold that initialData will exceed. + writer := New(WithThreshold(uint64(len(initialData) - 1))) + _, err := writer.Write(ctx, initialData) + assert.NoError(t, err) + + defer writer.Close() + + // Get the file modification time after the initial write. + initialModTime, err := getFileModTime(t, writer.file.Name()) + assert.NoError(t, err) + fileContents, err := os.ReadFile(writer.file.Name()) + assert.NoError(t, err) + assert.Equal(t, initialData, fileContents) + + // Perform a subsequent write with data under the threshold. + _, err = writer.Write(ctx, subsequentData) + assert.NoError(t, err) + + assert.Equal(t, subsequentData, writer.buf.Bytes()) // Check buffer contents + finalModTime, err := getFileModTime(t, writer.file.Name()) + assert.NoError(t, err) + assert.Equal(t, initialModTime, finalModTime) // File should not be modified again +} + +func getFileModTime(t *testing.T, fileName string) (time.Time, error) { + t.Helper() + + fileInfo, err := os.Stat(fileName) + if err != nil { + return time.Time{}, err + } + return fileInfo.ModTime(), nil +} + +func TestBufferedFileWriterClose(t *testing.T) { + t.Parallel() + + const threshold = 10 + ctx := context.Background() + + tests := []struct { + name string + prepareWriter func(*BufferedFileWriter) // Function to prepare the writer for the test + expectFileContent string + }{ + { + name: "No File Created, Only Buffer Data", + prepareWriter: func(w *BufferedFileWriter) { + // Write data under the threshold + w.Write(ctx, []byte("small data")) + }, + expectFileContent: "", + }, + { + name: "File Created, No Data in Buffer", + prepareWriter: func(w *BufferedFileWriter) { + // Write data over the threshold to create a file + w.Write(ctx, []byte("large data is more than the threshold")) + }, + expectFileContent: "large data is more than the threshold", + }, + { + name: "File Created, Data in Buffer", + prepareWriter: func(w *BufferedFileWriter) { + // Write data over the threshold to create a file, then write more data + w.Write(ctx, []byte("large data is more than the threshold")) + w.Write(ctx, []byte(" more data")) + }, + expectFileContent: "large data is more than the threshold more data", + }, + { + name: "File Created, Buffer Cleared", + prepareWriter: func(w *BufferedFileWriter) { + // Write data over the threshold to create a file, then clear the buffer. + w.Write(ctx, []byte("large data is more than the threshold")) + w.buf.Reset() + }, + expectFileContent: "large data is more than the threshold", + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + writer := New(WithThreshold(threshold)) + + tc.prepareWriter(writer) + + err := writer.Close() + assert.NoError(t, err) + + if writer.file != nil { + fileContents, err := os.ReadFile(writer.file.Name()) + assert.NoError(t, err) + assert.Equal(t, tc.expectFileContent, string(fileContents)) + return + } + + // If no file was created, the buffer should be empty. + assert.Equal(t, tc.expectFileContent, "") + }) + } +} From 0365023f1a284a304fdc1ed85ee727d10bc96c13 Mon Sep 17 00:00:00 2001 From: Ahrav Dutta Date: Wed, 17 Jan 2024 15:43:17 -0800 Subject: [PATCH 04/29] update test --- pkg/gitparse/gitparse.go | 1 - pkg/gitparse/gitparse_test.go | 245 ++++++++++++++++++---------------- 2 files changed, 129 insertions(+), 117 deletions(-) diff --git a/pkg/gitparse/gitparse.go b/pkg/gitparse/gitparse.go index 350ea4209a7f..7659b465ee15 100644 --- a/pkg/gitparse/gitparse.go +++ b/pkg/gitparse/gitparse.go @@ -196,7 +196,6 @@ func (c1 *Commit) Equal(c2 *Commit) bool { } } return true - } // RepoPath parses the output of the `git log` command for the `source` path. diff --git a/pkg/gitparse/gitparse_test.go b/pkg/gitparse/gitparse_test.go index 5956c57667f4..91f67545aece 100644 --- a/pkg/gitparse/gitparse_test.go +++ b/pkg/gitparse/gitparse_test.go @@ -7,6 +7,7 @@ import ( "time" "github.com/trufflesecurity/trufflehog/v3/pkg/context" + bufferedfilewriter "github.com/trufflesecurity/trufflehog/v3/pkg/writers/buffered_file_writer" ) type testCaseLine struct { @@ -755,6 +756,7 @@ func TestIndividualCommitParsing(t *testing.T) { } func TestStagedDiffParsing(t *testing.T) { + expected := []Commit{ { Hash: "", @@ -763,44 +765,44 @@ func TestStagedDiffParsing(t *testing.T) { Message: strings.Builder{}, Diffs: []Diff{ { - PathB: "aws", - LineStart: 1, - Content: *bytes.NewBuffer([]byte("[default]\naws_access_key_id = AKIAXYZDQCEN4B6JSJQI\naws_secret_access_key = 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\noutput = json\nregion = us-east-2\n")), - IsBinary: false, + PathB: "aws", + LineStart: 1, + contentWriter: createBufferedFileWriterWithContent([]byte("[default]\naws_access_key_id = AKIAXYZDQCEN4B6JSJQI\naws_secret_access_key = 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\noutput = json\nregion = us-east-2\n")), + IsBinary: false, }, { - PathB: "aws2", - LineStart: 1, - Content: *bytes.NewBuffer([]byte("\n\nthis is the secret: [Default]\nAccess key Id: AKIAILE3JG6KMS3HZGCA\nSecret Access Key: 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\n\nokay thank you bye\n")), - IsBinary: false, + PathB: "aws2", + LineStart: 1, + contentWriter: createBufferedFileWriterWithContent([]byte("\n\nthis is the secret: [Default]\nAccess key Id: AKIAILE3JG6KMS3HZGCA\nSecret Access Key: 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\n\nokay thank you bye\n")), + IsBinary: false, }, { - PathB: "core/runtime/src/main/java/io/quarkus/runtime/QuarkusApplication.java", - LineStart: 3, - Content: *bytes.NewBuffer([]byte("/**\n * This is usually used for command mode applications with a startup logic. The logic is executed inside\n * {@link QuarkusApplication#run} method before the main application exits.\n */\n")), - IsBinary: false, + PathB: "core/runtime/src/main/java/io/quarkus/runtime/QuarkusApplication.java", + LineStart: 3, + contentWriter: createBufferedFileWriterWithContent([]byte("/**\n * This is usually used for command mode applications with a startup logic. The logic is executed inside\n * {@link QuarkusApplication#run} method before the main application exits.\n */\n")), + IsBinary: false, }, { PathB: "trufflehog_3.42.0_linux_arm64.tar.gz", IsBinary: true, }, { - PathB: "tzu", - LineStart: 11, - Content: *bytes.NewBuffer([]byte("\n\n\n\nSource: https://www.gnu.org/software/diffutils/manual/diffutils.html#An-Example-of-Unified-Format\n")), - IsBinary: false, + PathB: "tzu", + LineStart: 11, + contentWriter: createBufferedFileWriterWithContent([]byte("\n\n\n\nSource: https://www.gnu.org/software/diffutils/manual/diffutils.html#An-Example-of-Unified-Format\n")), + IsBinary: false, }, { - PathB: "lao", - LineStart: 1, - Content: *bytes.NewBuffer([]byte("The Way that can be told of is not the eternal Way;\nThe name that can be named is not the eternal name.\nThe Nameless is the origin of Heaven and Earth;\nThe Named is the mother of all things.\nTherefore let there always be non-being,\n so we may see their subtlety,\nAnd let there always be being,\n so we may see their outcome.\nThe two are the same,\nBut after they are produced,\n they have different names.\n")), - IsBinary: false, + PathB: "lao", + LineStart: 1, + contentWriter: createBufferedFileWriterWithContent([]byte("The Way that can be told of is not the eternal Way;\nThe name that can be named is not the eternal name.\nThe Nameless is the origin of Heaven and Earth;\nThe Named is the mother of all things.\nTherefore let there always be non-being,\n so we may see their subtlety,\nAnd let there always be being,\n so we may see their outcome.\nThe two are the same,\nBut after they are produced,\n they have different names.\n")), + IsBinary: false, }, { - PathB: "tzu", - LineStart: 1, - Content: *bytes.NewBuffer([]byte("The Nameless is the origin of Heaven and Earth;\nThe named is the mother of all things.\n\nTherefore let there always be non-being,\n so we may see their subtlety,\nAnd let there always be being,\n so we may see their outcome.\nThe two are the same,\nBut after they are produced,\n they have different names.\nThey both may be called deep and profound.\nDeeper and more profound,\nThe door of all subtleties!\n")), - IsBinary: false, + PathB: "tzu", + LineStart: 1, + contentWriter: createBufferedFileWriterWithContent([]byte("The Nameless is the origin of Heaven and Earth;\nThe named is the mother of all things.\n\nTherefore let there always be non-being,\n so we may see their subtlety,\nAnd let there always be being,\n so we may see their outcome.\nThe two are the same,\nBut after they are produced,\n they have different names.\nThey both may be called deep and profound.\nDeeper and more profound,\nThe door of all subtleties!\n")), + IsBinary: false, }, // { // PathB: "", @@ -844,6 +846,15 @@ func TestStagedDiffParsing(t *testing.T) { } } +func createBufferedFileWriterWithContent(content []byte) *bufferedfilewriter.BufferedFileWriter { + writer := bufferedfilewriter.New() // Add options as needed + _, err := writer.Write(context.Background(), content) + if err != nil { + panic("Failed to write content: " + err.Error()) + } + return writer +} + func TestCommitParseFailureRecovery(t *testing.T) { expected := []Commit{ { @@ -853,10 +864,10 @@ func TestCommitParseFailureRecovery(t *testing.T) { Message: newStringBuilderValue("Add travis testing\n"), Diffs: []Diff{ { - PathB: ".travis.yml", - LineStart: 1, - Content: *bytes.NewBuffer([]byte("language: python\npython:\n - \"2.6\"\n - \"2.7\"\n - \"3.2\"\n - \"3.3\"\n - \"3.4\"\n - \"3.5\"\n - \"3.5-dev\" # 3.5 development branch\n - \"3.6\"\n - \"3.6-dev\" # 3.6 development branch\n - \"3.7-dev\" # 3.7 development branch\n - \"nightly\"\n")), - IsBinary: false, + PathB: ".travis.yml", + LineStart: 1, + contentWriter: createBufferedFileWriterWithContent([]byte("language: python\npython:\n - \"2.6\"\n - \"2.7\"\n - \"3.2\"\n - \"3.3\"\n - \"3.4\"\n - \"3.5\"\n - \"3.5-dev\" # 3.5 development branch\n - \"3.6\"\n - \"3.6-dev\" # 3.6 development branch\n - \"3.7-dev\" # 3.7 development branch\n - \"nightly\"\n")), + IsBinary: false, }, }, }, @@ -874,10 +885,10 @@ func TestCommitParseFailureRecovery(t *testing.T) { Message: newStringBuilderValue("Change file\n"), Diffs: []Diff{ { - PathB: "tzu", - LineStart: 11, - Content: *bytes.NewBuffer([]byte("\n\n\n\nSource: https://www.gnu.org/software/diffutils/manual/diffutils.html#An-Example-of-Unified-Format\n")), - IsBinary: false, + PathB: "tzu", + LineStart: 11, + contentWriter: createBufferedFileWriterWithContent([]byte("\n\n\n\nSource: https://www.gnu.org/software/diffutils/manual/diffutils.html#An-Example-of-Unified-Format\n")), + IsBinary: false, }, }, }, @@ -982,22 +993,22 @@ func TestDiffParseFailureRecovery(t *testing.T) { Message: strings.Builder{}, Diffs: []Diff{ { - PathB: "aws", - LineStart: 1, - Content: *bytes.NewBuffer([]byte("[default]\naws_access_key_id = AKIAXYZDQCEN4B6JSJQI\naws_secret_access_key = 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\noutput = json\nregion = us-east-2\n")), - IsBinary: false, + PathB: "aws", + LineStart: 1, + contentWriter: createBufferedFileWriterWithContent([]byte("[default]\naws_access_key_id = AKIAXYZDQCEN4B6JSJQI\naws_secret_access_key = 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\noutput = json\nregion = us-east-2\n")), + IsBinary: false, }, { - PathB: "tzu", - LineStart: 11, - Content: *bytes.NewBuffer([]byte("\n\n\n\nSource: https://www.gnu.org/software/diffutils/manual/diffutils.html#An-Example-of-Unified-Format\n")), - IsBinary: false, + PathB: "tzu", + LineStart: 11, + contentWriter: createBufferedFileWriterWithContent([]byte("\n\n\n\nSource: https://www.gnu.org/software/diffutils/manual/diffutils.html#An-Example-of-Unified-Format\n")), + IsBinary: false, }, { - PathB: "tzu", - LineStart: 1, - Content: *bytes.NewBuffer([]byte("The Nameless is the origin of Heaven and Earth;\nThe named is the mother of all things.\n\nTherefore let there always be non-being,\n so we may see their subtlety,\nAnd let there always be being,\n so we may see their outcome.\nThe two are the same,\nBut after they are produced,\n they have different names.\nThey both may be called deep and profound.\nDeeper and more profound,\nThe door of all subtleties!\n")), - IsBinary: false, + PathB: "tzu", + LineStart: 1, + contentWriter: createBufferedFileWriterWithContent([]byte("The Nameless is the origin of Heaven and Earth;\nThe named is the mother of all things.\n\nTherefore let there always be non-being,\n so we may see their subtlety,\nAnd let there always be being,\n so we may see their outcome.\nThe two are the same,\nBut after they are produced,\n they have different names.\nThey both may be called deep and profound.\nDeeper and more profound,\nThe door of all subtleties!\n")), + IsBinary: false, }, }, }, @@ -1129,8 +1140,8 @@ func TestMaxDiffSize(t *testing.T) { }() commit := <-commitChan - if commit.Diffs[0].Content.Len() > parser.maxDiffSize+1024 { - t.Errorf("diff did not match MaxDiffSize. Got: %d, expected (max): %d", commit.Diffs[0].Content.Len(), parser.maxDiffSize+1024) + if commit.Diffs[0].Len() > parser.maxDiffSize+1024 { + t.Errorf("diff did not match MaxDiffSize. Got: %d, expected (max): %d", commit.Diffs[0].Len(), parser.maxDiffSize+1024) } } @@ -1592,22 +1603,22 @@ func expectedCommits() []Commit { Message: newStringBuilderValue("Added Unusable coloring\n"), Diffs: []Diff{ { - PathB: "components/item.lua", - LineStart: 9, - Content: *bytes.NewBuffer([]byte("\n\nlocal Unfit = LibStub('Unfit-1.0')\n\n\n")), - IsBinary: false, + PathB: "components/item.lua", + LineStart: 9, + contentWriter: createBufferedFileWriterWithContent([]byte("\n\nlocal Unfit = LibStub('Unfit-1.0')\n\n\n")), + IsBinary: false, }, { - PathB: "embeds.xml", - LineStart: 6, - Content: *bytes.NewBuffer([]byte("\n\n