Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feat] - tmp file diffs #2306

Merged
merged 38 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
f2f13b2
Write large diffs to tmp files
ahrav Jan 15, 2024
7d2452a
address comments
ahrav Jan 16, 2024
0802a62
Move bufferedfilewriter to own pkg
ahrav Jan 17, 2024
0365023
update test
ahrav Jan 17, 2024
57e5d6a
swallow write err
ahrav Jan 17, 2024
dad92c5
Merge branch 'main' into feat-tmp-file-diffs
ahrav Jan 17, 2024
fd41709
use buffer pool
ahrav Jan 17, 2024
da2ff88
use size vs len
ahrav Jan 18, 2024
b3260ff
Merge branch 'main' into feat-tmp-file-diffs
ahrav Jan 19, 2024
73903e7
use interface
ahrav Jan 19, 2024
994c268
fix test
ahrav Jan 19, 2024
f8e87e9
update comments
ahrav Jan 19, 2024
bba85f4
fix test
ahrav Jan 20, 2024
2e7479e
remove unused
ahrav Jan 20, 2024
8cc4246
remove
ahrav Jan 20, 2024
f114a4a
remove unused
ahrav Jan 20, 2024
bf25bd2
Merge branch 'feat-tmp-file-diffs' of github.com:trufflesecurity/truf…
ahrav Jan 20, 2024
390a5a5
move parser and commit struct closer to where they are used
ahrav Jan 20, 2024
bb82471
linter change
ahrav Jan 20, 2024
9af37d8
merge main
ahrav Jan 23, 2024
bab3f04
add more kvp pairs to error
ahrav Jan 23, 2024
f26ac45
fix test
ahrav Jan 23, 2024
cbd4ddf
update
ahrav Jan 23, 2024
7aef649
address comments
ahrav Jan 23, 2024
47544ad
remove bufferedfile writer
ahrav Jan 24, 2024
3c7a1d5
Merge branch 'main' into feat-tmp-file-diffs
ahrav Jan 24, 2024
a9ee71a
Merge branch 'main' into feat-tmp-file-diffs
ahrav Jan 26, 2024
419107e
address comments
ahrav Jan 29, 2024
1bd8b27
adjust interface
ahrav Jan 29, 2024
c259b84
Merge branch 'main' into feat-tmp-file-diffs
ahrav Jan 29, 2024
1485c9a
fix finalize
ahrav Jan 29, 2024
73a766f
address comments
ahrav Jan 30, 2024
1699097
lint
ahrav Jan 30, 2024
3ac95f5
remove guard
ahrav Jan 30, 2024
e442142
Merge branch 'main' into feat-tmp-file-diffs
ahrav Jan 30, 2024
bff34a4
fix
ahrav Jan 30, 2024
fd5181e
Merge branch 'feat-tmp-file-diffs' of github.com:trufflesecurity/truf…
ahrav Jan 30, 2024
1c577db
add TODO
ahrav Jan 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 151 additions & 29 deletions pkg/gitparse/gitparse.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
"bufio"
"bytes"
"fmt"
"github.com/go-logr/logr"
"io"
"os"
"os/exec"
Expand All @@ -13,6 +12,9 @@
"strings"
"time"

"github.com/go-logr/logr"

"github.com/trufflesecurity/trufflehog/v3/pkg/cleantemp"
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
)
Expand Down Expand Up @@ -40,10 +42,86 @@

// Diff contains the info about a file diff in a commit.
type Diff struct {
PathB string
LineStart int
Content bytes.Buffer
IsBinary bool
PathB string
LineStart int
Content bytes.Buffer // Keep in-memory buffer for smaller diffs
streamDestination *os.File // File destination for larger diffs
ahrav marked this conversation as resolved.
Show resolved Hide resolved
IsBinary bool
threshold int // Size threshold to switch to file
}

type diffOption func(*Diff)

// withPathB sets the PathB option.
func withPathB(pathB string) diffOption { return func(d *Diff) { d.PathB = pathB } }

// withThreshold sets the threshold option.
// TODO: Leverage this option in the future.
func withThreshold(threshold int) diffOption { return func(d *Diff) { d.threshold = threshold } }

Check failure on line 60 in pkg/gitparse/gitparse.go

View workflow job for this annotation

GitHub Actions / lint

func `withThreshold` is unused (unused)

// NewDiff creates a new Diff with a threshold.
func NewDiff(opts ...diffOption) *Diff {
const defaultThreshold = 20 * 1024 * 1024 // 20MB
d := &Diff{threshold: defaultThreshold}
for _, opt := range opts {
opt(d)
}

return d
}

// write handles writing diff data to either an in-memory buffer or a file, depending on the size.
func (d *Diff) write(ctx context.Context, p []byte) error {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit (optional): This is more of an architectural suggestion -- I think it would be nice to implement the in-memory / file switch buffer as it's own package and use it in the Diff struct, similar to how DiskBufferReader is its own import. We could also add tests to show the switching works without the "business logic" of diff parsing.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved it to its own pkg.

if d.Content.Len()+len(p) <= d.threshold {
// If the total size is within the threshold, write to the buffer.
ctx.Logger().V(4).Info(
"writing to buffer",
"data_size", len(p),
"content_size", d.Content.Len(),
)
_, err := d.Content.Write(p)
return err
}
// Switch to file writing if threshold is exceeded.
// This helps in managing memory efficiently for large diffs.
if d.streamDestination == nil {
var err error
d.streamDestination, err = os.CreateTemp(os.TempDir(), cleantemp.MkFilename())
if err != nil {
return err
}

// Transfer existing data in buffer to the file, then clear the buffer.
// This ensures all the diff data is in one place - either entirely in the buffer or the file.
if d.Content.Len() > 0 {
ctx.Logger().V(4).Info("writing buffer to file", "content_size", d.Content.Len())
if _, err := d.streamDestination.Write(d.Content.Bytes()); err != nil {
return err
}
// Replace the buffer with a new one to free up memory.
d.Content = bytes.Buffer{}
}
}
ctx.Logger().V(4).Info("writing to file", "data_size", len(p))

_, err := d.streamDestination.Write(p)
return err
}

// finalize ensures proper closure of resources associated with the Diff.
// handle the final flush in the finalize method, in case there's data remaining in the buffer.
// This method should be called to release resources, especially when writing to a file.
func (d *Diff) finalize() error {
if d.streamDestination == nil {
return nil
}

if d.Content.Len() > 0 {
if _, err := d.streamDestination.Write(d.Content.Bytes()); err != nil {
return err
}
}
return d.streamDestination.Close()
}

// Parser sets values used in GitParse.
Expand All @@ -53,6 +131,51 @@
dateFormat string
}

// noOpCloser wraps an io.Reader to add a no-op Close method, forming an io.ReadCloser.
type noOpCloser struct{ io.Reader }
ahrav marked this conversation as resolved.
Show resolved Hide resolved

// Close performs no operation (no-op) and returns nil.
// It's used to fulfill the io.Closer interface.
func (noc *noOpCloser) Close() error { return nil }

// DiffContentReadCloser returns an io.ReadCloser for reading the content of a Diff.
// If the diff content size exceeds a predefined threshold, it is stored in a temporary file,
// and the function returns an auto-deleting file reader (newAutoDeletingFileReader) to read from this file.
// For smaller diffs that fit within the threshold, the content is kept in memory,
// and the function returns a no-op closer wrapper (noOpCloser) around a bytes.Reader.
// The caller is responsible for calling Close on the returned io.ReadCloser in both cases.
func DiffContentReadCloser(d *Diff) (io.ReadCloser, error) {
if d.streamDestination != nil {
// Data is in a file, read from the file.
file, err := os.Open(d.streamDestination.Name())
if err != nil {
return nil, err
}
return newAutoDeletingFileReader(file), nil
}
// Data is in memory.
return &noOpCloser{Reader: bytes.NewReader(d.Content.Bytes())}, nil
}
ahrav marked this conversation as resolved.
Show resolved Hide resolved

// autoDeletingFileReader wraps an *os.File and deletes the file on Close
type autoDeletingFileReader struct{ file *os.File }

// newAutoDeletingFileReader creates a new autoDeletingFileReader
func newAutoDeletingFileReader(file *os.File) *autoDeletingFileReader {
return &autoDeletingFileReader{file: file}
}

// Read implements the io.Reader interface
func (r *autoDeletingFileReader) Read(p []byte) (int, error) {
return r.file.Read(p)
}
ahrav marked this conversation as resolved.
Show resolved Hide resolved

// Close implements the io.Closer interface, deletes the file after closing
func (r *autoDeletingFileReader) Close() error {
defer os.Remove(r.file.Name()) // Delete the file after closing
return r.file.Close()
}

type ParseState int

const (
Expand Down Expand Up @@ -254,11 +377,11 @@
outReader := bufio.NewReader(stdOut)
var (
currentCommit *Commit
currentDiff Diff

totalLogSize int
)
var latestState = Initial
currentDiff := NewDiff()

defer common.RecoverWithExit(ctx)
defer close(commitChan)
Expand All @@ -278,19 +401,20 @@

// If there is a currentDiff, add it to currentCommit.
if currentDiff.Content.Len() > 0 || currentDiff.IsBinary {
currentCommit.Diffs = append(currentCommit.Diffs, currentDiff)
currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff)
currentCommit.Size += currentDiff.Content.Len()
}
// If there is a currentCommit, send it to the channel.
if currentCommit != nil {
if err := currentDiff.finalize(); err != nil {
ctx.Logger().Error(err, "failed to finalize diff")
}
commitChan <- *currentCommit
totalLogSize += currentCommit.Size
}
// Create a new currentDiff and currentCommit
currentDiff = Diff{}
currentCommit = &Commit{
Message: strings.Builder{},
}
currentDiff = NewDiff()
currentCommit = &Commit{Message: strings.Builder{}}
// Check that the commit line contains a hash and set it.
if len(line) >= 47 {
currentCommit.Hash = string(line[7:47])
Expand Down Expand Up @@ -327,7 +451,10 @@
currentCommit = &Commit{}
}
if currentDiff.Content.Len() > 0 || currentDiff.IsBinary {
currentCommit.Diffs = append(currentCommit.Diffs, currentDiff)
currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff)
if err := currentDiff.finalize(); err != nil {
ctx.Logger().Error(err, "failed to finalize diff")
ahrav marked this conversation as resolved.
Show resolved Hide resolved
}
// If the currentDiff is over 1GB, drop it into the channel so it isn't held in memory waiting for more commits.
totalSize := 0
for _, diff := range currentCommit.Diffs {
Expand All @@ -348,7 +475,7 @@
currentCommit.Message.WriteString(oldCommit.Message.String())
}
}
currentDiff = Diff{}
currentDiff = NewDiff()
case isModeLine(isStaged, latestState, line):
latestState = ModeLine
// NoOp
Expand Down Expand Up @@ -376,11 +503,9 @@
latestState = HunkLineNumberLine

if currentDiff.Content.Len() > 0 || currentDiff.IsBinary {
currentCommit.Diffs = append(currentCommit.Diffs, currentDiff)
}
currentDiff = Diff{
PathB: currentDiff.PathB,
currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff)
}
currentDiff = NewDiff(withPathB(currentDiff.PathB))

words := bytes.Split(line, []byte(" "))
if len(words) >= 3 {
Expand All @@ -395,24 +520,21 @@
latestState = HunkContentLine
}
// TODO: Why do we care about this? It creates empty lines in the diff. If there are no plusLines, it's just newlines.
currentDiff.Content.Write([]byte("\n"))
if err := currentDiff.write(ctx, []byte("\n")); err != nil {
ctx.Logger().Error(err, "failed to write to diff")
}
case isHunkPlusLine(isStaged, latestState, line):
if latestState != HunkContentLine {
latestState = HunkContentLine
}

currentDiff.Content.Write(line[1:])
case isHunkMinusLine(isStaged, latestState, line):
if latestState != HunkContentLine {
latestState = HunkContentLine
if err := currentDiff.write(ctx, line[1:]); err != nil {
ctx.Logger().Error(err, "failed to write to diff")
}
// NoOp. We only care about additions.
case isHunkNewlineWarningLine(isStaged, latestState, line):
if latestState != HunkContentLine {
latestState = HunkContentLine
}
// NoOp
case isHunkEmptyLine(isStaged, latestState, line):
case isHunkMinusLine(isStaged, latestState, line),
isHunkNewlineWarningLine(isStaged, latestState, line),
isHunkEmptyLine(isStaged, latestState, line):
if latestState != HunkContentLine {
latestState = HunkContentLine
}
Expand Down Expand Up @@ -446,7 +568,7 @@
break
}
}
cleanupParse(currentCommit, &currentDiff, commitChan, &totalLogSize)
cleanupParse(currentCommit, currentDiff, commitChan, &totalLogSize)

ctx.Logger().V(2).Info("finished parsing git log.", "total_log_size", totalLogSize)
}
Expand Down
Loading
Loading