Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feat] - tmp file diffs #2306

Merged
merged 38 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
f2f13b2
Write large diffs to tmp files
ahrav Jan 15, 2024
7d2452a
address comments
ahrav Jan 16, 2024
0802a62
Move bufferedfilewriter to own pkg
ahrav Jan 17, 2024
0365023
update test
ahrav Jan 17, 2024
57e5d6a
swallow write err
ahrav Jan 17, 2024
dad92c5
Merge branch 'main' into feat-tmp-file-diffs
ahrav Jan 17, 2024
fd41709
use buffer pool
ahrav Jan 17, 2024
da2ff88
use size vs len
ahrav Jan 18, 2024
b3260ff
Merge branch 'main' into feat-tmp-file-diffs
ahrav Jan 19, 2024
73903e7
use interface
ahrav Jan 19, 2024
994c268
fix test
ahrav Jan 19, 2024
f8e87e9
update comments
ahrav Jan 19, 2024
bba85f4
fix test
ahrav Jan 20, 2024
2e7479e
remove unused
ahrav Jan 20, 2024
8cc4246
remove
ahrav Jan 20, 2024
f114a4a
remove unused
ahrav Jan 20, 2024
bf25bd2
Merge branch 'feat-tmp-file-diffs' of github.com:trufflesecurity/truf…
ahrav Jan 20, 2024
390a5a5
move parser and commit struct closer to where they are used
ahrav Jan 20, 2024
bb82471
linter change
ahrav Jan 20, 2024
9af37d8
merge main
ahrav Jan 23, 2024
bab3f04
add more kvp pairs to error
ahrav Jan 23, 2024
f26ac45
fix test
ahrav Jan 23, 2024
cbd4ddf
update
ahrav Jan 23, 2024
7aef649
address comments
ahrav Jan 23, 2024
47544ad
remove bufferedfile writer
ahrav Jan 24, 2024
3c7a1d5
Merge branch 'main' into feat-tmp-file-diffs
ahrav Jan 24, 2024
a9ee71a
Merge branch 'main' into feat-tmp-file-diffs
ahrav Jan 26, 2024
419107e
address comments
ahrav Jan 29, 2024
1bd8b27
adjust interface
ahrav Jan 29, 2024
c259b84
Merge branch 'main' into feat-tmp-file-diffs
ahrav Jan 29, 2024
1485c9a
fix finalize
ahrav Jan 29, 2024
73a766f
address comments
ahrav Jan 30, 2024
1699097
lint
ahrav Jan 30, 2024
3ac95f5
remove guard
ahrav Jan 30, 2024
e442142
Merge branch 'main' into feat-tmp-file-diffs
ahrav Jan 30, 2024
bff34a4
fix
ahrav Jan 30, 2024
fd5181e
Merge branch 'feat-tmp-file-diffs' of github.com:trufflesecurity/truf…
ahrav Jan 30, 2024
1c577db
add TODO
ahrav Jan 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 71 additions & 36 deletions pkg/gitparse/gitparse.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"bufio"
"bytes"
"fmt"
"github.com/go-logr/logr"
"io"
"os"
"os/exec"
Expand All @@ -13,8 +12,11 @@ import (
"strings"
"time"

"github.com/go-logr/logr"

"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/writers/buffered_file_writer"
)

const (
Expand Down Expand Up @@ -42,10 +44,45 @@ type Commit struct {
type Diff struct {
PathB string
LineStart int
Content bytes.Buffer
IsBinary bool
// Used to keep small diffs in memory and larger diffs in a file.
contentWriter *bufferedfilewriter.BufferedFileWriter
IsBinary bool
}

type diffOption func(*Diff)

// withPathB sets the PathB option.
func withPathB(pathB string) diffOption { return func(d *Diff) { d.PathB = pathB } }

// NewDiff creates a new Diff with a threshold.
func NewDiff(opts ...diffOption) *Diff {
const defaultThreshold = 20 * 1024 * 1024 // 20MB
d := new(Diff)
for _, opt := range opts {
opt(d)
}
d.contentWriter = bufferedfilewriter.New(bufferedfilewriter.WithThreshold(defaultThreshold))

return d
}

// Len returns the length of the storage.
func (d *Diff) Len() int { return d.contentWriter.Len() }
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These methods probably can be removed, but I was trying to avoid reaching into the contentWriter directly. This is also exported since we need access to this in the git source. Open to changing it if we feel these are providing any additional encapsulation.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want to guard them behind the state like ReadCloser()? I find it a bit surprising that some of the read methods are guarded and some aren't.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like that idea. Will guard them.


// ReadCloser returns a ReadCloser for the contentWriter.
func (d *Diff) ReadCloser() (io.ReadCloser, error) { return d.contentWriter.ReadCloser() }

// write delegates to the contentWriter.
func (d *Diff) write(ctx context.Context, p []byte) error {
_, err := d.contentWriter.Write(ctx, p)
return err
}

// finalize ensures proper closure of resources associated with the Diff.
// handle the final flush in the finalize method, in case there's data remaining in the buffer.
// This method should be called to release resources, especially when writing to a file.
func (d *Diff) finalize() error { return d.contentWriter.Close() }

// Parser sets values used in GitParse.
type Parser struct {
maxDiffSize int
Expand Down Expand Up @@ -152,14 +189,13 @@ func (c1 *Commit) Equal(c2 *Commit) bool {
return false
case d1.LineStart != d2.LineStart:
return false
case d1.Content.String() != d2.Content.String():
case d1.contentWriter.String() != d2.contentWriter.String():
ahrav marked this conversation as resolved.
Show resolved Hide resolved
return false
case d1.IsBinary != d2.IsBinary:
return false
}
}
return true

}

// RepoPath parses the output of the `git log` command for the `source` path.
Expand Down Expand Up @@ -254,11 +290,11 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch
outReader := bufio.NewReader(stdOut)
var (
currentCommit *Commit
currentDiff Diff

totalLogSize int
)
var latestState = Initial
currentDiff := NewDiff()

defer common.RecoverWithExit(ctx)
defer close(commitChan)
Expand All @@ -277,20 +313,21 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch
latestState = CommitLine

// If there is a currentDiff, add it to currentCommit.
if currentDiff.Content.Len() > 0 || currentDiff.IsBinary {
currentCommit.Diffs = append(currentCommit.Diffs, currentDiff)
currentCommit.Size += currentDiff.Content.Len()
if currentDiff.Len() > 0 || currentDiff.IsBinary {
currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff)
currentCommit.Size += currentDiff.Len()
}
// If there is a currentCommit, send it to the channel.
if currentCommit != nil {
if err := currentDiff.finalize(); err != nil {
ctx.Logger().Error(err, "failed to finalize diff")
}
commitChan <- *currentCommit
totalLogSize += currentCommit.Size
}
// Create a new currentDiff and currentCommit
currentDiff = Diff{}
currentCommit = &Commit{
Message: strings.Builder{},
}
currentDiff = NewDiff()
currentCommit = &Commit{Message: strings.Builder{}}
// Check that the commit line contains a hash and set it.
if len(line) >= 47 {
currentCommit.Hash = string(line[7:47])
Expand Down Expand Up @@ -326,12 +363,15 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch
if currentCommit == nil {
currentCommit = &Commit{}
}
if currentDiff.Content.Len() > 0 || currentDiff.IsBinary {
currentCommit.Diffs = append(currentCommit.Diffs, currentDiff)
if currentDiff.Len() > 0 || currentDiff.IsBinary {
currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff)
if err := currentDiff.finalize(); err != nil {
ctx.Logger().Error(err, "failed to finalize diff")
ahrav marked this conversation as resolved.
Show resolved Hide resolved
}
// If the currentDiff is over 1GB, drop it into the channel so it isn't held in memory waiting for more commits.
totalSize := 0
for _, diff := range currentCommit.Diffs {
totalSize += diff.Content.Len()
totalSize += diff.Len()
}
if totalSize > c.maxCommitSize {
oldCommit := currentCommit
Expand All @@ -348,7 +388,7 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch
currentCommit.Message.WriteString(oldCommit.Message.String())
}
}
currentDiff = Diff{}
currentDiff = NewDiff()
case isModeLine(isStaged, latestState, line):
latestState = ModeLine
// NoOp
Expand All @@ -375,12 +415,10 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch
case isHunkLineNumberLine(isStaged, latestState, line):
latestState = HunkLineNumberLine

if currentDiff.Content.Len() > 0 || currentDiff.IsBinary {
currentCommit.Diffs = append(currentCommit.Diffs, currentDiff)
}
currentDiff = Diff{
PathB: currentDiff.PathB,
if currentDiff.Len() > 0 || currentDiff.IsBinary {
currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff)
}
currentDiff = NewDiff(withPathB(currentDiff.PathB))

words := bytes.Split(line, []byte(" "))
if len(words) >= 3 {
Expand All @@ -395,24 +433,21 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch
latestState = HunkContentLine
}
// TODO: Why do we care about this? It creates empty lines in the diff. If there are no plusLines, it's just newlines.
currentDiff.Content.Write([]byte("\n"))
if err := currentDiff.write(ctx, []byte("\n")); err != nil {
ctx.Logger().Error(err, "failed to write to diff")
}
case isHunkPlusLine(isStaged, latestState, line):
if latestState != HunkContentLine {
latestState = HunkContentLine
}

currentDiff.Content.Write(line[1:])
case isHunkMinusLine(isStaged, latestState, line):
if latestState != HunkContentLine {
latestState = HunkContentLine
if err := currentDiff.write(ctx, line[1:]); err != nil {
ctx.Logger().Error(err, "failed to write to diff")
}
// NoOp. We only care about additions.
case isHunkNewlineWarningLine(isStaged, latestState, line):
if latestState != HunkContentLine {
latestState = HunkContentLine
}
// NoOp
case isHunkEmptyLine(isStaged, latestState, line):
case isHunkMinusLine(isStaged, latestState, line),
isHunkNewlineWarningLine(isStaged, latestState, line),
isHunkEmptyLine(isStaged, latestState, line):
if latestState != HunkContentLine {
latestState = HunkContentLine
}
Expand All @@ -439,14 +474,14 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, commitChan ch
latestState = ParseFailure
}

if currentDiff.Content.Len() > c.maxDiffSize {
if currentDiff.Len() > c.maxDiffSize {
ctx.Logger().V(2).Info(fmt.Sprintf(
"Diff for %s exceeded MaxDiffSize(%d)", currentDiff.PathB, c.maxDiffSize,
))
break
}
}
cleanupParse(currentCommit, &currentDiff, commitChan, &totalLogSize)
cleanupParse(currentCommit, currentDiff, commitChan, &totalLogSize)

ctx.Logger().V(2).Info("finished parsing git log.", "total_log_size", totalLogSize)
}
Expand Down Expand Up @@ -718,7 +753,7 @@ func isCommitSeparatorLine(isStaged bool, latestState ParseState, line []byte) b

func cleanupParse(currentCommit *Commit, currentDiff *Diff, commitChan chan Commit, totalLogSize *int) {
// Ignore empty or binary diffs (this condition may be redundant).
if currentDiff != nil && (currentDiff.Content.Len() > 0 || currentDiff.IsBinary) {
if currentDiff != nil && (currentDiff.Len() > 0 || currentDiff.IsBinary) {
currentCommit.Diffs = append(currentCommit.Diffs, *currentDiff)
}
if currentCommit != nil {
Expand Down
Loading
Loading