Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scan GitHub and GitLab refs that aren't cloned by default #1918

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions hack/snifftest/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,15 +197,16 @@ func main() {
SkipBinaries: true,
SkipArchives: false,
Concurrency: runtime.NumCPU(),
SourceMetadataFunc: func(file, email, commit, timestamp, repository string, line int64) *source_metadatapb.MetaData {
SourceMetadataFunc: func(repository, commit, commitSource, email, timestamp, file string, line int64) *source_metadatapb.MetaData {
return &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Git{
Git: &source_metadatapb.Git{
Commit: commit,
File: file,
Email: email,
Repository: repository,
Timestamp: timestamp,
Repository: repository,
Commit: commit,
CommitSource: commitSource,
Email: email,
Timestamp: timestamp,
File: file,
},
},
}
Expand Down
68 changes: 61 additions & 7 deletions pkg/gitparse/gitparse.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,9 @@ func (d *Diff) finalize() error { return d.contentWriter.CloseForWriting() }

// Commit contains commit header info and diffs.
type Commit struct {
Hash string
Hash string
// The source of a commit, if it doesn't exist in the repository's history.
Source string
Author string
Committer string
Date time.Time
Expand Down Expand Up @@ -231,19 +233,23 @@ func (c *Parser) RepoPath(
) (chan *Diff, error) {
args := []string{
"-C", source,
"--no-replace-objects",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you add this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's from this hidden comment thread. GitHub's PR UI is a nightmare.

"log",
"--patch", // https://git-scm.com/docs/git-log#Documentation/git-log.txt---patch
"--full-history",
"--date=format:%a %b %d %H:%M:%S %Y %z",
"--pretty=fuller", // https://git-scm.com/docs/git-log#_pretty_formats
"--notes", // https://git-scm.com/docs/git-log#Documentation/git-log.txt---notesltrefgt
"--source", // https://git-scm.com/docs/git-log#Documentation/git-log.txt---source
}
if abbreviatedLog {
// https://git-scm.com/docs/git-log#Documentation/git-log.txt---diff-filterACDMRTUXB82308203
args = append(args, "--diff-filter=AM")
}
if head != "" {
args = append(args, head)
} else {
// https://git-scm.com/docs/git-log#Documentation/git-log.txt---all
args = append(args, "--all")
}
for _, glob := range excludedGlobs {
Expand Down Expand Up @@ -332,10 +338,9 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, diffChan chan
outReader := bufio.NewReader(stdOut)
var (
currentCommit *Commit

totalLogSize int
totalLogSize int
latestState = Initial
)
var latestState = Initial

diff := func(c *Commit, opts ...diffOption) *Diff {
opts = append(opts, withCustomContentWriter(bufferwriter.New()))
Expand Down Expand Up @@ -395,10 +400,18 @@ func (c *Parser) FromReader(ctx context.Context, stdOut io.Reader, diffChan chan
// Create a new currentDiff and currentCommit
currentCommit = &Commit{Message: strings.Builder{}}
currentDiff = diff(currentCommit)
// Check that the commit line contains a hash and set it.
if len(line) >= 47 {
currentCommit.Hash = string(line[7:47])

hash, ref := parseCommitLine(line)
if hash == nil || ref == nil {
ctx.Logger().Error(
fmt.Errorf(`expected line to match 'commit <hash> <ref>', got %q`, line),
"Failed to parse CommitLine")
latestState = ParseFailure
continue
}

currentCommit.Hash = string(hash)
currentCommit.Source = parseSourceRef(ref)
case isMergeLine(isStaged, latestState, line):
latestState = MergeLine
case isAuthorLine(isStaged, latestState, line):
Expand Down Expand Up @@ -614,6 +627,47 @@ func isCommitLine(isStaged bool, latestState ParseState, line []byte) bool {
return false
}

func parseCommitLine(line []byte) (hash []byte, ref []byte) {
// Check that the commit line contains a 40-character hash and set it.
// `commit e5575cd6f2d21d3a1a604287c7bf4a7eab2266e0\n`
if len(line) >= 47 {
hash = line[7:47]
}

// Check if the commit line includes branch references.
// `commit 2dbbb28727c7c2954438666dafba57bb8c714d3b refs/heads/fix/github-enterprise-gist\n`
if len(line) > 48 {
ref = line[48 : len(line)-1]
}

return
}

// ParseCommitSource s
// https://git-scm.com/docs/git-log#Documentation/git-log.txt---source
func parseSourceRef(ref []byte) string {
// We don't care about 'normal' refs.
if bytes.HasPrefix(ref, []byte("refs/heads/")) || bytes.HasPrefix(ref, []byte("refs/tags/")) {
return ""
}

// Handle GitHub pull requests.
// e.g., `refs/pull/238/head` or `refs/pull/1234/merge`
if after, ok := bytes.CutPrefix(ref, []byte("refs/pull/")); ok {
prNumber := after[:bytes.Index(after, []byte("/"))]
return "Pull request #" + string(prNumber)
}

// Handle GitLab merge requests
// e.g., `refs/merge-requests/238/head` or `refs/merge-requests/1234/merge`
if after, ok := bytes.CutPrefix(ref, []byte("refs/merge-requests/")); ok {
mrNumber := after[:bytes.Index(after, []byte("/"))]
return "Merge request #" + string(mrNumber)
}

return fmt.Sprintf("%s (hidden ref)", string(ref))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does "hidden" just mean "not visible via the respective web UI?" My small concern here is that this also happens if we simply fail to parse the relevant ref because we made a mistake somewhere. What do you think of just returning the ref as-is in that case - would it be confusing?

Copy link
Contributor Author

@rgmz rgmz Jan 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does "hidden" just mean "not visible via the respective web UI?"

It means "not visible with default Git behaviour"; sometimes these refs are only discoverable via the web UI or API. The term is inspired by this blog post.

The Git ref namespace (refs/*) can store almost any path, however, Git and Git-adjacent tools only look at refs/heads/* and refs/tags/*. Other refs (if they exist) aren't fetched unless you explicitly request them from a remote.

Two examples of this are:

  1. Deleted GitHub/GitLab pull request branches being stored under refs/pull/* and refs/merge-requests/*
  2. Deleted commits that are manually fetched

My small concern here is that this also happens if we simply fail to parse the relevant ref because we made a mistake somewhere. What do you think of just returning the ref as-is in that case - would it be confusing?

I think it's worth signaling that these refs are special and don't technically exist in the repository. Returning the ref as-is is already a source of confusion (#3493).

}

// Author: Bill Rich <[email protected]>
func isAuthorLine(isStaged bool, latestState ParseState, line []byte) bool {
if isStaged || !(latestState == CommitLine || latestState == MergeLine) {
Expand Down
39 changes: 39 additions & 0 deletions pkg/gitparse/gitparse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -704,6 +704,45 @@ func TestLineChecksNoStaged(t *testing.T) {
}
}

func Test_parseCommitLine(t *testing.T) {
cases := map[string][]string{
"commit 198c63cb8212a99cc4352bc72f25e5444a786291 refs/heads/main\n": {"198c63cb8212a99cc4352bc72f25e5444a786291", "refs/heads/main"},
"commit e76dfb98ab9001daa869191b6aebe8cf4cd3b22a refs/remotes/origin/debug/aws-logging\n": {"e76dfb98ab9001daa869191b6aebe8cf4cd3b22a", "refs/remotes/origin/debug/aws-logging"},
}

for line, expected := range cases {
hash, ref := parseCommitLine([]byte(line))
if string(hash) != expected[0] {
t.Errorf("Expected: %s, Got: %s", expected[0], hash)
}
if string(ref) != expected[1] {
t.Errorf("Expected: %s, Got: %s", expected[1], ref)
}
}
}

func Test_parseSourceRef(t *testing.T) {
cases := map[string]string{
"refs/heads/master": "",
"refs/tags/v3.0.5": "",
// refs/merge-requests/33/head
"refs/heads/thog/mr/33/head": "Merge request #33",
// refs/merge-requests/19/merge
"refs/heads/thog/mr/19/merge": "Merge request #19",
// refs/pull/980/head
"refs/heads/thog/pr/980/head": "Pull request #980",
// refs/pull/1644/merge
"refs/heads/thog/pr/1644/merge": "Pull request #1644",
}

for line, expected := range cases {
source := parseSourceRef([]byte(line))
if source != expected {
t.Errorf("Expected: %s, Got: %s", expected, source)
}
}
}

func TestBinaryPathParse(t *testing.T) {
cases := map[string]string{
"Binary files a/trufflehog_3.42.0_linux_arm64.tar.gz and /dev/null differ\n": "",
Expand Down
Loading