From 8f9c202cb74e049eb438d40a74286e61e64785f7 Mon Sep 17 00:00:00 2001 From: Richard Gomez Date: Sat, 1 Jun 2024 11:58:39 -0400 Subject: [PATCH] wip: use --mirror --- hack/snifftest/main.go | 14 ++++++------ pkg/engine/github.go | 2 ++ pkg/engine/gitlab.go | 2 ++ pkg/gitparse/gitparse.go | 17 +++++++------- pkg/sources/git/git.go | 44 ++++++++++++++++++++++++++++-------- pkg/sources/github/github.go | 6 ++--- pkg/sources/github/repo.go | 2 -- 7 files changed, 57 insertions(+), 30 deletions(-) diff --git a/hack/snifftest/main.go b/hack/snifftest/main.go index c2606ba2b21d..cc16291b6673 100644 --- a/hack/snifftest/main.go +++ b/hack/snifftest/main.go @@ -197,16 +197,16 @@ func main() { SkipBinaries: true, SkipArchives: false, Concurrency: runtime.NumCPU(), - SourceMetadataFunc: func(repository, commit, ref, email, timestamp, file string, line int64) *source_metadatapb.MetaData { + SourceMetadataFunc: func(repository, commit, commitSource, email, timestamp, file string, line int64) *source_metadatapb.MetaData { return &source_metadatapb.MetaData{ Data: &source_metadatapb.MetaData_Git{ Git: &source_metadatapb.Git{ - Repository: repository, - Commit: commit, - CommitRef: ref, - Email: email, - Timestamp: timestamp, - File: file, + Repository: repository, + Commit: commit, + CommitSource: commitSource, + Email: email, + Timestamp: timestamp, + File: file, }, }, } diff --git a/pkg/engine/github.go b/pkg/engine/github.go index 9bf84072a7d0..be889c849520 100644 --- a/pkg/engine/github.go +++ b/pkg/engine/github.go @@ -47,6 +47,8 @@ func (e *Engine) ScanGitHub(ctx context.Context, c sources.GithubConfig) error { opts := []git.ScanOption{ git.ScanOptionFilter(c.Filter), git.ScanOptionLogOptions(logOptions), + // Repositories are cloned with `--mirror` which is bare. + git.ScanOptionBare(true), } scanOptions := git.NewScanOptions(opts...) 
diff --git a/pkg/engine/gitlab.go b/pkg/engine/gitlab.go
index 73885caed08e..de1224693d0e 100644
--- a/pkg/engine/gitlab.go
+++ b/pkg/engine/gitlab.go
@@ -21,6 +21,8 @@ func (e *Engine) ScanGitLab(ctx context.Context, c sources.GitlabConfig) error {
 	opts := []git.ScanOption{
 		git.ScanOptionFilter(c.Filter),
 		git.ScanOptionLogOptions(logOptions),
+		// Repositories are cloned with `--mirror` which is bare.
+		git.ScanOptionBare(true),
 	}
 	scanOptions := git.NewScanOptions(opts...)
 
diff --git a/pkg/gitparse/gitparse.go b/pkg/gitparse/gitparse.go
index 5d7fdaa807e3..5edb3b1319da 100644
--- a/pkg/gitparse/gitparse.go
+++ b/pkg/gitparse/gitparse.go
@@ -226,6 +226,7 @@ func NewParser(options ...Option) *Parser {
 func (c *Parser) RepoPath(ctx context.Context, source string, head string, abbreviatedLog bool, excludedGlobs []string, isBare bool) (chan *Diff, error) {
 	args := []string{
 		"-C", source,
+		"--no-replace-objects",
 		"log",
 		"--patch", // https://git-scm.com/docs/git-log#Documentation/git-log.txt---patch
 		"--full-history",
@@ -636,28 +637,35 @@ func parseCommitLine(line []byte) (hash []byte, ref []byte) {
-// ParseCommitSource s
+// parseSourceRef turns the ref reported by `git log --source` into a short,
+// human-readable description of where a commit came from.
+//
+// Empty refs and ordinary branch/tag refs (`refs/heads/*`, `refs/tags/*`)
+// yield "". GitHub pull request and GitLab merge request refs yield
+// "Pull request #N" and "Merge request #N". Any other ref is returned
+// verbatim with a " (hidden ref)" suffix.
 // https://git-scm.com/docs/git-log#Documentation/git-log.txt---source
 func parseSourceRef(ref []byte) string {
-	// Remove the `refs/heads/thog` prefix.
-	// (We don't care about refs without this prefix.)
-	ref, ok := bytes.CutPrefix(ref, []byte("refs/heads/thog/"))
-	if !ok {
+	// We don't care about missing or 'normal' refs.
+	if len(ref) == 0 || bytes.HasPrefix(ref, []byte("refs/heads/")) || bytes.HasPrefix(ref, []byte("refs/tags/")) {
 		return ""
 	}
 
 	// Handle GitHub pull requests.
-	// e.g., `pr/238/head` or `pr/1234/merge`
-	if after, ok := bytes.CutPrefix(ref, []byte("pr/")); ok {
-		prNumber := after[:bytes.Index(after, []byte("/"))]
+	// e.g., `refs/pull/238/head` or `refs/pull/1234/merge`
+	if after, ok := bytes.CutPrefix(ref, []byte("refs/pull/")); ok {
+		// bytes.Cut returns the whole remainder when there is no `/`, whereas
+		// slicing at bytes.Index (-1 when absent) would panic.
+		prNumber, _, _ := bytes.Cut(after, []byte("/"))
 		return "Pull request #" + string(prNumber)
 	}
 
 	// Handle GitLab merge requests
-	// e.g., `mr/238/head` or `mr/1234/merge`
-	if after, ok := bytes.CutPrefix(ref, []byte("mr/")); ok {
-		mrNumber := after[:bytes.Index(after, []byte("/"))]
+	// e.g., `refs/merge-requests/238/head` or `refs/merge-requests/1234/merge`
+	if after, ok := bytes.CutPrefix(ref, []byte("refs/merge-requests/")); ok {
+		mrNumber, _, _ := bytes.Cut(after, []byte("/"))
 		return "Merge request #" + string(mrNumber)
 	}
 
-	return ""
+	// Any other ref (neither branch nor tag) is reported verbatim and labelled as hidden.
+	return fmt.Sprintf("%s (hidden ref)", string(ref))
 }
 
 // Author: Bill Rich
diff --git a/pkg/sources/git/git.go b/pkg/sources/git/git.go
index 61ef9ce43194..199f482f9499 100644
--- a/pkg/sources/git/git.go
+++ b/pkg/sources/git/git.go
@@ -234,9 +234,11 @@ func (s *Source) Init(aCtx context.Context, name string, jobId sources.JobID, so
 func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ ...sources.ChunkingTarget) error {
 	reporter := sources.ChanReporter{Ch: chunksChan}
 	if err := s.scanRepos(ctx, reporter); err != nil {
+		ctx.Logger().Error(err, "Chunks scanRepos")
 		return err
 	}
 	if err := s.scanDirs(ctx, reporter); err != nil {
+		ctx.Logger().Error(err, "Chunks scanDirs")
 		return err
 	}
 
@@ -251,6 +253,7 @@ func (s *Source) Chunks(ctx context.Context, chunksChan chan *sources.Chunk, _ .
 
 // scanRepos scans the configured repositories in s.conn.Repositories.
 func (s *Source) scanRepos(ctx context.Context, reporter sources.ChunkReporter) error {
+	ctx.Logger().Info("scanRepos", "repos", s.conn.Repositories)
 	if len(s.conn.Repositories) == 0 {
 		return nil
 	}
@@ -270,6 +273,7 @@ func (s *Source) scanRepos(ctx context.Context, reporter sources.ChunkReporter)
 
 // scanRepo scans a single provided repository.
func (s *Source) scanRepo(ctx context.Context, repoURI string, reporter sources.ChunkReporter) error { + ctx.Logger().Info("scanRepo", "uri", repoURI) var cloneFunc func() (string, *git.Repository, error) switch cred := s.conn.GetCredential().(type) { case *sourcespb.Git_BasicAuth: @@ -323,13 +327,16 @@ func (s *Source) scanDirs(ctx context.Context, reporter sources.ChunkReporter) e // scanDir scans a single provided directory. func (s *Source) scanDir(ctx context.Context, gitDir string, reporter sources.ChunkReporter) error { + ctx.Logger().Info("scanDir", "dir", gitDir) if !s.scanOptions.Bare && strings.HasSuffix(gitDir, "git") { + ctx.Logger().Info("skipping bare repo", "dir", gitDir) // TODO: Figure out why we skip directories ending in "git". return nil } // try paths instead of url repo, err := RepoFromPath(gitDir, s.scanOptions.Bare) if err != nil { + ctx.Logger().Error(err, "error getting RepoFromPath", "dir", gitDir) return reporter.ChunkErr(ctx, err) } @@ -417,9 +424,13 @@ func executeClone(ctx context.Context, params cloneParams) (*git.Repository, err } gitArgs := []string{ - "clone", cloneURL.String(), - params.clonePath, - "--quiet", // https://git-scm.com/docs/git-clone#Documentation/git-clone.txt-code--quietcode + "clone", cloneURL.String(), params.clonePath, + // Don't output non-vital information. + // https://git-scm.com/docs/git-clone#Documentation/git-clone.txt-code--quietcode + "--quiet", + // Fetch all refs from the remote. + // https://github.com/trufflesecurity/trufflehog/issues/1588 + "--mirror", } gitArgs = append(gitArgs, params.args...) cloneCmd := exec.Command("git", gitArgs...) 
@@ -456,10 +467,9 @@ func executeClone(ctx context.Context, params cloneParams) (*git.Repository, err return nil, fmt.Errorf("could not clone repo: %s, %w", safeURL, err) } - options := &git.PlainOpenOptions{DetectDotGit: true, EnableDotGitCommonDir: true} - repo, err := git.PlainOpenWithOptions(params.clonePath, options) + repo, err := git.PlainOpen(params.clonePath) if err != nil { - return nil, fmt.Errorf("could not open cloned repo: %w", err) + return nil, fmt.Errorf("could not open cloned repo %s: %w", safeURL, err) } logger.V(1).Info("successfully cloned repo") @@ -537,6 +547,13 @@ func (s *Git) ScanCommits(ctx context.Context, repo *git.Repository, path string repoCtx = ctx } + var gitDir string + if scanOptions.Bare { + gitDir = path + } else { + gitDir = filepath.Join(path, gitDirName) + } + logger := repoCtx.Logger() var logValues []any if scanOptions.BaseHash != "" { @@ -557,8 +574,6 @@ func (s *Git) ScanCommits(ctx context.Context, repo *git.Repository, path string return nil } - gitDir := filepath.Join(path, gitDirName) - logger.Info("scanning repo", logValues...) var depth int64 @@ -787,7 +802,13 @@ func (s *Git) ScanStaged(ctx context.Context, repo *git.Repository, path string, } reachedBase := false - gitDir := filepath.Join(path, gitDirName) + + var gitDir string + if scanOptions.Bare { + gitDir = path + } else { + gitDir = filepath.Join(path, gitDirName) + } logger := ctx.Logger() var logValues []any @@ -908,6 +929,7 @@ func (s *Git) ScanRepo(ctx context.Context, repo *git.Repository, repoPath strin if scanOptions == nil { scanOptions = NewScanOptions() } + ctx.Logger().Info("ScanRepo") if err := normalizeConfig(scanOptions, repo); err != nil { return err } @@ -1063,6 +1085,7 @@ func TryAdditionalBaseRefs(repo *git.Repository, base string) (*plumbing.Hash, e // prepareRepoSinceCommit clones a repo starting at the given commitHash and returns the cloned repo path. 
func prepareRepoSinceCommit(ctx context.Context, uriString, commitHash string) (string, bool, error) { + ctx.Logger().Info("prepareRepoSinceCommit", "commit", commitHash) if commitHash == "" { return PrepareRepo(ctx, uriString) } @@ -1248,15 +1271,18 @@ func (s *Git) handleBinary(ctx context.Context, gitDir string, reporter sources. func (s *Source) Enumerate(ctx context.Context, reporter sources.UnitReporter) error { for _, repo := range s.conn.GetDirectories() { + ctx.Logger().Info("enumerating dirs", "repo", repo) if repo == "" { continue } unit := SourceUnit{ID: repo, Kind: UnitDir} if err := reporter.UnitOk(ctx, unit); err != nil { + ctx.Logger().Error(err, "failed to chunk") return err } } for _, repo := range s.conn.GetRepositories() { + ctx.Logger().Info("enumerating repos", "repo", repo) if repo == "" { continue } diff --git a/pkg/sources/github/github.go b/pkg/sources/github/github.go index 641d2f386c20..05b4540f5f4c 100644 --- a/pkg/sources/github/github.go +++ b/pkg/sources/github/github.go @@ -788,9 +788,6 @@ func (s *Source) cloneAndScanRepo(ctx context.Context, client *github.Client, re } defer os.RemoveAll(path) - // TODO: Can this be set once or does it need to be set on every iteration? Is |s.scanOptions| set every clone? - s.setScanOptions(s.conn.Base, s.conn.Head) - // Repo size is not collected for wikis. var logger logr.Logger if !strings.HasSuffix(repoURL, ".wiki.git") && repoInfo.size > 0 { @@ -800,6 +797,9 @@ func (s *Source) cloneAndScanRepo(ctx context.Context, client *github.Client, re } logger.V(2).Info("scanning repo") + // TODO: Can this be set once or does it need to be set on every iteration? Is |s.scanOptions| set every clone? 
+ s.setScanOptions(s.conn.Base, s.conn.Head) + start := time.Now() if err = s.git.ScanRepo(ctx, repo, path, s.scanOptions, sources.ChanReporter{Ch: chunksChan}); err != nil { return duration, fmt.Errorf("error scanning repo %s: %w", repoURL, err) diff --git a/pkg/sources/github/repo.go b/pkg/sources/github/repo.go index 4cb652c43af2..8726ccddab2f 100644 --- a/pkg/sources/github/repo.go +++ b/pkg/sources/github/repo.go @@ -75,7 +75,6 @@ func (s *Source) cloneRepo( if err != nil { return "", nil, err } - case *sourcespb.GitHub_GithubApp: s.githubUser, s.githubToken, err = s.userAndToken(ctx, installationClient) if err != nil { @@ -86,7 +85,6 @@ func (s *Source) cloneRepo( if err != nil { return "", nil, err } - case *sourcespb.GitHub_Token: if err := s.getUserAndToken(ctx, repoURL, installationClient); err != nil { return "", nil, fmt.Errorf("error getting token for repo %s: %w", repoURL, err)