Skip to content

Commit

Permalink
Apply path-based ignore rules to Git history
Browse files Browse the repository at this point in the history
Fixes #17.
  • Loading branch information
bradlarsen committed Jul 30, 2024
1 parent 9e7e5a6 commit d535978
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 16 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

Note that although several rules share the same name now, they all still have distinct IDs.

- The default set of patterns for the existing gitignore-style path-based exclusion mechanism (`scan --ignore=GITIGNORE_FILE`) has been expanded ([#209](https://github.com/praetorian-inc/noseyparker/pull/209)).
The new patterns cover test files from things like vendored Python, Node.js, and Go packages.

- The gitignore-style path-based exclusion patterns (`scan --ignore=GITIGNORE_FILE`) now also apply to content found within Git history, and not just paths on the filesystem ([#209](https://github.com/praetorian-inc/noseyparker/pull/209)).
When a blob is found in Git history with at least 1 associated pathname, if all of the associated pathnames match the ignore rules, the blob is not scanned.


## [v0.18.1](https://github.com/praetorian-inc/noseyparker/releases/v0.18.1) (2024-07-12)

### Fixes
Expand Down
88 changes: 77 additions & 11 deletions crates/input-enumerator/src/git_repo_enumerator.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use anyhow::{Context, Result};
use gix::{hashtable::HashMap, ObjectId, OdbHandle, Repository};
use ignore::gitignore::Gitignore;
use smallvec::SmallVec;
use std::path::{Path, PathBuf};
// use std::time::Instant;
Expand Down Expand Up @@ -116,11 +117,16 @@ pub struct BlobMetadata {
pub struct GitRepoWithMetadataEnumerator<'a> {
path: &'a Path,
repo: &'a Repository,
gitignore: &'a Gitignore,
}

impl<'a> GitRepoWithMetadataEnumerator<'a> {
pub fn new(path: &'a Path, repo: &'a Repository) -> Self {
Self { path, repo }
pub fn new(path: &'a Path, repo: &'a Repository, gitignore: &'a Gitignore) -> Self {
Self {
path,
repo,
gitignore,
}
}

pub fn run(&self, progress: &mut Progress) -> Result<GitRepoResult> {
Expand Down Expand Up @@ -241,7 +247,7 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
let path = self.path.to_owned();
match metadata_graph.repo_metadata(progress) {
Err(e) => {
warn!("failed to compute reachable blobs: {e}");
warn!("Failed to compute reachable blobs; ignoring metadata: {e}");
let blobs = blobs
.into_iter()
.map(|(blob_oid, num_bytes)| BlobMetadata {
Expand All @@ -257,7 +263,6 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
})
}
Ok(md) => {
// FIXME: apply path-based ignore rules to blobs here, like the filesystem enumerator
let mut inverted =
HashMap::<ObjectId, SmallVec<[BlobAppearance; 1]>>::with_capacity_and_hasher(
num_blobs,
Expand All @@ -273,19 +278,80 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
}
}

let blobs = blobs
// Build blobs result set.
//
// Apply any path-based ignore rules to blobs here, like the filesystem enumerator,
// filtering out blobs that have paths to ignore.
//
// Note that the behavior of ignoring blobs from Git repositories may be
// surprising.
//
// A blob may appear within a Git repository under many different paths.
// Nosey Parker doesn't compute the *entire* set of paths that each blob
// appears with. Instead, Nosey Parker computes the set of paths that each blob was
// *first introduced* with.
//
// It is also possible to instruct Nosey Parker to compute *no* path information
// for Git history.
//
// It's also possible (though rare) that a blob appears in a Git repository with
// _no_ path whatsoever.
//
// Anyway, when Nosey Parker is determining whether a blob should be gitignored or
// not, the logic is this:
//
// - If the set of pathnames for a blob is empty, *do not* ignore the blob.
//
// - If the set of pathnames for a blob is *not* empty, if *all* of the pathnames
// match the gitignore rules, ignore the blob.

let blobs: Vec<_> = blobs
.into_iter()
.map(|(blob_oid, num_bytes)| {
let first_seen = inverted
.get(&blob_oid)
.map_or(SmallVec::new(), |v| v.clone());
BlobMetadata {
.filter_map(|(blob_oid, num_bytes)| match inverted.get(&blob_oid) {
None => Some(BlobMetadata {
blob_oid,
num_bytes,
first_seen,
first_seen: SmallVec::new(),
}),

Some(first_seen) => {
let first_seen: SmallVec<_> = first_seen
.iter()
.filter(|entry| {
use bstr::ByteSlice;
match entry.path.to_path() {
Ok(path) => {
let is_dir = false;
let m = self.gitignore.matched(path, is_dir);
let is_ignore = m.is_ignore();
// if is_ignore {
// debug!("ignoring path {}: {m:?}", path.display());
// }
!is_ignore
}
Err(_e) => {
// debug!("error converting to path: {e}");
true
}
}
})
.cloned()
.collect();

if first_seen.is_empty() {
// warn!("ignoring blob {blob_oid}");
None
} else {
Some(BlobMetadata {
blob_oid,
num_bytes,
first_seen,
})
}
}
})
.collect();

Ok(GitRepoResult {
path,
blobs,
Expand Down
45 changes: 40 additions & 5 deletions crates/input-enumerator/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ pub mod git_commit_metadata;
pub mod git_metadata_graph;

use anyhow::Result;
use ignore::{DirEntry, WalkBuilder, WalkState};
use ignore::{
gitignore::{Gitignore, GitignoreBuilder},
DirEntry, WalkBuilder, WalkState,
};
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use tracing::{debug, error, warn};
Expand Down Expand Up @@ -43,6 +46,8 @@ struct VisitorBuilder<'t> {
global_files: &'t Mutex<Vec<FileResult>>,
global_git_repos: &'t Mutex<Vec<GitRepoResult>>,

gitignore: &'t Gitignore,

progress: &'t Progress,
}

Expand All @@ -59,6 +64,7 @@ where
local_git_repos: Vec::new(),
global_files: self.global_files,
global_git_repos: self.global_git_repos,
gitignore: self.gitignore,
progress: self.progress.clone(),
})
}
Expand All @@ -78,6 +84,8 @@ struct Visitor<'t> {
global_files: &'t Mutex<Vec<FileResult>>,
global_git_repos: &'t Mutex<Vec<GitRepoResult>>,

gitignore: &'t Gitignore,

progress: Progress,
}

Expand Down Expand Up @@ -149,7 +157,11 @@ impl<'t> ignore::ParallelVisitor for Visitor<'t> {
debug!("Found Git repo at {}", path.display());

if self.collect_git_metadata {
let enumerator = GitRepoWithMetadataEnumerator::new(path, &repository);
let enumerator = GitRepoWithMetadataEnumerator::new(
path,
&repository,
&self.gitignore,
);
match enumerator.run(&mut self.progress) {
Err(e) => {
error!(
Expand Down Expand Up @@ -197,14 +209,25 @@ impl<'t> ignore::ParallelVisitor for Visitor<'t> {
/// - Enumeration of blobs found in Git repositories
/// - Support for ignoring files based on size or using path-based gitignore-style rules
pub struct FilesystemEnumerator {
/// The inner filesystem walker builder
walk_builder: WalkBuilder,

// We store the max file size here in addition to inside the `walk_builder` to work around a
// bug in `ignore` where max filesize is not applied to top-level file inputs, only inputs that
// appear under a directory.
/// A gitignore builder, used for propagating path-based ignore rules into git history enumeration
///
/// Note that this is a duplicate representation of the gitignore rules stored within
/// `walk_builder`. There seems to be no avoiding that with the APIs exposed by the
/// `WalkBuilder` type today.
gitignore_builder: GitignoreBuilder,

/// We store the max file size here in addition to inside the `walk_builder` to work around a
/// bug in `ignore` where max filesize is not applied to top-level file inputs, only inputs that
/// appear under a directory.
max_file_size: Option<u64>,

/// Should git metadata (commit and path information) be collected?
collect_git_metadata: bool,

/// Should git history be scanned at all?
enumerate_git_history: bool,
}

Expand All @@ -230,11 +253,14 @@ impl FilesystemEnumerator {
builder.max_filesize(max_file_size);
builder.standard_filters(false);

let gitignore_builder = GitignoreBuilder::new("");

Ok(FilesystemEnumerator {
walk_builder: builder,
max_file_size,
collect_git_metadata: Self::DEFAULT_COLLECT_GIT_METADATA,
enumerate_git_history: Self::DEFAULT_ENUMERATE_GIT_HISTORY,
gitignore_builder,
})
}

Expand All @@ -246,6 +272,12 @@ impl FilesystemEnumerator {

/// Add a set of gitignore-style rules from the given ignore file.
pub fn add_ignore<T: AsRef<Path>>(&mut self, path: T) -> Result<&mut Self> {
let path = path.as_ref();

if let Some(e) = self.gitignore_builder.add(path) {
Err(e)?;
}

match self.walk_builder.add_ignore(path) {
Some(e) => Err(e)?,
None => Ok(self),
Expand Down Expand Up @@ -294,12 +326,15 @@ impl FilesystemEnumerator {
let files = Mutex::new(Vec::new());
let git_repos = Mutex::new(Vec::new());

let gitignore = self.gitignore_builder.build()?;

let mut visitor_builder = VisitorBuilder {
collect_git_metadata: self.collect_git_metadata,
enumerate_git_history: self.enumerate_git_history,
max_file_size: self.max_file_size,
global_files: &files,
global_git_repos: &git_repos,
gitignore: &gitignore,
progress,
};

Expand Down

0 comments on commit d535978

Please sign in to comment.