From cd6c187f8503163bfaf9640330ad52d099fdfd3d Mon Sep 17 00:00:00 2001
From: Brad Larsen
Date: Fri, 27 Sep 2024 10:46:04 -0400
Subject: [PATCH] Reduce memory and CPU use when scanning (#222)

Rework git metadata calculation to reduce peak memory and wall clock time
when scanning. This includes many changes. The net effect of all this is a
typical 30% speedup and 50% memory reduction when scanning Git repositories;
in pathological cases, up to a 5x speedup and 20x memory reduction.

- Git metadata graph:
  - Do not construct an in-memory graph of all trees and blob names; instead,
    read tree objects from the repo as late as possible
  - Use `SmallVec` to reduce heap fragmentation and small heap allocations
  - Use more suitable initial sizes for worklists and scratch buffers to
    reduce reallocations and heap fragmentation
  - Use the fastest / slimmest order for iterating object headers from a git
    repository when initially counting objects
  - Eliminate redundant intermediate data structures; remove unused fields
    from remaining intermediate data structures
  - Avoid temporary allocations when concatenating blob filenames
  - Fix a longstanding bug where a blob introduced multiple times within a
    single commit would have only a single arbitrary pathname reported

- `BStringTable`:
  - change default initialization to create an empty table
  - change `get_or_intern` to avoid heap-allocated temporaries when an entry
    already exists (sketched below)

- Scanning:
  - Use an `Arc<CommitMetadata>` instead of `CommitMetadata` and an
    `Arc<PathBuf>` instead of `PathBuf` within git blob provenance entries
    (allows sharing; sometimes reduces memory use of these object types
    10,000x)
  - Use a higher default level of parallelism: require 3GiB RAM instead of
    4GiB per parallel job
  - Open Git repositories a single time instead of twice

- Fix clippy nits
---
 CHANGELOG.md                                  |  15 +-
 .../input-enumerator/src/blob_appearance.rs   |   8 +-
 crates/input-enumerator/src/bstring_table.rs  |  36 +-
 .../src/git_metadata_graph.rs                 | 694 +++++++++++-------
 .../src/git_repo_enumerator.rs                | 317 +++-----
 crates/input-enumerator/src/lib.rs            |  34 +
 crates/noseyparker-cli/src/args.rs            |   2 +-
 .../src/cmd_report/sarif_format.rs            |   2 +-
 crates/noseyparker-cli/src/cmd_scan.rs        |  52 +-
 crates/noseyparker/Cargo.toml                 |   2 +-
 crates/noseyparker/src/provenance.rs          |  11 +-
 crates/noseyparker/src/provenance_set.rs      |   3 +-
 12 files changed, 622 insertions(+), 554 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 06a306b6b..475d26e78 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,7 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 ### Changes
 - Inputs are now enumerated incrementally as scanning proceeds rather than done in an initial batch step ([#216](https://github.com/praetorian-inc/noseyparker/pull/216)).
-  This reduces peak memory use and CPU time 10-20%, particularly in environments with slow I/O.
+  This reduces peak memory use and wall clock time 10-20%, particularly in environments with slow I/O.
   A consequence of this change is that the total amount of data to scan is not known until it has actually been scanned, and so the scanning progress bar no longer shows a completion percentage.

 - When cloning Git repositories while scanning, the progress bar now includes the current repository URL ([#212](https://github.com/praetorian-inc/noseyparker/pull/212)).
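The `BStringTable` bullet above is easiest to see in miniature: the old `get_or_intern` took a `BString` by value, so every caller allocated an owned string even when it was already interned; the new signature takes a borrowed `&BStr` and allocates only on a miss. A minimal sketch of that borrowed-key interning pattern, using plain `std` types and a `(u32, u32)` byte range standing in for the crate's `Symbol` type (illustrative only, not the crate's actual API):

```rust
use std::collections::HashMap;

/// Simplified interner: all bytes live in one contiguous buffer,
/// and a "symbol" is just a (start, end) range into that buffer.
struct Table {
    storage: Vec<u8>,
    mapping: HashMap<Vec<u8>, (u32, u32)>,
}

impl Table {
    fn get_or_intern(&mut self, s: &[u8]) -> (u32, u32) {
        // Borrowed lookup: the common "already interned" case allocates nothing.
        if let Some(&sym) = self.mapping.get(s) {
            return sym;
        }
        // Miss: copy the bytes into contiguous storage and record their range.
        let i = self.storage.len() as u32;
        self.storage.extend_from_slice(s);
        let sym = (i, i + s.len() as u32);
        self.mapping.insert(s.to_vec(), sym);
        sym
    }

    fn resolve(&self, (i, j): (u32, u32)) -> &[u8] {
        &self.storage[i as usize..j as usize]
    }
}
```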
@@ -38,11 +38,22 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Bitbucket App Password ([#219](https://github.com/praetorian-inc/noseyparker/pull/219) from @gemesa)

-### Fixes
+- The default number of parallel scanner jobs is now higher on many systems ([#222](https://github.com/praetorian-inc/noseyparker/pull/222)).
+  This value is determined in part by the amount of system RAM;
+  due to several memory use improvements, the required minimum RAM per job has been reduced, allowing for more parallelism.

+### Fixes
 - The `Google OAuth Credentials` rule has been revised to avoid runtime errors about an empty capture group.
+
 - The `AWS Secret Access Key` rule has been revised to avoid runtime `Regex failed to match` errors.

+- The code that determines first-commit provenance information for blobs from Git repositories has been reworked to improve memory use ([#222](https://github.com/praetorian-inc/noseyparker/pull/222)).
+  In typical cases of scanning Git repositories, this reduces both peak memory use and wall clock time by 20-50%.
+  In certain pathological cases, such as [Homebrew](https://github.com/homebrew/homebrew-core) or [nixpkgs](https://github.com/NixOS/nixpkgs), the new implementation uses up to 20x less peak memory and up to 5x less wall clock time.
+
+- When determining blob provenance information from Git repositories, blobs that first appear multiple times within a single commit will now be reported with _all_ names they appear with ([#222](https://github.com/praetorian-inc/noseyparker/pull/222)).
+  Previously, one of the pathnames would be arbitrarily selected.
+
 ## [v0.19.0](https://github.com/praetorian-inc/noseyparker/releases/v0.19.0) (2024-07-30)

diff --git a/crates/input-enumerator/src/blob_appearance.rs b/crates/input-enumerator/src/blob_appearance.rs
index 61f0d3505..02c01cae5 100644
--- a/crates/input-enumerator/src/blob_appearance.rs
+++ b/crates/input-enumerator/src/blob_appearance.rs
@@ -1,13 +1,13 @@
+use crate::git_commit_metadata::CommitMetadata;
 use bstr::{BString, ByteSlice};
-use gix::ObjectId;
 use smallvec::SmallVec;
 use std::path::Path;
+use std::sync::Arc;

 /// Where was a particular blob seen?
 #[derive(Clone, Debug, serde::Serialize)]
 pub struct BlobAppearance {
-    /// The commit ID
-    pub commit_oid: ObjectId,
+    pub commit_metadata: Arc<CommitMetadata>,

     /// The path given to the blob
     pub path: BString,
@@ -21,4 +21,4 @@ impl BlobAppearance {
 }

 /// A set of `BlobAppearance` entries
-pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>;
+pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 2]>;
diff --git a/crates/input-enumerator/src/bstring_table.rs b/crates/input-enumerator/src/bstring_table.rs
index 4d17cb00e..3b7a10861 100644
--- a/crates/input-enumerator/src/bstring_table.rs
+++ b/crates/input-enumerator/src/bstring_table.rs
@@ -1,5 +1,5 @@
 use bstr::{BStr, BString};
-use std::collections::{hash_map::Entry, HashMap};
+use std::collections::HashMap;

 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct Symbol<T> {
     i: T,
     j: T,
 }

+#[allow(clippy::len_without_is_empty)]
 pub trait SymbolType: Copy + PartialEq + Eq + std::hash::Hash {
     fn to_range(self) -> std::ops::Range<usize>;
     fn from_range(r: std::ops::Range<usize>) -> Self;
+
+    fn len(&self) -> usize {
+        self.to_range().len()
+    }
 }

 impl SymbolType for Symbol<u32> {
@@ -49,7 +54,10 @@ pub struct BStringTable<S: SymbolType = Symbol<u32>> {
 impl<S: SymbolType> BStringTable<S> {
     pub fn new() -> Self {
-        Self::with_capacity(32 * 1024, 1024 * 1024)
+        Self {
+            storage: Default::default(),
+            mapping: Default::default(),
+        }
     }

     pub fn with_capacity(num_symbols: usize, total_bytes: usize) -> Self {
@@ -60,15 +68,17 @@ impl<S: SymbolType> BStringTable<S> {
     }

     #[inline]
-    pub fn get_or_intern(&mut self, s: BString) -> S {
-        match self.mapping.entry(s) {
-            Entry::Occupied(e) => *e.get(),
-            Entry::Vacant(e) => {
-                let s = e.key();
+    pub fn get_or_intern(&mut self, s: &BStr) -> S {
+        match self.mapping.get(s) {
+            Some(s) => *s,
+            None => {
+                let s = s.to_owned();
                 let i = self.storage.len();
                 let j = i + s.len();
                 self.storage.extend(s.as_slice());
-                *e.insert(S::from_range(i..j))
+                let k = S::from_range(i..j);
+                self.mapping.insert(s, k);
+                k
             }
         }
     }
@@ -91,12 +101,12 @@ mod tests {
         let s1 = BStr::new("Hello");
         let s2 = BStr::new("World");

-        let sym1 = t.get_or_intern(s1.to_owned());
-        let sym1a = t.get_or_intern(s1.to_owned());
+        let sym1 = t.get_or_intern(s1);
+        let sym1a = t.get_or_intern(s1);
         assert_eq!(sym1, sym1a);

-        let sym2 = t.get_or_intern(s2.to_owned());
-        let sym2a = t.get_or_intern(s2.to_owned());
+        let sym2 = t.get_or_intern(s2);
+        let sym2a = t.get_or_intern(s2);
         assert_eq!(sym2, sym2a);

         assert_ne!(sym1, sym2);
@@ -104,7 +114,7 @@ mod tests {
         assert_eq!(s1, t.resolve(sym1));
         assert_eq!(s2, t.resolve(sym2));

-        let sym1b = t.get_or_intern(s1.to_owned());
+        let sym1b = t.get_or_intern(s1);
         assert_eq!(sym1, sym1b);
     }
 }
diff --git a/crates/input-enumerator/src/git_metadata_graph.rs b/crates/input-enumerator/src/git_metadata_graph.rs
index a3ef72c4f..f76ad2b19 100644
--- a/crates/input-enumerator/src/git_metadata_graph.rs
+++ b/crates/input-enumerator/src/git_metadata_graph.rs
@@ -2,7 +2,9 @@ use anyhow::{bail, Context, Result};
 use bstr::BString;
 use fixedbitset::FixedBitSet;
 use gix::hashtable::{hash_map, HashMap};
-use gix::ObjectId;
+use gix::objs::tree::EntryKind;
+use gix::prelude::*;
+use gix::{object::Kind, ObjectId, OdbHandle};
 use petgraph::graph::{DiGraph, EdgeIndex, IndexType, NodeIndex};
 use petgraph::prelude::*;
 use petgraph::visit::Visitable;
@@ -10,15 +12,16 @@ use roaring::RoaringBitmap;
 use smallvec::SmallVec;
 use std::collections::BinaryHeap;
 use std::time::Instant;
-use tracing::{debug, error, warn};
+use tracing::{debug, error, error_span, warn};

-use
 crate::bstring_table::BStringTable;
+use crate::bstring_table::{BStringTable, SymbolType};
+use crate::{unwrap_ok_or_continue, unwrap_some_or_continue};

 type Symbol = crate::bstring_table::Symbol<u32>;

 /// A newtype for commit graph indexes, to prevent mixing up indexes from different types of graphs
 #[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone, Default, Debug)]
-pub struct CommitGraphIdx(NodeIndex);
+pub(crate) struct CommitGraphIdx(NodeIndex);

 /// Boilerplate instance for the index newtype
 unsafe impl IndexType for CommitGraphIdx {
@@ -39,82 +42,235 @@ unsafe impl IndexType for CommitGraphIdx {
 type CommitNodeIdx = NodeIndex<CommitGraphIdx>;
 type CommitEdgeIdx = EdgeIndex<CommitGraphIdx>;

-/// A newtype for tree and blob graph indexes, to prevent mixing up indexes from different types of graphs
+/// A newtype wrapper for a u32 that maps to a gix::ObjectId, for use as an index in other
+/// array-based data structures.
 #[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Copy, Clone, Default, Debug)]
-pub struct TreeBlobGraphIdx(NodeIndex);
+pub(crate) struct ObjectIdx(u32);

-/// Boilerplate instance for the index newtype
-unsafe impl IndexType for TreeBlobGraphIdx {
-    #[inline(always)]
-    fn new(x: usize) -> Self {
-        Self(NodeIndex::new(x))
-    }
-    #[inline(always)]
-    fn index(&self) -> usize {
-        self.0.index()
-    }
-    #[inline(always)]
-    fn max() -> Self {
-        Self(<NodeIndex as IndexType>::max())
+impl ObjectIdx {
+    pub(crate) fn new(x: usize) -> Self {
+        Self(x.try_into().unwrap())
     }
-}

-pub type TreeBlobNodeIdx = NodeIndex<TreeBlobGraphIdx>;
-pub type TreeBlobEdgeIdx = EdgeIndex<TreeBlobGraphIdx>;
-
-pub struct CommitMetadata {
-    pub oid: ObjectId,
-    pub tree_idx: Option<TreeBlobNodeIdx>,
+    pub(crate) fn as_usize(&self) -> usize {
+        self.0 as usize
+    }
 }

-#[derive(PartialEq, Eq, Debug)]
-enum TreeBlobKind {
-    Tree,
-    Blob,
+#[derive(Clone, Copy)]
+pub(crate) struct CommitMetadata {
+    pub(crate) oid: ObjectId,
+    pub(crate) tree_idx: Option<ObjectIdx>,
 }

-pub struct TreeBlobMetadata {
-    kind: TreeBlobKind,
-    oid: ObjectId,
+/// A compact set of git objects, denoted via `ObjectIdx`
+#[derive(Clone, Debug, Default)]
+struct SeenObjectSet {
+    seen_trees: RoaringBitmap,
+    seen_blobs: RoaringBitmap,
 }

-#[derive(Clone, Debug)]
-struct SeenTreeBlobSet(RoaringBitmap);
-
-impl SeenTreeBlobSet {
-    #[inline]
-    pub fn new() -> Self {
-        SeenTreeBlobSet(RoaringBitmap::new())
+impl SeenObjectSet {
+    pub(crate) fn new() -> Self {
+        SeenObjectSet {
+            seen_trees: RoaringBitmap::new(),
+            seen_blobs: RoaringBitmap::new(),
+        }
     }

-    #[inline]
-    pub fn contains(&self, idx: TreeBlobNodeIdx) -> Result<bool> {
+    /// Returns whether the value was absent from the set
+    fn insert(set: &mut RoaringBitmap, idx: ObjectIdx) -> Result<bool> {
         let idx = idx
-            .index()
+            .as_usize()
             .try_into()
             .context("index should be representable with a u32")?;
-        Ok(self.0.contains(idx))
+        Ok(set.insert(idx))
     }

-    #[inline]
-    pub fn insert(&mut self, idx: TreeBlobNodeIdx) -> Result<bool> {
+    fn contains(set: &RoaringBitmap, idx: ObjectIdx) -> Result<bool> {
         let idx = idx
-            .index()
+            .as_usize()
             .try_into()
             .context("index should be representable with a u32")?;
-        Ok(self.0.insert(idx))
+        Ok(set.contains(idx))
     }

-    #[inline]
-    pub fn union_update(&mut self, other: &Self) {
-        self.0 |= &other.0;
+    /// Returns whether the value was absent from the set
+    pub(crate) fn insert_tree(&mut self, idx: ObjectIdx) -> Result<bool> {
+        Self::insert(&mut self.seen_trees, idx)
+    }
+
+    /// Returns whether the value was absent from the set
+    pub(crate) fn insert_blob(&mut self, idx: ObjectIdx) -> Result<bool> {
+        Self::insert(&mut self.seen_blobs, idx)
+    }
+
+    pub(crate) fn contains_blob(&self, idx: ObjectIdx) -> Result<bool> {
+        Self::contains(&self.seen_blobs, idx)
+    }
+
+    pub(crate) fn union_update(&mut self, other: &Self) {
+        self.seen_blobs |= &other.seen_blobs;
+        self.seen_trees |= &other.seen_trees;
     }
 }

-impl Default for SeenTreeBlobSet {
-    #[inline]
-    fn default() -> Self {
-        Self::new()
+struct ObjectIdBimap {
+    oid_to_idx: HashMap<ObjectId, ObjectIdx>,
+    idx_to_oid: Vec<ObjectId>,
+}
+
+impl ObjectIdBimap {
+    fn with_capacity(capacity: usize) -> Self {
+        Self {
+            oid_to_idx: HashMap::with_capacity_and_hasher(capacity, Default::default()),
+            idx_to_oid: Vec::with_capacity(capacity),
+        }
+    }
+
+    fn insert(&mut self, oid: ObjectId) {
+        match self.oid_to_idx.entry(oid) {
+            gix::hashtable::hash_map::Entry::Occupied(e) => {
+                warn!("object {} seen multiple times", e.key());
+            }
+            gix::hashtable::hash_map::Entry::Vacant(e) => {
+                let idx = ObjectIdx::new(self.idx_to_oid.len());
+                self.idx_to_oid.push(*e.key());
+                e.insert(idx);
+            }
+        }
+    }
+
+    fn get_oid(&self, idx: ObjectIdx) -> Option<&gix::oid> {
+        self.idx_to_oid.get(idx.as_usize()).map(|v| v.as_ref())
+    }
+
+    fn get_idx(&self, oid: &gix::oid) -> Option<ObjectIdx> {
+        self.oid_to_idx.get(oid).copied()
+    }
+
+    fn len(&self) -> usize {
+        self.idx_to_oid.len()
+    }
+}
+
+// Some types and data structures for recursively enumerating tree objects
+type Symbols = SmallVec<[Symbol; 6]>;
+type TreeWorklistItem = (Symbols, ObjectId);
+type TreeWorklist = Vec<TreeWorklistItem>;
+
+/// An in-memory index that organizes various objects within a Git repository.
+///
+/// - It associates a u32-based ID with each object
+/// - It partitions object IDs according to object type (commit, blob, tree, tag)
+pub(crate) struct RepositoryIndex {
+    trees: ObjectIdBimap,
+    commits: ObjectIdBimap,
+    blobs: ObjectIdBimap,
+    tags: ObjectIdBimap,
+}
+
+impl RepositoryIndex {
+    pub(crate) fn new(odb: &OdbHandle) -> Result<Self> {
+        use gix::odb::store::iter::Ordering;
+        use gix::prelude::*;
+
+        // Get object count to allow for exact index allocation size
+        // Use fastest gix ordering mode
+        let mut num_tags = 0;
+        let mut num_trees = 0;
+        let mut num_blobs = 0;
+        let mut num_commits = 0;
+
+        for oid in odb
+            .iter()
+            .context("Failed to iterate object database")?
+            .with_ordering(Ordering::PackLexicographicalThenLooseLexicographical)
+        {
+            let oid = unwrap_ok_or_continue!(oid, |e| { error!("Failed to read object id: {e}") });
+            let hdr = unwrap_ok_or_continue!(odb.header(oid), |e| {
+                error!("Failed to read object header for {oid}: {e}")
+            });
+            match hdr.kind() {
+                Kind::Tree => num_trees += 1,
+                Kind::Blob => num_blobs += 1,
+                Kind::Commit => num_commits += 1,
+                Kind::Tag => num_tags += 1,
+            }
+        }
+
+        // Allocate indexes exactly to the size needed
+        let mut trees = ObjectIdBimap::with_capacity(num_trees);
+        let mut commits = ObjectIdBimap::with_capacity(num_commits);
+        let mut blobs = ObjectIdBimap::with_capacity(num_blobs);
+        let mut tags = ObjectIdBimap::with_capacity(num_tags);
+
+        // Now build in-memory index
+        // Use slower gix ordering mode, but one that puts objects in a possibly more efficient
+        // order for reading
+        for oid in odb
+            .iter()
+            .context("Failed to iterate object database")?
+            .with_ordering(Ordering::PackAscendingOffsetThenLooseLexicographical)
+        {
+            let oid = unwrap_ok_or_continue!(oid, |e| { error!("Failed to read object id: {e}") });
+            let hdr = unwrap_ok_or_continue!(odb.header(oid), |e| {
+                error!("Failed to read object header for {oid}: {e}")
+            });
+            match hdr.kind() {
+                Kind::Tree => trees.insert(oid),
+                Kind::Blob => blobs.insert(oid),
+                Kind::Commit => commits.insert(oid),
+                Kind::Tag => tags.insert(oid),
+            }
+        }
+
+        Ok(Self {
+            trees,
+            commits,
+            blobs,
+            tags,
+        })
+    }
+
+    pub(crate) fn num_commits(&self) -> usize {
+        self.commits.len()
+    }
+
+    pub(crate) fn num_blobs(&self) -> usize {
+        self.blobs.len()
+    }
+
+    pub(crate) fn num_trees(&self) -> usize {
+        self.trees.len()
+    }
+
+    pub(crate) fn num_tags(&self) -> usize {
+        self.tags.len()
+    }
+
+    pub(crate) fn num_objects(&self) -> usize {
+        self.num_commits() + self.num_blobs() + self.num_tags() + self.num_trees()
+    }
+
+    pub(crate) fn get_tree_oid(&self, idx: ObjectIdx) -> Option<&gix::oid> {
+        self.trees.get_oid(idx)
+    }
+
+    pub(crate) fn get_tree_index(&self, oid: &gix::oid) -> Option<ObjectIdx> {
+        self.trees.get_idx(oid)
+    }
+
+    pub(crate) fn get_blob_index(&self, oid: &gix::oid) -> Option<ObjectIdx> {
+        self.blobs.get_idx(oid)
+    }
+
+    pub(crate) fn into_blobs(self) -> Vec<ObjectId> {
+        self.blobs.idx_to_oid
+    }
+
+    pub(crate) fn commits(&self) -> &[ObjectId] {
+        self.commits.idx_to_oid.as_slice()
     }
 }

@@ -127,54 +283,24 @@ impl Default for SeenTreeBlobSet {
 /// This is also an in-memory graph of the trees and blobs in Git.
 /// Each tree object is a node that has an outgoing edge to each of its immediate children, labeled
 /// with the name of that child.
-pub struct GitMetadataGraph {
-    symbols: BStringTable,
-
+pub(crate) struct GitMetadataGraph {
     commit_oid_to_node_idx: HashMap<ObjectId, CommitNodeIdx>,
     commits: DiGraph<CommitMetadata, (), CommitGraphIdx>,
-
-    tree_blob_oid_to_node_idx: HashMap<ObjectId, TreeBlobNodeIdx>,
-    trees_and_blobs: DiGraph<TreeBlobMetadata, Symbol, TreeBlobGraphIdx>,
 }

 impl GitMetadataGraph {
     /// Create a new commit graph with the given capacity.
-    pub fn with_capacity(num_commits: usize, num_trees: usize, num_blobs: usize) -> Self {
-        let num_trees_and_blobs = num_trees + num_blobs;
-
+    pub(crate) fn with_capacity(num_commits: usize) -> Self {
         // use 2x the number of commits, assuming that most commits have a single parent commit,
         // except merges, which usually have 2
         let commit_edges_capacity = num_commits * 2;

-        let tree_blob_edges_capacity = {
-            // this is an estimate of how many trees+blobs change in a typical commit
-            let avg_increase = (num_trees_and_blobs as f64 / num_commits as f64).max(1.0);
-
-            // guess at how many trees+blobs do we expect to see total in a typical commit
-            // assume that 6% of a tree's contents change at each commit
-            //
-            // XXX: this is magic, but chosen empirically from looking at a few test repos.
-            // it would be better if we could sample some commits
-            let baseline = avg_increase / 0.06;
-
-            (num_commits as f64 * (baseline + avg_increase)) as usize
-        }
-        .min(1024 * 1024 * 1024);
-        // info!("tree blob edges capacity: {tree_blob_edges_capacity}");
-
         Self {
-            symbols: BStringTable::new(),
-
             commit_oid_to_node_idx: HashMap::with_capacity_and_hasher(
                 num_commits,
                 Default::default(),
             ),
             commits: DiGraph::with_capacity(num_commits, commit_edges_capacity),
-            trees_and_blobs: DiGraph::with_capacity(num_trees_and_blobs, tree_blob_edges_capacity),
-            tree_blob_oid_to_node_idx: HashMap::with_capacity_and_hasher(
-                num_trees_and_blobs,
-                Default::default(),
-            ),
         }
     }

@@ -182,30 +308,20 @@ impl GitMetadataGraph {
     ///
     /// Panics if the given graph node index is not valid for this graph.
     #[inline]
-    pub fn get_commit_metadata(&self, idx: CommitNodeIdx) -> &CommitMetadata {
+    pub(crate) fn get_commit_metadata(&self, idx: CommitNodeIdx) -> &CommitMetadata {
         self.commits
             .node_weight(idx)
             .expect("commit graph node index should be valid")
     }

-    /// Get the tree/blob metadata for the given graph node index.
-    ///
-    /// Panics if the given graph node index is not valid for this graph.
-    #[inline]
-    pub fn get_tree_blob_metadata(&self, idx: TreeBlobNodeIdx) -> &TreeBlobMetadata {
-        self.trees_and_blobs
-            .node_weight(idx)
-            .expect("tree/blob graph node index should be valid")
-    }
-
     /// Get the index of the graph node for the given commit, creating it if needed.
     ///
     /// If a node already exists for the given commit and `tree_idx` is given, the node's metadata
     /// is updated with the given value.
-    pub fn get_commit_idx(
+    pub(crate) fn get_commit_idx(
         &mut self,
         oid: ObjectId,
-        tree_idx: Option<TreeBlobNodeIdx>,
+        tree_idx: Option<ObjectIdx>,
     ) -> CommitNodeIdx {
         match self.commit_oid_to_node_idx.entry(oid) {
             hash_map::Entry::Occupied(e) => {
@@ -228,7 +344,7 @@ impl GitMetadataGraph {
     /// Add a new edge between two commits, returning its index.
     ///
     /// NOTE: If an edge already exists between the two commits, a parallel edge is added.
-    pub fn add_commit_edge(
+    pub(crate) fn add_commit_edge(
         &mut self,
         parent_idx: CommitNodeIdx,
         child_idx: CommitNodeIdx,
@@ -237,106 +353,56 @@ impl GitMetadataGraph {
         // `self.commits.update_edge(parent_idx, child_idx, ())`.
         self.commits.add_edge(parent_idx, child_idx, ())
     }
-
-    /// Add a new graph node for the given blob object, returning its index.
-    pub fn get_blob_idx(&mut self, blob_oid: ObjectId) -> TreeBlobNodeIdx {
-        *self
-            .tree_blob_oid_to_node_idx
-            .entry(blob_oid)
-            .or_insert_with(|| {
-                self.trees_and_blobs.add_node(TreeBlobMetadata {
-                    kind: TreeBlobKind::Blob,
-                    oid: blob_oid,
-                })
-            })
-    }
-
-    /// Add a new graph node for the given tree object, returning its index.
-    pub fn get_tree_idx(&mut self, tree_oid: ObjectId) -> TreeBlobNodeIdx {
-        *self
-            .tree_blob_oid_to_node_idx
-            .entry(tree_oid)
-            .or_insert_with(|| {
-                self.trees_and_blobs.add_node(TreeBlobMetadata {
-                    kind: TreeBlobKind::Tree,
-                    oid: tree_oid,
-                })
-            })
-    }
-
-    /// Add a new edge for a tree and another tree or blob.
-    ///
-    /// NOTE: If such an edge already exists, a parallel edge is added.
-    pub fn add_tree_edge(
-        &mut self,
-        parent_idx: TreeBlobNodeIdx,
-        child_idx: TreeBlobNodeIdx,
-        name: BString,
-    ) -> TreeBlobEdgeIdx {
-        // NOTE: this will allow parallel (i.e., duplicate) edges from `from_idx` to `to_idx`.
-        //
-        // For alternative behavior that doesn't add parallel edges, use
-        // `self.commits.update_edge(from_idx, to_idx, ())`.
-        let sym = self.symbols.get_or_intern(name);
-        self.trees_and_blobs.add_edge(parent_idx, child_idx, sym)
-    }
-
-    pub fn num_commits(&self) -> usize {
-        self.commits.node_count()
-    }
-
-    pub fn num_trees(&self) -> usize {
-        self.trees_and_blobs
-            .node_weights()
-            .filter(|md| md.kind == TreeBlobKind::Tree)
-            .count()
-    }
-
-    pub fn num_blobs(&self) -> usize {
-        self.trees_and_blobs
-            .node_weights()
-            .filter(|md| md.kind == TreeBlobKind::Blob)
-            .count()
-    }
 }

-pub struct RepoMetadata {
-    /// index of the commit this is for; indexes into the commits graph
-    pub commit_oid: ObjectId,
+pub(crate) type IntroducedBlobs = SmallVec<[(ObjectId, BString); 4]>;
+
+pub(crate) struct CommitBlobMetadata {
+    /// index of the commit this entry applies to
+    pub(crate) commit_oid: ObjectId,

     /// set of introduced blobs and path names
-    pub introduced_blobs: Vec<(ObjectId, BString)>,
+    pub(crate) introduced_blobs: IntroducedBlobs,
 }

 impl GitMetadataGraph {
-    pub fn get_repo_metadata(&self) -> Result<Vec<RepoMetadata>> {
+    pub(crate) fn get_repo_metadata(
+        self,
+        repo_index: &RepositoryIndex,
+        repo: &gix::Repository,
+    ) -> Result<Vec<CommitBlobMetadata>> {
+        let _span =
+            error_span!("get_repo_metadata", path = repo.path().display().to_string()).entered();
+
         let t1 = Instant::now();
-        let symbols = &self.symbols;
         let cg = &self.commits;
-        let tbg = &self.trees_and_blobs;

         let num_commits = cg.node_count();

-        // The set of seen trees and blobs. This has an entry for _each_ commit, though at runtime,
-        // not all of these seen sets will be "live".
-        //
-        // FIXME: merge this data structure with the `worklist` priority queue; See https://docs.rs/priority-queue; this allows O(1) updates of items in the queue
-        let mut seen_sets: Vec<Option<SeenTreeBlobSet>> = vec![None; num_commits];
-
-        let mut blobs_introduced: Vec<Vec<(ObjectId, BString)>> = vec![Vec::new(); num_commits];
-
         // An adaptation of Kahn's topological sorting algorithm, to visit the commit nodes in
         // topological order:
         // This algorithm naturally maintains a frontier of still-to-expand nodes.
         //
-        // We attach to each node in the frontier a set of seen blobs and seen trees in the traversal
-        // up to that point.
+        // We attach to each node in the frontier a set of seen blobs and seen trees in the
+        // traversal up to that point.
+
+        // A mapping of graph index of a commit to the set of seen trees and blobs.
+        //
+        // There is one such set for _each_ commit, though at runtime, not all of these seen sets
+        // will be "live" (those will have `None` values).
+        //
+        // XXX: this could be merged with the `commit_worklist` priority queue with a suitable data
+        // structure (one allowing O(1) updates of items in the queue)
+        let mut seen_sets: Vec<Option<SeenObjectSet>> = vec![None; num_commits];
+
+        // A mapping of graph index of a commit to the set of blobs introduced by that commit
+        let mut blobs_introduced: Vec<IntroducedBlobs> = vec![IntroducedBlobs::new(); num_commits];

         // NOTE: petgraph comes with a pre-built data type for keeping track of visited nodes,
         // but has no such thing for keeping track of visited edges, so we make our own.
-        let mut visited_edges = FixedBitSet::with_capacity(cg.edge_count());
+        let mut visited_commit_edges = FixedBitSet::with_capacity(cg.edge_count());
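As a reference for the traversal the comments above describe, here is a minimal, self-contained sketch of a Kahn-style topological sort over a petgraph `DiGraph` — just the skeleton, without the seen-set propagation, out-degree priority queue, or parallel-edge handling of the real code (illustrative only, not the patch's code):

```rust
use petgraph::graph::{DiGraph, NodeIndex};
use petgraph::Direction::Incoming;
use std::collections::VecDeque;

/// Kahn's algorithm: repeatedly emit a node all of whose parents have been emitted.
/// Returns None if the graph contains a cycle.
fn topo_order<N, E>(g: &DiGraph<N, E>) -> Option<Vec<NodeIndex>> {
    // Per-edge in-degree count for every node (parallel edges count once each).
    let mut in_deg: Vec<usize> = g
        .node_indices()
        .map(|n| g.neighbors_directed(n, Incoming).count())
        .collect();
    // Frontier: nodes with no unprocessed parents (the roots, initially).
    let mut frontier: VecDeque<NodeIndex> =
        g.node_indices().filter(|n| in_deg[n.index()] == 0).collect();
    let mut order = Vec::with_capacity(g.node_count());
    while let Some(n) = frontier.pop_front() {
        order.push(n);
        for child in g.neighbors(n) {
            in_deg[child.index()] -= 1;
            if in_deg[child.index()] == 0 {
                frontier.push_back(child);
            }
        }
    }
    // Fewer emitted nodes than graph nodes means a cycle was present.
    (order.len() == g.node_count()).then_some(order)
}
```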
-        // Keep track of which commit nodes we have seen: needed to avoid re-visiting nodes in the rare
-        // present of parallel (i.e., multiple) edges between two commits.
+        // Keep track of which commit nodes we have seen: needed to avoid re-visiting nodes in the
+        // rare presence of parallel (i.e., multiple) edges between two commits.
         let mut visited_commits = cg.visit_map();

         // We use an ordered queue for the worklist instead of a deque or simple vector.
@@ -364,9 +430,14 @@ impl GitMetadataGraph {
             Ok(std::cmp::Reverse(count))
         };

-        // A queue of commit graph node indexes, ordered by minimum out-degree
-        let mut worklist =
-            BinaryHeap::<(OutDegree, CommitNodeIdx)>::with_capacity(1024.max(num_commits / 2));
+        // A table for interned bytestrings; used to represent filename path fragments, drastically
+        // reducing peak memory use
+        let mut symbols = BStringTable::with_capacity(32 * 1024, 1024 * 1024);
+
+        // A queue of commit graph node indexes, ordered by minimum out-degree.
+        // Invariant: each enqueued commit has no unprocessed parent commits
+        let mut commit_worklist =
+            BinaryHeap::<(OutDegree, CommitNodeIdx)>::with_capacity(num_commits);

         // Initialize with commit nodes that have no parents
         for root_idx in cg
             .node_indices()
             .filter(|idx| cg.neighbors_directed(*idx, Incoming).count() == 0)
         {
             let out_degree = commit_out_degree(root_idx)?;
-            worklist.push((out_degree, root_idx));
-            seen_sets[root_idx.index()] = Some(SeenTreeBlobSet::new());
+            commit_worklist.push((out_degree, root_idx));
+            seen_sets[root_idx.index()] = Some(SeenObjectSet::new());
         }

-        let mut tree_worklist: Vec<(TreeBlobNodeIdx, SmallVec<[Symbol; 2]>)> =
-            Vec::with_capacity(32 * 1024);
+        // A worklist of tree objects (and no other type) to be traversed
+        let mut tree_worklist = TreeWorklist::with_capacity(32 * 1024);
+        let mut tree_buf = Vec::with_capacity(1024 * 1024);
+        // A scratch buffer for new blobs encountered while traversing a tree
+        let mut blobs_encountered = Vec::with_capacity(16 * 1024);

         // various counters for statistics
-        let mut max_frontier_size = 0; // max value of size of `worklist`
+        let mut max_frontier_size = 0; // max value of size of `commit_worklist`
         let mut num_blobs_introduced = 0; // total number of blobs introduced in commits
         let mut num_trees_introduced = 0; // total number of trees introduced in commits
         let mut num_commits_visited = 0; // total number of commits visited

-        let mut num_live_seen_sets = worklist.len();
-        let mut max_live_seen_sets = 0; // max value of `num_live_seen_sets`
+        let mut num_live_seen_sets = commit_worklist.len(); // current number of live seen sets
+        let mut max_live_seen_sets = num_live_seen_sets; // max value of `num_live_seen_sets`

-        while let Some((_out_degree, commit_idx)) = worklist.pop() {
+        while let Some((_out_degree, commit_idx)) = commit_worklist.pop() {
             let commit_index = commit_idx.index();
             if visited_commits.put(commit_index) {
                 warn!("found duplicate commit node {commit_index}");
                 continue;
             }

-            // info!("{commit_index}: {out_degree:?} {:?}", worklist.iter().map(|e| e.0).max());
+            let introduced = &mut blobs_introduced[commit_index];

             let mut seen = seen_sets[commit_index]
                 .take()
@@ -405,106 +479,84 @@ impl GitMetadataGraph {
             assert!(num_live_seen_sets > 0);
             num_live_seen_sets -= 1;

+            // Update stats
             num_commits_visited += 1;
-            max_frontier_size = max_frontier_size.max(worklist.len() + 1);
+            max_frontier_size = max_frontier_size.max(commit_worklist.len() + 1);
             max_live_seen_sets = max_live_seen_sets.max(num_live_seen_sets);

             // Update `seen` with the tree and blob IDs reachable from this commit
             let commit_md = self.get_commit_metadata(commit_idx);
-            // FIXME: improve this type to avoid a runtime check here
-            match commit_md.tree_idx {
-                None => {
-                    warn!(
-                        "commit metadata missing for {}; blob metadata may be incomplete or wrong",
-                        commit_md.oid
-                    );
-                    // NOTE: if we reach this point, we still need to enumerate child nodes, even
-                    // though we can't traverse the commit's tree.
-                    // Otherwise, we spuriously fail later, incorrectly reporting a cycle detected.
-                }
-                Some(tree_idx) => {
-                    assert!(tree_worklist.is_empty());
-                    if !seen.contains(tree_idx)? {
-                        tree_worklist.push((tree_idx, SmallVec::new()));
-                    }
-
-                    //while let Some((name_path, idx)) = tree_worklist.pop() {
-                    while let Some((idx, name_path)) = tree_worklist.pop() {
-                        let metadata = self.get_tree_blob_metadata(idx);
-                        match metadata.kind {
-                            TreeBlobKind::Tree => num_trees_introduced += 1,
-                            TreeBlobKind::Blob => {
-                                num_blobs_introduced += 1;
-
-                                let name_path: Vec<u8> =
-                                    bstr::join("/", name_path.iter().map(|s| symbols.resolve(*s)));
-                                blobs_introduced[commit_index]
-                                    .push((metadata.oid, BString::from(name_path)));
-                                // info!("{}: {}: {name_path:?}", commit_md.oid, metadata.oid);
-                            }
-                        }
-
-                        for edge in tbg.edges_directed(idx, Outgoing) {
-                            let child_idx = edge.target();
-                            if !seen.insert(child_idx)? {
-                                continue;
-                            }
-
-                            let mut child_name_path = name_path.clone();
-                            child_name_path.push(*edge.weight());
-                            tree_worklist.push((child_idx, child_name_path));
-                        }
-                    }
+            if let Some(tree_idx) = commit_md.tree_idx {
+                assert!(tree_worklist.is_empty());
+                if seen.insert_tree(tree_idx)? {
+                    tree_worklist.push((
+                        SmallVec::new(),
+                        repo_index.get_tree_oid(tree_idx).unwrap().to_owned(),
+                    ));
+
+                    visit_tree(
+                        repo,
+                        &mut symbols,
+                        repo_index,
+                        &mut num_trees_introduced,
+                        &mut num_blobs_introduced,
+                        &mut seen,
+                        introduced,
+                        &mut tree_buf,
+                        &mut tree_worklist,
+                        &mut blobs_encountered,
+                    )?;
                 }
+            } else {
+                warn!(
+                    "Failed to find commit metadata for {}; blob metadata may be incomplete or wrong",
+                    commit_md.oid
+                );
+                // NOTE: if we reach this point, we still need to process the child commits, even
+                // though we can't traverse this commit's tree.
+                // Otherwise, we spuriously fail later, incorrectly reporting a cycle detected.
             }

             // Propagate this commit's seen set into each of its immediate child commit's seen set.
+            // Handle the last child commit specially: it inherits this commit node's seen set,
+            // as it will no longer be needed. This optimization reduces memory traffic, especially
+            // in the common case of a single commit parent.
             let mut edges = cg.edges_directed(commit_idx, Outgoing).peekable();
             while let Some(edge) = edges.next() {
                 let edge_index = edge.id().index();
-                if visited_edges.put(edge_index) {
-                    error!(
-                        "Edge {edge_index} already visited -- this was supposed to be impossible!"
-                    );
+                if visited_commit_edges.put(edge_index) {
+                    error!("Edge {edge_index} already visited -- supposed to be impossible!");
                     continue;
                 }

-                let child_idx = edge.target();
+                let child_idx = edge.target();
                 let child_seen = &mut seen_sets[child_idx.index()];
-                match child_seen.as_mut() {
-                    Some(child_seen) => {
-                        // Already have a seen set allocated for this child commit. Update it.
-                        child_seen.union_update(&seen);
-                    }
-                    None => {
-                        // No seen set allocated yet for this child commit; make a new one,
-                        // recycling the current parent commit's seen set if possible. We can do
-                        // this on the last loop iteration (i.e., when we are working on the last
-                        // child commit), because at that point, the parent's seen set will no
-                        // longer be needed. This optimization reduces memory traffic, especially
-                        // in the common case of a single commit parent.
-
-                        num_live_seen_sets += 1;
-                        if edges.peek().is_none() {
-                            *child_seen = Some(std::mem::take(&mut seen));
-                        } else {
-                            *child_seen = Some(seen.clone());
-                        }
+                if let Some(child_seen) = child_seen.as_mut() {
+                    // Already have a seen set allocated for this child commit. Update it.
+                    child_seen.union_update(&seen);
+                } else {
+                    // No seen set allocated yet for this child commit.
+                    // Make one, recycling the current parent commit's seen set if possible.
+                    num_live_seen_sets += 1;
+                    if edges.peek().is_none() {
+                        *child_seen = Some(std::mem::take(&mut seen));
+                    } else {
+                        *child_seen = Some(seen.clone());
                     }
                 }

                 // If the child commit node has no unvisited parent commits, add it to the worklist
                 if !cg
                     .edges_directed(child_idx, Incoming)
-                    .any(|edge| !visited_edges.contains(edge.id().index()))
+                    .any(|edge| !visited_commit_edges.contains(edge.id().index()))
                 {
-                    worklist.push((commit_out_degree(child_idx)?, child_idx));
+                    commit_worklist.push((commit_out_degree(child_idx)?, child_idx));
                 }
             }
         }

-        if visited_edges.count_ones(..) != visited_edges.len() {
-            bail!("topological traversal of commits failed: a commit cycle!?");
+        if visited_commit_edges.count_ones(..) != visited_commit_edges.len() {
+            bail!("Topological traversal of commits failed: a commit cycle!?");
         }

         assert_eq!(num_commits_visited, num_commits);
@@ -521,14 +573,116 @@ impl GitMetadataGraph {
         );

         // Massage intermediate accumulated results into output format
-        let commit_metadata = cg
+        let commit_metadata: Vec<CommitBlobMetadata> = cg
            .node_weights()
            .zip(blobs_introduced)
-            .map(|(md, intro)| RepoMetadata {
+            .map(|(md, introduced_blobs)| CommitBlobMetadata {
                 commit_oid: md.oid,
-                introduced_blobs: intro,
+                introduced_blobs,
             })
             .collect();
+
         Ok(commit_metadata)
     }
 }
+
+#[allow(clippy::too_many_arguments)]
+fn visit_tree(
+    repo: &gix::Repository,
+    symbols: &mut BStringTable,
+    repo_index: &RepositoryIndex,
+    num_trees_introduced: &mut usize,
+    num_blobs_introduced: &mut usize,
+    seen: &mut SeenObjectSet,
+    introduced: &mut IntroducedBlobs,
+    tree_buf: &mut Vec<u8>,
+    tree_worklist: &mut TreeWorklist,
+    blobs_encountered: &mut Vec<ObjectIdx>,
+) -> Result<()> {
+    blobs_encountered.clear();
+    while let Some((name_path, tree_oid)) = tree_worklist.pop() {
+        // read the tree object from the repo,
+        // enumerate its child entries, and extend the worklist with the unseen child trees
+        let tree_iter = unwrap_ok_or_continue!(
+            repo.objects.find_tree_iter(&tree_oid, tree_buf),
+            |e| error!("Failed to find tree {tree_oid}: {e}"),
+        );
+
+        *num_trees_introduced += 1;
+
+        for child in tree_iter {
+            let child = unwrap_ok_or_continue!(child, |e| {
+                error!("Failed to read tree entry from {tree_oid}: {e}")
+            });
+            // skip non-tree / non-blob tree entries
+            match child.mode.kind() {
+                EntryKind::Link | EntryKind::Commit => continue,
+
+                EntryKind::Tree => {
+                    let child_idx =
+                        unwrap_some_or_continue!(repo_index.get_tree_index(child.oid), || error!(
+                            "Failed to find tree index for {} from tree {tree_oid}",
+                            child.oid
+                        ),);
+                    if !seen.insert_tree(child_idx)? {
+                        continue;
+                    }
+                    let mut child_name_path = name_path.clone();
+                    child_name_path.push(symbols.get_or_intern(child.filename));
+                    tree_worklist.push((child_name_path, child.oid.to_owned()));
+                }
+
+                EntryKind::Blob | EntryKind::BlobExecutable => {
+                    let child_idx =
+                        unwrap_some_or_continue!(repo_index.get_blob_index(child.oid), || error!(
+                            "Failed to find blob index for {} from tree {tree_oid}",
+                            child.oid
+                        ));
+                    if seen.contains_blob(child_idx)? {
+                        continue;
+                    }
+                    blobs_encountered.push(child_idx);
+
+                    *num_blobs_introduced += 1;
+
+                    // Compute full path to blob as a bytestring.
+                    // Instead of using `bstr::join`, manually construct the string to
+                    // avoid intermediate allocations.
+                    let name_path = {
+                        use bstr::ByteVec;
+
+                        let fname = symbols.get_or_intern(child.filename);
+
+                        let needed_len = name_path.iter().map(|s| s.len()).sum::<usize>()
+                            + child.filename.len()
+                            + name_path.len();
+                        let mut it = name_path
+                            .iter()
+                            .copied()
+                            .chain(std::iter::once(fname))
+                            .map(|s| symbols.resolve(s));
+                        let mut buf = Vec::with_capacity(needed_len);
+                        if let Some(p) = it.next() {
+                            buf.push_str(p);
+                            for p in it {
+                                buf.push_char('/');
+                                buf.push_str(p);
+                            }
+                        }
+                        debug_assert_eq!(needed_len, buf.capacity());
+                        debug_assert_eq!(needed_len, buf.len());
+                        BString::from(buf)
+                    };
+                    introduced.push((child.oid.to_owned(), name_path));
+                }
+            }
+        }
+    }
+
+    for blob_idx in blobs_encountered.iter() {
+        seen.insert_blob(*blob_idx)?;
+    }
+    blobs_encountered.clear();
+
+    Ok(())
+}
diff --git a/crates/input-enumerator/src/git_repo_enumerator.rs b/crates/input-enumerator/src/git_repo_enumerator.rs
index cfdde19f0..1cee11938 100644
--- a/crates/input-enumerator/src/git_repo_enumerator.rs
+++ b/crates/input-enumerator/src/git_repo_enumerator.rs
@@ -1,85 +1,17 @@
 use anyhow::{Context, Result};
-use gix::{hashtable::HashMap, ObjectId, OdbHandle, Repository};
+use gix::{hashtable::HashMap, ObjectId, Repository};
 use ignore::gitignore::Gitignore;
 use smallvec::SmallVec;
 use std::path::{Path, PathBuf};
+use std::sync::Arc;
 use std::time::Instant;
 // use std::time::Instant;
-use tracing::{debug, debug_span, warn};
+use tracing::{debug, debug_span, error};

 use crate::blob_appearance::{BlobAppearance, BlobAppearanceSet};
 use crate::git_commit_metadata::CommitMetadata;
-use crate::git_metadata_graph::GitMetadataGraph;
-
-// -------------------------------------------------------------------------------------------------
-// implementation helpers
-// -------------------------------------------------------------------------------------------------
-macro_rules! unwrap_or_continue {
-    ($arg:expr) => {
-        match $arg {
-            Ok(v) => v,
-            Err(_e) => {
-                continue;
-            }
-        }
-    };
-    ($arg:expr, $on_error:expr) => {
-        match $arg {
-            Ok(v) => v,
-            Err(e) => {
-                #[allow(clippy::redundant_closure_call)]
-                $on_error(e);
-                continue;
-            }
-        }
-    };
-}
-
-pub struct ObjectCounts {
-    num_commits: usize,
-    num_trees: usize,
-    num_blobs: usize,
-}
-
-// TODO: measure how helpful or pointless it is to count the objects in advance
-fn count_git_objects(odb: &OdbHandle) -> Result<ObjectCounts> {
-    let t1 = Instant::now();
-
-    use gix::object::Kind;
-    use gix::odb::store::iter::Ordering;
-    use gix::prelude::*;
-
-    let mut num_commits = 0;
-    let mut num_trees = 0;
-    let mut num_blobs = 0;
-    let mut num_objects = 0;
-
-    for oid in odb
-        .iter()
-        .context("Failed to iterate object database")?
-        .with_ordering(Ordering::PackAscendingOffsetThenLooseLexicographical)
-    {
-        num_objects += 1;
-        let oid = unwrap_or_continue!(oid, |e| { warn!("Failed to read object id: {e}") });
-        let hdr = unwrap_or_continue!(odb.header(oid), |e| {
-            warn!("Failed to read object header for {oid}: {e}")
-        });
-        match hdr.kind() {
-            Kind::Commit => num_commits += 1,
-            Kind::Tree => num_trees += 1,
-            Kind::Blob => num_blobs += 1,
-            Kind::Tag => {}
-        }
-    }
-
-    debug!("Counted {num_objects} objects in {:.6}s", t1.elapsed().as_secs_f64());
-
-    Ok(ObjectCounts {
-        num_commits,
-        num_trees,
-        num_blobs,
-    })
-}
+use crate::git_metadata_graph::{GitMetadataGraph, RepositoryIndex};
+use crate::{unwrap_ok_or_continue, unwrap_some_or_continue};

 // -------------------------------------------------------------------------------------------------
 // enumeration return types
@@ -93,27 +25,11 @@ pub struct GitRepoResult {

     /// The blobs to be scanned
     pub blobs: Vec<BlobMetadata>,
-
-    /// Finite map from commit ID to metadata
-    ///
-    /// NOTE: this may be incomplete, missing entries for some commits
-    pub commit_metadata: HashMap<ObjectId, CommitMetadata>,
-}
-
-impl GitRepoResult {
-    pub fn total_blob_bytes(&self) -> u64 {
-        self.blobs.iter().map(|t| t.num_bytes).sum()
-    }
-
-    pub fn num_blobs(&self) -> u64 {
-        self.blobs.len() as u64
-    }
 }

 #[derive(Clone)]
 pub struct BlobMetadata {
     pub blob_oid: ObjectId,
-    pub num_bytes: u64,
     pub first_seen: BlobAppearanceSet,
 }

@@ -138,142 +54,101 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> {
     pub fn run(self) -> Result<GitRepoResult> {
         let t1 = Instant::now();

-        use gix::object::Kind;
-        use gix::odb::store::iter::Ordering;
         use gix::prelude::*;

         let _span = debug_span!("enumerate_git_with_metadata", "{}", self.path.display()).entered();

         let odb = &self.repo.objects;

-        // TODO: measure how helpful or pointless it is to count the objects in advance
-
-        // First count the objects to figure out how big to allocate data structures.
         // We're assuming that the repository doesn't change in the meantime.
         // If it does, our allocation estimates won't be right. Too bad!
-        let ObjectCounts {
-            num_commits,
-            num_trees,
-            num_blobs,
-        } = count_git_objects(odb)?;
-
-        let mut blobs: Vec<(ObjectId, u64)> = Vec::with_capacity(num_blobs);
-        let mut metadata_graph = GitMetadataGraph::with_capacity(num_commits, num_trees, num_blobs);
-
-        // scratch buffer used for decoding commits and trees.
-        // size chosen here based on experimentation: biggest commit/tree in cpython is 250k
-        let orig_scratch_capacity = 1024 * 1024;
-        let mut scratch: Vec<u8> = Vec::with_capacity(orig_scratch_capacity);
+        let object_index = RepositoryIndex::new(odb)?;
+        debug!(
+            "Indexed {} objects in {:.6}s; {} blobs; {} commits",
+            object_index.num_objects(),
+            t1.elapsed().as_secs_f64(),
+            object_index.num_blobs(),
+            object_index.num_commits(),
+        );
+
+        let mut metadata_graph = GitMetadataGraph::with_capacity(object_index.num_commits());
+
+        // scratch buffer used for decoding commits.
+        // size chosen here based on experimentation: biggest commit/tree in cpython is ~250KiB
+        // choose a big enough size to avoid resizing in almost all cases
+        let mut scratch: Vec<u8> = Vec::with_capacity(4 * 1024 * 1024);

         let mut commit_metadata =
-            HashMap::with_capacity_and_hasher(num_commits, Default::default());
-
-        for oid in odb
-            .iter()
-            .context("Failed to iterate object database")?
-            .with_ordering(Ordering::PackAscendingOffsetThenLooseLexicographical)
-        {
-            let oid = unwrap_or_continue!(oid, |e| warn!("Failed to read object id: {e}"));
-            let hdr = unwrap_or_continue!(odb.header(oid), |e| warn!(
-                "Failed to read object header for {oid}: {e}"
-            ));
-            match hdr.kind() {
-                Kind::Blob => {
-                    let obj_size = hdr.size();
-                    metadata_graph.get_blob_idx(oid);
-                    blobs.push((oid, obj_size));
-                }
-
-                Kind::Commit => {
-                    let commit = unwrap_or_continue!(odb.find_commit(&oid, &mut scratch), |e| {
-                        warn!("Failed to find commit {oid}: {e}");
-                    });
-
-                    let tree_idx = metadata_graph.get_tree_idx(commit.tree());
-                    let commit_idx = metadata_graph.get_commit_idx(oid, Some(tree_idx));
-                    for parent_oid in commit.parents() {
-                        let parent_idx = metadata_graph.get_commit_idx(parent_oid, None);
-                        metadata_graph.add_commit_edge(parent_idx, commit_idx);
-                    }
-
-                    let committer = &commit.committer;
-                    let author = &commit.author;
-                    let md = CommitMetadata {
-                        commit_id: oid,
-                        committer_name: committer.name.to_owned(),
-                        committer_timestamp: committer.time,
-                        committer_email: committer.email.to_owned(),
-                        author_name: author.name.to_owned(),
-                        author_timestamp: author.time,
-                        author_email: author.email.to_owned(),
-                        message: commit.message.to_owned(),
-                    };
-                    commit_metadata.insert(oid, md);
-                }
-
-                Kind::Tree => {
-                    let tree_idx = metadata_graph.get_tree_idx(oid);
-                    let tree_ref_iter =
-                        unwrap_or_continue!(odb.find_tree_iter(&oid, &mut scratch), |e| {
-                            warn!("Failed to find tree {oid}: {e}");
-                        });
-                    for child in tree_ref_iter {
-                        let child = unwrap_or_continue!(child, |e| {
-                            warn!("Failed to decode entry in tree {oid}: {e}");
-                        });
-                        use gix::objs::tree::EntryKind;
-                        let child_idx = match child.mode.kind() {
-                            EntryKind::Tree => metadata_graph.get_tree_idx(child.oid.into()),
-                            EntryKind::Blob | EntryKind::BlobExecutable => {
-                                metadata_graph.get_blob_idx(child.oid.into())
-                            }
-                            _ => continue,
-                        };
-
-                        metadata_graph.add_tree_edge(
-                            tree_idx,
-                            child_idx,
-                            child.filename.to_owned(),
-                        );
-                    }
-                }
-
-                _ => {}
+            HashMap::with_capacity_and_hasher(object_index.num_commits(), Default::default());
+
+        for commit_oid in object_index.commits() {
+            let commit = unwrap_ok_or_continue!(odb.find_commit(commit_oid, &mut scratch), |e| {
+                error!("Failed to find commit {commit_oid}: {e}");
+            });
+
+            let tree_oid = commit.tree();
+            let tree_idx = unwrap_some_or_continue!(object_index.get_tree_index(&tree_oid), || {
+                error!("Failed to find tree {tree_oid} for commit {commit_oid}");
+            });
+            let commit_idx = metadata_graph.get_commit_idx(*commit_oid, Some(tree_idx));
+            for parent_oid in commit.parents() {
+                let parent_idx = metadata_graph.get_commit_idx(parent_oid, None);
+                metadata_graph.add_commit_edge(parent_idx, commit_idx);
             }
+
+            let committer = &commit.committer;
+            let author = &commit.author;
+            let md = CommitMetadata {
+                commit_id: *commit_oid,
+                committer_name: committer.name.to_owned(),
+                committer_timestamp: committer.time,
+                committer_email: committer.email.to_owned(),
+                author_name: author.name.to_owned(),
+                author_timestamp: author.time,
+                author_email: author.email.to_owned(),
+                message: commit.message.to_owned(),
+            };
+            commit_metadata.insert(*commit_oid, Arc::new(md));
         }

         debug!("Built metadata graph in {:.6}s", t1.elapsed().as_secs_f64());

-        let path = self.path.to_owned();
-        match metadata_graph.get_repo_metadata() {
+        match metadata_graph.get_repo_metadata(&object_index, &self.repo) {
             Err(e) => {
-                warn!("Failed to compute reachable blobs; ignoring metadata: {e}");
-                let blobs = blobs
+
error!("Failed to compute reachable blobs; ignoring metadata: {e}"); + let blobs = object_index + .into_blobs() .into_iter() - .map(|(blob_oid, num_bytes)| BlobMetadata { + .map(|blob_oid| BlobMetadata { blob_oid, - num_bytes, first_seen: Default::default(), }) .collect(); Ok(GitRepoResult { repository: self.repo, - path, + path: self.path.to_owned(), blobs, - commit_metadata, }) } Ok(md) => { - let mut inverted = - HashMap::>::with_capacity_and_hasher( - num_blobs, - Default::default(), - ); + let mut blob_to_appearance: HashMap = object_index + .into_blobs() + .into_iter() + .map(|b| (b, SmallVec::new())) + .collect(); + for e in md.into_iter() { + let commit_metadata = + unwrap_some_or_continue!(commit_metadata.get(&e.commit_oid), || { + error!("Failed to find commit metadata for {}", e.commit_oid); + }); for (blob_oid, path) in e.introduced_blobs.into_iter() { - let vals = inverted.entry(blob_oid).or_insert(SmallVec::new()); + let vals = blob_to_appearance + .entry(blob_oid) + .or_insert(SmallVec::new()); vals.push(BlobAppearance { - commit_oid: e.commit_oid, + commit_metadata: commit_metadata.clone(), path, }); } @@ -295,27 +170,20 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { // // It's also possible (though rare) that a blob appears in a Git repository with // _no_ path whatsoever. - // - // Anyway, when Nosey Parker is determining whether a blob should be gitignored or - // not, the logic is this: - // - // - If the set of pathnames for a blob is empty, *do not* ignore the blob. - // - // - If the set of pathnames for a blob is *not* empty, if *all* of the pathnames - // match the gitignore rules, ignore the blob. - - let blobs: Vec<_> = blobs + let blobs: Vec = blob_to_appearance .into_iter() - .filter_map(|(blob_oid, num_bytes)| match inverted.get(&blob_oid) { - None => Some(BlobMetadata { - blob_oid, - num_bytes, - first_seen: SmallVec::new(), - }), - - Some(first_seen) => { - let first_seen: SmallVec<_> = first_seen - .iter() + .filter_map(|(blob_oid, first_seen)| { + if first_seen.is_empty() { + // no commit metadata at all for blob + Some(BlobMetadata { + blob_oid, + first_seen, + }) + } else { + // filter out path-ignored provenance entries; suppress blob if all + // provenance entries get filtered + let first_seen: BlobAppearanceSet = first_seen + .into_iter() .filter(|entry| { use bstr::ByteSlice; match entry.path.to_path() { @@ -334,7 +202,6 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { } } }) - .cloned() .collect(); if first_seen.is_empty() { @@ -343,7 +210,6 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { } else { Some(BlobMetadata { blob_oid, - num_bytes, first_seen, }) } @@ -353,9 +219,8 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { Ok(GitRepoResult { repository: self.repo, - path, + path: self.path.to_owned(), blobs, - commit_metadata, }) } } @@ -384,37 +249,33 @@ impl<'a> GitRepoEnumerator<'a> { let odb = &self.repo.objects; - let mut blobs: Vec<(ObjectId, u64)> = Vec::with_capacity(64 * 1024); + let mut blobs: Vec = Vec::with_capacity(64 * 1024); for oid in odb .iter() .context("Failed to iterate object database")? 
             .with_ordering(Ordering::PackAscendingOffsetThenLooseLexicographical)
         {
-            let oid = unwrap_or_continue!(oid, |e| warn!("Failed to read object id: {e}"));
-            let hdr = unwrap_or_continue!(odb.header(oid), |e| warn!(
+            let oid = unwrap_ok_or_continue!(oid, |e| error!("Failed to read object id: {e}"));
+            let hdr = unwrap_ok_or_continue!(odb.header(oid), |e| error!(
                 "Failed to read object header for {oid}: {e}"
             ));
             if hdr.kind() == Kind::Blob {
-                let obj_size = hdr.size();
-                blobs.push((oid, obj_size));
+                blobs.push(oid);
             }
         }

-        let path = self.path.to_owned();
         let blobs = blobs
             .into_iter()
-            .map(|(blob_oid, num_bytes)| BlobMetadata {
+            .map(|blob_oid| BlobMetadata {
                 blob_oid,
-                num_bytes,
                 first_seen: Default::default(),
             })
             .collect();

         Ok(GitRepoResult {
             repository: self.repo,
-            path,
+            path: self.path.to_owned(),
             blobs,
-            commit_metadata: Default::default(),
         })
     }
 }
diff --git a/crates/input-enumerator/src/lib.rs b/crates/input-enumerator/src/lib.rs
index 80b287eab..7ce297633 100644
--- a/crates/input-enumerator/src/lib.rs
+++ b/crates/input-enumerator/src/lib.rs
@@ -11,6 +11,40 @@ use ignore::{DirEntry, WalkBuilder, WalkState};
 use std::path::{Path, PathBuf};
 use tracing::{debug, error, warn};

+// -------------------------------------------------------------------------------------------------
+// helper macros
+// -------------------------------------------------------------------------------------------------
+macro_rules! unwrap_some_or_continue {
+    ($arg:expr, $on_error:expr $(,)?) => {
+        match $arg {
+            Some(v) => v,
+            None => {
+                #[allow(clippy::redundant_closure_call)]
+                $on_error();
+                continue;
+            }
+        }
+    };
+}
+
+pub(crate) use unwrap_some_or_continue;
+
+macro_rules! unwrap_ok_or_continue {
+    ($arg:expr, $on_error:expr $(,)?) => {
+        match $arg {
+            Ok(v) => v,
+            Err(e) => {
+                #[allow(clippy::redundant_closure_call)]
+                $on_error(e);
+                continue;
+            }
+        }
+    };
+}
+
+pub(crate) use unwrap_ok_or_continue;
+
+// -------------------------------------------------------------------------------------------------
 mod git_repo_enumerator;
 pub use git_repo_enumerator::{GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator};
diff --git a/crates/noseyparker-cli/src/args.rs b/crates/noseyparker-cli/src/args.rs
index 271b65f4c..ec2416462 100644
--- a/crates/noseyparker-cli/src/args.rs
+++ b/crates/noseyparker-cli/src/args.rs
@@ -104,7 +104,7 @@ fn default_scan_jobs() -> usize {
     match (std::thread::available_parallelism(), *RAM_GB) {
         (Ok(v), Some(ram_gb)) => {
             let n: usize = v.into();
-            let max_n = (ram_gb / 4.0).ceil().max(1.0) as usize;
+            let max_n = (ram_gb / 3.0).ceil().max(1.0) as usize;
             n.clamp(1, max_n)
         }
         (Ok(v), None) => v.into(),
diff --git a/crates/noseyparker-cli/src/cmd_report/sarif_format.rs b/crates/noseyparker-cli/src/cmd_report/sarif_format.rs
index d700c7e4b..4f7068ff6 100644
--- a/crates/noseyparker-cli/src/cmd_report/sarif_format.rs
+++ b/crates/noseyparker-cli/src/cmd_report/sarif_format.rs
@@ -143,7 +143,7 @@ fn noseyparker_sarif_rules() -> Result<Vec<sarif::ReportingDescriptor>> {
         .iter_rules()
         .map(|rule| {
             let help = sarif::MultiformatMessageStringBuilder::default()
-                .text(&rule.references.join("\n"))
+                .text(rule.references.join("\n"))
                 .build()?;

             // FIXME: add better descriptions to Nosey Parker rules
diff --git a/crates/noseyparker-cli/src/cmd_scan.rs b/crates/noseyparker-cli/src/cmd_scan.rs
index 538547adf..a58d35274 100644
--- a/crates/noseyparker-cli/src/cmd_scan.rs
+++ b/crates/noseyparker-cli/src/cmd_scan.rs
@@ -2,6 +2,7 @@
 use anyhow::{bail, Context, Result};
 use indicatif::{HumanBytes, HumanCount, HumanDuration};
 use rayon::prelude::*;
 use std::path::{Path, PathBuf};
+use std::sync::Arc;
 use std::sync::Mutex;
 use std::time::{Duration, Instant};
 use tracing::{debug, debug_span, error, info, trace, warn};
@@ -100,7 +101,6 @@ impl ParallelIterator for EnumeratorFileIter {
         use std::io::BufRead;
         (1usize..)
             .zip(self.reader.lines())
-            .into_iter()
             .filter_map(|(line_num, line)| line.map(|line| (line_num, line)).ok())
             .par_bridge()
             .map(|(line_num, line)| {
@@ -168,6 +168,7 @@ impl ParallelIterator for GitRepoResultIter {
         C: rayon::iter::plumbing::UnindexedConsumer<Self::Item>,
     {
         let repo = self.inner.repository.into_sync();
+        let repo_path = Arc::new(self.inner.path.clone());
         self.inner
             .blobs
             .into_par_iter()
@@ -189,33 +190,28 @@ impl ParallelIterator for GitRepoResultIter {
                 || repo.to_thread_local(),
                 |repo, md| -> Result<(ProvenanceSet, Blob)> {
                     let blob_id = md.blob_oid;
-                    let repo_path = &self.inner.path;
-
-                    let blob = {
-                        let mut blob = repo.find_object(blob_id).with_context(|| {
-                            format!(
-                                "Failed to read blob {blob_id} from Git repository at {}",
-                                repo_path.display(),
-                            )
-                        })?;
+                    let blob = || -> Result<Blob> {
+                        let mut blob = repo.find_object(blob_id)?.try_into_blob()?;
                         let data = std::mem::take(&mut blob.data); // avoid a copy
-                        Blob::new(BlobId::from(&blob_id), data)
-                    };
-
-                    let provenance = ProvenanceSet::try_from_iter(md.first_seen.iter().map(|e| {
-                        let commit_metadata = self
-                            .inner
-                            .commit_metadata
-                            .get(&e.commit_oid)
-                            .expect("should have commit metadata");
-                        Provenance::from_git_repo_with_first_commit(
-                            repo_path.clone(),
-                            commit_metadata.clone(),
-                            e.path.clone(),
+                        Ok(Blob::new(BlobId::from(&blob_id), data))
+                    }()
+                    .with_context(|| {
+                        format!(
+                            "Failed to read blob {blob_id} from Git repository at {}",
+                            repo_path.display(),
                         )
-                    }))
-                    .unwrap_or_else(|| Provenance::from_git_repo(repo_path.clone()).into());
+                    })?;
+
+                    let provenance =
+                        ProvenanceSet::try_from_iter(md.first_seen.into_iter().map(|e| {
+                            Provenance::from_git_repo_with_first_commit(
+                                repo_path.clone(),
+                                e.commit_metadata,
+                                e.path,
+                            )
+                        }))
+                        .unwrap_or_else(|| Provenance::from_git_repo(repo_path.clone()).into());

                     Ok((provenance, blob))
                 },
@@ -374,7 +370,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()>
     // ---------------------------------------------------------------------------------------------
     let repo_urls = {
         let mut repo_urls = args.input_specifier_args.git_url.clone();
-        repo_urls.extend(enumerate_github_repos(&global_args, &args)?);
+        repo_urls.extend(enumerate_github_repos(global_args, args)?);
         repo_urls.sort();
         repo_urls.dedup();
         repo_urls
@@ -386,7 +382,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()>
     let input_roots = {
         let mut input_roots = args.input_specifier_args.path_inputs.clone();
         if !repo_urls.is_empty() {
-            input_roots.extend(clone_git_repo_urls(&global_args, &args, &datastore, repo_urls)?);
+            input_roots.extend(clone_git_repo_urls(global_args, args, &datastore, repo_urls)?);
         }
         input_roots.sort();
         input_roots.dedup();
@@ -483,7 +479,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()>
     let guesser = Guesser::new().expect("should be able to create filetype guesser");
     {
         let mut init_time = blob_processor_init_time.lock().unwrap();
-        *init_time = *init_time + t1.elapsed();
+        *init_time += t1.elapsed();
     }

     BlobProcessor {
diff --git a/crates/noseyparker/Cargo.toml b/crates/noseyparker/Cargo.toml
index 29a0aa8e1..1e9763e5b 100644
--- a/crates/noseyparker/Cargo.toml
+++ b/crates/noseyparker/Cargo.toml
@@ -49,7 +49,7 @@
 rusqlite = { version = "0.32", features = ["bundled", "backup", "serde_json"] }
 schemars = { version = "0.8", features = ["smallvec"] }
 secrecy = { version = "0.8.0", optional = true }
 smallvec = { version = "1", features = ["const_generics", "const_new", "union"] }
-serde = { version = "1.0", features = ["derive"] }
+serde = { version = "1.0", features = ["derive", "rc"] }
 serde_json = { version = "1.0" }
 thiserror = "1"
 tokio = { version = "1.23", optional = true }
diff --git a/crates/noseyparker/src/provenance.rs b/crates/noseyparker/src/provenance.rs
index 6bf80cdce..2396a0059 100644
--- a/crates/noseyparker/src/provenance.rs
+++ b/crates/noseyparker/src/provenance.rs
@@ -4,6 +4,7 @@ use input_enumerator::git_commit_metadata::CommitMetadata;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
+use std::sync::Arc;

 // -------------------------------------------------------------------------------------------------
 // Provenance
@@ -28,7 +29,7 @@ impl Provenance {
     /// commit provenance.
     ///
     /// See also `from_git_repo_with_first_commit`.
-    pub fn from_git_repo(repo_path: PathBuf) -> Self {
+    pub fn from_git_repo(repo_path: Arc<PathBuf>) -> Self {
         Provenance::GitRepo(GitRepoProvenance {
             repo_path,
             first_commit: None,
@@ -40,8 +41,8 @@ impl Provenance {
     ///
     /// See also `from_git_repo`.
     pub fn from_git_repo_with_first_commit(
-        repo_path: PathBuf,
-        commit_metadata: CommitMetadata,
+        repo_path: Arc<PathBuf>,
+        commit_metadata: Arc<CommitMetadata>,
         blob_path: BString,
     ) -> Self {
         let first_commit = Some(CommitProvenance {
@@ -109,7 +110,7 @@ pub struct FileProvenance {
 /// Indicates that a blob was seen in a Git repo, optionally with particular commit provenance info
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
 pub struct GitRepoProvenance {
-    pub repo_path: PathBuf,
+    pub repo_path: Arc<PathBuf>,
     pub first_commit: Option<CommitProvenance>,
 }

@@ -119,7 +120,7 @@ pub struct GitRepoProvenance {
 /// How was a particular Git commit encountered?
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
 pub struct CommitProvenance {
-    pub commit_metadata: CommitMetadata,
+    pub commit_metadata: Arc<CommitMetadata>,

     #[serde(with = "BStringLossyUtf8")]
     pub blob_path: BString,
diff --git a/crates/noseyparker/src/provenance_set.rs b/crates/noseyparker/src/provenance_set.rs
index 47ee4cfe5..5234a41da 100644
--- a/crates/noseyparker/src/provenance_set.rs
+++ b/crates/noseyparker/src/provenance_set.rs
@@ -2,6 +2,7 @@
 use schemars::JsonSchema;
 use serde::ser::SerializeSeq;
 use std::collections::HashSet;
 use std::path::PathBuf;
+use std::sync::Arc;

 use crate::provenance::Provenance;

@@ -52,7 +53,7 @@ impl ProvenanceSet {
     /// Create a new `ProvenanceSet` from the given items, filtering out redundant less-specific
     /// `Provenance` records.
     pub fn new(provenance: Provenance, more_provenance: Vec<Provenance>) -> Self {
-        let mut git_repos_with_detailed: HashSet<PathBuf> = HashSet::new();
+        let mut git_repos_with_detailed: HashSet<Arc<PathBuf>> = HashSet::new();

         for p in std::iter::once(&provenance).chain(&more_provenance) {
             if let Provenance::GitRepo(e) = p {
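To see why swapping `CommitMetadata` for `Arc<CommitMetadata>` pays off at the scale the commit message cites: every `BlobAppearance` and `CommitProvenance` previously embedded a full copy of the commit metadata (names, emails, message), while an `Arc` clone is just a reference-count bump on one shared allocation. A minimal sketch of the sharing pattern, using simplified stand-in types rather than the crate's definitions:

```rust
use std::sync::Arc;

#[derive(Debug)]
struct CommitMetadata {
    committer_name: String,
    message: String,
}

#[derive(Debug)]
struct BlobAppearance {
    commit_metadata: Arc<CommitMetadata>, // shared, not copied
    path: String,
}

fn main() {
    let md = Arc::new(CommitMetadata {
        committer_name: "Jane Doe".into(),
        message: "huge commit touching many files".into(),
    });
    // 100_000 appearances share a single CommitMetadata allocation;
    // each clone only bumps the reference count.
    let appearances: Vec<BlobAppearance> = (0..100_000)
        .map(|i| BlobAppearance {
            commit_metadata: Arc::clone(&md),
            path: format!("src/file_{i}.rs"),
        })
        .collect();
    assert_eq!(Arc::strong_count(&md), appearances.len() + 1);
}
```

The `rc` feature added to the `serde` dependency in the Cargo.toml hunk above is what keeps these `Arc`-wrapped fields deriving `Serialize`/`Deserialize`: with it, serde serializes through the `Arc` to the inner value rather than rejecting the pointer type.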