Reduce memory and CPU use when scanning (#222)
Rework Git metadata calculation to reduce peak memory use and wall clock time when scanning. This includes many changes. The net effect is a typical 30% speedup and 50% memory reduction when scanning Git repositories; in pathological cases, up to a 5x speedup and 20x memory reduction.

- Git metadata graph:
  - Do not construct in-memory graph of all trees and blob names; instead, read tree objects from repo as late as possible
  - Use `SmallVec` to reduce heap fragmentation and small heap allocations
  - Use more suitable initial size for worklists and scratch buffers to reduce reallocations and heap fragmentation
  - Use the fastest, slimmest ordering for iterating object headers from a Git repository when initially counting objects
  - Eliminate redundant intermediate data structures; remove unused fields from remaining intermediate data structures
  - Avoid temporary allocations when concatenating blob filenames
  - Fix a longstanding bug where a blob introduced multiple times within a single commit would have only a single arbitrary pathname reported
 
- `BStringTable`:
  - change default initialization to create an empty table
  - change `get_or_intern` to avoid heap-allocated temporaries when an entry already exists

- Scanning:
  - Use an `Arc<CommitMetadata>` instead of `CommitMetadata` and an `Arc<PathBuf>` instead of `PathBuf` within git blob provenance entries (allows sharing; in some cases reduces memory use of these object types by up to 10,000x; see the sketch after this list)
  - Use a higher default level of parallelism: require 3 GiB of RAM per parallel job instead of 4 GiB
  - Open Git repositories a single time instead of twice

- Fix clippy nits
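
As a sketch of the `Arc<CommitMetadata>` change above (hypothetical field names, not the actual noseyparker structs): each provenance entry holds a pointer to shared commit metadata rather than its own deep copy, so a commit that introduces thousands of blobs allocates its metadata once.

```rust
use std::sync::Arc;

// Hypothetical stand-in for the real CommitMetadata struct.
#[derive(Debug)]
struct CommitMetadata {
    committer_name: String,
    committer_email: String,
    message: String, // commit messages can be kilobytes long
}

#[derive(Debug)]
struct BlobAppearance {
    commit_metadata: Arc<CommitMetadata>, // shared, not deep-copied per blob
    path: String,
}

fn main() {
    let md = Arc::new(CommitMetadata {
        committer_name: "Jane Doe".into(),
        committer_email: "jane@example.com".into(),
        message: "A very long commit message...".into(),
    });

    // A commit touching 10,000 blobs shares one metadata allocation;
    // each `Arc::clone` is a pointer copy plus a refcount bump.
    let appearances: Vec<BlobAppearance> = (0..10_000)
        .map(|i| BlobAppearance {
            commit_metadata: Arc::clone(&md),
            path: format!("src/file_{i}.rs"),
        })
        .collect();

    assert_eq!(Arc::strong_count(&md), 10_001);
    assert!(Arc::ptr_eq(&md, &appearances[0].commit_metadata));
    println!(
        "{} appearances share metadata for {} <{}> ({} byte message)",
        appearances.len(),
        md.committer_name,
        md.committer_email,
        md.message.len()
    );
}
```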
bradlarsen authored Sep 27, 2024
1 parent 0518d80 commit cd6c187
Showing 12 changed files with 622 additions and 554 deletions.
15 changes: 13 additions & 2 deletions CHANGELOG.md
@@ -21,7 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

### Changes
- Inputs are now enumerated incrementally as scanning proceeds rather than done in an initial batch step ([#216](https://github.com/praetorian-inc/noseyparker/pull/216)).
This reduces peak memory use and CPU time 10-20%, particularly in environments with slow I/O.
This reduces peak memory use and wall clock time 10-20%, particularly in environments with slow I/O.
A consequence of this change is that the total amount of data to scan is not known until it has actually been scanned, and so the scanning progress bar no longer shows a completion percentage.
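
As an illustration of the incremental pattern described above (a generic sketch, not the actual implementation): the enumerator streams inputs over a channel while the scanner consumes them, which is also why no completion percentage is available.

```rust
use std::path::PathBuf;
use std::sync::mpsc;
use std::thread;

fn main() {
    let (tx, rx) = mpsc::channel::<PathBuf>();

    // Enumerator: discovers inputs and streams them out immediately.
    let enumerator = thread::spawn(move || {
        for i in 0..5 {
            tx.send(PathBuf::from(format!("input_{i}.txt"))).unwrap();
        }
        // Dropping `tx` closes the channel; only now is the total known.
    });

    // Scanner: starts working right away, without a known total, so a
    // progress bar here can show counts but not a completion percentage.
    for path in rx {
        println!("scanning {}", path.display());
    }
    enumerator.join().unwrap();
}
```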

- When cloning Git repositories while scanning, the progress bar now includes the current repository URL ([#212](https://github.com/praetorian-inc/noseyparker/pull/212)).
@@ -38,11 +38,22 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

- Bitbucket App Password ([#219](https://github.com/praetorian-inc/noseyparker/pull/219) from @gemesa)

### Fixes
- The default number of parallel scanner jobs is now higher on many systems ([#222](https://github.com/praetorian-inc/noseyparker/pull/222)).
This value is determined in part by the amount of system RAM;
due to several memory use improvements, the required minimum RAM per job has been reduced, allowing for more parallelism.
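
A sketch of the kind of heuristic this describes (the 3 GiB figure is from this entry; total RAM is left as a hypothetical parameter, since querying it requires a platform-specific call or a crate such as `sysinfo`):

```rust
use std::thread;

const RAM_PER_JOB: u64 = 3 * 1024 * 1024 * 1024; // 3 GiB per parallel job

/// Default parallelism: the smaller of the CPU count and the RAM budget.
fn default_scanner_jobs(total_ram_bytes: u64) -> usize {
    let cpu_jobs = thread::available_parallelism().map_or(1, |n| n.get());
    let ram_jobs = (total_ram_bytes / RAM_PER_JOB).max(1) as usize;
    cpu_jobs.min(ram_jobs)
}

fn main() {
    // With 32 GiB of RAM, the memory budget alone allows 10 jobs.
    println!("{}", default_scanner_jobs(32 * 1024 * 1024 * 1024));
}
```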

### Fixes
- The `Google OAuth Credentials` rule has been revised to avoid runtime errors about an empty capture group.

- The `AWS Secret Access Key` rule has been revised to avoid runtime `Regex failed to match` errors.

- The code that determines first-commit provenance information for blobs from Git repositories has been reworked to improve memory use ([#222](https://github.com/praetorian-inc/noseyparker/pull/222)).
In typical cases of scanning Git repositories, this reduces both peak memory use and wall clock time by 20-50%.
In certain pathological cases, such as [Homebrew](https://github.com/homebrew/homebrew-core) or [nixpkgs](https://github.com/NixOS/nixpkgs), the new implementation uses up to 20x less peak memory and up to 5x less wall clock time.
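
A sketch of the "read trees as late as possible" idea behind this rework (hypothetical types; a `HashMap` stands in for the on-disk Git object database): trees are fetched only when the traversal pops them from the worklist, instead of materializing the whole tree graph up front.

```rust
use std::collections::HashMap;

type Oid = u32;

enum Entry {
    Tree(Oid),
    Blob(Oid, &'static str), // (blob id, filename)
}

/// Walk from a root tree, fetching each tree's entries only on demand.
fn walk_blobs(root: Oid, odb: &HashMap<Oid, Vec<Entry>>) -> Vec<(Oid, &'static str)> {
    let mut blobs = Vec::new();
    let mut worklist = vec![root];
    while let Some(tree) = worklist.pop() {
        // The tree object is read here, as late as possible, rather than
        // during an up-front pass that loads every tree into memory.
        for entry in &odb[&tree] {
            match entry {
                Entry::Tree(oid) => worklist.push(*oid),
                Entry::Blob(oid, name) => blobs.push((*oid, *name)),
            }
        }
    }
    blobs
}

fn main() {
    let odb = HashMap::from([
        (0, vec![Entry::Tree(1), Entry::Blob(10, "README.md")]),
        (1, vec![Entry::Blob(11, "main.rs")]),
    ]);
    for (oid, name) in walk_blobs(0, &odb) {
        println!("blob {oid}: {name}");
    }
}
```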

- When determining blob provenance information from Git repositories, blobs that first appear multiple times within a single commit will now be reported with _all_ names they appear with ([#222](https://github.com/praetorian-inc/noseyparker/pull/222)).
Previously, one of the pathnames would be arbitrarily selected.
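
A sketch of the fix (hypothetical types): within one commit, every (blob, path) pair is recorded, rather than keeping whichever path the traversal reached first.

```rust
use std::collections::HashMap;

type BlobId = u64;

fn main() {
    // Pairs introduced by a single commit; blob 1 appears under two paths.
    let introduced: [(BlobId, &str); 3] =
        [(1, "LICENSE"), (1, "docs/LICENSE.txt"), (2, "README.md")];

    let mut names: HashMap<BlobId, Vec<&str>> = HashMap::new();
    for (blob, path) in introduced {
        // Old behavior: the first path won and the rest were dropped.
        // New behavior: keep every path the blob appears under.
        names.entry(blob).or_default().push(path);
    }

    assert_eq!(names[&1], ["LICENSE", "docs/LICENSE.txt"]);
    assert_eq!(names[&2], ["README.md"]);
}
```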


## [v0.19.0](https://github.com/praetorian-inc/noseyparker/releases/v0.19.0) (2024-07-30)

8 changes: 4 additions & 4 deletions crates/input-enumerator/src/blob_appearance.rs
@@ -1,13 +1,13 @@
use crate::git_commit_metadata::CommitMetadata;
use bstr::{BString, ByteSlice};
use gix::ObjectId;
use smallvec::SmallVec;
use std::path::Path;
use std::sync::Arc;

/// Where was a particular blob seen?
#[derive(Clone, Debug, serde::Serialize)]
pub struct BlobAppearance {
    /// The commit ID
    pub commit_oid: ObjectId,
    pub commit_metadata: Arc<CommitMetadata>,

    /// The path given to the blob
    pub path: BString,
@@ -21,4 +21,4 @@ impl BlobAppearance {
}

/// A set of `BlobAppearance` entries
pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 1]>;
pub type BlobAppearanceSet = SmallVec<[BlobAppearance; 2]>;
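
As a note on the inline capacity bump above: `SmallVec` stores up to N elements inline, so `[BlobAppearance; 2]` keeps the common one-or-two-appearance case off the heap. A sketch (requires the `smallvec` crate):

```rust
use smallvec::SmallVec;

fn main() {
    let mut set: SmallVec<[&str; 2]> = SmallVec::new();
    set.push("src/main.rs");
    set.push("src/lib.rs");
    assert!(!set.spilled()); // two entries fit inline: no heap allocation

    set.push("README.md");
    assert!(set.spilled()); // a third entry spills to the heap
}
```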
36 changes: 23 additions & 13 deletions crates/input-enumerator/src/bstring_table.rs
@@ -1,15 +1,20 @@
use bstr::{BStr, BString};
use std::collections::{hash_map::Entry, HashMap};
use std::collections::HashMap;

#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Symbol<T> {
    i: T,
    j: T,
}

#[allow(clippy::len_without_is_empty)]
pub trait SymbolType: Copy + PartialEq + Eq + std::hash::Hash {
    fn to_range(self) -> std::ops::Range<usize>;
    fn from_range(r: std::ops::Range<usize>) -> Self;

    fn len(&self) -> usize {
        self.to_range().len()
    }
}

impl SymbolType for Symbol<usize> {
@@ -49,7 +54,10 @@ pub struct BStringTable<S = Symbol<u32>> {

impl<S: SymbolType> BStringTable<S> {
    pub fn new() -> Self {
        Self::with_capacity(32 * 1024, 1024 * 1024)
        Self {
            storage: Default::default(),
            mapping: Default::default(),
        }
    }

    pub fn with_capacity(num_symbols: usize, total_bytes: usize) -> Self {
@@ -60,15 +68,17 @@ impl<S: SymbolType> BStringTable<S> {
    }

    #[inline]
    pub fn get_or_intern(&mut self, s: BString) -> S {
        match self.mapping.entry(s) {
            Entry::Occupied(e) => *e.get(),
            Entry::Vacant(e) => {
                let s = e.key();
    pub fn get_or_intern(&mut self, s: &BStr) -> S {
        match self.mapping.get(s) {
            Some(s) => *s,
            None => {
                let s = s.to_owned();
                let i = self.storage.len();
                let j = i + s.len();
                self.storage.extend(s.as_slice());
                *e.insert(S::from_range(i..j))
                let k = S::from_range(i..j);
                self.mapping.insert(s, k);
                k
            }
        }
    }
@@ -91,20 +101,20 @@ mod tests {
        let s1 = BStr::new("Hello");
        let s2 = BStr::new("World");

        let sym1 = t.get_or_intern(s1.to_owned());
        let sym1a = t.get_or_intern(s1.to_owned());
        let sym1 = t.get_or_intern(s1);
        let sym1a = t.get_or_intern(s1);
        assert_eq!(sym1, sym1a);

        let sym2 = t.get_or_intern(s2.to_owned());
        let sym2a = t.get_or_intern(s2.to_owned());
        let sym2 = t.get_or_intern(s2);
        let sym2a = t.get_or_intern(s2);
        assert_eq!(sym2, sym2a);

        assert_ne!(sym1, sym2);

        assert_eq!(s1, t.resolve(sym1));
        assert_eq!(s2, t.resolve(sym2));

        let sym1b = t.get_or_intern(s1.to_owned());
        let sym1b = t.get_or_intern(s1);
        assert_eq!(sym1, sym1b);
    }
}
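
A simplified sketch of the interning scheme above, with `Vec<u8>` and `(usize, usize)` standing in for `BString` and `Symbol`: all strings live in one contiguous buffer, symbols are ranges into it, and after this change a lookup that hits the table performs no allocation at all.

```rust
use std::collections::HashMap;

struct Interner {
    storage: Vec<u8>,                          // all interned bytes, concatenated
    mapping: HashMap<Vec<u8>, (usize, usize)>, // bytes -> range in `storage`
}

impl Interner {
    fn new() -> Self {
        Self { storage: Vec::new(), mapping: HashMap::new() }
    }

    /// Taking `&[u8]` (like the new `&BStr` signature) means a hit
    /// allocates nothing; an owned copy is made only on a miss.
    fn get_or_intern(&mut self, s: &[u8]) -> (usize, usize) {
        match self.mapping.get(s) {
            Some(&sym) => sym,
            None => {
                let i = self.storage.len();
                let j = i + s.len();
                self.storage.extend_from_slice(s);
                self.mapping.insert(s.to_vec(), (i, j));
                (i, j)
            }
        }
    }

    fn resolve(&self, (i, j): (usize, usize)) -> &[u8] {
        &self.storage[i..j]
    }
}

fn main() {
    let mut t = Interner::new();
    let a = t.get_or_intern(b"refs/heads/main");
    let b = t.get_or_intern(b"refs/heads/main"); // hit: no allocation
    assert_eq!(a, b);
    assert_eq!(t.resolve(a), b"refs/heads/main");
}
```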