Merge pull request #46 from bacpop/0.3.1-candidate

Restore filter accuracy for fastq input
bacpop · Jul 10, 2023 · 746bb6a · 746bb6a
2 parents 5d6c1dc + 54efe80
commit 746bb6a
Show file tree

Hide file tree

Showing 5 changed files with 42 additions and 96 deletions.
diff --git a/src/lib.rs b/src/lib.rs
@@ -106,7 +106,7 @@
 //! - `--min-count`. Specify a minimum number of appearances a k-mer must have
 //! to be included. This is an effective way of filtering sequencing errors if set
 //! to at least three, but higher may be appropriate for higher coverage data.
-//! A two-step blocked bloom and countmin filter is used for memory efficiency.
+//! A two-step blocked bloom and hash table filter is used for memory efficiency.
 //! - `--qual-filter`. `none` do not filter based on quality scores.
 //! `middle` (default) filter k-mers where the middle base is below the minimum quality.
 //! `strict` filter k-mers where any base is below the minimum quality.

diff --git a/src/ska_dict.rs b/src/ska_dict.rs
@@ -29,8 +29,8 @@ use crate::ska_dict::split_kmer::SplitKmer;
 pub mod bit_encoding;
 use crate::ska_dict::bit_encoding::{decode_base, UInt, IUPAC};
 
-pub mod count_min_filter;
-use crate::ska_dict::count_min_filter::KmerFilter;
+pub mod bloom_filter;
+use crate::ska_dict::bloom_filter::KmerFilter;
 
 pub mod nthash;
 
@@ -47,7 +47,7 @@ pub struct SkaDict<IntT> {
     name: String,
     /// Split k-mer dictionary split-k:middle-base
     split_kmers: HashMap<IntT, u8>,
-    /// A countmin filter for counting from fastq files
+    /// A bloom filter for counting from fastq files
     kmer_filter: KmerFilter,
 }
 
@@ -198,7 +198,7 @@ where
             sample_idx,
             name: name.to_string(),
             split_kmers: HashMap::default(),
-            kmer_filter: KmerFilter::empty(qual.min_count),
+            kmer_filter: KmerFilter::new(qual.min_count),
         };
 
         // Check if we're working with reads, and initalise the CM filter if so

diff --git a/src/ska_dict/count_min_filter.rs → src/ska_dict/bloom_filter.rs b/src/ska_dict/count_min_filter.rs → src/ska_dict/bloom_filter.rs
@@ -1,57 +1,44 @@
-//! A Countmin filter for counting k-mers from reads
+//! A Bloom filter for counting k-mers from reads
 //!
 //! This is to support a minimum count from FASTQ files, for basic error
 //! correction, while keeping memory use low and predictable. Use the
 //! [`KmerFilter`] struct.
-//!
-//! See <https://en.wikipedia.org/wiki/Count-min_sketch> for more
-//! details on this data structure.
 
 use std::borrow::BorrowMut;
 use std::cmp::Ordering;
 
+use hashbrown::HashMap;
+
 use super::bit_encoding::UInt;
 use super::split_kmer::SplitKmer;
 
 /// Default bloom filter width (expected number of k-mers)
 ///
 /// 2^27 =~ 130M
 const BLOOM_WIDTH: usize = 1 << 27;
-/// Number of bits to use in each bloom block
+/// Number of bits to use in each bloom block (~1% FPR)
 const BITS_PER_ENTRY: usize = 12;
-/// Default number of countmin hashes/table height (controls false positive rate)
-const CM_HEIGHT: usize = 3;
-/// Default countmin filter width (expected number of k-mers > two occurences)
-///
-/// 2^27 =~ 130M
-const CM_WIDTH: usize = 1 << 24;
 
-/// A Countmin table of specified width and height, counts input k-mers, returns
-/// whether they have passed a count threshold.
+/// A filter which counts input k-mers, returns whether they have passed a count threshold.
 ///
-/// Also uses a blocked bloom filter as a first pass to remove singletons.
+/// Uses a blocked bloom filter as a first pass to remove singletons.
 /// Code for blocked bloom filter based on:
 /// <https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/2021/10/02/wordbasedbloom.cpp>
 ///
 /// This has the advantage of using less memory than a larger countmin filter,
 /// being a bit faster (bloom is ~3x faster than countmin, but having count also
 /// allows entry to dictionary to be only checked once for each passing k-mer)
+///
+/// Once passed through the bloom filter, a HashMap is used for counts >=2.
+/// This filter therefore has no false-negatives and negligible false-negatives
 #[derive(Debug, Clone, Default)]
 pub struct KmerFilter {
     /// Size of the bloom filter
     buf_size: u64,
     /// Buffer for the bloom filter
     buffer: Vec<u64>,
-    /// Table width (estimated number of unique k-mers)
-    width: usize,
-    /// Number of bits to shift masked value to get table entry
-    width_shift: u32,
-    /// Table height (number of hashes)
-    height: usize,
-    /// Mask to convert hash into table column
-    mask: u64,
     /// Table of counts
-    counts: Vec<u16>,
+    counts: HashMap<u64, u16>,
     /// Minimum count to pass filter
     min_count: u16,
 }
@@ -98,41 +85,27 @@ impl KmerFilter {
         }
     }
 
-    /// Creates a new countmin table of specified size, and pass threshold
+    /// Creates a new filter with given threshold
     ///
     /// Note:
-    /// - Must call [`KmerFilter::init()`] before using.
-    /// - Width will be rounded down to the closest power of two
     /// - Maximum count is [`u16::MAX`] i.e. 65535
-    pub fn empty(min_count: u16) -> Self {
-        // Consistent with consts used, but ensures a power of two
-        let width_bits: usize = f64::floor(f64::log2(CM_WIDTH as f64)) as usize;
-        let width = 1 << (width_bits + 1);
-        // Use MSB rather than LSB
-        let width_shift = u64::BITS - width_bits as u32;
-        let mask = (width as u64 - 1) << width_shift;
-
+    /// - Must call [`KmerFilter::init()`] before using.
+    pub fn new(min_count: u16) -> Self {
+        let buf_size =
+            f64::round(BLOOM_WIDTH as f64 * (BITS_PER_ENTRY as f64 / 8.0) / (u64::BITS as f64))
+                as u64;
         Self {
-            buf_size: f64::round(
-                BLOOM_WIDTH as f64 * (BITS_PER_ENTRY as f64 / 8.0) / (u64::BITS as f64),
-            ) as u64,
+            buf_size,
             buffer: Vec::new(),
-            width,
-            width_shift,
-            height: CM_HEIGHT,
-            mask,
-            counts: Vec::new(),
+            counts: HashMap::new(),
             min_count,
         }
     }
 
     /// Initialises table so it is ready for use.
     ///
-    /// Allocates memory and sets hash functions.
+    /// Allocates memory for bloom filter (which can be avoided with FASTA input)
     pub fn init(&mut self) {
-        if self.counts.is_empty() {
-            self.counts = vec![0; self.width * self.height];
-        }
         if self.buffer.is_empty() {
             self.buffer.resize(self.buf_size as usize, 0);
         }
@@ -148,28 +121,25 @@ impl KmerFilter {
             0 | 1 => Ordering::Equal,
             // Just the bloom filter
             2 => {
-                if self.bloom_add_and_check(kmer.get_hash(0)) {
+                if self.bloom_add_and_check(kmer.get_hash()) {
                     Ordering::Equal
                 } else {
                     Ordering::Less
                 }
             }
-            // Bloom filter then countmin
+            // Bloom filter then hash table
             _ => {
-                if self.bloom_add_and_check(kmer.get_hash(0)) {
-                    let mut count = 0;
-                    for row_idx in 0..self.height {
-                        let hash_val = kmer.get_hash(row_idx);
-                        let table_idx = row_idx * self.width
-                            + (((hash_val & self.mask) >> self.width_shift) as usize);
-                        self.counts[table_idx] = self.counts[table_idx].saturating_add(1);
-                        if row_idx == 0 {
-                            count = self.counts[table_idx];
-                        } else {
-                            count = u16::min(count, self.counts[table_idx]);
-                        }
-                    }
-                    (count + 1).cmp(&self.min_count)
+                let kmer_hash = kmer.get_hash();
+                if self.bloom_add_and_check(kmer_hash) {
+                    let mut count: u16 = 2;
+                    self.counts
+                        .entry(kmer_hash)
+                        .and_modify(|curr_cnt| {
+                            count = curr_cnt.saturating_add(1);
+                            *curr_cnt = count
+                        })
+                        .or_insert(count);
+                    self.min_count.cmp(&count)
                 } else {
                     Ordering::Less
                 }

diff --git a/src/ska_dict/nthash.rs b/src/ska_dict/nthash.rs
@@ -21,8 +21,6 @@ const RC_HASH_LOOKUP: [u64; 4] = [
     0x3c8b_fbb3_95c6_0474,
     0x3193_c185_62a0_2b4c,
 ];
-const MULTISHIFT: i32 = 27;
-const MULTISEED: u64 = 0x90b4_5d39_fb6d_a1fa;
 
 /// Stores forward and (optionally) reverse complement hashes of k-mers in a nucleotide sequence
 #[derive(Debug)]
@@ -76,17 +74,4 @@ impl NtHashIterator {
             self.fh
         }
     }
-
-    /// Generates more hashes of the sequence.
-    ///
-    /// This only operates on the hash value, not the full sequence, so won't resolve
-    /// a full hash collision, but is suitable for use in the countmin table where
-    /// most of the hash is masked off.
-    pub fn extra_hash(&self, extra_idx: usize) -> u64 {
-        let mut new_hash = self
-            .curr_hash()
-            .wrapping_mul(MULTISEED.wrapping_mul((extra_idx ^ self.k) as u64));
-        new_hash ^= new_hash >> MULTISHIFT;
-        new_hash
-    }
 }
diff --git a/src/ska_dict/split_kmer.rs b/src/ska_dict/split_kmer.rs
@@ -296,23 +296,14 @@ impl<'a, IntT: for<'b> UInt<'b>> SplitKmer<'a, IntT> {
 
     /// Get a `u64` hash of the current k-mer using [`NtHashIterator`]
     ///
-    /// Can get alternative hashes for a countmin table by giving an index
-    ///
     /// # Panics
     ///
     /// If called after creating with `is_reads = false`
-    pub fn get_hash(&self, hash_idx: usize) -> u64 {
-        if hash_idx == 0 {
-            self.hash_gen
-                .as_ref()
-                .expect("Trying to get unitialised hash")
-                .curr_hash()
-        } else {
-            self.hash_gen
-                .as_ref()
-                .expect("Trying to get unitialised hash")
-                .extra_hash(hash_idx)
-        }
+    pub fn get_hash(&self) -> u64 {
+        self.hash_gen
+            .as_ref()
+            .expect("Trying to get unitialised hash")
+            .curr_hash()
     }
 
     /// Get the next split k-mer in the sequence