diff --git a/src/lib.rs b/src/lib.rs index 4b7bbd2..264cfda 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -106,7 +106,7 @@ //! - `--min-count`. Specify a minimum number of appearances a k-mer must have //! to be included. This is an effective way of filtering sequencing errors if set //! to at least three, but higher may be appropriate for higher coverage data. -//! A two-step blocked bloom and countmin filter is used for memory efficiency. +//! A two-step blocked bloom and hash table filter is used for memory efficiency. //! - `--qual-filter`. `none` do not filter based on quality scores. //! `middle` (default) filter k-mers where the middle base is below the minimum quality. //! `strict` filter k-mers where any base is below the minimum quality. diff --git a/src/ska_dict.rs b/src/ska_dict.rs index f57ea94..961876e 100644 --- a/src/ska_dict.rs +++ b/src/ska_dict.rs @@ -29,8 +29,8 @@ use crate::ska_dict::split_kmer::SplitKmer; pub mod bit_encoding; use crate::ska_dict::bit_encoding::{decode_base, UInt, IUPAC}; -pub mod count_min_filter; -use crate::ska_dict::count_min_filter::KmerFilter; +pub mod bloom_filter; +use crate::ska_dict::bloom_filter::KmerFilter; pub mod nthash; @@ -47,7 +47,7 @@ pub struct SkaDict { name: String, /// Split k-mer dictionary split-k:middle-base split_kmers: HashMap, - /// A countmin filter for counting from fastq files + /// A bloom filter for counting from fastq files kmer_filter: KmerFilter, } @@ -198,7 +198,7 @@ where sample_idx, name: name.to_string(), split_kmers: HashMap::default(), - kmer_filter: KmerFilter::empty(qual.min_count), + kmer_filter: KmerFilter::new(qual.min_count), }; // Check if we're working with reads, and initalise the CM filter if so diff --git a/src/ska_dict/count_min_filter.rs b/src/ska_dict/bloom_filter.rs similarity index 56% rename from src/ska_dict/count_min_filter.rs rename to src/ska_dict/bloom_filter.rs index e231251..cd3a64b 100644 --- a/src/ska_dict/count_min_filter.rs +++ b/src/ska_dict/bloom_filter.rs @@ -1,15 +1,14 @@ -//! A Countmin filter for counting k-mers from reads +//! A Bloom filter for counting k-mers from reads //! //! This is to support a minimum count from FASTQ files, for basic error //! correction, while keeping memory use low and predictable. Use the //! [`KmerFilter`] struct. -//! -//! See for more -//! details on this data structure. use std::borrow::BorrowMut; use std::cmp::Ordering; +use hashbrown::HashMap; + use super::bit_encoding::UInt; use super::split_kmer::SplitKmer; @@ -17,41 +16,29 @@ use super::split_kmer::SplitKmer; /// /// 2^27 =~ 130M const BLOOM_WIDTH: usize = 1 << 27; -/// Number of bits to use in each bloom block +/// Number of bits to use in each bloom block (~1% FPR) const BITS_PER_ENTRY: usize = 12; -/// Default number of countmin hashes/table height (controls false positive rate) -const CM_HEIGHT: usize = 3; -/// Default countmin filter width (expected number of k-mers > two occurences) -/// -/// 2^27 =~ 130M -const CM_WIDTH: usize = 1 << 24; -/// A Countmin table of specified width and height, counts input k-mers, returns -/// whether they have passed a count threshold. +/// A filter which counts input k-mers, returns whether they have passed a count threshold. /// -/// Also uses a blocked bloom filter as a first pass to remove singletons. +/// Uses a blocked bloom filter as a first pass to remove singletons. /// Code for blocked bloom filter based on: /// /// /// This has the advantage of using less memory than a larger countmin filter, /// being a bit faster (bloom is ~3x faster than countmin, but having count also /// allows entry to dictionary to be only checked once for each passing k-mer) +/// +/// Once passed through the bloom filter, a HashMap is used for counts >=2. +/// This filter therefore has no false-negatives and negligible false-negatives #[derive(Debug, Clone, Default)] pub struct KmerFilter { /// Size of the bloom filter buf_size: u64, /// Buffer for the bloom filter buffer: Vec, - /// Table width (estimated number of unique k-mers) - width: usize, - /// Number of bits to shift masked value to get table entry - width_shift: u32, - /// Table height (number of hashes) - height: usize, - /// Mask to convert hash into table column - mask: u64, /// Table of counts - counts: Vec, + counts: HashMap, /// Minimum count to pass filter min_count: u16, } @@ -98,41 +85,27 @@ impl KmerFilter { } } - /// Creates a new countmin table of specified size, and pass threshold + /// Creates a new filter with given threshold /// /// Note: - /// - Must call [`KmerFilter::init()`] before using. - /// - Width will be rounded down to the closest power of two /// - Maximum count is [`u16::MAX`] i.e. 65535 - pub fn empty(min_count: u16) -> Self { - // Consistent with consts used, but ensures a power of two - let width_bits: usize = f64::floor(f64::log2(CM_WIDTH as f64)) as usize; - let width = 1 << (width_bits + 1); - // Use MSB rather than LSB - let width_shift = u64::BITS - width_bits as u32; - let mask = (width as u64 - 1) << width_shift; - + /// - Must call [`KmerFilter::init()`] before using. + pub fn new(min_count: u16) -> Self { + let buf_size = + f64::round(BLOOM_WIDTH as f64 * (BITS_PER_ENTRY as f64 / 8.0) / (u64::BITS as f64)) + as u64; Self { - buf_size: f64::round( - BLOOM_WIDTH as f64 * (BITS_PER_ENTRY as f64 / 8.0) / (u64::BITS as f64), - ) as u64, + buf_size, buffer: Vec::new(), - width, - width_shift, - height: CM_HEIGHT, - mask, - counts: Vec::new(), + counts: HashMap::new(), min_count, } } /// Initialises table so it is ready for use. /// - /// Allocates memory and sets hash functions. + /// Allocates memory for bloom filter (which can be avoided with FASTA input) pub fn init(&mut self) { - if self.counts.is_empty() { - self.counts = vec![0; self.width * self.height]; - } if self.buffer.is_empty() { self.buffer.resize(self.buf_size as usize, 0); } @@ -148,28 +121,25 @@ impl KmerFilter { 0 | 1 => Ordering::Equal, // Just the bloom filter 2 => { - if self.bloom_add_and_check(kmer.get_hash(0)) { + if self.bloom_add_and_check(kmer.get_hash()) { Ordering::Equal } else { Ordering::Less } } - // Bloom filter then countmin + // Bloom filter then hash table _ => { - if self.bloom_add_and_check(kmer.get_hash(0)) { - let mut count = 0; - for row_idx in 0..self.height { - let hash_val = kmer.get_hash(row_idx); - let table_idx = row_idx * self.width - + (((hash_val & self.mask) >> self.width_shift) as usize); - self.counts[table_idx] = self.counts[table_idx].saturating_add(1); - if row_idx == 0 { - count = self.counts[table_idx]; - } else { - count = u16::min(count, self.counts[table_idx]); - } - } - (count + 1).cmp(&self.min_count) + let kmer_hash = kmer.get_hash(); + if self.bloom_add_and_check(kmer_hash) { + let mut count: u16 = 2; + self.counts + .entry(kmer_hash) + .and_modify(|curr_cnt| { + count = curr_cnt.saturating_add(1); + *curr_cnt = count + }) + .or_insert(count); + self.min_count.cmp(&count) } else { Ordering::Less } diff --git a/src/ska_dict/nthash.rs b/src/ska_dict/nthash.rs index b2a6d0a..7206a0f 100644 --- a/src/ska_dict/nthash.rs +++ b/src/ska_dict/nthash.rs @@ -21,8 +21,6 @@ const RC_HASH_LOOKUP: [u64; 4] = [ 0x3c8b_fbb3_95c6_0474, 0x3193_c185_62a0_2b4c, ]; -const MULTISHIFT: i32 = 27; -const MULTISEED: u64 = 0x90b4_5d39_fb6d_a1fa; /// Stores forward and (optionally) reverse complement hashes of k-mers in a nucleotide sequence #[derive(Debug)] @@ -76,17 +74,4 @@ impl NtHashIterator { self.fh } } - - /// Generates more hashes of the sequence. - /// - /// This only operates on the hash value, not the full sequence, so won't resolve - /// a full hash collision, but is suitable for use in the countmin table where - /// most of the hash is masked off. - pub fn extra_hash(&self, extra_idx: usize) -> u64 { - let mut new_hash = self - .curr_hash() - .wrapping_mul(MULTISEED.wrapping_mul((extra_idx ^ self.k) as u64)); - new_hash ^= new_hash >> MULTISHIFT; - new_hash - } } diff --git a/src/ska_dict/split_kmer.rs b/src/ska_dict/split_kmer.rs index 432f82d..df0e691 100644 --- a/src/ska_dict/split_kmer.rs +++ b/src/ska_dict/split_kmer.rs @@ -296,23 +296,14 @@ impl<'a, IntT: for<'b> UInt<'b>> SplitKmer<'a, IntT> { /// Get a `u64` hash of the current k-mer using [`NtHashIterator`] /// - /// Can get alternative hashes for a countmin table by giving an index - /// /// # Panics /// /// If called after creating with `is_reads = false` - pub fn get_hash(&self, hash_idx: usize) -> u64 { - if hash_idx == 0 { - self.hash_gen - .as_ref() - .expect("Trying to get unitialised hash") - .curr_hash() - } else { - self.hash_gen - .as_ref() - .expect("Trying to get unitialised hash") - .extra_hash(hash_idx) - } + pub fn get_hash(&self) -> u64 { + self.hash_gen + .as_ref() + .expect("Trying to get unitialised hash") + .curr_hash() } /// Get the next split k-mer in the sequence