Skip to content

Commit

Permalink
Merge pull request #46 from bacpop/0.3.1-candidate
Browse files Browse the repository at this point in the history
Restore filter accuracy for fastq input
  • Loading branch information
johnlees authored Jul 10, 2023
2 parents 5d6c1dc + 54efe80 commit 746bb6a
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 96 deletions.
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@
//! - `--min-count`. Specify a minimum number of appearances a k-mer must have
//! to be included. This is an effective way of filtering sequencing errors if set
//! to at least three, but higher may be appropriate for higher coverage data.
//! A two-step blocked bloom and countmin filter is used for memory efficiency.
//! A two-step blocked bloom and hash table filter is used for memory efficiency.
//! - `--qual-filter`. `none` do not filter based on quality scores.
//! `middle` (default) filter k-mers where the middle base is below the minimum quality.
//! `strict` filter k-mers where any base is below the minimum quality.
Expand Down
8 changes: 4 additions & 4 deletions src/ska_dict.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ use crate::ska_dict::split_kmer::SplitKmer;
pub mod bit_encoding;
use crate::ska_dict::bit_encoding::{decode_base, UInt, IUPAC};

pub mod count_min_filter;
use crate::ska_dict::count_min_filter::KmerFilter;
pub mod bloom_filter;
use crate::ska_dict::bloom_filter::KmerFilter;

pub mod nthash;

Expand All @@ -47,7 +47,7 @@ pub struct SkaDict<IntT> {
name: String,
/// Split k-mer dictionary split-k:middle-base
split_kmers: HashMap<IntT, u8>,
/// A countmin filter for counting from fastq files
/// A bloom filter for counting from fastq files
kmer_filter: KmerFilter,
}

Expand Down Expand Up @@ -198,7 +198,7 @@ where
sample_idx,
name: name.to_string(),
split_kmers: HashMap::default(),
kmer_filter: KmerFilter::empty(qual.min_count),
kmer_filter: KmerFilter::new(qual.min_count),
};

// Check if we're working with reads, and initalise the CM filter if so
Expand Down
94 changes: 32 additions & 62 deletions src/ska_dict/count_min_filter.rs → src/ska_dict/bloom_filter.rs
Original file line number Diff line number Diff line change
@@ -1,57 +1,44 @@
//! A Countmin filter for counting k-mers from reads
//! A Bloom filter for counting k-mers from reads
//!
//! This is to support a minimum count from FASTQ files, for basic error
//! correction, while keeping memory use low and predictable. Use the
//! [`KmerFilter`] struct.
//!
//! See <https://en.wikipedia.org/wiki/Count-min_sketch> for more
//! details on this data structure.
use std::borrow::BorrowMut;
use std::cmp::Ordering;

use hashbrown::HashMap;

use super::bit_encoding::UInt;
use super::split_kmer::SplitKmer;

/// Default bloom filter width (expected number of k-mers)
///
/// 2^27 =~ 130M
const BLOOM_WIDTH: usize = 1 << 27;
/// Number of bits to use in each bloom block
/// Number of bits to use in each bloom block (~1% FPR)
const BITS_PER_ENTRY: usize = 12;
/// Default number of countmin hashes/table height (controls false positive rate)
const CM_HEIGHT: usize = 3;
/// Default countmin filter width (expected number of k-mers > two occurences)
///
/// 2^27 =~ 130M
const CM_WIDTH: usize = 1 << 24;

/// A Countmin table of specified width and height, counts input k-mers, returns
/// whether they have passed a count threshold.
/// A filter which counts input k-mers, returns whether they have passed a count threshold.
///
/// Also uses a blocked bloom filter as a first pass to remove singletons.
/// Uses a blocked bloom filter as a first pass to remove singletons.
/// Code for blocked bloom filter based on:
/// <https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/2021/10/02/wordbasedbloom.cpp>
///
/// This has the advantage of using less memory than a larger countmin filter,
/// being a bit faster (bloom is ~3x faster than countmin, but having count also
/// allows entry to dictionary to be only checked once for each passing k-mer)
///
/// Once passed through the bloom filter, a HashMap is used for counts >=2.
/// This filter therefore has no false-negatives and negligible false-negatives
#[derive(Debug, Clone, Default)]
pub struct KmerFilter {
/// Size of the bloom filter
buf_size: u64,
/// Buffer for the bloom filter
buffer: Vec<u64>,
/// Table width (estimated number of unique k-mers)
width: usize,
/// Number of bits to shift masked value to get table entry
width_shift: u32,
/// Table height (number of hashes)
height: usize,
/// Mask to convert hash into table column
mask: u64,
/// Table of counts
counts: Vec<u16>,
counts: HashMap<u64, u16>,
/// Minimum count to pass filter
min_count: u16,
}
Expand Down Expand Up @@ -98,41 +85,27 @@ impl KmerFilter {
}
}

/// Creates a new countmin table of specified size, and pass threshold
/// Creates a new filter with given threshold
///
/// Note:
/// - Must call [`KmerFilter::init()`] before using.
/// - Width will be rounded down to the closest power of two
/// - Maximum count is [`u16::MAX`] i.e. 65535
pub fn empty(min_count: u16) -> Self {
// Consistent with consts used, but ensures a power of two
let width_bits: usize = f64::floor(f64::log2(CM_WIDTH as f64)) as usize;
let width = 1 << (width_bits + 1);
// Use MSB rather than LSB
let width_shift = u64::BITS - width_bits as u32;
let mask = (width as u64 - 1) << width_shift;

/// - Must call [`KmerFilter::init()`] before using.
pub fn new(min_count: u16) -> Self {
let buf_size =
f64::round(BLOOM_WIDTH as f64 * (BITS_PER_ENTRY as f64 / 8.0) / (u64::BITS as f64))
as u64;
Self {
buf_size: f64::round(
BLOOM_WIDTH as f64 * (BITS_PER_ENTRY as f64 / 8.0) / (u64::BITS as f64),
) as u64,
buf_size,
buffer: Vec::new(),
width,
width_shift,
height: CM_HEIGHT,
mask,
counts: Vec::new(),
counts: HashMap::new(),
min_count,
}
}

/// Initialises table so it is ready for use.
///
/// Allocates memory and sets hash functions.
/// Allocates memory for bloom filter (which can be avoided with FASTA input)
pub fn init(&mut self) {
if self.counts.is_empty() {
self.counts = vec![0; self.width * self.height];
}
if self.buffer.is_empty() {
self.buffer.resize(self.buf_size as usize, 0);
}
Expand All @@ -148,28 +121,25 @@ impl KmerFilter {
0 | 1 => Ordering::Equal,
// Just the bloom filter
2 => {
if self.bloom_add_and_check(kmer.get_hash(0)) {
if self.bloom_add_and_check(kmer.get_hash()) {
Ordering::Equal
} else {
Ordering::Less
}
}
// Bloom filter then countmin
// Bloom filter then hash table
_ => {
if self.bloom_add_and_check(kmer.get_hash(0)) {
let mut count = 0;
for row_idx in 0..self.height {
let hash_val = kmer.get_hash(row_idx);
let table_idx = row_idx * self.width
+ (((hash_val & self.mask) >> self.width_shift) as usize);
self.counts[table_idx] = self.counts[table_idx].saturating_add(1);
if row_idx == 0 {
count = self.counts[table_idx];
} else {
count = u16::min(count, self.counts[table_idx]);
}
}
(count + 1).cmp(&self.min_count)
let kmer_hash = kmer.get_hash();
if self.bloom_add_and_check(kmer_hash) {
let mut count: u16 = 2;
self.counts
.entry(kmer_hash)
.and_modify(|curr_cnt| {
count = curr_cnt.saturating_add(1);
*curr_cnt = count
})
.or_insert(count);
self.min_count.cmp(&count)
} else {
Ordering::Less
}
Expand Down
15 changes: 0 additions & 15 deletions src/ska_dict/nthash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ const RC_HASH_LOOKUP: [u64; 4] = [
0x3c8b_fbb3_95c6_0474,
0x3193_c185_62a0_2b4c,
];
const MULTISHIFT: i32 = 27;
const MULTISEED: u64 = 0x90b4_5d39_fb6d_a1fa;

/// Stores forward and (optionally) reverse complement hashes of k-mers in a nucleotide sequence
#[derive(Debug)]
Expand Down Expand Up @@ -76,17 +74,4 @@ impl NtHashIterator {
self.fh
}
}

/// Generates more hashes of the sequence.
///
/// This only operates on the hash value, not the full sequence, so won't resolve
/// a full hash collision, but is suitable for use in the countmin table where
/// most of the hash is masked off.
pub fn extra_hash(&self, extra_idx: usize) -> u64 {
let mut new_hash = self
.curr_hash()
.wrapping_mul(MULTISEED.wrapping_mul((extra_idx ^ self.k) as u64));
new_hash ^= new_hash >> MULTISHIFT;
new_hash
}
}
19 changes: 5 additions & 14 deletions src/ska_dict/split_kmer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -296,23 +296,14 @@ impl<'a, IntT: for<'b> UInt<'b>> SplitKmer<'a, IntT> {

/// Get a `u64` hash of the current k-mer using [`NtHashIterator`]
///
/// Can get alternative hashes for a countmin table by giving an index
///
/// # Panics
///
/// If called after creating with `is_reads = false`
pub fn get_hash(&self, hash_idx: usize) -> u64 {
if hash_idx == 0 {
self.hash_gen
.as_ref()
.expect("Trying to get unitialised hash")
.curr_hash()
} else {
self.hash_gen
.as_ref()
.expect("Trying to get unitialised hash")
.extra_hash(hash_idx)
}
pub fn get_hash(&self) -> u64 {
self.hash_gen
.as_ref()
.expect("Trying to get unitialised hash")
.curr_hash()
}

/// Get the next split k-mer in the sequence
Expand Down

0 comments on commit 746bb6a

Please sign in to comment.