From ab68d1742520952514d5b9748804f05dbce01882 Mon Sep 17 00:00:00 2001 From: Henk den Bakker Date: Fri, 1 Mar 2019 23:03:07 -0500 Subject: [PATCH] fix low coverage bug --- src/batch_search_pe.rs | 2 +- src/build.rs | 284 ++++++++++++++++++++------------------- src/kmer.rs | 283 +++++++++++++++++++++++++++++++-------- src/main.rs | 2 +- src/perfect_search.rs | 2 +- src/read_id_mt_pe.rs | 294 +++++++++++++++++++++++++++++++++++------ src/seq.rs | 4 +- 7 files changed, 636 insertions(+), 235 deletions(-) diff --git a/src/batch_search_pe.rs b/src/batch_search_pe.rs index a6a3899..e7a4eb1 100644 --- a/src/batch_search_pe.rs +++ b/src/batch_search_pe.rs @@ -108,7 +108,7 @@ pub fn batch_search( eprintln!("{}", file1); eprintln!("Counting k-mers, this may take a while!"); let vec_query = kmer::read_fasta(file1.to_owned().to_string()); - let unfiltered = kmer::kmerize_vector(vec_query, k_size, 1); + let unfiltered = kmer::kmerize_vector(&vec_query, k_size, 1); let kmers_query = if gene_search { kmer::clean_map(unfiltered, 0) } else if filter < 0 { diff --git a/src/build.rs b/src/build.rs index 335e993..c0715f2 100644 --- a/src/build.rs +++ b/src/build.rs @@ -84,9 +84,9 @@ pub fn build_single( } else { let vec = kmer::read_fasta(v[0].to_string()); let kmers = if cutoff == -1 { - kmer::kmerize_vector(vec, k_size, 1) + kmer::kmerize_vector(&vec, k_size, 1) } else { - let unfiltered = kmer::kmerize_vector(vec, k_size, 1); + let unfiltered = kmer::kmerize_vector(&vec, k_size, 1); kmer::clean_map(unfiltered, cutoff as usize) }; ref_kmer.insert(accession.to_string(), kmers.len()); @@ -159,31 +159,17 @@ pub fn build_multi( let bloom_vec: Vec = (0..bloom_size).collect(); let d: Vec<_>; { - let c: Vec<_>; - eprintln!( - "Inference of Bloom filters in parallel using {} threads.", - t - ); - c = map_vec - .par_iter() - .map(|l| { - if l.1.len() == 2 { - let unfiltered = kmer::kmers_fq_pe_qual(vec![&l.1[0], &l.1[1]], k_size, 1, quality); - let kmers = if cutoff == -1 { - let auto_cutoff = kmer::auto_cutoff(unfiltered.to_owned()); - kmer::clean_map(unfiltered, auto_cutoff) - } else { - kmer::clean_map(unfiltered, cutoff as usize) - }; - let mut filter = - simple_bloom::BloomFilter::new(bloom_size as usize, num_hash as usize); - for kmer in kmers.keys() { - filter.insert(&kmer); - } - (l.0, filter.bits, kmers.len()) - } else { - if l.1[0].ends_with("gz") { - let unfiltered = kmer::kmers_from_fq_qual(l.1[0].to_owned(), k_size, 1, 15); + let c: Vec<_>; + eprintln!( + "Inference of Bloom filters in parallel using {} threads.", + t + ); + c = map_vec + .par_iter() + .map(|l| { + if l.1.len() == 2 { + let unfiltered = + kmer::kmers_fq_pe_qual(vec![&l.1[0], &l.1[1]], k_size, 1, quality); let kmers = if cutoff == -1 { let auto_cutoff = kmer::auto_cutoff(unfiltered.to_owned()); kmer::clean_map(unfiltered, auto_cutoff) @@ -197,51 +183,66 @@ pub fn build_multi( } (l.0, filter.bits, kmers.len()) } else { - let vec = kmer::read_fasta(l.1[0].to_string()); - let kmers = if cutoff == -1 { - kmer::kmerize_vector(vec, k_size, 1) + if l.1[0].ends_with("gz") { + let unfiltered = kmer::kmers_from_fq_qual(l.1[0].to_owned(), k_size, 1, 15); + let kmers = if cutoff == -1 { + let auto_cutoff = kmer::auto_cutoff(unfiltered.to_owned()); + kmer::clean_map(unfiltered, auto_cutoff) + } else { + kmer::clean_map(unfiltered, cutoff as usize) + }; + let mut filter = + simple_bloom::BloomFilter::new(bloom_size as usize, num_hash as usize); + for kmer in kmers.keys() { + filter.insert(&kmer); + } + (l.0, filter.bits, kmers.len()) } else { - let unfiltered = kmer::kmerize_vector(vec, k_size, 1); - kmer::clean_map(unfiltered, cutoff as usize) - }; - let mut filter = - simple_bloom::BloomFilter::new(bloom_size as usize, num_hash as usize); - for kmer in kmers.keys() { - filter.insert(&kmer); + let vec = kmer::read_fasta(l.1[0].to_string()); + let kmers = if cutoff == -1 { + kmer::kmerize_vector(&vec, k_size, 1) + } else { + let unfiltered = kmer::kmerize_vector(&vec, k_size, 1); + kmer::clean_map(unfiltered, cutoff as usize) + }; + let mut filter = + simple_bloom::BloomFilter::new(bloom_size as usize, num_hash as usize); + for kmer in kmers.keys() { + filter.insert(&kmer); + } + (l.0, filter.bits, kmers.len()) } - (l.0, filter.bits, kmers.len()) } - } - }) - .collect(); - for t in &c { - ref_kmer.insert(t.0.to_string(), t.2); - } - for accession in map.keys() { - accessions.push(accession); - } - accessions.sort(); - //create hash table with colors for accessions - let num_taxa = accessions.len(); - for (c, s) in accessions.iter().enumerate() { - accession_colors.insert(s.to_string(), c); - colors_accession.insert(c, s.to_string()); - } - eprintln!("Creation of index, this may take a while!"); - //create actual index, the most straight forward way, but not very efficient - d = bloom_vec - .par_iter() - .map(|i| { - let mut bitvec = BitVec::from_elem(num_taxa, false); - for a in &c { - if a.1[*i] { - bitvec.set(accession_colors[&a.0.to_string()], true); + }) + .collect(); + for t in &c { + ref_kmer.insert(t.0.to_string(), t.2); + } + for accession in map.keys() { + accessions.push(accession); + } + accessions.sort(); + //create hash table with colors for accessions + let num_taxa = accessions.len(); + for (c, s) in accessions.iter().enumerate() { + accession_colors.insert(s.to_string(), c); + colors_accession.insert(c, s.to_string()); + } + eprintln!("Creation of index, this may take a while!"); + //create actual index, the most straight forward way, but not very efficient + d = bloom_vec + .par_iter() + .map(|i| { + let mut bitvec = BitVec::from_elem(num_taxa, false); + for a in &c { + if a.1[*i] { + bitvec.set(accession_colors[&a.0.to_string()], true); + } } - } - (i, bitvec) - }) - .collect(); - drop(c); + (i, bitvec) + }) + .collect(); + drop(c); } let mut bigsi_map = fnv::FnvHashMap::default(); for t in d { @@ -285,33 +286,17 @@ pub fn build_multi_mini( let bloom_vec: Vec = (0..bloom_size).collect(); let d: Vec<_>; { - let c: Vec<_>; - eprintln!( - "Inference of Bloom filters in parallel using {} threads.", - t - ); - c = map_vec - .par_iter() - .map(|l| { - if l.1.len() == 2 { - let unfiltered = - kmer::kmers_fq_pe_minimizer_qual(vec![&l.1[0], &l.1[1]], k_size, m, 1, quality); - let kmers = if cutoff == -1 { - let auto_cutoff = kmer::auto_cutoff(unfiltered.to_owned()); - kmer::clean_map(unfiltered, auto_cutoff) - } else { - kmer::clean_map(unfiltered, cutoff as usize) - }; - let mut filter = - simple_bloom::BloomFilter::new(bloom_size as usize, num_hash as usize); - for kmer in kmers.keys() { - filter.insert(&kmer); - } - (l.0, filter.bits, kmers.len()) - } else { - if l.1[0].ends_with("gz") { - let unfiltered = kmer::kmers_from_fq_minimizer_qual( - l.1[0].to_owned(), + let c: Vec<_>; + eprintln!( + "Inference of Bloom filters in parallel using {} threads.", + t + ); + c = map_vec + .par_iter() + .map(|l| { + if l.1.len() == 2 { + let unfiltered = kmer::kmers_fq_pe_minimizer_qual( + vec![&l.1[0], &l.1[1]], k_size, m, 1, @@ -330,52 +315,73 @@ pub fn build_multi_mini( } (l.0, filter.bits, kmers.len()) } else { - let vec = kmer::read_fasta(l.1[0].to_string()); - let kmers = if cutoff == -1 { - kmer::minimerize_vector_skip_n(&vec, k_size, m, 1) + if l.1[0].ends_with("gz") { + let unfiltered = kmer::kmers_from_fq_minimizer_qual( + l.1[0].to_owned(), + k_size, + m, + 1, + quality, + ); + let kmers = if cutoff == -1 { + let auto_cutoff = kmer::auto_cutoff(unfiltered.to_owned()); + kmer::clean_map(unfiltered, auto_cutoff) + } else { + kmer::clean_map(unfiltered, cutoff as usize) + }; + let mut filter = + simple_bloom::BloomFilter::new(bloom_size as usize, num_hash as usize); + for kmer in kmers.keys() { + filter.insert(&kmer); + } + (l.0, filter.bits, kmers.len()) } else { - let unfiltered = kmer::minimerize_vector_skip_n(&vec, k_size, m, 1); - kmer::clean_map(unfiltered, cutoff as usize) - }; - let mut filter = - simple_bloom::BloomFilter::new(bloom_size as usize, num_hash as usize); - for kmer in kmers.keys() { - filter.insert(&kmer); + let vec = kmer::read_fasta(l.1[0].to_string()); + let kmers = if cutoff == -1 { + kmer::minimerize_vector_skip_n(&vec, k_size, m, 1) + } else { + let unfiltered = kmer::minimerize_vector_skip_n(&vec, k_size, m, 1); + kmer::clean_map(unfiltered, cutoff as usize) + }; + let mut filter = + simple_bloom::BloomFilter::new(bloom_size as usize, num_hash as usize); + for kmer in kmers.keys() { + filter.insert(&kmer); + } + (l.0, filter.bits, kmers.len()) } - (l.0, filter.bits, kmers.len()) } - } - }) - .collect(); - for t in &c { - ref_kmer.insert(t.0.to_string(), t.2); - } - for accession in map.keys() { - accessions.push(accession); - } - accessions.sort(); - //create hash table with colors for accessions - let num_taxa = accessions.len(); - for (c, s) in accessions.iter().enumerate() { - accession_colors.insert(s.to_string(), c); - colors_accession.insert(c, s.to_string()); - } - eprintln!("Creation of index, this may take a while!"); - //create actual index, the most straight forward way, but not very efficient - //this can be done in parallel! - d = bloom_vec - .par_iter() - .map(|i| { - let mut bitvec = BitVec::from_elem(num_taxa, false); - for a in &c { - if a.1[*i] { - bitvec.set(accession_colors[&a.0.to_string()], true); - } - } - (i, bitvec) - }) - .collect(); + }) + .collect(); + for t in &c { + ref_kmer.insert(t.0.to_string(), t.2); + } + for accession in map.keys() { + accessions.push(accession); } + accessions.sort(); + //create hash table with colors for accessions + let num_taxa = accessions.len(); + for (c, s) in accessions.iter().enumerate() { + accession_colors.insert(s.to_string(), c); + colors_accession.insert(c, s.to_string()); + } + eprintln!("Creation of index, this may take a while!"); + //create actual index, the most straight forward way, but not very efficient + //this can be done in parallel! + d = bloom_vec + .par_iter() + .map(|i| { + let mut bitvec = BitVec::from_elem(num_taxa, false); + for a in &c { + if a.1[*i] { + bitvec.set(accession_colors[&a.0.to_string()], true); + } + } + (i, bitvec) + }) + .collect(); + } let mut bigsi_map = fnv::FnvHashMap::default(); for t in d { if t.1.none() { @@ -440,9 +446,9 @@ pub fn build_single_mini( } else { let vec = kmer::read_fasta(v[0].to_string()); let kmers = if cutoff == -1 { - kmer::kmerize_vector(vec, k_size, 1) + kmer::kmerize_vector(&vec, k_size, 1) } else { - let unfiltered = kmer::kmerize_vector(vec, k_size, 1); + let unfiltered = kmer::kmerize_vector(&vec, k_size, 1); kmer::clean_map(unfiltered, cutoff as usize) }; ref_kmer.insert(accession.to_string(), kmers.len()); diff --git a/src/kmer.rs b/src/kmer.rs index f8549a2..eeb426c 100644 --- a/src/kmer.rs +++ b/src/kmer.rs @@ -85,7 +85,7 @@ pub fn read_fasta_mf(filename: String) -> (Vec, Vec) { #[inline] pub fn kmerize_vector( - v: Vec, + v: &Vec, k: usize, d: usize, ) -> fnv::FnvHashMap { @@ -124,6 +124,41 @@ pub fn kmerize_vector( map } +#[inline] +pub fn kmerize_vector_set( + v: &Vec, + k: usize, + d: usize, +) -> fnv::FnvHashSet { + let mut set = fnv::FnvHashSet::default(); + for l in v { + if l.len() < k { + continue; + } else { + let length_l = l.len(); + if length_l < k { + continue; + } else { + let l_r = revcomp(&l); + for i in (0..l.len() - k + 1).step_by(d) { + if seq::has_no_n(l[i..i + k].as_bytes()) { + if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] { + set.insert(l[i..i + k].to_string().to_uppercase()); + } else { + set.insert( + l_r[length_l - (i + k)..length_l - i] + .to_string() + .to_uppercase(), + ); + } + } + } + } + } + } + set +} + #[inline] pub fn kmerize_vector_uppercase( v: Vec, @@ -182,6 +217,56 @@ pub fn kmerize_vector_skip_n( map } +#[inline] +pub fn kmerize_vector_skip_n_set( + v: &Vec, + k: usize, + d: usize, +) -> fnv::FnvHashSet { + let mut map = fnv::FnvHashSet::default(); + for l in v { + let length_l = l.len(); + let l_r = revcomp(&l); + for i in (0..l.len() - k + 1).step_by(d) { + if seq::has_no_n(l[i..i + k].as_bytes()) { + if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] { + map.insert(l[i..i + k].to_string()); + } else { + map.insert(l_r[length_l - (i + k)..length_l - i].to_string()); + } + } else { + continue; + } + } + } + map +} + +#[inline] +pub fn kmerize_vector_skip_n_set_str( + v: &Vec<&str>, + k: usize, + d: usize, +) -> fnv::FnvHashSet { + let mut map = fnv::FnvHashSet::default(); + for l in v { + let length_l = l.len(); + let l_r = revcomp(&l); + for i in (0..l.len() - k + 1).step_by(d) { + if seq::has_no_n(l[i..i + k].as_bytes()) { + if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] { + map.insert(l[i..i + k].to_string()); + } else { + map.insert(l_r[length_l - (i + k)..length_l - i].to_string()); + } + } else { + continue; + } + } + } + map +} + #[inline] pub fn kmerize_string(l: String, k: usize) -> Option> { let mut map = fnv::FnvHashMap::default(); @@ -275,6 +360,72 @@ pub fn minimerize_vector_skip_n( map } +pub fn minimerize_vector_skip_n_set( + v: &Vec, + k: usize, + m: usize, + d: usize, +) -> fnv::FnvHashSet { + let mut map = fnv::FnvHashSet::default(); + for l in v { + let length_l = l.len(); + if length_l < k { + continue; + } else { + let l_r = revcomp(&l); + for i in (0..l.len() - k + 1).step_by(d) { + //if i % d == 0 { + if seq::has_no_n(l[i..i + k].as_bytes()) { + if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] { + let min = find_minimizer(&l[i..i + k], m); + map.insert(min.to_uppercase()); + } else { + let min = find_minimizer(&l_r[length_l - (i + k)..length_l - i], m); + map.insert(min.to_uppercase()); + } + } else { + continue; + } + //} + } + } + } + map +} + +pub fn minimerize_vector_skip_n_set_str( + v: &Vec<&str>, + k: usize, + m: usize, + d: usize, +) -> fnv::FnvHashSet { + let mut map = fnv::FnvHashSet::default(); + for l in v { + let length_l = l.len(); + if length_l < k { + continue; + } else { + let l_r = revcomp(&l); + for i in (0..l.len() - k + 1).step_by(d) { + //if i % d == 0 { + if seq::has_no_n(l[i..i + k].as_bytes()) { + if l[i..i + k] < l_r[length_l - (i + k)..length_l - i] { + let min = find_minimizer(&l[i..i + k], m); + map.insert(min.to_uppercase()); + } else { + let min = find_minimizer(&l_r[length_l - (i + k)..length_l - i], m); + map.insert(min.to_uppercase()); + } + } else { + continue; + } + //} + } + } + } + map +} + pub fn kmers_from_fq(filename: String, k: usize) -> fnv::FnvHashMap { let f = File::open(filename).expect("file not found"); let mut map = fnv::FnvHashMap::default(); @@ -328,7 +479,7 @@ pub fn kmers_from_fq_qual( fastq.seq1 = l.to_owned(); } else if line_count % 4 == 0 { fastq.qual1 = l.to_owned(); - let masked1 = seq::qual_mask(fastq.seq1.to_owned(), fastq.qual1, qual_offset); + let masked1 = seq::qual_mask(&fastq.seq1, &fastq.qual1, qual_offset); for l in vec![masked1] { let length_l = l.len(); if length_l < k { @@ -465,8 +616,8 @@ pub fn kmers_fq_pe_qual( Some(l2) => { fastq.qual1 = l.to_owned(); fastq.qual2 = l2.unwrap().to_owned(); - let masked1 = seq::qual_mask(fastq.seq1.to_owned(), fastq.qual1, qual_offset); - let masked2 = seq::qual_mask(fastq.seq2.to_owned(), fastq.qual2, qual_offset); + let masked1 = seq::qual_mask(&fastq.seq1, &fastq.qual1, qual_offset); + let masked2 = seq::qual_mask(&fastq.seq2, &fastq.qual2, qual_offset); for l in vec![masked1, masked2] { let length_l = l.len(); if length_l < k { @@ -579,8 +730,8 @@ pub fn kmers_fq_pe_minimizer_qual( Some(l2) => { fastq.qual1 = l.to_owned(); fastq.qual2 = l2.unwrap().to_owned(); - let masked1 = seq::qual_mask(fastq.seq1.to_owned(), fastq.qual1, qual_offset); - let masked2 = seq::qual_mask(fastq.seq2.to_owned(), fastq.qual2, qual_offset); + let masked1 = seq::qual_mask(&fastq.seq1, &fastq.qual1, qual_offset); + let masked2 = seq::qual_mask(&fastq.seq2, &fastq.qual2, qual_offset); for l in vec![masked1, masked2] { let length_l = l.len(); if length_l < k { @@ -639,7 +790,7 @@ pub fn kmers_from_fq_minimizer_qual( fastq.seq1 = l.to_owned(); } else if line_count % 4 == 0 { fastq.qual1 = l.to_owned(); - let masked1 = seq::qual_mask(fastq.seq1.to_owned(), fastq.qual1, qual_offset); + let masked1 = seq::qual_mask(&fastq.seq1, &fastq.qual1, qual_offset); for l in vec![masked1] { let length_l = l.len(); if length_l < k { @@ -686,14 +837,14 @@ pub fn clean_map( } pub fn revcomp(dna: &str) -> String { - let mut rc_dna: String = String::with_capacity(dna.len()); + let mut rc_dna = String::with_capacity(dna.len()); for c in dna.chars().rev() { - rc_dna.push(switch_base(c)) + rc_dna.push(switch_base(&c)) } rc_dna } -fn switch_base(c: char) -> char { +fn switch_base(c: &char) -> char { match c { 'a' => 't', 'c' => 'g', @@ -714,53 +865,79 @@ fn switch_base(c: char) -> char { //auto cutoff inference from Zam Iqbal's Cortex pub fn auto_cutoff(map: fnv::FnvHashMap) -> usize { let mut histo_map = fnv::FnvHashMap::default(); - for (_key, value) in map { - *histo_map.entry(value).or_insert(0) += 1; - } - let mut count_vec: Vec<_> = histo_map.iter().collect(); - count_vec.sort_by(|&(a, _), &(b, _)| a.cmp(&b)); - let mut coverages: Vec = Vec::with_capacity(count_vec.len()); - for v in count_vec { - coverages.push(*v.1); - } - //first pseudo-derivative - let mut d1 = Vec::new(); - for i in 1..coverages.len() - 1 { - d1.push(coverages[i] as f64 / coverages[i + 1] as f64); - } - //second pseudo-derivative - let mut d2 = Vec::new(); - for i in 0..d1.len() - 1 { - d2.push(d1[i] / d1[i + 1]); - } - let mut first_pos_d1 = 0; - let mut first_pos_d2 = 0; - let threshold: f64 = 1.0; - for (i, p) in d1.iter().enumerate() { - if p < &threshold { - first_pos_d1 = i + 1; - break; + let mut max_cov = 0; + for (_key, value) in &map { + if value > &max_cov{ + max_cov = *value; } + *histo_map.entry(value).or_insert(0) += 1; } - for (i, p) in d2.iter().enumerate() { - if p < &threshold { - first_pos_d2 = i + 1; - break; + //eprintln!("length histo {}", histo_map.len()); + //not in original code of Zam: do not filter when mean coverage is close to 1 + let mut sum = 0; + for (i, p) in &histo_map { + sum += *i * p; } - } - //estimate coverage (mean), exclude singleton k-mers - let mut bigsum = 0; - for (i, p) in coverages[1..].iter().enumerate() { - bigsum += i * p; - } - let num_kmers: usize = coverages[1..].iter().sum(); - let mean: f64 = bigsum as f64 / num_kmers as f64; - if (first_pos_d1 > 0) && ((first_pos_d1 as f64) < (mean * 0.75)) { - first_pos_d1 - } else if first_pos_d2 > 0 { - first_pos_d2 + let num_kmers_total = map.len(); + let total_mean: f64 = sum as f64 / num_kmers_total as f64; + if total_mean < 1.5 { + 0 } else { - cmp::max(1, (mean / 2.0).ceil() as usize) + let mut count_vec = Vec::with_capacity(max_cov); + for c in 1..max_cov{ + if !histo_map.contains_key(&c){ + count_vec.push((c,0)); + }else{ + count_vec.push((c, *histo_map.get(&c).unwrap())); + } + } + /* + let mut count_vec: Vec<_> = histo_map.iter().collect(); + count_vec.sort_by(|&(a, _), &(b, _)| a.cmp(&b)); + */ + let mut coverages: Vec = Vec::with_capacity(count_vec.len()); + for v in count_vec { + coverages.push(v.1); + } + //first pseudo-derivative + let mut d1 = Vec::new(); + for i in 1..coverages.len() - 1 { + d1.push(coverages[i] as f64 / coverages[i + 1] as f64); + } + //second pseudo-derivative + let mut d2 = Vec::new(); + for i in 0..d1.len() - 1 { + d2.push(d1[i] / d1[i + 1]); + } + let mut first_pos_d1 = 0; + let mut first_pos_d2 = 0; + let threshold: f64 = 1.0; + for (i, p) in d1.iter().enumerate() { + if p < &threshold { + first_pos_d1 = i + 1; + break; + } + } + for (i, p) in d2.iter().enumerate() { + if p < &threshold { + first_pos_d2 = i + 1; + break; + } + } + //estimate coverage (mean), exclude singleton k-mers + let mut bigsum = 0; + for (i, p) in coverages[1..].iter().enumerate() { + bigsum += i * p; + } + let num_kmers: usize = coverages[1..].iter().sum(); + let mean: f64 = bigsum as f64 / num_kmers as f64; + if (first_pos_d1 > 0) && ((first_pos_d1 as f64) < (mean * 0.75)) { + first_pos_d1 + } else if first_pos_d2 > 0 { + first_pos_d2 + } else { + cmp::max(1, (mean / 2.0).ceil() as usize) + } } } diff --git a/src/main.rs b/src/main.rs index 5ec92b3..9f05e54 100644 --- a/src/main.rs +++ b/src/main.rs @@ -14,7 +14,7 @@ static GLOBAL: System = System; fn main() -> std::io::Result<()> { let matches = App::new("colorid") - .version("0.1.4.0") + .version("0.1.4.1") .author("Henk C. den Bakker ") .about("BIGSI based taxonomic ID of sequence data") .setting(AppSettings::ArgRequiredElseHelp) diff --git a/src/perfect_search.rs b/src/perfect_search.rs index c2ba9c2..d1f6836 100644 --- a/src/perfect_search.rs +++ b/src/perfect_search.rs @@ -17,7 +17,7 @@ pub fn batch_search( //only fasta formatted file! eprintln!("Counting k-mers, this may take a while!"); let vec_query = kmer::read_fasta(file.to_owned()); - let kmers_query = kmer::kmerize_vector(vec_query, k_size, 1); + let kmers_query = kmer::kmerize_vector(&vec_query, k_size, 1); eprintln!("{} kmers in query", kmers_query.len()); if kmers_query.len() == 0 { eprintln!("Warning! no kmers in query; maybe your kmer length is larger than your query length?"); diff --git a/src/read_id_mt_pe.rs b/src/read_id_mt_pe.rs index 8f74270..c1d1a1f 100644 --- a/src/read_id_mt_pe.rs +++ b/src/read_id_mt_pe.rs @@ -38,7 +38,7 @@ pub fn false_prob_map( } #[inline] -pub fn bitwise_and(vector_of_bitvectors: Vec<&BitVec>) -> BitVec { +pub fn bitwise_and(vector_of_bitvectors: &Vec<&BitVec>) -> BitVec { let mut first = vector_of_bitvectors[0].to_owned(); if vector_of_bitvectors.len() == 1 { first @@ -63,17 +63,16 @@ pub fn vec_strings_to_string(vector_in: &Vec) -> String { comma_separated } - pub fn search_index_classic( bigsi_map: &fnv::FnvHashMap, - map: &fnv::FnvHashMap, + map: &fnv::FnvHashSet, bloom_size: usize, num_hash: usize, no_hits_num: usize, ) -> fnv::FnvHashMap { let mut report = fnv::FnvHashMap::default(); let empty_bitvec = BitVec::from_elem(no_hits_num, false); - for k in map.keys() { + for k in map { let mut kmer_slices = Vec::new(); for i in 0..num_hash { let bit_index = @@ -89,7 +88,7 @@ pub fn search_index_classic( *report.entry(no_hits_num).or_insert(0) += 1; break; } else { - let first = bitwise_and(kmer_slices); + let first = bitwise_and(&kmer_slices); let mut color = 0; for item in first { if item { @@ -102,10 +101,9 @@ pub fn search_index_classic( report } - pub fn search_index( bigsi_map: &fnv::FnvHashMap, - map: &fnv::FnvHashMap, + map: &fnv::FnvHashSet, bloom_size: usize, num_hash: usize, no_hits_num: usize, @@ -114,7 +112,7 @@ pub fn search_index( let mut report = fnv::FnvHashSet::default(); let mut final_report = fnv::FnvHashMap::default(); let mut counter = 0; - for k in map.keys() { + for k in map { if counter < start_sample { let mut kmer_slices = Vec::new(); for i in 0..num_hash { @@ -129,7 +127,7 @@ pub fn search_index( *final_report.entry(no_hits_num).or_insert(0) += 1; break; } else { - let first = bitwise_and(kmer_slices); + let first = bitwise_and(&kmer_slices); let mut color: usize = 0; for item in &first { if item { @@ -153,7 +151,7 @@ pub fn search_index( *final_report.entry(no_hits_num).or_insert(0) += 1; break; } else { - let first = bitwise_and(kmer_slices); + let first = bitwise_and(&kmer_slices); for item in &report { if first[*item] { *final_report.entry(*item).or_insert(0) += 1; @@ -252,7 +250,6 @@ pub fn kmer_poll_plus<'a>( } } - pub struct SeqRead { pub id: String, //id including > pub seq: Vec, // sequence @@ -267,6 +264,20 @@ impl SeqRead { } } +pub struct SeqReadstr<'a> { + pub id: &'a str, //id including > + pub seq: Vec<&'a str>, // sequence +} + +impl<'a> SeqReadstr<'a> { + pub fn new() -> SeqReadstr<'a> { + SeqReadstr { + id: "", + seq: Vec::new(), + } + } +} + #[allow(unused_assignments)] pub fn parallel_vec( vec: &Vec<(String, Vec)>, @@ -302,14 +313,105 @@ pub fn parallel_vec( ) } else { let map = if m == 0 { - kmer::kmerize_vector_skip_n(&r.1, k, d) + kmer::kmerize_vector_skip_n_set(&r.1, k, d) } else { - kmer::minimerize_vector_skip_n(&r.1, k, m, d) + kmer::minimerize_vector_skip_n_set(&r.1, k, m, d) }; - let report = if start_sample == 0{ + let report = if start_sample == 0 { search_index_classic(&child_bigsi, &map, bloom_size, num_hash, no_hits_num) - }else{ - search_index(&child_bigsi, &map, bloom_size, num_hash, no_hits_num, start_sample) + } else { + search_index( + &child_bigsi, + &map, + bloom_size, + num_hash, + no_hits_num, + start_sample, + ) + }; + if report.is_empty() { + ( + r.0.to_owned(), + "no_hits".to_string(), + 0 as usize, + map.len(), + "accept".to_string(), + 0 as usize, + ) + } else { + let classification = kmer_poll_plus( + &report, + map.len(), + &colors_accession, + &child_fp, + no_hits_num, + fp_correct, + ); + ( + r.0.to_owned(), + classification.0, + classification.1.to_owned(), + classification.2.to_owned(), + classification.3.to_owned(), + classification.4, + ) + } + } + }) + .collect(); + out_vec +} + +#[allow(unused_assignments)] +pub fn parallel_vec_str( + vec: &Vec<(&str, Vec<&str>)>, + bigsi: &fnv::FnvHashMap, + colors_accession: &fnv::FnvHashMap, + ref_kmers_in: &fnv::FnvHashMap, + bloom_size: usize, + num_hash: usize, + no_hits_num: usize, + k: usize, + m: usize, + d: usize, + fp_correct: f64, + start_sample: usize, +) -> std::vec::Vec<(std::string::String, String, usize, usize, String, usize)> { + let false_positive_p = false_prob_map(colors_accession, ref_kmers_in, bloom_size, num_hash); + let my_bigsi: Arc<&fnv::FnvHashMap> = Arc::new(bigsi); + let false_positive_p_arc: Arc> = Arc::new(false_positive_p); + let mut out_vec: Vec<_> = vec![]; + out_vec = vec + .par_iter() + .map(|r| { + let child_bigsi = my_bigsi.clone(); + let child_fp = false_positive_p_arc.clone(); + if r.1[0].len() < k { + ( + r.0.to_owned(), + "too_short".to_string(), + 0 as usize, + 0 as usize, + "accept".to_string(), + 0 as usize, + ) + } else { + let map = if m == 0 { + kmer::kmerize_vector_skip_n_set_str(&r.1, k, d) + } else { + kmer::minimerize_vector_skip_n_set_str(&r.1, k, m, d) + }; + let report = if start_sample == 0 { + search_index_classic(&child_bigsi, &map, bloom_size, num_hash, no_hits_num) + } else { + search_index( + &child_bigsi, + &map, + bloom_size, + num_hash, + no_hits_num, + start_sample, + ) }; if report.is_empty() { ( @@ -360,10 +462,11 @@ pub fn stream_fasta( b: usize, prefix: &str, start_sample: usize, -) -{ +) { let f = File::open(filenames[0]).expect("file not found"); - let iter1 = io::BufReader::new(f).lines(); + //let iter1 = io::BufReader::new(f).lines(); + let mut iter1 = io::BufReader::new(f); + let mut l = String::with_capacity(250); let mut vec = Vec::with_capacity(b); let no_hits_num: usize = colors_accession.len(); let mut file = @@ -377,16 +480,17 @@ pub fn stream_fasta( let mut count = 0; let mut read_count = 0; let mut fasta = SeqRead::new(); - for line in iter1 { - let l = line.unwrap().to_string(); + //for line in iter1 { + while iter1.read_line(&mut l).unwrap() > 0 { + //let l = line.unwrap(); if count == 0 { - fasta.id = l; + fasta.id = l[..l.len() - 1].to_string(); } else { if l.contains('>') { if sub_string.len() > 0 { fasta.seq = vec![sub_string.to_string()]; vec.push((fasta.id, fasta.seq)); - fasta.id = l; + fasta.id = l[..l.len() - 1].to_string(); sub_string.clear(); } } else { @@ -423,6 +527,7 @@ pub fn stream_fasta( } vec.clear(); } + l.clear(); } fasta.seq = vec![sub_string.to_string()]; vec.push((fasta.id, fasta.seq)); @@ -466,6 +571,130 @@ pub fn stream_fasta( } } } +/* +#[allow(unused_assignments)] +pub fn stream_fasta_str( + filenames: Vec<&str>, + bigsi_map: &fnv::FnvHashMap, //has to be an Arc ? + colors_accession: &fnv::FnvHashMap, + ref_kmers_in: &fnv::FnvHashMap, + bloom_size: usize, + num_hash: usize, + k: usize, + m: usize, + t: usize, //threads + d: usize, //downsample factor + fp_correct: f64, + b: usize, + prefix: &str, + start_sample: usize, +) +{ + let f = File::open(filenames[0]).expect("file not found"); + let iter1 = io::BufReader::new(f).lines(); + let mut vec = Vec::with_capacity(b); + let no_hits_num: usize = colors_accession.len(); + let mut file = + File::create(format!("{}_reads.txt", prefix)).expect("could not create outfile!"); + let mut sub_string = String::new(); + ThreadPoolBuilder::new() + .num_threads(t) + .build_global() + .expect("Can't initialize ThreadPoolBuilder"); + let search_time = SystemTime::now(); + let mut count = 0; + let mut read_count = 0; + let mut fasta = SeqReadstr::new(); + for line in iter1 { + let l = line.unwrap(); + if count == 0 { + fasta.id = &l; + } else { + if l.contains('>') { + if sub_string.len() > 0 { + fasta.seq = vec![&sub_string]; + vec.push((fasta.id, fasta.seq)); + fasta.id = &l; + sub_string.clear(); + } + } else { + sub_string.push_str(&l); + } + } + count += 1; + if vec.len() % b == 0 { + let c = parallel_vec_str( + &vec, + bigsi_map, + colors_accession, + ref_kmers_in, + bloom_size, + num_hash, + no_hits_num, + k, + m, + d, + fp_correct, + start_sample, + ); + read_count += c.len(); + eprint!(" {} reads classified\r", read_count); + for id in c { + file.write_all( + format!( + "{}\t{}\t{}\t{}\t{}\t{}\n", + id.0, id.1, id.2, id.3, id.4, id.5 + ) + .as_bytes(), + ) + .expect("could not write results!"); + } + vec.clear(); + } + } + fasta.seq = vec![&sub_string]; + vec.push((fasta.id, fasta.seq)); + let c = parallel_vec_str( + &vec, + bigsi_map, + colors_accession, + ref_kmers_in, + bloom_size, + num_hash, + no_hits_num, + k, + m, + d, + fp_correct, + start_sample, + ); + read_count += c.len(); + for id in c { + file.write_all( + format!( + "{}\t{}\t{}\t{}\t{}\t{}\n", + id.0, id.1, id.2, id.3, id.4, id.5 + ) + .as_bytes(), + ) + .expect("could not write results!"); + } + eprint!(" {} reads classified\r", read_count); + match search_time.elapsed() { + Ok(elapsed) => { + eprintln!( + "Classified {} reads in {} seconds", + read_count, + elapsed.as_secs() + ); + } + Err(e) => { + // an error occurred! + eprintln!("Error: {:?}", e); + } + } +} +*/ pub fn false_prob(m: f64, k: f64, n: f64) -> f64 { let e = std::f64::consts::E; @@ -489,8 +718,7 @@ pub fn per_read_stream_pe( prefix: &str, qual_offset: u8, start_sample: usize, -) -{ +) { let search_time = SystemTime::now(); let mut vec = Vec::with_capacity(b); let mut line_count = 1; @@ -529,14 +757,8 @@ pub fn per_read_stream_pe( vec.push(( fastq.id.to_owned(), vec![ - seq::qual_mask(fastq.seq1.to_owned(), l.to_owned(), qual_offset) - .to_owned(), - seq::qual_mask( - fastq.seq2.to_owned(), - l2.unwrap().to_owned(), - qual_offset, - ) - .to_owned(), + seq::qual_mask(&fastq.seq1, &l, qual_offset).to_owned(), + seq::qual_mask(&fastq.seq2, &l2.unwrap(), qual_offset).to_owned(), ], )); } @@ -662,11 +884,7 @@ pub fn per_read_stream_se( } else if line_count % 4 == 0 { vec.push(( fastq.id.to_owned(), - vec![seq::qual_mask( - fastq.seq1.to_owned(), - l.to_owned(), - qual_offset, - )], + vec![seq::qual_mask(&fastq.seq1, &l, qual_offset)], )); } line_count += 1; diff --git a/src/seq.rs b/src/seq.rs index d8caf0b..a2cf025 100644 --- a/src/seq.rs +++ b/src/seq.rs @@ -33,9 +33,9 @@ impl Fastq { } //from L. Katz fasten! make a method for Fastq -pub fn qual_mask(seq: String, qual: String, max_quality_offset: u8) -> String { +pub fn qual_mask(seq: &String, qual: &String, max_quality_offset: u8) -> String { if max_quality_offset == 0 { - seq + seq.to_string() } else { let max_quality: u8 = max_quality_offset + 33; let mut seq_chars = seq.chars();