From 1b23141357043c4b847713d17249c7bdc3d4d61c Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 6 Jul 2023 11:58:35 +0100 Subject: [PATCH 1/6] Documentation improvements Closes #43 --- src/io_utils.rs | 12 ++++++++++++ src/lib.rs | 26 +++++++++++++++++++++++++- src/merge_ska_dict.rs | 9 ++++++++- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/src/io_utils.rs b/src/io_utils.rs index c06cd1e..46b284d 100644 --- a/src/io_utils.rs +++ b/src/io_utils.rs @@ -132,3 +132,15 @@ pub fn get_input_list( None => read_input_fastas(seq_files.as_ref().unwrap()), } } + +/// Checks if any input files are fastq +pub fn any_fastq(files: &[InputFastx]) -> bool { + let mut fastq = false; + for file in files { + if file.2.is_some() { + fastq = true; + break; + } + } + fastq +} diff --git a/src/lib.rs b/src/lib.rs index c0fb53d..e6bef0b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,7 +28,7 @@ //! Various optimisations are used to make this as fast as possible. For a more thorough comparison with version 1.0 of SKA, see the //! [github description](https://github.com/bacpop/ska.rust/blob/master/README.md). //! -//! //! *NB As split k-mers are even lengths, it is possible that they are their +//! *NB As split k-mers are even lengths, it is possible that they are their //! own reverse complement. The original version of ska would randomly pick a strand, //! possibly introducing a SNP across samples. This version uses an ambiguous middle //! base (W for A/T; S for C/G) to represent this case.* @@ -50,6 +50,18 @@ //! to stream into another compatible program on STDIN. You can also add an output //! file prefix directly with `-o` (for `ska build` this is required). //! +//! ## Important notes +//! +//! - In this version of ska the k-mer length is the _total_ split k-mer size, +//! whereas in ska1 the k-mer length is half the split length. So the default of +//! k=15 in ska1 corresponds to choosing `-k 31` in this version. +//! - If you are using FASTQ input files (reads), these must be provided as two +//! deinterleaved files using the `-f` option to `ska build`. Providing them as +//! a single file will treat them as FASTA, and not correct sequencing errors. +//! - If you are running `ska map`, it is more memory efficient to run these one at +//! a time, rather than merging a single `.skf` file. A command for doing this +//! in parallel is listed below. +//! //! ## Common options //! //! Version can be viewed by running `ska -V`. @@ -168,6 +180,18 @@ //! ska map ref.fa seq1.fa seq2.fa -f vcf --threads 2 | bgzip -c - > seqs.vcf.gz //! ``` //! +//! ### Efficiency +//! +//! As `ska map` is independent for each input file, the most efficient way to +//! run this on a number of samples is with gnu parallel, for example: +//! ``` +//! cat ${FILE_LIST} | parallel -j 8 "ID=\$(basename {} | sed 's/_a.*fasta/_temp_skf/g'); +//! ska build -o \$ID -k 31 {} ; +//! MAPOUT=\$(basename {} | sed 's/_a.*fasta/_aln/g'); +//! ska map $REFERENCE_LOC \${ID}.skf -o \${MAPOUT}.aln" +//! cat *.aln > ska_map.aln +//! ``` +//! //! ## ska distance //! //! Use to calculate distances between all samples within an `.skf` file. The output diff --git a/src/merge_ska_dict.rs b/src/merge_ska_dict.rs index 19788fe..e5b412b 100644 --- a/src/merge_ska_dict.rs +++ b/src/merge_ska_dict.rs @@ -14,6 +14,7 @@ use hashbrown::HashMap; use indicatif::ProgressIterator; use super::QualOpts; +use crate::io_utils::any_fastq; use crate::ska_dict::bit_encoding::UInt; use crate::ska_dict::SkaDict; @@ -330,7 +331,13 @@ where { // Build indexes log::info!("Building skf dicts from sequence input"); - log::info!("If FASTQ input: {qual}"); + + if any_fastq(input_files) { + log::info!("FASTQ files filtered with: {qual}"); + } else { + log::info!("All input files FASTA (no error filtering)"); + } + if threads > 1 { rayon::ThreadPoolBuilder::new() .num_threads(threads) From 6475a2377a46b69f4bded57923af5188fae78884 Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 6 Jul 2023 13:05:53 +0100 Subject: [PATCH 2/6] Fix doc compile --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index e6bef0b..11f69b0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -184,7 +184,7 @@ //! //! As `ska map` is independent for each input file, the most efficient way to //! run this on a number of samples is with gnu parallel, for example: -//! ``` +//! ```bash //! cat ${FILE_LIST} | parallel -j 8 "ID=\$(basename {} | sed 's/_a.*fasta/_temp_skf/g'); //! ska build -o \$ID -k 31 {} ; //! MAPOUT=\$(basename {} | sed 's/_a.*fasta/_aln/g'); From 643cfa07dae09b28f3b7aa636f589e0b96ed1988 Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 6 Jul 2023 15:32:31 +0100 Subject: [PATCH 3/6] Add code to mask all ambig sites as N --- src/cli.rs | 9 ++++++++- src/generic_modes.rs | 10 +++++++++- src/lib.rs | 22 ++++++++++++++++++++++ src/merge_ska_array.rs | 9 ++++++++- src/ska_ref.rs | 10 ++++++++-- src/ska_ref/aln_writer.rs | 15 +++++++++++++-- 6 files changed, 68 insertions(+), 7 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index aef89cd..65839aa 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -71,13 +71,15 @@ pub enum FileType { Aln, } -/// Possible variant filters for align +/// Possible variant filters #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)] pub enum FilterType { /// Output all variants NoFilter, /// Filter constant bases NoConst, + /// Replace all ambiguous bases with N + NoAmbig, /// Filter constant bases, and any variants where there are ambiguous bases NoAmbigOrConst, } @@ -88,6 +90,7 @@ impl fmt::Display for FilterType { match *self { Self::NoFilter => write!(f, "No filtering"), Self::NoConst => write!(f, "No constant sites"), + Self::NoAmbig => write!(f, "Ambiguous sites as Ns"), Self::NoAmbigOrConst => write!(f, "No constant sites or ambiguous bases"), } } @@ -191,6 +194,10 @@ pub enum Commands { #[arg(short, long, value_enum, default_value_t = FileType::Aln)] format: FileType, + /// Filter for constant middle base sites + #[arg(long, value_enum, default_value_t = FilterType::NoFilter)] + filter: FilterType, + /// Mask any repeats in the alignment with 'N' #[arg(long, default_value_t = DEFAULT_REPEATMASK)] repeat_mask: bool, diff --git a/src/generic_modes.rs b/src/generic_modes.rs index 2853a16..a6f321c 100644 --- a/src/generic_modes.rs +++ b/src/generic_modes.rs @@ -189,7 +189,15 @@ pub fn weed UInt<'a>>( ska_array.kmer_len(), ska_array.rc() ); - let ska_weed = RefSka::new(ska_array.kmer_len(), weed_fasta, ska_array.rc(), false); + let repeat_mask = false; + let filter_ambig = false; + let ska_weed = RefSka::new( + ska_array.kmer_len(), + weed_fasta, + ska_array.rc(), + repeat_mask, + filter_ambig, + ); if !reverse { log::info!("Removing weed k-mers"); diff --git a/src/lib.rs b/src/lib.rs index 11f69b0..d952b7f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -529,10 +529,30 @@ pub fn main() { input, output, format, + filter, repeat_mask, threads, } => { check_threads(*threads); + + // Parse filter options, give verbose output + let filter_ambig = match *filter { + FilterType::NoAmbig => { + log::info!("Replacing ambiguous bases with N"); + true + } + FilterType::NoAmbigOrConst => { + log::info!("Replacing ambiguous bases with N"); + log::warn!("Cannot remove constant bases when mapping"); + true + } + FilterType::NoConst => { + log::warn!("Cannot remove constant bases when mapping"); + false + } + FilterType::NoFilter => false, + }; + log::info!("Loading skf as dictionary"); if let Ok(mut ska_array) = load_array::(input, *threads) { log::info!( @@ -545,6 +565,7 @@ pub fn main() { reference, ska_array.rc(), *repeat_mask, + filter_ambig, ); map(&mut ska_array, &mut ska_ref, output, format, *threads); } else if let Ok(mut ska_array) = load_array::(input, *threads) { @@ -558,6 +579,7 @@ pub fn main() { reference, ska_array.rc(), *repeat_mask, + filter_ambig, ); map(&mut ska_array, &mut ska_ref, output, format, *threads); } else { diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index 78f2115..952fc0d 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -253,7 +253,7 @@ where let ((count, row), kmer) = count_it; if *count >= min_count { let keep_var = match *filter { - FilterType::NoFilter => true, + FilterType::NoFilter | FilterType::NoAmbig => true, FilterType::NoConst => { let first_var = row[0]; let mut keep = false; @@ -299,6 +299,13 @@ where if update_kmers { self.split_kmers = filtered_kmers; } + + // Replace any ambiguous variants with Ns, if requested + if *filter == FilterType::NoAmbig { + self.variants + .mapv_inplace(|v| if is_ambiguous(v) { b'N' } else { v }); + } + log::info!("Filtering removed {removed} split k-mers"); removed } diff --git a/src/ska_ref.rs b/src/ska_ref.rs index 9fae006..83b70a5 100644 --- a/src/ska_ref.rs +++ b/src/ska_ref.rs @@ -98,6 +98,8 @@ where k: usize, /// Concatenated list of split k-mers split_kmer_pos: Vec>, + /// Replace ambiguous bases with N + filter_ambig: bool, /// Input sequence @@ -158,7 +160,7 @@ where /// - File doesn't exist or can't be opened. /// - File cannot be parsed as FASTA (FASTQ is not supported). /// - If there are no valid split k-mers. - pub fn new(k: usize, filename: &str, rc: bool, repeat_mask: bool) -> Self { + pub fn new(k: usize, filename: &str, rc: bool, repeat_mask: bool, filter_ambig: bool) -> Self { if !(5..=63).contains(&k) || k % 2 == 0 { panic!("Invalid k-mer length"); } @@ -267,6 +269,7 @@ where Self { k, seq, + filter_ambig, chrom_names, split_kmer_pos, repeat_coors, @@ -344,7 +347,10 @@ where } let mut seq_writers = - vec![AlnWriter::new(&self.seq, self.k, &self.repeat_coors); self.mapped_names.len()]; + vec![ + AlnWriter::new(&self.seq, self.k, &self.repeat_coors, self.filter_ambig); + self.mapped_names.len() + ]; rayon::ThreadPoolBuilder::new() .num_threads(threads) .build_global() diff --git a/src/ska_ref/aln_writer.rs b/src/ska_ref/aln_writer.rs index fb94efb..05b59df 100644 --- a/src/ska_ref/aln_writer.rs +++ b/src/ska_ref/aln_writer.rs @@ -39,6 +39,8 @@ pub struct AlnWriter<'a> { finalised: bool, /// Repeats to mask, which happens at finalisation repeat_regions: &'a Vec, + /// Whether to treat all ambiguous bases as Ns + mask_ambig: bool, /// Buffer for amibguous bases, which are written at finalisation. _ambig_out: Vec<(u8, usize)>, } @@ -46,7 +48,12 @@ pub struct AlnWriter<'a> { impl<'a> AlnWriter<'a> { /// Create a new [`AlnWriter`] taking the reference sequence mapped against /// and the k-mer size used for the mapping - pub fn new(ref_seq: &'a Vec>, k: usize, repeat_regions: &'a Vec) -> Self { + pub fn new( + ref_seq: &'a Vec>, + k: usize, + repeat_regions: &'a Vec, + mask_ambig: bool, + ) -> Self { let total_size = ref_seq.iter().map(|x| x.len()).sum(); let half_split_len = (k - 1) / 2; Self { @@ -60,6 +67,7 @@ impl<'a> AlnWriter<'a> { half_split_len, finalised: false, repeat_regions, + mask_ambig, _ambig_out: Vec::new(), } } @@ -121,7 +129,10 @@ impl<'a> AlnWriter<'a> { // Ambiguous bases may clash with the flanks, which are copied from // reference. Deal with these in `finalise`. if is_ambiguous(base) { - self._ambig_out.push((base, mapped_pos + self.chrom_offset)); + self._ambig_out.push(( + if self.mask_ambig { b'N' } else { base }, + mapped_pos + self.chrom_offset, + )); } self.last_mapped = mapped_pos; } else { From 828b4b13490add4bd74c02888fdaa2f740313bc3 Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 6 Jul 2023 15:41:43 +0100 Subject: [PATCH 4/6] Fix doc tests --- src/merge_ska_array.rs | 3 ++- src/ska_ref.rs | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index 952fc0d..74a9bed 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -64,7 +64,8 @@ use crate::cli::FilterType; /// /// // Delete k-mers /// let mask_repeats = false; -/// let ska_weed = RefSka::new(ska_array.kmer_len(), &"tests/test_files_in/weed.fa", ska_array.rc(), mask_repeats); +/// let mask_ambiguous = false; +/// let ska_weed = RefSka::new(ska_array.kmer_len(), &"tests/test_files_in/weed.fa", ska_array.rc(), mask_repeats, mask_ambiguous); /// let reverse = false; /// ska_array.weed(&ska_weed, reverse); /// ``` diff --git a/src/ska_ref.rs b/src/ska_ref.rs index 83b70a5..4c054dd 100644 --- a/src/ska_ref.rs +++ b/src/ska_ref.rs @@ -21,7 +21,8 @@ //! //! // Index a reference sequence //! let mask_repeats = false; -//! let mut ref_kmers = RefSka::new(ska_dict.kmer_len(), &"tests/test_files_in/test_ref.fa", ska_dict.rc(), mask_repeats); +//! let mask_ambiguous = false; +//! let mut ref_kmers = RefSka::new(ska_dict.kmer_len(), &"tests/test_files_in/test_ref.fa", ska_dict.rc(), mask_repeats, mask_ambiguous); //! //! // Run mapping, output an alignment to stdout //! ref_kmers.map(&ska_dict); From f49c1e6573a1fd31219f68aa91f8ac840c0ff734 Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 6 Jul 2023 15:47:27 +0100 Subject: [PATCH 5/6] Add test for ambiguous base filtering in ska map --- src/ska_ref/aln_writer.rs | 16 ++++++++-------- .../map_aln_k9_filter.stdout | 4 ++++ 2 files changed, 12 insertions(+), 8 deletions(-) create mode 100644 tests/test_results_correct/map_aln_k9_filter.stdout diff --git a/src/ska_ref/aln_writer.rs b/src/ska_ref/aln_writer.rs index 05b59df..7227fb4 100644 --- a/src/ska_ref/aln_writer.rs +++ b/src/ska_ref/aln_writer.rs @@ -125,15 +125,15 @@ impl<'a> AlnWriter<'a> { if mapped_chrom > self.curr_chrom { self.fill_contig(); } + // Ambiguous bases may clash with the flanks, which are copied from + // reference. Deal with these in `finalise`. + if is_ambiguous(base) { + self._ambig_out.push(( + if self.mask_ambig { b'N' } else { base }, + mapped_pos + self.chrom_offset, + )); + } if mapped_pos < self.next_pos { - // Ambiguous bases may clash with the flanks, which are copied from - // reference. Deal with these in `finalise`. - if is_ambiguous(base) { - self._ambig_out.push(( - if self.mask_ambig { b'N' } else { base }, - mapped_pos + self.chrom_offset, - )); - } self.last_mapped = mapped_pos; } else { // Write bases between last match and this one diff --git a/tests/test_results_correct/map_aln_k9_filter.stdout b/tests/test_results_correct/map_aln_k9_filter.stdout new file mode 100644 index 0000000..549e2be --- /dev/null +++ b/tests/test_results_correct/map_aln_k9_filter.stdout @@ -0,0 +1,4 @@ +>test_1 +------------GATTTGAAGGCGTTTATGATATTTAGCTGTACGAGAGTNTTTTAAGAGTNTTTTGATGGTTT------------ +>test_2 +------------GATTTGAAGGCGTTTATGATATATAGCTGTACGAGAGTCTTTTAAAAGTGTTTTGATGGTTT------------ From 3a3f48d963e5ca72d5e2dea31fe48d7a69e8ef4a Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 7 Jul 2023 14:42:04 +0100 Subject: [PATCH 6/6] Add a separate mask ambiguous option --- src/cli.rs | 22 +++++++--- src/generic_modes.rs | 22 ++++++---- src/lib.rs | 35 +++++----------- src/merge_ska_array.rs | 41 ++++++++++++++---- src/ska_ref.rs | 11 +++-- tests/align.rs | 44 +++++++++++++++++--- tests/map.rs | 10 +++++ tests/skf_ops.rs | 23 ++++++++++ tests/test_results_correct/weed_nk_k9.stdout | 9 ++++ 9 files changed, 160 insertions(+), 57 deletions(-) create mode 100644 tests/test_results_correct/weed_nk_k9.stdout diff --git a/src/cli.rs b/src/cli.rs index 65839aa..b598634 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -11,6 +11,8 @@ pub const DEFAULT_KMER: usize = 17; pub const DEFAULT_STRAND: bool = false; /// Default repeat masking behaviour pub const DEFAULT_REPEATMASK: bool = false; +/// Default ambiguous masking behaviour +pub const DEFAULT_AMBIGMASK: bool = false; /// Default minimum k-mer count for FASTQ files pub const DEFAULT_MINCOUNT: u16 = 10; /// Default minimum base quality (PHRED score) for FASTQ files @@ -78,9 +80,9 @@ pub enum FilterType { NoFilter, /// Filter constant bases NoConst, - /// Replace all ambiguous bases with N + /// Filter any site with an ambiguous base NoAmbig, - /// Filter constant bases, and any variants where there are ambiguous bases + /// Filter constant bases, and any ambiguous bases NoAmbigOrConst, } @@ -90,7 +92,7 @@ impl fmt::Display for FilterType { match *self { Self::NoFilter => write!(f, "No filtering"), Self::NoConst => write!(f, "No constant sites"), - Self::NoAmbig => write!(f, "Ambiguous sites as Ns"), + Self::NoAmbig => write!(f, "No ambiguous sites"), Self::NoAmbigOrConst => write!(f, "No constant sites or ambiguous bases"), } } @@ -174,6 +176,10 @@ pub enum Commands { #[arg(long, value_enum, default_value_t = FilterType::NoConst)] filter: FilterType, + /// Mask any ambiguous bases in the alignment with 'N' + #[arg(long, default_value_t = DEFAULT_AMBIGMASK)] + ambig_mask: bool, + /// Number of CPU threads #[arg(long, value_parser = valid_cpus, default_value_t = 1)] threads: usize, @@ -194,9 +200,9 @@ pub enum Commands { #[arg(short, long, value_enum, default_value_t = FileType::Aln)] format: FileType, - /// Filter for constant middle base sites - #[arg(long, value_enum, default_value_t = FilterType::NoFilter)] - filter: FilterType, + /// Mask any ambiguous bases in the alignment with 'N' + #[arg(long, default_value_t = DEFAULT_AMBIGMASK)] + ambig_mask: bool, /// Mask any repeats in the alignment with 'N' #[arg(long, default_value_t = DEFAULT_REPEATMASK)] @@ -273,6 +279,10 @@ pub enum Commands { /// Filter for constant middle base sites #[arg(long, value_enum, default_value_t = FilterType::NoFilter)] filter: FilterType, + + /// Mask any ambiguous bases in the alignment with 'N' + #[arg(long, default_value_t = DEFAULT_AMBIGMASK)] + ambig_mask: bool, }, /// Get the number of k-mers in a split k-mer file, and other information Nk { diff --git a/src/generic_modes.rs b/src/generic_modes.rs index a6f321c..b02deac 100644 --- a/src/generic_modes.rs +++ b/src/generic_modes.rs @@ -19,13 +19,14 @@ pub fn align UInt<'a>>( ska_array: &mut MergeSkaArray, output: &Option, filter: &FilterType, + mask_ambig: bool, min_freq: f64, ) { // In debug mode (cannot be set from CLI, give details) log::debug!("{ska_array}"); // Apply filters - apply_filters(ska_array, min_freq, filter); + apply_filters(ska_array, min_freq, filter, mask_ambig); // Write out to file/stdout log::info!("Writing alignment"); @@ -102,11 +103,12 @@ pub fn apply_filters UInt<'a>>( ska_array: &mut MergeSkaArray, min_freq: f64, filter: &FilterType, + ambig_mask: bool, ) -> i32 { let update_kmers = false; let filter_threshold = f64::ceil(ska_array.nsamples() as f64 * min_freq) as usize; - log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter}"); - ska_array.filter(filter_threshold, filter, update_kmers) + log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask}"); + ska_array.filter(filter_threshold, filter, ambig_mask, update_kmers) } /// Calculate distances between samples @@ -122,12 +124,13 @@ pub fn distance UInt<'a>>( // In debug mode (cannot be set from CLI, give details) log::debug!("{ska_array}"); - let constant = apply_filters(ska_array, min_freq, &FilterType::NoConst); + let mask_ambig = false; + let constant = apply_filters(ska_array, min_freq, &FilterType::NoConst, mask_ambig); if filt_ambig || (min_freq * ska_array.nsamples() as f64 >= 1.0) { if filt_ambig { - apply_filters(ska_array, min_freq, &FilterType::NoAmbigOrConst); + apply_filters(ska_array, min_freq, &FilterType::NoAmbigOrConst, mask_ambig); } else { - apply_filters(ska_array, min_freq, &FilterType::NoFilter); + apply_filters(ska_array, min_freq, &FilterType::NoFilter, mask_ambig); } } @@ -181,6 +184,7 @@ pub fn weed UInt<'a>>( reverse: bool, min_freq: f64, filter: &FilterType, + ambig_mask: bool, out_file: &str, ) { if let Some(weed_fasta) = weed_file { @@ -208,10 +212,10 @@ pub fn weed UInt<'a>>( } let filter_threshold = f64::floor(ska_array.nsamples() as f64 * min_freq) as usize; - if filter_threshold > 0 || *filter != FilterType::NoFilter { - log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter}"); + if filter_threshold > 0 || *filter != FilterType::NoFilter || ambig_mask { + log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask}"); let update_kmers = true; - ska_array.filter(filter_threshold, filter, update_kmers); + ska_array.filter(filter_threshold, filter, ambig_mask, update_kmers); } log::info!("Saving modified skf file"); diff --git a/src/lib.rs b/src/lib.rs index d952b7f..4b7bbd2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -366,7 +366,8 @@ //! let min_count = 2; //! let update_kmers = false; //! let filter = FilterType::NoConst; -//! ska_array.filter(min_count, &filter, update_kmers); +//! let ambig_mask = false; +//! ska_array.filter(min_count, &filter, update_kmers, ambig_mask); //! //! // Write out to stdout //! let mut out_stream = set_ostream(&None); @@ -509,17 +510,18 @@ pub fn main() { output, min_freq, filter, + ambig_mask, threads, } => { check_threads(*threads); if let Ok(mut ska_array) = load_array::(input, *threads) { // In debug mode (cannot be set from CLI, give details) log::debug!("{ska_array}"); - align(&mut ska_array, output, filter, *min_freq); + align(&mut ska_array, output, filter, *ambig_mask, *min_freq); } else if let Ok(mut ska_array) = load_array::(input, *threads) { // In debug mode (cannot be set from CLI, give details) log::debug!("{ska_array}"); - align(&mut ska_array, output, filter, *min_freq); + align(&mut ska_array, output, filter, *ambig_mask, *min_freq); } else { panic!("Could not read input file(s): {input:?}"); } @@ -529,30 +531,12 @@ pub fn main() { input, output, format, - filter, + ambig_mask, repeat_mask, threads, } => { check_threads(*threads); - // Parse filter options, give verbose output - let filter_ambig = match *filter { - FilterType::NoAmbig => { - log::info!("Replacing ambiguous bases with N"); - true - } - FilterType::NoAmbigOrConst => { - log::info!("Replacing ambiguous bases with N"); - log::warn!("Cannot remove constant bases when mapping"); - true - } - FilterType::NoConst => { - log::warn!("Cannot remove constant bases when mapping"); - false - } - FilterType::NoFilter => false, - }; - log::info!("Loading skf as dictionary"); if let Ok(mut ska_array) = load_array::(input, *threads) { log::info!( @@ -564,8 +548,8 @@ pub fn main() { ska_array.kmer_len(), reference, ska_array.rc(), + *ambig_mask, *repeat_mask, - filter_ambig, ); map(&mut ska_array, &mut ska_ref, output, format, *threads); } else if let Ok(mut ska_array) = load_array::(input, *threads) { @@ -578,8 +562,8 @@ pub fn main() { ska_array.kmer_len(), reference, ska_array.rc(), + *ambig_mask, *repeat_mask, - filter_ambig, ); map(&mut ska_array, &mut ska_ref, output, format, *threads); } else { @@ -653,6 +637,7 @@ pub fn main() { weed_file, reverse, min_freq, + ambig_mask, filter, } => { log::info!("Loading skf file"); @@ -663,6 +648,7 @@ pub fn main() { *reverse, *min_freq, filter, + *ambig_mask, skf_file, ); } else if let Ok(mut ska_array) = MergeSkaArray::::load(skf_file) { @@ -672,6 +658,7 @@ pub fn main() { *reverse, *min_freq, filter, + *ambig_mask, skf_file, ); } else { diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index 74a9bed..a23dfe5 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -55,8 +55,9 @@ use crate::cli::FilterType; /// // Remove constant sites and save /// let min_count = 1; // no filtering by minor allele frequency /// let filter = FilterType::NoAmbigOrConst; // remove sites with no minor allele +/// let mask_ambiguous = false; // leave ambiguous sites as IUPAC codes /// let update_counts = true; // keep counts updated, as saving -/// ska_array.filter(min_count, &filter, update_counts); +/// ska_array.filter(min_count, &filter, mask_ambiguous, update_counts); /// ska_array.save(&"no_const_sites.skf"); /// /// // Delete a sample @@ -64,7 +65,6 @@ use crate::cli::FilterType; /// /// // Delete k-mers /// let mask_repeats = false; -/// let mask_ambiguous = false; /// let ska_weed = RefSka::new(ska_array.kmer_len(), &"tests/test_files_in/weed.fa", ska_array.rc(), mask_repeats, mask_ambiguous); /// let reverse = false; /// ska_array.weed(&ska_weed, reverse); @@ -235,11 +235,18 @@ where /// /// - `min_count` -- minimum number of samples split k-mer found in. /// - `const_sites` -- include sites where all middle bases are the same. + /// - `mask_ambig` -- replace any non-ACGTUN- with N /// - `update_kmers` -- update counts and split k-mers after removing variants. /// /// The default for `update_kmers` should be `true`, but it can be `false` /// if [`write_fasta()`] will be called immediately afterwards. - pub fn filter(&mut self, min_count: usize, filter: &FilterType, update_kmers: bool) -> i32 { + pub fn filter( + &mut self, + min_count: usize, + filter: &FilterType, + mask_ambig: bool, + update_kmers: bool, + ) -> i32 { let total = self.names.len(); let mut filtered_variants = Array2::zeros((0, total)); let mut filtered_counts = Vec::new(); @@ -254,7 +261,7 @@ where let ((count, row), kmer) = count_it; if *count >= min_count { let keep_var = match *filter { - FilterType::NoFilter | FilterType::NoAmbig => true, + FilterType::NoFilter => true, FilterType::NoConst => { let first_var = row[0]; let mut keep = false; @@ -266,6 +273,16 @@ where } keep } + FilterType::NoAmbig => { + let mut keep = true; + for var in row { + if is_ambiguous(*var) { + keep = false; + break; + } + } + keep + } FilterType::NoAmbigOrConst => { let mut keep = false; let mut first_var = None; @@ -300,14 +317,22 @@ where if update_kmers { self.split_kmers = filtered_kmers; } + log::info!("Filtering removed {removed} split k-mers"); // Replace any ambiguous variants with Ns, if requested - if *filter == FilterType::NoAmbig { - self.variants - .mapv_inplace(|v| if is_ambiguous(v) { b'N' } else { v }); + if mask_ambig { + let mut masked = 0; + self.variants.mapv_inplace(|v| { + if is_ambiguous(v) { + masked += 1; + b'N' + } else { + v + } + }); + log::info!("Masked {masked} ambiguous bases (non-A/C/G/T/U/N/-) with 'N'"); } - log::info!("Filtering removed {removed} split k-mers"); removed } diff --git a/src/ska_ref.rs b/src/ska_ref.rs index 4c054dd..c248c01 100644 --- a/src/ska_ref.rs +++ b/src/ska_ref.rs @@ -100,7 +100,7 @@ where /// Concatenated list of split k-mers split_kmer_pos: Vec>, /// Replace ambiguous bases with N - filter_ambig: bool, + ambig_mask: bool, /// Input sequence @@ -161,7 +161,7 @@ where /// - File doesn't exist or can't be opened. /// - File cannot be parsed as FASTA (FASTQ is not supported). /// - If there are no valid split k-mers. - pub fn new(k: usize, filename: &str, rc: bool, repeat_mask: bool, filter_ambig: bool) -> Self { + pub fn new(k: usize, filename: &str, rc: bool, ambig_mask: bool, repeat_mask: bool) -> Self { if !(5..=63).contains(&k) || k % 2 == 0 { panic!("Invalid k-mer length"); } @@ -270,7 +270,7 @@ where Self { k, seq, - filter_ambig, + ambig_mask, chrom_names, split_kmer_pos, repeat_coors, @@ -346,10 +346,13 @@ where if !self.is_mapped() { panic!("No split k-mers mapped to reference"); } + if self.ambig_mask { + log::info!("Masking any ambiguous bases (non-A/C/G/T/U/N/-) with 'N'"); + } let mut seq_writers = vec![ - AlnWriter::new(&self.seq, self.k, &self.repeat_coors, self.filter_ambig); + AlnWriter::new(&self.seq, self.k, &self.repeat_coors, self.ambig_mask); self.mapped_names.len() ]; rayon::ThreadPoolBuilder::new() diff --git a/tests/align.rs b/tests/align.rs index d920e8b..6228c3b 100644 --- a/tests/align.rs +++ b/tests/align.rs @@ -162,7 +162,7 @@ fn filters() { let unfilt_align_out = Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) .arg("align") - .arg(sandbox.file_string("merge_k9.fa", TestDir::Input)) + .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) .arg("--filter") .arg("no-filter") .arg("-v") @@ -170,9 +170,26 @@ fn filters() { .unwrap() .stdout; + let full_length = 38; // could also get 38 from ska nk let lengths = aln_length(&unfilt_align_out); for length in lengths { - assert_eq!(39, length); // could also get 39 from ska nk + assert_eq!(full_length, length); + } + + let noambig_align_out = Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("align") + .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) + .arg("--filter") + .arg("no-ambig") + .arg("-v") + .output() + .unwrap() + .stdout; + + let lengths = aln_length(&noambig_align_out); + for length in lengths { + assert_eq!(full_length - 1, length); } let const_filt_align_out = Command::new(cargo_bin("ska")) @@ -186,10 +203,10 @@ fn filters() { .unwrap() .stdout; - let mut correct_aln = HashSet::from([vec!['T', 'A'], vec!['C', 'T'], vec!['S', 'G']]); + let correct_aln = HashSet::from([vec!['T', 'A'], vec!['C', 'T'], vec!['S', 'G']]); assert_eq!(var_hash(&const_filt_align_out), correct_aln); - let no_ambig_align_out = Command::new(cargo_bin("ska")) + let no_ambig_or_const_align_out = Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) .arg("align") .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) @@ -200,8 +217,23 @@ fn filters() { .unwrap() .stdout; - correct_aln.remove(&vec!['S', 'G']); - assert_eq!(var_hash(&no_ambig_align_out), correct_aln); + let correct_aln = HashSet::from([vec!['T', 'A'], vec!['C', 'T']]); + assert_eq!(var_hash(&no_ambig_or_const_align_out), correct_aln); + + let ambig_filter_align_out = Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("align") + .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) + .arg("--filter") + .arg("no-const") + .arg("--ambig-mask") + .arg("-v") + .output() + .unwrap() + .stdout; + + let correct_aln = HashSet::from([vec!['T', 'A'], vec!['C', 'T'], vec!['N', 'G']]); + assert_eq!(var_hash(&ambig_filter_align_out), correct_aln); } #[test] diff --git a/tests/map.rs b/tests/map.rs index ed4ea12..339d937 100644 --- a/tests/map.rs +++ b/tests/map.rs @@ -42,6 +42,16 @@ fn map_aln() { .assert() .stdout_eq_path(sandbox.file_string("map_aln_k9.stdout", TestDir::Correct)); + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("map") + .arg(sandbox.file_string("test_ref.fa", TestDir::Input)) + .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) + .arg("--ambig-mask") + .arg("-v") + .assert() + .stdout_eq_path(sandbox.file_string("map_aln_k9_filter.stdout", TestDir::Correct)); + Command::new(cargo_bin("ska")) .current_dir(sandbox.get_wd()) .arg("map") diff --git a/tests/skf_ops.rs b/tests/skf_ops.rs index d2f652f..674d2e1 100644 --- a/tests/skf_ops.rs +++ b/tests/skf_ops.rs @@ -202,6 +202,29 @@ fn weed() { .assert() .stdout_matches_path(sandbox.file_string("weed_nk.stdout", TestDir::Correct)); + // Masking ambig sites + Command::new("cp") + .current_dir(sandbox.get_wd()) + .arg(sandbox.file_string("merge_k9.skf", TestDir::Input)) + .arg("merge_k9.skf") + .assert() + .success(); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("weed") + .arg("merge_k9.skf") + .arg("--ambig-mask") + .assert() + .success(); + + Command::new(cargo_bin("ska")) + .current_dir(sandbox.get_wd()) + .arg("nk") + .arg("merge_k9.skf") + .assert() + .stdout_matches_path(sandbox.file_string("weed_nk_k9.stdout", TestDir::Correct)); + // Keep rather than weed Command::new("cp") .current_dir(sandbox.get_wd()) diff --git a/tests/test_results_correct/weed_nk_k9.stdout b/tests/test_results_correct/weed_nk_k9.stdout new file mode 100644 index 0000000..26093c8 --- /dev/null +++ b/tests/test_results_correct/weed_nk_k9.stdout @@ -0,0 +1,9 @@ +ska_version=[..] +k=9 +k_bits=64 +rc=true +k-mers=68 +samples=2 +sample_names=["test_1", "test_2"] +sample_kmers=[53, 53] +