Merge pull request #44 from bacpop/0.3.1-candidate

Enchanced filtering options
bacpop · Jul 7, 2023 · 5d6c1dc · 5d6c1dc
2 parents 93b7ecf + 3a3f48d
commit 5d6c1dc
Show file tree

Hide file tree

Showing 13 changed files with 248 additions and 35 deletions.
diff --git a/src/cli.rs b/src/cli.rs
@@ -11,6 +11,8 @@ pub const DEFAULT_KMER: usize = 17;
 pub const DEFAULT_STRAND: bool = false;
 /// Default repeat masking behaviour
 pub const DEFAULT_REPEATMASK: bool = false;
+/// Default ambiguous masking behaviour
+pub const DEFAULT_AMBIGMASK: bool = false;
 /// Default minimum k-mer count for FASTQ files
 pub const DEFAULT_MINCOUNT: u16 = 10;
 /// Default minimum base quality (PHRED score) for FASTQ files
@@ -71,14 +73,16 @@ pub enum FileType {
     Aln,
 }
 
-/// Possible variant filters for align
+/// Possible variant filters
 #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
 pub enum FilterType {
     /// Output all variants
     NoFilter,
     /// Filter constant bases
     NoConst,
-    /// Filter constant bases, and any variants where there are ambiguous bases
+    /// Filter any site with an ambiguous base
+    NoAmbig,
+    /// Filter constant bases, and any ambiguous bases
     NoAmbigOrConst,
 }
 
@@ -88,6 +92,7 @@ impl fmt::Display for FilterType {
         match *self {
             Self::NoFilter => write!(f, "No filtering"),
             Self::NoConst => write!(f, "No constant sites"),
+            Self::NoAmbig => write!(f, "No ambiguous sites"),
             Self::NoAmbigOrConst => write!(f, "No constant sites or ambiguous bases"),
         }
     }
@@ -171,6 +176,10 @@ pub enum Commands {
         #[arg(long, value_enum, default_value_t = FilterType::NoConst)]
         filter: FilterType,
 
+        /// Mask any ambiguous bases in the alignment with 'N'
+        #[arg(long, default_value_t = DEFAULT_AMBIGMASK)]
+        ambig_mask: bool,
+
         /// Number of CPU threads
         #[arg(long, value_parser = valid_cpus, default_value_t = 1)]
         threads: usize,
@@ -191,6 +200,10 @@ pub enum Commands {
         #[arg(short, long, value_enum, default_value_t = FileType::Aln)]
         format: FileType,
 
+        /// Mask any ambiguous bases in the alignment with 'N'
+        #[arg(long, default_value_t = DEFAULT_AMBIGMASK)]
+        ambig_mask: bool,
+
         /// Mask any repeats in the alignment with 'N'
         #[arg(long, default_value_t = DEFAULT_REPEATMASK)]
         repeat_mask: bool,
@@ -266,6 +279,10 @@ pub enum Commands {
         /// Filter for constant middle base sites
         #[arg(long, value_enum, default_value_t = FilterType::NoFilter)]
         filter: FilterType,
+
+        /// Mask any ambiguous bases in the alignment with 'N'
+        #[arg(long, default_value_t = DEFAULT_AMBIGMASK)]
+        ambig_mask: bool,
     },
     /// Get the number of k-mers in a split k-mer file, and other information
     Nk {

diff --git a/src/generic_modes.rs b/src/generic_modes.rs
@@ -19,13 +19,14 @@ pub fn align<IntT: for<'a> UInt<'a>>(
     ska_array: &mut MergeSkaArray<IntT>,
     output: &Option<String>,
     filter: &FilterType,
+    mask_ambig: bool,
     min_freq: f64,
 ) {
     // In debug mode (cannot be set from CLI, give details)
     log::debug!("{ska_array}");
 
     // Apply filters
-    apply_filters(ska_array, min_freq, filter);
+    apply_filters(ska_array, min_freq, filter, mask_ambig);
 
     // Write out to file/stdout
     log::info!("Writing alignment");
@@ -102,11 +103,12 @@ pub fn apply_filters<IntT: for<'a> UInt<'a>>(
     ska_array: &mut MergeSkaArray<IntT>,
     min_freq: f64,
     filter: &FilterType,
+    ambig_mask: bool,
 ) -> i32 {
     let update_kmers = false;
     let filter_threshold = f64::ceil(ska_array.nsamples() as f64 * min_freq) as usize;
-    log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter}");
-    ska_array.filter(filter_threshold, filter, update_kmers)
+    log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask}");
+    ska_array.filter(filter_threshold, filter, ambig_mask, update_kmers)
 }
 
 /// Calculate distances between samples
@@ -122,12 +124,13 @@ pub fn distance<IntT: for<'a> UInt<'a>>(
     // In debug mode (cannot be set from CLI, give details)
     log::debug!("{ska_array}");
 
-    let constant = apply_filters(ska_array, min_freq, &FilterType::NoConst);
+    let mask_ambig = false;
+    let constant = apply_filters(ska_array, min_freq, &FilterType::NoConst, mask_ambig);
     if filt_ambig || (min_freq * ska_array.nsamples() as f64 >= 1.0) {
         if filt_ambig {
-            apply_filters(ska_array, min_freq, &FilterType::NoAmbigOrConst);
+            apply_filters(ska_array, min_freq, &FilterType::NoAmbigOrConst, mask_ambig);
         } else {
-            apply_filters(ska_array, min_freq, &FilterType::NoFilter);
+            apply_filters(ska_array, min_freq, &FilterType::NoFilter, mask_ambig);
         }
     }
 
@@ -181,6 +184,7 @@ pub fn weed<IntT: for<'a> UInt<'a>>(
     reverse: bool,
     min_freq: f64,
     filter: &FilterType,
+    ambig_mask: bool,
     out_file: &str,
 ) {
     if let Some(weed_fasta) = weed_file {
@@ -189,7 +193,15 @@ pub fn weed<IntT: for<'a> UInt<'a>>(
             ska_array.kmer_len(),
             ska_array.rc()
         );
-        let ska_weed = RefSka::new(ska_array.kmer_len(), weed_fasta, ska_array.rc(), false);
+        let repeat_mask = false;
+        let filter_ambig = false;
+        let ska_weed = RefSka::new(
+            ska_array.kmer_len(),
+            weed_fasta,
+            ska_array.rc(),
+            repeat_mask,
+            filter_ambig,
+        );
 
         if !reverse {
             log::info!("Removing weed k-mers");
@@ -200,10 +212,10 @@ pub fn weed<IntT: for<'a> UInt<'a>>(
     }
 
     let filter_threshold = f64::floor(ska_array.nsamples() as f64 * min_freq) as usize;
-    if filter_threshold > 0 || *filter != FilterType::NoFilter {
-        log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter}");
+    if filter_threshold > 0 || *filter != FilterType::NoFilter || ambig_mask {
+        log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask}");
         let update_kmers = true;
-        ska_array.filter(filter_threshold, filter, update_kmers);
+        ska_array.filter(filter_threshold, filter, ambig_mask, update_kmers);
     }
 
     log::info!("Saving modified skf file");

diff --git a/src/io_utils.rs b/src/io_utils.rs
@@ -132,3 +132,15 @@ pub fn get_input_list(
         None => read_input_fastas(seq_files.as_ref().unwrap()),
     }
 }
+
+/// Checks if any input files are fastq
+pub fn any_fastq(files: &[InputFastx]) -> bool {
+    let mut fastq = false;
+    for file in files {
+        if file.2.is_some() {
+            fastq = true;
+            break;
+        }
+    }
+    fastq
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -28,7 +28,7 @@
 //! Various optimisations are used to make this as fast as possible. For a more thorough comparison with version 1.0 of SKA, see the
 //! [github description](https://github.com/bacpop/ska.rust/blob/master/README.md).
 //!
-//! //! *NB As split k-mers are even lengths, it is possible that they are their
+//! *NB As split k-mers are even lengths, it is possible that they are their
 //! own reverse complement. The original version of ska would randomly pick a strand,
 //! possibly introducing a SNP across samples. This version uses an ambiguous middle
 //! base (W for A/T; S for C/G) to represent this case.*
@@ -50,6 +50,18 @@
 //! to stream into another compatible program on STDIN. You can also add an output
 //! file prefix directly with `-o` (for `ska build` this is required).
 //!
+//! ## Important notes
+//!
+//! - In this version of ska the k-mer length is the _total_ split k-mer size,
+//! whereas in ska1 the k-mer length is half the split length. So the default of
+//! k=15 in ska1 corresponds to choosing `-k 31` in this version.
+//! - If you are using FASTQ input files (reads), these must be provided as two
+//! deinterleaved files using the `-f` option to `ska build`. Providing them as
+//! a single file will treat them as FASTA, and not correct sequencing errors.
+//! - If you are running `ska map`, it is more memory efficient to run these one at
+//! a time, rather than merging a single `.skf` file. A command for doing this
+//! in parallel is listed below.
+//!
 //! ## Common options
 //!
 //! Version can be viewed by running `ska -V`.
@@ -168,6 +180,18 @@
 //! ska map ref.fa seq1.fa seq2.fa -f vcf --threads 2 | bgzip -c - > seqs.vcf.gz
 //! ```
 //!
+//! ### Efficiency
+//!
+//! As `ska map` is independent for each input file, the most efficient way to
+//! run this on a number of samples is with gnu parallel, for example:
+//! ```bash
+//! cat ${FILE_LIST} | parallel -j 8 "ID=\$(basename {} | sed 's/_a.*fasta/_temp_skf/g');
+//! ska build -o \$ID -k 31 {} ;
+//! MAPOUT=\$(basename {} | sed 's/_a.*fasta/_aln/g');
+//!  ska map $REFERENCE_LOC \${ID}.skf -o \${MAPOUT}.aln"
+//! cat *.aln > ska_map.aln
+//! ```
+//!
 //! ## ska distance
 //!
 //! Use to calculate distances between all samples within an `.skf` file. The output
@@ -342,7 +366,8 @@
 //! let min_count = 2;
 //! let update_kmers = false;
 //! let filter = FilterType::NoConst;
-//! ska_array.filter(min_count, &filter, update_kmers);
+//! let ambig_mask = false;
+//! ska_array.filter(min_count, &filter, update_kmers, ambig_mask);
 //!
 //! // Write out to stdout
 //! let mut out_stream = set_ostream(&None);
@@ -485,17 +510,18 @@ pub fn main() {
             output,
             min_freq,
             filter,
+            ambig_mask,
             threads,
         } => {
             check_threads(*threads);
             if let Ok(mut ska_array) = load_array::<u64>(input, *threads) {
                 // In debug mode (cannot be set from CLI, give details)
                 log::debug!("{ska_array}");
-                align(&mut ska_array, output, filter, *min_freq);
+                align(&mut ska_array, output, filter, *ambig_mask, *min_freq);
             } else if let Ok(mut ska_array) = load_array::<u128>(input, *threads) {
                 // In debug mode (cannot be set from CLI, give details)
                 log::debug!("{ska_array}");
-                align(&mut ska_array, output, filter, *min_freq);
+                align(&mut ska_array, output, filter, *ambig_mask, *min_freq);
             } else {
                 panic!("Could not read input file(s): {input:?}");
             }
@@ -505,10 +531,12 @@ pub fn main() {
             input,
             output,
             format,
+            ambig_mask,
             repeat_mask,
             threads,
         } => {
             check_threads(*threads);
+
             log::info!("Loading skf as dictionary");
             if let Ok(mut ska_array) = load_array::<u64>(input, *threads) {
                 log::info!(
@@ -520,6 +548,7 @@ pub fn main() {
                     ska_array.kmer_len(),
                     reference,
                     ska_array.rc(),
+                    *ambig_mask,
                     *repeat_mask,
                 );
                 map(&mut ska_array, &mut ska_ref, output, format, *threads);
@@ -533,6 +562,7 @@ pub fn main() {
                     ska_array.kmer_len(),
                     reference,
                     ska_array.rc(),
+                    *ambig_mask,
                     *repeat_mask,
                 );
                 map(&mut ska_array, &mut ska_ref, output, format, *threads);
@@ -607,6 +637,7 @@ pub fn main() {
             weed_file,
             reverse,
             min_freq,
+            ambig_mask,
             filter,
         } => {
             log::info!("Loading skf file");
@@ -617,6 +648,7 @@ pub fn main() {
                     *reverse,
                     *min_freq,
                     filter,
+                    *ambig_mask,
                     skf_file,
                 );
             } else if let Ok(mut ska_array) = MergeSkaArray::<u128>::load(skf_file) {
@@ -626,6 +658,7 @@ pub fn main() {
                     *reverse,
                     *min_freq,
                     filter,
+                    *ambig_mask,
                     skf_file,
                 );
             } else {

diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs
@@ -55,16 +55,17 @@ use crate::cli::FilterType;
 /// // Remove constant sites and save
 /// let min_count = 1;                          // no filtering by minor allele frequency
 /// let filter = FilterType::NoAmbigOrConst;    // remove sites with no minor allele
+/// let mask_ambiguous = false;                 // leave ambiguous sites as IUPAC codes
 /// let update_counts = true;                   // keep counts updated, as saving
-/// ska_array.filter(min_count, &filter, update_counts);
+/// ska_array.filter(min_count, &filter, mask_ambiguous, update_counts);
 /// ska_array.save(&"no_const_sites.skf");
 ///
 /// // Delete a sample
 /// ska_array.delete_samples(&[&"test_1"]);
 ///
 /// // Delete k-mers
 /// let mask_repeats = false;
-/// let ska_weed = RefSka::new(ska_array.kmer_len(), &"tests/test_files_in/weed.fa", ska_array.rc(), mask_repeats);
+/// let ska_weed = RefSka::new(ska_array.kmer_len(), &"tests/test_files_in/weed.fa", ska_array.rc(), mask_repeats, mask_ambiguous);
 /// let reverse = false;
 /// ska_array.weed(&ska_weed, reverse);
 /// ```
@@ -234,11 +235,18 @@ where
     ///
     /// - `min_count` -- minimum number of samples split k-mer found in.
     /// - `const_sites` -- include sites where all middle bases are the same.
+    /// - `mask_ambig` -- replace any non-ACGTUN- with N
     /// - `update_kmers` -- update counts and split k-mers after removing variants.
     ///
     /// The default for `update_kmers` should be `true`, but it can be `false`
     /// if [`write_fasta()`] will be called immediately afterwards.
-    pub fn filter(&mut self, min_count: usize, filter: &FilterType, update_kmers: bool) -> i32 {
+    pub fn filter(
+        &mut self,
+        min_count: usize,
+        filter: &FilterType,
+        mask_ambig: bool,
+        update_kmers: bool,
+    ) -> i32 {
         let total = self.names.len();
         let mut filtered_variants = Array2::zeros((0, total));
         let mut filtered_counts = Vec::new();
@@ -265,6 +273,16 @@ where
                         }
                         keep
                     }
+                    FilterType::NoAmbig => {
+                        let mut keep = true;
+                        for var in row {
+                            if is_ambiguous(*var) {
+                                keep = false;
+                                break;
+                            }
+                        }
+                        keep
+                    }
                     FilterType::NoAmbigOrConst => {
                         let mut keep = false;
                         let mut first_var = None;
@@ -300,6 +318,21 @@ where
             self.split_kmers = filtered_kmers;
         }
         log::info!("Filtering removed {removed} split k-mers");
+
+        // Replace any ambiguous variants with Ns, if requested
+        if mask_ambig {
+            let mut masked = 0;
+            self.variants.mapv_inplace(|v| {
+                if is_ambiguous(v) {
+                    masked += 1;
+                    b'N'
+                } else {
+                    v
+                }
+            });
+            log::info!("Masked {masked} ambiguous bases (non-A/C/G/T/U/N/-) with 'N'");
+        }
+
         removed
     }