Merge pull request #76 from bacpop/cov_fix

Fix for ska cov with low counts
bacpop · Jul 23, 2024 · 28bca4a · 28bca4a
2 parents 3a881c6 + d3f61b1
commit 28bca4a
Show file tree

Hide file tree

Showing 5 changed files with 35 additions and 33 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "ska"
-version = "0.3.8"
+version = "0.3.9"
 authors = [
     "John Lees <[email protected]>",
     "Simon Harris <[email protected]>",

diff --git a/scripts/plot_cov.py b/scripts/plot_cov.py
@@ -8,9 +8,12 @@
 import matplotlib.pyplot as plt
 
 import argparse
+from math import exp
 
+# Space at the top
+y_padding = 1.1
 
-def norm_series(series: list()):
+def norm_series(series: list):
     norm = max(series)
     series = [item / norm for item in series]
     return series
@@ -54,7 +57,7 @@ def main():
 
     ax1.set_xlabel("K-mer count")
     ax1.set_ylabel("Frequency")
-    ax1.set_ylim(0, k_norm[1])
+    ax1.set_ylim(0, max(k_norm[1:]) * y_padding)
     ax1.plot(
         idx_xseries, k_norm, color="black", linewidth=2, label="K-mer count frequency"
     )
@@ -79,7 +82,7 @@ def main():
     ax2.set_yscale("log")
     ax2.set_xlabel("K-mer count")
     ax2.set_ylabel("log(Frequency)")
-    ax2.set_ylim(min(k_norm), k_norm[1])
+    ax2.set_ylim(min(k_norm), max(k_norm[1:]) * exp(y_padding))
     ax2.plot(
         idx_xseries, k_norm, color="black", linewidth=2, label="K-mer count frequency"
     )

diff --git a/src/coverage.rs b/src/coverage.rs
@@ -148,6 +148,7 @@ where
     ///
     /// # Panics
     /// - If the fit has already been run
+    #[allow(clippy::map_clone)]
     pub fn fit_histogram(&mut self) -> Result<usize, Error> {
         if self.fitted {
             panic!("Model already fitted");
@@ -163,15 +164,15 @@ where
         }
 
         // Truncate count vec and covert to float
-        let mut counts_f64: Vec<f64> = Vec::new();
-        for hist_bin in &self.counts {
-            if *hist_bin < MIN_FREQ {
-                break;
-            } else {
-                counts_f64.push(*hist_bin as f64);
-            }
-        }
-        let count_len = counts_f64.len();
+        self.counts = self
+            .counts
+            .iter()
+            .rev()
+            .skip_while(|x| **x < MIN_FREQ)
+            .map(|x| *x)
+            .collect();
+        self.counts.reverse();
+        let counts_f64: Vec<f64> = self.counts.iter().map(|x| *x as f64).collect();
 
         // Fit with maximum likelihood. Using BFGS optimiser and simple line search
         // seems to work fine
@@ -182,13 +183,13 @@ where
         // and it gave very poor results for the c optimisation
         let init_hessian: Vec<Vec<f64>> = vec![vec![1.0, 0.0], vec![0.0, 1.0]];
         let linesearch = BacktrackingLineSearch::new(ArmijoCondition::new(0.0001f64)?);
-        let solver = BFGS::new(linesearch);
+        let solver = BFGS::new(linesearch).with_tolerance_cost(1e-6)?;
         // Usually around 10 iterations should be enough
         let mut exec = Executor::new(mixture_fit, solver).configure(|state| {
             state
                 .param(init_param)
                 .inv_hessian(init_hessian)
-                .max_iters(100)
+                .max_iters(20)
         });
         if self.verbose {
             exec = exec.add_observer(SlogLogger::term(), ObserverMode::Always);
@@ -205,7 +206,7 @@ where
                 self.c = best[1];
 
                 // calculate the coverage cutoff
-                self.cutoff = find_cutoff(best, count_len);
+                self.cutoff = find_cutoff(best, self.counts.len());
                 self.fitted = true;
                 Ok(self.cutoff)
             } else {
@@ -232,13 +233,10 @@ where
         log::info!("Calculating and printing count series");
         println!("Count\tK_mers\tMixture_density\tComponent");
         for (idx, count) in self.counts.iter().enumerate() {
-            if *count < MIN_FREQ {
-                break;
-            }
             println!(
                 "{}\t{}\t{:e}\t{}",
                 idx + 1,
-                *count,
+                *count as usize,
                 f64::exp(lse(
                     a(self.w0, idx as f64 + 1.0),
                     b(self.w0, self.c, idx as f64 + 1.0)

diff --git a/src/lib.rs b/src/lib.rs
@@ -56,14 +56,14 @@
 //! ## Important notes
 //!
 //! - In this version of ska the k-mer length is the _total_ split k-mer size,
-//! whereas in ska1 the k-mer length is half the split length. So the default of
-//! k=15 in ska1 corresponds to choosing `-k 31` in this version.
+//!     whereas in ska1 the k-mer length is half the split length. So the default of
+//!     k=15 in ska1 corresponds to choosing `-k 31` in this version.
 //! - If you are using FASTQ input files (reads), these must be provided as two
-//! deinterleaved files using the `-f` option to `ska build`. Providing them as
-//! a single file will treat them as FASTA, and not correct sequencing errors.
+//!     deinterleaved files using the `-f` option to `ska build`. Providing them as
+//!     a single file will treat them as FASTA, and not correct sequencing errors.
 //! - If you are running `ska map`, it is more memory efficient to run these one at
-//! a time, rather than merging a single `.skf` file. A command for doing this
-//! in parallel is listed below.
+//!     a time, rather than merging a single `.skf` file. A command for doing this
+//!     in parallel is listed below.
 //!
 //! ## Common options
 //!
@@ -107,14 +107,15 @@
 //! ```
 //! Options for filtering/error correction are:
 //! - `--min-count`. Specify a minimum number of appearances a k-mer must have
-//! to be included. This is an effective way of filtering sequencing errors if set
-//! to at least three, but higher may be appropriate for higher coverage data.
-//! A two-step blocked bloom and hash table filter is used for memory efficiency.
+//!     to be included. This is an effective way of filtering sequencing errors if set
+//!     to at least three, but higher may be appropriate for higher coverage data.
+//!     A two-step blocked bloom and hash table filter is used for memory efficiency.
 //! - `--proportion-reads`. Specify a proportion of reads to use to build the .skf file.
-//! The value of this parameter must be between 0 and 1.
+//!     The value of this parameter must be between 0 and 1. If you have high coverage samples
+//!     this can be used to reduce the build time.
 //! - `--qual-filter`. `none` do not filter based on quality scores.
-//! `middle` (default) filter k-mers where the middle base is below the minimum quality.
-//! `strict` filter k-mers where any base is below the minimum quality.
+//!     `middle` (default) filter k-mers where the middle base is below the minimum quality.
+//!     `strict` filter k-mers where any base is below the minimum quality.
 //! - `--min-qual`. Specify a minimum PHRED score to use in the filter.
 //!
 //! FASTQ files must be paired end. If you'd like to request more flexibility in

diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs
@@ -379,7 +379,7 @@ where
     /// # Arguments
     ///
     /// - `constant` – the number of prefiltered constant bases, used to adjust
-    /// the denominator of mismatch proportion
+    ///     the denominator of mismatch proportion
     pub fn distance(&self, constant: f64) -> Vec<Vec<(f64, f64)>> {
         let mut distances: Vec<Vec<(f64, f64)>> = Vec::new();
         self.variants