Skip to content

Commit

Permalink
Merge pull request #76 from bacpop/cov_fix
Browse files Browse the repository at this point in the history
Fix for ska cov with low counts
  • Loading branch information
johnlees authored Jul 23, 2024
2 parents 3a881c6 + d3f61b1 commit 28bca4a
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 33 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "ska"
version = "0.3.8"
version = "0.3.9"
authors = [
"John Lees <[email protected]>",
"Simon Harris <[email protected]>",
Expand Down
9 changes: 6 additions & 3 deletions scripts/plot_cov.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@
import matplotlib.pyplot as plt

import argparse
from math import exp

# Space at the top
y_padding = 1.1

def norm_series(series: list()):
def norm_series(series: list):
norm = max(series)
series = [item / norm for item in series]
return series
Expand Down Expand Up @@ -54,7 +57,7 @@ def main():

ax1.set_xlabel("K-mer count")
ax1.set_ylabel("Frequency")
ax1.set_ylim(0, k_norm[1])
ax1.set_ylim(0, max(k_norm[1:]) * y_padding)
ax1.plot(
idx_xseries, k_norm, color="black", linewidth=2, label="K-mer count frequency"
)
Expand All @@ -79,7 +82,7 @@ def main():
ax2.set_yscale("log")
ax2.set_xlabel("K-mer count")
ax2.set_ylabel("log(Frequency)")
ax2.set_ylim(min(k_norm), k_norm[1])
ax2.set_ylim(min(k_norm), max(k_norm[1:]) * exp(y_padding))
ax2.plot(
idx_xseries, k_norm, color="black", linewidth=2, label="K-mer count frequency"
)
Expand Down
30 changes: 14 additions & 16 deletions src/coverage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ where
///
/// # Panics
/// - If the fit has already been run
#[allow(clippy::map_clone)]
pub fn fit_histogram(&mut self) -> Result<usize, Error> {
if self.fitted {
panic!("Model already fitted");
Expand All @@ -163,15 +164,15 @@ where
}

// Truncate count vec and covert to float
let mut counts_f64: Vec<f64> = Vec::new();
for hist_bin in &self.counts {
if *hist_bin < MIN_FREQ {
break;
} else {
counts_f64.push(*hist_bin as f64);
}
}
let count_len = counts_f64.len();
self.counts = self
.counts
.iter()
.rev()
.skip_while(|x| **x < MIN_FREQ)
.map(|x| *x)
.collect();
self.counts.reverse();
let counts_f64: Vec<f64> = self.counts.iter().map(|x| *x as f64).collect();

// Fit with maximum likelihood. Using BFGS optimiser and simple line search
// seems to work fine
Expand All @@ -182,13 +183,13 @@ where
// and it gave very poor results for the c optimisation
let init_hessian: Vec<Vec<f64>> = vec![vec![1.0, 0.0], vec![0.0, 1.0]];
let linesearch = BacktrackingLineSearch::new(ArmijoCondition::new(0.0001f64)?);
let solver = BFGS::new(linesearch);
let solver = BFGS::new(linesearch).with_tolerance_cost(1e-6)?;
// Usually around 10 iterations should be enough
let mut exec = Executor::new(mixture_fit, solver).configure(|state| {
state
.param(init_param)
.inv_hessian(init_hessian)
.max_iters(100)
.max_iters(20)
});
if self.verbose {
exec = exec.add_observer(SlogLogger::term(), ObserverMode::Always);
Expand All @@ -205,7 +206,7 @@ where
self.c = best[1];

// calculate the coverage cutoff
self.cutoff = find_cutoff(best, count_len);
self.cutoff = find_cutoff(best, self.counts.len());
self.fitted = true;
Ok(self.cutoff)
} else {
Expand All @@ -232,13 +233,10 @@ where
log::info!("Calculating and printing count series");
println!("Count\tK_mers\tMixture_density\tComponent");
for (idx, count) in self.counts.iter().enumerate() {
if *count < MIN_FREQ {
break;
}
println!(
"{}\t{}\t{:e}\t{}",
idx + 1,
*count,
*count as usize,
f64::exp(lse(
a(self.w0, idx as f64 + 1.0),
b(self.w0, self.c, idx as f64 + 1.0)
Expand Down
25 changes: 13 additions & 12 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,14 @@
//! ## Important notes
//!
//! - In this version of ska the k-mer length is the _total_ split k-mer size,
//! whereas in ska1 the k-mer length is half the split length. So the default of
//! k=15 in ska1 corresponds to choosing `-k 31` in this version.
//! whereas in ska1 the k-mer length is half the split length. So the default of
//! k=15 in ska1 corresponds to choosing `-k 31` in this version.
//! - If you are using FASTQ input files (reads), these must be provided as two
//! deinterleaved files using the `-f` option to `ska build`. Providing them as
//! a single file will treat them as FASTA, and not correct sequencing errors.
//! deinterleaved files using the `-f` option to `ska build`. Providing them as
//! a single file will treat them as FASTA, and not correct sequencing errors.
//! - If you are running `ska map`, it is more memory efficient to run these one at
//! a time, rather than merging a single `.skf` file. A command for doing this
//! in parallel is listed below.
//! a time, rather than merging a single `.skf` file. A command for doing this
//! in parallel is listed below.
//!
//! ## Common options
//!
Expand Down Expand Up @@ -107,14 +107,15 @@
//! ```
//! Options for filtering/error correction are:
//! - `--min-count`. Specify a minimum number of appearances a k-mer must have
//! to be included. This is an effective way of filtering sequencing errors if set
//! to at least three, but higher may be appropriate for higher coverage data.
//! A two-step blocked bloom and hash table filter is used for memory efficiency.
//! to be included. This is an effective way of filtering sequencing errors if set
//! to at least three, but higher may be appropriate for higher coverage data.
//! A two-step blocked bloom and hash table filter is used for memory efficiency.
//! - `--proportion-reads`. Specify a proportion of reads to use to build the .skf file.
//! The value of this parameter must be between 0 and 1.
//! The value of this parameter must be between 0 and 1. If you have high coverage samples
//! this can be used to reduce the build time.
//! - `--qual-filter`. `none` do not filter based on quality scores.
//! `middle` (default) filter k-mers where the middle base is below the minimum quality.
//! `strict` filter k-mers where any base is below the minimum quality.
//! `middle` (default) filter k-mers where the middle base is below the minimum quality.
//! `strict` filter k-mers where any base is below the minimum quality.
//! - `--min-qual`. Specify a minimum PHRED score to use in the filter.
//!
//! FASTQ files must be paired end. If you'd like to request more flexibility in
Expand Down
2 changes: 1 addition & 1 deletion src/merge_ska_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ where
/// # Arguments
///
/// - `constant` – the number of prefiltered constant bases, used to adjust
/// the denominator of mismatch proportion
/// the denominator of mismatch proportion
pub fn distance(&self, constant: f64) -> Vec<Vec<(f64, f64)>> {
let mut distances: Vec<Vec<(f64, f64)>> = Vec::new();
self.variants
Expand Down

0 comments on commit 28bca4a

Please sign in to comment.