Skip to content

Commit

Permalink
Change min freq in distance default to 0.9
Browse files Browse the repository at this point in the history
  • Loading branch information
johnlees committed Sep 25, 2024
1 parent f26e7d8 commit 77532e2
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 17 deletions.
6 changes: 4 additions & 2 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ pub const DEFAULT_KMER: usize = 17;
pub const DEFAULT_PROPORTION_READS: Option<f64> = None;
/// Default single strand (which is equivalent to !rc)
pub const DEFAULT_STRAND: bool = false;
/// Default minimum frequency filter threshold
pub const DEFAULT_MINFREQ: f64 = 0.9;
/// Default behaviour for counting ambiguous sites when applying min-freq
pub const DEFAULT_AMBIGMISSING: bool = false;
/// Default repeat masking behaviour
Expand Down Expand Up @@ -191,7 +193,7 @@ pub enum Commands {
output: Option<String>,

/// Minimum fraction of samples a k-mer has to appear in
#[arg(short, long, value_parser = zero_to_one, default_value_t = 0.9)]
#[arg(short, long, value_parser = zero_to_one, default_value_t = DEFAULT_MINFREQ)]
min_freq: f64,

/// With min_freq, only count non-ambiguous sites
Expand Down Expand Up @@ -312,7 +314,7 @@ pub enum Commands {
reverse: bool,

/// Minimum fraction of samples a k-mer has to appear in
#[arg(short, long, value_parser = zero_to_one, default_value_t = 0.0)]
#[arg(short, long, value_parser = zero_to_one, default_value_t = DEFAULT_MINFREQ)]
min_freq: f64,

/// With min_freq, only count non-ambiguous sites
Expand Down
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,8 @@
//! ska distance -o distances.txt seqs.skf
//! ```
//!
//! Consider ambiguous bases by adding `--allow-ambiguous` flag, and `--min-freq` to
//! ignore k-mers only found in some samples. Note that ambiguous bases may overestimate
//! Consider ambiguous bases by adding the `--allow-ambiguous` flag, and change `--min-freq` to
//! ignore more or fewer of the k-mers found in only some samples (default = 0.9). Note that ambiguous bases may overestimate
//! distances due to repeat k-mers. For finer control over filtering, first run `ska weed`
//! on the input .skf.
//!
Expand Down
6 changes: 4 additions & 2 deletions tests/distance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,22 +97,24 @@ fn multisample_dists() {
.assert()
.success();

// Test with filters off
Command::new(cargo_bin("ska"))
.current_dir(sandbox.get_wd())
.arg("distance")
.arg("multidist.skf")
.arg("-v")
.args(&["--min-freq", "0"])
.arg("--allow-ambiguous")
.args(&["--threads", "2"])
.assert()
.stdout_eq_path(sandbox.file_string("multidist.stdout", TestDir::Correct));

// Test with default filters
Command::new(cargo_bin("ska"))
.current_dir(sandbox.get_wd())
.arg("distance")
.arg("multidist.skf")
.arg("-v")
.arg("--allow-ambiguous")
.args(&["--min-freq", "0.5"])
.assert()
.stdout_eq_path(sandbox.file_string("multidist.filter.stdout", TestDir::Correct));
}
14 changes: 7 additions & 7 deletions tests/test_results_correct/multidist.filter.stdout
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
Sample1 Sample2 Distance Mismatches
N_test_1 N_test_2 2.00 0.22222
N_test_1 N_test_2 2.00 0.51724
N_test_1 ambig_test_1 0.00 1.00000
N_test_1 ambig_test_2 0.00 1.00000
N_test_1 test_1 0.50 0.08333
N_test_1 test_2 1.00 0.05556
N_test_1 test_1 1.00 0.25455
N_test_1 test_2 1.00 0.42623
N_test_2 ambig_test_1 0.00 1.00000
N_test_2 ambig_test_2 0.00 1.00000
N_test_2 test_1 1.50 0.19444
N_test_2 test_2 1.00 0.16667
ambig_test_1 ambig_test_2 0.00 0.00000
N_test_2 test_1 2.00 0.56716
N_test_2 test_2 1.00 0.28571
ambig_test_1 ambig_test_2 1.00 0.44444
ambig_test_1 test_1 0.00 1.00000
ambig_test_1 test_2 0.00 1.00000
ambig_test_2 test_1 0.00 1.00000
ambig_test_2 test_2 0.00 1.00000
test_1 test_2 1.50 0.02778
test_1 test_2 3.00 0.44118
8 changes: 4 additions & 4 deletions tests/test_results_correct/multidist.stdout
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@ Sample1 Sample2 Distance Mismatches
N_test_1 N_test_2 2.00 0.51724
N_test_1 ambig_test_1 0.00 1.00000
N_test_1 ambig_test_2 0.00 1.00000
N_test_1 test_1 1.00 0.25455
N_test_1 test_1 0.50 0.25455
N_test_1 test_2 1.00 0.42623
N_test_2 ambig_test_1 0.00 1.00000
N_test_2 ambig_test_2 0.00 1.00000
N_test_2 test_1 2.00 0.56716
N_test_2 test_1 1.50 0.56716
N_test_2 test_2 1.00 0.28571
ambig_test_1 ambig_test_2 1.00 0.44444
ambig_test_1 ambig_test_2 0.50 0.44444
ambig_test_1 test_1 0.00 1.00000
ambig_test_1 test_2 0.00 1.00000
ambig_test_2 test_1 0.00 1.00000
ambig_test_2 test_2 0.00 1.00000
test_1 test_2 3.00 0.44118
test_1 test_2 2.50 0.44118

0 comments on commit 77532e2

Please sign in to comment.