From fec8babe4b4afce95639ab60aa8f33f99d16add6 Mon Sep 17 00:00:00 2001 From: John Lees Date: Tue, 20 Jun 2023 10:06:51 +0100 Subject: [PATCH 1/5] Slight improvement to double counting Doesn't change results --- Cargo.toml | 2 +- src/ska_ref.rs | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 21f5442..f93d465 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ska" -version = "0.3.0" +version = "0.3.1" authors = [ "John Lees ", "Simon Harris ", diff --git a/src/ska_ref.rs b/src/ska_ref.rs index b057cbd..9fae006 100644 --- a/src/ska_ref.rs +++ b/src/ska_ref.rs @@ -257,7 +257,11 @@ where last_end = end; } } - log::info!("Masking {} unique split k-mer repeats spanning {} bases", repeats.len(), repeat_coors.len()); + log::info!( + "Masking {} unique split k-mer repeats spanning {} bases", + repeats.len(), + repeat_coors.len() + ); } Self { @@ -278,8 +282,7 @@ where if let Vacant(rep_kmer) = repeats.entry(kmer) { match singles.entry(kmer) { Vacant(single_entry) => single_entry.insert(), - Occupied(single_entry) => { - single_entry.remove(); + Occupied(_) => { rep_kmer.insert(); } } @@ -383,7 +386,9 @@ where threads: usize, ) -> Result<(), needletail::errors::ParseError> { if self.chrom_names.len() > 1 { - log::warn!("Reference contained multiple contigs, in the output they will be concatenated"); + log::warn!( + "Reference contained multiple contigs, in the output they will be concatenated" + ); } let alignments = self.pseudoalignment(threads); From 8abef79a46ba6837162a7538744c74f89d405e44 Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 28 Jun 2023 10:38:29 +0100 Subject: [PATCH 2/5] Add k-mer counts per sample --- src/merge_ska_array.rs | 13 +++++++++++-- tests/test_results_correct/dup_ss_nk.stdout | 7 ++++--- tests/test_results_correct/k33.stdout | 7 ++++--- tests/test_results_correct/merge_nk.stdout | 7 ++++--- tests/test_results_correct/weed_nk.stdout | 7 ++++--- tests/test_results_correct/weed_nk_k41.stdout | 7 ++++--- 6 files changed, 31 insertions(+), 17 deletions(-) diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index c31feff..78f2115 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -416,6 +416,14 @@ where }) } + /// Count of number of k-mers found in each sample + pub fn n_sample_kmers(&self) -> Vec { + self.variants + .map(|v| if *v != b'-' { 1 } else { 0 }) + .sum_axis(Axis(0)) + .to_vec() + } + /// K-mer length used when builiding pub fn kmer_len(&self) -> usize { self.k @@ -487,7 +495,7 @@ where fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, - "ska_version={}\nk={}\nk_bits={}\nrc={}\n{} k-mers\n{} samples\n", + "ska_version={}\nk={}\nk_bits={}\nrc={}\nk-mers={}\nsamples={}\n", self.ska_version, self.kmer_len(), self.k_bits, @@ -495,7 +503,8 @@ where self.ksize(), self.nsamples() )?; - writeln!(f, "{:?}", self.names) + writeln!(f, "sample_names={:?}", self.names)?; + writeln!(f, "sample_kmers={:?}", self.n_sample_kmers()) } } diff --git a/tests/test_results_correct/dup_ss_nk.stdout b/tests/test_results_correct/dup_ss_nk.stdout index c62dd06..22863e4 100644 --- a/tests/test_results_correct/dup_ss_nk.stdout +++ b/tests/test_results_correct/dup_ss_nk.stdout @@ -2,9 +2,10 @@ ska_version=[..] k=9 k_bits=64 rc=false -1 k-mers -2 samples -["dup_test_1", "dup_test_2"] +k-mers=1 +samples=2 +sample_names=["dup_test_1", "dup_test_2"] +sample_kmers=[1, 1] GATT AAGG K,D diff --git a/tests/test_results_correct/k33.stdout b/tests/test_results_correct/k33.stdout index a76cf25..1dbbfdd 100644 --- a/tests/test_results_correct/k33.stdout +++ b/tests/test_results_correct/k33.stdout @@ -2,7 +2,8 @@ ska_version=[..] k=33 k_bits=128 rc=true -58 k-mers -2 samples -["test_1", "test_2"] +k-mers=58 +samples=2 +sample_names=["test_1", "test_2"] +sample_kmers=[30, 30] diff --git a/tests/test_results_correct/merge_nk.stdout b/tests/test_results_correct/merge_nk.stdout index 030c6a1..ad8854a 100644 --- a/tests/test_results_correct/merge_nk.stdout +++ b/tests/test_results_correct/merge_nk.stdout @@ -2,7 +2,8 @@ ska_version=[..] k=17 k_bits=64 rc=true -78 k-mers -2 samples -["test_1", "test_2"] +k-mers=78 +samples=2 +sample_names=["test_1", "test_2"] +sample_kmers=[46, 46] diff --git a/tests/test_results_correct/weed_nk.stdout b/tests/test_results_correct/weed_nk.stdout index 5a60d34..fa397cd 100644 --- a/tests/test_results_correct/weed_nk.stdout +++ b/tests/test_results_correct/weed_nk.stdout @@ -2,9 +2,10 @@ ska_version=[..] k=17 k_bits=64 rc=true -1 k-mers -2 samples -["test_1", "test_2"] +k-mers=1 +samples=2 +sample_names=["test_1", "test_2"] +sample_kmers=[1, 1] AAAACACT TTAAAAGA C,T diff --git a/tests/test_results_correct/weed_nk_k41.stdout b/tests/test_results_correct/weed_nk_k41.stdout index 703c8f1..9376b45 100644 --- a/tests/test_results_correct/weed_nk_k41.stdout +++ b/tests/test_results_correct/weed_nk_k41.stdout @@ -2,9 +2,10 @@ ska_version=[..] k=41 k_bits=128 rc=true -1 k-mers -2 samples -["test_1", "test_2"] +k-mers=1 +samples=2 +sample_names=["test_1", "test_2"] +sample_kmers=[1, 1] AAAAGACTCTCGTACAGCTA ATATCATAAACGCCTTCAAA A,T From 2f36b31561be3db932af4d435395cc962ab1c76c Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 28 Jun 2023 10:45:41 +0100 Subject: [PATCH 3/5] Correct homepage URL --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index f93d465..d211c9c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ authors = [ edition = "2021" description = "Split k-mer analysis" repository = "https://github.com/bacpop/ska.rust/" -homepage = "https://bacpop.com/software/" +homepage = "https://bacpop.org/software/" license = "Apache-2.0" readme = "README.md" include = [ From c456429619222540d0eaedb443e7878a306791a3 Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 28 Jun 2023 10:50:25 +0100 Subject: [PATCH 4/5] Update docs for ska nk --- src/lib.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 179418f..f3dc12b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -228,20 +228,31 @@ //! You can do this by removing any constant sites or ambiguous-only s0tes (which are typically unused), and by hard-filtering //! by frequency (i.e. before writing output): //! ```bash -//! ska weed --filter NoAmbigOrConst --min-freq 0.9 all_samples.skf +//! ska weed --filter no-ambig-or-const --min-freq 0.9 all_samples.skf //! ``` //! //! ## ska nk //! Return information on the *n*umber of *k*-mers in an `.skf` file. This will //! print on STDOUT: +//! - The version of ska used to build the file. //! - The k-mer size. +//! - Number of bits used for the split k-mer (64 or 128). //! - Whether reverse complements were used. -//! - The number of split k-mers. -//! - The number of samples. +//! - The total number of split k-mers. +//! - The total number of samples. //! - The sample names. +//! - The number of split k-mers found in each sample. //! //! ```bash //! ska nk all_samples.skf +//! ska_version=0.3.1 +//! k=21 +//! k_bits=64 +//! rc=true +//! k-mers=3228084 +//! samples=28 +//! sample_names=["19183_4#73", "12673_8#29", "12673_8#31", "12754_5#61", "12754_5#89", "12754_5#47", "12754_5#32", "12754_5#78", "12754_4#85", "12754_5#74", "19183_4#57", "12754_5#36", "19183_4#49", "19183_4#79", "19183_4#60", "12754_5#24", "12754_5#22", "12754_5#71", "12673_8#26", "12754_5#95", "12754_5#86", "12673_8#24", "19183_4#61", "12673_8#41", "12754_4#66", "12754_5#80", "12754_5#84", "19183_4#78"] +//! sample_kmers=[2872587, 2997448, 2949719, 2997496, 2997178, 2912749, 2996491, 2997221, 2949102, 2997454, 2914109, 2912237, 2872518, 2869957, 2872470, 2997992, 2997647, 2958512, 2998099, 2997290, 2950253, 3027707, 2997881, 2907920, 2911447, 2997644, 2944830, 2915080] //! ``` //! //! If you add `--full-info`, the split k-mer dictionary will be decoded and printed From d54ac017cd7d432f6f22cf20a5d813f35ece1491 Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 28 Jun 2023 10:55:24 +0100 Subject: [PATCH 5/5] Add notes on palindromes --- README.md | 5 +++-- src/lib.rs | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0ffeab3..c0f4a69 100644 --- a/README.md +++ b/README.md @@ -51,12 +51,13 @@ Optimisations include: - Integer DNA encoding, optimised parsing from FASTA/FASTQ. - Faster dictionaries. - Full parallelisation of build phase. -- Smaller, standardised input/output files. +- Smaller, standardised input/output files. Faster to save/load. - Reduced memory footprint with read filtering. And other improvements: -- IUPAC uncertainty codes for multiple copy k-mers. +- IUPAC uncertainty codes for multiple copy split k-mers. +- Uncertainty with self-reverse-complement split k-mers (palindromes). - Fully dynamic files (merge, delete samples). - Native VCF output for map. - Support for known strand sequence (e.g. RNA viruses). diff --git a/src/lib.rs b/src/lib.rs index f3dc12b..c0fb53d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,6 +28,11 @@ //! Various optimisations are used to make this as fast as possible. For a more thorough comparison with version 1.0 of SKA, see the //! [github description](https://github.com/bacpop/ska.rust/blob/master/README.md). //! +//! //! *NB As split k-mers are even lengths, it is possible that they are their +//! own reverse complement. The original version of ska would randomly pick a strand, +//! possibly introducing a SNP across samples. This version uses an ambiguous middle +//! base (W for A/T; S for C/G) to represent this case.* +//! //! Command line usage follows. For API documentation and usage, see the [end of this section](#api-usage). //! //! # Usage