Merge pull request #40 from bacpop/0.3.1-candidate

Add k-mer counts per sample to ska nk; efficiency of dup counting
bacpop · Jun 28, 2023 · 93b7ecf · 93b7ecf
2 parents 62b752c + d54ac01
commit 93b7ecf
Show file tree

Hide file tree

Showing 10 changed files with 64 additions and 28 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "ska"
-version = "0.3.0"
+version = "0.3.1"
 authors = [
     "John Lees <[email protected]>",
     "Simon Harris <[email protected]>",
@@ -12,7 +12,7 @@ authors = [
 edition = "2021"
 description = "Split k-mer analysis"
 repository = "https://github.com/bacpop/ska.rust/"
-homepage = "https://bacpop.com/software/"
+homepage = "https://bacpop.org/software/"
 license = "Apache-2.0"
 readme = "README.md"
 include = [

diff --git a/README.md b/README.md
@@ -51,12 +51,13 @@ Optimisations include:
 - Integer DNA encoding, optimised parsing from FASTA/FASTQ.
 - Faster dictionaries.
 - Full parallelisation of build phase.
-- Smaller, standardised input/output files.
+- Smaller, standardised input/output files. Faster to save/load.
 - Reduced memory footprint with read filtering.
 
 And other improvements:
 
-- IUPAC uncertainty codes for multiple copy k-mers.
+- IUPAC uncertainty codes for multiple copy split k-mers.
+- Uncertainty with self-reverse-complement split k-mers (palindromes).
 - Fully dynamic files (merge, delete samples).
 - Native VCF output for map.
 - Support for known strand sequence (e.g. RNA viruses).

diff --git a/src/lib.rs b/src/lib.rs
@@ -28,6 +28,11 @@
 //! Various optimisations are used to make this as fast as possible. For a more thorough comparison with version 1.0 of SKA, see the
 //! [github description](https://github.com/bacpop/ska.rust/blob/master/README.md).
 //!
+//! //! *NB As split k-mers are even lengths, it is possible that they are their
+//! own reverse complement. The original version of ska would randomly pick a strand,
+//! possibly introducing a SNP across samples. This version uses an ambiguous middle
+//! base (W for A/T; S for C/G) to represent this case.*
+//!
 //! Command line usage follows. For API documentation and usage, see the [end of this section](#api-usage).
 //!
 //! # Usage
@@ -228,20 +233,31 @@
 //! You can do this by removing any constant sites or ambiguous-only s0tes (which are typically unused), and by hard-filtering
 //! by frequency (i.e. before writing output):
 //! ```bash
-//! ska weed --filter NoAmbigOrConst --min-freq 0.9 all_samples.skf
+//! ska weed --filter no-ambig-or-const --min-freq 0.9 all_samples.skf
 //! ```
 //!
 //! ## ska nk
 //! Return information on the *n*umber of *k*-mers in an `.skf` file. This will
 //! print on STDOUT:
+//! - The version of ska used to build the file.
 //! - The k-mer size.
+//! - Number of bits used for the split k-mer (64 or 128).
 //! - Whether reverse complements were used.
-//! - The number of split k-mers.
-//! - The number of samples.
+//! - The total number of split k-mers.
+//! - The total number of samples.
 //! - The sample names.
+//! - The number of split k-mers found in each sample.
 //!
 //! ```bash
 //! ska nk all_samples.skf
+//! ska_version=0.3.1
+//! k=21
+//! k_bits=64
+//! rc=true
+//! k-mers=3228084
+//! samples=28
+//! sample_names=["19183_4#73", "12673_8#29", "12673_8#31", "12754_5#61", "12754_5#89", "12754_5#47", "12754_5#32", "12754_5#78", "12754_4#85", "12754_5#74", "19183_4#57", "12754_5#36", "19183_4#49", "19183_4#79", "19183_4#60", "12754_5#24", "12754_5#22", "12754_5#71", "12673_8#26", "12754_5#95", "12754_5#86", "12673_8#24", "19183_4#61", "12673_8#41", "12754_4#66", "12754_5#80", "12754_5#84", "19183_4#78"]
+//! sample_kmers=[2872587, 2997448, 2949719, 2997496, 2997178, 2912749, 2996491, 2997221, 2949102, 2997454, 2914109, 2912237, 2872518, 2869957, 2872470, 2997992, 2997647, 2958512, 2998099, 2997290, 2950253, 3027707, 2997881, 2907920, 2911447, 2997644, 2944830, 2915080]
 //! ```
 //!
 //! If you add `--full-info`, the split k-mer dictionary will be decoded and printed

diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs
@@ -416,6 +416,14 @@ where
             })
     }
 
+    /// Count of number of k-mers found in each sample
+    pub fn n_sample_kmers(&self) -> Vec<i32> {
+        self.variants
+            .map(|v| if *v != b'-' { 1 } else { 0 })
+            .sum_axis(Axis(0))
+            .to_vec()
+    }
+
     /// K-mer length used when builiding
     pub fn kmer_len(&self) -> usize {
         self.k
@@ -487,15 +495,16 @@ where
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(
             f,
-            "ska_version={}\nk={}\nk_bits={}\nrc={}\n{} k-mers\n{} samples\n",
+            "ska_version={}\nk={}\nk_bits={}\nrc={}\nk-mers={}\nsamples={}\n",
             self.ska_version,
             self.kmer_len(),
             self.k_bits,
             self.rc(),
             self.ksize(),
             self.nsamples()
         )?;
-        writeln!(f, "{:?}", self.names)
+        writeln!(f, "sample_names={:?}", self.names)?;
+        writeln!(f, "sample_kmers={:?}", self.n_sample_kmers())
     }
 }
 

diff --git a/src/ska_ref.rs b/src/ska_ref.rs
@@ -257,7 +257,11 @@ where
                     last_end = end;
                 }
             }
-            log::info!("Masking {} unique split k-mer repeats spanning {} bases", repeats.len(), repeat_coors.len());
+            log::info!(
+                "Masking {} unique split k-mer repeats spanning {} bases",
+                repeats.len(),
+                repeat_coors.len()
+            );
         }
 
         Self {
@@ -278,8 +282,7 @@ where
         if let Vacant(rep_kmer) = repeats.entry(kmer) {
             match singles.entry(kmer) {
                 Vacant(single_entry) => single_entry.insert(),
-                Occupied(single_entry) => {
-                    single_entry.remove();
+                Occupied(_) => {
                     rep_kmer.insert();
                 }
             }
@@ -383,7 +386,9 @@ where
         threads: usize,
     ) -> Result<(), needletail::errors::ParseError> {
         if self.chrom_names.len() > 1 {
-            log::warn!("Reference contained multiple contigs, in the output they will be concatenated");
+            log::warn!(
+                "Reference contained multiple contigs, in the output they will be concatenated"
+            );
         }
 
         let alignments = self.pseudoalignment(threads);

diff --git a/tests/test_results_correct/dup_ss_nk.stdout b/tests/test_results_correct/dup_ss_nk.stdout
@@ -2,9 +2,10 @@ ska_version=[..]
 k=9
 k_bits=64
 rc=false
-1 k-mers
-2 samples
-["dup_test_1", "dup_test_2"]
+k-mers=1
+samples=2
+sample_names=["dup_test_1", "dup_test_2"]
+sample_kmers=[1, 1]
 
 GATT	AAGG	K,D
 
diff --git a/tests/test_results_correct/k33.stdout b/tests/test_results_correct/k33.stdout
@@ -2,7 +2,8 @@ ska_version=[..]
 k=33
 k_bits=128
 rc=true
-58 k-mers
-2 samples
-["test_1", "test_2"]
+k-mers=58
+samples=2
+sample_names=["test_1", "test_2"]
+sample_kmers=[30, 30]
 
diff --git a/tests/test_results_correct/merge_nk.stdout b/tests/test_results_correct/merge_nk.stdout
@@ -2,7 +2,8 @@ ska_version=[..]
 k=17
 k_bits=64
 rc=true
-78 k-mers
-2 samples
-["test_1", "test_2"]
+k-mers=78
+samples=2
+sample_names=["test_1", "test_2"]
+sample_kmers=[46, 46]
 
diff --git a/tests/test_results_correct/weed_nk.stdout b/tests/test_results_correct/weed_nk.stdout
@@ -2,9 +2,10 @@ ska_version=[..]
 k=17
 k_bits=64
 rc=true
-1 k-mers
-2 samples
-["test_1", "test_2"]
+k-mers=1
+samples=2
+sample_names=["test_1", "test_2"]
+sample_kmers=[1, 1]
 
 AAAACACT	TTAAAAGA	C,T
 
diff --git a/tests/test_results_correct/weed_nk_k41.stdout b/tests/test_results_correct/weed_nk_k41.stdout
@@ -2,9 +2,10 @@ ska_version=[..]
 k=41
 k_bits=128
 rc=true
-1 k-mers
-2 samples
-["test_1", "test_2"]
+k-mers=1
+samples=2
+sample_names=["test_1", "test_2"]
+sample_kmers=[1, 1]
 
 AAAAGACTCTCGTACAGCTA	ATATCATAAACGCCTTCAAA	A,T