From fec8babe4b4afce95639ab60aa8f33f99d16add6 Mon Sep 17 00:00:00 2001
From: John Lees <lees.john6@gmail.com>
Date: Tue, 20 Jun 2023 10:06:51 +0100
Subject: [PATCH 1/5] Slight improvement to double counting

Doesn't change results
---
 Cargo.toml     |  2 +-
 src/ska_ref.rs | 13 +++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 21f5442..f93d465 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "ska"
-version = "0.3.0"
+version = "0.3.1"
 authors = [
     "John Lees <jlees@ebi.ac.uk>",
     "Simon Harris <simon.harris@gatesfoundation.org>",
diff --git a/src/ska_ref.rs b/src/ska_ref.rs
index b057cbd..9fae006 100644
--- a/src/ska_ref.rs
+++ b/src/ska_ref.rs
@@ -257,7 +257,11 @@ where
                     last_end = end;
                 }
             }
-            log::info!("Masking {} unique split k-mer repeats spanning {} bases", repeats.len(), repeat_coors.len());
+            log::info!(
+                "Masking {} unique split k-mer repeats spanning {} bases",
+                repeats.len(),
+                repeat_coors.len()
+            );
         }
 
         Self {
@@ -278,8 +282,7 @@ where
         if let Vacant(rep_kmer) = repeats.entry(kmer) {
             match singles.entry(kmer) {
                 Vacant(single_entry) => single_entry.insert(),
-                Occupied(single_entry) => {
-                    single_entry.remove();
+                Occupied(_) => {
                     rep_kmer.insert();
                 }
             }
@@ -383,7 +386,9 @@ where
         threads: usize,
     ) -> Result<(), needletail::errors::ParseError> {
         if self.chrom_names.len() > 1 {
-            log::warn!("Reference contained multiple contigs, in the output they will be concatenated");
+            log::warn!(
+                "Reference contained multiple contigs, in the output they will be concatenated"
+            );
         }
 
         let alignments = self.pseudoalignment(threads);

From 8abef79a46ba6837162a7538744c74f89d405e44 Mon Sep 17 00:00:00 2001
From: John Lees <lees.john6@gmail.com>
Date: Wed, 28 Jun 2023 10:38:29 +0100
Subject: [PATCH 2/5] Add k-mer counts per sample

---
 src/merge_ska_array.rs                        | 13 +++++++++++--
 tests/test_results_correct/dup_ss_nk.stdout   |  7 ++++---
 tests/test_results_correct/k33.stdout         |  7 ++++---
 tests/test_results_correct/merge_nk.stdout    |  7 ++++---
 tests/test_results_correct/weed_nk.stdout     |  7 ++++---
 tests/test_results_correct/weed_nk_k41.stdout |  7 ++++---
 6 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs
index c31feff..78f2115 100644
--- a/src/merge_ska_array.rs
+++ b/src/merge_ska_array.rs
@@ -416,6 +416,14 @@ where
             })
     }
 
+    /// Count of number of k-mers found in each sample
+    pub fn n_sample_kmers(&self) -> Vec<i32> {
+        self.variants
+            .map(|v| if *v != b'-' { 1 } else { 0 })
+            .sum_axis(Axis(0))
+            .to_vec()
+    }
+
     /// K-mer length used when builiding
     pub fn kmer_len(&self) -> usize {
         self.k
@@ -487,7 +495,7 @@ where
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(
             f,
-            "ska_version={}\nk={}\nk_bits={}\nrc={}\n{} k-mers\n{} samples\n",
+            "ska_version={}\nk={}\nk_bits={}\nrc={}\nk-mers={}\nsamples={}\n",
             self.ska_version,
             self.kmer_len(),
             self.k_bits,
@@ -495,7 +503,8 @@ where
             self.ksize(),
             self.nsamples()
         )?;
-        writeln!(f, "{:?}", self.names)
+        writeln!(f, "sample_names={:?}", self.names)?;
+        writeln!(f, "sample_kmers={:?}", self.n_sample_kmers())
     }
 }
 
diff --git a/tests/test_results_correct/dup_ss_nk.stdout b/tests/test_results_correct/dup_ss_nk.stdout
index c62dd06..22863e4 100644
--- a/tests/test_results_correct/dup_ss_nk.stdout
+++ b/tests/test_results_correct/dup_ss_nk.stdout
@@ -2,9 +2,10 @@ ska_version=[..]
 k=9
 k_bits=64
 rc=false
-1 k-mers
-2 samples
-["dup_test_1", "dup_test_2"]
+k-mers=1
+samples=2
+sample_names=["dup_test_1", "dup_test_2"]
+sample_kmers=[1, 1]
 
 GATT	AAGG	K,D
 
diff --git a/tests/test_results_correct/k33.stdout b/tests/test_results_correct/k33.stdout
index a76cf25..1dbbfdd 100644
--- a/tests/test_results_correct/k33.stdout
+++ b/tests/test_results_correct/k33.stdout
@@ -2,7 +2,8 @@ ska_version=[..]
 k=33
 k_bits=128
 rc=true
-58 k-mers
-2 samples
-["test_1", "test_2"]
+k-mers=58
+samples=2
+sample_names=["test_1", "test_2"]
+sample_kmers=[30, 30]
 
diff --git a/tests/test_results_correct/merge_nk.stdout b/tests/test_results_correct/merge_nk.stdout
index 030c6a1..ad8854a 100644
--- a/tests/test_results_correct/merge_nk.stdout
+++ b/tests/test_results_correct/merge_nk.stdout
@@ -2,7 +2,8 @@ ska_version=[..]
 k=17
 k_bits=64
 rc=true
-78 k-mers
-2 samples
-["test_1", "test_2"]
+k-mers=78
+samples=2
+sample_names=["test_1", "test_2"]
+sample_kmers=[46, 46]
 
diff --git a/tests/test_results_correct/weed_nk.stdout b/tests/test_results_correct/weed_nk.stdout
index 5a60d34..fa397cd 100644
--- a/tests/test_results_correct/weed_nk.stdout
+++ b/tests/test_results_correct/weed_nk.stdout
@@ -2,9 +2,10 @@ ska_version=[..]
 k=17
 k_bits=64
 rc=true
-1 k-mers
-2 samples
-["test_1", "test_2"]
+k-mers=1
+samples=2
+sample_names=["test_1", "test_2"]
+sample_kmers=[1, 1]
 
 AAAACACT	TTAAAAGA	C,T
 
diff --git a/tests/test_results_correct/weed_nk_k41.stdout b/tests/test_results_correct/weed_nk_k41.stdout
index 703c8f1..9376b45 100644
--- a/tests/test_results_correct/weed_nk_k41.stdout
+++ b/tests/test_results_correct/weed_nk_k41.stdout
@@ -2,9 +2,10 @@ ska_version=[..]
 k=41
 k_bits=128
 rc=true
-1 k-mers
-2 samples
-["test_1", "test_2"]
+k-mers=1
+samples=2
+sample_names=["test_1", "test_2"]
+sample_kmers=[1, 1]
 
 AAAAGACTCTCGTACAGCTA	ATATCATAAACGCCTTCAAA	A,T
 

From 2f36b31561be3db932af4d435395cc962ab1c76c Mon Sep 17 00:00:00 2001
From: John Lees <lees.john6@gmail.com>
Date: Wed, 28 Jun 2023 10:45:41 +0100
Subject: [PATCH 3/5] Correct homepage URL

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index f93d465..d211c9c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,7 +12,7 @@ authors = [
 edition = "2021"
 description = "Split k-mer analysis"
 repository = "https://github.com/bacpop/ska.rust/"
-homepage = "https://bacpop.com/software/"
+homepage = "https://bacpop.org/software/"
 license = "Apache-2.0"
 readme = "README.md"
 include = [

From c456429619222540d0eaedb443e7878a306791a3 Mon Sep 17 00:00:00 2001
From: John Lees <lees.john6@gmail.com>
Date: Wed, 28 Jun 2023 10:50:25 +0100
Subject: [PATCH 4/5] Update docs for ska nk

---
 src/lib.rs | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 179418f..f3dc12b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -228,20 +228,31 @@
 //! You can do this by removing any constant sites or ambiguous-only s0tes (which are typically unused), and by hard-filtering
 //! by frequency (i.e. before writing output):
 //! ```bash
-//! ska weed --filter NoAmbigOrConst --min-freq 0.9 all_samples.skf
+//! ska weed --filter no-ambig-or-const --min-freq 0.9 all_samples.skf
 //! ```
 //!
 //! ## ska nk
 //! Return information on the *n*umber of *k*-mers in an `.skf` file. This will
 //! print on STDOUT:
+//! - The version of ska used to build the file.
 //! - The k-mer size.
+//! - Number of bits used for the split k-mer (64 or 128).
 //! - Whether reverse complements were used.
-//! - The number of split k-mers.
-//! - The number of samples.
+//! - The total number of split k-mers.
+//! - The total number of samples.
 //! - The sample names.
+//! - The number of split k-mers found in each sample.
 //!
 //! ```bash
 //! ska nk all_samples.skf
+//! ska_version=0.3.1
+//! k=21
+//! k_bits=64
+//! rc=true
+//! k-mers=3228084
+//! samples=28
+//! sample_names=["19183_4#73", "12673_8#29", "12673_8#31", "12754_5#61", "12754_5#89", "12754_5#47", "12754_5#32", "12754_5#78", "12754_4#85", "12754_5#74", "19183_4#57", "12754_5#36", "19183_4#49", "19183_4#79", "19183_4#60", "12754_5#24", "12754_5#22", "12754_5#71", "12673_8#26", "12754_5#95", "12754_5#86", "12673_8#24", "19183_4#61", "12673_8#41", "12754_4#66", "12754_5#80", "12754_5#84", "19183_4#78"]
+//! sample_kmers=[2872587, 2997448, 2949719, 2997496, 2997178, 2912749, 2996491, 2997221, 2949102, 2997454, 2914109, 2912237, 2872518, 2869957, 2872470, 2997992, 2997647, 2958512, 2998099, 2997290, 2950253, 3027707, 2997881, 2907920, 2911447, 2997644, 2944830, 2915080]
 //! ```
 //!
 //! If you add `--full-info`, the split k-mer dictionary will be decoded and printed

From d54ac017cd7d432f6f22cf20a5d813f35ece1491 Mon Sep 17 00:00:00 2001
From: John Lees <lees.john6@gmail.com>
Date: Wed, 28 Jun 2023 10:55:24 +0100
Subject: [PATCH 5/5] Add notes on palindromes

---
 README.md  | 5 +++--
 src/lib.rs | 5 +++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0ffeab3..c0f4a69 100644
--- a/README.md
+++ b/README.md
@@ -51,12 +51,13 @@ Optimisations include:
 - Integer DNA encoding, optimised parsing from FASTA/FASTQ.
 - Faster dictionaries.
 - Full parallelisation of build phase.
-- Smaller, standardised input/output files.
+- Smaller, standardised input/output files. Faster to save/load.
 - Reduced memory footprint with read filtering.
 
 And other improvements:
 
-- IUPAC uncertainty codes for multiple copy k-mers.
+- IUPAC uncertainty codes for multiple copy split k-mers.
+- Uncertainty with self-reverse-complement split k-mers (palindromes).
 - Fully dynamic files (merge, delete samples).
 - Native VCF output for map.
 - Support for known strand sequence (e.g. RNA viruses).
diff --git a/src/lib.rs b/src/lib.rs
index f3dc12b..c0fb53d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -28,6 +28,11 @@
 //! Various optimisations are used to make this as fast as possible. For a more thorough comparison with version 1.0 of SKA, see the
 //! [github description](https://github.com/bacpop/ska.rust/blob/master/README.md).
 //!
+//! //! *NB As split k-mers are even lengths, it is possible that they are their
+//! own reverse complement. The original version of ska would randomly pick a strand,
+//! possibly introducing a SNP across samples. This version uses an ambiguous middle
+//! base (W for A/T; S for C/G) to represent this case.*
+//!
 //! Command line usage follows. For API documentation and usage, see the [end of this section](#api-usage).
 //!
 //! # Usage