Skip to content

Commit

Permalink
Merge pull request #40 from bacpop/0.3.1-candidate
Browse files Browse the repository at this point in the history
Add k-mer counts per sample to ska nk; efficiency of dup counting
  • Loading branch information
johnlees authored Jun 28, 2023
2 parents 62b752c + d54ac01 commit 93b7ecf
Show file tree
Hide file tree
Showing 10 changed files with 64 additions and 28 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "ska"
version = "0.3.0"
version = "0.3.1"
authors = [
"John Lees <[email protected]>",
"Simon Harris <[email protected]>",
Expand All @@ -12,7 +12,7 @@ authors = [
edition = "2021"
description = "Split k-mer analysis"
repository = "https://github.com/bacpop/ska.rust/"
homepage = "https://bacpop.com/software/"
homepage = "https://bacpop.org/software/"
license = "Apache-2.0"
readme = "README.md"
include = [
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,13 @@ Optimisations include:
- Integer DNA encoding, optimised parsing from FASTA/FASTQ.
- Faster dictionaries.
- Full parallelisation of build phase.
- Smaller, standardised input/output files.
- Smaller, standardised input/output files. Faster to save/load.
- Reduced memory footprint with read filtering.

And other improvements:

- IUPAC uncertainty codes for multiple copy k-mers.
- IUPAC uncertainty codes for multiple copy split k-mers.
- Uncertainty with self-reverse-complement split k-mers (palindromes).
- Fully dynamic files (merge, delete samples).
- Native VCF output for map.
- Support for known strand sequence (e.g. RNA viruses).
Expand Down
22 changes: 19 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@
//! Various optimisations are used to make this as fast as possible. For a more thorough comparison with version 1.0 of SKA, see the
//! [github description](https://github.com/bacpop/ska.rust/blob/master/README.md).
//!
//! //! *NB As split k-mers are even lengths, it is possible that they are their
//! own reverse complement. The original version of ska would randomly pick a strand,
//! possibly introducing a SNP across samples. This version uses an ambiguous middle
//! base (W for A/T; S for C/G) to represent this case.*
//!
//! Command line usage follows. For API documentation and usage, see the [end of this section](#api-usage).
//!
//! # Usage
Expand Down Expand Up @@ -228,20 +233,31 @@
//! You can do this by removing any constant sites or ambiguous-only s0tes (which are typically unused), and by hard-filtering
//! by frequency (i.e. before writing output):
//! ```bash
//! ska weed --filter NoAmbigOrConst --min-freq 0.9 all_samples.skf
//! ska weed --filter no-ambig-or-const --min-freq 0.9 all_samples.skf
//! ```
//!
//! ## ska nk
//! Return information on the *n*umber of *k*-mers in an `.skf` file. This will
//! print on STDOUT:
//! - The version of ska used to build the file.
//! - The k-mer size.
//! - Number of bits used for the split k-mer (64 or 128).
//! - Whether reverse complements were used.
//! - The number of split k-mers.
//! - The number of samples.
//! - The total number of split k-mers.
//! - The total number of samples.
//! - The sample names.
//! - The number of split k-mers found in each sample.
//!
//! ```bash
//! ska nk all_samples.skf
//! ska_version=0.3.1
//! k=21
//! k_bits=64
//! rc=true
//! k-mers=3228084
//! samples=28
//! sample_names=["19183_4#73", "12673_8#29", "12673_8#31", "12754_5#61", "12754_5#89", "12754_5#47", "12754_5#32", "12754_5#78", "12754_4#85", "12754_5#74", "19183_4#57", "12754_5#36", "19183_4#49", "19183_4#79", "19183_4#60", "12754_5#24", "12754_5#22", "12754_5#71", "12673_8#26", "12754_5#95", "12754_5#86", "12673_8#24", "19183_4#61", "12673_8#41", "12754_4#66", "12754_5#80", "12754_5#84", "19183_4#78"]
//! sample_kmers=[2872587, 2997448, 2949719, 2997496, 2997178, 2912749, 2996491, 2997221, 2949102, 2997454, 2914109, 2912237, 2872518, 2869957, 2872470, 2997992, 2997647, 2958512, 2998099, 2997290, 2950253, 3027707, 2997881, 2907920, 2911447, 2997644, 2944830, 2915080]
//! ```
//!
//! If you add `--full-info`, the split k-mer dictionary will be decoded and printed
Expand Down
13 changes: 11 additions & 2 deletions src/merge_ska_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,14 @@ where
})
}

/// Count of number of k-mers found in each sample
pub fn n_sample_kmers(&self) -> Vec<i32> {
self.variants
.map(|v| if *v != b'-' { 1 } else { 0 })
.sum_axis(Axis(0))
.to_vec()
}

/// K-mer length used when builiding
pub fn kmer_len(&self) -> usize {
self.k
Expand Down Expand Up @@ -487,15 +495,16 @@ where
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"ska_version={}\nk={}\nk_bits={}\nrc={}\n{} k-mers\n{} samples\n",
"ska_version={}\nk={}\nk_bits={}\nrc={}\nk-mers={}\nsamples={}\n",
self.ska_version,
self.kmer_len(),
self.k_bits,
self.rc(),
self.ksize(),
self.nsamples()
)?;
writeln!(f, "{:?}", self.names)
writeln!(f, "sample_names={:?}", self.names)?;
writeln!(f, "sample_kmers={:?}", self.n_sample_kmers())
}
}

Expand Down
13 changes: 9 additions & 4 deletions src/ska_ref.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,11 @@ where
last_end = end;
}
}
log::info!("Masking {} unique split k-mer repeats spanning {} bases", repeats.len(), repeat_coors.len());
log::info!(
"Masking {} unique split k-mer repeats spanning {} bases",
repeats.len(),
repeat_coors.len()
);
}

Self {
Expand All @@ -278,8 +282,7 @@ where
if let Vacant(rep_kmer) = repeats.entry(kmer) {
match singles.entry(kmer) {
Vacant(single_entry) => single_entry.insert(),
Occupied(single_entry) => {
single_entry.remove();
Occupied(_) => {
rep_kmer.insert();
}
}
Expand Down Expand Up @@ -383,7 +386,9 @@ where
threads: usize,
) -> Result<(), needletail::errors::ParseError> {
if self.chrom_names.len() > 1 {
log::warn!("Reference contained multiple contigs, in the output they will be concatenated");
log::warn!(
"Reference contained multiple contigs, in the output they will be concatenated"
);
}

let alignments = self.pseudoalignment(threads);
Expand Down
7 changes: 4 additions & 3 deletions tests/test_results_correct/dup_ss_nk.stdout
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ ska_version=[..]
k=9
k_bits=64
rc=false
1 k-mers
2 samples
["dup_test_1", "dup_test_2"]
k-mers=1
samples=2
sample_names=["dup_test_1", "dup_test_2"]
sample_kmers=[1, 1]

GATT AAGG K,D

7 changes: 4 additions & 3 deletions tests/test_results_correct/k33.stdout
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ ska_version=[..]
k=33
k_bits=128
rc=true
58 k-mers
2 samples
["test_1", "test_2"]
k-mers=58
samples=2
sample_names=["test_1", "test_2"]
sample_kmers=[30, 30]

7 changes: 4 additions & 3 deletions tests/test_results_correct/merge_nk.stdout
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ ska_version=[..]
k=17
k_bits=64
rc=true
78 k-mers
2 samples
["test_1", "test_2"]
k-mers=78
samples=2
sample_names=["test_1", "test_2"]
sample_kmers=[46, 46]

7 changes: 4 additions & 3 deletions tests/test_results_correct/weed_nk.stdout
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ ska_version=[..]
k=17
k_bits=64
rc=true
1 k-mers
2 samples
["test_1", "test_2"]
k-mers=1
samples=2
sample_names=["test_1", "test_2"]
sample_kmers=[1, 1]

AAAACACT TTAAAAGA C,T

7 changes: 4 additions & 3 deletions tests/test_results_correct/weed_nk_k41.stdout
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ ska_version=[..]
k=41
k_bits=128
rc=true
1 k-mers
2 samples
["test_1", "test_2"]
k-mers=1
samples=2
sample_names=["test_1", "test_2"]
sample_kmers=[1, 1]

AAAAGACTCTCGTACAGCTA ATATCATAAACGCCTTCAAA A,T

0 comments on commit 93b7ecf

Please sign in to comment.