MRG: bad k-mer detection, forcing, and error handling (#7)

* rename lint * handle errors a bit * properly handle (and test) errors * upd docs * bump vers * name ksize parameter in constructor
oxli-bio · Sep 2, 2024 · c1346ff · c1346ff
1 parent 3fe4dec
commit c1346ff
Show file tree

Hide file tree

Showing 7 changed files with 114 additions and 10 deletions.
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -10,7 +10,7 @@ on:
   push:
     branches: [latest]
 jobs:
-  tests_on_linux:
+  rust_on_linux:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v4

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "oxli"
-version = "0.2.0"
+version = "0.2.1"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

diff --git a/doc/api.md b/doc/api.md
@@ -3,15 +3,17 @@
 Import necessary modules:
 
 ```python
->>> import screed
->>> import oxli
+>>> import screed # FASTA/FASTQ parsing
+>>> import oxli   # this package!
 
 ```
 
+## The basics
+
 Create a KmerCountTable with a k-mer size of 31:
 
 ```python
->>> counts = oxli.KmerCountTable(31)
+>>> counts = oxli.KmerCountTable(ksize=31)
 
 ```
 
@@ -24,10 +26,53 @@ Open a FASTA file and consume k-mers from all the sequences within:
 
 ```
 
-Get the count of `CGGAGGAAGCAAGAACAAAATATTTTTTCAT` in the data::
+Here, `consume` reports the total number of k-mers consumed.
+
+Get the count of `CGGAGGAAGCAAGAACAAAATATTTTTTCAT` in the data:
 
 ```python
 >>> counts.get('CGGAGGAAGCAAGAACAAAATATTTTTTCAT')
 1
 
 ```
+
+Reverse complement are of course handled:
+```python
+>>> counts.get('ATGAAAAAATATTTTGTTCTTGCTTCCTCCG')
+1
+
+```
+
+
+## Handling bad k-mers in DNA sequence
+
+You can fail on bad k-mers:
+
+```python
+>>> counts.consume('XXXCGGAGGAAGCAAGAACAAAATATTTTTTCATGGG', allow_bad_kmers=False)
+Traceback (most recent call last):
+...
+ValueError: bad k-mer encountered at position 0
+
+```
+
+or allow them (which is default):
+
+```python
+>>> counts.consume('XXXCGGAGGAAGCAAGAACAAAATATTTTTTCATGGG', allow_bad_kmers=True)
+4
+
+```
+
+If you allow bad k-mers, then all of the valid k-mers will be counted, and all of the bad k-mers will be skipped:
+```
+>>> counts.get("CGGAGGAAGCAAGAACAAAATATTTTTTCAT")
+2
+
+>>> counts.get("AGGAAGCAAGAACAAAATATTTTTTCATGGG")
+1
+
+```
+
+@CTB note: right now, all k-mers up to the bad k-mers are
+counted... oops. Fix this!
diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,10 @@
+name: oxli-dev
+channels:
+    - conda-forge
+    - bioconda
+    - defaults
+dependencies:
+    - rust
+    - maturin>=1,<2
+    - pytest
+    - compilers
diff --git a/src/lib.rs b/src/lib.rs
@@ -19,6 +19,7 @@ struct KmerCountTable {
 #[pymethods]
 impl KmerCountTable {
     #[new]
+    #[pyo3(signature = (ksize))]
     pub fn new(ksize: u8) -> Self {
         Self {
             counts: HashMap::new(),
@@ -85,26 +86,32 @@ impl KmerCountTable {
     }
 
     // Consume this DNA strnig. Return number of k-mers consumed.
-    pub fn consume(&mut self, seq: String) -> PyResult<u64> {
+    #[pyo3(signature = (seq, allow_bad_kmers=true))]
+    pub fn consume(&mut self, seq: String, allow_bad_kmers: bool) -> PyResult<u64> {
         let hashes = SeqToHashes::new(
             seq.as_bytes(),
             self.ksize.into(),
-            false,
+            allow_bad_kmers,
             false,
             HashFunctions::Murmur64Dna,
             42,
         );
 
         let mut n = 0;
         for hash_value in hashes {
+            // eprintln!("hash_value: {:?}", hash_value);
             match hash_value {
                 Ok(0) => continue,
                 Ok(x) => {
                     self.count_hash(x);
                     ()
                 }
-                Err(_err) => (), // @CTB
+                Err(_) => {
+                    let msg = format!("bad k-mer encountered at position {}", n);
+                    return Err(PyValueError::new_err(msg));
+                }
             }
+
             n += 1;
         }
 

diff --git a/src/python/tests/test_basic.py b/src/python/tests/test_basic.py
@@ -2,6 +2,7 @@
 import oxli
 
 def test_simple():
+    # yo dawg it works
     cg = oxli.KmerCountTable(4)
     kmer = "ATCG"
 
@@ -11,6 +12,7 @@ def test_simple():
 
 
 def test_wrong_ksize():
+    # but only with the right ksize
     cg = oxli.KmerCountTable(3)
     kmer = "ATCG"
 
@@ -22,6 +24,7 @@ def test_wrong_ksize():
 
 
 def test_consume():
+    # test basic consume
     cg = oxli.KmerCountTable(4)
     kmer = "ATCG"
 
@@ -30,10 +33,49 @@ def test_consume():
 
 
 def test_consume_2():
+    # test reverse complement
     cg = oxli.KmerCountTable(4)
     seq = "ATCGG"
 
     assert cg.consume(seq) == 2
     assert cg.get("ATCG") == 1
     assert cg.get("TCGG") == 1
     assert cg.get("CCGA") == 1 # reverse complement!
+
+
+def test_consume_bad_DNA():
+    # test an invalid base in last position
+    cg = oxli.KmerCountTable(4)
+    seq = "ATCGGX"
+    with pytest.raises(ValueError,
+                       match="bad k-mer encountered at position 2"):
+        cg.consume(seq, allow_bad_kmers=False)
+
+
+def test_consume_bad_DNA_2():
+    # test an invalid base in first position
+    cg = oxli.KmerCountTable(4)
+    seq = "XATCGG"
+    with pytest.raises(ValueError,
+                       match="bad k-mer encountered at position 0"):
+        cg.consume(seq, allow_bad_kmers=False)
+
+
+def test_consume_bad_DNA_ignore():
+    # we can ignore bad DNA
+    cg = oxli.KmerCountTable(4)
+    seq = "XATCGG"
+    print(cg.consume(seq, allow_bad_kmers=True))
+    assert cg.get("ATCG") == 1
+    assert cg.get("TCGG") == 1
+    assert cg.get("CCGA") == 1 # rc
+
+
+def test_consume_bad_DNA_ignore_is_default():
+    # ignoring bad DNA is default
+    cg = oxli.KmerCountTable(4)
+    seq = "XATCGG"
+    print(cg.consume(seq))
+    assert cg.get("ATCG") == 1
+    assert cg.get("TCGG") == 1
+    assert cg.get("CCGA") == 1 # rc