Skip to content

Commit

Permalink
MRG: bad k-mer detection, forcing, and error handling (#7)
Browse files Browse the repository at this point in the history
* rename lint

* handle errors a bit

* properly handle (and test) errors

* upd docs

* bump vers

* name ksize parameter in constructor
  • Loading branch information
ctb authored Sep 2, 2024
1 parent 3fe4dec commit c1346ff
Show file tree
Hide file tree
Showing 7 changed files with 114 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ on:
push:
branches: [latest]
jobs:
tests_on_linux:
rust_on_linux:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "oxli"
version = "0.2.0"
version = "0.2.1"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
53 changes: 49 additions & 4 deletions doc/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@
Import necessary modules:

```python
>>> import screed
>>> import oxli
>>> import screed # FASTA/FASTQ parsing
>>> import oxli # this package!

```

## The basics

Create a KmerCountTable with a k-mer size of 31:

```python
>>> counts = oxli.KmerCountTable(31)
>>> counts = oxli.KmerCountTable(ksize=31)

```

Expand All @@ -24,10 +26,53 @@ Open a FASTA file and consume k-mers from all the sequences within:

```

Get the count of `CGGAGGAAGCAAGAACAAAATATTTTTTCAT` in the data::
Here, `consume` reports the total number of k-mers consumed.

Get the count of `CGGAGGAAGCAAGAACAAAATATTTTTTCAT` in the data:

```python
>>> counts.get('CGGAGGAAGCAAGAACAAAATATTTTTTCAT')
1

```

Reverse complement are of course handled:
```python
>>> counts.get('ATGAAAAAATATTTTGTTCTTGCTTCCTCCG')
1

```


## Handling bad k-mers in DNA sequence

You can fail on bad k-mers:

```python
>>> counts.consume('XXXCGGAGGAAGCAAGAACAAAATATTTTTTCATGGG', allow_bad_kmers=False)
Traceback (most recent call last):
...
ValueError: bad k-mer encountered at position 0

```

or allow them (which is default):

```python
>>> counts.consume('XXXCGGAGGAAGCAAGAACAAAATATTTTTTCATGGG', allow_bad_kmers=True)
4

```

If you allow bad k-mers, then all of the valid k-mers will be counted, and all of the bad k-mers will be skipped:
```
>>> counts.get("CGGAGGAAGCAAGAACAAAATATTTTTTCAT")
2
>>> counts.get("AGGAAGCAAGAACAAAATATTTTTTCATGGG")
1
```

@CTB note: right now, all k-mers up to the bad k-mers are
counted... oops. Fix this!
10 changes: 10 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
name: oxli-dev
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- rust
- maturin>=1,<2
- pytest
- compilers
13 changes: 10 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ struct KmerCountTable {
#[pymethods]
impl KmerCountTable {
#[new]
#[pyo3(signature = (ksize))]
pub fn new(ksize: u8) -> Self {
Self {
counts: HashMap::new(),
Expand Down Expand Up @@ -85,26 +86,32 @@ impl KmerCountTable {
}

// Consume this DNA strnig. Return number of k-mers consumed.
pub fn consume(&mut self, seq: String) -> PyResult<u64> {
#[pyo3(signature = (seq, allow_bad_kmers=true))]
pub fn consume(&mut self, seq: String, allow_bad_kmers: bool) -> PyResult<u64> {
let hashes = SeqToHashes::new(
seq.as_bytes(),
self.ksize.into(),
false,
allow_bad_kmers,
false,
HashFunctions::Murmur64Dna,
42,
);

let mut n = 0;
for hash_value in hashes {
// eprintln!("hash_value: {:?}", hash_value);
match hash_value {
Ok(0) => continue,
Ok(x) => {
self.count_hash(x);
()
}
Err(_err) => (), // @CTB
Err(_) => {
let msg = format!("bad k-mer encountered at position {}", n);
return Err(PyValueError::new_err(msg));
}
}

n += 1;
}

Expand Down
42 changes: 42 additions & 0 deletions src/python/tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import oxli

def test_simple():
# yo dawg it works
cg = oxli.KmerCountTable(4)
kmer = "ATCG"

Expand All @@ -11,6 +12,7 @@ def test_simple():


def test_wrong_ksize():
# but only with the right ksize
cg = oxli.KmerCountTable(3)
kmer = "ATCG"

Expand All @@ -22,6 +24,7 @@ def test_wrong_ksize():


def test_consume():
# test basic consume
cg = oxli.KmerCountTable(4)
kmer = "ATCG"

Expand All @@ -30,10 +33,49 @@ def test_consume():


def test_consume_2():
# test reverse complement
cg = oxli.KmerCountTable(4)
seq = "ATCGG"

assert cg.consume(seq) == 2
assert cg.get("ATCG") == 1
assert cg.get("TCGG") == 1
assert cg.get("CCGA") == 1 # reverse complement!


def test_consume_bad_DNA():
# test an invalid base in last position
cg = oxli.KmerCountTable(4)
seq = "ATCGGX"
with pytest.raises(ValueError,
match="bad k-mer encountered at position 2"):
cg.consume(seq, allow_bad_kmers=False)


def test_consume_bad_DNA_2():
# test an invalid base in first position
cg = oxli.KmerCountTable(4)
seq = "XATCGG"
with pytest.raises(ValueError,
match="bad k-mer encountered at position 0"):
cg.consume(seq, allow_bad_kmers=False)


def test_consume_bad_DNA_ignore():
# we can ignore bad DNA
cg = oxli.KmerCountTable(4)
seq = "XATCGG"
print(cg.consume(seq, allow_bad_kmers=True))
assert cg.get("ATCG") == 1
assert cg.get("TCGG") == 1
assert cg.get("CCGA") == 1 # rc


def test_consume_bad_DNA_ignore_is_default():
# ignoring bad DNA is default
cg = oxli.KmerCountTable(4)
seq = "XATCGG"
print(cg.consume(seq))
assert cg.get("ATCG") == 1
assert cg.get("TCGG") == 1
assert cg.get("CCGA") == 1 # rc

0 comments on commit c1346ff

Please sign in to comment.