Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev notes #26

Merged
merged 2 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ impl KmerCountTable {
}
}

// TODO: Optionally store hash:kmer pair when counting a new kmer
// Modify KmerCountTable to optionally store map of hash:kmer
// Modify SeqToHashes to return canonical kmer & hash

// TODO: Add function to get canonical kmer using hash key

fn hash_kmer(&self, kmer: String) -> Result<u64> {
if kmer.len() as u8 != self.ksize {
Err(anyhow!("wrong ksize"))
Expand Down Expand Up @@ -92,13 +98,48 @@ impl KmerCountTable {
hash_keys.iter().map(|&key| self.get_hash(key)).collect()
}

// TODO: Add method "drop"
// remove kmer from table

// TODO: Add method "drop_hash"
// remove hash from table

// TODO: Add "mincut". Remove counts below a minimum cutoff.

// TODO: Add "maxcut". Remove counts above a maximum cutoff.

// TODO: Serialize the KmerCountTable instance to a JSON string.

// TODO: Compress JSON string with gzip and save to file

// TODO: Static method to load KmerCountTable from serialized JSON. Yield new object.

// TODO: Add method "dump"
// Output tab delimited kmer:count pairs
// Default sort by count
// Option sort kmers lexicographically

// TODO: Add method "dump_hash"
// Output tab delimited hash:count pairs
// Default sort by count
// Option sort on keys

// TODO: Add method "histo"
// Output frequency counts

// Getter backing the `hashes` attribute: returns every hash key currently
// stored in the table.
#[getter]
pub fn hashes(&self) -> Vec<u64> {
    // Keys are plain u64 values, so copy each one out of the counts map
    // into a freshly allocated Vec.
    let keys = self.counts.keys();
    keys.copied().collect()
}

// TODO: Getter for the version attribute
// Store oxli version when instance is created

// TODO: Getter for the consumed seq len attribute
// Update tracker when DNA is processed with count() or consume()

// Consume this DNA string. Return number of k-mers consumed.
#[pyo3(signature = (seq, allow_bad_kmers=true))]
pub fn consume(&mut self, seq: String, allow_bad_kmers: bool) -> PyResult<u64> {
Expand Down Expand Up @@ -179,6 +220,16 @@ impl KmerCountTable {
fn __xor__(&self, other: &KmerCountTable) -> HashSet<u64> {
self.symmetric_difference(other)
}

// TODO: Implement Python dunder method __iter__

// TODO: Implement Python dunder method __next__

// TODO: Implement Python dunder method __len__

// TODO: Implement Python dunder method __getitem__

// TODO: Implement Python dunder method __setitem__
}

// Python module definition
Expand Down
31 changes: 31 additions & 0 deletions src/python/tests/test_attr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import oxli
import pytest
from test_basic import create_sample_kmer_table

# Test attributes

def test_hashes_attribute():
    """The .hashes attribute should expose exactly the hash keys held in the table."""
    kmers = ["AAA", "TTT", "AAC"]
    table = create_sample_kmer_table(3, kmers)
    observed = set(table.hashes)

    # Recorded values: "AAA" and "TTT" both hash to 10679328328772601858 and
    # "AAC" hashes to 6579496673972597301, so two distinct keys are expected.
    expected = {table.hash_kmer(kmer) for kmer in kmers}

    assert (
        observed == expected
    ), ".hashes attribute should match the expected set of hash keys"


def test_version_attr():
    """Placeholder: the version attribute should match the current version."""

def test_total_consumed_seq_len_attr():
    """Placeholder: the table should log the total sequence length consumed."""
    # Cases still to be written:
    #   - Individual kmers
    #   - Long seqs with multiple kmers
    #   - Exclude invalid kmers? (open question)
85 changes: 1 addition & 84 deletions src/python/tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,22 +107,6 @@ def test_consume_bad_DNA_ignore_is_default():
assert cg.get("CCGA") == 1 # rc


# Test attributes
def test_hashes_attribute():
    """The .hashes attribute should expose exactly the hash keys held in the table."""
    kmers = ["AAA", "TTT", "AAC"]
    table = create_sample_kmer_table(3, kmers)
    observed = set(table.hashes)

    # Recorded values: "AAA" and "TTT" both hash to 10679328328772601858 and
    # "AAC" hashes to 6579496673972597301, so two distinct keys are expected.
    expected = {table.hash_kmer(kmer) for kmer in kmers}

    assert (
        observed == expected
    ), ".hashes attribute should match the expected set of hash keys"


# Getting counts
def test_count_vs_counthash():
# test a bug reported by adam taranto: count and get should work together!
Expand Down Expand Up @@ -180,75 +164,8 @@ def test_get_hash_array():
), "Hash array counts should match the counts of 'AAA' and 'AAC' and return zero for 'GGG'."
assert rev_counts == [0, 1, 2], "Count should be in same order as input list"


def test_get_array():
    """Placeholder: get a vector of counts corresponding to a vector of kmers."""
    # TODO: Add function to get list of counts given list of kmers.


# Set operations
def test_union():
    """union() should agree with Python's set union of both tables' hash keys."""
    left = create_sample_kmer_table(3, ["AAA", "AAC"])
    right = create_sample_kmer_table(3, ["AAC", "AAG"])

    observed = left.union(right)
    expected = set(left.hashes) | set(right.hashes)

    assert observed == expected, "Union of hash sets should match"


def test_intersection():
    """intersection() should agree with Python's set intersection of the hash keys."""
    left = create_sample_kmer_table(3, ["AAA", "AAC"])
    right = create_sample_kmer_table(3, ["AAC", "AAG"])

    observed = left.intersection(right)
    expected = set(left.hashes) & set(right.hashes)

    assert observed == expected, "Intersection of hash sets should match"


def test_difference():
    """difference() should agree with Python's set difference of the hash keys."""
    left = create_sample_kmer_table(3, ["AAA", "AAC"])
    right = create_sample_kmer_table(3, ["AAC", "AAG"])

    observed = left.difference(right)
    expected = set(left.hashes) - set(right.hashes)

    assert observed == expected, "Difference of hash sets should match"


def test_symmetric_difference():
    """symmetric_difference() should agree with Python's set symmetric difference."""
    left = create_sample_kmer_table(3, ["AAA", "AAC"])
    right = create_sample_kmer_table(3, ["AAC", "AAG"])

    observed = left.symmetric_difference(right)
    expected = set(left.hashes) ^ set(right.hashes)

    assert observed == expected, "Symmetric difference of hash sets should match"


def test_dunder_methods():
    """Each set-operator dunder should return the same result as its named method."""
    left = create_sample_kmer_table(3, ["AAA", "AAC"])
    right = create_sample_kmer_table(3, ["AAC", "AAG"])

    via_or = left.__or__(right)
    assert via_or == left.union(right), "__or__ method should match union()"

    via_and = left.__and__(right)
    assert via_and == left.intersection(
        right
    ), "__and__ method should match intersection()"

    via_sub = left.__sub__(right)
    assert via_sub == left.difference(right), "__sub__ method should match difference()"

    via_xor = left.__xor__(right)
    assert via_xor == left.symmetric_difference(
        right
    ), "__xor__ method should match symmetric_difference()"
pass
25 changes: 25 additions & 0 deletions src/python/tests/test_dunders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import oxli
import pytest
from test_basic import create_sample_kmer_table


def test_len_dunder_method():
    """Placeholder: __len__ should return the number of keys in a KmerCountTable."""

def test_iter_dunder_method():
    """Placeholder: a KmerCountTable should be iterable and yield hash:count pairs."""

def test_next_dunder_method():
    """Placeholder: __next__ should select the next key in the generator."""

def test_getitem_dunder_method():
    """Placeholder: query an object using the indexing syntax (obj[key])."""
    # Intended to behave the same as .get()

def test_setitem_dunder_method():
    """Placeholder: set values using the indexing syntax (obj[key] = value)."""
8 changes: 8 additions & 0 deletions src/python/tests/test_kmer_map.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import oxli
import pytest
from test_basic import create_sample_kmer_table


def test_kmermap():
    """Placeholder: exercise the option to add a kmermap."""
24 changes: 24 additions & 0 deletions src/python/tests/test_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import oxli
import pytest
from test_basic import create_sample_kmer_table


def test_serialise():
    """Placeholder: serialise the object to JSON."""

def test_deserialise():
    """Placeholder: load the object from file."""

def test_dump():
    """Placeholder: write tab-delimited kmer:count pairs."""

def test_dump_hash():
    """Placeholder: write tab-delimited hash:count pairs."""

def test_histo():
    """Placeholder: write frequency counts."""
19 changes: 19 additions & 0 deletions src/python/tests/test_remove.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import oxli
import pytest
from test_basic import create_sample_kmer_table

def test_drop():
    """Placeholder: remove a kmer by name."""

def test_drop_hash():
    """Placeholder: remove a record by hash."""

def test_mincut():
    """Placeholder: remove all records with counts < threshold."""

def test_maxcut():
    """Placeholder: remove all records with counts > threshold."""
68 changes: 68 additions & 0 deletions src/python/tests/test_setops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import oxli
import pytest

from test_basic import create_sample_kmer_table

# Set operations
def test_union():
    """union() should agree with Python's set union of both tables' hash keys."""
    left = create_sample_kmer_table(3, ["AAA", "AAC"])
    right = create_sample_kmer_table(3, ["AAC", "AAG"])

    observed = left.union(right)
    expected = set(left.hashes) | set(right.hashes)

    assert observed == expected, "Union of hash sets should match"


def test_intersection():
    """intersection() should agree with Python's set intersection of the hash keys."""
    left = create_sample_kmer_table(3, ["AAA", "AAC"])
    right = create_sample_kmer_table(3, ["AAC", "AAG"])

    observed = left.intersection(right)
    expected = set(left.hashes) & set(right.hashes)

    assert observed == expected, "Intersection of hash sets should match"


def test_difference():
    """difference() should agree with Python's set difference of the hash keys."""
    left = create_sample_kmer_table(3, ["AAA", "AAC"])
    right = create_sample_kmer_table(3, ["AAC", "AAG"])

    observed = left.difference(right)
    expected = set(left.hashes) - set(right.hashes)

    assert observed == expected, "Difference of hash sets should match"


def test_symmetric_difference():
    """symmetric_difference() should agree with Python's set symmetric difference."""
    left = create_sample_kmer_table(3, ["AAA", "AAC"])
    right = create_sample_kmer_table(3, ["AAC", "AAG"])

    observed = left.symmetric_difference(right)
    expected = set(left.hashes) ^ set(right.hashes)

    assert observed == expected, "Symmetric difference of hash sets should match"


def test_dunder_methods():
    """Each set-operator dunder should return the same result as its named method."""
    left = create_sample_kmer_table(3, ["AAA", "AAC"])
    right = create_sample_kmer_table(3, ["AAC", "AAG"])

    via_or = left.__or__(right)
    assert via_or == left.union(right), "__or__ method should match union()"

    via_and = left.__and__(right)
    assert via_and == left.intersection(
        right
    ), "__and__ method should match intersection()"

    via_sub = left.__sub__(right)
    assert via_sub == left.difference(right), "__sub__ method should match difference()"

    via_xor = left.__xor__(right)
    assert via_xor == left.symmetric_difference(
        right
    ), "__xor__ method should match symmetric_difference()"
Loading