Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removing kmer records #28

Merged
merged 4 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 68 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,15 +98,77 @@ impl KmerCountTable {
hash_keys.iter().map(|&key| self.get_hash(key)).collect()
}

// TODO: Add method "drop"
// remove kmer from table
/// Remove the record for `kmer` from the count table.
///
/// The k-mer is hashed with `hash_kmer`, and the entry stored under that
/// hash (if any) is deleted. A k-mer with no entry is not an error: the
/// call still returns `Ok(())`.
///
/// # Errors
///
/// Propagates any error returned by `hash_kmer`.
pub fn drop(&mut self, kmer: String) -> PyResult<()> {
    let hashval = self.hash_kmer(kmer)?;

    // The outcome of the removal only decides which message is logged;
    // both cases are reported to the caller as success.
    match self.counts.remove(&hashval) {
        Some(_) => debug!("K-mer with hashval {} removed from table", hashval),
        None => debug!("K-mer with hashval {} not found in table", hashval),
    }

    Ok(())
}

/// Remove the record stored under `hashval` from the count table.
///
/// A hash value with no entry is not an error: the call still returns
/// `Ok(())`.
pub fn drop_hash(&mut self, hashval: u64) -> PyResult<()> {
    // The outcome of the removal only decides which message is logged;
    // both cases are reported to the caller as success.
    match self.counts.remove(&hashval) {
        Some(_) => debug!("Hash value {} removed from table", hashval),
        None => debug!("Hash value {} not found in table", hashval),
    }

    Ok(())
}

/// Remove all k-mers with counts less than a given threshold.
///
/// Records whose count is exactly `min_count` are kept; only counts
/// strictly below the threshold are removed.
///
/// Returns the number of records removed from the table.
pub fn mincut(&mut self, min_count: u64) -> PyResult<u64> {
    let records_before = self.counts.len();

    // `retain` filters the map in place in a single pass, so there is no
    // need to collect the doomed hashes into a temporary vector first.
    self.counts.retain(|_, count| *count >= min_count);

    // Entries are only ever removed here, so the size difference is
    // exactly the number of records that were cut.
    Ok((records_before - self.counts.len()) as u64)
}

// TODO: Add method "drop_hash"
// remove hash from table
/// Remove all k-mers with counts greater than a given threshold.
///
/// Records whose count is exactly `max_count` are kept; only counts
/// strictly above the threshold are removed.
///
/// Returns the number of records removed from the table.
pub fn maxcut(&mut self, max_count: u64) -> PyResult<u64> {
    let records_before = self.counts.len();

    // `retain` filters the map in place in a single pass, so there is no
    // need to collect the doomed hashes into a temporary vector first.
    self.counts.retain(|_, count| *count <= max_count);

    // Entries are only ever removed here, so the size difference is
    // exactly the number of records that were cut.
    Ok((records_before - self.counts.len()) as u64)
}

// TODO: Serialize the KmerCountTable instance to a JSON string.

Expand Down
117 changes: 103 additions & 14 deletions src/python/tests/test_remove.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,108 @@
import oxli
import pytest
from test_basic import create_sample_kmer_table
import oxli


@pytest.fixture
def setup_kmer_table():
    """Build a KmerCountTable with ksize=4 pre-loaded with a few k-mers.

    Expected counts after loading:
        AAAA/TTTT = 2
        ATAT      = 1
        CCCC/GGGG = 3
    """
    table = oxli.KmerCountTable(ksize=4)
    # GGGG and TTTT should be counted under CCCC and AAAA respectively
    # (hash of the canonical form is used); the trailing CCCC increments
    # the CCCC/GGGG record once more.
    for kmer in ("AAAA", "CCCC", "ATAT", "GGGG", "TTTT", "CCCC"):
        table.count(kmer)
    return table


def test_drop(setup_kmer_table):
"""
Test the drop method to remove a k-mer by its string representation.
Edge case: Dropping a k-mer that doesn't exist.
"""
kct = setup_kmer_table

# Drop "GGGG" which exists, and check it's removed
kct.drop("GGGG")
assert kct.get("GGGG") == 0, "Expected 'GGGG' to be removed."

# Drop "AAAA", should remove both "AAAA" and "TTTT" (same canonical form)
kct.drop("AAAA")
assert kct.get("AAAA") == 0, "Expected 'AAAA' (and 'TTTT') to be removed."

# Edge case: Drop a k-mer that doesn't exist, e.g., "GGGA"
kct.drop("GGGA") # "GGGA" not present in the table
assert kct.get("GGGA") == 0 # "GGGA" not present in the table


def test_drop_hash(setup_kmer_table):
"""
Test the drop_hash method to remove a k-mer by its hash.
Edge case: Dropping a hash that doesn't exist.
"""
kct = setup_kmer_table

# Drop by the hash for "CCCC", and check it's removed
hashval = kct.hash_kmer("CCCC")
kct.drop_hash(hashval)
assert kct.get_hash(hashval) == 0, "Expected 'CCCC' and 'GGGG' to be removed."
assert kct.get("CCCC") == 0, "Expected 'CCCC' to be removed."
assert kct.get("GGGG") == 0, "Expected 'GGGG' to be removed."

# Edge case: Drop a hash that doesn't exist
non_existent_hash = 999999999
kct.drop_hash(non_existent_hash) # Should not raise an error
assert (
kct.get_hash(non_existent_hash) == 0
), "Expected non-existent hash removal to succeed."


def test_mincut(setup_kmer_table):
"""
Test the mincut method to remove all k-mers with counts less than a given threshold.
Edge cases: Threshold is higher than all counts, no k-mers to remove.
"""
kct = setup_kmer_table

# Set a threshold that only removes k-mers with counts < 2
removed = kct.mincut(3)
assert removed == 2, "Expected 2 k-mers to be removed ('ATAT' and 'AAAA/TTTT')."
assert kct.get("GGGG") == 3, "Expected 'GGGG/CCCC' to remain."

# Edge case: Threshold is higher than all k-mer counts (remove everything)
removed = kct.mincut(10)
assert removed == 1, "Expected all remaining k-mers to be removed ('GGGG/CCCC')."
assert len(kct.hashes) == 0, "Expected no k-mers left after removing all."


def test_drop():
'''Remove kmer by name.'''
pass
def test_maxcut(setup_kmer_table):
"""
Test the maxcut method to remove all k-mers with counts greater than a given threshold.
Edge case: Threshold is lower than all counts, no k-mers to remove.
"""
kct = setup_kmer_table

def test_drop_hash():
'''Remove record by hash.'''
pass
# Set a threshold that only removes k-mers with counts > 1 (GGGG)
removed = kct.maxcut(2)
assert removed == 1, "Expected 'CCCC/GGGG' to be removed."
assert kct.get("GGGG") == 0, "Expected 'CCCC/GGGG' to be removed."
assert (
kct.get("AAAA") == 2
), "Should not remove kmers with exact maxcut value, only greater."

def test_mincut():
'''Remove all records with counts < threshold. '''
pass
# Edge case: Threshold is higher than all k-mer counts (remove none)
removed = kct.maxcut(10)
assert removed == 0, "Expected no k-mers to be removed since all counts are < 10."
assert (
len(kct.hashes) == 2
), "Expected 2 records with counts < 10 to remain in the table."

def test_maxcut():
'''Remove all records with counts > threshold. '''
pass
# Edge case: Threshold is lower than all k-mer counts (remove all)
removed = kct.maxcut(0)
assert removed == 2, "Expected no k-mers to be removed since all counts are > 0."
assert (
len(kct.hashes) == 0
), "Expected 0 records with counts < 1 to remain in the table."
Loading