Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dump function to write counts to file #30

Merged
merged 7 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 61 additions & 8 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ use pyo3::prelude::*;
use sourmash::encodings::HashFunctions;
use sourmash::signature::SeqToHashes;

use pyo3::PyResult;
use std::fs::File;
use std::io::{BufWriter, Write};

// Set version variable
const VERSION: &str = env!("CARGO_PKG_VERSION");

Expand Down Expand Up @@ -185,15 +189,64 @@ impl KmerCountTable {

// TODO: Static method to load KmerCountTable from serialized JSON. Yield new object.

// TODO: Add method "dump"
// Output tab delimited kmer:count pairs
// Default sort by count
// Option sort kmers lexicographically
/// Dump (hash, count) pairs, optionally sorted by count or by hash key.
///
/// # Arguments
/// * `file` - Optional file path to write the output to, one tab-delimited
///   `hash<TAB>count` pair per line. If not provided, the pairs are returned instead.
/// * `sortcounts` - Sort on counts, with a secondary sort on hash keys (default: False).
/// * `sortkeys` - Sort by hash keys (default: False).
///
/// # Returns
/// The list of (hash, count) tuples when `file` is not given; an empty list when
/// the pairs were written to `file`.
///
/// # Errors
/// Raises a `ValueError` if both `sortcounts` and `sortkeys` are true, and
/// propagates any I/O error hit while creating or writing `file`.
#[pyo3(signature = (file=None, sortcounts=false, sortkeys=false))]
pub fn dump(
    &self,
    file: Option<String>,
    sortcounts: bool,
    sortkeys: bool,
) -> PyResult<Vec<(u64, u64)>> {
    // The two orderings are mutually exclusive; reject the ambiguous request.
    if sortcounts && sortkeys {
        return Err(PyValueError::new_err(
            "Cannot sort by both counts and keys at the same time.",
        ));
    }

    // Copy the pairs out of the table once, so the same owned vector can be
    // sorted and then either written out or handed back without a second pass.
    let mut hash_count_pairs: Vec<(u64, u64)> = self
        .counts
        .iter()
        .map(|(&hash, &count)| (hash, count))
        .collect();

    // Each hash occurs once in the table, so neither ordering can produce ties:
    // the unstable sorts yield the same order as a stable sort would.
    if sortkeys {
        hash_count_pairs.sort_unstable_by_key(|&(hash, _)| hash);
    } else if sortcounts {
        // Primary key is the count, ties on count are broken by hash.
        hash_count_pairs.sort_unstable_by_key(|&(hash, count)| (count, hash));
    }
    // With neither flag set, the pairs keep the table's own iteration order.

    if let Some(filepath) = file {
        let mut writer = BufWriter::new(File::create(filepath)?);

        // One tab-delimited hash/count pair per line.
        for (hash, count) in hash_count_pairs {
            writeln!(writer, "{}\t{}", hash, count)?;
        }

        // Flush explicitly so a write error is reported instead of being
        // swallowed when the writer is dropped.
        writer.flush()?;
        Ok(Vec::new()) // Nothing to return to Python in file mode
    } else {
        Ok(hash_count_pairs)
    }
}

/// Calculates the frequency histogram for k-mer counts
/// Returns a vector of tuples (frequency, count), where 'frequency' is
Expand Down
165 changes: 165 additions & 0 deletions src/python/tests/test_dump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import pytest
import tempfile
from os import remove
from oxli import KmerCountTable


@pytest.fixture
def kmer_count_table():
    """Provide a ksize=4 KmerCountTable pre-loaded with a few sample k-mers."""
    sample_kmers = (
        "AAAA",  # 17832910516274425539
        "TTTT",  # 17832910516274425539
        "AATT",  # 382727017318141683
        "GGGG",  # 73459868045630124
        "GGGG",  # 73459868045630124
    )
    table = KmerCountTable(ksize=4)
    for kmer in sample_kmers:
        table.count(kmer)
    return table


@pytest.fixture
def empty_kmer_count_table():
    """Provide a ksize=4 KmerCountTable in which nothing has been counted."""
    table = KmerCountTable(ksize=4)
    return table


def test_dump_conflicting_sort_options(kmer_count_table):
    """Requesting both sortcounts and sortkeys at once must raise a ValueError."""
    expected_message = "Cannot sort by both counts and keys at the same time."
    with pytest.raises(ValueError, match=expected_message):
        kmer_count_table.dump(file=None, sortcounts=True, sortkeys=True)


def test_dump_no_sorting(kmer_count_table):
    """With both sort flags off, dump() must match the table's iteration order."""
    dumped = kmer_count_table.dump(file=None, sortcounts=False, sortkeys=False)

    # The reference ordering is whatever iterating the table yields, e.g.
    # [(17832910516274425539, 2), (382727017318141683, 1), (73459868045630124, 2)]
    iteration_order = list(kmer_count_table)

    assert (
        dumped == iteration_order
    ), f"Expected {iteration_order}, but got {dumped}"


def test_dump_sortcounts_with_ties(kmer_count_table):
    """With sortcounts=True, equal counts must be ordered by ascending hash."""
    # Ascending count first; the two count-2 entries are then ordered by hash.
    by_count_then_hash = [
        (382727017318141683, 1),  # 'AATT'
        (73459868045630124, 2),  # 'GGGG' (lower hash than 'AAAA')
        (17832910516274425539, 2),  # 'AAAA'/'TTTT'
    ]

    dumped = kmer_count_table.dump(file=None, sortcounts=True, sortkeys=False)

    assert (
        dumped == by_count_then_hash
    ), f"Expected {by_count_then_hash}, but got {dumped}"


def test_dump_single_kmer():
    """A table holding one counted k-mer must dump exactly one (hash, count) pair."""
    table = KmerCountTable(ksize=4)
    table.count("AAAA")  # Hash for 'AAAA'/'TTTT'

    dumped = table.dump(file=None, sortcounts=True, sortkeys=False)

    only_pair = [(17832910516274425539, 1)]  # 'AAAA'/'TTTT'
    assert dumped == only_pair, f"Expected {only_pair}, but got {dumped}"


def test_dump_write_to_file(kmer_count_table):
    """Test the dump function when writing to a file.

    Checks that the hash/count pairs are written one per line, tab-delimited,
    in count-then-hash order when sortcounts=True is requested.
    """
    # delete=False: the file must outlive this `with` block so that dump() can
    # write to it by path and the test can reopen it afterwards.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file_path = temp_file.name

    try:
        kmer_count_table.dump(file=temp_file_path, sortcounts=True, sortkeys=False)

        with open(temp_file_path, "r") as f:
            lines = f.readlines()

        # Expected output sorted by count, then by hash (sortcounts=True)
        expected_lines = [
            f"{382727017318141683}\t1\n",  # 'AATT'
            f"{73459868045630124}\t2\n",  # 'GGGG'
            f"{17832910516274425539}\t2\n",  # 'AAAA'/'TTTT'
        ]

        assert lines == expected_lines, f"Expected {expected_lines}, but got {lines}"
    finally:
        # Always clean up, even when dump() raises or the assertion fails.
        remove(temp_file_path)


def test_dump_write_to_file_sortkeys(kmer_count_table):
    """Test the dump function with sortkeys=True when writing to a file.

    Checks that the lines written to the file are in ascending hash order.
    """
    # delete=False: the file must outlive this `with` block so that dump() can
    # write to it by path and the test can reopen it afterwards.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file_path = temp_file.name

    try:
        kmer_count_table.dump(file=temp_file_path, sortkeys=True)

        with open(temp_file_path, "r") as f:
            lines = f.readlines()

        # Expected output sorted by hash keys
        expected_lines = [
            f"{73459868045630124}\t2\n",  # 'GGGG'
            f"{382727017318141683}\t1\n",  # 'AATT'
            f"{17832910516274425539}\t2\n",  # 'AAAA'/'TTTT'
        ]

        assert lines == expected_lines, f"Expected {expected_lines}, but got {lines}"
    finally:
        # Always clean up, even when dump() raises or the assertion fails.
        remove(temp_file_path)


def test_dump_sortkeys(kmer_count_table):
    """With sortkeys=True the returned pairs must be in ascending hash order."""
    by_hash = [
        (73459868045630124, 2),  # 'GGGG'
        (382727017318141683, 1),  # 'AATT'
        (17832910516274425539, 2),  # 'AAAA'/'TTTT'
    ]

    dumped = kmer_count_table.dump(file=None, sortkeys=True)

    assert dumped == by_hash, f"Expected {by_hash}, but got {dumped}"


def test_dump_invalid_file_path(kmer_count_table):
    """dump() must raise an OSError when given a path that cannot be created."""
    unusable_path = ""  # an empty string is not a valid file path
    with pytest.raises(OSError):
        kmer_count_table.dump(file=unusable_path, sortkeys=True)


def test_dump_hash_empty_table(empty_kmer_count_table):
    """Test the dump function on an empty KmerCountTable.

    Checks both output modes: the returned list must be empty, and a file
    written from an empty table must contain no lines.
    """
    # Test that calling dump without file returns an empty list
    result = empty_kmer_count_table.dump(file=None, sortkeys=False)
    assert result == [], "Expected an empty list from an empty KmerCountTable"

    # Test that calling dump with a file writes nothing to the file.
    # delete=False: the file must outlive this `with` block so that dump() can
    # write to it by path and the test can reopen it afterwards.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file_path = temp_file.name

    try:
        empty_kmer_count_table.dump(file=temp_file_path, sortkeys=False)

        with open(temp_file_path, "r") as f:
            lines = f.readlines()

        assert lines == [], "Expected an empty file for an empty KmerCountTable"
    finally:
        # Always clean up, even when dump() raises or the assertion fails.
        remove(temp_file_path)
Loading