diff --git a/Cargo.lock b/Cargo.lock index 394a7bb..26511ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -263,6 +263,31 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + [[package]] name = "csv" version = "1.3.0" @@ -728,8 +753,9 @@ dependencies = [ "log", "niffler", "pyo3", - "serde", + "rayon", "serde_json", + "serde", "sourmash", ] @@ -950,6 +976,26 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "regex" version = "1.10.6" diff --git a/Cargo.toml b/Cargo.toml index aff50d4..326db1a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ env_logger = "0.11.5" log = "0.4.22" niffler = "2.6.0" pyo3 = { version="0.22.3", features = ["extension-module", "anyhow"] } +rayon = "1.10.0" serde = { version = "1.0.210", features = ["derive"] } serde_json = "1.0.128" sourmash = "0.15.1" \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 0e472df..095c9bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,5 +35,6 @@ features = ["pyo3/extension-module"] [project.optional-dependencies] test = [ "pytest>=7.0", - "toml>=0.10" + "toml>=0.10", + "scipy" ] diff --git a/src/lib.rs b/src/lib.rs index 66e1b9d..0f30ba3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,6 +13,7 @@ use niffler::get_writer; use pyo3::exceptions::{PyIOError, PyValueError}; use pyo3::prelude::*; use pyo3::PyResult; +use rayon::prelude::*; use serde::{Deserialize, Serialize}; use sourmash::encodings::HashFunctions; use sourmash::signature::SeqToHashes; @@ -498,6 +499,68 @@ impl KmerCountTable { self.counts.insert(hashval, count); Ok(()) } + + /// Calculates the Jaccard Similarity Coefficient between two KmerCountTable objects. + /// # Returns + /// The Jaccard Similarity Coefficient between the two tables as a float value between 0 and 1. + pub fn jaccard(&self, other: &KmerCountTable) -> f64 { + // Get the intersection of the two k-mer sets. + let intersection_size = self.intersection(other).len(); + + // Get the union of the two k-mer sets. + let union_size = self.union(other).len(); + + // Handle the case where the union is empty (both sets are empty). + if union_size == 0 { + return 1.0; // By convention, two empty sets are considered identical. + } + + // Calculate and return the Jaccard similarity as a ratio of intersection to union. + intersection_size as f64 / union_size as f64 + } + + /// Cosine similarity between two `KmerCountTable` objects. + /// # Returns + /// The cosine similarity between the two tables as a float value between 0 and 1. + pub fn cosine(&self, other: &KmerCountTable) -> f64 { + // Early return if either table is empty. + if self.counts.is_empty() || other.counts.is_empty() { + return 0.0; + } + + // Calculate the dot product in parallel. + let dot_product: u64 = self + .counts + .par_iter() + .filter_map(|(&hash, &count1)| { + // Only include in the dot product if both tables have the k-mer. + other.counts.get(&hash).map(|&count2| count1 * count2) + }) + .sum(); + + // Calculate magnitudes in parallel for both tables. + let magnitude_self: f64 = self + .counts + .par_iter() + .map(|(_, v)| (*v as f64).powi(2)) // Access the value, square it + .sum::() + .sqrt(); + + let magnitude_other: f64 = other + .counts + .par_iter() + .map(|(_, v)| (*v as f64).powi(2)) // Access the value, square it + .sum::() + .sqrt(); + + // If either magnitude is zero (no k-mers), return 0 to avoid division by zero. + if magnitude_self == 0.0 || magnitude_other == 0.0 { + return 0.0; + } + + // Calculate and return cosine similarity. + dot_product as f64 / (magnitude_self * magnitude_other) + } } #[pyclass] diff --git a/src/python/tests/test_metrics.py b/src/python/tests/test_metrics.py new file mode 100644 index 0000000..c8323db --- /dev/null +++ b/src/python/tests/test_metrics.py @@ -0,0 +1,271 @@ +from math import isclose +import numpy as np +import pytest +from scipy.spatial.distance import cosine + +from oxli import KmerCountTable + +# Cosine similarity tests + + +def test_cosine_similarity_identical_tables(): + """ + Test cosine similarity for two identical KmerCountTable objects. + + The cosine similarity should be 1.0 because the vectors representing + the k-mer counts are exactly the same, meaning the angle between them + is 0 degrees (cos(0) = 1). + """ + kct1 = KmerCountTable(ksize=4) + kct2 = KmerCountTable(ksize=4) + + # Manually set k-mer counts + kct1["AAAA"] = 5 + kct1["AATT"] = 3 + kct1["GGGG"] = 1 + kct1["CCAA"] = 4 + kct1["ATTG"] = 6 + + # Copy the exact same counts to kct2 + kct2["AAAA"] = 5 + kct2["AATT"] = 3 + kct2["GGGG"] = 1 + kct2["CCAA"] = 4 + kct2["ATTG"] = 6 + + # Cosine similarity between identical tables should be 1.0 + # Allow value within 0.001% + assert isclose(kct1.cosine(kct2), 1.0, rel_tol=1e-5) + assert isclose(kct2.cosine(kct1), 1.0, rel_tol=1e-5) + + # Using scipy to calculate the expected value + vector1 = [5, 3, 1, 4, 6] + vector2 = [5, 3, 1, 4, 6] + expected_cosine_sim = 1 - cosine(vector1, vector2) + + # Allow value within 0.001% + assert isclose(kct1.cosine(kct2), expected_cosine_sim, rel_tol=1e-5) + assert isclose(kct2.cosine(kct1), expected_cosine_sim, rel_tol=1e-5) + + +def test_cosine_similarity_different_tables(): + """ + Test cosine similarity for two different KmerCountTable objects. + + The cosine similarity will be less than 1.0 since the vectors + are not identical. We will calculate the expected cosine + similarity using scipy. + """ + kct1 = KmerCountTable(ksize=4) + kct2 = KmerCountTable(ksize=4) + + # Manually set k-mer counts for kct1 + kct1["AAAA"] = 4 + kct1["AATT"] = 3 + kct1["GGGG"] = 1 + kct1["CCAA"] = 4 + kct1["ATTG"] = 6 + + # Manually set different counts for kct2 + kct2["AAAA"] = 5 + kct2["AATT"] = 3 + kct2["GGGG"] = 1 + kct2["CCAA"] = 4 + kct2["ATTG"] = 0 + + # Using scipy to calculate the expected value + vector1 = [4, 3, 1, 4, 6] + vector2 = [5, 3, 1, 4, 0] + expected_cosine_sim = 1 - cosine(vector1, vector2) + + # Allow value within 0.001% + assert isclose(kct1.cosine(kct2), expected_cosine_sim, rel_tol=1e-5) + assert isclose(kct2.cosine(kct1), expected_cosine_sim, rel_tol=1e-5) + + +def test_cosine_similarity_empty_table(): + """ + Test cosine similarity for two KmerCountTable objects where one is empty. + + The cosine similarity should be 0.0 because the dot product with an + empty table will result in zero, making the numerator of the cosine + similarity formula zero. + """ + kct1 = KmerCountTable(ksize=4) + kct2 = KmerCountTable(ksize=4) + + # Set counts for kct1 + kct1["AAAA"] = 5 + kct1["TTTG"] = 10 + + # Leave kct2 empty + + # Cosine similarity should be 0 since one table is empty + assert kct1.cosine(kct2) == 0.0 + + # Set kct2 with 1 non-overlapping kmer + kct2["ATTG"] = 1 + + # Cosine similarity should be 0 since no shared kmers + assert kct1.cosine(kct2) == 0.0 + + # Using scipy for comparison + vector1 = [5, 10, 0] + vector2 = [0, 0, 1] # Representing the empty table with no overlap + expected_cosine_sim = 1 - cosine(vector1, vector2) + + assert isclose(kct1.cosine(kct2), expected_cosine_sim, rel_tol=1e-5) + assert isclose(kct2.cosine(kct1), expected_cosine_sim, rel_tol=1e-5) + + +def test_cosine_similarity_both_empty(): + """ + Test cosine similarity for two empty KmerCountTable objects. + """ + # Both tables are empty + kct1 = KmerCountTable(ksize=4) + kct2 = KmerCountTable(ksize=4) + + # Cosine similarity should be 0.0 for two empty tables + assert kct1.cosine(kct2) == 0.0 + assert kct2.cosine(kct1) == 0.0 + + +def test_cosine_similarity_partial_overlap(): + """ + Test cosine similarity for two KmerCountTable objects with partial overlap in k-mers. + + The cosine similarity should be less than 1.0 but greater than 0.0 because + the tables have overlapping k-mers, but their counts differ. + """ + kct1 = KmerCountTable(ksize=4) + kct2 = KmerCountTable(ksize=4) + + # Manually set k-mer counts for kct1 + # kct1["AAAA"] = 0 # Not in kct2 + kct1["AATT"] = 3 + kct1["GGGG"] = 1 + kct1["CCAA"] = 4 + kct1["ATTG"] = 0 + kct1["AGAT"] = 0 # Set but not in either + + # Manually set k-mer counts for kct2 + kct2["AAAA"] = 5 + kct2["AATT"] = 4 # Diff value to kct1 + kct2["GGGG"] = 1 + kct2["CCAA"] = 4 + kct2["ATTG"] = 1 # Not in kct1 + + # Using scipy for comparison + vector1 = [0, 3, 1, 4, 0, 0] + vector2 = [5, 4, 1, 4, 1, 0] + expected_cosine_sim = 1 - cosine(vector1, vector2) + + # Cosine similarity is expected to be > 0 but < 1 + assert isclose(kct1.cosine(kct2), expected_cosine_sim, rel_tol=1e-5) + assert isclose(kct2.cosine(kct1), expected_cosine_sim, rel_tol=1e-5) + + +# Jaccard coefficient similarity tests + + +def test_jaccard_similarity_identical_tables(): + """ + Test Jaccard similarity for two identical KmerCountTable objects. + + The Jaccard similarity should be 1.0 because both tables contain exactly the same k-mers. + """ + kct1 = KmerCountTable(ksize=4) + kct2 = KmerCountTable(ksize=4) + + # Manually set identical k-mer counts for both tables + kct1["AAAA"] = 5 + kct1["TTTC"] = 2 + kct1["AATT"] = 3 + kct1["GGGG"] = 1 + + kct2["AAAA"] = 5 + kct2["TTTC"] = 2 + kct2["AATT"] = 3 + kct2["GGGG"] = 1 + + # Jaccard similarity should be 1.0 for identical sets + assert kct1.jaccard(kct2) == 1.0 + assert kct2.jaccard(kct1) == 1.0 + + +def test_jaccard_similarity_different_tables(): + """ + Test Jaccard similarity for two KmerCountTable objects with different k-mers. + + The Jaccard similarity will be less than 1.0 because the sets of k-mers differ. + """ + kct1 = KmerCountTable(ksize=4) + kct2 = KmerCountTable(ksize=4) + + # Set different k-mer counts for both tables + kct1["AAAA"] = 5 + kct1["TTTC"] = 2 + + kct2["AATT"] = 3 + kct2["GGGG"] = 4 + + # Expected result: 0 overlap between the sets + assert kct1.jaccard(kct2) == 0.0 + assert kct2.jaccard(kct1) == 0.0 + + +def test_jaccard_similarity_partial_overlap(): + """ + Test Jaccard similarity for two KmerCountTable objects with partial overlap in k-mers. + + The Jaccard similarity should be greater than 0.0 but less than 1.0 because there are overlapping k-mers. + """ + kct1 = KmerCountTable(ksize=4) + kct2 = KmerCountTable(ksize=4) + + # Set k-mer counts for kct1 + kct1["AAAA"] = 5 + kct1["AATT"] = 1 + kct1["TTTC"] = 2 + + # Set k-mer counts for kct2 + kct2["AAAA"] = 2 + kct2["AATT"] = 1 + kct2["GGGG"] = 4 + + # Calculate expected Jaccard similarity: intersection {AAAA, AATT}, union {AAAA, TTTT, AATT, GGGG} + assert kct1.jaccard(kct2) == 2 / 4 + assert kct2.jaccard(kct1) == 2 / 4 + + +def test_jaccard_similarity_empty_table(): + """ + Test Jaccard similarity for two KmerCountTable objects where one is empty. + + The Jaccard similarity should be 0.0 because one set is empty, and the union is non-empty. + """ + kct1 = KmerCountTable(ksize=4) + kct2 = KmerCountTable(ksize=4) + + # Set counts for kct1 + kct1["AAAA"] = 5 + kct1["TTTC"] = 5 + + # kct2 is empty + assert kct1.jaccard(kct2) == 0.0 + assert kct2.jaccard(kct1) == 0.0 + + +def test_jaccard_similarity_both_empty(): + """ + Test Jaccard similarity for two empty KmerCountTable objects. + + The Jaccard similarity should be 1.0 because both sets are empty, and thus identical. + """ + kct1 = KmerCountTable(ksize=4) + kct2 = KmerCountTable(ksize=4) + + # Both tables are empty + assert kct1.jaccard(kct2) == 1.0 + assert kct2.jaccard(kct1) == 1.0