Skip to content

Commit

Permalink
WIP: Add distance metrics (#45)
Browse files Browse the repository at this point in the history
* Add rayon

* add scipy for testing cosine

* Init cosine() method to calc cosine similarity. Use rayon.

* Add cosine() tests

* jaccard tests

* Jaccard Similarity Coefficient

* Style fixes by Ruff

* MRG: update tests for distance metrics to test symmetry, with a bit of paranoia thrown in (#48)

* test symmetric

* Style fixes by Ruff

---------

Co-authored-by: ctb <[email protected]>

---------

Co-authored-by: Adamtaranto <[email protected]>
Co-authored-by: C. Titus Brown <[email protected]>
Co-authored-by: ctb <[email protected]>
  • Loading branch information
4 people authored Sep 23, 2024
1 parent 7b49c89 commit c0ff7fe
Show file tree
Hide file tree
Showing 5 changed files with 384 additions and 2 deletions.
48 changes: 47 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ env_logger = "0.11.5"
log = "0.4.22"
niffler = "2.6.0"
pyo3 = { version="0.22.3", features = ["extension-module", "anyhow"] }
rayon = "1.10.0"
serde = { version = "1.0.210", features = ["derive"] }
serde_json = "1.0.128"
sourmash = "0.15.1"
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,6 @@ features = ["pyo3/extension-module"]
[project.optional-dependencies]
test = [
"pytest>=7.0",
"toml>=0.10"
"toml>=0.10",
"scipy"
]
63 changes: 63 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use niffler::get_writer;
use pyo3::exceptions::{PyIOError, PyValueError};
use pyo3::prelude::*;
use pyo3::PyResult;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use sourmash::encodings::HashFunctions;
use sourmash::signature::SeqToHashes;
Expand Down Expand Up @@ -498,6 +499,68 @@ impl KmerCountTable {
self.counts.insert(hashval, count);
Ok(())
}

/// Jaccard similarity coefficient between this table and `other`.
///
/// The coefficient is the size of `self.intersection(other)` divided by the
/// size of `self.union(other)`.
/// # Returns
/// The Jaccard similarity as a float value between 0 and 1. When the union is
/// empty the result is defined as 1.0.
pub fn jaccard(&self, other: &KmerCountTable) -> f64 {
    // Sizes of the shared and the combined k-mer sets.
    let shared = self.intersection(other).len();
    let combined = self.union(other).len();

    if combined == 0 {
        // An empty union means both tables are empty; by convention two
        // empty sets are treated as identical rather than dividing by zero.
        1.0
    } else {
        shared as f64 / combined as f64
    }
}

/// Cosine similarity between two `KmerCountTable` objects.
///
/// Each table is treated as a vector of counts keyed by hash value. The
/// similarity is the dot product over the hashes present in both tables,
/// divided by the product of the two vectors' Euclidean norms.
///
/// All accumulation is done in `f64`: multiplying or summing the raw integer
/// counts could overflow (a panic in debug builds, a silently wrapped and
/// therefore wrong result in release builds).
/// # Returns
/// The cosine similarity between the two tables as a float value, nominally
/// between 0 and 1. Returns 0.0 if either table is empty or has zero magnitude.
pub fn cosine(&self, other: &KmerCountTable) -> f64 {
    // Early return if either table is empty.
    if self.counts.is_empty() || other.counts.is_empty() {
        return 0.0;
    }

    // Dot product in parallel. Only hashes present in both tables contribute.
    // Counts are converted to f64 *before* multiplying so neither the product
    // nor the running sum can overflow an integer type.
    let dot_product: f64 = self
        .counts
        .par_iter()
        .filter_map(|(hash, &count1)| {
            other
                .counts
                .get(hash)
                .map(|&count2| count1 as f64 * count2 as f64)
        })
        .sum();

    // Euclidean norm of a table's count vector, computed in parallel.
    // A local closure keeps the shared logic in one place without adding a
    // new method to the surrounding impl.
    let magnitude = |table: &KmerCountTable| -> f64 {
        table
            .counts
            .par_iter()
            .map(|(_, &count)| (count as f64).powi(2))
            .sum::<f64>()
            .sqrt()
    };

    let magnitude_self = magnitude(self);
    let magnitude_other = magnitude(other);

    // A zero magnitude (e.g. every stored count is 0) would divide by zero.
    if magnitude_self == 0.0 || magnitude_other == 0.0 {
        return 0.0;
    }

    dot_product / (magnitude_self * magnitude_other)
}
}

#[pyclass]
Expand Down
Loading

0 comments on commit c0ff7fe

Please sign in to comment.