Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Rust] Update sparse matrix #405

Merged
merged 2 commits into from
Nov 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 29 additions & 21 deletions rust/src/similarities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,30 @@ use std::time::Instant;
use fxhash::FxHashMap;
use pyo3::PyResult;

use crate::sparse::CsrMatrix;

// Budget from which `invert_cosine` derives its block step:
// `step = ceil(MAX_BLOCK_SIZE / n_x)`.
const MAX_BLOCK_SIZE: i64 = 200_000_000;

/// Returns, for every row index in `0..num`, the sum of the squared values
/// stored in that row of `interactions`.
///
/// Rows for which `CsrMatrix::get_row` yields `None` keep the value `0.0`.
pub(crate) fn compute_sum_squares(interactions: &CsrMatrix, num: usize) -> Vec<f32> {
    (0..num)
        .map(|row_idx| match interactions.get_row(row_idx) {
            Some(entries) => entries.map(|(_, &value)| value * value).sum::<f32>(),
            None => 0.0,
        })
        .collect()
}

/// Divide `n_x` into several blocks to avoid huge memory consumption.
pub(crate) fn invert_cosine(
indices: &[i32],
indptr: &[usize],
data: &[f32],
interactions: &CsrMatrix,
sum_squares: &[f32],
cum_values: &mut FxHashMap<i32, (i32, i32, f32, usize)>,
n_x: usize,
n_y: usize,
min_common: usize,
) -> PyResult<Vec<(i32, i32, f32)>> {
let (indices, indptr, data) = interactions.values();
let start = Instant::now();
let mut cosine_sims: Vec<(i32, i32, f32)> = Vec::new();
let step = (MAX_BLOCK_SIZE as f64 / n_x as f64).ceil() as usize;
Expand All @@ -37,13 +48,6 @@ pub(crate) fn invert_cosine(
let value = data[i] * data[j];
prods[index] += value;
counts[index] += 1;
// cum_values
// .entry(key)
// .and_modify(|(.., v, c)| {
// *v += value;
// *c += 1;
// })
// .or_insert((x1, x2, value, 1));
}
}
}
Expand All @@ -55,38 +59,42 @@ pub(crate) fn invert_cosine(
let prod = prods[index];
let sq1 = sum_squares[x1];
let sq2 = sum_squares[x2];
let cosine = if prod == 0.0 || sq1 == 0.0 || sq2 == 0.0 {
0.0
} else {
let norm = sq1.sqrt() * sq2.sqrt();
prod / norm
};
let key = i32::try_from(x1 * n_x + x2)?;
let x1 = i32::try_from(x1)?;
let x2 = i32::try_from(x2)?;
let count = counts[index];
if count >= min_common {
let cosine = if prod == 0.0 || sq1 == 0.0 || sq2 == 0.0 {
0.0
} else {
let norm = sq1.sqrt() * sq2.sqrt();
prod / norm
};
cosine_sims.push((x1, x2, cosine));
}
if count > 0 {
cum_values.insert(key, (x1, x2, cosine, count));
cum_values.insert(key, (x1, x2, prod, count));
}
}
}
}
let duration = start.elapsed();
println!("cosine sim: {} elapsed: {:.4?}", cosine_sims.len(), duration);
println!(
"cosine sim: {} elapsed: {:.4?}",
cosine_sims.len(),
duration
);
Ok(cosine_sims)
}

pub(crate) fn sort_by_sims(
n_x: usize,
cosine_sims: Vec<(i32, i32, f32)>,
cosine_sims: &[(i32, i32, f32)],
sim_mapping: &mut FxHashMap<i32, (Vec<i32>, Vec<f32>)>,
) -> PyResult<()> {
let start = Instant::now();
let mut agg_sims: Vec<Vec<(i32, f32)>> = vec![Vec::new(); n_x];
for (x1, x2, sim) in cosine_sims {
for &(x1, x2, sim) in cosine_sims {
agg_sims[usize::try_from(x1)?].push((x2, sim));
agg_sims[usize::try_from(x2)?].push((x1, sim));
}
Expand All @@ -103,7 +111,7 @@ pub(crate) fn sort_by_sims(
sim_mapping.insert(i32::try_from(i)?, (neighbors, sims));
}
let duration = start.elapsed();
println!("sort elapsed: {:.4?}", duration);
println!("sort elapsed: {duration:.4?}");
Ok(())
}

Expand Down
179 changes: 178 additions & 1 deletion rust/src/sparse.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,182 @@
pub struct SparseMatrix<T = i32, U = f32> {
use std::hash::Hash;

use fxhash::FxHashMap;
use pyo3::types::PyList;
use pyo3::PyResult;

/// Builds a `CsrMatrix` from the three Python lists describing it.
///
/// The lists are extracted in the order `sparse_indices` (as `Vec<i32>`),
/// `sparse_indptr` (as `Vec<usize>`), `sparse_data` (as `Vec<f32>`).
///
/// # Errors
///
/// Propagates the error of the first extraction that fails.
pub(crate) fn construct_csr_matrix(
    sparse_indices: &PyList,
    sparse_indptr: &PyList,
    sparse_data: &PyList,
) -> PyResult<CsrMatrix> {
    let indices: Vec<i32> = sparse_indices.extract()?;
    let indptr: Vec<usize> = sparse_indptr.extract()?;
    let data: Vec<f32> = sparse_data.extract()?;
    Ok(CsrMatrix {
        indices,
        indptr,
        data,
    })
}

/// Analogue of `scipy.sparse.csr_matrix`
/// https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
///
/// The entries of row `i` live in `indices[indptr[i]..indptr[i + 1]]` and in
/// the same range of `data`.
#[derive(Debug, Clone, PartialEq)]
pub struct CsrMatrix<T = i32, U = f32> {
    /// Index stored alongside each value, laid out row after row.
    pub indices: Vec<T>,
    /// Row offsets into `indices` / `data`; a matrix with `n` rows has `n + 1` offsets.
    pub indptr: Vec<usize>,
    /// Stored values, parallel to `indices`.
    pub data: Vec<U>,
}

impl<T: Copy + Eq + Hash + Ord, U: Copy> CsrMatrix<T, U> {
pub fn values(&self) -> (&[T], &[usize], &[U]) {
(&self.indices, &self.indptr, &self.data)
}

#[inline]
pub fn n_rows(&self) -> usize {
self.indptr.len() - 1
}

pub fn get_row(&self, i: usize) -> Option<impl Iterator<Item = (&T, &U)>> {
if i >= self.n_rows() {
return None;
}
let start = self.indptr[i];
let end = self.indptr[i + 1];
if start == end {
None
} else {
Some(self.index_iter(start, end))
}
}

fn index_iter(&self, start: usize, end: usize) -> impl Iterator<Item = (&T, &U)> {
let indices = self.indices[start..end].iter();
let data = self.data[start..end].iter();
indices.zip(data)
}

fn to_dok(&self, n_rows: Option<usize>) -> DokMatrix<T, U> {
let mut data = Vec::new();
let n_rows = n_rows.unwrap_or_else(|| self.n_rows());
for i in 0..n_rows {
if let Some(row) = self.get_row(i) {
data.push(FxHashMap::from_iter(row.map(|(idx, dat)| (*idx, *dat))))
} else {
data.push(FxHashMap::default());
}
}
DokMatrix { data }
}

pub fn add(
this: &CsrMatrix<T, U>,
other: &CsrMatrix<T, U>,
n_rows: Option<usize>,
) -> CsrMatrix<T, U> {
let mut dok_matrix = this.to_dok(n_rows);
dok_matrix.add(other).to_csr()
}
}

/// Analogue of `scipy.sparse.dok_matrix`
/// https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.dok_matrix.html
///
/// Stored row by row: `data[i]` maps each index present in row `i` to its value.
pub struct DokMatrix<T = i32, U = f32> {
    data: Vec<FxHashMap<T, U>>,
}

impl<T, U> DokMatrix<T, U>
where
    T: Copy + Eq + Hash + Ord,
    U: Copy,
{
    /// Writes every entry of `other` into this matrix, row by row.
    ///
    /// An entry that already exists at the same `(row, index)` position is
    /// replaced by the value from `other`; values are not summed.
    ///
    /// # Panics
    ///
    /// Panics with an index-out-of-bounds error if `other` has a non-empty
    /// row at a position this matrix has no row for.
    fn add(&mut self, other: &CsrMatrix<T, U>) -> &Self {
        for i in 0..other.n_rows() {
            if let Some(row) = other.get_row(i) {
                // Deliberately unchecked indexing: an undersized target is a
                // caller error and must surface as a panic.
                let mapping = &mut self.data[i];
                for (idx, dat) in row {
                    mapping.insert(*idx, *dat);
                }
            }
        }
        self
    }

    /// Converts to CSR form with the entries of each row sorted by index.
    ///
    /// Every row, including empty ones, contributes one offset to `indptr`,
    /// so the result has exactly `self.data.len()` rows and row positions are
    /// preserved. (Skipping empty rows would shift all following rows.)
    fn to_csr(&self) -> CsrMatrix<T, U> {
        let mut indices: Vec<T> = Vec::new();
        let mut indptr: Vec<usize> = vec![0];
        let mut data: Vec<U> = Vec::new();
        for row in self.data.iter() {
            let mut entries: Vec<(T, U)> = row.iter().map(|(&idx, &dat)| (idx, dat)).collect();
            // Hash-map iteration order is arbitrary; CSR rows are emitted sorted.
            entries.sort_unstable_by_key(|&(idx, _)| idx);
            for (idx, dat) in entries {
                indices.push(idx);
                data.push(dat);
            }
            // An empty row repeats the previous offset.
            indptr.push(indices.len());
        }
        CsrMatrix {
            indices,
            indptr,
            data,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building an integer-valued CSR fixture.
    fn csr(indices: Vec<i32>, indptr: Vec<usize>, data: Vec<i32>) -> CsrMatrix<i32, i32> {
        CsrMatrix {
            indices,
            indptr,
            data,
        }
    }

    /// Merging overwrites overlapping entries and keeps rows in place.
    #[test]
    fn test_add_sparse_matrix() {
        // [[1, 0, 0], [0, 0, 1]]
        let base = csr(vec![0, 2], vec![0, 1, 2], vec![1, 1]);
        // [[0, 0, 0], [1, 0, 2], [3, 3, 0]]
        let larger = csr(vec![0, 2, 0, 1], vec![0, 0, 2, 4], vec![1, 2, 3, 3]);
        // [[2, 0, 4]]
        let smaller = csr(vec![0, 2], vec![0, 2], vec![2, 4]);

        // [[1, 0, 0], [1, 0, 2], [3, 3, 0]]
        let merged = CsrMatrix::add(&base, &larger, Some(3));
        assert_eq!(merged.indices, vec![0, 0, 2, 0, 1]);
        assert_eq!(merged.indptr, vec![0, 1, 3, 5]);
        assert_eq!(merged.data, vec![1, 1, 2, 3, 3]);

        // [[2, 0, 4], [1, 0, 2], [3, 3, 0]]
        let merged = CsrMatrix::add(&merged, &smaller, Some(3));
        assert_eq!(merged.indices, vec![0, 2, 0, 2, 0, 1]);
        assert_eq!(merged.indptr, vec![0, 2, 4, 6]);
        assert_eq!(merged.data, vec![2, 4, 1, 2, 3, 3]);
    }

    /// A target with too few rows for `other` must panic, not silently drop data.
    #[test]
    #[should_panic(expected = "index out of bounds: the len is 2 but the index is 2")]
    fn test_add_insufficient_size() {
        let too_few_rows = 2;
        // [[1, 0, 0], [0, 0, 1]]
        let base = csr(vec![0, 2], vec![0, 1, 2], vec![1, 1]);
        // [[0, 0, 0], [1, 0, 2], [3, 3, 0]]
        let larger = csr(vec![0, 2, 0, 1], vec![0, 0, 2, 4], vec![1, 2, 3, 3]);
        CsrMatrix::add(&base, &larger, Some(too_few_rows));
    }
}
Loading
Loading