From dfd7da4965d5040ba277a3c7b7b7d62cdc71f7d2 Mon Sep 17 00:00:00 2001 From: Shiva Manne Date: Thu, 22 Jun 2017 15:01:58 +0530 Subject: [PATCH] Add sparse input support in interfaces_getitem when num_best is not None. Fix #1294 (#1321) * added any2sparse_clipped() function * changed full2sparse_clipped to any2sparse_clipped in __getitem__ * added missing whitespace * return topn from any2sparse_clipped() * efficient any2sparse_clipped implementation * added unit test for any2sparse_clipped * function call corrected * removed any2sparse_clipped and added scipy2scipy_clipped * added new code path for maintain_sparsity * added unit tests for new function and issue * fixed flake8 errors * fixed matrix_indptr * added requested changes * replaced hasattr with getattr * call abs() once for entire matrix in scipy2scipy_clipped * removed matrix.sort_indices and removed indptr while calling argsort --- gensim/interfaces.py | 5 +++++ gensim/matutils.py | 37 ++++++++++++++++++++++++++++++++ gensim/test/test_similarities.py | 31 ++++++++++++++++++++++++++ 3 files changed, 73 insertions(+) diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 530fab398b..58e6f45b13 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -222,6 +222,11 @@ def __getitem__(self, query): if self.num_best is None: return result + # if maintain_sparsity is True, result is scipy sparse. Sort, clip the + # topn and return as a scipy sparse matrix. 
def scipy2scipy_clipped(matrix, topn, eps=1e-9):
    """
    Return a scipy.sparse CSR vector/matrix that keeps, in every row of `matrix`, only the
    `topn` elements of the greatest magnitude (absolute value).

    `matrix` may be a sparse row vector (shape `(1, n)`) or a sparse matrix, in any
    scipy.sparse format; it is never modified. The result always has the same shape as
    `matrix`. Within each row the kept elements are stored in the order produced by
    `argsort(..., reverse=True)` (largest magnitude first), so the column indices of the
    result are deliberately left unsorted.

    If `topn` is not positive, or `matrix` stores no elements, an empty matrix of the same
    shape is returned. `eps` is accepted for signature parity with the other converters in
    this module (`any2sparse`, `scipy2sparse`) but is not used here.

    Raise ValueError if `matrix` is not a scipy sparse matrix.
    """
    if not scipy.sparse.issparse(matrix):
        # Wrap in a 1-tuple so that a tuple argument cannot break the %-formatting itself.
        raise ValueError("'%s' is not a scipy sparse vector." % (matrix,))

    # `indices`/`indptr` only describe rows in CSR layout (for CSC they describe columns,
    # COO has neither), so normalise the format first. This is a no-op for CSR input.
    matrix = matrix.tocsr()
    if not matrix.has_canonical_format:
        # Merge duplicate entries so every element is ranked by its total value. Work on a
        # copy: sum_duplicates() is in-place and the caller's matrix must stay untouched.
        matrix = matrix.copy()
        matrix.sum_duplicates()

    if topn <= 0 or matrix.nnz == 0:
        # Nothing to keep; also avoids concatenating/inspecting empty arrays below.
        return scipy.sparse.csr_matrix(matrix.shape, dtype=matrix.dtype)

    # Taking abs() of the whole data array once is cheaper than doing it row by row.
    magnitudes = np.abs(matrix.data)
    clipped_indices = []
    clipped_data = []
    clipped_indptr = [0]
    # Consecutive indptr values delimit the slice of `indices`/`data` belonging to one row.
    for start, end in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        # Positions (relative to the row slice) of at most `topn` largest magnitudes.
        biggest = argsort(magnitudes[start:end], topn, reverse=True)
        clipped_indices.append(matrix.indices[start:end].take(biggest))
        clipped_data.append(matrix.data[start:end].take(biggest))
        clipped_indptr.append(clipped_indptr[-1] + len(biggest))

    # Pass the shape explicitly: letting it be derived from the largest surviving column
    # index would make the width data-dependent and fail when nothing survives. The
    # constructor keeps indices/data in the given (magnitude) order.
    return scipy.sparse.csr_matrix(
        (np.concatenate(clipped_data), np.concatenate(clipped_indices), clipped_indptr),
        shape=matrix.shape
    )
def testMaintainSparsityWithNumBest(self):
    """Check that top-n results stay sparse when maintain_sparsity=True and num_best is set."""
    feature_count = len(dictionary)

    # Query the same corpus through a dense index first, then through a
    # sparsity-preserving one, keeping every other setting identical.
    top_sims = {}
    for keep_sparse in (False, True):
        index = self.cls(corpus, num_features=feature_count, maintain_sparsity=keep_sparse, num_best=3)
        top_sims[keep_sparse] = index[corpus]
    dense_topn_sims = top_sims[False]
    scipy_topn_sims = top_sims[True]

    # Only the sparsity-preserving index may hand back a scipy sparse matrix.
    self.assertFalse(scipy.sparse.issparse(dense_topn_sims))
    self.assertTrue(scipy.sparse.issparse(scipy_topn_sims))

    # Row by row, the sparse result must carry the same (index, value) pairs as the dense one.
    sparse_rows = [matutils.scipy2sparse(row) for row in scipy_topn_sims]
    self.assertEqual(dense_topn_sims, sparse_rows)