Skip to content

Commit

Permalink
Make termsim matrix positive definite even with negative similarities (#2397)
Browse files Browse the repository at this point in the history

* Make termsim matrix positive definite even with negative similarities

* Test that termsim matrix with negative similarities is positive definite

* Use `or` instead of `{...}` for documenting a union of types
  • Loading branch information
Witiko authored and mpenkov committed May 4, 2019
1 parent ce0af20 commit 18bcd11
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 11 deletions.
6 changes: 3 additions & 3 deletions gensim/similarities/termsim.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ class SparseTermSimilarityMatrix(SaveLoad):
strict column diagonal dominance. Positive definiteness is a necessary precondition if you
later wish to derive a change-of-basis matrix from the term similarity matrix using Cholesky
factorization.
nonzero_limit : {int, None}, optional
nonzero_limit : int or None, optional
The maximum number of non-zero elements outside the diagonal in a single column of the
sparse term similarity matrix. If None, then no limit will be imposed.
dtype : numpy.dtype, optional
Expand Down Expand Up @@ -242,11 +242,11 @@ def __init__(self, source, dictionary=None, tfidf=None, symmetric=True, positive
key=lambda x: (lambda term_index, _: (tfidf.idfs[term_index], -term_index))(*x), reverse=True)

for row_number, (t2_index, similarity) in zip(range(num_rows), rows):
if positive_definite and column_sum[t1_index] + similarity >= 1.0:
if positive_definite and column_sum[t1_index] + abs(similarity) >= 1.0:
break
if symmetric:
if column_nonzero[t2_index] <= nonzero_limit \
and (not positive_definite or column_sum[t2_index] + similarity < 1.0) \
and (not positive_definite or column_sum[t2_index] + abs(similarity) < 1.0) \
and not (t1_index, t2_index) in matrix:
matrix[t1_index, t2_index] = similarity
column_nonzero[t1_index] += 1
Expand Down
17 changes: 9 additions & 8 deletions gensim/test/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -800,21 +800,22 @@ def test_symmetric(self):

def test_positive_definite(self):
"""Test the positive_definite parameter of the matrix constructor."""
negative_index = UniformTermSimilarityIndex(self.dictionary, term_similarity=-0.5)
matrix = SparseTermSimilarityMatrix(
self.index, self.dictionary, nonzero_limit=2).matrix.todense()
negative_index, self.dictionary, nonzero_limit=2).matrix.todense()
expected_matrix = numpy.array([
[1.0, 0.5, 0.5, 0.0, 0.0],
[0.5, 1.0, 0.0, 0.5, 0.0],
[0.5, 0.0, 1.0, 0.0, 0.0],
[0.0, 0.5, 0.0, 1.0, 0.0],
[1.0, -.5, -.5, 0.0, 0.0],
[-.5, 1.0, 0.0, -.5, 0.0],
[-.5, 0.0, 1.0, 0.0, 0.0],
[0.0, -.5, 0.0, 1.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 1.0]])
self.assertTrue(numpy.all(expected_matrix == matrix))

matrix = SparseTermSimilarityMatrix(
self.index, self.dictionary, nonzero_limit=2, positive_definite=True).matrix.todense()
negative_index, self.dictionary, nonzero_limit=2, positive_definite=True).matrix.todense()
expected_matrix = numpy.array([
[1.0, 0.5, 0.0, 0.0, 0.0],
[0.5, 1.0, 0.0, 0.0, 0.0],
[1.0, -.5, 0.0, 0.0, 0.0],
[-.5, 1.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 1.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 1.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 1.0]])
Expand Down

0 comments on commit 18bcd11

Please sign in to comment.