Skip to content

Commit

Permalink
removed l0 norm. added more tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
devashishd12 committed Apr 4, 2016
1 parent a6df0d2 commit 21d295a
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 33 deletions.
16 changes: 4 additions & 12 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,14 +319,14 @@ def veclen(vec):
assert length > 0.0, "sparse documents must not contain any explicit zero entries"
return length

def retvec(vec, length):
def ret_normalized_vec(vec, length):
    """Return `vec` as a list with each value divided by `length`.

    `vec` is an iterable of `(termid, value)` 2-tuples. When `length` is
    exactly 1.0 no division is performed and a plain list copy of `vec` is
    returned instead.
    """
    if length != 1.0:
        return [(termid, val / length) for termid, val in vec]
    else:
        return list(vec)

def isnorm(norm):
if norm not in ('l0', 'l1', 'l2'):
if norm not in ('l1', 'l2'):
raise ValueError("'%s' is not a supported norm" % norm)
else:
return norm
Expand All @@ -346,8 +346,6 @@ def unitvec(vec, norm='l2'):
norm = isnorm(norm)
if scipy.sparse.issparse(vec): # convert scipy.sparse to standard numpy array
vec = vec.tocsr()
if norm == 'l0':
veclen = len(vec.data)
if norm == 'l1':
veclen = numpy.sum(numpy.abs(vec.data))
if norm == 'l2':
Expand All @@ -359,8 +357,6 @@ def unitvec(vec, norm='l2'):

if isinstance(vec, numpy.ndarray):
vec = numpy.asarray(vec, dtype=float)
if norm == 'l0':
veclen = numpy.count_nonzero(vec)
if norm == 'l1':
veclen = numpy.sum(numpy.abs(vec))
if norm == 'l2':
Expand All @@ -376,18 +372,14 @@ def unitvec(vec, norm='l2'):
return vec

if isinstance(first, (tuple, list)) and len(first) == 2: # gensim sparse format?
if norm == 'l0':
length = float(sum(val != 0 for _, val in vec))
assert length > 0.0, "Document contains all zero entries"
return retvec(vec, length)
if norm == 'l1':
length = float(sum(abs(val) for _, val in vec))
assert length > 0.0, "Document contains all zero entries"
return retvec(vec, length)
return ret_normalized_vec(vec, length)
if norm == 'l2':
length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
assert length > 0.0, "sparse documents must not contain any explicit zero entries"
return retvec(vec, length)
return ret_normalized_vec(vec, length)
else:
raise ValueError("unknown input type")

Expand Down
8 changes: 2 additions & 6 deletions gensim/models/normmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
class Normmodel(interfaces.TransformationABC):
"""
Objects of this class realize the explicit normalization of
vectors. Supported norms are 'l0', 'l1' and 'l2' with 'l2' being
vectors. Supported norms are 'l1' and 'l2' with 'l2' being
default.
The main methods are:
Expand All @@ -30,12 +30,8 @@ class Normmodel(interfaces.TransformationABC):
"""
def __init__(self, corpus=None, norm='l2'):
"""
Compute the 'l0', 'l1' or 'l2' normalization by normalizing separately
Compute the 'l1' or 'l2' normalization by normalizing separately
for each doc in a corpus.
Formula for 'l0' norm for term 'i' in document 'j' in a corpus of 'D' documents is::
norml0_{i, j} = (i / (no. of non-zero terms in j))
Formula for 'l1' norm for term 'i' in document 'j' in a corpus of 'D' documents is::
norml1_{i, j} = (i / sum(absolute(values in j)))
Expand Down
41 changes: 26 additions & 15 deletions gensim/test/test_normmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,31 +54,42 @@ def setUp(self):
# doc is [(1, 1.0), (5, 2.0), (8, 1.0)]
self.doc = list(self.corpus)[3]

def testl0norm(self):
# create l0 norm model
model = normmodel.Normmodel(self.corpus, norm='l0')

# normalize document
normalized = model[self.doc]
expected = [(1, 0.3333333333333333), (5, 0.6666666666666666), (8, 0.3333333333333333)]
self.assertTrue(numpy.allclose(normalized, expected))

def testl1norm(self):
# create l1 norm model
def testTransform(self):
# Test for l1 norm model
model = normmodel.Normmodel(self.corpus, norm='l1')

normalized = model[self.doc]
expected = [(1, 0.25), (5, 0.5), (8, 0.25)]
self.assertTrue(numpy.allclose(normalized, expected))

def testl2norm(self):
# create l2 norm model
# Test for l2 norm model
model = normmodel.Normmodel(self.corpus, norm='l2')

normalized = model[self.doc]
expected = [(1, 0.4082482904638631), (5, 0.8164965809277261), (8, 0.4082482904638631)]
self.assertTrue(numpy.allclose(normalized, expected))

def testInit(self):
    # Constructing a Normmodel with norm='l0' is expected to raise a
    # ValueError whose message matches 'not a supported norm'.
    # NOTE(review): assertRaisesRegexp is the Python 2 spelling; Python 3
    # names this method assertRaisesRegex -- confirm the targeted versions.
    with self.assertRaisesRegexp(ValueError, 'not a supported norm'):
        normmodel.Normmodel(self.corpus, norm='l0')

def testPersistence(self):
    # Save an 'l1' model to a test file, load it back, and check that the
    # loaded model agrees with the one that was saved.
    fname = testfile()
    model = normmodel.Normmodel(self.corpus, norm='l1')
    model.save(fname)
    model2 = normmodel.Normmodel.load(fname)
    self.assertTrue(model.norms == model2.norms)
    tstvec = []
    self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector

def testPersistenceCompressed(self):
    # Same round trip as the plain persistence test, but the file name ends
    # in '.gz' and the model is loaded with mmap=None.
    # NOTE(review): presumably the '.gz' suffix selects compressed
    # save/load -- confirm against the save/load implementation.
    fname = testfile() + '.gz'
    model = normmodel.Normmodel(self.corpus, norm='l1')
    model.save(fname)
    model2 = normmodel.Normmodel.load(fname, mmap=None)
    self.assertTrue(model.norms == model2.norms)
    tstvec = []
    self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector


if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
Expand Down

0 comments on commit 21d295a

Please sign in to comment.