Skip to content

Commit

Permalink
fix phraser memory
Browse files Browse the repository at this point in the history
  • Loading branch information
jenishah committed Oct 20, 2018
1 parent 7e4965e commit 242c80e
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 2 deletions.
4 changes: 2 additions & 2 deletions gensim/models/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -805,7 +805,7 @@ def __init__(self, phrases_model):
for bigram, score in phrases_model.export_phrases(corpus, self.delimiter, as_tuples=True):
if bigram in self.phrasegrams:
logger.info('Phraser repeat %s', bigram)
self.phrasegrams[bigram] = (phrases_model.vocab[self.delimiter.join(bigram)], score)
self.phrasegrams[bigram] = (None, score)
count += 1
if not count % 50000:
logger.info('Phraser added %i phrasegrams', count)
Expand Down Expand Up @@ -848,7 +848,7 @@ def score_item(self, worda, wordb, components, scorer):
"""
try:
return self.phrasegrams[tuple(components)][1]
return self.phrasegrams[tuple(components)][-1]
except KeyError:
return -1

Expand Down
Binary file added gensim/test/test_data/phraser_model_3dot6
Binary file not shown.
10 changes: 10 additions & 0 deletions gensim/test/test_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -646,6 +646,16 @@ def testEncoding(self):
self.assertTrue(isinstance(transformed, six.text_type))


class TestPhraserModelCompatibilty(unittest.TestCase):
    """Check that a Phraser stored on disk can still be loaded and applied.

    NOTE(review): the fixture name ``phraser_model_3dot6`` suggests it was
    saved by a gensim 3.6 release -- confirm against the test data.
    """

    def testCompatibilty(self):
        """Load the stored Phraser fixture and verify the bigram it produces."""
        legacy_phraser = Phraser.load(datapath("phraser_model_3dot6"))
        tokens = [u'trees', u'graph', u'minors']
        # Indexing the Phraser with a token list returns the transformed tokens.
        self.assertEqual(legacy_phraser[tokens], ['trees_graph', 'minors'])


if __name__ == '__main__':
    # Running the file directly: turn on timestamped DEBUG logging, then
    # hand control to the unittest runner.
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s : %(levelname)s : %(message)s',
    )
    unittest.main()

0 comments on commit 242c80e

Please sign in to comment.