Skip to content

Commit

Permalink
Merge pull request #946 from markroxor/singleapi
Browse files Browse the repository at this point in the history
Create unified base class for all topic models, #938
  • Loading branch information
tmylk authored Oct 18, 2016
2 parents bc0d694 + 6de2121 commit bf7c0ed
Show file tree
Hide file tree
Showing 9 changed files with 83 additions and 88 deletions.
7 changes: 4 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@ Changes
* Change export_phrases in Phrases model. Fix issue #794 (@AadityaJ,
[#879](https://github.com/RaRe-Technologies/gensim/pull/879))
- bigram construction can now support multiple bigrams within one sentence
* Fixed issue #838, RuntimeWarning: overflow encountered in exp (@markroxor, [#895](https://github.com/RaRe-Technologies/gensim/pull/895))
* Fixed issue [#838](https://github.com/RaRe-Technologies/gensim/issues/838), RuntimeWarning: overflow encountered in exp ([@markroxor](https://github.com/markroxor), [#895](https://github.com/RaRe-Technologies/gensim/pull/895))
* Changed some log messages to warnings as suggested in issue #828. (@rhnvrm, [#884](https://github.com/RaRe-Technologies/gensim/pull/884))
* Fixed issue #851, In summarizer.py, RunTimeError is raised if single sentence input is provided to avoid ZeroDivionError. (@metalaman, #887)
* Fixed issue [#791](https://github.com/RaRe-Technologies/gensim/issues/791), correct logic for iterating over SimilarityABC interface. ([@MridulS](https://github.com/MridulS), [#839](https://github.com/RaRe-Technologies/gensim/pull/839)

* Fixed issue [#791](https://github.com/RaRe-Technologies/gensim/issues/791), correct logic for iterating over SimilarityABC interface. ([@MridulS](https://github.com/MridulS), [#839](https://github.com/RaRe-Technologies/gensim/pull/839))
* Fixed issue [#938](https://github.com/RaRe-Technologies/gensim/issues/938),Creating a unified base class for all topic models. ([@markroxor](https://github.com/markroxor), [#946](https://github.com/RaRe-Technologies/gensim/pull/946))
- _breaking change - may break HdpTopicFormatter.show_\__topics_

0.13.2, 2016-08-19

Expand Down
16 changes: 16 additions & 0 deletions gensim/models/basemodel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
class BaseTopicModel():
def print_topic(self, topicno, topn=10):
"""
Return a single topic as a formatted string. See `show_topic()` for parameters.
>>> lsimodel.print_topic(10, topn=5)
'-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + -0.174 * "functor" + -0.168 * "operator"'
"""
return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in self.show_topic(topicno, topn)])

def print_topics(self, num_topics=20, num_words=10):
"""Alias for `show_topics()` that prints the `num_words` most
probable words for `topics` number of topics to log.
Set `topics=-1` to print all topics."""
return self.show_topics(num_topics=num_topics, num_words=num_words, log=True)
14 changes: 4 additions & 10 deletions gensim/models/hdpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import scipy.special as sp

from gensim import interfaces, utils, matutils
from gensim.models import basemodel
from six.moves import xrange

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -106,7 +107,7 @@ def set_zero(self):
self.m_var_beta_ss.fill(0.0)


class HdpModel(interfaces.TransformationABC):
class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
"""
The constructor estimates Hierachical Dirichlet Process model parameters based
on a training corpus:
Expand Down Expand Up @@ -434,12 +435,6 @@ def update_expectations(self):
self.m_timestamp[:] = self.m_updatect
self.m_status_up_to_date = True

def print_topics(self, num_topics=20, num_words=20):
"""Alias for `show_topics()` that prints the `num_words` most
probable words for `topics` number of topics to log.
Set `topics=-1` to print all topics."""
return self.show_topics(num_topics=num_topics, num_words=num_words, log=True)

def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True):
"""
Print the `num_words` most probable words for `topics` number of topics.
Expand Down Expand Up @@ -593,10 +588,9 @@ def show_topic_terms(self, topic_data, num_words):
def format_topic(self, topic_id, topic_terms):
if self.STYLE_GENSIM == self.style:
fmt = ' + '.join(['%.3f*%s' % (weight, word) for (word, weight) in topic_terms])
fmt = 'topic %i: %s' % (topic_id, fmt)
else:
fmt = '\n'.join([' %20s %.8f' % (word, weight) for (word, weight) in topic_terms])
fmt = 'topic %i:\n%s' % (topic_id, fmt)

fmt = (topic_id,fmt)
return fmt
#endclass HdpTopicFormatter
# endclass HdpTopicFormatter
11 changes: 3 additions & 8 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
import numbers

from gensim import interfaces, utils, matutils
from gensim.models import basemodel

from itertools import chain
from scipy.special import gammaln, psi # gamma function utils
from scipy.special import polygamma
Expand Down Expand Up @@ -193,7 +195,7 @@ def get_Elogbeta(self):
# endclass LdaState


class LdaModel(interfaces.TransformationABC):
class LdaModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
"""
The constructor estimates Latent Dirichlet Allocation model parameters based
on a training corpus:
Expand Down Expand Up @@ -767,9 +769,6 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0):
score += numpy.sum(gammaln(sum_eta) - gammaln(numpy.sum(_lambda, 1)))
return score

def print_topics(self, num_topics=10, num_words=10):
return self.show_topics(num_topics, num_words, log=True)

def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
"""
For `num_topics` number of topics, return `num_words` most significant words
Expand Down Expand Up @@ -833,10 +832,6 @@ def get_topic_terms(self, topicid, topn=10):
bestn = matutils.argsort(topic, topn, reverse=True)
return [(id, topic[id]) for id in bestn]

def print_topic(self, topicid, topn=10):
"""Return the result of `show_topic`, but formatted as a single string."""
return ' + '.join(['%.3f*%s' % (v, k) for k, v in self.show_topic(topicid, topn)])

def top_topics(self, corpus, num_words=20):
"""
Calculate the Umass topic coherence for each topic. Algorithm from
Expand Down
18 changes: 3 additions & 15 deletions gensim/models/lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
from scipy.sparse import sparsetools

from gensim import interfaces, matutils, utils
from gensim.models import basemodel

from six import iterkeys
from six.moves import xrange

Expand Down Expand Up @@ -221,7 +223,7 @@ def merge(self, other, decay=1.0):
#endclass Projection


class LsiModel(interfaces.TransformationABC):
class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
"""
Objects of this class allow building and maintaining a model for Latent
Semantic Indexing (also known as Latent Semantic Analysis).
Expand Down Expand Up @@ -490,16 +492,6 @@ def show_topic(self, topicno, topn=10):
most = matutils.argsort(numpy.abs(c), topn, reverse=True)
return [(self.id2word[val], 1.0 * c[val] / norm) for val in most]

def print_topic(self, topicno, topn=10):
"""
Return a single topic as a formatted string. See `show_topic()` for parameters.
>>> lsimodel.print_topic(10, topn=5)
'-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + -0.174 * "functor" + -0.168 * "operator"'
"""
return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in self.show_topic(topicno, topn)])

def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True):
"""
Return `num_topics` most significant topics (return all by default).
Expand All @@ -525,10 +517,6 @@ def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True):
logger.info("topic #%i(%.3f): %s", i, self.projection.s[i], topic)
return shown

def print_topics(self, num_topics=5, num_words=10):
"""Alias for `show_topics()` which prints the top 5 topics to log."""
return self.show_topics(num_topics=num_topics, num_words=num_words, log=True)

def print_debug(self, num_topics=5, num_words=10):
"""
Print (to log) the most salient words of the first `num_topics` topics.
Expand Down
42 changes: 42 additions & 0 deletions gensim/test/test_basemodel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <[email protected]>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated tests for checking transformation algorithms (the models package).
"""

import six

class TestBaseTopicModel():
def testPrintTopic(self):
topics = self.model.show_topics(formatted=True)
for topic_no, topic in topics:
self.assertTrue(isinstance(topic_no, int))
self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode))

def testPrintTopics(self):
topics = self.model.print_topics()

for topic_no, topic in topics:
self.assertTrue(isinstance(topic_no, int))
self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode))

def testShowTopic(self):
topic = self.model.show_topic(1)

for k, v in topic:
self.assertTrue(isinstance(k, six.string_types))
self.assertTrue(isinstance(v, float))

def testShowTopics(self):
topics = self.model.show_topics(formatted=False)

for topic_no, topic in topics:
self.assertTrue(isinstance(topic_no, int))
self.assertTrue(isinstance(topic, list))
for k, v in topic:
self.assertTrue(isinstance(k, six.string_types))
self.assertTrue(isinstance(v, float))
18 changes: 5 additions & 13 deletions gensim/test/test_hdpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from gensim.corpora import mmcorpus, Dictionary
from gensim.models import hdpmodel
from gensim import matutils
from gensim.test import test_basemodel


module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
Expand All @@ -47,24 +48,15 @@ def testfile():
return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')



class TestHdpModel(unittest.TestCase):
class TestHdpModel(unittest.TestCase, test_basemodel.TestBaseTopicModel):
def setUp(self):
self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
self.class_ = hdpmodel.HdpModel
self.model = self.class_(corpus, id2word=dictionary)

def testShowTopics(self):
topics = self.model.show_topics(formatted=False, num_topics=20, num_words=20)

for topic_no, topic in topics:
self.assertTrue(isinstance(topic_no, int))
self.assertTrue(isinstance(topic, list))
for k, v in topic:
self.assertTrue(isinstance(k, six.string_types))
self.assertTrue(isinstance(v, float))


def testShowTopic(self):
# TODO create show_topic in HdpModel and then test
return

if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
Expand Down
25 changes: 4 additions & 21 deletions gensim/test/test_ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from gensim.corpora import mmcorpus, Dictionary
from gensim.models import ldamodel, ldamulticore
from gensim import matutils
from gensim.test import test_basemodel


module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
Expand Down Expand Up @@ -53,7 +54,8 @@ def testRandomState():
for testcase in testcases:
assert(isinstance(ldamodel.get_random_state(testcase), numpy.random.RandomState))

class TestLdaModel(unittest.TestCase):

class TestLdaModel(unittest.TestCase, test_basemodel.TestBaseTopicModel):
def setUp(self):
self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
self.class_ = ldamodel.LdaModel
Expand Down Expand Up @@ -217,7 +219,6 @@ def testEta(self):
kwargs['eta'] = "gensim is cool"
self.assertRaises(ValueError, self.class_, **kwargs)


def testTopTopics(self):
top_topics = self.model.top_topics(self.corpus)

Expand All @@ -236,24 +237,6 @@ def testGetTopicTerms(self):
self.assertTrue(isinstance(k, numbers.Integral))
self.assertTrue(isinstance(v, float))

def testShowTopic(self):
topic = self.model.show_topic(1)

for k, v in topic:
self.assertTrue(isinstance(k, six.string_types))
self.assertTrue(isinstance(v, float))

def testShowTopics(self):
topics = self.model.show_topics(formatted=False)

for topic_no, topic in topics:
self.assertTrue(isinstance(topic_no, int))
self.assertTrue(isinstance(topic, list))
for k, v in topic:
self.assertTrue(isinstance(k, six.string_types))
self.assertTrue(isinstance(v, float))


def testGetDocumentTopics(self):

model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes= 100, random_state=numpy.random.seed(0))
Expand All @@ -278,7 +261,7 @@ def testGetDocumentTopics(self):

for w, phi_values in word_phis:
self.assertTrue(isinstance(w, int))
self.assertTrue(isinstance(phi_values, list))
self.assertTrue(isinstance(phi_values, list))

# word_topics looks like this: ({word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]).
# we check one case in word_topics, i.e of the first word in the doc, and it's likely topics.
Expand Down
20 changes: 2 additions & 18 deletions gensim/test/test_lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from gensim.corpora import mmcorpus, Dictionary
from gensim.models import lsimodel
from gensim import matutils
from gensim.test import test_basemodel


module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
Expand Down Expand Up @@ -50,7 +51,7 @@ def testfile():
return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')


class TestLsiModel(unittest.TestCase):
class TestLsiModel(unittest.TestCase, test_basemodel.TestBaseTopicModel):
def setUp(self):
self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
self.model = lsimodel.LsiModel(self.corpus, num_topics=2)
Expand All @@ -72,23 +73,6 @@ def testTransform(self):
# expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version
self.assertTrue(numpy.allclose(abs(vec), abs(expected))) # transformed entries must be equal up to sign

def testShowTopic(self):
topic = self.model.show_topic(1)

for k, v in topic:
self.assertTrue(isinstance(k, six.string_types))
self.assertTrue(isinstance(v, float))

def testShowTopics(self):
topics = self.model.show_topics(formatted=False)

for topic_no, topic in topics:
self.assertTrue(isinstance(topic_no, int))
self.assertTrue(isinstance(topic, list))
for k, v in topic:
self.assertTrue(isinstance(k, six.string_types))
self.assertTrue(isinstance(v, float))

def testCorpusTransform(self):
"""Test lsi[corpus] transformation."""
model = self.model
Expand Down

0 comments on commit bf7c0ed

Please sign in to comment.