Merge pull request #946 from markroxor/singleapi

Create unified base class for all topic models, #938
piskvorky · Oct 18, 2016 · bf7c0ed · bf7c0ed
2 parents bc0d694 + 6de2121
commit bf7c0ed
Show file tree

Hide file tree

Showing 9 changed files with 83 additions and 88 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,11 +10,12 @@ Changes
 * Change export_phrases in Phrases model. Fix issue #794 (@AadityaJ,
 [#879](https://github.com/RaRe-Technologies/gensim/pull/879))
     - bigram construction can now support multiple bigrams within one sentence
-* Fixed issue #838, RuntimeWarning: overflow encountered in exp  (@markroxor, [#895](https://github.com/RaRe-Technologies/gensim/pull/895))
+* Fixed issue [#838](https://github.com/RaRe-Technologies/gensim/issues/838), RuntimeWarning: overflow encountered in exp  ([@markroxor](https://github.com/markroxor), [#895](https://github.com/RaRe-Technologies/gensim/pull/895))
 *  Changed some log messages to warnings as suggested in issue #828. (@rhnvrm, [#884](https://github.com/RaRe-Technologies/gensim/pull/884))
 *  Fixed issue #851, In summarizer.py, RunTimeError is raised if single sentence input is provided to avoid ZeroDivionError. (@metalaman, #887)
-* Fixed issue [#791](https://github.com/RaRe-Technologies/gensim/issues/791), correct logic for iterating over SimilarityABC interface. ([@MridulS](https://github.com/MridulS), [#839](https://github.com/RaRe-Technologies/gensim/pull/839)
-
+* Fixed issue [#791](https://github.com/RaRe-Technologies/gensim/issues/791), correct logic for iterating over SimilarityABC interface. ([@MridulS](https://github.com/MridulS), [#839](https://github.com/RaRe-Technologies/gensim/pull/839))
+* Fixed issue [#938](https://github.com/RaRe-Technologies/gensim/issues/938), creating a unified base class for all topic models. ([@markroxor](https://github.com/markroxor), [#946](https://github.com/RaRe-Technologies/gensim/pull/946))
+    -  _breaking change_ - may break `HdpTopicFormatter.show_topics`
 
 0.13.2, 2016-08-19
 

diff --git a/gensim/models/basemodel.py b/gensim/models/basemodel.py
@@ -0,0 +1,16 @@
+class BaseTopicModel():
+    def print_topic(self, topicno, topn=10):
+        """
+        Return a single topic as a formatted string. See `show_topic()` for parameters.
+
+        >>> lsimodel.print_topic(10, topn=5)
+        '-0.340*"category" + 0.298*"$M$" + 0.183*"algebra" + -0.174*"functor" + -0.168*"operator"'
+
+        """
+        return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in self.show_topic(topicno, topn)])
+
+    def print_topics(self, num_topics=20, num_words=10):
+        """Alias for `show_topics()` that prints the `num_words` most
+        probable words for `num_topics` number of topics to log.
+        Set `num_topics=-1` to print all topics."""
+        return self.show_topics(num_topics=num_topics, num_words=num_words, log=True)
diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py
@@ -38,6 +38,7 @@
 import scipy.special as sp
 
 from gensim import interfaces, utils, matutils
+from gensim.models import basemodel
 from six.moves import xrange
 
 logger = logging.getLogger(__name__)
@@ -106,7 +107,7 @@ def set_zero(self):
         self.m_var_beta_ss.fill(0.0)
 
 
-class HdpModel(interfaces.TransformationABC):
+class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
     """
     The constructor estimates Hierachical Dirichlet Process model parameters based
     on a training corpus:
@@ -434,12 +435,6 @@ def update_expectations(self):
         self.m_timestamp[:] = self.m_updatect
         self.m_status_up_to_date = True
 
-    def print_topics(self, num_topics=20, num_words=20):
-        """Alias for `show_topics()` that prints the `num_words` most
-        probable words for `topics` number of topics to log.
-        Set `topics=-1` to print all topics."""
-        return self.show_topics(num_topics=num_topics, num_words=num_words, log=True)
-
     def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True):
         """
         Print the `num_words` most probable words for `topics` number of topics.
@@ -593,10 +588,9 @@ def show_topic_terms(self, topic_data, num_words):
     def format_topic(self, topic_id, topic_terms):
         if self.STYLE_GENSIM == self.style:
             fmt = ' + '.join(['%.3f*%s' % (weight, word) for (word, weight) in topic_terms])
-            fmt = 'topic %i: %s' % (topic_id, fmt)
         else:
             fmt = '\n'.join(['    %20s    %.8f' % (word, weight) for (word, weight) in topic_terms])
-            fmt = 'topic %i:\n%s' % (topic_id, fmt)
 
+        fmt = (topic_id,fmt)
         return fmt
-#endclass HdpTopicFormatter
+# endclass HdpTopicFormatter
diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
@@ -36,6 +36,8 @@
 import numbers
 
 from gensim import interfaces, utils, matutils
+from gensim.models import basemodel
+
 from itertools import chain
 from scipy.special import gammaln, psi  # gamma function utils
 from scipy.special import polygamma
@@ -193,7 +195,7 @@ def get_Elogbeta(self):
 # endclass LdaState
 
 
-class LdaModel(interfaces.TransformationABC):
+class LdaModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
     """
     The constructor estimates Latent Dirichlet Allocation model parameters based
     on a training corpus:
@@ -767,9 +769,6 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0):
         score += numpy.sum(gammaln(sum_eta) - gammaln(numpy.sum(_lambda, 1)))
         return score
 
-    def print_topics(self, num_topics=10, num_words=10):
-        return self.show_topics(num_topics, num_words, log=True)
-
     def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
         """
         For `num_topics` number of topics, return `num_words` most significant words
@@ -833,10 +832,6 @@ def get_topic_terms(self, topicid, topn=10):
         bestn = matutils.argsort(topic, topn, reverse=True)
         return [(id, topic[id]) for id in bestn]
 
-    def print_topic(self, topicid, topn=10):
-        """Return the result of `show_topic`, but formatted as a single string."""
-        return ' + '.join(['%.3f*%s' % (v, k) for k, v in self.show_topic(topicid, topn)])
-
     def top_topics(self, corpus, num_words=20):
         """
         Calculate the Umass topic coherence for each topic. Algorithm from

diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py
@@ -59,6 +59,8 @@
 from scipy.sparse import sparsetools
 
 from gensim import interfaces, matutils, utils
+from gensim.models import basemodel
+
 from six import iterkeys
 from six.moves import xrange
 
@@ -221,7 +223,7 @@ def merge(self, other, decay=1.0):
 #endclass Projection
 
 
-class LsiModel(interfaces.TransformationABC):
+class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
     """
     Objects of this class allow building and maintaining a model for Latent
     Semantic Indexing (also known as Latent Semantic Analysis).
@@ -490,16 +492,6 @@ def show_topic(self, topicno, topn=10):
         most = matutils.argsort(numpy.abs(c), topn, reverse=True)
         return [(self.id2word[val], 1.0 * c[val] / norm) for val in most]
 
-    def print_topic(self, topicno, topn=10):
-        """
-        Return a single topic as a formatted string. See `show_topic()` for parameters.
-
-        >>> lsimodel.print_topic(10, topn=5)
-        '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + -0.174 * "functor" + -0.168 * "operator"'
-
-        """
-        return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in self.show_topic(topicno, topn)])
-
     def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True):
         """
         Return `num_topics` most significant topics (return all by default).
@@ -525,10 +517,6 @@ def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True):
                     logger.info("topic #%i(%.3f): %s", i, self.projection.s[i], topic)
         return shown
 
-    def print_topics(self, num_topics=5, num_words=10):
-        """Alias for `show_topics()` which prints the top 5 topics to log."""
-        return self.show_topics(num_topics=num_topics, num_words=num_words, log=True)
-
     def print_debug(self, num_topics=5, num_words=10):
         """
         Print (to log) the most salient words of the first `num_topics` topics.

diff --git a/gensim/test/test_basemodel.py b/gensim/test/test_basemodel.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2010 Radim Rehurek <[email protected]>
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Automated tests for checking transformation algorithms (the models package).
+"""
+
+import six
+
+class TestBaseTopicModel():
+    def testPrintTopic(self):
+        topics = self.model.show_topics(formatted=True)
+        for topic_no, topic in topics:
+            self.assertTrue(isinstance(topic_no, int))
+            self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode))
+
+    def testPrintTopics(self):
+        topics = self.model.print_topics()
+
+        for topic_no, topic in topics:
+            self.assertTrue(isinstance(topic_no, int))
+            self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode))
+
+    def testShowTopic(self):
+        topic = self.model.show_topic(1)
+
+        for k, v in topic:
+            self.assertTrue(isinstance(k, six.string_types))
+            self.assertTrue(isinstance(v, float))
+
+    def testShowTopics(self):
+        topics = self.model.show_topics(formatted=False)
+
+        for topic_no, topic in topics:
+            self.assertTrue(isinstance(topic_no, int))
+            self.assertTrue(isinstance(topic, list))
+            for k, v in topic:
+                self.assertTrue(isinstance(k, six.string_types))
+                self.assertTrue(isinstance(v, float))
diff --git a/gensim/test/test_hdpmodel.py b/gensim/test/test_hdpmodel.py
@@ -22,6 +22,7 @@
 from gensim.corpora import mmcorpus, Dictionary
 from gensim.models import hdpmodel
 from gensim import matutils
+from gensim.test import test_basemodel
 
 
 module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
@@ -47,24 +48,15 @@ def testfile():
     return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')
 
 
-
-class TestHdpModel(unittest.TestCase):
+class TestHdpModel(unittest.TestCase, test_basemodel.TestBaseTopicModel):
     def setUp(self):
         self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
         self.class_ = hdpmodel.HdpModel
         self.model = self.class_(corpus, id2word=dictionary)
 
-    def testShowTopics(self):
-        topics = self.model.show_topics(formatted=False, num_topics=20, num_words=20)
-
-        for topic_no, topic in topics:
-            self.assertTrue(isinstance(topic_no, int))
-            self.assertTrue(isinstance(topic, list))
-            for k, v in topic:
-                self.assertTrue(isinstance(k, six.string_types))
-                self.assertTrue(isinstance(v, float))
-
-
+    def testShowTopic(self):
+        # TODO create show_topic in HdpModel and then test
+        return
 
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py
@@ -23,6 +23,7 @@
 from gensim.corpora import mmcorpus, Dictionary
 from gensim.models import ldamodel, ldamulticore
 from gensim import matutils
+from gensim.test import test_basemodel
 
 
 module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
@@ -53,7 +54,8 @@ def testRandomState():
     for testcase in testcases:
         assert(isinstance(ldamodel.get_random_state(testcase), numpy.random.RandomState))
 
-class TestLdaModel(unittest.TestCase):
+
+class TestLdaModel(unittest.TestCase, test_basemodel.TestBaseTopicModel):
     def setUp(self):
         self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
         self.class_ = ldamodel.LdaModel
@@ -217,7 +219,6 @@ def testEta(self):
         kwargs['eta'] = "gensim is cool"
         self.assertRaises(ValueError, self.class_, **kwargs)
 
-
     def testTopTopics(self):
         top_topics = self.model.top_topics(self.corpus)
 
@@ -236,24 +237,6 @@ def testGetTopicTerms(self):
             self.assertTrue(isinstance(k, numbers.Integral))
             self.assertTrue(isinstance(v, float))
 
-    def testShowTopic(self):
-        topic = self.model.show_topic(1)
-
-        for k, v in topic:
-            self.assertTrue(isinstance(k, six.string_types))
-            self.assertTrue(isinstance(v, float))
-
-    def testShowTopics(self):
-        topics = self.model.show_topics(formatted=False)
-
-        for topic_no, topic in topics:
-            self.assertTrue(isinstance(topic_no, int))
-            self.assertTrue(isinstance(topic, list))
-            for k, v in topic:
-                self.assertTrue(isinstance(k, six.string_types))
-                self.assertTrue(isinstance(v, float))
-
-
     def testGetDocumentTopics(self):
 
         model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes= 100, random_state=numpy.random.seed(0))
@@ -278,7 +261,7 @@ def testGetDocumentTopics(self):
 
         for w, phi_values in word_phis:
             self.assertTrue(isinstance(w, int))
-            self.assertTrue(isinstance(phi_values, list))            
+            self.assertTrue(isinstance(phi_values, list))
 
         # word_topics looks like this: ({word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]).
         # we check one case in word_topics, i.e of the first word in the doc, and it's likely topics.

diff --git a/gensim/test/test_lsimodel.py b/gensim/test/test_lsimodel.py
@@ -22,6 +22,7 @@
 from gensim.corpora import mmcorpus, Dictionary
 from gensim.models import lsimodel
 from gensim import matutils
+from gensim.test import test_basemodel
 
 
 module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
@@ -50,7 +51,7 @@ def testfile():
     return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')
 
 
-class TestLsiModel(unittest.TestCase):
+class TestLsiModel(unittest.TestCase, test_basemodel.TestBaseTopicModel):
     def setUp(self):
         self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
         self.model = lsimodel.LsiModel(self.corpus, num_topics=2)
@@ -72,23 +73,6 @@ def testTransform(self):
         # expected = numpy.array([-0.1973928, 0.05591352])  # non-scaled LSI version
         self.assertTrue(numpy.allclose(abs(vec), abs(expected)))  # transformed entries must be equal up to sign
 
-    def testShowTopic(self):
-        topic = self.model.show_topic(1)
-
-        for k, v in topic:
-            self.assertTrue(isinstance(k, six.string_types))
-            self.assertTrue(isinstance(v, float))
-
-    def testShowTopics(self):
-        topics = self.model.show_topics(formatted=False)
-
-        for topic_no, topic in topics:
-            self.assertTrue(isinstance(topic_no, int))
-            self.assertTrue(isinstance(topic, list))
-            for k, v in topic:
-                self.assertTrue(isinstance(k, six.string_types))
-                self.assertTrue(isinstance(v, float))
-
     def testCorpusTransform(self):
         """Test lsi[corpus] transformation."""
         model = self.model