From 05edaf2e777a8fb2a793aa89415a65c87e597af5 Mon Sep 17 00:00:00 2001 From: markroxor Date: Thu, 13 Oct 2016 21:12:37 +0530 Subject: [PATCH 01/12] rebased --- gensim/models/basemodel.py | 10 ++++++++++ gensim/models/ldamodel.py | 8 +++----- gensim/models/lsimodel.py | 14 +++----------- 3 files changed, 16 insertions(+), 16 deletions(-) create mode 100644 gensim/models/basemodel.py diff --git a/gensim/models/basemodel.py b/gensim/models/basemodel.py new file mode 100644 index 0000000000..90d449bdf6 --- /dev/null +++ b/gensim/models/basemodel.py @@ -0,0 +1,10 @@ +class BaseTopicModel(): + def print_topic(self, topicno, topn=10): + """ + Return a single topic as a formatted string. See `show_topic()` for parameters. + + >>> lsimodel.print_topic(10, topn=5) + '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + -0.174 * "functor" + -0.168 * "operator"' + + """ + return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in self.show_topic(topicno, topn)]) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index b985b2f628..bb09fc438b 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -36,6 +36,8 @@ import numbers from gensim import interfaces, utils, matutils +from gensim.models import basemodel + from itertools import chain from scipy.special import gammaln, psi # gamma function utils from scipy.special import polygamma @@ -193,7 +195,7 @@ def get_Elogbeta(self): # endclass LdaState -class LdaModel(interfaces.TransformationABC): +class LdaModel(interfaces.TransformationABC,basemodel.BaseTopicModel): """ The constructor estimates Latent Dirichlet Allocation model parameters based on a training corpus: @@ -833,10 +835,6 @@ def get_topic_terms(self, topicid, topn=10): bestn = matutils.argsort(topic, topn, reverse=True) return [(id, topic[id]) for id in bestn] - def print_topic(self, topicid, topn=10): - """Return the result of `show_topic`, but formatted as a single string.""" - return ' + '.join(['%.3f*%s' % (v, k) for k, v in 
self.show_topic(topicid, topn)]) - def top_topics(self, corpus, num_words=20): """ Calculate the Umass topic coherence for each topic. Algorithm from diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 12a3c17d18..7041530af8 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -59,6 +59,8 @@ from scipy.sparse import sparsetools from gensim import interfaces, matutils, utils +from gensim.models import basemodel + from six import iterkeys from six.moves import xrange @@ -221,7 +223,7 @@ def merge(self, other, decay=1.0): #endclass Projection -class LsiModel(interfaces.TransformationABC): +class LsiModel(interfaces.TransformationABC,basemodel.BaseTopicModel): """ Objects of this class allow building and maintaining a model for Latent Semantic Indexing (also known as Latent Semantic Analysis). @@ -490,16 +492,6 @@ def show_topic(self, topicno, topn=10): most = matutils.argsort(numpy.abs(c), topn, reverse=True) return [(self.id2word[val], 1.0 * c[val] / norm) for val in most] - def print_topic(self, topicno, topn=10): - """ - Return a single topic as a formatted string. See `show_topic()` for parameters. - - >>> lsimodel.print_topic(10, topn=5) - '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + -0.174 * "functor" + -0.168 * "operator"' - - """ - return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in self.show_topic(topicno, topn)]) - def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True): """ Return `num_topics` most significant topics (return all by default). 
From 48396c14752de05a7b83fa887ea274f92416732b Mon Sep 17 00:00:00 2001 From: markroxor Date: Thu, 13 Oct 2016 21:41:26 +0530 Subject: [PATCH 02/12] tests added for lsi and lda --- gensim/test/test_ldamodel.py | 12 +++++++++++- gensim/test/test_lsimodel.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index c081da328a..ca74b4db0c 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -244,6 +244,8 @@ def testShowTopic(self): self.assertTrue(isinstance(v, float)) def testShowTopics(self): + + #testing show_topic topics = self.model.show_topics(formatted=False) for topic_no, topic in topics: @@ -253,6 +255,14 @@ def testShowTopics(self): self.assertTrue(isinstance(k, six.string_types)) self.assertTrue(isinstance(v, float)) + #testing print_topic + topics = self.model.show_topics(formatted=True) + print type(topics) + for topic_no, topic in topics: + print type(topic_no),type(topic) + self.assertTrue(isinstance(topic_no, int)) + self.assertTrue(isinstance(topic, str)) + def testGetDocumentTopics(self): @@ -278,7 +288,7 @@ def testGetDocumentTopics(self): for w, phi_values in word_phis: self.assertTrue(isinstance(w, int)) - self.assertTrue(isinstance(phi_values, list)) + self.assertTrue(isinstance(phi_values, list)) # word_topics looks like this: ({word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]). # we check one case in word_topics, i.e of the first word in the doc, and it's likely topics. 
diff --git a/gensim/test/test_lsimodel.py b/gensim/test/test_lsimodel.py index 26df7c011e..f092979a7a 100644 --- a/gensim/test/test_lsimodel.py +++ b/gensim/test/test_lsimodel.py @@ -80,6 +80,8 @@ def testShowTopic(self): self.assertTrue(isinstance(v, float)) def testShowTopics(self): + + #testing show_topic topics = self.model.show_topics(formatted=False) for topic_no, topic in topics: @@ -89,6 +91,14 @@ def testShowTopics(self): self.assertTrue(isinstance(k, six.string_types)) self.assertTrue(isinstance(v, float)) + #testing print_topic + topics = self.model.show_topics(formatted=True) + print type(topics) + for topic_no, topic in topics: + print type(topic_no),type(topic) + self.assertTrue(isinstance(topic_no, int)) + self.assertTrue(isinstance(topic, str)) + def testCorpusTransform(self): """Test lsi[corpus] transformation.""" model = self.model From e673305d85218781f733f10685652d0e012fc508 Mon Sep 17 00:00:00 2001 From: markroxor Date: Thu, 13 Oct 2016 21:51:13 +0530 Subject: [PATCH 03/12] tests added for lsi and lda --- gensim/test/test_ldamodel.py | 4 +--- gensim/test/test_lsimodel.py | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index ca74b4db0c..da3b995297 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -244,7 +244,7 @@ def testShowTopic(self): self.assertTrue(isinstance(v, float)) def testShowTopics(self): - + #testing show_topic topics = self.model.show_topics(formatted=False) @@ -257,9 +257,7 @@ def testShowTopics(self): #testing print_topic topics = self.model.show_topics(formatted=True) - print type(topics) for topic_no, topic in topics: - print type(topic_no),type(topic) self.assertTrue(isinstance(topic_no, int)) self.assertTrue(isinstance(topic, str)) diff --git a/gensim/test/test_lsimodel.py b/gensim/test/test_lsimodel.py index f092979a7a..848339fd12 100644 --- a/gensim/test/test_lsimodel.py +++ b/gensim/test/test_lsimodel.py @@ -93,9 
+93,7 @@ def testShowTopics(self): #testing print_topic topics = self.model.show_topics(formatted=True) - print type(topics) for topic_no, topic in topics: - print type(topic_no),type(topic) self.assertTrue(isinstance(topic_no, int)) self.assertTrue(isinstance(topic, str)) From de807205a25e5fc55da1a14c769c3425154ce7ef Mon Sep 17 00:00:00 2001 From: markroxor Date: Thu, 13 Oct 2016 22:27:17 +0530 Subject: [PATCH 04/12] tests added for lsi and lda --- gensim/test/test_ldamodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index da3b995297..9986653a97 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -259,7 +259,7 @@ def testShowTopics(self): topics = self.model.show_topics(formatted=True) for topic_no, topic in topics: self.assertTrue(isinstance(topic_no, int)) - self.assertTrue(isinstance(topic, str)) + self.assertTrue(isinstance(topic, unicode)) def testGetDocumentTopics(self): From 272d7fcecfb0e841004010e63ec976ebba6302a0 Mon Sep 17 00:00:00 2001 From: markroxor Date: Thu, 13 Oct 2016 23:08:54 +0530 Subject: [PATCH 05/12] tests added for lsi and lda --- gensim/test/test_ldamodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index 9986653a97..5c08d4a640 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -259,7 +259,7 @@ def testShowTopics(self): topics = self.model.show_topics(formatted=True) for topic_no, topic in topics: self.assertTrue(isinstance(topic_no, int)) - self.assertTrue(isinstance(topic, unicode)) + self.assertTrue(isinstance(topic, unicode) or isinstance(topic, str)) def testGetDocumentTopics(self): From f4f92d3cc98e7920d481048a1a6166f9cb6e393d Mon Sep 17 00:00:00 2001 From: markroxor Date: Thu, 13 Oct 2016 23:26:13 +0530 Subject: [PATCH 06/12] tests added for lsi and lda --- gensim/test/test_ldamodel.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index 5c08d4a640..3f4703b2e5 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -259,7 +259,7 @@ def testShowTopics(self): topics = self.model.show_topics(formatted=True) for topic_no, topic in topics: self.assertTrue(isinstance(topic_no, int)) - self.assertTrue(isinstance(topic, unicode) or isinstance(topic, str)) + self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) def testGetDocumentTopics(self): From f9d28a073793585588aa204401074dfbc61b4c41 Mon Sep 17 00:00:00 2001 From: markroxor Date: Fri, 14 Oct 2016 00:08:08 +0530 Subject: [PATCH 07/12] registered to CHANGELOG --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c950c618fd..62a23fa56c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,10 +10,11 @@ Changes * Change export_phrases in Phrases model. Fix issue #794 (@AadityaJ, [#879](https://github.com/RaRe-Technologies/gensim/pull/879)) - bigram construction can now support multiple bigrams within one sentence -* Fixed issue #838, RuntimeWarning: overflow encountered in exp (@markroxor, [#895](https://github.com/RaRe-Technologies/gensim/pull/895)) +* Fixed issue [#838](https://github.com/RaRe-Technologies/gensim/issues/838), RuntimeWarning: overflow encountered in exp ([@markroxor](https://github.com/markroxor), [#895](https://github.com/RaRe-Technologies/gensim/pull/895)) * Changed some log messages to warnings as suggested in issue #828. (@rhnvrm, [#884](https://github.com/RaRe-Technologies/gensim/pull/884)) * Fixed issue #851, In summarizer.py, RunTimeError is raised if single sentence input is provided to avoid ZeroDivionError. (@metalaman, #887) -* Fixed issue [#791](https://github.com/RaRe-Technologies/gensim/issues/791), correct logic for iterating over SimilarityABC interface. 
([@MridulS](https://github.com/MridulS), [#839](https://github.com/RaRe-Technologies/gensim/pull/839) +* Fixed issue [#791](https://github.com/RaRe-Technologies/gensim/issues/791), correct logic for iterating over SimilarityABC interface. ([@MridulS](https://github.com/MridulS), [#839](https://github.com/RaRe-Technologies/gensim/pull/839)) +* Fixed issue [#938](https://github.com/RaRe-Technologies/gensim/issues/938),Creating a unified base class for all topic models. ([@markroxor](https://github.com/markroxor), [#946](https://github.com/RaRe-Technologies/gensim/pull/946)) 0.13.2, 2016-08-19 From 0b474083ac2b42c3e93e9b5e591b33c9607ab684 Mon Sep 17 00:00:00 2001 From: markroxor Date: Sat, 15 Oct 2016 11:47:27 +0530 Subject: [PATCH 08/12] added print_topics() to baseModel --- gensim/models/basemodel.py | 6 ++++++ gensim/models/hdpmodel.py | 12 ++---------- gensim/models/ldamodel.py | 5 +---- gensim/models/lsimodel.py | 6 +----- 4 files changed, 10 insertions(+), 19 deletions(-) diff --git a/gensim/models/basemodel.py b/gensim/models/basemodel.py index 90d449bdf6..fa1e6bfe7a 100644 --- a/gensim/models/basemodel.py +++ b/gensim/models/basemodel.py @@ -8,3 +8,9 @@ def print_topic(self, topicno, topn=10): """ return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in self.show_topic(topicno, topn)]) + + def print_topics(self, num_topics=20, num_words=10): + """Alias for `show_topics()` that prints the `num_words` most + probable words for `topics` number of topics to log. 
+ Set `topics=-1` to print all topics.""" + return self.show_topics(num_topics=num_topics, num_words=num_words, log=True) diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index c34aa21619..043c10355a 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -38,6 +38,7 @@ import scipy.special as sp from gensim import interfaces, utils, matutils +from gensim.models import basemodel from six.moves import xrange logger = logging.getLogger(__name__) @@ -125,7 +126,7 @@ def set_zero(self): self.m_var_beta_ss.fill(0.0) -class HdpModel(interfaces.TransformationABC): +class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel): """ The constructor estimates Hierachical Dirichlet Process model parameters based on a training corpus: @@ -453,12 +454,6 @@ def update_expectations(self): self.m_timestamp[:] = self.m_updatect self.m_status_up_to_date = True - def print_topics(self, num_topics=20, num_words=20): - """Alias for `show_topics()` that prints the `num_words` most - probable words for `topics` number of topics to log. - Set `topics=-1` to print all topics.""" - return self.show_topics(num_topics=num_topics, num_words=num_words, log=True) - def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True): """ Print the `num_words` most probable words for `topics` number of topics. 
@@ -575,9 +570,6 @@ def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None self.style = style - def print_topics(self, num_topics=10, num_words=10): - return self.show_topics(num_topics, num_words, True) - def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): shown = [] if num_topics < 0: diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index bb09fc438b..7def8966b3 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -195,7 +195,7 @@ def get_Elogbeta(self): # endclass LdaState -class LdaModel(interfaces.TransformationABC,basemodel.BaseTopicModel): +class LdaModel(interfaces.TransformationABC, basemodel.BaseTopicModel): """ The constructor estimates Latent Dirichlet Allocation model parameters based on a training corpus: @@ -769,9 +769,6 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0): score += numpy.sum(gammaln(sum_eta) - gammaln(numpy.sum(_lambda, 1))) return score - def print_topics(self, num_topics=10, num_words=10): - return self.show_topics(num_topics, num_words, log=True) - def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): """ For `num_topics` number of topics, return `num_words` most significant words diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 7041530af8..3f44028667 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -223,7 +223,7 @@ def merge(self, other, decay=1.0): #endclass Projection -class LsiModel(interfaces.TransformationABC,basemodel.BaseTopicModel): +class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel): """ Objects of this class allow building and maintaining a model for Latent Semantic Indexing (also known as Latent Semantic Analysis). 
@@ -517,10 +517,6 @@ def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True): logger.info("topic #%i(%.3f): %s", i, self.projection.s[i], topic) return shown - def print_topics(self, num_topics=5, num_words=10): - """Alias for `show_topics()` which prints the top 5 topics to log.""" - return self.show_topics(num_topics=num_topics, num_words=num_words, log=True) - def print_debug(self, num_topics=5, num_words=10): """ Print (to log) the most salient words of the first `num_topics` topics. From ac4e93b29551a778ef2df6e1dba86d9dea54e46f Mon Sep 17 00:00:00 2001 From: markroxor Date: Sat, 15 Oct 2016 12:24:39 +0530 Subject: [PATCH 09/12] added tests for lsi lda, test base created unified TestBaseModel and added test for printTopic --- gensim/test/test_basemodel.py | 43 +++++++++++++++++++++++++++++++++++ gensim/test/test_hdpmodel.py | 2 -- gensim/test/test_ldamodel.py | 31 +++---------------------- gensim/test/test_lsimodel.py | 28 ++--------------------- 4 files changed, 48 insertions(+), 56 deletions(-) create mode 100644 gensim/test/test_basemodel.py diff --git a/gensim/test/test_basemodel.py b/gensim/test/test_basemodel.py new file mode 100644 index 0000000000..1c2cb939c3 --- /dev/null +++ b/gensim/test/test_basemodel.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2010 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Automated tests for checking transformation algorithms (the models package). 
+""" + +import six + + +class TestBaseTopicModel(): + def testPrintTopic(self): + topics = self.model.show_topics(formatted=True) + for topic_no, topic in topics: + self.assertTrue(isinstance(topic_no, int)) + self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) + + def testPrintTopics(self): + topics = self.model.print_topics() + + for topic_no, topic in topics: + self.assertTrue(isinstance(topic_no, int)) + self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) + + def testShowTopic(self): + topic = self.model.show_topic(1) + + for k, v in topic: + self.assertTrue(isinstance(k, six.string_types)) + self.assertTrue(isinstance(v, float)) + + def testShowTopics(self): + topics = self.model.show_topics(formatted=False) + + for topic_no, topic in topics: + self.assertTrue(isinstance(topic_no, int)) + self.assertTrue(isinstance(topic, list)) + for k, v in topic: + self.assertTrue(isinstance(k, six.string_types)) + self.assertTrue(isinstance(v, float)) diff --git a/gensim/test/test_hdpmodel.py b/gensim/test/test_hdpmodel.py index 0d140d3316..4cef57ed9f 100644 --- a/gensim/test/test_hdpmodel.py +++ b/gensim/test/test_hdpmodel.py @@ -47,7 +47,6 @@ def testfile(): return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') - class TestHdpModel(unittest.TestCase): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) @@ -65,7 +64,6 @@ def testShowTopics(self): self.assertTrue(isinstance(v, float)) - if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index 3f4703b2e5..cc2cececc9 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -23,6 +23,7 @@ from gensim.corpora import mmcorpus, Dictionary from gensim.models import ldamodel, ldamulticore from gensim import matutils +from gensim.test import test_basemodel module_path = 
os.path.dirname(__file__) # needed because sample data files are located in the same folder @@ -53,7 +54,8 @@ def testRandomState(): for testcase in testcases: assert(isinstance(ldamodel.get_random_state(testcase), numpy.random.RandomState)) -class TestLdaModel(unittest.TestCase): + +class TestLdaModel(unittest.TestCase, test_basemodel.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.class_ = ldamodel.LdaModel @@ -217,7 +219,6 @@ def testEta(self): kwargs['eta'] = "gensim is cool" self.assertRaises(ValueError, self.class_, **kwargs) - def testTopTopics(self): top_topics = self.model.top_topics(self.corpus) @@ -236,32 +237,6 @@ def testGetTopicTerms(self): self.assertTrue(isinstance(k, numbers.Integral)) self.assertTrue(isinstance(v, float)) - def testShowTopic(self): - topic = self.model.show_topic(1) - - for k, v in topic: - self.assertTrue(isinstance(k, six.string_types)) - self.assertTrue(isinstance(v, float)) - - def testShowTopics(self): - - #testing show_topic - topics = self.model.show_topics(formatted=False) - - for topic_no, topic in topics: - self.assertTrue(isinstance(topic_no, int)) - self.assertTrue(isinstance(topic, list)) - for k, v in topic: - self.assertTrue(isinstance(k, six.string_types)) - self.assertTrue(isinstance(v, float)) - - #testing print_topic - topics = self.model.show_topics(formatted=True) - for topic_no, topic in topics: - self.assertTrue(isinstance(topic_no, int)) - self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) - - def testGetDocumentTopics(self): model = self.class_(self.corpus, id2word=dictionary, num_topics=2, passes= 100, random_state=numpy.random.seed(0)) diff --git a/gensim/test/test_lsimodel.py b/gensim/test/test_lsimodel.py index 848339fd12..ab86c18d4f 100644 --- a/gensim/test/test_lsimodel.py +++ b/gensim/test/test_lsimodel.py @@ -22,6 +22,7 @@ from gensim.corpora import mmcorpus, Dictionary from gensim.models import lsimodel from gensim import 
matutils +from gensim.test import test_basemodel module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder @@ -50,7 +51,7 @@ def testfile(): return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') -class TestLsiModel(unittest.TestCase): +class TestLsiModel(unittest.TestCase, test_basemodel.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.model = lsimodel.LsiModel(self.corpus, num_topics=2) @@ -72,31 +73,6 @@ def testTransform(self): # expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version self.assertTrue(numpy.allclose(abs(vec), abs(expected))) # transformed entries must be equal up to sign - def testShowTopic(self): - topic = self.model.show_topic(1) - - for k, v in topic: - self.assertTrue(isinstance(k, six.string_types)) - self.assertTrue(isinstance(v, float)) - - def testShowTopics(self): - - #testing show_topic - topics = self.model.show_topics(formatted=False) - - for topic_no, topic in topics: - self.assertTrue(isinstance(topic_no, int)) - self.assertTrue(isinstance(topic, list)) - for k, v in topic: - self.assertTrue(isinstance(k, six.string_types)) - self.assertTrue(isinstance(v, float)) - - #testing print_topic - topics = self.model.show_topics(formatted=True) - for topic_no, topic in topics: - self.assertTrue(isinstance(topic_no, int)) - self.assertTrue(isinstance(topic, str)) - def testCorpusTransform(self): """Test lsi[corpus] transformation.""" model = self.model From 098be5fb22e5e6dbdb30354a3814938fa8061c69 Mon Sep 17 00:00:00 2001 From: markroxor Date: Mon, 17 Oct 2016 16:36:47 +0530 Subject: [PATCH 10/12] unified test_hdpmodel --- gensim/models/hdpmodel.py | 5 ++++- gensim/test/test_basemodel.py | 1 - gensim/test/test_hdpmodel.py | 16 +++++----------- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index 043c10355a..5e8dd9cc5b 100755 --- 
a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -570,6 +570,9 @@ def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None self.style = style + def print_topics(self, num_topics=10, num_words=10): + return self.show_topics(num_topics, num_words, True) + def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): shown = [] if num_topics < 0: @@ -610,4 +613,4 @@ def format_topic(self, topic_id, topic_terms): fmt = 'topic %i:\n%s' % (topic_id, fmt) return fmt -#endclass HdpTopicFormatter +# endclass HdpTopicFormatter diff --git a/gensim/test/test_basemodel.py b/gensim/test/test_basemodel.py index 1c2cb939c3..21ea08d7b4 100644 --- a/gensim/test/test_basemodel.py +++ b/gensim/test/test_basemodel.py @@ -10,7 +10,6 @@ import six - class TestBaseTopicModel(): def testPrintTopic(self): topics = self.model.show_topics(formatted=True) diff --git a/gensim/test/test_hdpmodel.py b/gensim/test/test_hdpmodel.py index 4cef57ed9f..8c0495cb9a 100644 --- a/gensim/test/test_hdpmodel.py +++ b/gensim/test/test_hdpmodel.py @@ -22,6 +22,7 @@ from gensim.corpora import mmcorpus, Dictionary from gensim.models import hdpmodel from gensim import matutils +from gensim.test import test_basemodel module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder @@ -47,22 +48,15 @@ def testfile(): return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') -class TestHdpModel(unittest.TestCase): +class TestHdpModel(unittest.TestCase, test_basemodel.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.class_ = hdpmodel.HdpModel self.model = self.class_(corpus, id2word=dictionary) - def testShowTopics(self): - topics = self.model.show_topics(formatted=False, num_topics=20, num_words=20) - - for topic_no, topic in topics: - self.assertTrue(isinstance(topic_no, int)) - self.assertTrue(isinstance(topic, list)) - for k, v in topic: - 
self.assertTrue(isinstance(k, six.string_types)) - self.assertTrue(isinstance(v, float)) - + def testShowTopic(self): + # TODO create show_topic in HdpModel and then test + return if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From a215aedca438a84704c2c862b14c4cdf457fc2dd Mon Sep 17 00:00:00 2001 From: markroxor Date: Tue, 18 Oct 2016 14:02:14 +0530 Subject: [PATCH 11/12] changed hdpformatter.show_topics behaviour --- CHANGELOG.md | 2 +- gensim/models/hdpmodel.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62a23fa56c..c0fb903c9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ Changes * Fixed issue #851, In summarizer.py, RunTimeError is raised if single sentence input is provided to avoid ZeroDivionError. (@metalaman, #887) * Fixed issue [#791](https://github.com/RaRe-Technologies/gensim/issues/791), correct logic for iterating over SimilarityABC interface. ([@MridulS](https://github.com/MridulS), [#839](https://github.com/RaRe-Technologies/gensim/pull/839)) * Fixed issue [#938](https://github.com/RaRe-Technologies/gensim/issues/938),Creating a unified base class for all topic models. 
([@markroxor](https://github.com/markroxor), [#946](https://github.com/RaRe-Technologies/gensim/pull/946)) - + - breaking change - may break HdpTopicFormatter.show_topics_ 0.13.2, 2016-08-19 diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index 5e8dd9cc5b..113d6bcaa8 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -572,7 +572,7 @@ def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None def print_topics(self, num_topics=10, num_words=10): return self.show_topics(num_topics, num_words, True) - + def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): shown = [] if num_topics < 0: @@ -607,10 +607,9 @@ def show_topic_terms(self, topic_data, num_words): def format_topic(self, topic_id, topic_terms): if self.STYLE_GENSIM == self.style: fmt = ' + '.join(['%.3f*%s' % (weight, word) for (word, weight) in topic_terms]) - fmt = 'topic %i: %s' % (topic_id, fmt) else: fmt = '\n'.join([' %20s %.8f' % (word, weight) for (word, weight) in topic_terms]) - fmt = 'topic %i:\n%s' % (topic_id, fmt) + fmt = (topic_id,fmt) return fmt # endclass HdpTopicFormatter From 6de21211853fbcec5fa7e36c0a9ccd0d74547953 Mon Sep 17 00:00:00 2001 From: markroxor Date: Tue, 18 Oct 2016 14:02:39 +0530 Subject: [PATCH 12/12] changed hdpformatter.show_topics behaviour --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c0fb903c9b..7d4f8ec5e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ Changes * Fixed issue #851, In summarizer.py, RunTimeError is raised if single sentence input is provided to avoid ZeroDivionError. (@metalaman, #887) * Fixed issue [#791](https://github.com/RaRe-Technologies/gensim/issues/791), correct logic for iterating over SimilarityABC interface. 
([@MridulS](https://github.com/MridulS), [#839](https://github.com/RaRe-Technologies/gensim/pull/839)) * Fixed issue [#938](https://github.com/RaRe-Technologies/gensim/issues/938),Creating a unified base class for all topic models. ([@markroxor](https://github.com/markroxor), [#946](https://github.com/RaRe-Technologies/gensim/pull/946)) - - breaking change - may break HdpTopicFormatter.show_topics_ + - _breaking change - may break HdpTopicFormatter.show_\__topics_ 0.13.2, 2016-08-19