From 834e130e36a4457ebe286ce6a9120e965eed527e Mon Sep 17 00:00:00 2001 From: Prakhar Pratyush Date: Wed, 24 May 2017 03:45:38 +0530 Subject: [PATCH] gensim models show_topic/print_topic parameter num_words changed to topn to match other topic models. Backwards compatible (#1200) * Update CHANGELOG.txt * Update CHANGELOG.txt * Release version typo fix * Typo in version * show_topic parameter num_words changed to topn show_topic parameter num_words changed to topn in order to make it consistent with LdaModel show_topic parameter num_words changed to topn both old and new param with deprecation warning ldamallet now supports both num_words and topn parameters for show_topic with deprecation warning for the num_words. hdpmodel show_topic supports old and new param show_topic in hdpmodel now supports both num_words and topn parameters to make it consistent across all models, with deprecation warning for num_words dtmmodel topn/num_words with deprecation warning Inconsistency between api and code removed for topn/num_words by adding support for both params with proper deprecation warning hdpmodel show_topic supports old and new param show_topic in hdpmodel now supports both num_words and topn parameters to make it consistent across all models, with deprecation warning for num_words - checks should pass this time hdpmodel show_topic supports old and new param dtmmodel topn/num_words with deprecation warning ldamallet show_topic param fixed ldamallet now supports both num_words and topn parameters for show_topic with deprecation warning for the num_words. dtmmodel topn/num_words with deprecation warning dtmmodel is now compatible with both topn/num_words parameters for show_topic and others with proper deprecation warnings. 
hdpmodel num_words changed to topn with deprecation warning To make the code consistent with the api- parameters num_words changed to topn (for print_topic/show_topic method), with deprecation warning for num_words hdpmodel num_words changed to topn with deprecation warning To make the code consistent with the api- parameters num_words changed to topn (for print_topic/show_topic method), with deprecation warning for num_words hdpmodel num_words changed to topn with deprecation warning To make the code consistent with the api- parameters num_words changed to topn (for print_topic/show_topic method), with deprecation warning for num_words dtmmodel num_words changed to topn with deprecation warning To make the code consistent with the api- parameters num_words changed to topn (for print_topic/show_topic method), with deprecation warning for num_words ldamallet num_words changed to topn with deprecation warning To make the code consistent with the api- parameters num_words changed to topn (for print_topic/show_topic method), with deprecation warning for num_words hdpmodel num_words changed to topn with deprecation warning To make the code consistent with the api- parameters num_words changed to topn (for print_topic/show_topic method), with deprecation warning for num_words ldamallet num_words changed to topn with deprecation warning To make the code consistent with the api- parameters num_words changed to topn (for print_topic/show_topic method), with deprecation warning for num_words * hdpmodel topn/num_words conflict resolved * dtmmodel topn/show_topic conflict resolved * ldamallet topn/num_words conflict resolved * whitespace error resolved * whitespace error resolved * split multi-line comments in hdpmodel * splitting multi-line comments in dtmmodel * splitting multi-line comments for ldamallet --- gensim/models/hdpmodel.py | 29 +++++++++++++++++++++-------- gensim/models/wrappers/dtmmodel.py | 18 ++++++++++++++---- gensim/models/wrappers/ldamallet.py | 9 
+++++++-- 3 files changed, 42 insertions(+), 14 deletions(-) diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index 2c74d15a15..ee6035a449 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -47,7 +47,6 @@ meanchangethresh = 0.00001 rhot_bound = 0.0 - def expect_log_sticks(sticks): """ For stick-breaking hdp, return the E[log(sticks)] @@ -436,7 +435,7 @@ def update_expectations(self): self.m_timestamp[:] = self.m_updatect self.m_status_up_to_date = True - def show_topic(self, topic_id, num_words=20, log=False, formatted=False): + def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=None): """ Print the `num_words` most probable words for topic `topic_id`. @@ -444,12 +443,17 @@ def show_topic(self, topic_id, num_words=20, log=False, formatted=False): `False` as lists of (weight, word) pairs. """ + if num_words is not None: # deprecated num_words is used + logger.warning("The parameter num_words for show_topic() would be deprecated in the updated version.") + logger.warning("Please use topn instead.") + topn = num_words + if not self.m_status_up_to_date: self.update_expectations() betas = self.m_lambda + self.m_eta hdp_formatter = HdpTopicFormatter(self.id2word, betas) - return hdp_formatter.show_topic(topic_id, num_words, log, formatted) - + return hdp_formatter.show_topic(topic_id, topn, log, formatted) + def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True): """ Print the `num_words` most probable words for `num_topics` number of topics. 
@@ -608,10 +612,19 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): return shown - def print_topic(self, topic_id, num_words): - return self.show_topic(topic_id, num_words, formatted=True) + def print_topic(self, topic_id, topn= None, num_words=None): + if num_words is not None: # deprecated num_words is used + logger.warning("The parameter num_words for print_topic() would be deprecated in the updated version.") + logger.warning("Please use topn instead.") + topn = num_words + + return self.show_topic(topic_id, topn, formatted=True) - def show_topic(self, topic_id, num_words, log=False, formatted=False): + def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words= None,): + if num_words is not None: # deprecated num_words is used + logger.warning("The parameter num_words for show_topic() would be deprecated in the updated version.") + logger.warning("Please use topn instead.") + topn = num_words lambdak = list(self.data[topic_id, :]) lambdak = lambdak / sum(lambdak) @@ -619,7 +632,7 @@ def show_topic(self, topic_id, num_words, log=False, formatted=False): temp = zip(lambdak, xrange(len(lambdak))) temp = sorted(temp, key=lambda x: x[0], reverse=True) - topic_terms = self.show_topic_terms(temp, num_words) + topic_terms = self.show_topic_terms(temp, topn) if formatted: topic = self.format_topic(topic_id, topic_terms) diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index a953ce858a..f9cb19362a 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -283,12 +283,17 @@ def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted # topic)) return shown - def show_topic(self, topicid, time, num_words=50): + def show_topic(self, topicid, time, topn=50, num_words=None): """ Return `num_words` most probable words for the given `topicid`, as a list of `(word_probability, word)` 2-tuples. 
""" + if num_words is not None: # deprecated num_words is used + logger.warning("The parameter num_words for show_topic() would be deprecated in the updated version.") + logger.warning("Please use topn instead.") + topn = num_words + topics = self.lambda_[:, :, time] topic = topics[topicid] # liklihood to probability @@ -296,13 +301,18 @@ def show_topic(self, topicid, time, num_words=50): # normalize to probability dist topic = topic / topic.sum() # sort according to prob - bestn = matutils.argsort(topic, num_words, reverse=True) + bestn = matutils.argsort(topic, topn, reverse=True) beststr = [(topic[id], self.id2word[id]) for id in bestn] return beststr - def print_topic(self, topicid, time, num_words=10): + def print_topic(self, topicid, time, topn=10, num_words=None): """Return the given topic, formatted as a string.""" - return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, num_words)]) + if num_words is not None: # deprecated num_words is used + logger.warning("The parameter num_words for print_topic(() would be deprecated in the updated version.") + logger.warning("Please use topn instead.") + topn = num_words + + return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, topn)]) def dtm_vis(self, corpus, time): """ diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index fb9ae1e31d..b52de2e3f0 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -240,14 +240,19 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): logger.info("topic #%i (%.3f): %s", i, self.alpha[i], topic) return shown - def show_topic(self, topicid, num_words=10): + def show_topic(self, topicid, topn=10, num_words=None): + if num_words is not None: # deprecated num_words is used + logger.warning("The parameter num_words for show_topic() would be deprecated in the updated version.") + logger.warning("Please use topn instead.") + topn = num_words + if 
self.word_topics is None: logger.warning( "Run train or load_word_topics before showing topics." ) topic = self.word_topics[topicid] topic = topic / topic.sum() # normalize to probability dist - bestn = matutils.argsort(topic, num_words, reverse=True) + bestn = matutils.argsort(topic, topn, reverse=True) beststr = [(self.id2word[id], topic[id]) for id in bestn] return beststr