Skip to content

Commit

Permalink
LDA hyperparameter fix: eta dimensionality and optimization (#1024)
Browse files Browse the repository at this point in the history
* Copied ldamodel into ldamodel2. Going to make changes to ldamodel2 and compare to the original.

* Fixed initialization of eta, and optimization of eta.

* Changed the name of both LDA versions, so that it is possible to see what changes I have made in the PR.

* Added a notebook with tests.

* Added check of eta shape. Added check that eta is not 'asymmetric', raising ValueError if it is. Updated test notebook.

* Updated lda unit tests. Expected dimensions of eta are now '(num_terms,)'. Removed tests of asymmetric eta, and where eta has shape '(num_topics, num_terms)'.

* Just removed a print statement.

* Not logging eta, as it can be quite huge. Updates w.r.t. requested changes on PR.

* Re-introduced K x V asymmetric priors. Updated eta docstring.

* Reverted positivity of prior check to the way it was before.

* Added an assertRaises for the asymmetric option of eta.

* Removed an incorrect unit test. Added unit tests for K x V eta.

* Fixed indentation.

* Added an assertRaises where eta is too long.

* Removed temporary test notebook and old version of lda.

* Removed import of old version of lda.

* Removed unnecessary comments.

* Updated CHANGELOG.md
  • Loading branch information
olavurmortensen authored and tmylk committed Nov 29, 2016
1 parent 0b2f6b8 commit 54871ba
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 51 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
Changes
=======

* Fix automatic learning of eta (prior over words) in LDA (@olavurmortensen, [#1024](https://github.com/RaRe-Technologies/gensim/pull/1024#)).
* eta should have dimensionality V (size of vocab), not K (number of topics). eta with shape K x V is still allowed, as the user may want to impose specific prior information on each topic.
* The "asymmetric" option is no longer allowed for eta. Asymmetric priors over words in general are still fine (learned or user defined).
* As a result, the eta update (`update_eta`) was simplified somewhat. It also no longer logs eta when updated, because it is too large for that.
* Unit tests were updated accordingly. The unit tests expect a different shape than before; some unit tests were redundant after the change; `eta='asymmetric'` now should raise an error.

0.13.5, 2016-11-12
* Add delete_temporary_training_data() function to word2vec and doc2vec models. (@deepmipt-VladZhukov, [#987](https://github.com/RaRe-Technologies/gensim/pull/987))

Expand Down
62 changes: 33 additions & 29 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,12 +239,13 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
prior directly from your data.
`eta` can be a scalar for a symmetric prior over topic/word
distributions, or a matrix of shape num_topics x num_words, which can
be used to impose asymmetric priors over the word distribution on a
per-topic basis. This may be useful if you want to seed certain topics
with particular words by boosting the priors for those words. It also
supports the special value 'auto', which learns an asymmetric prior
directly from your data.
distributions, or a vector of shape num_words, which can be used to
impose (user defined) asymmetric priors over the word distribution.
It also supports the special value 'auto', which learns an asymmetric
prior over words directly from your data. `eta` can also be a matrix
of shape num_topics x num_words, which can be used to impose
asymmetric priors over the word distribution on a per-topic basis
(can not be learned from data).
Turn on `distributed` to force distributed computing (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_
on how to set up a cluster of machines for gensim).
Expand All @@ -270,6 +271,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
>>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5) # train asymmetric alpha from data
"""

# store user-supplied parameters
self.id2word = id2word
if corpus is None and self.id2word is None:
Expand Down Expand Up @@ -305,13 +307,17 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,

assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

if isinstance(eta, six.string_types):
if eta == 'asymmetric':
raise ValueError("The 'asymmetric' option cannot be used for eta")

self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

self.random_state = get_random_state(random_state)

assert (self.eta.shape == (self.num_topics, 1) or self.eta.shape == (self.num_topics, self.num_terms)), (
"Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
(str(self.eta.shape), self.num_topics, self.num_topics, self.num_terms))
assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), (
"Invalid eta shape. Got shape %s, but expected (%d,) or (%d, %d)" %
(str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms))

# VB constants
self.iterations = iterations
Expand Down Expand Up @@ -354,39 +360,39 @@ def init_dir_prior(self, prior, name):
if prior is None:
prior = 'symmetric'

if name == 'alpha':
prior_shape = self.num_topics
elif name == 'eta':
prior_shape = self.num_terms
else:
raise ValueError("'name' must be 'alpha' or 'eta'")

is_auto = False

if isinstance(prior, six.string_types):
if prior == 'symmetric':
logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(self.num_topics)])
logger.info("using symmetric %s at %s", name, 1.0 / prior_shape)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
elif prior == 'asymmetric':
init_prior = np.asarray([1.0 / (i + np.sqrt(self.num_topics)) for i in xrange(self.num_topics)])
init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)])
init_prior /= init_prior.sum()
logger.info("using asymmetric %s %s", name, list(init_prior))
elif prior == 'auto':
is_auto = True
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(self.num_topics)])
logger.info("using autotuned %s, starting with %s", name, list(init_prior))
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
if name == 'alpha':
logger.info("using autotuned %s, starting with %s", name, list(init_prior))
else:
raise ValueError("Unable to determine proper %s value given '%s'" % (name, prior))
elif isinstance(prior, list):
init_prior = np.asarray(prior)
elif isinstance(prior, np.ndarray):
init_prior = prior
elif isinstance(prior, np.number) or isinstance(prior, numbers.Real):
init_prior = np.asarray([prior] * self.num_topics)
init_prior = np.asarray([prior] * prior_shape)
else:
raise ValueError("%s must be either a np array of scalars, list of scalars, or scalar" % name)

if name == 'eta':
# please note the difference in shapes between alpha and eta:
# alpha is a row: [0.1, 0.1]
# eta is a column: [[0.1],
# [0.1]]
if init_prior.shape == (self.num_topics,) or init_prior.shape == (1, self.num_topics):
init_prior = init_prior.reshape((self.num_topics, 1)) # this statement throws ValueError if eta did not match self.num_topics

return init_prior, is_auto

def __str__(self):
Expand Down Expand Up @@ -522,13 +528,10 @@ def update_eta(self, lambdat, rho):
Update parameters for the Dirichlet prior on the per-topic
word weights `eta` given the last `lambdat`.
"""
if self.eta.shape[1] != 1:
raise ValueError("Can't use update_eta with eta matrices, only column vectors.")
N = float(lambdat.shape[1])
logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat.transpose()) / N).reshape((self.num_topics, 1))
N = float(lambdat.shape[0])
logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))

self.eta = update_dir_prior(self.eta, N, logphat, rho)
logger.info("optimized eta %s", list(self.eta.reshape((self.num_topics))))

return self.eta

Expand Down Expand Up @@ -767,9 +770,10 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0):
if np.ndim(self.eta) == 0:
sum_eta = self.eta * self.num_terms
else:
sum_eta = np.sum(self.eta, 1)
sum_eta = np.sum(self.eta)

score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))

return score

def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
Expand Down
38 changes: 16 additions & 22 deletions gensim/test/test_ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,49 +159,40 @@ def testEta(self):
num_topics=2,
eta=None
)
expected_shape = (2, 1)
num_terms = len(dictionary)
expected_shape = (num_terms,)

# should not raise anything
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == np.array([[0.5], [0.5]])))
self.assertTrue(all(model.eta == np.array([0.5] * num_terms)))

kwargs['eta'] = 'symmetric'
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == np.array([[0.5], [0.5]])))

kwargs['eta'] = 'asymmetric'
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(np.allclose(model.eta, [[0.630602], [0.369398]]))
self.assertTrue(all(model.eta == np.array([0.5] * num_terms)))

kwargs['eta'] = 0.3
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == np.array([[0.3], [0.3]])))
self.assertTrue(all(model.eta == np.array([0.3] * num_terms)))

kwargs['eta'] = 3
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == np.array([[3], [3]])))

kwargs['eta'] = [[0.3], [0.3]]
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == np.array([[0.3], [0.3]])))
self.assertTrue(all(model.eta == np.array([3] * num_terms)))

kwargs['eta'] = [0.3, 0.3]
kwargs['eta'] = [0.3] * num_terms
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == np.array([[0.3], [0.3]])))
self.assertTrue(all(model.eta == np.array([0.3] * num_terms)))

kwargs['eta'] = np.array([0.3, 0.3])
kwargs['eta'] = np.array([0.3] * num_terms)
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == np.array([[0.3], [0.3]])))
self.assertTrue(all(model.eta == np.array([0.3] * num_terms)))

# should be ok with num_topics x num_terms
# should be ok with num_topics x num_terms
testeta = np.array([[0.5] * len(dictionary)] * 2)
kwargs['eta'] = testeta
self.class_(**kwargs)
Expand All @@ -210,15 +201,18 @@ def testEta(self):
kwargs['eta'] = testeta.reshape(tuple(reversed(testeta.shape)))
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['eta'] = [0.3, 0.3, 0.3]
kwargs['eta'] = [0.3]
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['eta'] = [0.3]
kwargs['eta'] = [0.3] * (num_terms + 1)
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['eta'] = "gensim is cool"
self.assertRaises(ValueError, self.class_, **kwargs)

kwargs['eta'] = "asymmetric"
self.assertRaises(ValueError, self.class_, **kwargs)

def testTopTopics(self):
top_topics = self.model.top_topics(self.corpus)

Expand Down

0 comments on commit 54871ba

Please sign in to comment.