Skip to content

Commit

Permalink
LDA hyperparameter fix: eta dimensionality and optimization (#1024)
Browse files Browse the repository at this point in the history
* Copied ldamodel into ldamodel2. Going to make changes to ldamodel2 and compare to the original.

* Fixed initialization of eta, and optimization of eta.

* Changed the name of both LDA versions, so that it is possible to see what changes I have made in the PR.

* Added a notebook with tests.

* Added check of eta shape. Added check that eta is not 'asymmetric', raising ValueError if it is. Updated test notebook.

* Updated lda unit tests. Expected dimensions of eta are now '(num_terms,)'. Removed tests of asymmetric eta, and where eta has shape '(num_topics, num_terms)'.

* Just removed a print statement.

* Not logging eta, as it can be quite huge. Updates w.r.t. requested changes on PR.

* Re-introduced K x V asymmetric priors. Updated eta docstring.

* Reverted positivity of prior check to the way it was before.

* Added an assertRaises for the asymmetric option of eta.

* Removed an incorrect unit test. Added unit tests for K x V eta.

* Fixed indentation.

* Added an assertRaises where eta is too long.

* Removed temporary test notebook and old version of lda.

* Removed import of old version of lda.

* Removed unnecessary comments.

* Updated CHANGELOG.md
  • Loading branch information
olavurmortensen authored and tmylk committed Nov 29, 2016
1 parent 0b2f6b8 commit 54871ba
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 51 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
Changes
=======

* Fix automatic learning of eta (prior over words) in LDA (@olavurmortensen, [#1024](https://github.com/RaRe-Technologies/gensim/pull/1024#)).
* eta should have dimensionality V (size of vocab), not K (number of topics). eta with shape K x V is still allowed, as the user may want to impose specific prior information on each topic.
* The "asymmetric" option is no longer allowed for eta. Asymmetric priors over words in general are still fine (learned or user defined).
* As a result, the eta update (`update_eta`) was simplified somewhat. It also no longer logs eta when updated, because it is too large for that.
* Unit tests were updated accordingly. The unit tests expect a different shape than before; some unit tests were redundant after the change; `eta='asymmetric'` now should raise an error.

0.13.5, 2016-11-12
* Add delete_temporary_training_data() function to word2vec and doc2vec models. (@deepmipt-VladZhukov, [#987](https://github.com/RaRe-Technologies/gensim/pull/987))

Expand Down
62 changes: 33 additions & 29 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,12 +239,13 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
prior directly from your data.
`eta` can be a scalar for a symmetric prior over topic/word
distributions, or a matrix of shape num_topics x num_words, which can
be used to impose asymmetric priors over the word distribution on a
per-topic basis. This may be useful if you want to seed certain topics
with particular words by boosting the priors for those words. It also
supports the special value 'auto', which learns an asymmetric prior
directly from your data.
distributions, or a vector of shape num_words, which can be used to
impose (user defined) asymmetric priors over the word distribution.
It also supports the special value 'auto', which learns an asymmetric
prior over words directly from your data. `eta` can also be a matrix
of shape num_topics x num_words, which can be used to impose
asymmetric priors over the word distribution on a per-topic basis
(can not be learned from data).
Turn on `distributed` to force distributed computing (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_
on how to set up a cluster of machines for gensim).
Expand All @@ -270,6 +271,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
>>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5) # train asymmetric alpha from data
"""

# store user-supplied parameters
self.id2word = id2word
if corpus is None and self.id2word is None:
Expand Down Expand Up @@ -305,13 +307,17 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,

assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

if isinstance(eta, six.string_types):
if eta == 'asymmetric':
raise ValueError("The 'asymmetric' option cannot be used for eta")

self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

self.random_state = get_random_state(random_state)

assert (self.eta.shape == (self.num_topics, 1) or self.eta.shape == (self.num_topics, self.num_terms)), (
"Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
(str(self.eta.shape), self.num_topics, self.num_topics, self.num_terms))
assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), (
"Invalid eta shape. Got shape %s, but expected (%d,) or (%d, %d)" %
(str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms))

# VB constants
self.iterations = iterations
Expand Down Expand Up @@ -354,39 +360,39 @@ def init_dir_prior(self, prior, name):
if prior is None:
prior = 'symmetric'

if name == 'alpha':
prior_shape = self.num_topics
elif name == 'eta':
prior_shape = self.num_terms
else:
raise ValueError("'name' must be 'alpha' or 'eta'")

is_auto = False

if isinstance(prior, six.string_types):
if prior == 'symmetric':
logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(self.num_topics)])
logger.info("using symmetric %s at %s", name, 1.0 / prior_shape)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
elif prior == 'asymmetric':
init_prior = np.asarray([1.0 / (i + np.sqrt(self.num_topics)) for i in xrange(self.num_topics)])
init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)])
init_prior /= init_prior.sum()
logger.info("using asymmetric %s %s", name, list(init_prior))
elif prior == 'auto':
is_auto = True
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(self.num_topics)])
logger.info("using autotuned %s, starting with %s", name, list(init_prior))
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
if name == 'alpha':
logger.info("using autotuned %s, starting with %s", name, list(init_prior))
else:
raise ValueError("Unable to determine proper %s value given '%s'" % (name, prior))
elif isinstance(prior, list):
init_prior = np.asarray(prior)
elif isinstance(prior, np.ndarray):
init_prior = prior
elif isinstance(prior, np.number) or isinstance(prior, numbers.Real):
init_prior = np.asarray([prior] * self.num_topics)
init_prior = np.asarray([prior] * prior_shape)
else:
raise ValueError("%s must be either a np array of scalars, list of scalars, or scalar" % name)

if name == 'eta':
# please note the difference in shapes between alpha and eta:
# alpha is a row: [0.1, 0.1]
# eta is a column: [[0.1],
# [0.1]]
if init_prior.shape == (self.num_topics,) or init_prior.shape == (1, self.num_topics):
init_prior = init_prior.reshape((self.num_topics, 1)) # this statement throws ValueError if eta did not match self.num_topics

return init_prior, is_auto

def __str__(self):
Expand Down Expand Up @@ -522,13 +528,10 @@ def update_eta(self, lambdat, rho):
Update parameters for the Dirichlet prior on the per-topic
word weights `eta` given the last `lambdat`.
"""
if self.eta.shape[1] != 1:
raise ValueError("Can't use update_eta with eta matrices, only column vectors.")
N = float(lambdat.shape[1])
logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat.transpose()) / N).reshape((self.num_topics, 1))
N = float(lambdat.shape[0])
logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))

self.eta = update_dir_prior(self.eta, N, logphat, rho)
logger.info("optimized eta %s", list(self.eta.reshape((self.num_topics))))

return self.eta

Expand Down Expand Up @@ -767,9 +770,10 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0):
if np.ndim(self.eta) == 0:
sum_eta = self.eta * self.num_terms
else:
sum_eta = np.sum(self.eta, 1)
sum_eta = np.sum(self.eta)

score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))

return score

def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
Expand Down
38 changes: 16 additions & 22 deletions gensim/test/test_ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,49 +159,40 @@ def testEta(self):
num_topics=2,
eta=None
)
expected_shape = (2, 1)
num_terms = len(dictionary)
expected_shape = (num_terms,)

# should not raise anything
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == np.array([[0.5], [0.5]])))
self.assertTrue(all(model.eta == np.array([0.5] * num_terms)))

kwargs['eta'] = 'symmetric'
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == np.array([[0.5], [0.5]])))

kwargs['eta'] = 'asymmetric'
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(np.allclose(model.eta, [[0.630602], [0.369398]]))
self.assertTrue(all(model.eta == np.array([0.5] * num_terms)))

kwargs['eta'] = 0.3
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == np.array([[0.3], [0.3]])))
self.assertTrue(all(model.eta == np.array([0.3] * num_terms)))

kwargs['eta'] = 3
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == np.array([[3], [3]])))

kwargs['eta'] = [[0.3], [0.3]]
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == np.array([[0.3], [0.3]])))
self.assertTrue(all(model.eta == np.array([3] * num_terms)))

kwargs['eta'] = [0.3, 0.3]
kwargs['eta'] = [0.3] * num_terms
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == np.array([[0.3], [0.3]])))
self.assertTrue(all(model.eta == np.array([0.3] * num_terms)))

kwargs['eta'] = np.array([0.3, 0.3])
kwargs['eta'] = np.array([0.3] * num_terms)
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == np.array([[0.3], [0.3]])))
self.assertTrue(all(model.eta == np.array([0.3] * num_terms)))

# should be ok with num_topics x num_terms
# should be ok with num_topics x num_terms
testeta = np.array([[0.5] * len(dictionary)] * 2)
kwargs['eta'] = testeta
self.class_(**kwargs)
Expand All @@ -210,15 +201,18 @@ def testEta(self):
kwargs['eta'] = testeta.reshape(tuple(reversed(testeta.shape)))
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['eta'] = [0.3, 0.3, 0.3]
kwargs['eta'] = [0.3]
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['eta'] = [0.3]
kwargs['eta'] = [0.3] * (num_terms + 1)
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['eta'] = "gensim is cool"
self.assertRaises(ValueError, self.class_, **kwargs)

kwargs['eta'] = "asymmetric"
self.assertRaises(ValueError, self.class_, **kwargs)

def testTopTopics(self):
top_topics = self.model.top_topics(self.corpus)

Expand Down

0 comments on commit 54871ba

Please sign in to comment.