def is_empty(corpus):
    """Is the corpus (an iterable or a scipy.sparse array) empty?

    This is a best-effort check that never consumes documents from the input.

    Parameters
    ----------
    corpus : iterable or scipy.sparse matrix
        The corpus to inspect.

    Returns
    -------
    bool
        True only if `corpus` is known to contain no documents. False if it
        contains at least one document, or if emptiness cannot be determined
        without side effects (generators and other one-shot iterators), or if
        `corpus` cannot be iterated at all.

    """
    if scipy.sparse.issparse(corpus):
        return corpus.shape[1] == 0  # by convention, scipy.sparse documents are columns
    if isinstance(corpus, types.GeneratorType):
        return False  # don't try to guess emptiness of generators, may lose elements irretrievably
    try:
        doc_iter = iter(corpus)
        if doc_iter is corpus:
            # `corpus` is its own iterator (iter(seq), map, zip, file object, ...), i.e. a one-shot
            # stream just like a generator: peeking at it would lose the first document for good.
            return False
        # list, numpy array etc: iter() produced a fresh iterator, so peeking is harmless
        next(doc_iter)
    except StopIteration:
        return True
    except Exception:
        # Deliberately best-effort: anything that cannot be iterated (or fails while iterating)
        # is reported as "not empty" so that callers proceed and surface the real error themselves.
        return False
    return False  # first document exists => not empty