diff --git a/gensim/utils.py b/gensim/utils.py index 1a499a6d61..3f35824fed 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -4,9 +4,7 @@ # Copyright (C) 2010 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -This module contains various general utility functions. -""" +"""This module contains various general utility functions.""" from __future__ import with_statement @@ -43,50 +41,44 @@ from six import iterkeys, iteritems, u, string_types, unichr from six.moves import xrange +from smart_open import smart_open + if sys.version_info[0] >= 3: unicode = str logger = logging.getLogger(__name__) -try: - from smart_open import smart_open -except ImportError: - logger.info("smart_open library not found; falling back to local-filesystem-only") - def make_closing(base, **attrs): - """ - Add support for `with Base(attrs) as fout:` to the base class if it's missing. - The base class' `close()` method will be called on context exit, to always close the file properly. +PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE) +RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE) - This is needed for gzip.GzipFile, bz2.BZ2File etc in older Pythons (<=2.6), which otherwise - raise "AttributeError: GzipFile instance has no attribute '__exit__'". - """ - if not hasattr(base, '__enter__'): - attrs['__enter__'] = lambda self: self - if not hasattr(base, '__exit__'): - attrs['__exit__'] = lambda self, type, value, traceback: self.close() - return type('Closing' + base.__name__, (base, object), attrs) - - def smart_open(fname, mode='rb'): - _, ext = os.path.splitext(fname) - if ext == '.bz2': - from bz2 import BZ2File - return make_closing(BZ2File)(fname, mode) - if ext == '.gz': - from gzip import GzipFile - return make_closing(GzipFile)(fname, mode) - return open(fname, mode) +def get_random_state(seed): + """Generate :class:`numpy.random.RandomState` based on input seed. + Parameters + ---------- + seed : {None, int, array_like} + Seed for random state. -PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE) -RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE) + Returns + ------- + :class:`numpy.random.RandomState` + Random state. + Raises + ------ + AttributeError + If seed is not {None, int, array_like}. + + Notes + ----- + Method originally from [1]_ and written by @joshloyal. + + References + ---------- + .. [1] https://github.com/maciejkula/glove-python -def get_random_state(seed): - """ - Turn seed into a np.random.RandomState instance. - Method originally from maciejkula/glove-python, and written by @joshloyal. """ if seed is None or seed is np.random: return np.random.mtrand._rand @@ -98,10 +90,16 @@ def get_random_state(seed): def synchronous(tlockname): - """ - A decorator to place an instance-based lock around a method. + """A decorator to place an instance-based lock around a method. + + Notes + ----- + Adapted from [2]_ + + References + ---------- + .. [2] http://code.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/ - Adapted from http://code.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/ """ def _synched(func): @wraps(func) @@ -118,27 +116,18 @@ def _synchronizer(self, *args, **kwargs): return _synched -class NoCM(object): - def acquire(self): - pass - - def release(self): - pass - - def __enter__(self): - pass - - def __exit__(self, type, value, traceback): - pass - - -nocm = NoCM() +def file_or_filename(input): + """Open file with `smart_open`. 
+
+    Parameters
+    ----------
+    input : str or file-like
+        Filename or file-like object.
+
-def file_or_filename(input):
-    """
-    Return a file-like object ready to be read from the beginning. `input` is either
-    a filename (gz/bz2 also supported) or a file-like object supporting seek.
+    Returns
+    -------
+    input : file-like object
+        An open file, positioned at the beginning. If `input` was already a file-like object,
+        it is returned after seeking back to the start.
 
     """
     if isinstance(input, string_types):
@@ -151,11 +140,21 @@
 
 def deaccent(text):
-    """
-    Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring.
+    """Remove accentuation from the given string.
 
-    Return input string with accents removed, as unicode.
+    Parameters
+    ----------
+    text : str
+        Input string (unicode or utf8-encoded bytestring).
 
+    Returns
+    -------
+    str
+        Unicode string without accentuation.
+
+    Examples
+    --------
+    >>> from gensim.utils import deaccent
     >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
     u'Sef chomutovskych komunistu dostal postou bily prasek'
 
@@ -169,9 +168,19 @@
 
 def copytree_hardlink(source, dest):
-    """
-    Recursively copy a directory ala shutils.copytree, but hardlink files
-    instead of copying. Available on UNIX systems only.
+    """Recursively copy a directory a la shutil.copytree, but hardlink files instead of copying.
+
+    Parameters
+    ----------
+    source : str
+        Path to the source directory.
+    dest : str
+        Path to the destination directory.
+
+    Warnings
+    --------
+    Available on UNIX systems only.
+
     """
     copy2 = shutil.copy2
     try:
@@ -182,17 +191,35 @@
 
 def tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict", to_lower=False, lower=False):
-    """
-    Iteratively yield tokens as unicode strings, removing accent marks
-    and optionally lowercasing the unidoce string by assigning True
-    to one of the parameters, lowercase, to_lower, or lower.
-
-    Input text may be either unicode or utf8-encoded byte string.
-
-    The tokens on output are maximal contiguous sequences of alphabetic
-    characters (no digits!).
-
-    >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc = True))
+    """Iteratively yield tokens as unicode strings, removing accent marks and optionally lowercasing
+    the string if any of `lowercase`, `to_lower` or `lower` is set to True.
+
+    Parameters
+    ----------
+    text : str
+        Input string (unicode or utf8-encoded bytestring).
+    lowercase : bool, optional
+        If True, lowercase the input string.
+    deacc : bool, optional
+        If True, remove accentuation from the string using :func:`~gensim.utils.deaccent`.
+    encoding : str, optional
+        Encoding of the input string, passed to :func:`~gensim.utils.to_unicode`.
+    errors : str, optional
+        Error handling behaviour, passed to :func:`~gensim.utils.to_unicode`.
+    to_lower : bool, optional
+        Same as `lowercase`.
+    lower : bool, optional
+        Same as `lowercase`.
+
+    Yields
+    ------
+    str
+        Maximal contiguous sequences of alphabetic characters (no digits!),
+        produced by :func:`~gensim.utils.simple_tokenize`.
+
+    Examples
+    --------
+    >>> from gensim.utils import tokenize
+    >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc=True))
     [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']
 
     """
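Illustrative only, not part of the patch: a quick sketch of the helpers documented above. The byte string and sentence are made up, and the exact unicode reprs differ between Python 2 and 3.

import io
from gensim.utils import file_or_filename, tokenize

buf = io.BytesIO(b"some bytes")
buf.read()                                             # exhaust the handle
assert file_or_filename(buf).read() == b"some bytes"   # handed back, rewound to the start

print(list(tokenize("Ceci 42 est un test", lowercase=True)))
# [u'ceci', u'est', u'un', u'test'] on Python 2 -- digits are not tokens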
@@ -206,16 +233,42 @@ def tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict
 
 def simple_tokenize(text):
+    """Tokenize input text using :const:`gensim.utils.PAT_ALPHABETIC`.
+
+    Parameters
+    ----------
+    text : str
+        Input text.
+
+    Yields
+    ------
+    str
+        Tokens from `text`.
+
+    """
     for match in PAT_ALPHABETIC.finditer(text):
         yield match.group()
 
 
 def simple_preprocess(doc, deacc=False, min_len=2, max_len=15):
-    """
-    Convert a document into a list of tokens.
-
-    This lowercases, tokenizes, de-accents (optional). -- the output are final
-    tokens = unicode strings, that won't be processed any further.
+    """Convert a document into a list of tokens (lowercased, with optional de-accenting),
+    using :func:`~gensim.utils.tokenize`.
+
+    Parameters
+    ----------
+    doc : str
+        Input document.
+    deacc : bool, optional
+        If True, remove accentuation from the string using :func:`~gensim.utils.deaccent`.
+    min_len : int, optional
+        Minimal length of a token in the result (inclusive).
+    max_len : int, optional
+        Maximal length of a token in the result (inclusive).
+
+    Returns
+    -------
+    list of str
+        Tokens extracted from `doc`.
 
     """
     tokens = [
@@ -226,7 +279,24 @@
 
 def any2utf8(text, errors='strict', encoding='utf8'):
-    """Convert a string (unicode or bytestring in `encoding`), to bytestring in utf8."""
+    """Convert `text` to a utf8 bytestring.
+
+    Parameters
+    ----------
+    text : str
+        Input text.
+    errors : str, optional
+        Error handling behaviour, passed to the `unicode` function (python2 only).
+    encoding : str, optional
+        Encoding of `text`, passed to the `unicode` function (python2 only).
+
+    Returns
+    -------
+    str
+        Bytestring in utf8.
+
+    """
     if isinstance(text, unicode):
         return text.encode('utf8')
     # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8
@@ -237,7 +307,23 @@
 
 def any2unicode(text, encoding='utf8', errors='strict'):
-    """Convert a string (bytestring in `encoding` or unicode), to unicode."""
+    """Convert `text` (a bytestring in `encoding` or a unicode string) to unicode.
+
+    Parameters
+    ----------
+    text : str
+        Input text.
+    encoding : str, optional
+        Encoding of `text`, passed to the `unicode` function (python2 only).
+    errors : str, optional
+        Error handling behaviour, passed to the `unicode` function (python2 only).
+
+    Returns
+    -------
+    str
+        Unicode version of `text`.
+
+    """
     if isinstance(text, unicode):
         return text
     return unicode(text, encoding, errors=errors)
@@ -247,31 +333,59 @@
 
 def call_on_class_only(*args, **kwargs):
-    """Raise exception when load methods are called on instance"""
+    """Helper that always raises `AttributeError`; used to forbid calling certain methods on instances.
+
+    Parameters
+    ----------
+    *args
+        Variable length argument list.
+    **kwargs
+        Arbitrary keyword arguments.
+
+    Raises
+    ------
+    AttributeError
+        Always: the method it replaces should be called on the class object instead.
+
+    """
     raise AttributeError('This method should be called on a class object.')
 
 
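Not part of the patch — a tiny round-trip sketch of the two conversion helpers documented above:

from gensim.utils import any2utf8, any2unicode

raw = any2utf8(u"Šéf")               # unicode -> utf8-encoded bytestring
assert raw == b'\xc5\xa0\xc3\xa9f'
assert any2unicode(raw) == u"Šéf"    # and back, losslessly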
 class SaveLoad(object):
-    """
-    Objects which inherit from this class have save/load functions, which un/pickle
-    them to disk.
+    """Classes that inherit from this class have save/load methods, which un/pickle them to disk.
 
-    This uses pickle for de/serializing, so objects must not contain
-    unpicklable attributes, such as lambda functions etc.
+    Warnings
+    --------
+    This uses pickle for de/serializing, so objects must not contain unpicklable attributes,
+    such as lambda functions etc.
 
     """
     @classmethod
     def load(cls, fname, mmap=None):
-        """
-        Load a previously saved object from file (also see `save`).
-
-        If the object was saved with large arrays stored separately, you can load
-        these arrays via mmap (shared memory) using `mmap='r'`. Default: don't use
-        mmap, load large arrays as normal objects.
-
-        If the file being loaded is compressed (either '.gz' or '.bz2'), then
-        `mmap=None` must be set. Load will raise an `IOError` if this condition
-        is encountered.
+        """Load a previously saved object (using :meth:`~gensim.utils.SaveLoad.save`) from file.
+
+        Parameters
+        ----------
+        fname : str
+            Path to the file that contains the needed object.
+        mmap : str, optional
+            Memory-map option. If the object was saved with large arrays stored separately, you can load these
+            arrays via mmap (shared memory) using `mmap='r'`. By default, don't use mmap and load large arrays
+            as normal objects.
+            If the file being loaded is compressed (either '.gz' or '.bz2'), then `mmap=None` **must be** set.
+
+        See Also
+        --------
+        :meth:`~gensim.utils.SaveLoad.save`
+
+        Returns
+        -------
+        object
+            Object loaded from `fname`.
+
+        Raises
+        ------
+        IOError
+            When the file being loaded is compressed and `mmap` is not None.
 
         """
         logger.info("loading %s object from %s", cls.__name__, fname)
@@ -284,9 +398,20 @@
         return obj
 
     def _load_specials(self, fname, mmap, compress, subname):
-        """
-        Loads any attributes that were stored specially, and gives the same
-        opportunity to recursively included SaveLoad instances.
+        """Load any attributes that were stored specially, and give the same opportunity
+        to recursively included :class:`~gensim.utils.SaveLoad` instances.
+
+        Parameters
+        ----------
+        fname : str
+            Path to the file that contains the needed object.
+        mmap : str
+            Memory-map option.
+        compress : bool
+            Set to True if the file is compressed.
+        subname : function
+            Filename formula, as produced by :meth:`~gensim.utils.SaveLoad._adapt_by_suffix`.
 
         """
         def mmap_error(obj, filename):
@@ -337,14 +462,40 @@ def mmap_error(obj, filename):
 
     @staticmethod
     def _adapt_by_suffix(fname):
-        """Give appropriate compress setting and filename formula"""
+        """Give the appropriate compress setting and filename formula.
+
+        Parameters
+        ----------
+        fname : str
+            Input filename.
+
+        Returns
+        -------
+        (bool, function)
+            The first element is True if `fname` is compressed (ends with '.gz' or '.bz2').
+
+        """
         compress, suffix = (True, 'npz') if fname.endswith('.gz') or fname.endswith('.bz2') else (False, 'npy')
         return compress, lambda *args: '.'.join(args + (suffix,))
 
    def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
-        """
-        Save the object to file (also see `load`).
-
+        """Save the object to file.
+
+        Parameters
+        ----------
+        fname : str
+            Path to file.
+        separately : list, optional
+            List of attributes to be stored in separate files, or None for automatic detection.
+        sep_limit : int, optional
+            Don't store arrays smaller than this separately (in bytes).
+        ignore : frozenset, optional
+            Attributes that shouldn't be stored at all.
+        pickle_protocol : int, optional
+            Protocol number for pickle; defaults to 2 so the pickled object can be imported
+            in both Python 2 and 3.
+
+        Notes
+        -----
         If `separately` is None, automatically detect large
         numpy/scipy.sparse arrays in the object being stored, and store
         them into separate files. This avoids pickle memory errors and
@@ -354,12 +505,9 @@
         a list of attribute names to be stored in separate files. The
         automatic check is not performed in this case.
 
-        `ignore` is a set of attribute names to *not* serialize (file
-        handles, caches etc). On subsequent load() these attributes will
-        be set to None.
-
-        `pickle_protocol` defaults to 2 so the pickled object can be imported
-        in both Python 2 and 3.
+        See Also
+        --------
+        :meth:`~gensim.utils.SaveLoad.load`
 
         """
         logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately)
@@ -378,13 +526,31 @@
         logger.info("saved %s", fname)
 
     def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname):
-        """
-        Save aside any attributes that need to be handled separately, including
-        by recursion any attributes that are themselves SaveLoad instances.
-
-        Returns a list of (obj, {attrib: value, ...}) settings that the caller
-        should use to restore each object's attributes that were set aside
-        during the default pickle().
+        """Save aside any attributes that need to be handled separately, including,
+        by recursion, any attributes that are themselves :class:`~gensim.utils.SaveLoad` instances.
+
+        Parameters
+        ----------
+        fname : str
+            Output filename.
+        separately : list or None
+            List of attributes to be stored in separate files, or None for automatic detection.
+        sep_limit : int
+            Don't store arrays smaller than this separately (in bytes).
+        ignore : iterable of str
+            Attributes that shouldn't be stored at all.
+        pickle_protocol : int
+            Protocol number for pickle.
+        compress : bool
+            If True, compress the output with :func:`numpy.savez_compressed`.
+        subname : function
+            Filename formula, as produced by :meth:`~gensim.utils.SaveLoad._adapt_by_suffix`.
+
+        Returns
+        -------
+        list of (obj, {attrib: value, ...})
+            Settings that the caller should use to restore each object's attributes that were
+            set aside during the default pickling.
 
         """
         asides = {}
@@ -463,29 +629,29 @@
         return restores + [(self, asides)]
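Illustrative only, not part of the patch: how the save/load pair described above is typically exercised. `MyModel` and the temp path are made up for the example; the array is sized above the default `sep_limit` so it gets stored separately.

import numpy as np
from gensim.utils import SaveLoad

class MyModel(SaveLoad):
    def __init__(self):
        # ~32 MB > sep_limit, so save() stores it as a separate .npy file
        self.projection = np.zeros((2000, 2000))

MyModel().save('/tmp/my_model')
loaded = MyModel.load('/tmp/my_model', mmap='r')  # memory-map the large array back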
    def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
-        """
-        Save the object to file (also see `load`).
-
-        `fname_or_handle` is either a string specifying the file name to
-        save to, or an open file-like object which can be written to. If
-        the object is a file handle, no special array handling will be
-        performed; all attributes will be saved to the same file.
-
-        If `separately` is None, automatically detect large
-        numpy/scipy.sparse arrays in the object being stored, and store
-        them into separate files. This avoids pickle memory errors and
-        allows mmap'ing large arrays back on load efficiently.
-
-        You can also set `separately` manually, in which case it must be
-        a list of attribute names to be stored in separate files. The
-        automatic check is not performed in this case.
-
-        `ignore` is a set of attribute names to *not* serialize (file
-        handles, caches etc). On subsequent load() these attributes will
-        be set to None.
-
-        `pickle_protocol` defaults to 2 so the pickled object can be imported
-        in both Python 2 and 3.
+        """Save the object to file.
+
+        Parameters
+        ----------
+        fname_or_handle : str or file-like
+            Path to the output file or an already opened file-like object. If the object is a file handle,
+            no special array handling will be performed; all attributes will be saved to the same file.
+        separately : list of str or None, optional
+            If None, automatically detect large numpy/scipy.sparse arrays in the object being stored, and store
+            them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays
+            back on load efficiently.
+            If a list of str, these attributes will be stored in separate files; the automatic check
+            is not performed in this case.
+        sep_limit : int, optional
+            Don't store arrays smaller than this separately (in bytes).
+        ignore : frozenset of str, optional
+            Attributes that shouldn't be serialized (file handles, caches etc).
+            On subsequent load, these attributes will be set to None.
+        pickle_protocol : int, optional
+            Protocol number for pickle; defaults to 2 so the pickled object can be imported
+            in both Python 2 and 3.
+
+        See Also
+        --------
+        :meth:`~gensim.utils.SaveLoad.load`
 
         """
         try:
@@ -496,15 +662,38 @@
 
 def identity(p):
-    """Identity fnc, for flows that don't accept lambda (pickling etc)."""
+    """Identity function, for flows that don't accept a lambda (pickling etc).
+
+    Parameters
+    ----------
+    p : object
+        Input parameter.
+
+    Returns
+    -------
+    object
+        Same as `p`.
+
+    """
     return p
 
 
 def get_max_id(corpus):
-    """
-    Return the highest feature id that appears in the corpus.
-
-    For empty corpora (no features at all), return -1.
+    """Get the highest feature id that appears in the corpus.
+
+    Parameters
+    ----------
+    corpus : iterable of iterable of (int, int)
+        Collection of texts in BoW format.
+
+    Returns
+    -------
+    int
+        Highest feature id.
+
+    Notes
+    -----
+    For an empty `corpus` (no features at all), return -1.
 
     """
     maxid = -1
@@ -514,16 +703,22 @@
 
 class FakeDict(object):
-    """
-    Objects of this class act as dictionaries that map integer->str(integer), for
-    a specified range of integers <0, num_terms).
+    """Objects of this class act as dictionaries that map integer->str(integer), for a specified
+    range of integers [0, num_terms).
 
-    This is meant to avoid allocating real dictionaries when `num_terms` is huge, which
-    is a waste of memory.
+    This is meant to avoid allocating real dictionaries when `num_terms` is huge, which is a waste of memory.
 
     """
 
     def __init__(self, num_terms):
+        """
+
+        Parameters
+        ----------
+        num_terms : int
+            Number of terms.
+
+        """
         self.num_terms = num_terms
 
     def __str__(self):
@@ -532,20 +727,34 @@
     def __getitem__(self, val):
         if 0 <= val < self.num_terms:
             return str(val)
-        raise ValueError("internal id out of bounds (%s, expected <0..%s))" %
-                         (val, self.num_terms))
+        raise ValueError("internal id out of bounds (%s, expected <0..%s))" % (val, self.num_terms))
 
     def iteritems(self):
+        """Iterate over all keys and values.
+
+        Yields
+        ------
+        (int, str)
+            Pair of (id, token).
+
+        """
         for i in xrange(self.num_terms):
            yield i, str(i)
 
     def keys(self):
-        """
-        Override the dict.keys() function, which is used to determine the maximum
-        internal id of a corpus = the vocabulary dimensionality.
+        """Override `dict.keys()`, which is used to determine the maximum internal id of a corpus,
+        i.e. the vocabulary dimensionality.
 
-        HACK: To avoid materializing the whole `range(0, self.num_terms)`, this returns
-        the highest id = `[self.num_terms - 1]` only.
+        Returns
+        -------
+        list of int
+            The highest id, wrapped in a list: `[self.num_terms - 1]`.
+
+        Warnings
+        --------
+        To avoid materializing the whole `range(0, self.num_terms)`,
+        this returns the highest id = `[self.num_terms - 1]` only.
 
         """
         return [self.num_terms - 1]
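A small illustration (not in the patch) of the FakeDict contract described above:

from gensim.utils import FakeDict

d = FakeDict(5)                       # behaves like {0: '0', 1: '1', ..., 4: '4'}
assert d[3] == '3'
assert d.keys() == [4]                # only the highest id, by design (see Warnings)
assert list(d.iteritems()) == [(i, str(i)) for i in range(5)]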
@@ -560,13 +769,24 @@ def get(self, val, default=None):
 
 def dict_from_corpus(corpus):
-    """
-    Scan corpus for all word ids that appear in it, then construct and return a mapping
-    which maps each `wordId -> str(wordId)`.
+    """Scan corpus for all word ids that appear in it, then construct a mapping
+    which maps each `word_id` -> `str(word_id)`.
+
+    Parameters
+    ----------
+    corpus : iterable of iterable of (int, int)
+        Collection of texts in BoW format.
 
-    This function is used whenever *words* need to be displayed (as opposed to just
-    their ids) but no wordId->word mapping was provided. The resulting mapping
-    only covers words actually used in the corpus, up to the highest wordId found.
+    Returns
+    -------
+    id2word : :class:`~gensim.utils.FakeDict`
+        "Fake" mapping which maps each `word_id` -> `str(word_id)`.
+
+    Warnings
+    --------
+    This function is used whenever *words* need to be displayed (as opposed to just their ids)
+    but no `word_id` -> `word` mapping was provided. The resulting mapping only covers words actually
+    used in the corpus, up to the highest `word_id` found.
 
     """
     num_terms = 1 + get_max_id(corpus)
@@ -575,16 +795,22 @@
 
 def is_corpus(obj):
-    """
-    Check whether `obj` is a corpus. Return (is_corpus, new) 2-tuple, where
-    `new is obj` if `obj` was an iterable, or `new` yields the same sequence as
-    `obj` if it was an iterator.
+    """Check whether `obj` is a corpus.
 
-    `obj` is a corpus if it supports iteration over documents, where a document
-    is in turn anything that acts as a sequence of 2-tuples (int, float).
+    Parameters
+    ----------
+    obj : object
+        A potential corpus: anything iterable over documents, where a document is in turn
+        anything that acts as a sequence of (int, float) 2-tuples.
 
-    Note: An "empty" corpus (empty input sequence) is ambiguous, so in this case the
-    result is forcefully defined as `is_corpus=False`.
+    Returns
+    -------
+    (bool, object)
+        Pair of (is_corpus, new). `new is obj` if `obj` was an iterable; otherwise `new` yields
+        the same sequence as the (already partially consumed) iterator `obj`.
+
+    Warnings
+    --------
+    An "empty" corpus (empty input sequence) is ambiguous, so in this case
+    the result is forcefully defined as (False, `obj`).
 
     """
     try:
@@ -613,12 +839,17 @@
 
 def get_my_ip():
-    """
-    Try to obtain our external ip (from the pyro nameserver's point of view)
+    """Try to obtain our external ip (from the Pyro4 nameserver's point of view).
 
-    This tries to sidestep the issue of bogus `/etc/hosts` entries and other
-    local misconfigurations, which often mess up hostname resolution.
+    Returns
+    -------
+    str
+        IP address.
 
+    Warnings
+    --------
+    This tries to sidestep the issue of bogus `/etc/hosts` entries and other local misconfigurations,
+    which often mess up hostname resolution.
     If all else fails, fall back to simple `socket.gethostbyname()` lookup.
 
     """
@@ -644,21 +875,29 @@
 
 class RepeatCorpus(SaveLoad):
-    """
-    Used in the tutorial on distributed computing and likely not useful anywhere else.
+    """Wrap a `corpus` as another corpus of length `reps`. This is achieved by repeating documents from `corpus`
+    over and over again, until the requested length `len(result) == reps` is reached.
+    Repetition is done on the fly, efficiently, via `itertools`.
+
+    Examples
+    --------
+    >>> from gensim.utils import RepeatCorpus
+    >>>
+    >>> corpus = [[(1, 2)], []]  # 2 documents
+    >>> list(RepeatCorpus(corpus, 5))  # repeat 2.5 times to get 5 documents
+    [[(1, 2)], [], [(1, 2)], [], [(1, 2)]]
 
     """
 
     def __init__(self, corpus, reps):
        """
-        Wrap a `corpus` as another corpus of length `reps`. This is achieved by
-        repeating documents from `corpus` over and over again, until the requested
-        length `len(result)==reps` is reached. Repetition is done
-        on-the-fly=efficiently, via `itertools`.
 
-        >>> corpus = [[(1, 0.5)], []] # 2 documents
-        >>> list(RepeatCorpus(corpus, 5)) # repeat 2.5 times to get 5 documents
-        [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)]]
+        Parameters
+        ----------
+        corpus : iterable of iterable of (int, int)
+            Input corpus.
+        reps : int
+            Desired length of the resulting corpus, in documents.
 
         """
         self.corpus = corpus
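A quick sanity check (illustrative, not in the patch) for `is_corpus`, documented above:

from gensim.utils import is_corpus

ok, corpus = is_corpus([[(0, 1.0)], [(1, 2.0)]])
assert ok                    # a list of documents of (id, weight) pairs is a corpus
ok, _ = is_corpus("plain text")
assert not ok                # a string is not a corpus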
@@ -669,14 +908,28 @@ def __iter__(self):
 
 class RepeatCorpusNTimes(SaveLoad):
+    """Wrap a `corpus` and repeat it `n` times.
+
+    Examples
+    --------
+    >>> from gensim.utils import RepeatCorpusNTimes
+    >>>
+    >>> corpus = [[(1, 0.5)], []]
+    >>> list(RepeatCorpusNTimes(corpus, 3))  # repeat 3 times
+    [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)], []]
+
+    """
 
     def __init__(self, corpus, n):
         """
-        Repeat a `corpus` `n` times.
-
-        >>> corpus = [[(1, 0.5)], []]
-        >>> list(RepeatCorpusNTimes(corpus, 3)) # repeat 3 times
-        [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)], []]
+
+        Parameters
+        ----------
+        corpus : iterable of iterable of (int, int)
+            Input corpus.
+        n : int
+            Number of repetitions of the corpus.
+
         """
         self.corpus = corpus
         self.n = n
@@ -688,13 +941,22 @@
 
 class ClippedCorpus(SaveLoad):
+    """Wrap a `corpus` and return its "head": at most the first `max_docs` documents."""
+
     def __init__(self, corpus, max_docs=None):
         """
-        Return a corpus that is the "head" of input iterable `corpus`.
-
-        Any documents after `max_docs` are ignored. This effectively limits the
-        length of the returned corpus to <= `max_docs`. Set `max_docs=None` for
-        "no limit", effectively wrapping the entire input corpus.
+
+        Parameters
+        ----------
+        corpus : iterable of iterable of (int, int)
+            Input corpus.
+        max_docs : int, optional
+            Maximal number of documents in the resulting corpus.
+
+        Warnings
+        --------
+        Any documents after `max_docs` are ignored. This effectively limits the length of the returned corpus
+        to <= `max_docs`. Set `max_docs=None` for "no limit", effectively wrapping the entire input corpus.
+
         """
         self.corpus = corpus
@@ -708,19 +970,26 @@
 
 class SlicedCorpus(SaveLoad):
+    """Wrap `corpus` and return a slice of it."""
+
     def __init__(self, corpus, slice_):
         """
-        Return a corpus that is the slice of input iterable `corpus`.
-
-        Negative slicing can only be used if the corpus is indexable.
-        Otherwise, the corpus will be iterated over.
+
+        Parameters
+        ----------
+        corpus : iterable of iterable of (int, int)
+            Input corpus.
+        slice_ : slice or iterable
+            Slice to apply to `corpus`.
 
+        Notes
+        -----
+        Negative slicing can only be used if the corpus is indexable; otherwise, the corpus will be iterated over.
         Slice can also be a np.ndarray to support fancy indexing.
 
-        NOTE: calculating the size of a SlicedCorpus is expensive
-        when using a slice as the corpus has to be iterated over once.
-        Using a list or np.ndarray does not have this drawback, but
-        consumes more memory.
+        Calculating the size of a SlicedCorpus is expensive when using a slice, as the corpus has
+        to be iterated over once. Using a list or np.ndarray does not have this drawback, but consumes more memory.
+
         """
         self.corpus = corpus
         self.slice_ = slice_
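Illustrative usage of ClippedCorpus (not part of the patch); the toy corpus is made up:

from gensim.utils import ClippedCorpus

corpus = [[(0, 1.0)], [(1, 1.0)], [(2, 1.0)], [(3, 1.0)]]
clipped = ClippedCorpus(corpus, max_docs=2)
assert list(clipped) == [[(0, 1.0)], [(1, 1.0)]]   # only the "head" survives
assert len(clipped) == 2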
@@ -747,6 +1016,19 @@
 
 def safe_unichr(intval):
+    """Get the unicode character for the given integer code point.
+
+    Parameters
+    ----------
+    intval : int
+        Integer code point of the character.
+
+    Returns
+    -------
+    str
+        Unicode character for `intval`.
+
+    """
     try:
         return unichr(intval)
     except ValueError:
@@ -757,12 +1039,18 @@
 
 def decode_htmlentities(text):
-    """
-    Decode HTML entities in text, coded as hex, decimal or named.
-
-    Adapted
-    from http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py
-
+    """Decode HTML entities in text, coded as hex, decimal or named.
+    Adapted from [3]_.
+
+    Parameters
+    ----------
+    text : str
+        Input HTML text.
+
+    Returns
+    -------
+    str
+        Input text with HTML entities decoded.
+
+    Examples
+    --------
+    >>> from gensim.utils import decode_htmlentities
+    >>>
     >>> u = u'E tu vivrai nel terrore - L&#39;aldil&agrave; (1981)'
     >>> print(decode_htmlentities(u).encode('UTF-8'))
     E tu vivrai nel terrore - L'aldilà (1981)
 
     >>> print(decode_htmlentities("foo &lt; bar"))
     foo < bar
 
+    References
+    ----------
+    .. [3] http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py
+
     """
     def substitute_entity(match):
         try:
@@ -798,10 +1090,25 @@
 
 def chunkize_serial(iterable, chunksize, as_numpy=False):
-    """
-    Return elements from the iterable in `chunksize`-ed lists. The last returned
-    element may be smaller (if length of collection is not divisible by `chunksize`).
-
+    """Yield elements from `iterable` in `chunksize`-ed lists.
+    The last returned chunk may be smaller (if the length of the collection is not divisible by `chunksize`).
+
+    Parameters
+    ----------
+    iterable : iterable of object
+        Any iterable.
+    chunksize : int
+        Size of each chunk in the result.
+    as_numpy : bool, optional
+        If True, yield `np.ndarray`s instead of lists.
+
+    Yields
+    ------
+    list of object OR np.ndarray
+        Chunks drawn from `iterable`.
+
+    Examples
+    --------
     >>> print(list(grouper(range(10), 3)))
     [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
 
@@ -860,29 +1167,58 @@ def run(self):
     warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
 
     def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):
+        """Split `corpus` into fixed-size chunks, using :func:`~gensim.utils.chunkize_serial`.
+
+        Parameters
+        ----------
+        corpus : iterable of object
+            Any iterable object.
+        chunksize : int
+            Size of each chunk in the result.
+        maxsize : int, optional
+            Ignored; kept for interface compatibility with the multiprocessing version
+            used on non-Windows platforms.
+        as_numpy : bool, optional
+            If True, yield `np.ndarray`s instead of lists.
+
+        Yields
+        ------
+        list of object OR np.ndarray
+            Chunks drawn from `corpus`.
+
+        """
         for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy):
             yield chunk
- If `maxsize==0`, don't fool around with parallelism and simply yield the chunksize - via `chunkize_serial()` (no I/O optimizations). + If `maxsize == 0`, don't fool around with parallelism and simply yield the chunksize + via :func:`~gensim.utils.chunkize_serial` (no I/O optimizations). - >>> for chunk in chunkize(range(10), 4): print(chunk) - [0, 1, 2, 3] - [4, 5, 6, 7] - [8, 9] + Yields + ------ + list of object OR np.ndarray + Groups based on `iterable` """ assert chunksize > 0 @@ -903,6 +1239,21 @@ def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): def smart_extension(fname, ext): + """Generate filename with `ext`. + + Parameters + ---------- + fname : str + Path to file. + ext : str + File extension. + + Returns + ------- + str + New path to file with `ext`. + + """ fname, oext = os.path.splitext(fname) if oext.endswith('.bz2'): fname = fname + oext[:-4] + ext + '.bz2' @@ -917,8 +1268,14 @@ def smart_extension(fname, ext): def pickle(obj, fname, protocol=2): """Pickle object `obj` to file `fname`. - `protocol` defaults to 2 so pickled objects are compatible across - Python 2.x and 3.x. + Parameters + ---------- + obj : object + Any python object. + fname : str + Path to pickle file. + protocol : int, optional + Pickle protocol number, default is 2 to support compatible across python 2.x and 3.x. """ with smart_open(fname, 'wb') as fout: # 'b' for binary, needed on Windows @@ -926,7 +1283,19 @@ def pickle(obj, fname, protocol=2): def unpickle(fname): - """Load pickled object from `fname`""" + """Load object from `fname`. + + Parameters + ---------- + fname : str + Path to pickle file. + + Returns + ------- + object + Python object loaded from `fname`. + + """ with smart_open(fname, 'rb') as f: # Because of loading from S3 load can't be used (missing readline in smart_open) if sys.version_info > (3, 0): @@ -936,25 +1305,109 @@ def unpickle(fname): def revdict(d): - """ - Reverse a dictionary mapping. + """Reverse a dictionary mapping, i.e. `{1: 2, 3: 4}` -> `{2: 1, 4: 3}`. + + Parameters + ---------- + d : dict + Input dictionary. + + Returns + ------- + dict + Reversed dictionary mapping. + + Notes + ----- + When two keys map to the same value, only one of them will be kept in the result (which one is kept is arbitrary). - When two keys map to the same value, only one of them will be kept in the - result (which one is kept is arbitrary). + Examples + -------- + >>> from gensim.utils import revdict + >>> d = {1: 2, 3: 4} + >>> revdict(d) + {2: 1, 4: 3} """ return {v: k for (k, v) in iteritems(dict(d))} +def deprecated(reason): + """Decorator which can be used to mark functions as deprecated. + + Parameters + ---------- + reason : str + Reason of deprecation. + + Returns + ------- + function + Decorated function + + Notes + ----- + It will result in a warning being emitted when the function is used, base code from [4]_. + + References + ---------- + .. [4] https://stackoverflow.com/a/40301488/8001386 + + """ + if isinstance(reason, string_types): + def decorator(func): + fmt = "Call to deprecated `{name}` ({reason})." + + @wraps(func) + def new_func1(*args, **kwargs): + warnings.warn( + fmt.format(name=func.__name__, reason=reason), + category=DeprecationWarning, + stacklevel=2 + ) + return func(*args, **kwargs) + + return new_func1 + return decorator + + elif inspect.isclass(reason) or inspect.isfunction(reason): + func = reason + fmt = "Call to deprecated `{name}`." 
+def deprecated(reason):
+    """Decorator which can be used to mark functions as deprecated.
+
+    Parameters
+    ----------
+    reason : str
+        Reason for deprecation.
+
+    Returns
+    -------
+    function
+        Decorated function.
+
+    Notes
+    -----
+    A warning is emitted when the decorated function is used; base code from [4]_.
+
+    References
+    ----------
+    .. [4] https://stackoverflow.com/a/40301488/8001386
+
+    """
+    if isinstance(reason, string_types):
+        def decorator(func):
+            fmt = "Call to deprecated `{name}` ({reason})."
+
+            @wraps(func)
+            def new_func1(*args, **kwargs):
+                warnings.warn(
+                    fmt.format(name=func.__name__, reason=reason),
+                    category=DeprecationWarning,
+                    stacklevel=2
+                )
+                return func(*args, **kwargs)
+
+            return new_func1
+        return decorator
+
+    elif inspect.isclass(reason) or inspect.isfunction(reason):
+        func = reason
+        fmt = "Call to deprecated `{name}`."
+
+        @wraps(func)
+        def new_func2(*args, **kwargs):
+            warnings.warn(
+                fmt.format(name=func.__name__),
+                category=DeprecationWarning,
+                stacklevel=2
+            )
+            return func(*args, **kwargs)
+        return new_func2
+
+    else:
+        raise TypeError(repr(type(reason)))
+
+
+@deprecated("Function will be removed in 4.0.0")
 def toptexts(query, texts, index, n=10):
     """
     Debug fnc to help inspect the top `n` most similar documents (according to a
     similarity index `index`), to see if they are actually related to the query.
 
-    `texts` is any object that can return something insightful for each document
-    via `texts[docid]`, such as its fulltext or snippet.
-
-    Return a list of 3-tuples (docid, doc's similarity to the query, texts[docid]).
+    Parameters
+    ----------
+    query : list of (int, number)
+        Query document, given as a vector in the format accepted by `index`.
+    texts : object
+        Any object that can return something insightful for each document via `texts[docid]`,
+        such as its fulltext or snippet.
+    index : object
+        A similarity index instance from :mod:`gensim.similarities.docsim`.
+    n : int, optional
+        Number of top documents to inspect.
+
+    Returns
+    -------
+    list of (int, float, object)
+        A list of 3-tuples (docid, doc's similarity to the query, texts[docid]).
 
     """
     sims = index[query]  # perform a similarity query against the corpus
@@ -964,18 +1417,31 @@
 
 def randfname(prefix='gensim'):
+    """Generate a path with a random filename, in the temporary folder.
+
+    Parameters
+    ----------
+    prefix : str, optional
+        Prefix of the filename.
+
+    Returns
+    -------
+    str
+        Full path with a random filename, in the temporary folder.
+
+    """
     randpart = hex(random.randint(0, 0xffffff))[2:]
     return os.path.join(tempfile.gettempdir(), prefix + randpart)
 
 
+@deprecated("Function will be removed in 4.0.0")
 def upload_chunked(server, docs, chunksize=1000, preprocess=None):
-    """
-    Memory-friendly upload of documents to a SimServer (or Pyro SimServer proxy).
-
+    """Memory-friendly upload of documents to a SimServer (or Pyro SimServer proxy).
+
+    Notes
+    -----
     Use this function to train or index large collections -- avoid sending the
     entire corpus over the wire as a single Pyro in-memory object. The documents
     will be sent in smaller chunks, of `chunksize` documents each.
+
     """
     start = 0
     for chunk in grouper(docs, chunksize):
@@ -993,8 +1459,29 @@
 
 def getNS(host=None, port=None, broadcast=True, hmac_key=None):
-    """
-    Return a Pyro name server proxy.
+    """Get a Pyro4 name server proxy.
+
+    Parameters
+    ----------
+    host : str, optional
+        Hostname of the name server.
+    port : int, optional
+        Port of the name server.
+    broadcast : bool, optional
+        If True, use the broadcast mechanism (i.e. reach all Pyro nodes in the local network).
+    hmac_key : str, optional
+        Private key.
+
+    Raises
+    ------
+    RuntimeError
+        When the Pyro name server is not found.
+
+    Returns
+    -------
+    :class:`Pyro4.core.Proxy`
+        Proxy from Pyro4.
+
     """
     import Pyro4
     try:
@@ -1004,8 +1491,7 @@
 
 def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf=None):
-    """
-    Register object with name server (starting the name server if not running
+    """Register object with name server (starting the name server if not running
     yet) and block until the daemon is terminated. The object is registered under
     `name`, or `name`+ some random suffix if `random_suffix` is set.
 
     """
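Not part of the patch — how the `deprecated` decorator defined above is meant to be used (string-reason form):

from gensim.utils import deprecated

@deprecated("use shiny_new_f instead")
def old_f():
    return 42

old_f()   # still returns 42, but emits a DeprecationWarning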
@@ -1027,8 +1513,17 @@
 
 def has_pattern():
-    """
-    Function which returns a flag indicating whether pattern is installed or not
+    """Check whether the `pattern` package [5]_ is installed.
+
+    Returns
+    -------
+    bool
+        True if `pattern` is installed, False otherwise.
+
+    References
+    ----------
+    .. [5] https://github.com/clips/pattern
+
     """
     try:
         from pattern.en import parse  # noqa:F401
@@ -1039,21 +1534,42 @@
 
 def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False,
               stopwords=frozenset(), min_length=2, max_length=15):
-    """
-    This function is only available when the optional 'pattern' package is installed.
-
-    Use the English lemmatizer from `pattern` to extract UTF8-encoded tokens in
+    """Use the English lemmatizer from `pattern` [5]_ to extract UTF8-encoded tokens in
     their base form=lemma, e.g. "are, is, being" -> "be" etc.
     This is a smarter version of stemming, taking word context into account.
 
-    Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
+    Parameters
+    ----------
+    content : str
+        Input string.
+    allowed_tags : :class:`_sre.SRE_Pattern`, optional
+        Compiled regexp selecting the POS tags to keep.
+        Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
+    light : bool, optional
+        DEPRECATED FLAG, no longer supported by `pattern`.
+    stopwords : frozenset, optional
+        Set of words to be removed from the output.
+    min_length : int, optional
+        Minimal token length in the output (inclusive).
+    max_length : int, optional
+        Maximal token length in the output (inclusive).
+
+    Returns
+    -------
+    list of str
+        List of tokens in `lemma/POS-tag` format.
+
+    Warnings
+    --------
+    This function is only available when the optional 'pattern' package is installed.
 
+    Examples
+    --------
+    >>> from gensim.utils import lemmatize
     >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
     ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
-
     >>> lemmatize('The study ranks high.')
     ['study/NN', 'rank/VB', 'high/JJ']
-
     >>> lemmatize('The ranks study hard.')
     ['rank/NN', 'study/VB', 'hard/RB']
 
     """
@@ -1086,10 +1602,21 @@
 
 def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0):
-    """
-    Create a random gensim sparse vector. Each coordinate is nonzero with
-    probability `prob_nnz`, each non-zero coordinate value is drawn from
-    a Poisson distribution with parameter lambda equal to `lam`.
+    """Create a random gensim BoW vector.
+
+    Parameters
+    ----------
+    dim : int, optional
+        Dimension of the vector.
+    prob_nnz : float, optional
+        Probability that each coordinate will be nonzero.
+    lam : float, optional
+        Parameter of the Poisson distribution that non-zero coordinate values are drawn from.
+
+    Returns
+    -------
+    list of (int, float)
+        Vector in BoW format.
 
     """
     nnz = np.random.uniform(size=(dim,))
@@ -1097,19 +1624,46 @@
 
 def mock_data(n_items=1000, dim=1000, prob_nnz=0.5, lam=1.0):
-    """
-    Create a random gensim-style corpus, as a list of lists of (int, float) tuples,
-    to be used as a mock corpus.
+    """Create a random gensim-style corpus (BoW), using :func:`~gensim.utils.mock_data_row`.
+
+    Parameters
+    ----------
+    n_items : int
+        Number of documents in the corpus.
+    dim : int
+        Dimension of each vector, passed to :func:`~gensim.utils.mock_data_row`.
+    prob_nnz : float, optional
+        Probability that each coordinate will be nonzero, passed to :func:`~gensim.utils.mock_data_row`.
+    lam : float, optional
+        Parameter of the Poisson distribution, passed to :func:`~gensim.utils.mock_data_row`.
+
+    Returns
+    -------
+    list of list of (int, float)
+        Gensim-style corpus.
+
""" return [mock_data_row(dim=dim, prob_nnz=prob_nnz, lam=lam) for _ in xrange(n_items)] def prune_vocab(vocab, min_reduce, trim_rule=None): - """ - Remove all entries from the `vocab` dictionary with count smaller than `min_reduce`. + """Remove all entries from the `vocab` dictionary with count smaller than `min_reduce`. Modifies `vocab` in place, returns the sum of all counts that were pruned. + Parameters + ---------- + vocab : dict + Input dictionary. + min_reduce : int + Frequency threshold for tokens in `vocab`. + trim_rule : function, optional + Function for trimming entities from vocab, default behaviour is `vocab[w] <= min_reduce`. + + Returns + ------- + result : int + Sum of all counts that were pruned. """ result = 0 @@ -1126,7 +1680,19 @@ def prune_vocab(vocab, min_reduce, trim_rule=None): def qsize(queue): - """Return the (approximate) queue size where available; -1 where not (OS X).""" + """Get the (approximate) queue size where available. + + Parameters + ---------- + queue : :class:`queue.Queue` + Input queue. + + Returns + ------- + int + Queue size, -1 if `qsize` method isn't implemented (OS X). + + """ try: return queue.qsize() except NotImplementedError: @@ -1140,6 +1706,25 @@ def qsize(queue): def keep_vocab_item(word, count, min_count, trim_rule=None): + """Check that should we keep `word` in vocab or remove. + + Parameters + ---------- + word : str + Input word. + count : int + Number of times that word contains in corpus. + min_count : int + Frequency threshold for `word`. + trim_rule : function, optional + Function for trimming entities from vocab, default behaviour is `vocab[w] <= min_reduce`. + + Returns + ------- + bool + True if `word` should stay, False otherwise. + + """ default_res = count >= min_count if trim_rule is None: @@ -1155,13 +1740,27 @@ def keep_vocab_item(word, count, min_count, trim_rule=None): def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs): - """ - Run command with arguments and return its output as a byte string. - Backported from Python 2.7 as it's implemented as pure python on stdlib. + r"""Run command with arguments and return its output as a byte string. + Backported from Python 2.7 as it's implemented as pure python on stdlib + small modification. + Widely used for :mod:`gensim.models.wrappers`. + + Very similar with [6]_ + + Examples + -------- + >>> from gensim.utils import check_output + >>> check_output(args=['echo', '1']) + '1\n' + + Raises + ------ + KeyboardInterrupt + If Ctrl+C pressed. + + References + ---------- + .. [6] https://docs.python.org/2/library/subprocess.html#subprocess.check_output - >>> check_output(args=['/usr/bin/python', '--version']) - Python 2.6.2 - Added extra KeyboardInterrupt handling """ try: logger.debug("COMMAND: %s %s", popenargs, kwargs) @@ -1182,19 +1781,47 @@ def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs): def sample_dict(d, n=10, use_random=True): - """ - Pick `n` items from dictionary `d` and return them as a list. - The items are picked randomly if `use_random` is True, otherwise picked - according to natural dict iteration. + """Pick `n` items from dictionary `d`. + + Parameters + ---------- + d : dict + Input dictionary. + n : int, optional + Number of items that will be picked. + use_random : bool, optional + If True - pick items randomly, otherwise - according to natural dict iteration. + + Returns + ------- + list of (object, object) + Picked items from dictionary, represented as list. 
+ """ selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(iterkeys(d), n) return [(key, d[key]) for key in selected_keys] def strided_windows(ndarray, window_size): - """ - Produce a numpy.ndarray of windows, as from a sliding window. - + """Produce a numpy.ndarray of windows, as from a sliding window. + + Parameters + ---------- + ndarray : numpy.ndarray + Input array + window_size : int + Sliding window size. + + Returns + ------- + numpy.ndarray + Subsequences produced by sliding a window of the given size over the `ndarray`. + Since this uses striding, the individual arrays are views rather than copies of `ndarray`. + Changes to one view modifies the others and the original. + + Examples + -------- + >>> from gensim.utils import strided_windows >>> strided_windows(np.arange(5), 2) array([[0, 1], [1, 2], @@ -1208,14 +1835,6 @@ def strided_windows(ndarray, window_size): [4, 5, 6, 7, 8], [5, 6, 7, 8, 9]]) - Args: - ndarray: either a numpy.ndarray or something that can be converted into one. - window_size: sliding window size. - - Returns: - numpy.ndarray of the subsequences produced by sliding a window of the given size over - the `ndarray`. Since this uses striding, the individual arrays are views rather than - copies of `ndarray`. Changes to one view modifies the others and the original. """ ndarray = np.asarray(ndarray) if window_size == ndarray.shape[0]: @@ -1231,15 +1850,22 @@ def strided_windows(ndarray, window_size): def iter_windows(texts, window_size, copy=False, ignore_below_size=True, include_doc_num=False): """Produce a generator over the given texts using a sliding window of `window_size`. - The windows produced are views of some subsequence of a text. To use deep copies - instead, pass `copy=True`. - - Args: - texts: List of string sentences. - window_size: Size of sliding window. - copy: False to use views of the texts (default) or True to produce deep copies. - ignore_below_size: ignore documents that are not at least `window_size` in length (default behavior). - If False, the documents below `window_size` will be yielded as the full document. + The windows produced are views of some subsequence of a text. + To use deep copies instead, pass `copy=True`. + + + Parameters + ---------- + texts : list of str + List of string sentences. + window_size : int + Size of sliding window. + copy : bool, optional + If True - produce deep copies. + ignore_below_size : bool, optional + If True - ignore documents that are not at least `window_size` in length. + include_doc_num : bool, optional + If True - will be yield doc_num too. """ for doc_num, document in enumerate(texts): @@ -1263,60 +1889,38 @@ def _iter_windows(document, window_size, copy=False, ignore_below_size=True): def flatten(nested_list): """Recursively flatten out a nested list. - Args: - nested_list (list): possibly nested list. + Parameters + ---------- + nested_list : list + Possibly nested list. + + Returns + ------- + list + Flattened version of input, where any list elements have been unpacked into the top-level list + in a recursive fashion. - Returns: - list: flattened version of input, where any list elements have been unpacked into the - top-level list in a recursive fashion. """ return list(lazy_flatten(nested_list)) def lazy_flatten(nested_list): - """Lazy version of `flatten`.""" + """Lazy version of :func:`~gensim.utils.flatten`. + + Parameters + ---------- + nested_list : list + Possibly nested list. 
+ + Yields + ------ + object + Element of list + + """ for el in nested_list: if isinstance(el, collections.Iterable) and not isinstance(el, string_types): for sub in flatten(el): yield sub else: yield el - - -def deprecated(reason): - """Decorator which can be used to mark functions as deprecated. It will result in a warning being emitted - when the function is used, base code from https://stackoverflow.com/a/40301488/8001386. - - """ - if isinstance(reason, string_types): - def decorator(func): - fmt = "Call to deprecated `{name}` ({reason})." - - @wraps(func) - def new_func1(*args, **kwargs): - warnings.warn( - fmt.format(name=func.__name__, reason=reason), - category=DeprecationWarning, - stacklevel=2 - ) - return func(*args, **kwargs) - - return new_func1 - return decorator - - elif inspect.isclass(reason) or inspect.isfunction(reason): - func = reason - fmt = "Call to deprecated `{name}`." - - @wraps(func) - def new_func2(*args, **kwargs): - warnings.warn( - fmt.format(name=func.__name__), - category=DeprecationWarning, - stacklevel=2 - ) - return func(*args, **kwargs) - return new_func2 - - else: - raise TypeError(repr(type(reason))) diff --git a/setup.py b/setup.py index a5d8fa6bdc..2c16ded0f7 100644 --- a/setup.py +++ b/setup.py @@ -307,7 +307,7 @@ def finalize_options(self): 'distributed': distributed_env, 'test-win': win_testenv, 'test': linux_testenv, - 'docs': linux_testenv + distributed_env + ['sphinx', 'sphinxcontrib-napoleon', 'plotly'], + 'docs': linux_testenv + distributed_env + ['sphinx', 'sphinxcontrib-napoleon', 'plotly', 'pattern'], }, include_package_data=True,