From 4cee8faf148e5cec6e8b5678e7c4a72739b17841 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Sat, 10 Feb 2018 19:13:18 +0100 Subject: [PATCH 01/45] fixed docstring for `sklearn_api.lsimodel` --- gensim/sklearn_api/lsimodel.py | 79 ++++++++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 14 deletions(-) diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py index 7034df7da6..ddbefb591c 100644 --- a/gensim/sklearn_api/lsimodel.py +++ b/gensim/sklearn_api/lsimodel.py @@ -20,14 +20,36 @@ class LsiTransformer(TransformerMixin, BaseEstimator): - """ - Base LSI module + """Base LSI module. + + Scikit learn interface for `gensim.models.lsimodel` for easy use of gensim with scikit-learn. + Follows scikit-learn API conventions. + """ def __init__(self, num_topics=200, id2word=None, chunksize=20000, decay=1.0, onepass=True, power_iters=2, extra_samples=100): - """ - Sklearn wrapper for LSI model. See gensim.model.LsiModel for parameter details. + """Sklearn wrapper for LSI model. + + Parameters + ---------- + num_topics : int, optional + Number of requested factors (latent dimensions) + id2word : dict of {int: str}, optional + ID to word mapping, optional. + chunksize : int, optional + Number of documents to be used in each training chunk. + decay : float, optional + Weight of existing observations relatively to new ones. + onepass : bool, optional + Whether the one-pass algorithm should be used for training. + Pass `False` to force a multi-pass stochastic algorithm. + power_iters: int, optional + Number of power iteration steps to be used. + Increasing the number of power iterations improves accuracy, but lowers performance + extra_samples : int, optional + Extra samples to be used besides the rank `k`. Can improve accuracy. + """ self.gensim_model = None self.num_topics = num_topics @@ -42,6 +64,17 @@ def fit(self, X, y=None): """ Fit the model according to the given training data. Calls gensim.models.LsiModel + + Parameters + ---------- + X : iterable of iterable of (int, float) + Stream of document vectors or sparse matrix of shape: [num_terms, num_documents]. + + Returns + ------- + LsiTransformer + The trained model + """ if sparse.issparse(X): corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False) @@ -55,14 +88,18 @@ def fit(self, X, y=None): return self def transform(self, docs): - """ - Takes a list of documents as input ('docs'). - Returns a matrix of topic distribution for the given document bow, where a_ij - indicates (topic_i, topic_probability_j). - The input `docs` should be in BOW format and can be a list of documents like - [[(4, 1), (7, 1)], - [(9, 1), (13, 1)], [(2, 1), (6, 1)]] - or a single document like : [(4, 1), (7, 1)] + """Computes the topic distribution matrix + + Parameters + ---------- + docs : iterable of iterable of (int, float) + Stream of document vectors or sparse matrix of shape: [`num_terms`, num_documents]. + + Returns + ------- + 2D ndarray of float + Topic distribution matrix of shape [num_docs, num_topics] + """ if self.gensim_model is None: raise NotFittedError( @@ -78,8 +115,22 @@ def transform(self, docs): return np.reshape(np.array(distribution), (len(docs), self.num_topics)) def partial_fit(self, X): - """ - Train model over X. + """Train model over a potentially incomplete set of documents. + + This method can be used in two ways: + 1. On an unfitted model in which case the model is initialized and trained on `X`. + 2. On an already fitted model in which case the model is **further** trained on `X`. 
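+
+        A minimal sketch of the second mode (`corpus_part1`, `corpus_part2` and `id2word`
+        are hypothetical names, shown here only for illustration):
+
+            >>> transformer = LsiTransformer(num_topics=2, id2word=id2word)
+            >>> transformer = transformer.partial_fit(corpus_part1)  # initializes and trains the model
+            >>> transformer = transformer.partial_fit(corpus_part2)  # trains the same model further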
+
+        Parameters
+        ----------
+        X : iterable of iterable of (int, float)
+            Stream of document vectors or sparse matrix of shape: [num_terms, num_documents].
+
+        Returns
+        -------
+        LsiTransformer
+            The trained model.
+
         """
         if sparse.issparse(X):
             X = matutils.Sparse2Corpus(sparse=X, documents_columns=False)

From ab0303cb5e391df2bff3119cf3591b3b6202f20e Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Sat, 10 Feb 2018 19:40:47 +0100
Subject: [PATCH 02/45] removed duplicated comment

---
 gensim/sklearn_api/lsimodel.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py
index ddbefb591c..092e5db54c 100644
--- a/gensim/sklearn_api/lsimodel.py
+++ b/gensim/sklearn_api/lsimodel.py
@@ -5,11 +5,6 @@
 # Copyright (C) 2017 Radim Rehurek
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""
-Scikit learn interface for gensim for easy use of gensim with scikit-learn
-Follows scikit-learn API conventions
-"""
-
 import numpy as np
 from scipy import sparse
 from sklearn.base import TransformerMixin, BaseEstimator
@@ -97,7 +92,7 @@ def transform(self, docs):
 
         Returns
         -------
-        2D ndarray of float
+        2D np.ndarray of float
             Topic distribution matrix of shape [num_docs, num_topics]
 
From 4dc001f865231ed36337a1b8ab1d2909e7a03aea Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Sat, 10 Feb 2018 19:41:08 +0100
Subject: [PATCH 03/45] Fixed docstring for `sklearn_api.text2bow`

---
 gensim/sklearn_api/text2bow.py | 68 +++++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 13 deletions(-)

diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py
index e71a954c32..2bfd14fa06 100644
--- a/gensim/sklearn_api/text2bow.py
+++ b/gensim/sklearn_api/text2bow.py
@@ -4,11 +4,6 @@
 # Copyright (C) 2011 Radim Rehurek
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""
-Scikit learn interface for gensim for easy use of gensim with scikit-learn
-Follows scikit-learn API conventions
-"""
-
 from six import string_types
 from sklearn.base import TransformerMixin, BaseEstimator
 from sklearn.exceptions import NotFittedError
@@ -18,29 +13,59 @@
 
 
 class Text2BowTransformer(TransformerMixin, BaseEstimator):
-    """
-    Base Text2Bow module
+    """Base Text2Bow module
+
+    Scikit learn interface for `gensim.models.lsimodel` for easy use of gensim with scikit-learn.
+    Follows scikit-learn API conventions.
+
     """
 
    def __init__(self, prune_at=2000000, tokenizer=tokenize):
-        """
-        Sklearn wrapper for Text2Bow model.
+        """Sklearn wrapper for Text2Bow model.
+
+        Parameters
+        ----------
+        prune_at : int, optional
+            Total number of unique words. The dictionary will keep no more than `prune_at` words.
+        tokenizer : callable (str -> list of str), optional
+            A callable to split a document into a list of terms.
+
+
+        """
        self.gensim_model = None
        self.prune_at = prune_at
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
-        """
-        Fit the model according to the given training data.
+        """Fit the model according to the given training data.
+
+        Parameters
+        ----------
+        X : iterable of str
+            A collection of documents used for training the model.
+
+        Returns
+        -------
+        Text2BowTransformer
+            The trained model.
+
        """
        tokenized_docs = [list(self.tokenizer(x)) for x in X]
        self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at)
        return self

    def transform(self, docs):
-        """
-        Return the BOW format for the input documents.
+        """Return the BOW format for the input documents.
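+
+        Each document is split with the `tokenizer` given at construction time and then mapped
+        through the fitted dictionary's :meth:`~gensim.corpora.dictionary.Dictionary.doc2bow`.
+        A hypothetical sketch (`fitted_model` stands for a transformer that was already fitted):
+
+            >>> bows = fitted_model.transform([u'a brand new document'])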
+ + Parameters + ---------- + docs : iterable of str + A collection of documents to be transformed. + + Returns + ------- + iterable of list (int, int) 2-tuples. + The BOW representation of each document. + """ if self.gensim_model is None: raise NotFittedError( @@ -54,6 +79,23 @@ def transform(self, docs): return [self.gensim_model.doc2bow(doc) for doc in tokenized_docs] def partial_fit(self, X): + """Train model over a potentially incomplete set of documents. + + This method can be used in two ways: + 1. On an unfitted model in which case the dictionary is initialized and trained on `X`. + 2. On an already fitted model in which case the dictionary is **expanded** by `X`. + + Parameters + ---------- + X : iterable of str + A collection of documents used to train the model. + + Returns + ------- + Text2BowTransformer + The trained model. + + """ if self.gensim_model is None: self.gensim_model = Dictionary(prune_at=self.prune_at) From 69faf410850a62ab2ce71cd1d6b6d008e4e4391d Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Sat, 10 Feb 2018 20:06:07 +0100 Subject: [PATCH 04/45] Fixed docstrings for `sklearn_api.phrases` --- gensim/sklearn_api/phrases.py | 105 +++++++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 8 deletions(-) diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index c64b809bb7..5ea3187779 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -17,14 +17,62 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator): - """ - Base Phrases module + """Base Phrases module + + Scikit learn interface for `gensim.models.phrases` for easy use of gensim with scikit-learn. + Follows scikit-learn API conventions. + """ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, delimiter=b'_', progress_per=10000, scoring='default'): - """ - Sklearn wrapper for Phrases model. + """Sklearn wrapper for Phrases model. + + Parameters + ---------- + min_count : int + Terms with a count lower than this will be ignored + threshold : float + Only phrases scoring above this will be accepted, see `scoring` below. + max_vocab_size : int + Maximum size of the vocabulary. + Used to control pruning of less common words, to keep memory under control. + The default of 40M needs about 3.6GB of RAM; + delimiter : str + Character used to join collocation tokens. Should be a byte string (e.g. b'_'). + progress_per : int + Training will report to the logger every that many phrases are learned. + scoring : str or callable + Specifies how potential phrases are scored for comparison to the `threshold` + setting. `scoring` can be set with either a string that refers to a built-in scoring function, + or with a function with the expected parameter names. Two built-in scoring functions are available + by setting `scoring` to a string: + + 'default': from [1]_. + 'npmi': normalized pointwise mutual information, from [2]_. + + 'npmi' is more robust when dealing with common words that form part of common bigrams, and + ranges from -1 to 1, but is slower to calculate than the default. + + To use a custom scoring function, create a function with the following parameters and set the `scoring` + parameter to the custom function. You must use all the parameters in your function call, even if the + function does not require all the parameters. 
+
+            worda_count: number of occurrences in `sentences` of the first token in the phrase being scored
+            wordb_count: number of occurrences in `sentences` of the second token in the phrase being scored
+            bigram_count: number of occurrences in `sentences` of the phrase being scored
+            len_vocab: the number of unique tokens in `sentences`
+            min_count: the `min_count` setting of the Phrases class
+            corpus_word_count: the total number of (non-unique) tokens in `sentences`
+
+            A scoring function without any of these parameters (even if the parameters are not used) will
+            raise a ValueError on initialization of the Phrases class. The scoring function must be picklable.
+
+        References
+        ----------
+        .. [1] "Efficient Estimation of Word Representations in Vector Space" by Mikolov et al.
+        .. [2] "Normalized (Pointwise) Mutual Information in Collocation Extraction" by Gerlof Bouma.
+
        """
        self.gensim_model = None
        self.min_count = min_count
@@ -35,8 +83,18 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
        self.scoring = scoring

    def fit(self, X, y=None):
-        """
-        Fit the model according to the given training data.
+        """Fit the model according to the given training data.
+
+        Parameters
+        ----------
+        X : iterable of list of str
+            Sequence of sentences to be used for training the model.
+
+        Returns
+        -------
+        PhrasesTransformer
+            The trained model.
+
        """
        self.gensim_model = models.Phrases(
            sentences=X, min_count=self.min_count, threshold=self.threshold,
@@ -46,9 +104,22 @@
        return self

    def transform(self, docs):
+        """Transform the input documents into phrase tokens.
+
+        The tokens of a detected phrase are joined with u'_'.
+
+        Parameters
+        ----------
+        docs : iterable of list of str
+            Sequence of sentences to be transformed.
+
+        Returns
+        -------
+        iterable of list of str
+            Phrase representation of each of the input sentences.
+
        """
-        Return the input documents to return phrase tokens.
-        """
+
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
@@ -60,6 +131,24 @@
        return [self.gensim_model[doc] for doc in docs]

    def partial_fit(self, X):
+        """Train model over a potentially incomplete set of sentences.
+
+        This method can be used in two ways:
+        1. On an unfitted model, in which case the model is initialized and trained on `X`.
+        2. On an already fitted model, in which case the sentences in `X` are **added** to its vocabulary.
+
+        Parameters
+        ----------
+        X : iterable of list of str
+            Sequence of sentences to be used for training the model.
+
+        Returns
+        -------
+        PhrasesTransformer
+            The trained model.
+
+        """
+
        if self.gensim_model is None:
            self.gensim_model = models.Phrases(
                sentences=X, min_count=self.min_count, threshold=self.threshold,

From 5052dfba0382d3257143ba33a6f44f51b1378bfe Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Mon, 12 Feb 2018 13:02:26 +0100
Subject: [PATCH 05/45] Applied code review corrections in sklearn wrappers
 for: - `lsimodel` - `text2bow` - `phrases`

* Added `doc` in every file
* Provided sphinx style links to parameter types referencing gensim classes.
* Propagated arguments are still duplicated for readability - maybe remove?
--- gensim/sklearn_api/lsimodel.py | 28 ++++++++++++++++++++++------ gensim/sklearn_api/phrases.py | 16 +++++++++------- gensim/sklearn_api/text2bow.py | 17 ++++++++++++----- 3 files changed, 43 insertions(+), 18 deletions(-) diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py index 092e5db54c..ae2c49e291 100644 --- a/gensim/sklearn_api/lsimodel.py +++ b/gensim/sklearn_api/lsimodel.py @@ -5,6 +5,21 @@ # Copyright (C) 2017 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""Scikit learn interface for `gensim.models.lsimodel`. + +Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. + +Examples +-------- +Integrate with sklearn Pipelines: + + >>> model = LsiTransformer(num_topics=15, id2word=id2word) + >>> clf = linear_model.LogisticRegression(penalty='l2', C=0.1) + >>> pipe = Pipeline([('features', model,), ('classifier', clf)]) + >>> pipe.fit(corpus, data.target) + +""" + import numpy as np from scipy import sparse from sklearn.base import TransformerMixin, BaseEstimator @@ -17,8 +32,9 @@ class LsiTransformer(TransformerMixin, BaseEstimator): """Base LSI module. - Scikit learn interface for `gensim.models.lsimodel` for easy use of gensim with scikit-learn. - Follows scikit-learn API conventions. + Wraps :class:`~gensim.model.lsimodel.LsiModel`. + For more information on the inner working please take a look at + the original class. """ @@ -58,7 +74,7 @@ def __init__(self, num_topics=200, id2word=None, chunksize=20000, def fit(self, X, y=None): """ Fit the model according to the given training data. - Calls gensim.models.LsiModel + Calls :meth:`~gensim.models.lsimodel.LsiModel` Parameters ---------- @@ -67,8 +83,8 @@ def fit(self, X, y=None): Returns ------- - LsiTransformer - The trained model + :class:`~gensim.sklearn_api.lsimodel.LsiTransformer` + The trained model. """ if sparse.issparse(X): @@ -123,7 +139,7 @@ def partial_fit(self, X): Returns ------- - LsiTransformer + :class:`~gensim.sklearn_api.lsimodel.LsiTransformer` The trained model. """ diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index 5ea3187779..371c24be32 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -4,9 +4,10 @@ # Copyright (C) 2011 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Scikit learn interface for gensim for easy use of gensim with scikit-learn -Follows scikit-learn API conventions +"""Scikit learn interface for `gensim.models.phrases`. + +Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. + """ from six import string_types @@ -19,8 +20,9 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator): """Base Phrases module - Scikit learn interface for `gensim.models.phrases` for easy use of gensim with scikit-learn. - Follows scikit-learn API conventions. + Wraps :class:`~gensim.models.phrases.Phrases`. + For more information on the inner workings please take a look at + the original class. """ @@ -92,7 +94,7 @@ def fit(self, X, y=None): Returns ------- - PhrasesTransformer + :class:`~gensim.sklearn_api.phrases.PhrasesTransformer` The trained model. """ @@ -144,7 +146,7 @@ def partial_fit(self, X): Returns ------- - PhrasesTransformer + :class:`~gensim.sklearn_api.phrases.PhrasesTransformer` The trained model. 
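+
+        A hypothetical sketch of incremental training (`more_sentences` stands for any
+        iterable of token lists, named here only for illustration):
+
+            >>> transformer = PhrasesTransformer(min_count=1, threshold=3)
+            >>> transformer = transformer.partial_fit(more_sentences)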
""" diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py index 2bfd14fa06..2d682e2736 100644 --- a/gensim/sklearn_api/text2bow.py +++ b/gensim/sklearn_api/text2bow.py @@ -4,6 +4,12 @@ # Copyright (C) 2011 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""Scikit learn interface for `gensim.corpora.Dictionary`. + +Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. + +""" + from six import string_types from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError @@ -13,10 +19,11 @@ class Text2BowTransformer(TransformerMixin, BaseEstimator): - """Base Text2Bow module + """Base Text2Bow module. - Scikit learn interface for `gensim.models.lsimodel` for easy use of gensim with scikit-learn. - Follows scikit-learn API conventions. + Wraps :class:`~gensim.corpora.dictionary.Dictionary`. + For more information on the inner workings please take a look at + the original class. """ @@ -45,7 +52,7 @@ def fit(self, X, y=None): Returns ------- - Text2BowTransformer + :class:`~gensim.sklearn_api.text2bow.Text2BowTransformer` The trained model. """ @@ -92,7 +99,7 @@ def partial_fit(self, X): Returns ------- - Text2BowTransformer + :class:`~gensim.sklearn_api.text2bow.Text2BowTransformer` The trained model. """ From c0272033072bb6e0029007d2959efa92e7b6b6de Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Mon, 12 Feb 2018 14:18:03 +0100 Subject: [PATCH 06/45] constructor docstrings now only mention the type of each argument. For explanation of their meaning the reader is redirected to the original models documentation --- gensim/sklearn_api/lsimodel.py | 12 +++-------- gensim/sklearn_api/phrases.py | 39 +++------------------------------- gensim/sklearn_api/text2bow.py | 5 +++-- 3 files changed, 9 insertions(+), 47 deletions(-) diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py index ae2c49e291..6a2168223b 100644 --- a/gensim/sklearn_api/lsimodel.py +++ b/gensim/sklearn_api/lsimodel.py @@ -42,24 +42,18 @@ def __init__(self, num_topics=200, id2word=None, chunksize=20000, decay=1.0, onepass=True, power_iters=2, extra_samples=100): """Sklearn wrapper for LSI model. + Parameters are propagated to the original models constructor. For an explanation + please refer to :meth:`~gensim.models.lsimodel.LsiModel.__init__` + Parameters ---------- num_topics : int, optional - Number of requested factors (latent dimensions) id2word : dict of {int: str}, optional - ID to word mapping, optional. chunksize : int, optional - Number of documents to be used in each training chunk. decay : float, optional - Weight of existing observations relatively to new ones. onepass : bool, optional - Whether the one-pass algorithm should be used for training. - Pass `False` to force a multi-pass stochastic algorithm. power_iters: int, optional - Number of power iteration steps to be used. - Increasing the number of power iterations improves accuracy, but lowers performance extra_samples : int, optional - Extra samples to be used besides the rank `k`. Can improve accuracy. """ self.gensim_model = None diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index 371c24be32..a7687d62cb 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -30,50 +30,17 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, delimiter=b'_', progress_per=10000, scoring='default'): """Sklearn wrapper for Phrases model. 
+ Parameters are propagated to the original models constructor. For an explanation + please refer to :meth:`~gensim.models.phrases.Phrases.__init__` + Parameters ---------- min_count : int - Terms with a count lower than this will be ignored threshold : float - Only phrases scoring above this will be accepted, see `scoring` below. max_vocab_size : int - Maximum size of the vocabulary. - Used to control pruning of less common words, to keep memory under control. - The default of 40M needs about 3.6GB of RAM; delimiter : str - Character used to join collocation tokens. Should be a byte string (e.g. b'_'). progress_per : int - Training will report to the logger every that many phrases are learned. scoring : str or callable - Specifies how potential phrases are scored for comparison to the `threshold` - setting. `scoring` can be set with either a string that refers to a built-in scoring function, - or with a function with the expected parameter names. Two built-in scoring functions are available - by setting `scoring` to a string: - - 'default': from [1]_. - 'npmi': normalized pointwise mutual information, from [2]_. - - 'npmi' is more robust when dealing with common words that form part of common bigrams, and - ranges from -1 to 1, but is slower to calculate than the default. - - To use a custom scoring function, create a function with the following parameters and set the `scoring` - parameter to the custom function. You must use all the parameters in your function call, even if the - function does not require all the parameters. - - worda_count: number of occurrances in `sentences` of the first token in the phrase being scored - wordb_count: number of occurrances in `sentences` of the second token in the phrase being scored - bigram_count: number of occurrances in `sentences` of the phrase being scored - len_vocab: the number of unique tokens in `sentences` - min_count: the `min_count` setting of the Phrases class - corpus_word_count: the total number of (non-unique) tokens in `sentences` - - A scoring function without any of these parameters (even if the parameters are not used) will - raise a ValueError on initialization of the Phrases class. The scoring function must be pic - - References - ---------- - .. [1] "Efficient Estimaton of Word Representations in Vector Space" by Mikolov, et. al. - .. [2] "Normalized (Pointwise) Mutual Information in Colocation Extraction" by Gerlof Bouma. """ self.gensim_model = None diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py index 2d682e2736..073c8b2c09 100644 --- a/gensim/sklearn_api/text2bow.py +++ b/gensim/sklearn_api/text2bow.py @@ -30,12 +30,13 @@ class Text2BowTransformer(TransformerMixin, BaseEstimator): def __init__(self, prune_at=2000000, tokenizer=tokenize): """Sklearn wrapper for Text2Bow model. + Parameters are propagated to the original models constructor. For an explanation + please refer to :meth:`~gensim.corpora.dictionary.Dictionary.__init__` + Parameters ---------- prune_at : int, optional - Total number of unique words. Dictionary will keep not more than `prune_at` words. 
tokenizer : callable (str -> list of str), optional - A callable to split a document into a list of each terms """ self.gensim_model = None From 3815605fd4bef57754dd99a88a7310c9f2d4ec74 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Tue, 13 Feb 2018 17:07:58 +0100 Subject: [PATCH 07/45] Brought back parameter explanation in the wrappers for easier lookup --- gensim/sklearn_api/lsimodel.py | 12 ++++++++--- gensim/sklearn_api/phrases.py | 39 +++++++++++++++++++++++++++++++--- gensim/sklearn_api/text2bow.py | 5 ++--- 3 files changed, 47 insertions(+), 9 deletions(-) diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py index 6a2168223b..ae2c49e291 100644 --- a/gensim/sklearn_api/lsimodel.py +++ b/gensim/sklearn_api/lsimodel.py @@ -42,18 +42,24 @@ def __init__(self, num_topics=200, id2word=None, chunksize=20000, decay=1.0, onepass=True, power_iters=2, extra_samples=100): """Sklearn wrapper for LSI model. - Parameters are propagated to the original models constructor. For an explanation - please refer to :meth:`~gensim.models.lsimodel.LsiModel.__init__` - Parameters ---------- num_topics : int, optional + Number of requested factors (latent dimensions) id2word : dict of {int: str}, optional + ID to word mapping, optional. chunksize : int, optional + Number of documents to be used in each training chunk. decay : float, optional + Weight of existing observations relatively to new ones. onepass : bool, optional + Whether the one-pass algorithm should be used for training. + Pass `False` to force a multi-pass stochastic algorithm. power_iters: int, optional + Number of power iteration steps to be used. + Increasing the number of power iterations improves accuracy, but lowers performance extra_samples : int, optional + Extra samples to be used besides the rank `k`. Can improve accuracy. """ self.gensim_model = None diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index a7687d62cb..371c24be32 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -30,17 +30,50 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, delimiter=b'_', progress_per=10000, scoring='default'): """Sklearn wrapper for Phrases model. - Parameters are propagated to the original models constructor. For an explanation - please refer to :meth:`~gensim.models.phrases.Phrases.__init__` - Parameters ---------- min_count : int + Terms with a count lower than this will be ignored threshold : float + Only phrases scoring above this will be accepted, see `scoring` below. max_vocab_size : int + Maximum size of the vocabulary. + Used to control pruning of less common words, to keep memory under control. + The default of 40M needs about 3.6GB of RAM; delimiter : str + Character used to join collocation tokens. Should be a byte string (e.g. b'_'). progress_per : int + Training will report to the logger every that many phrases are learned. scoring : str or callable + Specifies how potential phrases are scored for comparison to the `threshold` + setting. `scoring` can be set with either a string that refers to a built-in scoring function, + or with a function with the expected parameter names. Two built-in scoring functions are available + by setting `scoring` to a string: + + 'default': from [1]_. + 'npmi': normalized pointwise mutual information, from [2]_. + + 'npmi' is more robust when dealing with common words that form part of common bigrams, and + ranges from -1 to 1, but is slower to calculate than the default. 
+
+            To use a custom scoring function, create a function with the following parameters and set the `scoring`
+            parameter to the custom function. You must use all the parameters in your function call, even if the
+            function does not require all the parameters.
+
+            worda_count: number of occurrences in `sentences` of the first token in the phrase being scored
+            wordb_count: number of occurrences in `sentences` of the second token in the phrase being scored
+            bigram_count: number of occurrences in `sentences` of the phrase being scored
+            len_vocab: the number of unique tokens in `sentences`
+            min_count: the `min_count` setting of the Phrases class
+            corpus_word_count: the total number of (non-unique) tokens in `sentences`
+
+            A scoring function without any of these parameters (even if the parameters are not used) will
+            raise a ValueError on initialization of the Phrases class. The scoring function must be picklable.
+
+        References
+        ----------
+        .. [1] "Efficient Estimation of Word Representations in Vector Space" by Mikolov et al.
+        .. [2] "Normalized (Pointwise) Mutual Information in Collocation Extraction" by Gerlof Bouma.
 
         """
         self.gensim_model = None
diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py
index 073c8b2c09..2d682e2736 100644
--- a/gensim/sklearn_api/text2bow.py
+++ b/gensim/sklearn_api/text2bow.py
@@ -30,13 +30,12 @@ class Text2BowTransformer(TransformerMixin, BaseEstimator):
     def __init__(self, prune_at=2000000, tokenizer=tokenize):
         """Sklearn wrapper for Text2Bow model.
 
-        Parameters are propagated to the original models constructor. For an explanation
-        please refer to :meth:`~gensim.corpora.dictionary.Dictionary.__init__`
-
         Parameters
         ----------
         prune_at : int, optional
+            Total number of unique words. The dictionary will keep no more than `prune_at` words.
         tokenizer : callable (str -> list of str), optional
+            A callable to split a document into a list of terms.
 
         """
         self.gensim_model = None

From c1e05df6bfb4fd263e13bb037ade5bc3fa2843ff Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Thu, 15 Feb 2018 11:19:43 +0100
Subject: [PATCH 08/45] added examples to __doc__, work still in progress

---
 gensim/sklearn_api/lsimodel.py | 17 ++++++++++++++++-
 gensim/sklearn_api/text2bow.py | 14 ++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py
index ae2c49e291..3d1c82dc5d 100644
--- a/gensim/sklearn_api/lsimodel.py
+++ b/gensim/sklearn_api/lsimodel.py
@@ -13,10 +13,25 @@
 --------
 Integrate with sklearn Pipelines:
 
+    >>> from sklearn.pipeline import Pipeline
+    >>> from sklearn import linear_model
+    >>> from sklearn.datasets import fetch_20newsgroups
+    >>> from gensim.sklearn_api import LsiTransformer
+    >>> from gensim.corpora import Dictionary
+    >>>
+    >>> # Create an ID to word mapping using some corpus included in sklearn.
+    >>> cats = ['rec.sport.baseball', 'sci.crypt']
+    >>> data = fetch_20newsgroups(subset='train', categories=cats, shuffle=True)
+    >>> id2word = Dictionary([_.split() for _ in data.data])
 
+    >>> # Create stages for our pipeline (including gensim and sklearn models alike).
>>> model = LsiTransformer(num_topics=15, id2word=id2word) >>> clf = linear_model.LogisticRegression(penalty='l2', C=0.1) >>> pipe = Pipeline([('features', model,), ('classifier', clf)]) - >>> pipe.fit(corpus, data.target) + + >>> # Fit our pipeline to some corpus + >>> corpus = [id2word.doc2bow(i.split()) for i in data.data] + >>> fitted_pipeline = pipe.fit(corpus, data.target) """ diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py index 2d682e2736..4e575c5033 100644 --- a/gensim/sklearn_api/text2bow.py +++ b/gensim/sklearn_api/text2bow.py @@ -8,6 +8,20 @@ Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. +Examples +-------- + + >>> from gensim.sklearn_api import Text2BowTransformer + >>> + >>> # Get some random text. + >>> texts = [['complier system computer loading computer system']] + >>> + >>> # Create a transformer allowing 3 words only. + >>> model = Text2BowTransformer() + + >>> # Use sklearn style `fit_transform` to get the BOW representation. + >>> texts_bow = model.fit_transform(texts) + """ from six import string_types From 4cfbf5c395ba4fdb29193cac5308fc2146fbb2f0 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Thu, 15 Feb 2018 18:42:03 +0100 Subject: [PATCH 09/45] added simple and executable examples to `__doc__` --- gensim/sklearn_api/phrases.py | 18 ++++++++++++++++++ gensim/sklearn_api/text2bow.py | 13 +++++++------ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index 371c24be32..7231a47184 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -8,6 +8,24 @@ Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. +Examples +-------- + + >>> from gensim.sklearn_api.phrases import PhrasesTransformer + >>> + >>> # Create the model. Make sure no term is ignored and combinations seen 3+ times are captured. + >>> m = PhrasesTransformer(min_count=1, threshold=3) + >>> texts = [ + ... ['I', 'love', 'computer', 'science'], + ... ['computer', 'science', 'is', 'my', 'passion'], + ... ['I', 'studied', 'computer', 'science'] + ... ] + >>> + >>> # Use sklearn fit_transform to see the transformation. + >>> # Since computer and science were seen together 3+ times they are considered a phrase. + >>> m.fit_transform(texts)[0] + ['I', 'love', 'computer_science'] + """ from six import string_types diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py index 4e575c5033..0aa08b8eb1 100644 --- a/gensim/sklearn_api/text2bow.py +++ b/gensim/sklearn_api/text2bow.py @@ -13,14 +13,15 @@ >>> from gensim.sklearn_api import Text2BowTransformer >>> - >>> # Get some random text. - >>> texts = [['complier system computer loading computer system']] + >>> # Get a corpus as an iterable of unicode strings. + >>> texts = [u'complier system computer', u'loading computer system'] >>> - >>> # Create a transformer allowing 3 words only. + >>> # Create a transformer.. >>> model = Text2BowTransformer() - - >>> # Use sklearn style `fit_transform` to get the BOW representation. - >>> texts_bow = model.fit_transform(texts) + >>> + >>> # Use sklearn style `fit_transform` to get the BOW representation of each document. 
+    >>> model.fit_transform(texts)
+    [[(0, 1), (1, 1), (2, 1)], [(1, 1), (2, 1), (3, 1)]]
 
 """
 
From 3581a46be148ef45e49d13f5a92716a7b575596e Mon Sep 17 00:00:00 2001
From: "Stergiadis, E"
Date: Mon, 19 Feb 2018 18:37:19 +0100
Subject: [PATCH 10/45] temp work on some more wrappers

---
 gensim/sklearn_api/d2vmodel.py | 133 +++++++++++++++++++++++++++++----
 gensim/sklearn_api/ldamodel.py |  96 +++++++++++++++++++++---
 gensim/sklearn_api/tfidf.py    |  98 +++++++++++++++++++++---
 3 files changed, 291 insertions(+), 36 deletions(-)

diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py
index 1e3bf61d7d..d2d9dcf642 100644
--- a/gensim/sklearn_api/d2vmodel.py
+++ b/gensim/sklearn_api/d2vmodel.py
@@ -4,9 +4,15 @@
 # Copyright (C) 2011 Radim Rehurek
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""
-Scikit learn interface for gensim for easy use of gensim with scikit-learn
-Follows scikit-learn API conventions
+"""Scikit learn interface for :class:`~gensim.models.doc2vec.Doc2Vec`.
+
+Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.
+
+Examples
+--------
+
+
+
 """
 
 import numpy as np
@@ -19,16 +25,96 @@
 
 
 class D2VTransformer(TransformerMixin, BaseEstimator):
-    """
-    Base Doc2Vec module
+    """Base Doc2Vec module.
+
+    Wraps :class:`~gensim.models.doc2vec.Doc2Vec`.
+    For more information on the inner workings please take a look at
+    the original class.
+
     """
 
     def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None,
                  docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5,
                  max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1,
                  hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000):
-        """
-        Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details.
+        """Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details.
+
+        Parameters
+        ----------
+
+        dm_mean : int {1,0}, optional
+            If 0, use the sum of the context word vectors. If 1, use the mean.
+            Only applies when `dm` is used in non-concatenative mode.
+        dm : int {1,0}, optional
+            Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used.
+            Otherwise, `distributed bag of words` (PV-DBOW) is employed.
+        dbow_words : int {1,0}, optional
+            If set to 1 trains word-vectors (in skip-gram fashion) simultaneously with DBOW
+            doc-vector training; If 0, only trains doc-vectors (faster).
+        dm_concat : int {1,0}, optional
+            If 1, use concatenation of context vectors rather than sum/average;
+            Note concatenation results in a much-larger model, as the input
+            is no longer the size of one (sampled or arithmetically combined) word vector, but the
+            size of the tag(s) and all words in the context strung together.
+        dm_tag_count : int, optional
+            Expected constant number of document tags per document, when using
+            dm_concat mode; default is 1.
+        docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors`
+            A mapping from a tag to its vector representation. Either this or `docvecs_mapfile` **MUST** be supplied.
+        docvecs_mapfile : str, optional
+            Path to a file containing the docvecs mapping.
+            If `docvecs` is None, this file will be used to create it.
+        comment : str, optional
+            A model descriptive comment, used for logging and debugging purposes.
+        trim_rule : callable ((str, int, int) -> int), optional
+            Vocabulary trimming rule that accepts (word, count, min_count).
+            Specifies whether certain words should remain in the vocabulary (:attr:`gensim.utils.RULE_KEEP`),
+            be trimmed away (:attr:`gensim.utils.RULE_DISCARD`), or handled using the default (:attr:`gensim.utils.RULE_DEFAULT`).
+            If None, then :func:`~gensim.utils.keep_vocab_item` will be used.
+            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model.
+        size : int
+            Dimensionality of the feature vectors.
+        alpha : float
+            The initial learning rate.
+        window : int
+            The maximum distance between the current and predicted word within a sentence.
+        min_count : int
+            Ignores all words with total frequency lower than this.
+        max_vocab_size : int
+            Limits the RAM during vocabulary building; if there are more unique
+            words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
+            Set to `None` for no limit.
+        sample : float
+            The threshold for configuring which higher-frequency words are randomly downsampled,
+            useful range is (0, 1e-5).
+        seed : int
+            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
+            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
+            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
+            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
+            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
+        workers : int
+            Use these many worker threads to train the model (=faster training with multicore machines).
+        min_alpha : float
+            Learning rate will linearly drop to `min_alpha` as training progresses.
+        hs : int {1,0}
+            If 1, hierarchical softmax will be used for model training.
+            If set to 0, and `negative` is non-zero, negative sampling will be used.
+        negative : int
+            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
+            should be drawn (usually between 5-20).
+            If set to 0, no negative sampling is used.
+        cbow_mean : int
+            Same as `dm_mean`, unused.
+        hashfxn : callable (object -> int), optional
+            A hashing function. Used to create an initial random reproducible vector by hashing the random seed.
+        iter : int, optional
+            Number of iterations (epochs) over the corpus.
+        sorted_vocab : bool, optional
+            Whether the vocabulary should be sorted internally.
+        batch_words : int
+            Number of words to be handled by each job.
+
+        """
         self.gensim_model = None
         self.dm_mean = dm_mean
@@ -60,9 +146,18 @@
         self.batch_words = batch_words
 
     def fit(self, X, y=None):
-        """
-        Fit the model according to the given training data.
-        Calls gensim.models.Doc2Vec
+        """Fit the model according to the given training data.
+
+        Parameters
+        ----------
+        X : {iterable of :class:`~gensim.models.doc2vec.TaggedDocument`, iterable of iterable of str}
+            A collection of documents used for training the model. If they are not
+            :class:`~gensim.models.doc2vec.TaggedDocument` instances, each document's index is used as its tag.
+
+        Returns
+        -------
+        :class:`~gensim.sklearn_api.d2vmodel.D2VTransformer`
+            The trained model.
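+
+        A hypothetical sketch (`docs` stands for any iterable of token lists; when plain
+        token lists are passed instead of :class:`~gensim.models.doc2vec.TaggedDocument`,
+        each document's index is used as its tag):
+
+            >>> transformer = D2VTransformer(min_count=1, size=10).fit(docs)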
+
+        """
         if isinstance(X[0], doc2vec.TaggedDocument):
             d2v_sentences = X
@@ -81,12 +176,18 @@ def fit(self, X, y=None):
         return self
 
     def transform(self, docs):
-        """
-        Return the vector representations for the input documents.
-        The input `docs` should be a list of lists like
-        [['calculus', 'mathematical'],
-        ['geometry', 'operations', 'curves']]
-        or a single document like : ['calculus', 'mathematical']
+        """Get the vector representations for the input documents.
+
+        Parameters
+        ----------
+        docs : iterable of iterable of str
+            The input corpus.
+
+        Returns
+        -------
+        np.array of shape (`len(docs)`, `size`)
+            The vector representation of the input corpus.
+
         """
         if self.gensim_model is None:
             raise NotFittedError(
diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py
index 178a52c571..a49cc355ba 100644
--- a/gensim/sklearn_api/ldamodel.py
+++ b/gensim/sklearn_api/ldamodel.py
@@ -5,9 +5,14 @@
 # Copyright (C) 2017 Radim Rehurek
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""
-Scikit learn interface for gensim for easy use of gensim with scikit-learn
-follows on scikit learn API conventions
+"""Scikit learn interface for :class:`~gensim.models.ldamodel.LdaModel`.
+
+Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.
+
+Examples
+--------
+
+
 """
 
 import numpy as np
@@ -20,20 +25,93 @@
 
 
 class LdaTransformer(TransformerMixin, BaseEstimator):
-    """
-    Base LDA module
+    """Base LDA module.
+
+    Wraps :class:`~gensim.models.ldamodel.LdaModel`.
+    For more information on the inner workings please take a look at
+    the original class.
+
     """
 
     def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, update_every=1, alpha='symmetric',
                  eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001,
                  minimum_probability=0.01, random_state=None, scorer='perplexity', dtype=np.float32):
-        """
-        Sklearn wrapper for LDA model. See gensim.model.LdaModel for parameter details.
-        `scorer` specifies the metric used in the `score` function.
+        """Sklearn wrapper for LDA model.
+
+        Parameters
+        ----------
+
+        num_topics : int, optional
+            The number of requested latent topics to be extracted from the training corpus.
+        id2word : dict of (int, str), optional
+            Mapping from integer ID to words in the corpus. Used to determine vocabulary size and logging.
+        chunksize : int, optional
+            If `distributed` is True, this is the number of documents to be handled in each worker job.
+        passes : int
+            Number of passes through the corpus during online training.
+        update_every : int
+            Number of documents to be iterated through for each update. Set to 0 for batch learning, > 1 for online iterative learning.
+        alpha : {np.array, str}
+            Can be set to a 1D array of length equal to the number of expected topics that expresses
+            our a-priori belief for each topic's probability.
+            Alternatively default prior selecting strategies can be employed by supplying a string:
+
+                'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / topicno`.
+                'auto': Learns an asymmetric prior from the corpus.
+        eta : {np.array, str}
+
+        decay
+        offset
+        eval_every
+        iterations
+        gamma_threshold
+        minimum_probability
+        random_state
+        scorer
+        dtype
+
+
+        `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
+        (theta) and topic-word (lambda) distributions. Both default to a symmetric
+        1.0/num_topics prior.
+
+        `alpha` can be set to an explicit array = prior of your choice.
It also + support special values of 'asymmetric' and 'auto': the former uses a fixed + normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric + prior directly from your data. + + `eta` can be a scalar for a symmetric prior over topic/word + distributions, or a vector of shape num_words, which can be used to + impose (user defined) asymmetric priors over the word distribution. + It also supports the special value 'auto', which learns an asymmetric + prior over words directly from your data. `eta` can also be a matrix + of shape num_topics x num_words, which can be used to impose + asymmetric priors over the word distribution on a per-topic basis + (can not be learned from data). + + Turn on `distributed` to force distributed computing + (see the `web tutorial `_ + on how to set up a cluster of machines for gensim). + + Calculate and log perplexity estimate from the latest mini-batch every + `eval_every` model updates (setting this to 1 slows down training ~2x; + default is 10 for better performance). Set to None to disable perplexity estimation. + + `decay` and `offset` parameters are the same as Kappa and Tau_0 in + Hoffman et al, respectively. + + `minimum_probability` controls filtering the topics returned for a document (bow). + + `random_state` can be a np.random.RandomState object or the seed for one. + + `callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training. + + `dtype` is data-type to use during calculations inside model. All inputs are also converted to this dtype. + Available types: `numpy.float16`, `numpy.float32`, `numpy.float64`. - See `gensim.models.LdaModel` class for description of the other parameters. """ + self.gensim_model = None self.num_topics = num_topics self.id2word = id2word diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index dc4ab86c01..427dda3b40 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -4,10 +4,16 @@ # Copyright (C) 2011 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""Scikit learn interface for :class:`~gensim.models.tfidfmodel.TfidfModel`. + +Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. + +Examples +-------- + + """ -Scikit learn interface for gensim for easy use of gensim with scikit-learn -Follows scikit-learn API conventions -""" + from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError @@ -17,14 +23,64 @@ class TfIdfTransformer(TransformerMixin, BaseEstimator): - """ - Base Tf-Idf module + """Base TfIdf module. + + Wraps :class:`~gensim.models.tfidfmodel.TfidfModel`. + For more information on the inner workings please take a look at + the original class. + """ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, wglobal=gensim.models.tfidfmodel.df2idf, normalize=True, smartirs="ntc"): - """ - Sklearn wrapper for Tf-Idf model. + """Sklearn wrapper for TfIdf model. + + Parameters + ---------- + + id2word : {dict, :class:`~gensim.corpora.Dictionary`}, optional + Mapping token - id, that was used for converting input data to bag of words format. + dictionary : :class:`~gensim.corpora.Dictionary` + If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used. + to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored). 
+        wlocal : function, optional
+            Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity`
+            (other options: :func:`math.sqrt`, :func:`math.log1p`, etc).
+        wglobal : function, optional
+            Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`.
+        normalize : bool, optional
+            Dictates how the final transformed vectors will be normalized. `normalize=True` means set to unit length
+            (default); `False` means don't normalize. You can also set `normalize` to your own function that accepts
+            and returns a sparse vector.
+        smartirs : str, optional
+            SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System,
+            a mnemonic scheme for denoting tf-idf weighting variants in the vector space model.
+            The mnemonic for representing a combination of weights takes the form XYZ,
+            for example 'ntc', 'bpn' and so on, where the letters represent the term weighting of the document vector.
+
+            Term frequency weighting:
+                * `n` - natural,
+                * `l` - logarithm,
+                * `a` - augmented,
+                * `b` - boolean,
+                * `L` - log average.
+
+            Document frequency weighting:
+                * `n` - none,
+                * `t` - idf,
+                * `p` - prob idf.
+
+            Document normalization:
+                * `n` - none,
+                * `c` - cosine.
+
+            For more information visit [1]_.
+
+        References
+        ----------
+
+        .. [1] https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System
+
         """
         self.gensim_model = None
         self.id2word = id2word
@@ -35,8 +91,18 @@
         self.smartirs = smartirs
 
     def fit(self, X, y=None):
-        """
-        Fit the model according to the given training data.
+        """Fit the model according to the given training data.
+
+        Parameters
+        ----------
+        X : iterable of iterable of (int, int)
+            Input corpus in BoW format.
+
+        Returns
+        -------
+        :class:`~gensim.sklearn_api.tfidf.TfIdfTransformer`
+            The trained model.
+
         """
         self.gensim_model = TfidfModel(
             corpus=X, id2word=self.id2word, dictionary=self.dictionary, wlocal=self.wlocal,
@@ -45,8 +111,18 @@
         return self
 
     def transform(self, docs):
-        """
-        Return the transformed documents after multiplication with the tf-idf matrix.
+        """Get the transformed documents after multiplication with the tf-idf matrix.
+
+        Parameters
+        ----------
+        docs : iterable of iterable of (int, int)
+            Input corpus in BoW format.
+
+        Returns
+        -------
+        iterable of list of (int, float)
+            The TF-IDF representation of each document.
+
         """
         if self.gensim_model is None:
             raise NotFittedError(

From 8ef1105cab0534082036a563af08de6c0916cdc4 Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Tue, 20 Feb 2018 00:50:05 +0100
Subject: [PATCH 11/45] finished docstrings for LDA wrapper, examples pending

---
 gensim/sklearn_api/ldamodel.py | 184 +++++++++++++++++++--------------
 1 file changed, 107 insertions(+), 77 deletions(-)

diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py
index a49cc355ba..71055a3c7b 100644
--- a/gensim/sklearn_api/ldamodel.py
+++ b/gensim/sklearn_api/ldamodel.py
@@ -38,77 +38,73 @@ def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, updat
         minimum_probability=0.01, random_state=None, scorer='perplexity', dtype=np.float32):
         """Sklearn wrapper for LDA model.
 
+        Based on [1]_.
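+
+        A minimal usage sketch (`corpus` and `id2word` are hypothetical and shown
+        only for illustration; examples for this module are still pending):
+
+            >>> lda = LdaTransformer(num_topics=2, id2word=id2word, iterations=20, random_state=1)
+            >>> docs_topics = lda.fit(corpus).transform(corpus)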
+
+        Notes
+        -----
+        Configure `passes` and `update_every` params to choose the mode among:
+
+            - online (single-pass): update_every != None and passes == 1
+            - online (multi-pass): update_every != None and passes > 1
+            - batch: update_every == None
+
+        By default, 'online (single-pass)' mode is used for training the LDA model.
+
+        References
+        ----------
+        .. [1] Matthew D. Hoffman, David M. Blei, Francis Bach, "Online Learning for Latent Dirichlet Allocation",
+            NIPS'10 Proceedings of the 23rd International Conference on Neural Information Processing Systems -
+            Volume 1 Pages 856-864, https://www.di.ens.fr/~fbach/mdhnips2010.pdf
 
         Parameters
         ----------
-
         num_topics : int, optional
             The number of requested latent topics to be extracted from the training corpus.
         id2word : dict of (int, str), optional
             Mapping from integer ID to words in the corpus. Used to determine vocabulary size and logging.
         chunksize : int, optional
             If `distributed` is True, this is the number of documents to be handled in each worker job.
-        passes : int
+        passes : int, optional
             Number of passes through the corpus during online training.
-        update_every : int
-            Number of documents to be iterated through for each update. Set to 0 for batch learning, > 1 for online iterative learning.
-        alpha : {np.array, str}
+        update_every : int, optional
+            Number of documents to be iterated through for each update.
+            Set to 0 for batch learning, > 1 for online iterative learning.
+        alpha : {np.array, str}, optional
             Can be set to a 1D array of length equal to the number of expected topics that expresses
             our a-priori belief for each topic's probability.
             Alternatively default prior selecting strategies can be employed by supplying a string:
-
                 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / topicno`.
                 'auto': Learns an asymmetric prior from the corpus.
-        eta : {np.array, str}
-
-        decay
-        offset
-        eval_every
-        iterations
-        gamma_threshold
-        minimum_probability
-        random_state
-        scorer
-        dtype
-
-
-        `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
-        (theta) and topic-word (lambda) distributions. Both default to a symmetric
-        1.0/num_topics prior.
-
-        `alpha` can be set to an explicit array = prior of your choice. It also
-        support special values of 'asymmetric' and 'auto': the former uses a fixed
-        normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
-        prior directly from your data.
-
-        `eta` can be a scalar for a symmetric prior over topic/word
-        distributions, or a vector of shape num_words, which can be used to
-        impose (user defined) asymmetric priors over the word distribution.
-        It also supports the special value 'auto', which learns an asymmetric
-        prior over words directly from your data. `eta` can also be a matrix
-        of shape num_topics x num_words, which can be used to impose
-        asymmetric priors over the word distribution on a per-topic basis
-        (can not be learned from data).
-
-        Turn on `distributed` to force distributed computing
-        (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_
-        on how to set up a cluster of machines for gensim).
-
-        Calculate and log perplexity estimate from the latest mini-batch every
-        `eval_every` model updates (setting this to 1 slows down training ~2x;
-        default is 10 for better performance). Set to None to disable perplexity estimation.
-
-        `decay` and `offset` parameters are the same as Kappa and Tau_0 in
-        Hoffman et al, respectively.
-
-        `minimum_probability` controls filtering the topics returned for a document (bow).
-
-        `random_state` can be a np.random.RandomState object or the seed for one.
-
-        `callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training.
-
-        `dtype` is data-type to use during calculations inside model. All inputs are also converted to this dtype.
-            Available types: `numpy.float16`, `numpy.float32`, `numpy.float64`.
+        eta : {float, np.array, str}, optional
+            A-priori belief on word probability. This can be:
+                a scalar for a symmetric prior over topic/word probability,
+                a vector of length num_words to denote an asymmetric user-defined probability for each word,
+                a matrix of shape (`num_topics`, num_words) to assign a probability to each word conditioned on each topic,
+                or the string 'auto' to learn the asymmetric prior from the data.
+        decay : float, optional
+            A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
+            when each new document is examined. Corresponds to Kappa from [1]_.
+        offset : float, optional
+            Hyperparameter that controls how much we slow down the first few iterations.
+            Corresponds to Tau_0 from [1]_.
+        eval_every : int, optional
+            Log perplexity is estimated every `eval_every` updates. Setting this to one slows down training by ~2x.
+        iterations : int, optional
+            Maximum number of iterations through the corpus when inferring the topic distribution of a corpus.
+        gamma_threshold : float, optional
+            Minimum change in the value of the gamma parameters to continue iterating.
+        minimum_probability : float, optional
+            Topics with a probability lower than this threshold will be filtered out.
+        random_state : {np.random.RandomState, int}, optional
+            Either a np.random.RandomState object or a seed to generate one. Useful for reproducibility.
+        scorer : str, optional
+            Method to compute a score reflecting how well the model has fit the input corpus.
+            Allowed values are:
+                'perplexity': Minimize the model's perplexity.
+                'u_mass': Use :class:`~gensim.models.coherencemodel.CoherenceModel` to compute topic coherence.
+        dtype : type, optional
+            Data-type to use during calculations inside model. All inputs are also converted.
+            Available types: `numpy.float16`, `numpy.float32`, `numpy.float64`.
 
         """
 
         self.gensim_model = None
         self.num_topics = num_topics
@@ -131,9 +127,18 @@
         self.dtype = dtype
 
     def fit(self, X, y=None):
-        """
-        Fit the model according to the given training data.
-        Calls gensim.models.LdaModel
+        """Fit the model according to the given training data.
+
+        Parameters
+        ----------
+        X : {iterable of iterable of (int, int), scipy.sparse matrix}
+            A collection of documents in BOW format used for training the model.
+
+        Returns
+        -------
+        :class:`~gensim.sklearn_api.ldamodel.LdaTransformer`
+            The trained model.
+
         """
         if sparse.issparse(X):
             corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False)
@@ -151,14 +156,18 @@
         return self
 
     def transform(self, docs):
-        """
-        Takes a list of documents as input ('docs').
-        Returns a matrix of topic distribution for the given document bow, where a_ij
-        indicates (topic_i, topic_probability_j).
-        The input `docs` should be in BOW format and can be a list of documents like
-        [[(4, 1), (7, 1)],
-        [(9, 1), (13, 1)], [(2, 1), (6, 1)]]
-        or a single document like : [(4, 1), (7, 1)]
+        """Infer the topic distribution for the input documents.
+
+        Parameters
+        ----------
+        docs : iterable of iterable of (int, int)
+            A collection of documents in BOW format to be transformed.
+ + Returns + ------- + np.array of shape (`len(docs)`, `num_topics`) + The topic distribution for each input document. + """ if self.gensim_model is None: raise NotFittedError( @@ -174,14 +183,22 @@ def transform(self, docs): return np.reshape(np.array(distribution), (len(docs), self.num_topics)) def partial_fit(self, X): - """ - Train model over X. - By default, 'online (single-pass)' mode is used for training the LDA model. - Configure `passes` and `update_every` params at init to choose the mode among : + """Train model over a potentially incomplete set of documents. - - online (single-pass): update_every != None and passes == 1 - - online (multi-pass): update_every != None and passes > 1 - - batch: update_every == None + Uses the parameters set in the constructor. + This method can be used in two ways: + 1. On an unfitted model in which case the model is initialized and trained on `X`. + 2. On an already fitted model in which case the model is **updated** by `X`. + + Parameters + ---------- + X : {iterable of iterable of (int, int), scipy.sparse matrix} + A collection of documents in BOW format used for training the model. + + Returns + ------- + :class:`~gensim.sklearn_api.ldamodel.LdaTransformer` + The trained model. """ if sparse.issparse(X): @@ -201,8 +218,21 @@ def partial_fit(self, X): return self def score(self, X, y=None): - """ - Compute score reflecting how well the model has fit for the input data. + """Compute score reflecting how well the model has fitted for the input data. + + The scoring method is set using the `scorer` argument in :meth:`~gensim.sklearn_api.ldamodel.LdaTransformer`. + Higher score is better. + + Parameters + ---------- + X : iterable of iterable of (int, int) + Input corpus in BOW format. + + Returns + ------- + float + The score computed based on the selected method. + """ if self.scorer == 'perplexity': corpus_words = sum(cnt for document in X for _, cnt in document) @@ -214,4 +244,4 @@ def score(self, X, y=None): goodcm = models.CoherenceModel(model=self.gensim_model, corpus=X, coherence=self.scorer, topn=3) return goodcm.get_coherence() else: - raise ValueError("Invalid value of `scorer` param supplied") + raise ValueError("Invalid value {} supplied for `scorer` param".format(self.scorer)) From add7420f50568bfbd0e4dae2078f81dad0b1d634 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Tue, 20 Feb 2018 18:54:58 +0100 Subject: [PATCH 12/45] finished doc2vec wrapper with example --- gensim/sklearn_api/d2vmodel.py | 49 +++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index d2d9dcf642..ad29dfbeee 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -11,7 +11,13 @@ Examples -------- - + >>> from gensim.test.utils import common_texts + >>> from gensim.sklearn_api import D2VTransformer + >>> + >>> # Lets represent each document using a 50 dimensional vector + >>> model = D2VTransformer(min_count=1, size=50) + >>> docvecs = model.fit_transform(common_texts) + >>> assert docvecs.shape == (len(common_texts), 50) """ @@ -37,7 +43,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000): - """Sklearn api for Doc2Vec model. 
See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. + """Sklearn api for Doc2Vec model. Parameters ---------- @@ -60,7 +66,8 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 Expected constant number of document tags per document, when using dm_concat mode; default is 1. docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors` - A mapping from a tag to its vector representation. Either this or `docvecs_mapfile` **MUST** be supplied. + A mapping from a string or int tag to its vector representation. + Either this or `docvecs_mapfile` **MUST** be supplied. docvecs_mapfile : str, optional Path to a file containing the docvecs mapping. If `docvecs` is None, this file will be used to create it. @@ -72,47 +79,46 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 be trimmed away (:attr:`gensim.utils.RULE_DISCARD`), or handled using the default (:attr:`gensim.utils.RULE_DEFAULT`). If None, then :func:`~gensim.utils.keep_vocab_item` will be used. Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. - size : int + size : int, optional Dimensionality of the feature vectors. - alpha : float + alpha : float, optional The initial learning rate. - window : int + window : int, optional The maximum distance between the current and predicted word within a sentence. - min_count : int + min_count : int, optional Ignores all words with total frequency lower than this. - max_vocab_size : int + max_vocab_size : int, optional Limits the RAM during vocabulary building; if there are more unique words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. Set to `None` for no limit. - sample : float + sample : float, optional The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5). - seed : int + seed : int, optional Seed for the random number generator. Initial vectors for each word are seeded with a hash of the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run, you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires use of the `PYTHONHASHSEED` environment variable to control hash randomization). - workers : int - Use these many worker threads to train the model (=faster training with multicore machines). - min_alpha : float + workers : int, optional + Use this many worker threads to train the model. Will yield a speedup when training with multicore machines. + min_alpha : float, optional Learning rate will linearly drop to `min_alpha` as training progresses. - hs : int {1,0} + hs : int {1,0}, optional If 1, hierarchical softmax will be used for model training. If set to 0, and `negative` is non-zero, negative sampling will be used. - negative : int + negative : int, optional If > 0, negative sampling will be used, the int for negative specifies how many "noise words" - should be drawn (usually between 5-20). - If set to 0, no negative sampling is used. - cbow_mean : int + should be drawn (usually between 5-20). If set to 0, no negative sampling is used. + cbow_mean : int, optional Same as `dm_mean`, unused. hashfxn : callable (object -> int), optional A hashing function. Used to create an initial random reproducible vector by hashing the random seed. 
iter : int, optional - Number of iterations (epochs) over the corpus. + Number of epochs to iterate through the corpus. sorted_vocab : bool, optional Whether the vocabulary should be sorted internally. - batch_words : int + batch_words : int, optional Number of words to be handled by each job. """ @@ -151,7 +157,8 @@ def fit(self, X, y=None): Parameters ---------- X : {iterable of {:class:`~gensim.models.doc2vec.TaggedDocument`, iterable of iterable of str} - A collection of tagged documents used for training the model. If these are not tags, their order index will be used to tag them. + A collection of tagged documents used for training the model. + If these are not tagged, their order integer index will be used to tag them. Returns ------- From 38a610f0a8c7c44aa035f45f58b4c2f65d87d7e1 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Tue, 20 Feb 2018 19:02:29 +0100 Subject: [PATCH 13/45] completed LDA wrapper including example --- gensim/sklearn_api/ldamodel.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py index 71055a3c7b..65de219d80 100644 --- a/gensim/sklearn_api/ldamodel.py +++ b/gensim/sklearn_api/ldamodel.py @@ -12,6 +12,13 @@ Examples -------- + >>> from gensim.test.utils import common_corpus, common_dictionary + >>> from gensim.sklearn_api import LdaTransformer + >>> + >>> # Reduce each document to 2 dimensions (topics) using the sklearn interface. + >>> model = LdaTransformer(num_topics=2, id2word=common_dictionary, iterations=20, random_state=1) + >>> docvecs = model.fit_transform(common_corpus) + >>> assert docvecs.shape == (len(common_corpus), 2) """ @@ -77,15 +84,15 @@ def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, updat 'default': Learns an assymetric prior from the corpus. eta : {float, np.array, str}, optional A-priori belief on word probability. This can be: - a scalar for a symmetric prior over topic/word probability - a vector : of length num_words to denote an assymetric user defined probability for each word. - a matrix of shape (`num_topics`, num_words) to assign a probability for each word condition on each topic. + a scalar for a symmetric prior over topic/word probability. + a vector : of length num_words to denote an asymmetric user defined probability for each word. + a matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination. the string 'auto' to learn the asymmetric prior from the data. decay : float, optional A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten when each new document is examined. Corresponds to Kappa from [1]_. offset : float, optional - Hyperparameter that controls how much we will slow down the first steps the first few iterations. + Hyper-parameter that controls how much we will slow down the first steps the first few iterations. Corresponds to Tau_0 from [1]_. eval_every : int, optional Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x. 
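A minimal sketch of how the `eta` and `scorer` options documented in the patch above fit
together. This is illustrative only, assuming gensim's bundled `common_corpus` /
`common_dictionary` test fixtures and the `u_mass` coherence scorer; it is not part of any patch:

>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.sklearn_api import LdaTransformer
>>>
>>> # 'auto' asks the model to learn the asymmetric word prior `eta` from the data itself.
>>> model = LdaTransformer(num_topics=2, id2word=common_dictionary, eta='auto', scorer='u_mass', random_state=1)
>>> topic_dist = model.fit_transform(common_corpus)
>>> assert topic_dist.shape == (len(common_corpus), 2)
>>>
>>> # score() applies the configured scorer; with 'u_mass' it reports topic coherence (higher is better).
>>> coherence = model.score(common_corpus)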
From 5f00f34872ac7099fee2f0c2c5559d44ece30ca6 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Tue, 20 Feb 2018 19:08:15 +0100 Subject: [PATCH 14/45] finished the tfidf wrapper including example --- gensim/sklearn_api/tfidf.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index 427dda3b40..48b41920e1 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -11,6 +11,14 @@ Examples -------- + >>> from gensim.test.utils import common_corpus, common_dictionary + >>> from gensim.sklearn_api import TfIdfTransformer + >>> + >>> # Transform the word counts inversely to their global frequency using the sklearn interface. + >>> model = TfIdfTransformer(dictionary=common_dictionary) + >>> weighted_corpus = model.fit_transform(common_corpus) + >>> weighted_corpus[0] + [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)] """ @@ -39,13 +47,12 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, ---------- id2word : {dict, :class:`~gensim.corpora.Dictionary`}, optional - Mapping token - id, that was used for converting input data to bag of words format. - dictionary : :class:`~gensim.corpora.Dictionary` - If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used. - to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored). + Mapping from int id to word token, that was used for converting input data to bag of words format. + dictionary : :class:`~gensim.corpora.Dictionary`, optional + If specified it will be used to directly construct the inverse document frequency mapping. wlocals : function, optional - Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity` - (other options: :func:`math.sqrt`, :func:`math.log1p`, etc). + Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity` which does nothing. + Other options include :func:`math.sqrt`, :func:`math.log1p`, etc. wglobal : function, optional Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`. normalize : bool, optional @@ -121,7 +128,7 @@ def transform(self, docs): Returns ------- iterable of list (int, float) 2-tuples. - The BOW representation of each document. + The BOW representation of each document. Will have the same shape as `docs`. """ if self.gensim_model is None: From 1d8c63ccf00bb16bea15d30ad863d23310344f21 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Tue, 20 Feb 2018 19:26:10 +0100 Subject: [PATCH 15/45] PEP-8 corrections --- gensim/sklearn_api/d2vmodel.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index ad29dfbeee..2cdd887458 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -76,9 +76,10 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 trim_rule : callable ((str, int, int) -> int), optional Vocabulary trimming rule that accepts (word, count, min_count). Specifies whether certain words should remain in the vocabulary (:attr:`gensim.utils.RULE_KEEP`), - be trimmed away (:attr:`gensim.utils.RULE_DISCARD`), or handled using the default (:attr:`gensim.utils.RULE_DEFAULT`). - If None, then :func:`~gensim.utils.keep_vocab_item` will be used. 
- Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. + be trimmed away (:attr:`gensim.utils.RULE_DISCARD`), or handled using the default + (:attr:`gensim.utils.RULE_DEFAULT`).If None, then :func:`~gensim.utils.keep_vocab_item` will be used. + Note: The rule, if given, is only used to prune vocabulary during build_vocab() + and is not stored as part of the model. size : int, optional Dimensionality of the feature vectors. alpha : float, optional From f8fffd6cc8e56aa97a8ee64afb959802d36967ca Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Wed, 21 Feb 2018 14:16:46 +0100 Subject: [PATCH 16/45] w2v documentation - example result pending --- gensim/sklearn_api/text2bow.py | 2 +- gensim/sklearn_api/w2vmodel.py | 126 +++++++++++++++++++++++++++++---- 2 files changed, 115 insertions(+), 13 deletions(-) diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py index 0aa08b8eb1..f51cea62ba 100644 --- a/gensim/sklearn_api/text2bow.py +++ b/gensim/sklearn_api/text2bow.py @@ -4,7 +4,7 @@ # Copyright (C) 2011 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Scikit learn interface for `gensim.corpora.Dictionary`. +"""Scikit learn interface for :class:`~gensim.corpora.dictionary.Dictionary`. Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 317842ee07..f8f222f943 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -5,10 +5,25 @@ # Copyright (C) 2017 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""Scikit learn interface for :class:`~gensim.models.word2vec.Word2Vec`. + +Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. + + +Examples +-------- + >>> from gensim.test.utils import common_texts + >>> from gensim.sklearn_api import W2VTransformer + >>> + >>> # Create a model to represent each word by a 10 dimensional vector. + >>> model = W2VTransformer(size=10, min_count=1) + >>> model.fit(common_texts) + >>> + >>> # What is the vector representation of the word 'graph'? + >>> model.transform('graph') + """ -Scikit learn interface for gensim for easy use of gensim with scikit-learn -Follows scikit-learn API conventions -""" + import numpy as np import six @@ -19,16 +34,81 @@ class W2VTransformer(TransformerMixin, BaseEstimator): - """ - Base Word2Vec module - """ + """Base Word2Vec module. + + Wraps :class:`~gensim.models.word2vec.Word2Vec`. + For more information on the inner workings please take a look at + the original class. + """ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000): """ - Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details. + Initialize the model from an iterable of `sentences`. Each sentence is a + list of words (unicode strings) that will be used for training. + + Parameters + ---------- + size : int + Dimensionality of the feature vectors. + alpha : float + The initial learning rate. + window : int + The maximum distance between the current and predicted word within a sentence. + min_count : int + Ignores all words with total frequency lower than this. 
+        max_vocab_size : int
+            Limits the RAM during vocabulary building; if there are more unique
+            words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
+            Set to `None` for no limit.
+        sample : float
+            The threshold for configuring which higher-frequency words are randomly downsampled,
+            useful range is (0, 1e-5).
+        seed : int
+            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
+            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
+            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
+            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
+            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
+        workers : int
+            Use these many worker threads to train the model (=faster training with multicore machines).
+        min_alpha : float
+            Learning rate will linearly drop to `min_alpha` as training progresses.
+        sg : int {1, 0}
+            Defines the training algorithm. If 1, skip-gram is employed, otherwise, CBOW is used.
+        hs : int {1,0}
+            If 1, hierarchical softmax will be used for model training.
+            If set to 0, and `negative` is non-zero, negative sampling will be used.
+        negative : int
+            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
+            should be drawn (usually between 5-20).
+            If set to 0, no negative sampling is used.
+        cbow_mean : int {1,0}
+            If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
+        hashfxn : function
+            Hash function to use to randomly initialize weights, for increased training reproducibility.
+        iter : int
+            Number of iterations (epochs) over the corpus.
+        null_word : int {1, 0}
+            If 1, a null pseudo-word will be created for padding when using concatenative L1 (run-of-words).
+        trim_rule : function
+            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
+            be trimmed away, or handled using the default (discard if word count < min_count).
+            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
+            or a callable that accepts parameters (word, count, min_count) and returns either
+            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
+            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
+            of the model.
+        sorted_vocab : int {1,0}
+            If 1, sort the vocabulary by descending frequency before assigning word indexes.
+        batch_words : int
+            Target size (in words) for batches of examples passed to worker threads (and
+            thus cython routines). (Larger batches will be passed if individual
+            texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
+
+        """
+
         self.gensim_model = None
         self.size = size
         self.alpha = alpha
@@ -51,9 +131,21 @@ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=
         self.batch_words = batch_words

     def fit(self, X, y=None):
-        """
-        Fit the model according to the given training data.
-        Calls gensim.models.Word2Vec
+        """Fit the model according to the given training data.
+
+        Parameters
+        ----------
+        X : iterable of iterables of str
+            The input corpus. X can be simply a list of lists of tokens, but for larger corpora,
+            consider an iterable that streams the sentences directly from disk/network.
+ See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` + or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + + Returns + ------- + :class:`~gensim.sklearn_api.w2vmodel.W2VTransformer` + The trained model. + """ self.gensim_model = models.Word2Vec( sentences=X, size=self.size, alpha=self.alpha, @@ -66,8 +158,18 @@ def fit(self, X, y=None): return self def transform(self, words): - """ - Return the word-vectors for the input list of words. + """Return the word vectors the input words. + + Parameters + ---------- + words : iterable of str + A collection of words to be transformed. + + Returns + ------- + np.ndarray of shape (num_words, size) + A 2D array where each row is the vector of one word. + """ if self.gensim_model is None: raise NotFittedError( From 3cf28a372c0e9f588ca08c9537097b7dd104167a Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Wed, 21 Feb 2018 19:50:29 +0100 Subject: [PATCH 17/45] fixed w2v example --- gensim/sklearn_api/w2vmodel.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index f8f222f943..216ff6630c 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -16,11 +16,12 @@ >>> from gensim.sklearn_api import W2VTransformer >>> >>> # Create a model to represent each word by a 10 dimensional vector. - >>> model = W2VTransformer(size=10, min_count=1) - >>> model.fit(common_texts) + >>> model = W2VTransformer(size=10, min_count=1, seed=1) >>> >>> # What is the vector representation of the word 'graph'? - >>> model.transform('graph') + >>> wordvecs = model.fit(common_texts).transform(['graph', 'system']) + >>> assert wordvecs.shape == (2, 10) + """ From b55a2a20aaaf9e626fddf55a9512c47bb6899f29 Mon Sep 17 00:00:00 2001 From: "Stergiadis, E" Date: Thu, 22 Feb 2018 17:11:53 +0100 Subject: [PATCH 18/45] added documentation for the lda sequential model - examples pending --- gensim/sklearn_api/ldaseqmodel.py | 96 +++++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 16 deletions(-) diff --git a/gensim/sklearn_api/ldaseqmodel.py b/gensim/sklearn_api/ldaseqmodel.py index 1328e22af1..ffd8a1c883 100644 --- a/gensim/sklearn_api/ldaseqmodel.py +++ b/gensim/sklearn_api/ldaseqmodel.py @@ -5,11 +5,15 @@ # Copyright (C) 2017 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Scikit learn interface for gensim for easy use of gensim with scikit-learn -Follows scikit-learn API conventions -""" +"""Scikit learn interface for :class:`~gensim.models.ldaseqmodel.LdaSeqModel`. + +Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. +Examples +-------- + + +""" import numpy as np from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError @@ -18,15 +22,60 @@ class LdaSeqTransformer(TransformerMixin, BaseEstimator): - """ - Base LdaSeq module + """Base Sequential LDA module. + + Wraps :class:`~gensim.models.ldaseqmodel.LdaSeqModel`. + For more information on the inner workings please take a look at + the original class. + + """ def __init__(self, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100): - """ - Sklearn wrapper for LdaSeq model. 
See gensim.models.LdaSeqModel for parameter details.
+        """Sklearn wrapper for :class:`~gensim.models.ldaseqmodel.LdaSeqModel` model.
+
+        Parameters
+        ----------
+        time_slice : list of int, optional
+            Contains the number of documents in each time-slice.
+        id2word : dict of (int, str)
+            Mapping from an ID to the word it represents in the vocabulary.
+        alphas : float
+            The prior probability of each topic.
+        num_topics : int
+            Number of latent topics to be discovered in the corpus.
+        initialize : str {'gensim', 'own', 'ldamodel'}
+            Controls the initialization of the DTM model. Supports three different modes:
+            - 'gensim', default: Uses gensim's own LDA initialization.
+            - 'own': You can use your own initialization matrix of an LDA model previously trained by passing it to `sstats`.
+            - 'ldamodel': Use a previously used LDA model, passing it through the `lda_model` argument.
+        sstats : np.ndarray of shape (vocab_len, `num_topics`)
+            If `initialize` is set to 'own' this will be used to initialize the DTM model.
+        lda_model : :class:`~gensim.models.ldamodel.LdaModel`
+            If `initialize` is set to 'ldamodel' this object will be used to create the `sstats` initialization matrix.
+        obs_variance : float
+            Observed variance used to approximate the true and forward variance as shown in [1]_.
+        chain_variance : float
+            Gaussian parameter defined in the beta distribution to dictate how the beta values evolve.
+        passes : int
+            Number of passes over the corpus for the initial :class:`~gensim.models.ldamodel.LdaModel`.
+        random_state : {np.random.RandomState, int}
+            Can be a np.random.RandomState object, or the seed to generate one. Used for reproducibility of results.
+        lda_inference_max_iter : int
+            Maximum number of iterations in the inference step of the LDA training.
+        em_min_iter : int
+            Minimum number of iterations until convergence of the Expectation-Maximization algorithm.
+        em_max_iter : int
+            Maximum number of iterations until convergence of the Expectation-Maximization algorithm.
+        chunksize : int
+            Number of documents in the corpus to be processed in a chunk.
+
+        References
+        ----------
+        .. [1] http://repository.cmu.edu/cgi/viewcontent.cgi?article=2036&context=compsci
+
         """
         self.gensim_model = None
         self.time_slice = time_slice
@@ -46,9 +95,18 @@ def __init__(self, time_slice=None, id2word=None, alphas=0.01, num_topics=10, in
         self.chunksize = chunksize

     def fit(self, X, y=None):
-        """
-        Fit the model according to the given training data.
-        Calls gensim.models.LdaSeqModel
+        """Fit the model according to the given training data.
+
+        Parameters
+        ----------
+        X : {iterable of iterable of (int, int), scipy.sparse matrix}
+            A collection of documents in BOW format used for training the model.
+
+        Returns
+        -------
+        :class:`~gensim.sklearn_api.ldaseqmodel.LdaSeqTransformer`
+            The trained model.
+
         """
         self.gensim_model = models.LdaSeqModel(
             corpus=X, time_slice=self.time_slice, id2word=self.id2word,
@@ -61,11 +119,17 @@ def fit(self, X, y=None):
         return self

     def transform(self, docs):
         """
-        Return the topic proportions for the documents passed.
-        The input `docs` should be in BOW format and can be a list of documents like
-        [[(4, 1), (7, 1)],
-        [(9, 1), (13, 1)], [(2, 1), (6, 1)]]
-        or a single document like : [(4, 1), (7, 1)]
+        Return the topic proportions for the documents passed.
+
+        Parameters
+        ----------
+        docs : {iterable of iterable of (int, int), scipy.sparse matrix}
+            A collection of documents in BOW format to be transformed.
+
+        Returns
+        -------
+        np.ndarray of shape (`len(docs)`, `num_topics`)
+            The topic representation of each document.
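+
+        Examples
+        --------
+        A minimal sketch, assuming a transformer already fitted with `num_topics=2` and
+        `time_slice=[3, 3, 3]` on gensim's bundled `common_corpus`:
+
+        >>> topic_dist = model.transform(common_corpus[:2])
+        >>> assert topic_dist.shape == (2, 2)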
+ """ if self.gensim_model is None: raise NotFittedError( From b0600cd809a866c23db12d518501feb51b6c03a4 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Sat, 24 Feb 2018 18:58:42 +0100 Subject: [PATCH 19/45] added documentation for the author topic sklearn wrapper including example --- gensim/sklearn_api/atmodel.py | 153 ++++++++++++++++++++++++++++++---- 1 file changed, 138 insertions(+), 15 deletions(-) diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py index 792e950491..41e117c448 100644 --- a/gensim/sklearn_api/atmodel.py +++ b/gensim/sklearn_api/atmodel.py @@ -5,10 +5,27 @@ # Copyright (C) 2017 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""Scikit learn interface for :class:`~gensim.models.atmodel.AuthorTopicModel`. + +Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. + +Examples +-------- + + >>> from gensim.test.utils import common_texts, common_dictionary, common_corpus + >>> from gensim.sklearn_api.atmodel import AuthorTopicTransformer + >>> + >>> # Pass a mapping from authors to the documents they contributed to. + >>> author2doc = {'john': [0, 1, 2, 3, 4, 5, 6], 'jane': [2, 3, 4, 5, 6, 7, 8], 'jack': [0, 2, 4, 6, 8]} + + >>> # Lets use the model to discover 2 different topics. + >>> model = AuthorTopicTransformer(id2word=common_dictionary, author2doc=author2doc, num_topics=2, passes=100) + >>> + >>> # In which of those 2 topics does jack mostly contribute to? + >>> jacks_topic_distr = model.fit(common_corpus).transform('jack') + """ -Scikit learn interface for gensim for easy use of gensim with scikit-learn -Follows scikit-learn API conventions -""" + import numpy as np from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError @@ -18,8 +35,18 @@ class AuthorTopicTransformer(TransformerMixin, BaseEstimator): - """ - Base AuthorTopic module + """Base Word2Vec module. + + Wraps :class:`~gensim.models.atmodel.AuthorTopicModel`. + For more information on the inner workings please take a look at + the original class. The model's internal workings are heavily based on _[1]. + + References + ---------- + .. [1] Matthew D. Hoffman, David M. Blei, Francis Bach, "Online Learning for Latent Dirichlet Allocation", + NIPS'10 Proceedings of the 23rd International Conference on Neural Information Processing Systems - + Volume 1 Pages 856-864, https://www.di.ens.fr/~fbach/mdhnips2010.pdf + """ def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=None, @@ -28,7 +55,63 @@ def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=Non gamma_threshold=0.001, serialized=False, serialization_path=None, minimum_probability=0.01, random_state=None): """ - Sklearn wrapper for AuthorTopic model. See gensim.models.AuthorTopicModel for parameter details. + Parameters + ---------- + num_topics : int, optional + Number of requested latent topics to be extracted from the training corpus. + id2word : dict of (int, str), optional + Mapping from a words' ID to the word itself. Used to determine the vocabulary size, + as well as for debugging and topic printing. + author2doc : dict(str, list of int), optional + Maps an authors name to a list of document IDs where has has contributed. + Either `author2doc` or `doc2author` **MUST** be supplied. + doc2author : dict of (int, list of str) + Maps a document (using its ID) to a list of author names that contributed to it. 
+ Either `author2doc` or `doc2author` **MUST** be supplied. + chunksize : int, optional + Number of documents to be processed by the model in each mini-batch. + passes : int, optional + Number of times the model can make a pass over the corpus during training. + iterations : int, optional + Maximum number of times the model before convergence during the M step + of the EM algorithm. + decay : float, optional + A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten + when each new document is examined. Corresponds to Kappa from [1]_. + offset : float, optional + Hyper-parameter that controls how much we will slow down the first steps the first few iterations. + Corresponds to Tau_0 from [1]_. + alpha : {np.array, str}, optional + Can be set to an 1D array of length equal to the number of expected topics that expresses + our a-priori belief for the each topics' probability. + Alternatively default prior selecting strategies can be employed by supplying a string: + 'asymmetric': Uses a fixed normalized assymetric prior of `1.0 / topicno`. + 'default': Learns an assymetric prior from the corpus. + eta : {float, np.array, str}, optional + A-priori belief on word probability. This can be: + a scalar for a symmetric prior over topic/word probability. + a vector : of length num_words to denote an asymmetric user defined probability for each word. + a matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination. + the string 'auto' to learn the asymmetric prior from the data. + update_every : int, optional + Number of mini-batches between each model update. + eval_every : int, optional + Number of updates between two log perplexity estimates. + Set to None to disable perplexity estimation. + gamma_threshold : float, optional + Minimum change in the value of the gamma parameters to continue iterating. + serialized : bool, optional + Indicates whether the input corpora to the model are simple in-memory lists (`serialized = False`) + or saved to the hard-drive (`serialized = True`). Note that this behaviour is quite different from + other Gensim models. If your data is too large to fit in to memory, use this functionality. + serialization_path : str, optional + Filepath to be used for storing the serialized object. **Must** be supplied if `serialized = True`. + An existing file *cannot* be overwritten; either delete the old file or choose a different name + minimum_probability : float, optional + Topics with a probability lower than this threshold will be filtered out. + random_state : {np.random.RandomState, int}, optional + Either a randomState object or a seed to generate one. Useful for reproducibility. + """ self.gensim_model = None self.num_topics = num_topics @@ -51,9 +134,18 @@ def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=Non self.random_state = random_state def fit(self, X, y=None): - """ - Fit the model according to the given training data. - Calls gensim.models.AuthorTopicModel + """Fit the model according to the given training data. + + Parameters + ---------- + X : {iterable of iterable of (int, int), :class:`~gensim.corpora.mmcorpus.MmCorpus`} + A collection of documents in BOW format used for training the model. + + Returns + ------- + :class:`~gensim.sklearn_api.atmodel.AuthorTopicTransformer` + The trained model. 
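+
+        Examples
+        --------
+        A minimal sketch, reusing `author2doc`, `common_dictionary` and `common_corpus` from the
+        module level example above:
+
+        >>> model = AuthorTopicTransformer(id2word=common_dictionary, author2doc=author2doc, num_topics=2)
+        >>> topic_dist = model.fit(common_corpus).transform(['jack', 'jane'])
+        >>> assert topic_dist.shape == (2, 2)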
+ """ self.gensim_model = models.AuthorTopicModel( corpus=X, num_topics=self.num_topics, id2word=self.id2word, @@ -66,16 +158,25 @@ def fit(self, X, y=None): return self def transform(self, author_names): + """Find the topic probabilities for each author. + + Parameters + ---------- + author_names : iterable of str + A collection of authors whose topics will be identified. + + Returns + ------- + iterable of (int, float) + Topic distribution for each input author as a tuple of (topic_id, topic_probability). + """ - Return topic distribution for input authors as a list of - (topic_id, topic_probabiity) 2-tuples. - """ - # The input as array of array if self.gensim_model is None: raise NotFittedError( "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." ) + # The input as array of arrays if not isinstance(author_names, list): author_names = [author_names] # returning dense representation for compatibility with sklearn @@ -84,8 +185,30 @@ def transform(self, author_names): return np.reshape(np.array(topics), (len(author_names), self.num_topics)) def partial_fit(self, X, author2doc=None, doc2author=None): - """ - Train model over X. + """Train model over a potentially incomplete set of documents. + + Uses the parameters set in the constructor. + This method can be used in two ways: + 1. On an unfitted model in which case the model is initialized and trained on `X`. + 2. On an already fitted model in which case the model is **updated** by `X`. Additional authors + can be passed using `author2doc` or `doc2author` + + Parameters + ---------- + X : {iterable of iterable of (int, int), :class:`~gensim.corpora.mmcorpus.MmCorpus`} + A collection of documents in BOW format used for training the model. + author2doc : dict(str, list of int), optional + Maps an authors name to a list of document IDs corresponding to indexes in input corpus. + Either `author2doc` or `doc2author` **MUST** be supplied. + doc2author : dict of (int, list of str), optional + Maps a document (using its ID) to a list of author names corresponding to indexes in input corpus. + Either `author2doc` or `doc2author` **MUST** be supplied. + + Returns + ------- + :class:`~gensim.sklearn_api.atmodel.AuthorTopicTransformer` + The trained model. + """ if self.gensim_model is None: self.gensim_model = models.AuthorTopicModel( From e2ca72f3fe333d86eddcdc28bdf46515c1ebe223 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Sat, 24 Feb 2018 19:00:51 +0100 Subject: [PATCH 20/45] improved example by presenting a way to get a pipeline score --- gensim/sklearn_api/lsimodel.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py index 3d1c82dc5d..1ad9194949 100644 --- a/gensim/sklearn_api/lsimodel.py +++ b/gensim/sklearn_api/lsimodel.py @@ -31,7 +31,9 @@ >>> # Fit our pipeline to some corpus >>> corpus = [id2word.doc2bow(i.split()) for i in data.data] - >>> fitted_pipeline = pipe.fit(corpus, data.target) + + >>> # How well does our pipeline perform on the training set? 
+ >>> score = pipe.fit(corpus, data.target).score(corpus, data.target) """ From f66abbb425a4b819fdf5dc1b32340f97ea023b9c Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Sat, 24 Feb 2018 19:03:39 +0100 Subject: [PATCH 21/45] improved example using similarities --- gensim/sklearn_api/d2vmodel.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 2cdd887458..e6d347a34d 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -13,11 +13,19 @@ >>> from gensim.test.utils import common_texts >>> from gensim.sklearn_api import D2VTransformer + >>> from gensim.similarities import Similarity >>> >>> # Lets represent each document using a 50 dimensional vector - >>> model = D2VTransformer(min_count=1, size=50) + >>> model = D2VTransformer(min_count=1, size=5) >>> docvecs = model.fit_transform(common_texts) - >>> assert docvecs.shape == (len(common_texts), 50) + >>> + >>> # Let's use the vector representations to compute similarities with one of the documents. + >>> index = Similarity(None, docvecs, num_features=5) + >>> + >>> # Which document is most similar to the last one in the corpus? Probably itself! + >>> result = index[docvecs[8]] + >>> result.argmax() + 8 """ From e4dc868a0ab2623b980bcfb9c68b89ef815a64f8 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Sat, 24 Feb 2018 20:11:13 +0100 Subject: [PATCH 22/45] added documentation and examples for the rp and hdp models --- gensim/sklearn_api/atmodel.py | 3 +- gensim/sklearn_api/hdp.py | 118 +++++++++++++++++++++++++++++----- gensim/sklearn_api/rpmodel.py | 73 +++++++++++++++------ 3 files changed, 158 insertions(+), 36 deletions(-) diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py index 41e117c448..67c77d9833 100644 --- a/gensim/sklearn_api/atmodel.py +++ b/gensim/sklearn_api/atmodel.py @@ -54,7 +54,8 @@ def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=Non alpha='symmetric', eta='symmetric', update_every=1, eval_every=10, gamma_threshold=0.001, serialized=False, serialization_path=None, minimum_probability=0.01, random_state=None): - """ + """Sklearn wrapper for Author-Topic model. + Parameters ---------- num_topics : int, optional diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py index 80bb13e19d..2e6055ede1 100644 --- a/gensim/sklearn_api/hdp.py +++ b/gensim/sklearn_api/hdp.py @@ -4,9 +4,21 @@ # Copyright (C) 2011 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Scikit learn interface for gensim for easy use of gensim with scikit-learn -Follows scikit-learn API conventions +"""Scikit learn interface for :class:`~gensim.models.hdpmodel.HdpModel`. + +Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. + + +Examples +-------- + + >>> from gensim.test.utils import common_dictionary, common_corpus + >>> from gensim.sklearn_api import HdpTransformer + >>> + >>> # Lets extract the distribution of each document in topics + >>> model = HdpTransformer(id2word=common_dictionary) + >>> distr = model.fit_transform(common_corpus) + """ import numpy as np @@ -19,14 +31,57 @@ class HdpTransformer(TransformerMixin, BaseEstimator): - """ - Base HDP module + """Base Word2Vec module. + + Wraps :class:`~gensim.models.hdpmodel.HdpModel. + For more information on the inner workings please take a look at + the original class. The inner workings of this class heavily depends on _[1]. 
+ + References + ---------- + .. [1] Wang, Paisley, Blei: Online Variational Inference for the Hierarchical Dirichlet + Process, JMLR (2011), http://jmlr.csail.mit.edu/proceedings/papers/v15/wang11a/wang11a.pdf + """ def __init__(self, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, outputdir=None, random_state=None): """ Sklearn api for HDP model. See gensim.models.HdpModel for parameter details. + + + id2word : :class:`~gensim.corpora.dictionary.Dictionary` + Mapping between a words ID and the word itself in the vocabulary. + max_chunks : int, optional + Upper bound on how many chunks to process.It wraps around corpus beginning in another corpus pass, + if there are not enough chunks in the corpus + max_time : int, optional + Upper bound on time in seconds for which model will be trained. + chunksize : int, optional + Number of documents to be processed by the model in each mini-batch. + kappa : float, optional + Learning rate. See _[1]. + tau : float, optional + Slow down parameter. + K : int, optional + Second level truncation level. See _[1]. + T : int, optional + Top level truncation level. See _[1]. + alpha : int, optional + Second level concentration. See _[1]. + gamma : int, optional + First level concentration. See _[1]. + eta : float, optional + The topic Dirichlet. See _[1]. + scale : float, optional + Weights information from the mini-chunk of corpus to calculate rhot. + var_converge : float, optional + Lower bound on the right side of convergence. Used when updating variational parameters for a + single document. + outputdir : str, optional + Path to a directory where topic and options information will be stored. + random_state : int, optional + Seed used to create a :class:`~np.random.RandomState`. Useful for obtaining reproducible results. """ self.gensim_model = None self.id2word = id2word @@ -46,9 +101,18 @@ def __init__(self, id2word, max_chunks=None, max_time=None, chunksize=256, kappa self.random_state = random_state def fit(self, X, y=None): - """ - Fit the model according to the given training data. - Calls gensim.models.HdpModel + """Fit the model according to the given training data. + + Parameters + ---------- + X : {iterable of iterable of (int, int), scipy.sparse matrix} + A collection of documents in BOW format used for training the model. + + Returns + ------- + :class:`~gensim.sklearn_api.hdp.HdpTransformer` + The trained model. + """ if sparse.issparse(X): corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False) @@ -64,14 +128,19 @@ def fit(self, X, y=None): return self def transform(self, docs): - """ - Takes a list of documents as input ('docs'). - Returns a matrix of topic distribution for the given document bow, where a_ij + """Returns a matrix of topic distribution for the given document bow, where a_ij indicates (topic_i, topic_probability_j). - The input `docs` should be in BOW format and can be a list of documents like - [[(4, 1), (7, 1)], - [(9, 1), (13, 1)], [(2, 1), (6, 1)]] - or a single document like : [(4, 1), (7, 1)] + + Parameters + ---------- + docs : iterable of iterable of (int, int) + A list of documents in BOW format to be transformed. + + Returns + ------- + np.ndarray of shape (`len(docs), num_topics`) + Topic distribution for each input document. 
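+
+        Examples
+        --------
+        A minimal sketch, reusing the `model` fitted in the module level example above. The number
+        of topics HDP infers is data dependent, so only the document axis is asserted:
+
+        >>> distr = model.transform(common_corpus[:2])
+        >>> assert distr.shape[0] == 2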
+ """ if self.gensim_model is None: raise NotFittedError( @@ -94,8 +163,23 @@ def transform(self, docs): return np.reshape(np.array(distribution), (len(docs), max_num_topics)) def partial_fit(self, X): - """ - Train model over X. + """Train model over a potentially incomplete set of documents. + + Uses the parameters set in the constructor. + This method can be used in two ways: + 1. On an unfitted model in which case the model is initialized and trained on `X`. + 2. On an already fitted model in which case the model is **updated** by `X`. + + Parameters + ---------- + X : {iterable of iterable of (int, int), scipy.sparse matrix} + A collection of documents in BOW format used for training the model. + + Returns + ------- + :class:`~gensim.sklearn_api.hdp.HdpTransformer` + The trained model. + """ if sparse.issparse(X): X = matutils.Sparse2Corpus(sparse=X, documents_columns=False) diff --git a/gensim/sklearn_api/rpmodel.py b/gensim/sklearn_api/rpmodel.py index c2f50f5d0f..7382d3ebfd 100644 --- a/gensim/sklearn_api/rpmodel.py +++ b/gensim/sklearn_api/rpmodel.py @@ -5,9 +5,22 @@ # Copyright (C) 2017 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Scikit learn interface for gensim for easy use of gensim with scikit-learn -Follows scikit-learn API conventions +"""Scikit learn interface for :class:`~gensim.models.rpmodel.RpModel`. + +Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. + +Examples +-------- + + >>> from gensim.sklearn_api.rpmodel import RpTransformer + >>> from gensim.test.utils import common_dictionary, common_corpus + >>> + >>> # Initialize and fit the model. + >>> model = RpTransformer(id2word=common_dictionary).fit(common_corpus) + >>> + >>> # Use the trained model to transform a document. + >>> result = model.transform(common_corpus[3]) + """ import numpy as np @@ -19,34 +32,58 @@ class RpTransformer(TransformerMixin, BaseEstimator): - """ - Base RP module - """ + """Base Word2Vec module. + + Wraps :class:`~gensim.models.rpmodel.RpModel`. + For more information on the inner workings please take a look at + the original class. + """ def __init__(self, id2word=None, num_topics=300): - """ - Sklearn wrapper for RP model. See gensim.models.RpModel for parameter details. + """Sklearn wrapper for Random Projections model. + + Parameters + ---------- + id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional + Mapping `token_id` -> `token`, will be determined from corpus if `id2word == None`. + num_topics : int, optional + Number of topics. + """ self.gensim_model = None self.id2word = id2word self.num_topics = num_topics def fit(self, X, y=None): - """ - Fit the model according to the given training data. - Calls gensim.models.RpModel + """Fit the model according to the given training data. + + Parameters + ---------- + X : iterable of iterable of (int, int) + Input corpus in BOW format. + + Returns + ------- + :class:`~gensim.sklearn_api.rpmodel.RpTransformer` + The trained model. + """ self.gensim_model = models.RpModel(corpus=X, id2word=self.id2word, num_topics=self.num_topics) return self def transform(self, docs): - """ - Take documents/corpus as input. - Return RP representation of the input documents/corpus. 
-        The input `docs` can correspond to multiple documents like
-        [[(0, 1.0), (1, 1.0), (2, 1.0)],
-        [(0, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0)]]
-        or a single document like : [(0, 1.0), (1, 1.0), (2, 1.0)]
+        """Find the RP representation for the input documents.
+
+        Parameters
+        ----------
+        docs : iterable of iterable of (int, int)
+            Documents to be transformed in BOW format.
+
+        Returns
+        -------
+        np.ndarray of shape (`len(docs)`, num_topics)
+            RP representation for each input document.
+
         """
         if self.gensim_model is None:
             raise NotFittedError(

From 8df7ce5a9094ffe991566e3fab9309cc32c9698b Mon Sep 17 00:00:00 2001
From: Manos Stergiadis 
Date: Sun, 25 Feb 2018 10:07:21 +0100
Subject: [PATCH 23/45] minor example improvements

---
 gensim/sklearn_api/hdp.py      | 10 +++++-----
 gensim/sklearn_api/lsimodel.py | 20 +++++++-------------
 gensim/sklearn_api/w2vmodel.py |  6 +++---
 3 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py
index 2e6055ede1..cbfb63b09a 100644
--- a/gensim/sklearn_api/hdp.py
+++ b/gensim/sklearn_api/hdp.py
@@ -31,7 +31,7 @@

 class HdpTransformer(TransformerMixin, BaseEstimator):
-    """Base Word2Vec module.
+    """Base HDP module.

     Wraps :class:`~gensim.models.hdpmodel.HdpModel.
     For more information on the inner workings please take a look at
@@ -46,11 +46,11 @@ class HdpTransformer(TransformerMixin, BaseEstimator):

     def __init__(self, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0,
                  K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001,
                  outputdir=None, random_state=None):
-        """
-        Sklearn api for HDP model. See gensim.models.HdpModel for parameter details.
-
+        """Sklearn api for HDP model.

-        id2word : :class:`~gensim.corpora.dictionary.Dictionary`
+        Parameters
+        ----------
+        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}
             Mapping between a words ID and the word itself in the vocabulary.

diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py
index 1ad9194949..71f34118c9 100644
--- a/gensim/sklearn_api/lsimodel.py
+++ b/gensim/sklearn_api/lsimodel.py
@@ -15,25 +15,19 @@

     >>> from sklearn.pipeline import Pipeline
     >>> from sklearn import linear_model
-    >>> from sklearn.datasets import fetch_20newsgroups
+    >>> import numpy as np
+    >>> from gensim.test.utils import common_corpus, common_dictionary
     >>> from gensim.sklearn_api import LsiTransformer
-    >>> from gensim.corpora import Dictionary
     >>>
-    >>> # Create an ID to word mapping using some corpus included in sklearn.
-    >>> cats = ['rec.sport.baseball', 'sci.crypt']
-    >>> data = fetch_20newsgroups(subset='train', categories=cats, shuffle=True)
-    >>> id2word = Dictionary([_.split() for _ in data.data])
-    >>> # Create stages for our pipeline (including gensim and sklearn models alike).
-    >>> model = LsiTransformer(num_topics=15, id2word=id2word)
+    >>> # Create stages for our pipeline (including gensim and sklearn models alike).
+    >>> model = LsiTransformer(num_topics=15, id2word=common_dictionary)
     >>> clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
     >>> pipe = Pipeline([('features', model,), ('classifier', clf)])
-
-    >>> # Fit our pipeline to some corpus
-    >>> corpus = [id2word.doc2bow(i.split()) for i in data.data]
-
+    >>>
+    >>> # Create some random binary labels for our documents.
+    >>> labels = np.random.choice([0, 1], len(common_corpus))
+    >>>
    >>> # How well does our pipeline perform on the training set?
- >>> score = pipe.fit(corpus, data.target).score(corpus, data.target) + >>> score = pipe.fit(common_corpus, labels).score(common_corpus, labels) """ diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 216ff6630c..db877e87d0 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -12,6 +12,7 @@ Examples -------- + >>> from gensim.test.utils import common_texts >>> from gensim.sklearn_api import W2VTransformer >>> @@ -22,7 +23,6 @@ >>> wordvecs = model.fit(common_texts).transform(['graph', 'system']) >>> assert wordvecs.shape == (2, 10) - """ @@ -87,8 +87,8 @@ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size= If set to 0, no negative sampling is used. cbow_mean : int {1,0} If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. - hashfxn : function - Hash function to use to randomly initialize weights, for increased training reproducibility. + hashfxn : callable (object -> int), optional + A hashing function. Used to create an initial random reproducible vector by hashing the random seed. iter : int Number of iterations (epochs) over the corpus. null_word : int {1, 0} From dc33b917b93dd67f6b81eedddd65d7852cbb6819 Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Sun, 25 Feb 2018 10:26:02 +0100 Subject: [PATCH 24/45] fixed reference --- gensim/sklearn_api/atmodel.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py index 67c77d9833..771eb0f88f 100644 --- a/gensim/sklearn_api/atmodel.py +++ b/gensim/sklearn_api/atmodel.py @@ -17,7 +17,7 @@ >>> >>> # Pass a mapping from authors to the documents they contributed to. >>> author2doc = {'john': [0, 1, 2, 3, 4, 5, 6], 'jane': [2, 3, 4, 5, 6, 7, 8], 'jack': [0, 2, 4, 6, 8]} - + >>> >>> # Lets use the model to discover 2 different topics. >>> model = AuthorTopicTransformer(id2word=common_dictionary, author2doc=author2doc, num_topics=2, passes=100) >>> @@ -35,7 +35,7 @@ class AuthorTopicTransformer(TransformerMixin, BaseEstimator): - """Base Word2Vec module. + """Base Author Topic module. Wraps :class:`~gensim.models.atmodel.AuthorTopicModel`. For more information on the inner workings please take a look at @@ -43,9 +43,7 @@ class AuthorTopicTransformer(TransformerMixin, BaseEstimator): References ---------- - .. [1] Matthew D. Hoffman, David M. Blei, Francis Bach, "Online Learning for Latent Dirichlet Allocation", - NIPS'10 Proceedings of the 23rd International Conference on Neural Information Processing Systems - - Volume 1 Pages 856-864, https://www.di.ens.fr/~fbach/mdhnips2010.pdf + .. [1] Osen-Zvi et. al 2004, https://mimno.infosci.cornell.edu/info6150/readings/398.pdf. """ From 836af6fecbfc784309d36e0ddc1d63cce3ec54fd Mon Sep 17 00:00:00 2001 From: Manos Stergiadis Date: Sun, 25 Feb 2018 10:57:13 +0100 Subject: [PATCH 25/45] removed reference --- gensim/sklearn_api/atmodel.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py index 771eb0f88f..802ceea5ad 100644 --- a/gensim/sklearn_api/atmodel.py +++ b/gensim/sklearn_api/atmodel.py @@ -39,11 +39,7 @@ class AuthorTopicTransformer(TransformerMixin, BaseEstimator): Wraps :class:`~gensim.models.atmodel.AuthorTopicModel`. For more information on the inner workings please take a look at - the original class. The model's internal workings are heavily based on _[1]. - - References - ---------- - .. 
[1] Osen-Zvi et. al 2004, https://mimno.infosci.cornell.edu/info6150/readings/398.pdf. + the original class. """ @@ -76,10 +72,9 @@ def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=Non of the EM algorithm. decay : float, optional A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten - when each new document is examined. Corresponds to Kappa from [1]_. + when each new document is examined. offset : float, optional Hyper-parameter that controls how much we will slow down the first steps the first few iterations. - Corresponds to Tau_0 from [1]_. alpha : {np.array, str}, optional Can be set to an 1D array of length equal to the number of expected topics that expresses our a-priori belief for the each topics' probability. From 4a3ce08f9ba224cd89537de7a8e8b24866667518 Mon Sep 17 00:00:00 2001 From: ivan Date: Tue, 27 Feb 2018 13:54:05 +0500 Subject: [PATCH 26/45] fix doc building --- gensim/sklearn_api/atmodel.py | 70 +++++++++++++++++----------------- gensim/sklearn_api/d2vmodel.py | 30 +++++++-------- gensim/sklearn_api/hdp.py | 33 ++++++++-------- gensim/sklearn_api/ldamodel.py | 52 ++++++++++++------------- gensim/sklearn_api/lsimodel.py | 30 +++++++-------- gensim/sklearn_api/phrases.py | 28 +++++++------- gensim/sklearn_api/rpmodel.py | 16 ++++---- gensim/sklearn_api/text2bow.py | 23 ++++++----- gensim/sklearn_api/tfidf.py | 16 ++++---- gensim/sklearn_api/w2vmodel.py | 18 ++++----- 10 files changed, 156 insertions(+), 160 deletions(-) diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py index 771eb0f88f..7667d25bb6 100644 --- a/gensim/sklearn_api/atmodel.py +++ b/gensim/sklearn_api/atmodel.py @@ -11,21 +11,23 @@ Examples -------- - - >>> from gensim.test.utils import common_texts, common_dictionary, common_corpus - >>> from gensim.sklearn_api.atmodel import AuthorTopicTransformer - >>> - >>> # Pass a mapping from authors to the documents they contributed to. - >>> author2doc = {'john': [0, 1, 2, 3, 4, 5, 6], 'jane': [2, 3, 4, 5, 6, 7, 8], 'jack': [0, 2, 4, 6, 8]} - >>> - >>> # Lets use the model to discover 2 different topics. - >>> model = AuthorTopicTransformer(id2word=common_dictionary, author2doc=author2doc, num_topics=2, passes=100) - >>> - >>> # In which of those 2 topics does jack mostly contribute to? - >>> jacks_topic_distr = model.fit(common_corpus).transform('jack') +>>> from gensim.test.utils import common_texts, common_dictionary, common_corpus +>>> from gensim.sklearn_api.atmodel import AuthorTopicTransformer +>>> +>>> # Pass a mapping from authors to the documents they contributed to. +>>> author2doc = { +... 'john': [0, 1, 2, 3, 4, 5, 6], +... 'jane': [2, 3, 4, 5, 6, 7, 8], +... 'jack': [0, 2, 4, 6, 8] +... } +>>> +>>> # Lets use the model to discover 2 different topics. +>>> model = AuthorTopicTransformer(id2word=common_dictionary, author2doc=author2doc, num_topics=2, passes=100) +>>> +>>> # In which of those 2 topics does jack mostly contribute to? +>>> topic_dist = model.fit(common_corpus).transform('jack') """ - import numpy as np from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError @@ -35,24 +37,19 @@ class AuthorTopicTransformer(TransformerMixin, BaseEstimator): - """Base Author Topic module. - - Wraps :class:`~gensim.models.atmodel.AuthorTopicModel`. - For more information on the inner workings please take a look at - the original class. The model's internal workings are heavily based on _[1]. 
+ """Base Author Topic module, wraps :class:`~gensim.models.atmodel.AuthorTopicModel`. - References - ---------- - .. [1] Osen-Zvi et. al 2004, https://mimno.infosci.cornell.edu/info6150/readings/398.pdf. + For more information on the inner workings please take a look at the original class. The model's internal workings + are heavily based on `"The Author-Topic Model for Authors and Documents", Osen-Zvi et. al 2004 + `_. """ - def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=None, chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0, alpha='symmetric', eta='symmetric', update_every=1, eval_every=10, gamma_threshold=0.001, serialized=False, serialization_path=None, minimum_probability=0.01, random_state=None): - """Sklearn wrapper for Author-Topic model. + """ Parameters ---------- @@ -76,22 +73,24 @@ def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=Non of the EM algorithm. decay : float, optional A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten - when each new document is examined. Corresponds to Kappa from [1]_. + when each new document is examined. Corresponds to Kappa from `"The Author-Topic Model for Authors + and Documents", Osen-Zvi et. al 2004 `_. offset : float, optional Hyper-parameter that controls how much we will slow down the first steps the first few iterations. - Corresponds to Tau_0 from [1]_. + Corresponds to Tau_0 from `"The Author-Topic Model for Authors and Documents", Osen-Zvi et. al 2004 + `_. alpha : {np.array, str}, optional Can be set to an 1D array of length equal to the number of expected topics that expresses our a-priori belief for the each topics' probability. Alternatively default prior selecting strategies can be employed by supplying a string: - 'asymmetric': Uses a fixed normalized assymetric prior of `1.0 / topicno`. - 'default': Learns an assymetric prior from the corpus. + 'asymmetric': Uses a fixed normalized assymetric prior of `1.0 / topicno`. + 'default': Learns an assymetric prior from the corpus. eta : {float, np.array, str}, optional - A-priori belief on word probability. This can be: - a scalar for a symmetric prior over topic/word probability. - a vector : of length num_words to denote an asymmetric user defined probability for each word. - a matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination. - the string 'auto' to learn the asymmetric prior from the data. + A-priori belief on word probability, this can be: + * scalar for a symmetric prior over topic/word probability, + * vector : of length num_words to denote an asymmetric user defined probability for each word, + * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination, + * the string 'auto' to learn the asymmetric prior from the data. update_every : int, optional Number of mini-batches between each model update. eval_every : int, optional @@ -186,11 +185,10 @@ def transform(self, author_names): def partial_fit(self, X, author2doc=None, doc2author=None): """Train model over a potentially incomplete set of documents. - Uses the parameters set in the constructor. This method can be used in two ways: - 1. On an unfitted model in which case the model is initialized and trained on `X`. - 2. On an already fitted model in which case the model is **updated** by `X`. 
Additional authors - can be passed using `author2doc` or `doc2author` + * On an unfitted model in which case the model is initialized and trained on `X`. + * On an already fitted model in which case the model is **updated** by `X`. + Parameters ---------- diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index e6d347a34d..9aa015fe6f 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -11,21 +11,21 @@ Examples -------- - >>> from gensim.test.utils import common_texts - >>> from gensim.sklearn_api import D2VTransformer - >>> from gensim.similarities import Similarity - >>> - >>> # Lets represent each document using a 50 dimensional vector - >>> model = D2VTransformer(min_count=1, size=5) - >>> docvecs = model.fit_transform(common_texts) - >>> - >>> # Let's use the vector representations to compute similarities with one of the documents. - >>> index = Similarity(None, docvecs, num_features=5) - >>> - >>> # Which document is most similar to the last one in the corpus? Probably itself! - >>> result = index[docvecs[8]] - >>> result.argmax() - 8 +>>> from gensim.test.utils import common_texts +>>> from gensim.sklearn_api import D2VTransformer +>>> from gensim.similarities import Similarity +>>> +>>> # Lets represent each document using a 50 dimensional vector +>>> model = D2VTransformer(min_count=1, size=5) +>>> docvecs = model.fit_transform(common_texts) +>>> +>>> # Let's use the vector representations to compute similarities with one of the documents. +>>> index = Similarity(None, docvecs, num_features=5) +>>> +>>> # Which document is most similar to the last one in the corpus? Probably itself! +>>> result = index[docvecs[8]] +>>> result.argmax() +8 """ diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py index cbfb63b09a..88fd39bece 100644 --- a/gensim/sklearn_api/hdp.py +++ b/gensim/sklearn_api/hdp.py @@ -33,14 +33,11 @@ class HdpTransformer(TransformerMixin, BaseEstimator): """Base HDP module. - Wraps :class:`~gensim.models.hdpmodel.HdpModel. + Wraps :class:`~gensim.models.hdpmodel.HdpModel`. For more information on the inner workings please take a look at - the original class. The inner workings of this class heavily depends on _[1]. - - References - ---------- - .. [1] Wang, Paisley, Blei: Online Variational Inference for the Hierarchical Dirichlet - Process, JMLR (2011), http://jmlr.csail.mit.edu/proceedings/papers/v15/wang11a/wang11a.pdf + the original class. The inner workings of this class heavily depends on `Wang, Paisley, Blei: "Online Variational + Inference for the Hierarchical Dirichlet Process, JMLR (2011)" + `_. """ @@ -60,19 +57,25 @@ def __init__(self, id2word, max_chunks=None, max_time=None, chunksize=256, kappa chunksize : int, optional Number of documents to be processed by the model in each mini-batch. kappa : float, optional - Learning rate. See _[1]. + Learning rate, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical Dirichlet + Process, JMLR (2011)" `_. tau : float, optional Slow down parameter. K : int, optional - Second level truncation level. See _[1]. + Second level truncation level, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical + Dirichlet Process, JMLR (2011)" `_. T : int, optional - Top level truncation level. See _[1]. + Top level truncation level, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical + Dirichlet Process, JMLR (2011)" `_. alpha : int, optional - Second level concentration. See _[1]. 
+ Second level concentration, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical + Dirichlet Process, JMLR (2011)" `_. gamma : int, optional - First level concentration. See _[1]. + First level concentration, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical + Dirichlet Process, JMLR (2011)" `_. eta : float, optional - The topic Dirichlet. See _[1]. + The topic Dirichlet, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical + Dirichlet Process, JMLR (2011)" `_. scale : float, optional Weights information from the mini-chunk of corpus to calculate rhot. var_converge : float, optional @@ -167,8 +170,8 @@ def partial_fit(self, X): Uses the parameters set in the constructor. This method can be used in two ways: - 1. On an unfitted model in which case the model is initialized and trained on `X`. - 2. On an already fitted model in which case the model is **updated** by `X`. + * On an unfitted model in which case the model is initialized and trained on `X`. + * On an already fitted model in which case the model is **updated** by `X`. Parameters ---------- diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py index 65de219d80..50b57d2f1f 100644 --- a/gensim/sklearn_api/ldamodel.py +++ b/gensim/sklearn_api/ldamodel.py @@ -12,13 +12,13 @@ Examples -------- - >>> from gensim.test.utils import common_corpus, common_dictionary - >>> from gensim.sklearn_api import LdaTransformer - >>> - >>> # Reduce each document to 2 dimensions (topics) using the sklearn interface. - >>> model = LdaTransformer(num_topics=2, id2word=common_dictionary, iterations=20, random_state=1) - >>> docvecs = model.fit_transform(common_corpus) - >>> assert docvecs.shape == (len(common_corpus), 2) +>>> from gensim.test.utils import common_corpus, common_dictionary +>>> from gensim.sklearn_api import LdaTransformer +>>> +>>> # Reduce each document to 2 dimensions (topics) using the sklearn interface. +>>> model = LdaTransformer(num_topics=2, id2word=common_dictionary, iterations=20, random_state=1) +>>> docvecs = model.fit_transform(common_corpus) +>>> assert docvecs.shape == (len(common_corpus), 2) """ @@ -35,8 +35,7 @@ class LdaTransformer(TransformerMixin, BaseEstimator): """Base LDA module. Wraps :class:`~gensim.models.ldamodel.LdaModel`. - For more information on the inner workings please take a look at - the original class. + For more information on the inner workings please take a look at the original class. """ @@ -44,16 +43,14 @@ def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, updat eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None, scorer='perplexity', dtype=np.float32): - """Sklearn wrapper for LDA model. - Based on [1]_. + """Sklearn wrapper for LDA model. Based on [1]_. Notes ----- Configure `passes` and `update_every` params to choose the mode among: - - - online (single-pass): update_every != None and passes == 1 - - online (multi-pass): update_every != None and passes > 1 - - batch: update_every == None + * online (single-pass): update_every != None and passes == 1 + * online (multi-pass): update_every != None and passes > 1 + * batch: update_every == None By default, 'online (single-pass)' mode is used for training the LDA model. 
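The three training modes above map directly onto the constructor arguments. A minimal illustrative sketch
(class and parameter names are taken from this patch; the corpus, topic count and pass count are arbitrary):

>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.sklearn_api import LdaTransformer
>>>
>>> # online (single-pass): update_every != None and passes == 1
>>> single_pass = LdaTransformer(id2word=common_dictionary, num_topics=2, update_every=1, passes=1)
>>>
>>> # online (multi-pass): update_every != None and passes > 1
>>> multi_pass = LdaTransformer(id2word=common_dictionary, num_topics=2, update_every=1, passes=5)
>>>
>>> # batch: update_every == None
>>> batch = LdaTransformer(id2word=common_dictionary, num_topics=2, update_every=None, passes=5)
>>>
>>> docvecs = single_pass.fit_transform(common_corpus)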
@@ -80,14 +77,14 @@ def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, updat Can be set to an 1D array of length equal to the number of expected topics that expresses our a-priori belief for the each topics' probability. Alternatively default prior selecting strategies can be employed by supplying a string: - 'asymmetric': Uses a fixed normalized assymetric prior of `1.0 / topicno`. - 'default': Learns an assymetric prior from the corpus. + 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / topicno`. + 'default': Learns an asymmetric prior from the corpus. eta : {float, np.array, str}, optional - A-priori belief on word probability. This can be: - a scalar for a symmetric prior over topic/word probability. - a vector : of length num_words to denote an asymmetric user defined probability for each word. - a matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination. - the string 'auto' to learn the asymmetric prior from the data. + A-priori belief on word probability, this can be: + * scalar for a symmetric prior over topic/word probability, + * vector : of length num_words to denote an asymmetric user defined probability for each word, + * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination, + * the string 'auto' to learn the asymmetric prior from the data. decay : float, optional A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten when each new document is examined. Corresponds to Kappa from [1]_. @@ -105,10 +102,9 @@ def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, updat random_state : {np.random.RandomState, int}, optional Either a randomState object or a seed to generate one. Useful for reproducibility. scorer : str, optional - Method to compute a score reflecting how well the model has fit the input corpus. - Allowed values are: - 'perplexity': Minimize the model's perplexity. - 'mass_u': Use :class:`~gensim.models.coherencemodel.CoherenceModel` to compute a topics coherence. + Method to compute a score reflecting how well the model has fit the input corpus, allowed values are: + * 'perplexity': Minimize the model's perplexity. + * 'mass_u': Use :class:`~gensim.models.coherencemodel.CoherenceModel` to compute a topics coherence. dtype : type, optional Data-type to use during calculations inside model. All inputs are also converted. Available types: `numpy.float16`, `numpy.float32`, `numpy.float64`. @@ -194,8 +190,8 @@ def partial_fit(self, X): Uses the parameters set in the constructor. This method can be used in two ways: - 1. On an unfitted model in which case the model is initialized and trained on `X`. - 2. On an already fitted model in which case the model is **updated** by `X`. + * On an unfitted model in which case the model is initialized and trained on `X`. + * On an already fitted model in which case the model is **updated** by `X`. Parameters ---------- diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py index 71f34118c9..d9d3a694d3 100644 --- a/gensim/sklearn_api/lsimodel.py +++ b/gensim/sklearn_api/lsimodel.py @@ -13,21 +13,21 @@ -------- Integrate with sklearn Pipelines: - >>> from sklearn.pipeline import Pipeline - >>> from sklearn import linear_model - >>> from gensim.test.utils import common_corpus, common_dictionary - >>> from gensim.sklearn_api import LsiTransformer - >>> - >>> # Create stages for our pipeline (including gensim and sklearn models alike). 
- >>> model = LsiTransformer(num_topics=15, id2word=common_dictionary) - >>> clf = linear_model.LogisticRegression(penalty='l2', C=0.1) - >>> pipe = Pipeline([('features', model,), ('classifier', clf)]) - >>> - >>> # Create some random binary labels for our documents. - >>> labels = np.random.choice([0, 1], len(common_corpus)) - >>> - >>> # How well does our pipeline perform on the training set? - >>> score = pipe.fit(common_corpus, labels).score(common_corpus, labels) +>>> from sklearn.pipeline import Pipeline +>>> from sklearn import linear_model +>>> from gensim.test.utils import common_corpus, common_dictionary +>>> from gensim.sklearn_api import LsiTransformer +>>> +>>> # Create stages for our pipeline (including gensim and sklearn models alike). +>>> model = LsiTransformer(num_topics=15, id2word=common_dictionary) +>>> clf = linear_model.LogisticRegression(penalty='l2', C=0.1) +>>> pipe = Pipeline([('features', model,), ('classifier', clf)]) +>>> +>>> # Create some random binary labels for our documents. +>>> labels = np.random.choice([0, 1], len(common_corpus)) +>>> +>>> # How well does our pipeline perform on the training set? +>>> score = pipe.fit(common_corpus, labels).score(common_corpus, labels) """ diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index 7231a47184..61bc213921 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -11,20 +11,20 @@ Examples -------- - >>> from gensim.sklearn_api.phrases import PhrasesTransformer - >>> - >>> # Create the model. Make sure no term is ignored and combinations seen 3+ times are captured. - >>> m = PhrasesTransformer(min_count=1, threshold=3) - >>> texts = [ - ... ['I', 'love', 'computer', 'science'], - ... ['computer', 'science', 'is', 'my', 'passion'], - ... ['I', 'studied', 'computer', 'science'] - ... ] - >>> - >>> # Use sklearn fit_transform to see the transformation. - >>> # Since computer and science were seen together 3+ times they are considered a phrase. - >>> m.fit_transform(texts)[0] - ['I', 'love', 'computer_science'] +>>> from gensim.sklearn_api.phrases import PhrasesTransformer +>>> +>>> # Create the model. Make sure no term is ignored and combinations seen 3+ times are captured. +>>> m = PhrasesTransformer(min_count=1, threshold=3) +>>> texts = [ +... ['I', 'love', 'computer', 'science'], +... ['computer', 'science', 'is', 'my', 'passion'], +... ['I', 'studied', 'computer', 'science'] +... ] +>>> +>>> # Use sklearn fit_transform to see the transformation. +>>> # Since computer and science were seen together 3+ times they are considered a phrase. +>>> m.fit_transform(texts)[0] +['I', 'love', 'computer_science'] """ diff --git a/gensim/sklearn_api/rpmodel.py b/gensim/sklearn_api/rpmodel.py index 7382d3ebfd..0e241c3ad4 100644 --- a/gensim/sklearn_api/rpmodel.py +++ b/gensim/sklearn_api/rpmodel.py @@ -12,14 +12,14 @@ Examples -------- - >>> from gensim.sklearn_api.rpmodel import RpTransformer - >>> from gensim.test.utils import common_dictionary, common_corpus - >>> - >>> # Initialize and fit the model. - >>> model = RpTransformer(id2word=common_dictionary).fit(common_corpus) - >>> - >>> # Use the trained model to transform a document. - >>> result = model.transform(common_corpus[3]) +>>> from gensim.sklearn_api.rpmodel import RpTransformer +>>> from gensim.test.utils import common_dictionary, common_corpus +>>> +>>> # Initialize and fit the model. 
+>>> model = RpTransformer(id2word=common_dictionary).fit(common_corpus) +>>> +>>> # Use the trained model to transform a document. +>>> result = model.transform(common_corpus[3]) """ diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py index f51cea62ba..fd439bd0d5 100644 --- a/gensim/sklearn_api/text2bow.py +++ b/gensim/sklearn_api/text2bow.py @@ -10,18 +10,17 @@ Examples -------- - - >>> from gensim.sklearn_api import Text2BowTransformer - >>> - >>> # Get a corpus as an iterable of unicode strings. - >>> texts = [u'complier system computer', u'loading computer system'] - >>> - >>> # Create a transformer.. - >>> model = Text2BowTransformer() - >>> - >>> # Use sklearn style `fit_transform` to get the BOW representation of each document. - >>> model.fit_transform(texts) - [[(0, 1), (1, 1), (2, 1)], [(1, 1), (2, 1), (3, 1)]] +>>> from gensim.sklearn_api import Text2BowTransformer +>>> +>>> # Get a corpus as an iterable of unicode strings. +>>> texts = [u'complier system computer', u'loading computer system'] +>>> +>>> # Create a transformer.. +>>> model = Text2BowTransformer() +>>> +>>> # Use sklearn style `fit_transform` to get the BOW representation of each document. +>>> model.fit_transform(texts) +[[(0, 1), (1, 1), (2, 1)], [(1, 1), (2, 1), (3, 1)]] """ diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index 48b41920e1..1e3431275a 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -11,14 +11,14 @@ Examples -------- - >>> from gensim.test.utils import common_corpus, common_dictionary - >>> from gensim.sklearn_api import TfIdfTransformer - >>> - >>> # Transform the word counts inversely to their global frequency using the sklearn interface. - >>> model = TfIdfTransformer(dictionary=common_dictionary) - >>> weighted_corpus = model.fit_transform(common_corpus) - >>> weighted_corpus[0] - [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)] +>>> from gensim.test.utils import common_corpus, common_dictionary +>>> from gensim.sklearn_api import TfIdfTransformer +>>> +>>> # Transform the word counts inversely to their global frequency using the sklearn interface. +>>> model = TfIdfTransformer(dictionary=common_dictionary) +>>> weighted_corpus = model.fit_transform(common_corpus) +>>> weighted_corpus[0] +[(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)] """ diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index db877e87d0..a3e81ec93e 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -13,15 +13,15 @@ Examples -------- - >>> from gensim.test.utils import common_texts - >>> from gensim.sklearn_api import W2VTransformer - >>> - >>> # Create a model to represent each word by a 10 dimensional vector. - >>> model = W2VTransformer(size=10, min_count=1, seed=1) - >>> - >>> # What is the vector representation of the word 'graph'? - >>> wordvecs = model.fit(common_texts).transform(['graph', 'system']) - >>> assert wordvecs.shape == (2, 10) +>>> from gensim.test.utils import common_texts +>>> from gensim.sklearn_api import W2VTransformer +>>> +>>> # Create a model to represent each word by a 10 dimensional vector. +>>> model = W2VTransformer(size=10, min_count=1, seed=1) +>>> +>>> # What is the vector representation of the word 'graph'? 
+
+>>> wordvecs = model.fit(common_texts).transform(['graph', 'system'])
+>>> assert wordvecs.shape == (2, 10)

 """


From 428574192a01d515a6763679710115ab0d00c4c6 Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Tue, 27 Feb 2018 11:41:28 +0100
Subject: [PATCH 27/45] unidented examples and fixed paper references

---
 gensim/sklearn_api/d2vmodel.py |  3 +--
 gensim/sklearn_api/hdp.py      | 15 +++++++--------
 gensim/sklearn_api/ldamodel.py | 21 ++++++++++-----------
 gensim/sklearn_api/lsimodel.py |  3 +--
 gensim/sklearn_api/phrases.py  | 11 ++++-------
 gensim/sklearn_api/rpmodel.py  |  3 +--
 gensim/sklearn_api/text2bow.py |  3 +--
 gensim/sklearn_api/tfidf.py    | 10 ++--------
 gensim/sklearn_api/w2vmodel.py |  3 +--
 9 files changed, 28 insertions(+), 44 deletions(-)

diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py
index 9aa015fe6f..25913b18a9 100644
--- a/gensim/sklearn_api/d2vmodel.py
+++ b/gensim/sklearn_api/d2vmodel.py
@@ -39,9 +39,8 @@

 class D2VTransformer(TransformerMixin, BaseEstimator):
-    """Base Dov2Vec module.
+    """Base Doc2Vec module, wraps :class:`~gensim.models.doc2vec.Doc2Vec`.

-    Wraps :class:`~gensim.models.doc2vec.Doc2Vec`.
     For more information on the inner workings please take a look at
     the original class.

diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py
index 88fd39bece..c1e9f54f7c 100644
--- a/gensim/sklearn_api/hdp.py
+++ b/gensim/sklearn_api/hdp.py
@@ -12,12 +12,12 @@
 Examples
 --------
-    >>> from gensim.test.utils import common_dictionary, common_corpus
-    >>> from gensim.sklearn_api import HdpTransformer
-    >>>
-    >>> # Lets extract the distribution of each document in topics
-    >>> model = HdpTransformer(id2word=common_dictionary)
-    >>> distr = model.fit_transform(common_corpus)
+>>> from gensim.test.utils import common_dictionary, common_corpus
+>>> from gensim.sklearn_api import HdpTransformer
+>>>
+>>> # Lets extract the distribution of each document in topics
+>>> model = HdpTransformer(id2word=common_dictionary)
+>>> distr = model.fit_transform(common_corpus)

 """

@@ -31,9 +31,8 @@

 class HdpTransformer(TransformerMixin, BaseEstimator):
-    """Base HDP module.
+    """Base HDP module, wraps :class:`~gensim.models.hdpmodel.HdpModel`.

-    Wraps :class:`~gensim.models.hdpmodel.HdpModel`.
     For more information on the inner workings please take a look at
     the original class. The inner workings of this class heavily depends on `Wang, Paisley, Blei: "Online Variational
     Inference for the Hierarchical Dirichlet Process, JMLR (2011)"
     `_.

diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py
index 50b57d2f1f..66e27d4c22 100644
--- a/gensim/sklearn_api/ldamodel.py
+++ b/gensim/sklearn_api/ldamodel.py
@@ -32,10 +32,12 @@

 class LdaTransformer(TransformerMixin, BaseEstimator):
-    """Base LDA module.
+    """Base LDA module, wraps :class:`~gensim.models.ldamodel.LdaModel`.

-    Wraps :class:`~gensim.models.ldamodel.LdaModel`.
     For more information on the inner workings please take a look at the original class.
+    The inner workings of this class depend heavily on `Matthew D. Hoffman, David M. Blei, Francis Bach:
+    "Online Learning for Latent Dirichlet Allocation NIPS'10"
+    <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.

     """

@@ -44,16 +43,14 @@ def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, updat
                  eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001,
                  minimum_probability=0.01, random_state=None, scorer='perplexity', dtype=np.float32):
-        """Sklearn wrapper for LDA model. Based on [1]_.
+        """Sklearn wrapper for LDA model.
 Notes
         -----
         Configure `passes` and `update_every` params to choose the mode among:
             * online (single-pass): update_every != None and passes == 1
             * online (multi-pass): update_every != None and passes > 1
             * batch: update_every == None

         By default, 'online (single-pass)' mode is used for training the LDA model.

-        References
-        ----------
-        .. [1] Matthew D. Hoffman, David M. Blei, Francis Bach, "Online Learning for Latent Dirichlet Allocation",
-               NIPS'10 Proceedings of the 23rd International Conference on Neural Information Processing Systems
-               - Volume 1 Pages 856-864, https://www.di.ens.fr/~fbach/mdhnips2010.pdf
-
         Parameters
         ----------
         num_topics : int, optional
@@ -80,14 +77,14 @@ def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, updat
         decay : float, optional
             A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
-            when each new document is examined. Corresponds to Kappa from [1]_.
+            when each new document is examined. Corresponds to Kappa from
+            `Matthew D. Hoffman, David M. Blei, Francis Bach:
+            "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
         offset : float, optional
             Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
-            Corresponds to Tau_0 from [1]_.
+            Corresponds to Tau_0 from `Matthew D. Hoffman, David M. Blei, Francis Bach:
+            "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py
index d9d3a694d3..e287cf8ad6 100644
--- a/gensim/sklearn_api/lsimodel.py
+++ b/gensim/sklearn_api/lsimodel.py
@@ -41,9 +41,8 @@

 class LsiTransformer(TransformerMixin, BaseEstimator):
-    """Base LSI module.
+    """Base LSI module, wraps :class:`~gensim.models.lsimodel.LsiModel`.

-    Wraps :class:`~gensim.model.lsimodel.LsiModel`.
     For more information on the inner workings please take a look at
     the original class.

diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py
index 61bc213921..2d56f2cc9c 100644
--- a/gensim/sklearn_api/phrases.py
+++ b/gensim/sklearn_api/phrases.py
@@ -68,8 +68,10 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
             or with a function with the expected parameter names.
             Two built-in scoring functions are available by setting `scoring` to a string:

-            'default': from [1]_.
-            'npmi': normalized pointwise mutual information, from [2]_.
+            'default': Explained in `Mikolov, et. al: "Efficient Estimation of Word Representations in Vector Space"
+            `_.
+            'npmi': Explained in `Gerlof Bouma: "Normalized (Pointwise) Mutual Information in Collocation Extraction"
+            `_.

             'npmi' is more robust when dealing with common words that form part of common bigrams, and
             ranges from -1 to 1, but is slower to calculate than the default.
@@ -88,11 +90,6 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
             A scoring function without any of these parameters (even if the parameters are not used) will raise a
             ValueError on initialization of the Phrases class. The scoring function must be picklable.

-            References
-            ----------
-            .. [1] "Efficient Estimaton of Word Representations in Vector Space" by Mikolov, et. al.
-            .. [2] "Normalized (Pointwise) Mutual Information in Colocation Extraction" by Gerlof Bouma.
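The `scoring` parameter documented above can be exercised directly through the wrapper. A hedged sketch
(the toy corpus mirrors this module's example; thresholds are illustrative, noting that 'npmi' scores are
bounded to the [-1, 1] range while 'default' scores are not):

>>> from gensim.sklearn_api.phrases import PhrasesTransformer
>>>
>>> texts = [
...     ['I', 'love', 'computer', 'science'],
...     ['computer', 'science', 'is', 'my', 'passion'],
...     ['I', 'studied', 'computer', 'science']
... ]
>>>
>>> # Default scoring: thresholds are unbounded; higher values mean fewer phrases.
>>> m_default = PhrasesTransformer(min_count=1, threshold=3, scoring='default')
>>>
>>> # NPMI scoring: the threshold must be picked from the [-1, 1] range instead.
>>> m_npmi = PhrasesTransformer(min_count=1, threshold=0.5, scoring='npmi')
>>>
>>> phrased = m_npmi.fit_transform(texts)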
-
         """
         self.gensim_model = None
         self.min_count = min_count
diff --git a/gensim/sklearn_api/rpmodel.py b/gensim/sklearn_api/rpmodel.py
index 0e241c3ad4..8420832df8 100644
--- a/gensim/sklearn_api/rpmodel.py
+++ b/gensim/sklearn_api/rpmodel.py
@@ -32,9 +32,8 @@

 class RpTransformer(TransformerMixin, BaseEstimator):
-    """Base Word2Vec module.
+    """Base RP module, wraps :class:`~gensim.models.rpmodel.RpModel`.

-    Wraps :class:`~gensim.models.rpmodel.RpModel`.
     For more information on the inner workings please take a look at
     the original class.

diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py
index fd439bd0d5..872dcb4ecc 100644
--- a/gensim/sklearn_api/text2bow.py
+++ b/gensim/sklearn_api/text2bow.py
@@ -33,9 +33,8 @@

 class Text2BowTransformer(TransformerMixin, BaseEstimator):
-    """Base Text2Bow module.
+    """Base Text2Bow module, wraps :class:`~gensim.corpora.dictionary.Dictionary`.

-    Wraps :class:`~gensim.corpora.dictionary.Dictionary`.
     For more information on the inner workings please take a look at
     the original class.

diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py
index 1e3431275a..c2ed635611 100644
--- a/gensim/sklearn_api/tfidf.py
+++ b/gensim/sklearn_api/tfidf.py
@@ -31,9 +31,8 @@

 class TfIdfTransformer(TransformerMixin, BaseEstimator):
-    """Base TfIdf module.
+    """Base TfIdf module, wraps :class:`~gensim.models.tfidfmodel.TfidfModel`.

-    Wraps :class:`~gensim.models.tfidfmodel.TfidfModel`.
     For more information on the inner workings please take a look at
     the original class.

@@ -81,12 +80,7 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity,
                 * `n` - none,
                 * `c` - cosine.

-            For more information visit [1]_.
-
-            References
-            ----------
-
-            .. [1] https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System
+            For more info, visit `"Wikipedia" <https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.

         """
         self.gensim_model = None
diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py
index a3e81ec93e..9f64c58702 100644
--- a/gensim/sklearn_api/w2vmodel.py
+++ b/gensim/sklearn_api/w2vmodel.py
@@ -35,9 +35,8 @@

 class W2VTransformer(TransformerMixin, BaseEstimator):
-    """Base Word2Vec module.
+    """Base Word2Vec module, wraps :class:`~gensim.models.word2vec.Word2Vec`.

-    Wraps :class:`~gensim.models.word2vec.Word2Vec`.
     For more information on the inner workings please take a look at
     the original class.

From 0c56ae97a7f673a4b2a5dc8911d6621e739e3b44 Mon Sep 17 00:00:00 2001
From: "Stergiadis, E"
Date: Wed, 28 Feb 2018 10:38:48 +0100
Subject: [PATCH 28/45] finalized ldaseq wrapper

---
 gensim/sklearn_api/ldaseqmodel.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/gensim/sklearn_api/ldaseqmodel.py b/gensim/sklearn_api/ldaseqmodel.py
index ffd8a1c883..e4fb1f92bd 100644
--- a/gensim/sklearn_api/ldaseqmodel.py
+++ b/gensim/sklearn_api/ldaseqmodel.py
@@ -12,6 +12,15 @@

 Examples
 --------
+>>> from gensim.test.utils import common_corpus, common_dictionary
+>>> from gensim.sklearn_api.ldaseqmodel import LdaSeqTransformer
+>>>
+>>> # Create a sequential LDA transformer to extract 2 topics from the common corpus.
+>>> # Divide the work into 3 unequal time slices.
+>>> model = LdaSeqTransformer(id2word=common_dictionary, num_topics=2, time_slice=[3, 4, 2], initialize='gensim')
+>>>
+>>> # Each document almost entirely belongs to one of the two topics.
+>>> transformed_corpus = model.fit_transform(common_corpus)

 """
 import numpy as np
@@ -48,19 +57,17 @@ def __init__(self, time_slice=None, id2word=None, alphas=0.01, num_topics=10, in
         Number of latent topics to be discovered in the corpus.
     initialize : str {'gensim', 'own', 'ldamodel'}
         Controls the initialization of the DTM model. Supports three different modes:
-        - 'gensim', default: Uses gensim's own LDA initialization.
-        - 'own': You can use your own initialization matrix of an LDA model previously trained by passing it to `sstats`.
-        - 'lda_model': Use a previously used LDA model, passing it through the `lda_model` argument.
+        * 'gensim', default: Uses gensim's own LDA initialization.
+        * 'own': Uses your own initialization matrix of an LDA model that has been previously trained.
+        * 'lda_model': Use a previously used LDA model, passing it through the `lda_model` argument.
     sstats : np.ndarray of shape (vocab_len, `num_topics`)
         If `initialize` is set to 'own' this will be used to initialize the DTM model.
     lda_model : :class:`~gensim.models.ldamodel.LdaModel`
        If `initialize` is set to 'lda_model' this object will be used to create the `sstats` initialization matrix.
     obs_variance : float
-        Observed variance used to approximate the true and forward variance as shown in _[1].
-
-        References
-        ----------
-        http://repository.cmu.edu/cgi/viewcontent.cgi?article=2036&context=compsci
+        Observed variance used to approximate the true and forward variance as shown in
+        `David M. Blei, John D. Lafferty: "Dynamic Topic Models"
+        <http://repository.cmu.edu/cgi/viewcontent.cgi?article=2036&context=compsci>`_.
     chain_variance : float
         Gaussian parameter defined in the beta distribution to dictate how the beta values evolve.
     passes : int

From 64f8d4f2ef394e4eafe87424dc448418c0192443 Mon Sep 17 00:00:00 2001
From: ivan
Date: Tue, 13 Mar 2018 17:01:23 +0500
Subject: [PATCH 29/45] fix __init__

---
 gensim/sklearn_api/__init__.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/gensim/sklearn_api/__init__.py b/gensim/sklearn_api/__init__.py
index 570fc7e875..d5d80398dc 100644
--- a/gensim/sklearn_api/__init__.py
+++ b/gensim/sklearn_api/__init__.py
@@ -4,13 +4,12 @@
 # Author: Chinmaya Pancholi
 # Copyright (C) 2017 Radim Rehurek
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
-"""Scikit learn wrapper for gensim.

-Contains various gensim based implementations which match with scikit-learn standards.
-See [1] for complete set of conventions.
-[1] http://scikit-learn.org/stable/developers/
-"""
+"""Scikit-learn wrappers for gensim.

+Contains various gensim based implementations which match with scikit-learn standards.
+See `sklearn dev-guide <http://scikit-learn.org/stable/developers/>`_ for complete set of conventions.
+"""
 from .ldamodel import LdaTransformer  # noqa: F401
 from .lsimodel import LsiTransformer  # noqa: F401
 from .rpmodel import RpTransformer  # noqa: F401

From 39bbe31932e5bd2a23dec5087d475787c532fdba Mon Sep 17 00:00:00 2001
From: ivan
Date: Thu, 15 Mar 2018 09:44:46 +0500
Subject: [PATCH 30/45] fix atmodel

---
 gensim/sklearn_api/atmodel.py | 37 +++++++++++++++++------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py
index fd6509b99e..b1a5e4767d 100644
--- a/gensim/sklearn_api/atmodel.py
+++ b/gensim/sklearn_api/atmodel.py
@@ -60,17 +60,16 @@ def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=Non
         and topic printing.
         author2doc : dict of (str, list of int), optional
             Maps an author's name to a list of document IDs the author has contributed to.
-            Either `author2doc` or `doc2author` **MUST** be supplied.
+            Either `author2doc` or `doc2author` **must be supplied**.
         doc2author : dict of (int, list of str)
             Maps a document (using its ID) to a list of author names that contributed to it.
-            Either `author2doc` or `doc2author` **MUST** be supplied.
+            Either `author2doc` or `doc2author` **must be supplied**.
         chunksize : int, optional
             Number of documents to be processed by the model in each mini-batch.
         passes : int, optional
             Number of times the model can make a pass over the corpus during training.
         iterations : int, optional
-            Maximum number of times the model before convergence during the M step
-            of the EM algorithm.
+            Maximum number of iterations the model makes before convergence during the M step of the EM algorithm.
         decay : float, optional
             A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
             when each new document is examined. Corresponds to Kappa from `"The Author-Topic Model for Authors
             and Documents", Rosen-Zvi et al. 2004 <https://mimno.infosci.cornell.edu/info6150/readings/398.pdf>`_.
         offset : float, optional
             Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
             Corresponds to Tau_0 from `"The Author-Topic Model for Authors and Documents", Rosen-Zvi et al. 2004
             <https://mimno.infosci.cornell.edu/info6150/readings/398.pdf>`_.
         eta : {float, np.array, str}, optional
             A-priori belief on word probability, this can be:

             * scalar for a symmetric prior over topic/word probability,
-            * vector : of length num_words to denote an asymmetric user defined probability for each word,
+            * vector of length num_words to denote an asymmetric user defined probability for each word,
             * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination,
             * the string 'auto' to learn the asymmetric prior from the data.
         update_every : int, optional
@@ -105,8 +104,8 @@ def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=Non
         or saved to the hard-drive (`serialized = True`). Note that this behaviour is quite different from
         other Gensim models. If your data is too large to fit in to memory, use this functionality.
         serialization_path : str, optional
-            Filepath to be used for storing the serialized object. **Must** be supplied if `serialized = True`.
-            An existing file *cannot* be overwritten; either delete the old file or choose a different name
+            Path to the file used for storing the serialized object, **must be supplied if `serialized = True`**.
+            An existing file *cannot* be overwritten; either delete the old file or choose a different name.
         minimum_probability : float, optional
             Topics with a probability lower than this threshold will be filtered out.
         random_state : {np.random.RandomState, int}, optional
@@ -138,8 +137,8 @@ def fit(self, X, y=None):

         Parameters
         ----------
-        X : {iterable of iterable of (int, int), :class:`~gensim.corpora.mmcorpus.MmCorpus`}
-            A collection of documents in BOW format used for training the model.
+        X : iterable of list of (int, number)
+            Sequence of documents in BoW format.

         Returns
         -------
@@ -167,8 +166,8 @@ def transform(self, author_names):

         Returns
         -------
-        iterable of (int, float)
-            Topic distribution for each input author as a tuple of (topic_id, topic_probability).
+        numpy.ndarray
+            Topic distribution for each input author.

         """
@@ -194,14 +193,14 @@ def partial_fit(self, X, author2doc=None, doc2author=None):

         Parameters
         ----------
-        X : {iterable of iterable of (int, int), :class:`~gensim.corpora.mmcorpus.MmCorpus`}
-            A collection of documents in BOW format used for training the model.
-        author2doc : dict(str, list of int), optional
-            Maps an authors name to a list of document IDs corresponding to indexes in input corpus.
-            Either `author2doc` or `doc2author` **MUST** be supplied.
-        doc2author : dict of (int, list of str), optional
-            Maps a document (using its ID) to a list of author names corresponding to indexes in input corpus.
-            Either `author2doc` or `doc2author` **MUST** be supplied.
+        X : iterable of list of (int, number)
+            Sequence of documents in BoW format.
+        author2doc : dict of (str, list of int), optional
+            Maps an author's name to a list of document IDs the author has contributed to.
+            Either `author2doc` or `doc2author` **must be supplied**.
+        doc2author : dict of (int, list of str)
+            Maps a document (using its ID) to a list of author names that contributed to it.
+            Either `author2doc` or `doc2author` **must be supplied**.

         Returns
         -------

From 20ea33e938e85d656cd17cd23d9d19322f23052a Mon Sep 17 00:00:00 2001
From: ivan
Date: Thu, 15 Mar 2018 10:24:18 +0500
Subject: [PATCH 31/45] fix atmodel[2]

---
 gensim/sklearn_api/atmodel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py
index b1a5e4767d..47af7d26a0 100644
--- a/gensim/sklearn_api/atmodel.py
+++ b/gensim/sklearn_api/atmodel.py
@@ -161,8 +161,8 @@ def transform(self, author_names):

         Parameters
         ----------
-        author_names : iterable of str
-            A collection of authors whose topics will be identified.
+        author_names : {iterable of str, str}
+            Author name or sequence of author names whose topics will be identified.

From 31fb94e8e74b8f5ce5d1fc24c5f106c34c56dd92 Mon Sep 17 00:00:00 2001
From: ivan
Date: Thu, 15 Mar 2018 10:24:36 +0500
Subject: [PATCH 32/45] fix d2vmodel

---
 gensim/sklearn_api/d2vmodel.py | 73 ++++++++++++++--------------------
 1 file changed, 29 insertions(+), 44 deletions(-)

diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py
index ef5e85a25b..398231d0fa 100644
--- a/gensim/sklearn_api/d2vmodel.py
+++ b/gensim/sklearn_api/d2vmodel.py
@@ -13,22 +13,11 @@

 >>> from gensim.test.utils import common_texts
 >>> from gensim.sklearn_api import D2VTransformer
->>> from gensim.similarities import Similarity
 >>>
->>> # Lets represent each document using a 50 dimensional vector
 >>> model = D2VTransformer(min_count=1, size=5)
->>> docvecs = model.fit_transform(common_texts)
->>>
->>> # Let's use the vector representations to compute similarities with one of the documents.
->>> index = Similarity(None, docvecs, num_features=5)
->>>
->>> # Which document is most similar to the last one in the corpus? Probably itself!
->>> result = index[docvecs[8]]
->>> result.argmax()
-8
+>>> docvecs = model.fit_transform(common_texts)  # represent `common_texts` as vectors

 """
-
 import numpy as np
 from six import string_types
@@ -56,37 +45,33 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1

         Parameters
         ----------

         dm_mean : int {1,0}, optional
-            If 0, use the sum of the context word vectors. If 1, use the mean.
-            Only applies when `dm` is used in non-concatenative mode.
+            If 0, use the sum of the context word vectors. If 1, use the mean. Only applies when `dm_concat=0`.
         dm : int {1,0}, optional
-            Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used.
-            Otherwise, `distributed bag of words` (PV-DBOW) is employed.
+            Defines the training algorithm. If `dm=1` - distributed memory (PV-DM) is used.
+            Otherwise, distributed bag of words (PV-DBOW) is employed.
 dbow_words : int {1,0}, optional
-            If set to 1 trains word-vectors (in skip-gram fashion) simultaneous with DBOW
-            doc-vector training; If 0, only trains doc-vectors (faster).
+            If set to 1 - trains word-vectors (in skip-gram fashion) simultaneously with DBOW
+            doc-vector training. If 0, only trains doc-vectors (faster).
         dm_concat : int {1,0}, optional
-            If 1, use concatenation of context vectors rather than sum/average;
-            Note concatenation results in a much-larger model, as the input
-            is no longer the size of one (sampled or arithmetically combined) word vector, but the
-            size of the tag(s) and all words in the context strung together.
+            If 1, use concatenation of context vectors rather than sum/average.
+            Note concatenation results in a much-larger model, as the input is no longer the size of one
+            (sampled or arithmetically combined) word vector, but the size of the tag(s) and all words
+            in the context strung together.
         dm_tag_count : int, optional
-            Expected constant number of document tags per document, when using
-            dm_concat mode; default is 1.
+            Expected constant number of document tags per document, when using dm_concat mode.
         docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors`
             A mapping from a string or int tag to its vector representation.
             Either this or `docvecs_mapfile` **MUST** be supplied.
         docvecs_mapfile : str, optional
-            Path to a file containing the docvecs mapping.
-            If `docvecs` is None, this file will be used to create it.
+            Path to a file containing the docvecs mapping. If `docvecs` is None, this file will be used to create it.
         comment : str, optional
             A model descriptive comment, used for logging and debugging purposes.
-        trim_rule : callable ((str, int, int) -> int), optional
+        trim_rule : function ((str, int, int) -> int), optional
             Vocabulary trimming rule that accepts (word, count, min_count).
             Specifies whether certain words should remain in the vocabulary
             (:attr:`gensim.utils.RULE_KEEP`), be trimmed away (:attr:`gensim.utils.RULE_DISCARD`), or handled using
-            the default (:attr:`gensim.utils.RULE_DEFAULT`).If None, then :func:`~gensim.utils.keep_vocab_item`
-            will be used. Note: The rule, if given, is only used to prune vocabulary during build_vocab()
-            and is not stored as part of the model.
+            the default (:attr:`gensim.utils.RULE_DEFAULT`).
+            If None, then :func:`gensim.utils.keep_vocab_item` will be used.
         size : int, optional
             Dimensionality of the feature vectors.
         alpha : float, optional
@@ -104,23 +89,24 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1
             useful range is (0, 1e-5).
         seed : int, optional
             Seed for the random number generator. Initial vectors for each word are seeded with a hash of
-            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
-            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
-            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
-            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
+            the concatenation of word + `str(seed)`.
+            Note that for a **fully deterministically-reproducible run**, you **must also limit the model to
+            a single worker thread (`workers=1`)**, to eliminate ordering jitter from OS thread scheduling.
+            In Python 3, reproducibility between interpreter launches also requires use of the `PYTHONHASHSEED`
+            environment variable to control hash randomization.
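The reproducibility caveat in the `seed` description above can be made concrete. A hedged sketch
(parameter names follow this patch; identical results across interpreter launches additionally assume
a fixed `PYTHONHASHSEED`, and the vector size here is arbitrary):

>>> from gensim.test.utils import common_texts
>>> from gensim.sklearn_api import D2VTransformer
>>>
>>> # A fixed seed plus a single worker thread removes ordering jitter.
>>> model_a = D2VTransformer(min_count=1, size=5, seed=42, workers=1)
>>> model_b = D2VTransformer(min_count=1, size=5, seed=42, workers=1)
>>>
>>> docvecs_a = model_a.fit_transform(common_texts)
>>> docvecs_b = model_b.fit_transform(common_texts)
>>> assert (docvecs_a == docvecs_b).all()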
workers : int, optional Use this many worker threads to train the model. Will yield a speedup when training with multicore machines. min_alpha : float, optional Learning rate will linearly drop to `min_alpha` as training progresses. hs : int {1,0}, optional - If 1, hierarchical softmax will be used for model training. - If set to 0, and `negative` is non-zero, negative sampling will be used. + If 1, hierarchical softmax will be used for model training. If set to 0, and `negative` is non-zero, + negative sampling will be used. negative : int, optional If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used. cbow_mean : int, optional - Same as `dm_mean`, unused. - hashfxn : callable (object -> int), optional + Same as `dm_mean`, **unused**. + hashfxn : function (object -> int), optional A hashing function. Used to create an initial random reproducible vector by hashing the random seed. iter : int, optional Number of epochs to iterate through the corpus. @@ -164,9 +150,8 @@ def fit(self, X, y=None): Parameters ---------- - X : {iterable of {:class:`~gensim.models.doc2vec.TaggedDocument`, iterable of iterable of str} + X : {iterable of :class:`~gensim.models.doc2vec.TaggedDocument`, iterable of list of str} A collection of tagged documents used for training the model. - If these are not tagged, their order integer index will be used to tag them. Returns ------- @@ -195,13 +180,13 @@ def transform(self, docs): Parameters ---------- - docs : iterable of iterable of str - The input corpus. + docs : {iterable of list of str, list of str} + Input document or sequence of documents. Returns ------- - np.array of shape (`len(docs)`, `size`) - The vector representation of the input corpus. + np.ndarray of shape (`len(docs)`, `size`) + The vector representation of the `docs`. """ if self.gensim_model is None: From 4432b776f3ff2eb4365872a2993991c0682f2f1c Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 15 Mar 2018 10:48:47 +0500 Subject: [PATCH 33/45] fix hdp + small fixes --- gensim/sklearn_api/atmodel.py | 2 +- gensim/sklearn_api/d2vmodel.py | 4 ++-- gensim/sklearn_api/hdp.py | 23 ++++++++++++----------- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py index 47af7d26a0..b21e1818cd 100644 --- a/gensim/sklearn_api/atmodel.py +++ b/gensim/sklearn_api/atmodel.py @@ -157,7 +157,7 @@ def fit(self, X, y=None): return self def transform(self, author_names): - """Find the topic probabilities for each author. + """Infer the topic probabilities for each author. Parameters ---------- diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 398231d0fa..10a2a34119 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -39,7 +39,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000): - """Sklearn api for Doc2Vec model. + """ Parameters ---------- @@ -176,7 +176,7 @@ def fit(self, X, y=None): return self def transform(self, docs): - """Get the vector representations for the input documents. + """Infer the vector representations for the input documents. 
Parameters ---------- diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py index c1e9f54f7c..6d70953803 100644 --- a/gensim/sklearn_api/hdp.py +++ b/gensim/sklearn_api/hdp.py @@ -20,7 +20,6 @@ >>> distr = model.fit_transform(common_corpus) """ - import numpy as np from scipy import sparse from sklearn.base import TransformerMixin, BaseEstimator @@ -42,15 +41,15 @@ class HdpTransformer(TransformerMixin, BaseEstimator): def __init__(self, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, outputdir=None, random_state=None): - """Sklearn api for HDP model. + """ Parameters ---------- - id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`} + id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional Mapping between a words ID and the word itself in the vocabulary. max_chunks : int, optional Upper bound on how many chunks to process.It wraps around corpus beginning in another corpus pass, - if there are not enough chunks in the corpus + if there are not enough chunks in the corpus. max_time : int, optional Upper bound on time in seconds for which model will be trained. chunksize : int, optional @@ -59,7 +58,8 @@ def __init__(self, id2word, max_chunks=None, max_time=None, chunksize=256, kappa Learning rate, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical Dirichlet Process, JMLR (2011)" `_. tau : float, optional - Slow down parameter. + Slow down parameter, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical + Dirichlet Process, JMLR (2011)" `_. K : int, optional Second level truncation level, see `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical Dirichlet Process, JMLR (2011)" `_. @@ -78,12 +78,13 @@ def __init__(self, id2word, max_chunks=None, max_time=None, chunksize=256, kappa scale : float, optional Weights information from the mini-chunk of corpus to calculate rhot. var_converge : float, optional - Lower bound on the right side of convergence. Used when updating variational parameters for a - single document. + Lower bound on the right side of convergence. Used when updating variational parameters + for a single document. outputdir : str, optional Path to a directory where topic and options information will be stored. random_state : int, optional Seed used to create a :class:`~np.random.RandomState`. Useful for obtaining reproducible results. + """ self.gensim_model = None self.id2word = id2word @@ -130,18 +131,18 @@ def fit(self, X, y=None): return self def transform(self, docs): - """Returns a matrix of topic distribution for the given document bow, where a_ij + """Infer a matrix of topic distribution for the given document bow, where a_ij indicates (topic_i, topic_probability_j). Parameters ---------- - docs : iterable of iterable of (int, int) - A list of documents in BOW format to be transformed. + docs : {iterable of list of (int, number), list of (int, number)} + Document or sequence of documents in BOW format. Returns ------- np.ndarray of shape (`len(docs), num_topics`) - Topic distribution for each input document. + Topic distribution for `docs`. 
""" if self.gensim_model is None: From e729a2622a2d04c951ba05dabf96f4726b41fd80 Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 15 Mar 2018 11:21:57 +0500 Subject: [PATCH 34/45] fix ldamodel + small fixes --- gensim/sklearn_api/d2vmodel.py | 2 +- gensim/sklearn_api/hdp.py | 2 +- gensim/sklearn_api/ldamodel.py | 60 ++++++++++++++++------------------ 3 files changed, 31 insertions(+), 33 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 10a2a34119..ded9527d8c 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -185,7 +185,7 @@ def transform(self, docs): Returns ------- - np.ndarray of shape (`len(docs)`, `size`) + numpy.ndarray of shape [`len(docs)`, `size`] The vector representation of the `docs`. """ diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py index 6d70953803..1efa770486 100644 --- a/gensim/sklearn_api/hdp.py +++ b/gensim/sklearn_api/hdp.py @@ -141,7 +141,7 @@ def transform(self, docs): Returns ------- - np.ndarray of shape (`len(docs), num_topics`) + numpy.ndarray of shape [`len(docs), num_topics`] Topic distribution for `docs`. """ diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py index 66e27d4c22..3dc8ff2bc4 100644 --- a/gensim/sklearn_api/ldamodel.py +++ b/gensim/sklearn_api/ldamodel.py @@ -18,7 +18,6 @@ >>> # Reduce each document to 2 dimensions (topics) using the sklearn interface. >>> model = LdaTransformer(num_topics=2, id2word=common_dictionary, iterations=20, random_state=1) >>> docvecs = model.fit_transform(common_corpus) ->>> assert docvecs.shape == (len(common_corpus), 2) """ @@ -44,41 +43,33 @@ class LdaTransformer(TransformerMixin, BaseEstimator): def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None, scorer='perplexity', dtype=np.float32): - - """Sklearn wrapper for LDA model. - - Notes - ----- - Configure `passes` and `update_every` params to choose the mode among: - * online (single-pass): update_every != None and passes == 1 - * online (multi-pass): update_every != None and passes > 1 - * batch: update_every == None - - By default, 'online (single-pass)' mode is used for training the LDA model. + """ Parameters ---------- num_topics : int, optional The number of requested latent topics to be extracted from the training corpus. - id2word : dict of (int, str), optional + id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional Mapping from integer ID to words in the corpus. Used to determine vocabulary size and logging. chunksize : int, optional - If `distributed` is True, this is the number of documents to be handled in each worker job. + Number of documents in batch. passes : int, optional - Number of passes through the corpus during online training. + Number of passes through the corpus during training. update_every : int, optional Number of documents to be iterated through for each update. Set to 0 for batch learning, > 1 for online iterative learning. - alpha : {np.array, str}, optional + alpha : {np.ndarray, str}, optional Can be set to an 1D array of length equal to the number of expected topics that expresses our a-priori belief for the each topics' probability. Alternatively default prior selecting strategies can be employed by supplying a string: - 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / topicno`. 
-            'default': Learns an asymmetric prior from the corpus.
+
+            * 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / topicno`.
+            * 'default': Learns an asymmetric prior from the corpus.
         eta : {float, np.array, str}, optional
             A-priori belief on word probability, this can be:
+
             * scalar for a symmetric prior over topic/word probability,
-            * vector : of length num_words to denote an asymmetric user defined probability for each word,
+            * vector of length num_words to denote an asymmetric user defined probability for each word,
             * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination,
             * the string 'auto' to learn the asymmetric prior from the data.
         decay : float, optional
             A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
             when each new document is examined. Corresponds to Kappa from
             `Matthew D. Hoffman, David M. Blei, Francis Bach:
             "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
         offset : float, optional
             Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
             Corresponds to Tau_0 from `Matthew D. Hoffman, David M. Blei, Francis Bach:
             "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
         eval_every : int, optional
             Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x.
         iterations : int, optional
-            Maximum number of iterations through the corpus when infering the topic distribution of a corpus.
+            Maximum number of iterations through the corpus when inferring the topic distribution of a corpus.
         gamma_threshold : float, optional
             Minimum change in the value of the gamma parameters to continue iterating.
         minimum_probability : float, optional
@@ -105,10 +102,9 @@ def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, updat
         random_state : {np.random.RandomState, int}, optional
             Either a randomState object or a seed to generate one. Useful for reproducibility.
         scorer : str, optional
-            Method to compute a score reflecting how well the model has fit the input corpus.
-            Allowed values are:
-            'perplexity': Minimize the model's perplexity.
-            'mass_u': Use :class:`~gensim.models.coherencemodel.CoherenceModel` to compute a topics coherence.
-        dtype : type, optional
+            Method to compute a score reflecting how well the model has fit the input corpus, allowed values are:
+            * 'perplexity': Perplexity of language model.
+            * 'mass_u': Use :class:`~gensim.models.coherencemodel.CoherenceModel` to compute a topics coherence.
+        dtype : {numpy.float16, numpy.float32, numpy.float64}, optional
             Data-type to use during calculations inside model. All inputs are also converted.
-            Available types: `numpy.float16`, `numpy.float32`, `numpy.float64`.
-        """

+        Notes
+        -----
+        Configure `passes` and `update_every` params to choose the mode among:
+            * online (single-pass): update_every != None and passes == 1
+            * online (multi-pass): update_every != None and passes > 1
+            * batch: update_every == None
+
+        By default, 'online (single-pass)' mode is used for training the LDA model.
+        """
         self.gensim_model = None
         self.num_topics = num_topics
         self.id2word = id2word
@@ -158,16 +156,16 @@ def fit(self, X, y=None):
         return self

     def transform(self, docs):
-        """Return the BOW format for the input documents.
+        """Infer the topic distribution for `docs`.

         Parameters
         ----------
-        docs : iterable of iterable of (int, int)
-            A collection of documents in BOW format to be transformed.
+        docs : {iterable of list of (int, number), list of (int, number)}
+            Document or sequence of documents in BoW format.

         Returns
         -------
-        np.array of shape (`len(docs)`, `num_topics`)
+        numpy.ndarray of shape [`len(docs)`, `num_topics`]
             The topic distribution for each input document.

         """
@@ -227,8 +225,8 @@ def score(self, X, y=None):

         Parameters
         ----------
-        X : iterable of iterable of (int, int)
-            Input corpus in BOW format.
+        X : iterable of list of (int, number)
+            Sequence of documents in BOW format.
Returns ------- From 14fcf22f6012e0d7e72372feafbf48b19003c773 Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 15 Mar 2018 11:24:21 +0500 Subject: [PATCH 35/45] small fixes --- gensim/sklearn_api/ldaseqmodel.py | 2 +- gensim/sklearn_api/lsimodel.py | 4 ++-- gensim/sklearn_api/rpmodel.py | 2 +- gensim/sklearn_api/w2vmodel.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/sklearn_api/ldaseqmodel.py b/gensim/sklearn_api/ldaseqmodel.py index df8820151f..4b1a9d6eee 100644 --- a/gensim/sklearn_api/ldaseqmodel.py +++ b/gensim/sklearn_api/ldaseqmodel.py @@ -134,7 +134,7 @@ def transform(self, docs): Returns ------- - np.ndarray of shape (`len(docs)`, `num_topics`) + numpy.ndarray of shape [`len(docs)`, `num_topics`] The topic representation of each document. """ diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py index e287cf8ad6..a7838f7699 100644 --- a/gensim/sklearn_api/lsimodel.py +++ b/gensim/sklearn_api/lsimodel.py @@ -118,8 +118,8 @@ def transform(self, docs): Returns ------- - list of (int, int) - Topic distribution matrix of shape [num_docs, num_topics] + numpy.ndarray of shape [`len(docs)`, `num_topics`] + Topic distribution matrix. """ if self.gensim_model is None: diff --git a/gensim/sklearn_api/rpmodel.py b/gensim/sklearn_api/rpmodel.py index 8420832df8..70a30b5545 100644 --- a/gensim/sklearn_api/rpmodel.py +++ b/gensim/sklearn_api/rpmodel.py @@ -80,7 +80,7 @@ def transform(self, docs): Returns ------- - np.ndarray of shape (`len(docs)`, num_topics) + numpy.ndarray of shape [`len(docs)`, `num_topics`] RP representation for each input document. """ diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 9f64c58702..5b05b4b0b7 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -167,7 +167,7 @@ def transform(self, words): Returns ------- - np.ndarray of shape (num_words, size) + np.ndarray of shape [`len(words)`, `size`] A 2D array where each row is the vector of one word. """ From 07a8cbae86eed718d72921f527195e49271842ad Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 15 Mar 2018 11:34:41 +0500 Subject: [PATCH 36/45] fix ldaseqmodel --- gensim/sklearn_api/ldaseqmodel.py | 47 +++++++++++++++---------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/gensim/sklearn_api/ldaseqmodel.py b/gensim/sklearn_api/ldaseqmodel.py index 4b1a9d6eee..ab33ee2b7c 100644 --- a/gensim/sklearn_api/ldaseqmodel.py +++ b/gensim/sklearn_api/ldaseqmodel.py @@ -11,7 +11,6 @@ Examples -------- - >>> from gensim.test.utils import common_corpus, common_dictionary >>> from gensim.sklearn_api.ldaseqmodel import LdaSeqTransformer >>> @@ -31,56 +30,54 @@ class LdaSeqTransformer(TransformerMixin, BaseEstimator): - """Base Sequential LDA module. - - Wraps :class:`~gensim.models.ldaseqmodel.LdaSeqModel`. - For more information on the inner workings please take a look at - the original class. + """Base Sequential LDA module, wraps :class:`~gensim.models.ldaseqmodel.LdaSeqModel` model. + For more information take a look at `David M. Blei, John D. Lafferty: "Dynamic Topic Models" + `_. """ def __init__(self, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100): - """Sklearn wrapper for :class:`~gensim.models.ldaseqmodel.LdaSeqModel` model. 
+ """

 Parameters
 ----------
 time_slice : list of int, optional
- Contains the number of documents in each time-slice.
+ Number of documents in each time-slice.
- id2word : dict of (int, str)
+ id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
 Mapping from an ID to the word it represents in the vocabulary.
- alphas : float
+ alphas : float, optional
 The prior probability of each topic.
- num_topics : int
+ num_topics : int, optional
 Number of latent topics to be discovered in the corpus.
- initialize : str {'gensim', 'own', 'ldamodel'}
+ initialize : {'gensim', 'own', 'ldamodel'}, optional
 Controls the initialization of the DTM model. Supports three different modes:
- * 'gensim', default: Uses gensim's own LDA initialization.
+ * 'gensim': Uses gensim's own LDA initialization.
 * 'own': Uses your own initialization matrix of an LDA model that has been previously trained.
 * 'lda_model': Use a previously used LDA model, passing it through the `lda_model` argument.
- sstats : np.ndarray of shape (vocab_len, `num_topics`)
+ sstats : np.ndarray of shape [vocab_len, `num_topics`], optional
 If `initialize` is set to 'own' this will be used to initialize the DTM model.
- lda_model : :class:`~gensim.models.ldamodel.LdaModel`
+ lda_model : :class:`~gensim.models.ldamodel.LdaModel`, optional
 If `initialize` is set to 'lda_model' this object will be used to create the `sstats` initialization matrix.
- obs_variance : float
+ obs_variance : float, optional
 Observed variance used to approximate the true and forward variance as shown in
 `David M. Blei, John D. Lafferty: "Dynamic Topic Models"
- `_.
+ `_.
- chain_variance : float
+ chain_variance : float, optional
 Gaussian parameter defined in the beta distribution to dictate how the beta values evolve.
- passes : int
+ passes : int, optional
 Number of passes over the corpus for the initial :class:`~gensim.models.ldamodel.LdaModel`.
- random_state : {np.random.RandomState, int}
+ random_state : {numpy.random.RandomState, int}, optional
 Can be a np.random.RandomState object, or the seed to generate one. Used for reproducibility of results.
- lda_inference_max_iter : int
+ lda_inference_max_iter : int, optional
 Maximum number of iterations in the inference step of the LDA training.
- en_min_iter : int
+ em_min_iter : int, optional
 Minimum number of iterations until convergence of the Expectation-Maximization algorithm.
- en_max_iter : int
+ em_max_iter : int, optional
 Maximum number of iterations until convergence of the Expectation-Maximization algorithm.
- chunksize : int
+ chunksize : int, optional
 Number of documents in the corpus to be processed in a chunk.

 """
@@ -125,7 +122,7 @@ def fit(self, X, y=None):
 return self

 def transform(self, docs):
- """
+ """Infer the topic distribution for `docs`.
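# Illustrative sketch (not part of the patch itself): the `time_slice` and
# `initialize` parameters documented in PATCH 36 above, assuming gensim's
# test corpus. With a single slice the model degenerates to one LDA period.
from gensim.test.utils import common_corpus, common_dictionary
from gensim.sklearn_api.ldaseqmodel import LdaSeqTransformer

model = LdaSeqTransformer(
    id2word=common_dictionary,
    num_topics=2,
    time_slice=[len(common_corpus)],  # all documents fall into one time-slice
    initialize='gensim',              # use gensim's own LDA initialization
)
docvecs = model.fit_transform(common_corpus)
assert docvecs.shape == (len(common_corpus), 2)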
Parameters ---------- From 5325d05143b7799982aafec1fb6cab959a98ca9f Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 15 Mar 2018 11:34:50 +0500 Subject: [PATCH 37/45] small fixes (again) --- gensim/sklearn_api/d2vmodel.py | 1 - gensim/sklearn_api/hdp.py | 1 - gensim/sklearn_api/ldamodel.py | 1 - gensim/sklearn_api/phrases.py | 1 - gensim/sklearn_api/rpmodel.py | 1 - gensim/sklearn_api/tfidf.py | 1 - gensim/sklearn_api/w2vmodel.py | 1 - 7 files changed, 7 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index ded9527d8c..396ac3287b 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -10,7 +10,6 @@ Examples -------- - >>> from gensim.test.utils import common_texts >>> from gensim.sklearn_api import D2VTransformer >>> diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py index 1efa770486..b33b9ac862 100644 --- a/gensim/sklearn_api/hdp.py +++ b/gensim/sklearn_api/hdp.py @@ -11,7 +11,6 @@ Examples -------- - >>> from gensim.test.utils import common_dictionary, common_corpus >>> from gensim.sklearn_api import HdpTransformer >>> diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py index 3dc8ff2bc4..ff2ae94cbc 100644 --- a/gensim/sklearn_api/ldamodel.py +++ b/gensim/sklearn_api/ldamodel.py @@ -11,7 +11,6 @@ Examples -------- - >>> from gensim.test.utils import common_corpus, common_dictionary >>> from gensim.sklearn_api import LdaTransformer >>> diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index 2d56f2cc9c..9900a060db 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -10,7 +10,6 @@ Examples -------- - >>> from gensim.sklearn_api.phrases import PhrasesTransformer >>> >>> # Create the model. Make sure no term is ignored and combinations seen 3+ times are captured. diff --git a/gensim/sklearn_api/rpmodel.py b/gensim/sklearn_api/rpmodel.py index 70a30b5545..1ba1765977 100644 --- a/gensim/sklearn_api/rpmodel.py +++ b/gensim/sklearn_api/rpmodel.py @@ -11,7 +11,6 @@ Examples -------- - >>> from gensim.sklearn_api.rpmodel import RpTransformer >>> from gensim.test.utils import common_dictionary, common_corpus >>> diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index a1d474b23d..ee2bd5d5db 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -10,7 +10,6 @@ Examples -------- - >>> from gensim.test.utils import common_corpus, common_dictionary >>> from gensim.sklearn_api import TfIdfTransformer >>> diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 5b05b4b0b7..966307a349 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -12,7 +12,6 @@ Examples -------- - >>> from gensim.test.utils import common_texts >>> from gensim.sklearn_api import W2VTransformer >>> From b250ca45d99760996c27712d1a311ffbf93f7cc9 Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 15 Mar 2018 11:43:59 +0500 Subject: [PATCH 38/45] fix lsimodel --- gensim/sklearn_api/lsimodel.py | 36 +++++++++++++++------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py index a7838f7699..858a39e0fe 100644 --- a/gensim/sklearn_api/lsimodel.py +++ b/gensim/sklearn_api/lsimodel.py @@ -5,7 +5,7 @@ # Copyright (C) 2017 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Scikit learn interface for `gensim.models.lsimodel`. 
+"""Scikit learn interface for :class:`gensim.models.lsimodel.LsiModel`. Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. @@ -30,7 +30,6 @@ >>> score = pipe.fit(common_corpus, labels).score(common_corpus, labels) """ - import numpy as np from scipy import sparse from sklearn.base import TransformerMixin, BaseEstimator @@ -43,31 +42,30 @@ class LsiTransformer(TransformerMixin, BaseEstimator): """Base LSI module, wraps :class:`~gensim.model.lsimodel.LsiModel`. - For more information on the inner working please take a look at - the original class. + For more information on the inner working please take a look at the original class. """ def __init__(self, num_topics=200, id2word=None, chunksize=20000, decay=1.0, onepass=True, power_iters=2, extra_samples=100): - """Sklearn wrapper for LSI model. + """ Parameters ---------- num_topics : int, optional - Number of requested factors (latent dimensions) - id2word : dict of {int: str}, optional + Number of requested factors (latent dimensions). + id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional ID to word mapping, optional. chunksize : int, optional Number of documents to be used in each training chunk. decay : float, optional Weight of existing observations relatively to new ones. onepass : bool, optional - Whether the one-pass algorithm should be used for training. - Pass `False` to force a multi-pass stochastic algorithm. + Whether the one-pass algorithm should be used for training, pass `False` to force a + multi-pass stochastic algorithm. power_iters: int, optional Number of power iteration steps to be used. - Increasing the number of power iterations improves accuracy, but lowers performance + Increasing the number of power iterations improves accuracy, but lowers performance. extra_samples : int, optional Extra samples to be used besides the rank `k`. Can improve accuracy. @@ -82,14 +80,12 @@ def __init__(self, num_topics=200, id2word=None, chunksize=20000, self.power_iters = power_iters def fit(self, X, y=None): - """ - Fit the model according to the given training data. - Calls :meth:`~gensim.models.lsimodel.LsiModel` + """Fit the model according to the given training data. Parameters ---------- - X : iterable of iterable of (int, float) - Stream of document vectors or sparse matrix of shape: [num_terms, num_documents]. + X : {iterable of list of (int, number), scipy.sparse matrix} + A collection of documents in BOW format to be transformed. Returns ------- @@ -109,12 +105,12 @@ def fit(self, X, y=None): return self def transform(self, docs): - """Computes the topic distribution matrix + """Computes the latent factors for `docs`. Parameters ---------- - docs : iterable of iterable of (int, float) - Stream of document vectors or sparse matrix of shape: [`num_terms`, num_documents]. + docs : {iterable of list of (int, number), list of (int, number), scipy.sparse matrix} + Document or collection of documents in BOW format to be transformed. Returns ------- @@ -144,8 +140,8 @@ def partial_fit(self, X): Parameters ---------- - X : iterable of iterable of (int, float) - Stream of document vectors or sparse matrix of shape: [num_terms, num_documents]. + X : {iterable of list of (int, number), scipy.sparse matrix} + Stream of document vectors or sparse matrix of shape: [`num_terms`, `num_documents`]. 
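# Illustrative sketch (not part of the patch itself): the incremental training
# path of `partial_fit` documented in PATCH 38 above -- the first call
# initializes the model, later calls fold in further document batches.
# Assumes gensim's bundled test corpus; the batch split is arbitrary.
from gensim.test.utils import common_corpus, common_dictionary
from gensim.sklearn_api import LsiTransformer

model = LsiTransformer(num_topics=2, id2word=common_dictionary)
model.partial_fit(common_corpus[:4])  # initializes and trains on the first batch
model.partial_fit(common_corpus[4:])  # further trains the already-fitted model
vecs = model.transform(common_corpus)
assert vecs.shape == (len(common_corpus), 2)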
Returns ------- From 3fc3beffac5e9a0e66700c617b4ce66c3f72197a Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 15 Mar 2018 13:05:06 +0500 Subject: [PATCH 39/45] fix phrases --- gensim/sklearn_api/phrases.py | 67 ++++++++++++++++------------------- 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index 9900a060db..ee6960b217 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -4,7 +4,7 @@ # Copyright (C) 2011 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Scikit learn interface for `gensim.models.phrases`. +"""Scikit learn interface for `gensim.models.phrases.Phrases`. Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn. @@ -22,11 +22,9 @@ >>> >>> # Use sklearn fit_transform to see the transformation. >>> # Since computer and science were seen together 3+ times they are considered a phrase. ->>> m.fit_transform(texts)[0] -['I', 'love', 'computer_science'] +>>> assert ['I', 'love', 'computer_science'] == m.fit_transform(texts)[0] """ - from six import string_types from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError @@ -35,59 +33,56 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator): - """Base Phrases module + """Base Phrases module, wraps :class:`~gensim.models.phrases.Phrases`. - Wraps :class:`~gensim.models.phrases.Phrases`. - For more information on the inner workings please take a look at - the original class. + For more information on the inner workings please take a look at the original class. """ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, delimiter=b'_', progress_per=10000, scoring='default'): - """Sklearn wrapper for Phrases model. + """ Parameters ---------- - min_count : int + min_count : int, optional Terms with a count lower than this will be ignored - threshold : float + threshold : float, optional Only phrases scoring above this will be accepted, see `scoring` below. - max_vocab_size : int - Maximum size of the vocabulary. - Used to control pruning of less common words, to keep memory under control. - The default of 40M needs about 3.6GB of RAM; - delimiter : str - Character used to join collocation tokens. Should be a byte string (e.g. b'_'). - progress_per : int + max_vocab_size : int, optional + Maximum size of the vocabulary. Used to control pruning of less common words, to keep memory under control. + The default of 40M needs about 3.6GB of RAM. + delimiter : str, optional + Character used to join collocation tokens, should be a byte string (e.g. b'_'). + progress_per : int, optional Training will report to the logger every that many phrases are learned. - scoring : str or callable + scoring : str or function, optional Specifies how potential phrases are scored for comparison to the `threshold` setting. `scoring` can be set with either a string that refers to a built-in scoring function, or with a function with the expected parameter names. Two built-in scoring functions are available by setting `scoring` to a string: - 'default': Explained in `Mikolov, et. al: "Efficient Estimation of Word Representations in Vector Space" - `_. - 'npmi': Explained in `Gerlof Bouma: "Normalized (Pointwise) Mutual Information in Collocation Extraction" - `_. + * 'default': Explained in `Mikolov, et. al: "Efficient Estimation of Word Representations + in Vector Space" `_. 
+ * 'npmi': Explained in `Gerlof Bouma: "Normalized (Pointwise) Mutual Information in Collocation
+ Extraction" `_.
 'npmi' is more robust when dealing with common words that form part of common bigrams, and
 ranges from -1 to 1, but is slower to calculate than the default.

 To use a custom scoring function, create a function with the following parameters and set the `scoring`
- parameter to the custom function. You must use all the parameters in your function call, even if the
- function does not require all the parameters.
+ parameter to the custom function, see :func:`~gensim.models.phrases.original_scorer` as example.
+ You must define all the parameters (though you may use only some of them):

- worda_count: number of occurrances in `sentences` of the first token in the phrase being scored
- wordb_count: number of occurrances in `sentences` of the second token in the phrase being scored
- bigram_count: number of occurrances in `sentences` of the phrase being scored
- len_vocab: the number of unique tokens in `sentences`
- min_count: the `min_count` setting of the Phrases class
- corpus_word_count: the total number of (non-unique) tokens in `sentences`
+ * worda_count: number of occurrences in `sentences` of the first token in the phrase being scored
+ * wordb_count: number of occurrences in `sentences` of the second token in the phrase being scored
+ * bigram_count: number of occurrences in `sentences` of the phrase being scored
+ * len_vocab: the number of unique tokens in `sentences`
+ * min_count: the `min_count` setting of the Phrases class
+ * corpus_word_count: the total number of (non-unique) tokens in `sentences`

 A scoring function without any of these parameters (even if the parameters are not used) will
- raise a ValueError on initialization of the Phrases class. The scoring function must be pic
+ raise a ValueError on initialization of the Phrases class. The scoring function must be pickleable.

 """
 self.gensim_model = None
@@ -122,12 +117,12 @@ def fit(self, X, y=None):

 def transform(self, docs):
 """Transform the input documents into phrase tokens.

- Words in the sentence will be joined by u`_`.
+ Words in the sentence will be joined by `self.delimiter`.

 Parameters
 ----------
- docs : iterable of list of str
- Sequence of sentences to be used transformed.
+ docs : {iterable of list of str, list of str}
+ Sequence of documents to be transformed.

 Returns
 -------
 list of (str or list of str)
 Phrase representation for each of the input sentences.

 """
-
 if self.gensim_model is None:
 raise NotFittedError(
 "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
 )
@@ -164,7 +158,6 @@ def partial_fit(self, X):
 The trained model.

 """
-
 if self.gensim_model is None:
 self.gensim_model = models.Phrases(
 sentences=X, min_count=self.min_count, threshold=self.threshold,

From dc9f659cc85b285fadeda97ff52d15fdd1ae3046 Mon Sep 17 00:00:00 2001
From: ivan 
Date: Thu, 15 Mar 2018 13:16:57 +0500
Subject: [PATCH 40/45] fix rpmodel

---
 gensim/sklearn_api/rpmodel.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/gensim/sklearn_api/rpmodel.py b/gensim/sklearn_api/rpmodel.py
index 1ba1765977..cc2adaf6a1 100644
--- a/gensim/sklearn_api/rpmodel.py
+++ b/gensim/sklearn_api/rpmodel.py
@@ -33,19 +33,18 @@

 class RpTransformer(TransformerMixin, BaseEstimator):
 """Base Word2Vec module, wraps :class:`~gensim.models.rpmodel.RpModel`.

- For more information on the inner workings please take a look at
- the original class.
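# Illustrative sketch (not part of the patch itself): the custom scoring
# contract documented in PATCH 39 above. `count_scorer` is a hypothetical
# name; the function must live at module level (so it stays pickleable) and
# must accept all six parameters, even the ones it ignores.
from gensim.sklearn_api.phrases import PhrasesTransformer

def count_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
    # Rank candidate phrases purely by how often the pair was seen together.
    return bigram_count

texts = [['data', 'science'], ['data', 'science', 'rocks'], ['data', 'science', 'is', 'fun']]
m = PhrasesTransformer(min_count=1, threshold=2, scoring=count_scorer)
print(m.fit_transform(texts)[0])  # expected: ['data_science'], since the pair scores 3 > threshold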
+ For more information on the inner workings please take a look at the original class.
 """

 def __init__(self, id2word=None, num_topics=300):
- """Sklearn wrapper for Random Projections model.
+ """

 Parameters
 ----------
- id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
+ id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
 Mapping `token_id` -> `token`, will be determined from corpus if `id2word == None`.
 num_topics : int, optional
- Number of topics.
+ Number of dimensions.

 """
 self.gensim_model = None
@@ -57,7 +56,7 @@ def fit(self, X, y=None):

 Parameters
 ----------
- X : iterable of iterable of (int, int)
+ X : iterable of list of (int, number)
 Input corpus in BOW format.

 Returns
@@ -70,11 +69,11 @@ def fit(self, X, y=None):
 return self

 def transform(self, docs):
- """Find the topic probabilities for each author.
+ """Find the Random Projection factors for `docs`.

 Parameters
 ----------
- docs : iterable of iterable of (int, int)
+ docs : {iterable of iterable of (int, int), list of (int, number)}
 Documents to be transformed in BOW format.

 Returns

From 4ec46190ded5d5c78aec986ad7034c52f3357690 Mon Sep 17 00:00:00 2001
From: ivan 
Date: Thu, 15 Mar 2018 13:19:13 +0500
Subject: [PATCH 41/45] fix text2bow

---
 gensim/sklearn_api/text2bow.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py
index 872dcb4ecc..a223d2d206 100644
--- a/gensim/sklearn_api/text2bow.py
+++ b/gensim/sklearn_api/text2bow.py
@@ -18,12 +18,11 @@
 >>> # Create a transformer.
 >>> model = Text2BowTransformer()
 >>>
->>> # Use sklearn style `fit_transform` to get the BOW representation of each document.
+>>> # Use sklearn-style `fit_transform` to get the BOW representation of each document.
 >>> model.fit_transform(texts)
 [[(0, 1), (1, 1), (2, 1)], [(1, 1), (2, 1), (3, 1)]]

 """
-
 from six import string_types
 from sklearn.base import TransformerMixin, BaseEstimator
 from sklearn.exceptions import NotFittedError
@@ -35,20 +34,17 @@ class Text2BowTransformer(TransformerMixin, BaseEstimator):
 """Base Text2Bow module , wraps :class:`~gensim.corpora.dictionary.Dictionary`.

- For more information on the inner workings please take a look at
- the original class.
+ For more information on the inner workings please take a look at the original class.
 """
-
 def __init__(self, prune_at=2000000, tokenizer=tokenize):
- """Sklearn wrapper for Text2Bow model.
-
+ """
 Parameters
 ----------
 prune_at : int, optional
 Total number of unique words. Dictionary will keep not more than `prune_at` words.
 tokenizer : callable (str -> list of str), optional
- A callable to split a document into a list of each terms
+ A callable to split a document into a list of terms, default is :func:`gensim.utils.tokenize`.
 """
 self.gensim_model = None
@@ -74,11 +70,11 @@ def fit(self, X, y=None):
 return self

 def transform(self, docs):
- """Return the BOW format for the input documents.
+ """Get the BOW format for `docs`.

 Parameters
 ----------
- docs : iterable of str
+ docs : {iterable of str, str}
 A collection of documents to be transformed.
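# Illustrative sketch (not part of the patch itself): swapping in a custom
# `tokenizer`, as documented in PATCH 41 above. `whitespace_tokenizer` is a
# hypothetical replacement for the default gensim.utils.tokenize.
from gensim.sklearn_api import Text2BowTransformer

def whitespace_tokenizer(doc):
    # Lowercase and split on whitespace only.
    return doc.lower().split()

model = Text2BowTransformer(prune_at=1000, tokenizer=whitespace_tokenizer)
bows = model.fit_transform(['A B B', 'B C'])
print(bows)  # e.g. [[(0, 1), (1, 2)], [(1, 1), (2, 1)]] -- ids depend on the fitted dictionary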
Returns From 36a263a0e559cef80c369bb1ed9bb562abf373de Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 15 Mar 2018 13:21:50 +0500 Subject: [PATCH 42/45] fix tfidf --- gensim/sklearn_api/tfidf.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index ee2bd5d5db..bec2667796 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -15,13 +15,9 @@ >>> >>> # Transform the word counts inversely to their global frequency using the sklearn interface. >>> model = TfIdfTransformer(dictionary=common_dictionary) ->>> weighted_corpus = model.fit_transform(common_corpus) ->>> weighted_corpus[0] -[(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)] +>>> tfidf_corpus = model.fit_transform(common_corpus) """ - - from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError @@ -32,15 +28,13 @@ class TfIdfTransformer(TransformerMixin, BaseEstimator): """Base TfIdf module, wraps :class:`~gensim.models.tfidfmodel.TfidfModel`. - For more information on the inner workings please take a look at - the original class. + For more information on the inner workings please take a look at the original class. """ - def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, wglobal=gensim.models.tfidfmodel.df2idf, normalize=True, smartirs="ntc", pivot=None, slope=0.65): - """Sklearn wrapper for TfIdf model. + """ Parameters ---------- @@ -81,6 +75,18 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, * `c` - cosine. For more info, visit `"Wikipedia" `_. + pivot : float, optional + It is the point around which the regular normalization curve is `tilted` to get the new pivoted + normalization curve. In the paper `Amit Singhal, Chris Buckley, Mandar Mitra: + "Pivoted Document Length Normalization" `_ it is the point where the + retrieval and relevance curves intersect. + This parameter along with slope is used for pivoted document length normalization. + Only when `pivot` is not None pivoted document length normalization will be applied else regular TfIdf + is used. + slope : float, optional + It is the parameter required by pivoted document length normalization which determines the slope to which + the `old normalization` can be tilted. This parameter only works when pivot is defined by user and is not + None. """ self.gensim_model = None @@ -115,12 +121,12 @@ def fit(self, X, y=None): return self def transform(self, docs): - """Get the transformed documents after multiplication with the tf-idf matrix. + """Get the tf-idf scores in BoW representation for `docs` Parameters ---------- - docs: iterable of iterable of (int, int) - Input corpus in BoW format. + docs: {iterable of list of (int, number), list of (int, number)} + Document or corpus in BoW format. Returns ------- From ae4a5b46c9fd12286f344829f8729138678a42ca Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 15 Mar 2018 13:23:03 +0500 Subject: [PATCH 43/45] fix word2vec --- gensim/sklearn_api/w2vmodel.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 966307a349..1230f5775f 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -36,16 +36,13 @@ class W2VTransformer(TransformerMixin, BaseEstimator): """Base Word2Vec module, wraps :class:`~gensim.models.word2vec.Word2Vec`. 
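# Illustrative sketch (not part of the patch itself): the pivoted document
# length normalization documented in PATCH 42 above. The pivot/slope values
# here are arbitrary demonstration choices, not tuned recommendations.
from gensim.test.utils import common_corpus, common_dictionary
from gensim.sklearn_api import TfIdfTransformer

plain = TfIdfTransformer(dictionary=common_dictionary)  # regular tf-idf: pivot stays None
pivoted = TfIdfTransformer(dictionary=common_dictionary, pivot=10, slope=0.25)  # pivoted normalization

print(plain.fit_transform(common_corpus)[0])
print(pivoted.fit_transform(common_corpus)[0])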

- For more information on the inner workings please take a look at
- the original class.
+ For more information on the inner workings please take a look at the original class.
 """

 def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1,
 workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
 trim_rule=None, sorted_vocab=1, batch_words=10000):
 """
- Initialize the model from an iterable of `sentences`. Each sentence is a
- list of words (unicode strings) that will be used for training.

 Parameters
 ----------
@@ -107,7 +104,6 @@ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=
 texts are longer than 10000 words, but the standard cython code truncates to that maximum.)

 """
-
 self.gensim_model = None
 self.size = size
 self.alpha = alpha
@@ -157,12 +153,12 @@ def fit(self, X, y=None):
 return self

 def transform(self, words):
- """Return the word vectors the input words.
+ """Get the word vectors for the input words.

 Parameters
 ----------
- words : iterable of str
- A collection of words to be transformed.
+ words : {iterable of str, str}
+ Word or a collection of words to be transformed.

 Returns
 -------

From 0ad6580e0dc2d5ed2139e770f53d40b2a6da6f5a Mon Sep 17 00:00:00 2001
From: ivan 
Date: Thu, 15 Mar 2018 13:43:43 +0500
Subject: [PATCH 44/45] cleanup

---
 gensim/sklearn_api/atmodel.py | 1 -
 gensim/sklearn_api/d2vmodel.py | 7 +++----
 gensim/sklearn_api/hdp.py | 8 +++-----
 gensim/sklearn_api/ldamodel.py | 7 +++----
 gensim/sklearn_api/ldaseqmodel.py | 5 ++---
 gensim/sklearn_api/lsimodel.py | 4 ++--
 gensim/sklearn_api/phrases.py | 5 +++--
 gensim/sklearn_api/rpmodel.py | 4 ++--
 gensim/sklearn_api/text2bow.py | 2 +-
 gensim/sklearn_api/tfidf.py | 2 +-
 gensim/sklearn_api/w2vmodel.py | 5 ++---
 11 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/gensim/sklearn_api/atmodel.py b/gensim/sklearn_api/atmodel.py
index b21e1818cd..69397833c0 100644
--- a/gensim/sklearn_api/atmodel.py
+++ b/gensim/sklearn_api/atmodel.py
@@ -39,7 +39,6 @@

 class AuthorTopicTransformer(TransformerMixin, BaseEstimator):
 """Base Author Topic module, wraps :class:`~gensim.models.atmodel.AuthorTopicModel`.

- For more information on the inner workings please take a look at the original class.
 The model's internal workings are heavily based on `"The Author-Topic Model for Authors and Documents",
 Rosen-Zvi et al. 2004 `_.

diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py
index 396ac3287b..fbc8375e5a 100644
--- a/gensim/sklearn_api/d2vmodel.py
+++ b/gensim/sklearn_api/d2vmodel.py
@@ -27,13 +27,12 @@


 class D2VTransformer(TransformerMixin, BaseEstimator):
- """Base Dov2Vec module, wraps :class:`~gensim.models.doc2vec.Doc2Vec`.

- For more information on the inner workings please take a look at
- the original class.
+ """Base Doc2Vec module, wraps :class:`~gensim.models.doc2vec.Doc2Vec`.

+ This model is based on `Quoc Le, Tomas Mikolov: "Distributed Representations of Sentences and Documents"
+ `_.
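# Illustrative sketch (not part of the patch itself): the wrapper touched by
# PATCH 43 above. Note that `transform` only accepts words that survived
# `min_count` pruning during `fit`; out-of-vocabulary words raise a KeyError.
from gensim.test.utils import common_texts
from gensim.sklearn_api import W2VTransformer

model = W2VTransformer(size=10, min_count=1, seed=1)
model.fit(common_texts)
wordvecs = model.transform(['graph', 'system'])  # in-vocabulary words only
assert wordvecs.shape == (2, 10)                 # one 10-dimensional vector per word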
""" - def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, diff --git a/gensim/sklearn_api/hdp.py b/gensim/sklearn_api/hdp.py index b33b9ac862..f62b46d9c5 100644 --- a/gensim/sklearn_api/hdp.py +++ b/gensim/sklearn_api/hdp.py @@ -31,13 +31,11 @@ class HdpTransformer(TransformerMixin, BaseEstimator): """Base HDP module, wraps :class:`~gensim.models.hdpmodel.HdpModel`. - For more information on the inner workings please take a look at - the original class. The inner workings of this class heavily depends on `Wang, Paisley, Blei: "Online Variational + The inner workings of this class heavily depends on `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical Dirichlet Process, JMLR (2011)" `_. """ - def __init__(self, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, outputdir=None, random_state=None): """ @@ -107,7 +105,7 @@ def fit(self, X, y=None): Parameters ---------- - X : {iterable of iterable of (int, int), scipy.sparse matrix} + X : {iterable of list of (int, number), scipy.sparse matrix} A collection of documents in BOW format used for training the model. Returns @@ -174,7 +172,7 @@ def partial_fit(self, X): Parameters ---------- - X : {iterable of iterable of (int, int), scipy.sparse matrix} + X : {iterable of list of (int, number), scipy.sparse matrix} A collection of documents in BOW format used for training the model. Returns diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py index ff2ae94cbc..342b8825ff 100644 --- a/gensim/sklearn_api/ldamodel.py +++ b/gensim/sklearn_api/ldamodel.py @@ -19,7 +19,6 @@ >>> docvecs = model.fit_transform(common_corpus) """ - import numpy as np from scipy import sparse from sklearn.base import TransformerMixin, BaseEstimator @@ -34,11 +33,11 @@ class LdaTransformer(TransformerMixin, BaseEstimator): For more information on the inner workings please take a look at the original class. The inner workings of this class depends heavily on `Matthew D. Hoffman, David M. Blei, Francis Bach: - "Online Learning for Latent Dirichlet Allocation NIPS'10" - `_. + "Online Learning for Latent Dirichlet Allocation NIPS'10" `_ and + `David M. Blei, Andrew Y. Ng, Michael I. Jordan: "Latent Dirichlet Allocation" + `_. """ - def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None, scorer='perplexity', dtype=np.float32): diff --git a/gensim/sklearn_api/ldaseqmodel.py b/gensim/sklearn_api/ldaseqmodel.py index ab33ee2b7c..277578e55a 100644 --- a/gensim/sklearn_api/ldaseqmodel.py +++ b/gensim/sklearn_api/ldaseqmodel.py @@ -36,7 +36,6 @@ class LdaSeqTransformer(TransformerMixin, BaseEstimator): `_. 
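# Illustrative sketch (not part of the patch itself): the Doc2Vec wrapper
# documented in PATCH 44 above -- it consumes tokenized texts directly and
# yields one fixed-size vector per document. Assumes gensim's test texts.
from gensim.test.utils import common_texts
from gensim.sklearn_api import D2VTransformer

model = D2VTransformer(min_count=1, size=5, seed=1)
docvecs = model.fit_transform(common_texts)  # documents are tagged internally during `fit`
assert docvecs.shape == (len(common_texts), 5)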
""" - def __init__(self, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100): @@ -103,7 +102,7 @@ def fit(self, X, y=None): Parameters ---------- - X : {iterable of iterable of (int, int), scipy.sparse matrix} + X : {iterable of list of (int, number), scipy.sparse matrix} A collection of documents in BOW format used for training the model. Returns @@ -126,7 +125,7 @@ def transform(self, docs): Parameters ---------- - docs : {iterable of iterable of (int, int), scipy.sparse matrix} + docs : {iterable of list of (int, number), scipy.sparse matrix} A collection of documents in BOW format to be transformed. Returns diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py index 858a39e0fe..c288918f75 100644 --- a/gensim/sklearn_api/lsimodel.py +++ b/gensim/sklearn_api/lsimodel.py @@ -42,10 +42,10 @@ class LsiTransformer(TransformerMixin, BaseEstimator): """Base LSI module, wraps :class:`~gensim.model.lsimodel.LsiModel`. - For more information on the inner working please take a look at the original class. + For more information please have a look to `Latent semantic analysis + `_. """ - def __init__(self, num_topics=200, id2word=None, chunksize=20000, decay=1.0, onepass=True, power_iters=2, extra_samples=100): """ diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index ee6960b217..d960b29efa 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -35,10 +35,11 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator): """Base Phrases module, wraps :class:`~gensim.models.phrases.Phrases`. - For more information on the inner workings please take a look at the original class. + For more information, please have a look to `Mikolov, et. al: "Efficient Estimation of Word Representations in + Vector Space" `_ and "Normalized (Pointwise) Mutual Information in Collocation + Extraction" `_. """ - def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, delimiter=b'_', progress_per=10000, scoring='default'): """ diff --git a/gensim/sklearn_api/rpmodel.py b/gensim/sklearn_api/rpmodel.py index cc2adaf6a1..323aee6a34 100644 --- a/gensim/sklearn_api/rpmodel.py +++ b/gensim/sklearn_api/rpmodel.py @@ -33,7 +33,7 @@ class RpTransformer(TransformerMixin, BaseEstimator): """Base Word2Vec module, wraps :class:`~gensim.models.rpmodel.RpModel`. - For more information on the inner workings please take a look at the original class. + For more information please have a look to `Random projection `_. """ def __init__(self, id2word=None, num_topics=300): @@ -74,7 +74,7 @@ def transform(self, docs): Parameters ---------- docs : {iterable of iterable of (int, int), list of (int, number)} - Documents to be transformed in BOW format. + Document or documents to be transformed in BOW format. Returns ------- diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py index a223d2d206..dd6b555c7b 100644 --- a/gensim/sklearn_api/text2bow.py +++ b/gensim/sklearn_api/text2bow.py @@ -34,7 +34,7 @@ class Text2BowTransformer(TransformerMixin, BaseEstimator): """Base Text2Bow module , wraps :class:`~gensim.corpora.dictionary.Dictionary`. - For more information on the inner workings please take a look at the original class. + For more information please have a look to `Bag-of-words model `_. 
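# Illustrative sketch (not part of the patch itself): the random-projection
# wrapper whose reference PATCH 44 above fixes -- it maps BOW vectors onto a
# small number of randomly generated dimensions. Assumes gensim's test corpus.
from gensim.test.utils import common_corpus, common_dictionary
from gensim.sklearn_api.rpmodel import RpTransformer

model = RpTransformer(id2word=common_dictionary, num_topics=2)
projected = model.fit_transform(common_corpus)  # one 2-dimensional row per document
assert projected.shape == (len(common_corpus), 2)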
""" def __init__(self, prune_at=2000000, tokenizer=tokenize): diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index bec2667796..f8d1615e8d 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -28,7 +28,7 @@ class TfIdfTransformer(TransformerMixin, BaseEstimator): """Base TfIdf module, wraps :class:`~gensim.models.tfidfmodel.TfidfModel`. - For more information on the inner workings please take a look at the original class. + For more information please have a look to `tf-idf `_. """ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 1230f5775f..8ef2ef18d1 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -23,8 +23,6 @@ >>> assert wordvecs.shape == (2, 10) """ - - import numpy as np import six from sklearn.base import TransformerMixin, BaseEstimator @@ -36,7 +34,8 @@ class W2VTransformer(TransformerMixin, BaseEstimator): """Base Word2Vec module, wraps :class:`~gensim.models.word2vec.Word2Vec`. - For more information on the inner workings please take a look at the original class. + For more information please have a look to `Tomas Mikolov, Kai Chen, Greg Corrado, Jeffrey Dean: "Efficient + Estimation of Word Representations in Vector Space" `_. """ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, From 8a45bef3432e37f6ab5cb7f85e312b66bebb1e49 Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 15 Mar 2018 13:49:33 +0500 Subject: [PATCH 45/45] cleanup[2] --- gensim/sklearn_api/ldamodel.py | 1 - gensim/sklearn_api/lsimodel.py | 2 +- gensim/sklearn_api/phrases.py | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/gensim/sklearn_api/ldamodel.py b/gensim/sklearn_api/ldamodel.py index 342b8825ff..3e5d65dcc6 100644 --- a/gensim/sklearn_api/ldamodel.py +++ b/gensim/sklearn_api/ldamodel.py @@ -31,7 +31,6 @@ class LdaTransformer(TransformerMixin, BaseEstimator): """Base LDA module, wraps :class:`~gensim.models.ldamodel.LdaModel`. - For more information on the inner workings please take a look at the original class. The inner workings of this class depends heavily on `Matthew D. Hoffman, David M. Blei, Francis Bach: "Online Learning for Latent Dirichlet Allocation NIPS'10" `_ and `David M. Blei, Andrew Y. Ng, Michael I. Jordan: "Latent Dirichlet Allocation" diff --git a/gensim/sklearn_api/lsimodel.py b/gensim/sklearn_api/lsimodel.py index c288918f75..709c911fa4 100644 --- a/gensim/sklearn_api/lsimodel.py +++ b/gensim/sklearn_api/lsimodel.py @@ -40,7 +40,7 @@ class LsiTransformer(TransformerMixin, BaseEstimator): - """Base LSI module, wraps :class:`~gensim.model.lsimodel.LsiModel`. + """Base LSI module, wraps :class:`~gensim.models.lsimodel.LsiModel`. For more information please have a look to `Latent semantic analysis `_. diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index d960b29efa..7579a09cc9 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -36,8 +36,8 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator): """Base Phrases module, wraps :class:`~gensim.models.phrases.Phrases`. For more information, please have a look to `Mikolov, et. al: "Efficient Estimation of Word Representations in - Vector Space" `_ and "Normalized (Pointwise) Mutual Information in Collocation - Extraction" `_. 
+ Vector Space" `_ and `Gerlof Bouma: "Normalized (Pointwise) Mutual Information + in Collocation Extraction" `_. """ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
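# Illustrative sketch (not part of the patch series, appended after the last
# hunk): how the transformers documented above compose into one scikit-learn
# pipeline, assuming plain text documents as input and default dictionaries.
from sklearn.pipeline import Pipeline
from gensim.sklearn_api import Text2BowTransformer, TfIdfTransformer, LsiTransformer

pipe = Pipeline([
    ('bow', Text2BowTransformer()),         # str -> BOW
    ('tfidf', TfIdfTransformer()),          # BOW -> tf-idf weighted BOW
    ('lsi', LsiTransformer(num_topics=2)),  # weighted BOW -> 2-dimensional dense vectors
])
vecs = pipe.fit_transform(['human computer interaction', 'graph of trees'])
assert vecs.shape == (2, 2)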