From 2ee6ab85292971ccdcbaccbff23606a25be27597 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Wed, 28 Feb 2018 21:02:56 +0530 Subject: [PATCH 01/41] fix 'iter' and 'size' warnings --- gensim/sklearn_api/d2vmodel.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 1e3bf61d7d..f4c7afb662 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -9,6 +9,7 @@ Follows scikit-learn API conventions """ +import warnings import numpy as np from six import string_types from sklearn.base import TransformerMixin, BaseEstimator @@ -26,10 +27,18 @@ class D2VTransformer(TransformerMixin, BaseEstimator): def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, - hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000): + hashfxn=hash, iter=5,sorted_vocab=1, batch_words=10000 ,**kwargs): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. """ + if 'iter' in kwargs: + warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") + kwargs['epochs'] = kwargs['iter'] + + if 'size' in kwargs: + warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") + kwargs['vector_size'] = kwargs['size'] + self.gensim_model = None self.dm_mean = dm_mean self.dm = dm @@ -42,7 +51,9 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.trim_rule = trim_rule # attributes associated with gensim.models.Word2Vec + self.size = size + self.vector_size = vector_size self.alpha = alpha self.window = window self.min_count = min_count @@ -56,6 +67,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn self.iter = iter + self.epochs = epochs self.sorted_vocab = sorted_vocab self.batch_words = batch_words @@ -72,11 +84,11 @@ def fit(self, X, y=None): documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, size=self.size, alpha=self.alpha, window=self.window, + trim_rule=self.trim_rule, size=self.size, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, - iter=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words + iter=self.iter,epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self From 6301be686c57dac26cfd582ffe2a11b0d0e38e66 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Wed, 28 Feb 2018 21:13:34 +0530 Subject: [PATCH 02/41] Revert "fix 'iter' and 'size' warnings" --- gensim/sklearn_api/d2vmodel.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index f4c7afb662..1e3bf61d7d 100644 --- 
a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -9,7 +9,6 @@ Follows scikit-learn API conventions """ -import warnings import numpy as np from six import string_types from sklearn.base import TransformerMixin, BaseEstimator @@ -27,18 +26,10 @@ class D2VTransformer(TransformerMixin, BaseEstimator): def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, - hashfxn=hash, iter=5,sorted_vocab=1, batch_words=10000 ,**kwargs): + hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. """ - if 'iter' in kwargs: - warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - kwargs['epochs'] = kwargs['iter'] - - if 'size' in kwargs: - warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - kwargs['vector_size'] = kwargs['size'] - self.gensim_model = None self.dm_mean = dm_mean self.dm = dm @@ -51,9 +42,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.trim_rule = trim_rule # attributes associated with gensim.models.Word2Vec - self.size = size - self.vector_size = vector_size self.alpha = alpha self.window = window self.min_count = min_count @@ -67,7 +56,6 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn self.iter = iter - self.epochs = epochs self.sorted_vocab = sorted_vocab self.batch_words = batch_words @@ -84,11 +72,11 @@ def fit(self, X, y=None): documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, size=self.size, vector_size=self.vector_size, alpha=self.alpha, window=self.window, + trim_rule=self.trim_rule, size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, - iter=self.iter,epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words + iter=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self From 5b9d2ffe321ca3ac89409807a6300aa53929978a Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Wed, 28 Feb 2018 21:22:03 +0530 Subject: [PATCH 03/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 317842ee07..9bc89c6a88 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -10,6 +10,7 @@ Follows scikit-learn API conventions """ +import warnings import numpy as np import six from sklearn.base import TransformerMixin, BaseEstimator @@ -25,12 +26,21 @@ class W2VTransformer(TransformerMixin, BaseEstimator): def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, 
sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=10000): + trim_rule=None, sorted_vocab=1, batch_words=10000,**kwargs): """ Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details. """ + if iter is not None: + warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") + kwargs['epochs'] = iter + + if size is not None: + warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") + kwargs['vector_size'] = size + self.gensim_model = None self.size = size + self.vector_size = kwargs['vector_size'] self.alpha = alpha self.window = window self.min_count = min_count @@ -45,6 +55,7 @@ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size= self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn self.iter = iter + self.epochs = kwargs['epochs'] self.null_word = null_word self.trim_rule = trim_rule self.sorted_vocab = sorted_vocab @@ -56,11 +67,11 @@ def fit(self, X, y=None): Calls gensim.models.Word2Vec """ self.gensim_model = models.Word2Vec( - sentences=X, size=self.size, alpha=self.alpha, + sentences=X, size=self.size, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, - hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word, trim_rule=self.trim_rule, + hashfxn=self.hashfxn, iter=self.iter, epochs=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self From d2733ff0e116997320e0af10e9521a4bf9a7dbc8 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Wed, 28 Feb 2018 21:30:52 +0530 Subject: [PATCH 04/41] fix 'iter' and 'size' warnings --- gensim/sklearn_api/d2vmodel.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 1e3bf61d7d..d01744fea4 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -26,10 +26,19 @@ class D2VTransformer(TransformerMixin, BaseEstimator): def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, - hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000): + hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000,**kwargs): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. 
""" + + if iter is not None: + warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") + kwargs['epochs'] = iter + + if size is not None: + warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") + kwargs['vector_size'] = size + self.gensim_model = None self.dm_mean = dm_mean self.dm = dm @@ -43,6 +52,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 # attributes associated with gensim.models.Word2Vec self.size = size + self.vector_size = kwargs['vector_size'] self.alpha = alpha self.window = window self.min_count = min_count @@ -56,6 +66,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn self.iter = iter + self.epochs = kwargs['epochs'] self.sorted_vocab = sorted_vocab self.batch_words = batch_words @@ -72,11 +83,11 @@ def fit(self, X, y=None): documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, size=self.size, alpha=self.alpha, window=self.window, + trim_rule=self.trim_rule, size=self.size, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, - iter=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words + iter=self.iter, epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self From 21934e10f19ec65a3f167454aaf25de339fc69f8 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Wed, 28 Feb 2018 21:48:45 +0530 Subject: [PATCH 05/41] Fix 'iter' and 'size' warnings --- gensim/sklearn_api/d2vmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index d01744fea4..628c55783a 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -8,7 +8,7 @@ Scikit learn interface for gensim for easy use of gensim with scikit-learn Follows scikit-learn API conventions """ - +import warnings import numpy as np from six import string_types from sklearn.base import TransformerMixin, BaseEstimator From 428d428d35e2068e5738f907d0857eb60ff7f572 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Wed, 28 Feb 2018 22:27:32 +0530 Subject: [PATCH 06/41] fixed deprecated argument warnings --- gensim/sklearn_api/w2vmodel.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 9bc89c6a88..f189f07d7d 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -24,8 +24,8 @@ class W2VTransformer(TransformerMixin, BaseEstimator): Base Word2Vec module """ - def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, - workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + def __init__(self, size=None, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, + workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, null_word=0, 
trim_rule=None, sorted_vocab=1, batch_words=10000,**kwargs): """ Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details. @@ -33,10 +33,14 @@ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size= if iter is not None: warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") kwargs['epochs'] = iter + if iter is None and 'epochs' not in kwargs: + kwargs['epochs'] = 5 if size is not None: warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") kwargs['vector_size'] = size + if size is None and 'vector_size' not in kwargs: + kwargs['vector_size'] = 100 self.gensim_model = None self.size = size From 0a0e1d67b46d23c9644917aec0dc3b3bcef4d530 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Wed, 28 Feb 2018 22:30:17 +0530 Subject: [PATCH 07/41] fixed deprecated argument warnings --- gensim/sklearn_api/d2vmodel.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 628c55783a..02f34ae5c3 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -24,9 +24,9 @@ class D2VTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, - docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, + docvecs_mapfile=None, comment=None, trim_rule=None, size=None, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, - hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000,**kwargs): + hashfxn=hash, iter=None, sorted_vocab=1, batch_words=10000,**kwargs): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. 
""" @@ -34,10 +34,14 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 if iter is not None: warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") kwargs['epochs'] = iter + if iter is None and 'epochs' not in kwargs: + kwargs['epochs'] = 5 if size is not None: warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") kwargs['vector_size'] = size + if size is None and 'vector_size' not in kwargs: + kwargs['vector_size'] = 100 self.gensim_model = None self.dm_mean = dm_mean From c79bdfbe93339029a99c55a85e78451b8dcf3ada Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 10:37:36 +0530 Subject: [PATCH 08/41] Update test_sklearn_api.py --- gensim/test/test_sklearn_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index ed5516df37..7f9381bf64 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -646,7 +646,7 @@ def testModelNotFitted(self): class TestWord2VecWrapper(unittest.TestCase): def setUp(self): numpy.random.seed(0) - self.model = W2VTransformer(size=10, min_count=0, seed=42) + self.model = W2VTransformer(vector_size=10, min_count=0, seed=42) self.model.fit(texts) def testTransform(self): @@ -665,7 +665,7 @@ def testTransform(self): def testConsistencyWithGensimModel(self): # training a W2VTransformer - self.model = W2VTransformer(size=10, min_count=0, seed=42) + self.model = W2VTransformer(vector_size=10, min_count=0, seed=42) self.model.fit(texts) # training a Gensim Word2Vec model with the same params @@ -679,7 +679,7 @@ def testConsistencyWithGensimModel(self): def testPipeline(self): numpy.random.seed(0) # set fixed seed to get similar values everytime - model = W2VTransformer(size=10, min_count=1) + model = W2VTransformer(vector_size=10, min_count=1) model.fit(w2v_texts) class_dict = {'mathematics': 1, 'physics': 0} @@ -724,7 +724,7 @@ def testPersistence(self): self.assertTrue(passed) def testModelNotFitted(self): - w2vmodel_wrapper = W2VTransformer(size=10, min_count=0, seed=42) + w2vmodel_wrapper = W2VTransformer(vector_size=10, min_count=0, seed=42) word = texts[0][0] self.assertRaises(NotFittedError, w2vmodel_wrapper.transform, word) From da76d3c6b7d24df4bb55dde94fc4bb5243687e98 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 10:42:16 +0530 Subject: [PATCH 09/41] fix deprecated arguments --- gensim/test/test_sklearn_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 7f9381bf64..f28bf6817b 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -646,7 +646,7 @@ def testModelNotFitted(self): class TestWord2VecWrapper(unittest.TestCase): def setUp(self): numpy.random.seed(0) - self.model = W2VTransformer(vector_size=10, min_count=0, seed=42) + self.model = W2VTransformer(min_count=0, seed=42, vector_size=10) self.model.fit(texts) def testTransform(self): @@ -665,7 +665,7 @@ def testTransform(self): def testConsistencyWithGensimModel(self): # training a W2VTransformer - self.model = W2VTransformer(vector_size=10, min_count=0, seed=42) + self.model = W2VTransformer(min_count=0, seed=42, vector_size=10) self.model.fit(texts) # training a Gensim Word2Vec model with the same params @@ -679,7 +679,7 @@ def testConsistencyWithGensimModel(self): def 
testPipeline(self): numpy.random.seed(0) # set fixed seed to get similar values everytime - model = W2VTransformer(vector_size=10, min_count=1) + model = W2VTransformer(min_count=1,vector_size=10) model.fit(w2v_texts) class_dict = {'mathematics': 1, 'physics': 0} @@ -724,7 +724,7 @@ def testPersistence(self): self.assertTrue(passed) def testModelNotFitted(self): - w2vmodel_wrapper = W2VTransformer(vector_size=10, min_count=0, seed=42) + w2vmodel_wrapper = W2VTransformer(min_count=0, seed=42, vector_size=10) word = texts[0][0] self.assertRaises(NotFittedError, w2vmodel_wrapper.transform, word) From bbdb0d454a7627f115e5cbb15a185a187facb00e Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 22:55:08 +0530 Subject: [PATCH 10/41] fix deprecated argument warnings --- gensim/sklearn_api/w2vmodel.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index f189f07d7d..36ba96595f 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -24,27 +24,22 @@ class W2VTransformer(TransformerMixin, BaseEstimator): Base Word2Vec module """ - def __init__(self, size=None, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, - workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=10000,**kwargs): + def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, + workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=10000): """ Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details. """ if iter is not None: warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - kwargs['epochs'] = iter - if iter is None and 'epochs' not in kwargs: - kwargs['epochs'] = 5 + epochs = iter if size is not None: warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - kwargs['vector_size'] = size - if size is None and 'vector_size' not in kwargs: - kwargs['vector_size'] = 100 + vector_size = size self.gensim_model = None - self.size = size - self.vector_size = kwargs['vector_size'] + self.vector_size = vector_size self.alpha = alpha self.window = window self.min_count = min_count @@ -58,24 +53,22 @@ def __init__(self, size=None, alpha=0.025, window=5, min_count=5, max_vocab_size self.negative = negative self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn - self.iter = iter - self.epochs = kwargs['epochs'] + self.epochs = epochs self.null_word = null_word self.trim_rule = trim_rule self.sorted_vocab = sorted_vocab self.batch_words = batch_words - - def fit(self, X, y=None): + def fit(self, X, y=None): """ Fit the model according to the given training data. 
Calls gensim.models.Word2Vec """ self.gensim_model = models.Word2Vec( - sentences=X, size=self.size, vector_size=self.vector_size, alpha=self.alpha, + sentences=X, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, - hashfxn=self.hashfxn, iter=self.iter, epochs=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, + hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self From 31d3729ddc038c5a09738ca9e92ee62c71ce9d74 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 22:57:41 +0530 Subject: [PATCH 11/41] fix deprecated argument warnings --- gensim/sklearn_api/d2vmodel.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 02f34ae5c3..30ea3e0991 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -24,25 +24,21 @@ class D2VTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, - docvecs_mapfile=None, comment=None, trim_rule=None, size=None, alpha=0.025, window=5, min_count=5, + docvecs_mapfile=None, comment=None, trim_rule=None, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, - hashfxn=hash, iter=None, sorted_vocab=1, batch_words=10000,**kwargs): + hashfxn=hash, iter=None, epochs=5, sorted_vocab=1, batch_words=10000): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. """ if iter is not None: warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - kwargs['epochs'] = iter - if iter is None and 'epochs' not in kwargs: - kwargs['epochs'] = 5 + epochs = iter if size is not None: warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - kwargs['vector_size'] = size - if size is None and 'vector_size' not in kwargs: - kwargs['vector_size'] = 100 - + vector_size = size + self.gensim_model = None self.dm_mean = dm_mean self.dm = dm @@ -55,8 +51,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.trim_rule = trim_rule # attributes associated with gensim.models.Word2Vec - self.size = size - self.vector_size = kwargs['vector_size'] + self.vector_size = vector_size self.alpha = alpha self.window = window self.min_count = min_count @@ -69,12 +64,11 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.negative = negative self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn - self.iter = iter - self.epochs = kwargs['epochs'] + self.epochs = epochs self.sorted_vocab = sorted_vocab self.batch_words = batch_words - - def fit(self, X, y=None): + + def fit(self, X, y=None): """ Fit the model according to the given training data.
Calls gensim.models.Doc2Vec @@ -87,14 +81,15 @@ def fit(self, X, y=None): documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, size=self.size, vector_size=self.vector_size, alpha=self.alpha, window=self.window, + trim_rule=self.trim_rule, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, - iter=self.iter, epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words + epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self + def transform(self, docs): """ Return the vector representations for the input documents. From bbcd0e548df0f321a720e4cd7bffabfdafb751d9 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 23:00:27 +0530 Subject: [PATCH 12/41] fix deprecated argument warnings --- gensim/models/word2vec.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index f51b4cd25f..bf4dcefd89 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -411,25 +411,21 @@ def score_cbow_pair(model, word, l1): class Word2Vec(BaseWordEmbeddingsModel): """Class for training, using and evaluating neural networks described in https://code.google.com/p/word2vec/ - If you're finished training a model (=no more updates, only querying) then switch to the :mod:`gensim.models.KeyedVectors` instance in wv - The model can be stored/loaded via its :meth:`~gensim.models.word2vec.Word2Vec.save()` and :meth:`~gensim.models.word2vec.Word2Vec.load()` methods, or stored/loaded in a format compatible with the original word2vec implementation via `wv.save_word2vec_format()` and `Word2VecKeyedVectors.load_word2vec_format()`. - """ - def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, + def __init__( sentences=None, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=()): + sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, null_word=0, + trim_rule=None, sorted_vocab=1, compute_loss=False, callbacks=()): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. - Parameters ---------- sentences : iterable of iterables The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it in some other way. - sg : int {1, 0} Defines the training algorithm. If 1, skip-gram is employed; otherwise, CBOW is used.
size : int @@ -498,30 +493,38 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. - Examples -------- Initialize and train a `Word2Vec` model - >>> from gensim.models import Word2Vec >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] >>> >>> model = Word2Vec(sentences, min_count=1) >>> say_vector = model['say'] # get vector for word - """ - + + + if iter is not None: + warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") + epochs = iter + + if size is not None: + warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") + vector_size = size + + self.vector_size = vector_size + self.epochs = epochs self.callbacks = callbacks self.load = call_on_class_only - self.wv = Word2VecKeyedVectors(size) + self.wv = Word2VecKeyedVectors(vector_size) self.vocabulary = Word2VecVocab( max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, sorted_vocab=bool(sorted_vocab), null_word=null_word) - self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn) + self.trainables = Word2VecTrainables(seed=seed, vector_size=vector_size, hashfxn=hashfxn) super(Word2Vec, self).__init__( - sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks, + sentences=sentences, workers=workers, vector_size=vector_size, epochs=epochs, callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss, fast_version=FAST_VERSION) From 734d5c56a5c89e656f28b2ad1460e2b3dd95bdbd Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 23:02:38 +0530 Subject: [PATCH 13/41] fix deprecated argumet warnings --- gensim/models/doc2vec.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index f57694273d..d860163f8b 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -278,7 +278,6 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), **kwargs): """Initialize the model from an iterable of `documents`. Each document is a TaggedDocument object that will be used for training. - Parameters ---------- documents : iterable of iterables @@ -286,11 +285,9 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 consider an iterable that streams the documents directly from disk/network. If you don't supply `documents`, the model is left uninitialized -- use if you plan to initialize it in some other way. - dm : int {1,0} Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used. Otherwise, `distributed bag of words` (PV-DBOW) is employed. - size : int Dimensionality of the feature vectors. window : int @@ -349,7 +346,6 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 of the model. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. 
- """ if 'sentences' in kwargs: @@ -404,7 +400,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 self.train( documents, total_examples=self.corpus_count, epochs=self.epochs, start_alpha=self.alpha, end_alpha=self.min_alpha, callbacks=callbacks) - + @property def dm(self): """int {1,0} : `dm=1` indicates 'distributed memory' (PV-DM) else From 7cfbfe194cb11761604366f51f64e32c95c3f6b5 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 23:04:30 +0530 Subject: [PATCH 14/41] fix deprecated argument warnings From be108c1e60c3f5ecaa10aa84404a4cd1ea1dfc5b Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 23:04:52 +0530 Subject: [PATCH 15/41] fix deprecated argument warnings From e4f7ad715dfabc58f4281760fe4c27389c09eb9f Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 23:27:04 +0530 Subject: [PATCH 16/41] fix deprecated arguments --- gensim/models/word2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index bf4dcefd89..e4f658ca53 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -419,7 +419,7 @@ class Word2Vec(BaseWordEmbeddingsModel): and `Word2VecKeyedVectors.load_word2vec_format()`. """ - def __init__( sentences=None, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, + def __init__(self, sentences=None, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, compute_loss=False, callbacks=()): From 02fdf706121b81305138f666cf9b294a95a50601 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 11:18:48 +0530 Subject: [PATCH 17/41] fix deprecated arguments --- gensim/sklearn_api/d2vmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 30ea3e0991..45bf2750ea 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -81,7 +81,7 @@ def fit(self, X, y=None): documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, vector_size=self.vector_size, alpha=self.alpha, window=self.window, + trim_rule=self.trim_rule, size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, From 477b3814303d1d2263ddbba62d97437ac2b261e0 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 11:21:12 +0530 Subject: [PATCH 18/41] fix deprecated arguments --- gensim/sklearn_api/w2vmodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 36ba96595f..443c6801ae 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -64,11 +64,11 @@ def fit(self, X, y=None): Calls gensim.models.Word2Vec """ self.gensim_model = models.Word2Vec( - sentences=X, vector_size=self.vector_size, alpha=self.alpha, + sentences=X, size=self.vector_size, 
alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, - hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, + hashfxn=self.hashfxn, iter=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self From 36993ef8c4ac495cd56882cbc3443cf3c3952df1 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 11:28:42 +0530 Subject: [PATCH 19/41] Update doc2vec.py --- gensim/models/doc2vec.py | 44 ++-------------------------------------- 1 file changed, 2 insertions(+), 42 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index d860163f8b..ec912608d6 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -1,3 +1,4 @@ + #!/usr/bin/env python # -*- coding: utf-8 -*- # @@ -9,27 +10,16 @@ """ Deep learning via the distributed memory and distributed bag of words models from [1]_, using either hierarchical softmax or negative sampling [2]_ [3]_. See [#tutorial]_ - **Make sure you have a C compiler before installing gensim, to use optimized (compiled) doc2vec training** (70x speedup [blog]_). - Initialize a model with e.g.:: - >>> model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4) - Persist a model to disk with:: - >>> model.save(fname) >>> model = Doc2Vec.load(fname) # you can continue training with the loaded model! - If you're finished training a model (=no more updates, only querying), you can do - >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True): - to trim unneeded model memory = use (much) less RAM. - - - .. [1] Quoc Le and Tomas Mikolov. Distributed Representations of Sentences and Documents. http://arxiv.org/pdf/1405.4053v2.pdf .. [2] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. @@ -37,12 +27,8 @@ .. [3] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. .. [blog] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/ - .. [#tutorial] Doc2vec in gensim tutorial, https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb - - - """ import logging @@ -239,9 +225,7 @@ class TaggedDocument(namedtuple('TaggedDocument', 'words tags')): and `tags` (a list of tokens). Tags may be one or more unicode string tokens, but typical practice (which will also be most memory-efficient) is for the tags list to include a unique integer id as the only tag. - Replaces "sentence as a list of words" from Word2Vec. - """ def __str__(self): @@ -257,9 +241,7 @@ class LabeledSentence(TaggedDocument): class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')): """A string document tag discovered during the initial vocabulary scan. (The document-vector equivalent of a Vocab object.) - Will not be used if all presented document tags are ints. - The offset is only the true index into the doctags_syn0/doctags_syn0_lockf if-and-only-if no raw-int tags were used. 
If any raw-int tags were used, string Doctag vectors begin at index (max_rawint + 1), so the true index is @@ -400,7 +382,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 self.train( documents, total_examples=self.corpus_count, epochs=self.epochs, start_alpha=self.alpha, end_alpha=self.min_alpha, callbacks=callbacks) - + @property def dm(self): """int {1,0} : `dm=1` indicates 'distributed memory' (PV-DM) else @@ -463,18 +445,15 @@ def train(self, documents, total_examples=None, total_words=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=()): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). The `documents` iterable can be simply a list of TaggedDocument elements. - To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate progress-percentage logging, either total_examples (count of sentences) or total_words (count of raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab()`, the count of examples in that corpus will be available in the model's :attr:`corpus_count` property). - To avoid common mistakes around the model's ability to do multiple training passes itself, an explicit `epochs` argument **MUST** be provided. In the common and recommended case, where :meth:`~gensim.models.word2vec.Word2Vec.train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. - Parameters ---------- documents : iterable of iterables @@ -518,7 +497,6 @@ def estimated_lookup_memory(self): def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): """ Infer a vector for given post-bulk training document. - Parameters ---------- doc_words : :obj: `list` of :obj: `str` @@ -529,12 +507,10 @@ def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): Learning rate will linearly drop to `min_alpha` as training progresses. steps : int Number of times to train the new document. - Returns ------- :obj: `numpy.ndarray` Returns the inferred vector for the new document. - """ doctag_vectors, doctag_locks = self.trainables.get_doctag_trainables(doc_words, self.docvecs.vector_size) doctag_indexes = [0] @@ -605,7 +581,6 @@ def __str__(self): def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True): """Discard parameters that are used in training and score. Use if you're sure you're done training a model. - Parameters ---------- keep_doctags_vectors : bool @@ -613,7 +588,6 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen in this case you can't to use docvecs's most_similar, similarity etc. methods. keep_inference : bool Set `keep_inference` to False if you don't want to store parameters that is used for infer_vector method - """ if not keep_inference: if hasattr(self.trainables, 'syn1'): @@ -631,7 +605,6 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False): """Store the input-hidden weight matrix in the same format used by the original C word2vec-tool, for compatibility. 
- Parameters ---------- fname : str @@ -647,7 +620,6 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* Optional file path used to save the vocabulary binary : bool If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. - """ total_vec = len(self.wv.vocab) + len(self.docvecs) write_first_line = False @@ -668,14 +640,11 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* def init_sims(self, replace=False): """ Precompute L2-normalized vectors. - If `replace` is set, forget the original vectors and only keep the normalized ones = saves lots of memory! - Note that you **cannot continue training or inference** after doing a replace. The model becomes effectively read-only = you can call `most_similar`, `similarity` etc., but not `train` or `infer_vector`. - """ return self.docvecs.init_sims(replace=replace) @@ -698,7 +667,6 @@ def estimate_memory(self, vocab_size=None, report=None): def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence is a iterable of iterables (can simply be a list of unicode strings too). - Parameters ---------- documents : iterable of iterables @@ -737,7 +705,6 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No Build vocabulary from a dictionary of word frequencies. Build model vocabulary from a passed dictionary that contains (word,word count). Words must be of type unicode strings. - Parameters ---------- word_freq : dict @@ -756,7 +723,6 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No of the model. update : bool If true, the new provided words in `word_freq` dict will be added to model's vocab. - Examples -------- >>> from gensim.models.word2vec import Word2Vec @@ -936,23 +902,17 @@ def __iter__(self): class TaggedLineDocument(object): """Simple format: one document = one line = one TaggedDocument object. - Words are expected to be already preprocessed and separated by whitespace, tags are constructed automatically from the document line number.""" def __init__(self, source): """ `source` can be either a string (filename) or a file object. - Example:: - documents = TaggedLineDocument('myfile.txt') - Or for compressed files:: - documents = TaggedLineDocument('compressed_text.txt.bz2') documents = TaggedLineDocument('compressed_text.txt.gz') - """ self.source = source From 055f043a14b439efad306be24b86845f90f43d98 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 11:37:36 +0530 Subject: [PATCH 20/41] Update word2vec.py --- gensim/models/word2vec.py | 97 +++------------------------------------ 1 file changed, 7 insertions(+), 90 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index e4f658ca53..1ca8fd0af3 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -7,92 +7,59 @@ """Produce word vectors with deep learning via word2vec's "skip-gram and CBOW models", using either hierarchical softmax or negative sampling [1]_ [2]_. - NOTE: There are more ways to get word vectors in Gensim than just Word2Vec. See FastText and wrappers for VarEmbed and WordRank. - The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/ and extended with additional functionality. 
- For a blog tutorial on gensim word2vec, with an interactive web app trained on GoogleNews, visit http://radimrehurek.com/2014/02/word2vec-tutorial/ - **Make sure you have a C compiler before installing gensim, to use optimized (compiled) word2vec training** (70x speedup compared to plain NumPy implementation [3]_). - Initialize a model with e.g.:: - >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) - Persist a model to disk with:: - >>> model.save(fname) >>> model = Word2Vec.load(fname) # you can continue training with the loaded model! - The word vectors are stored in a KeyedVectors instance in model.wv. This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec:: - >>> model.wv['computer'] # numpy vector of a word array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) - The word vectors can also be instantiated from an existing file on disk in the word2vec C format as a KeyedVectors instance. - NOTE: It is impossible to continue training the vectors loaded from the C format because hidden weights, vocabulary frequency and the binary tree is missing:: - >>> from gensim.models import KeyedVectors >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format - - You can perform various NLP word tasks with the model. Some of them are already built-in:: - >>> model.wv.most_similar(positive=['woman', 'king'], negative=['man']) [('queen', 0.50882536), ...] - >>> model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) [('queen', 0.71382287), ...] - - >>> model.wv.doesnt_match("breakfast cereal dinner lunch".split()) 'cereal' - >>> model.wv.similarity('woman', 'man') 0.73723527 - Probability of a text under the model:: - >>> model.score(["The fox jumped over a lazy dog".split()]) 0.2158356 - Correlation with human opinion on word similarity:: - >>> model.wv.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv')) 0.51, 0.62, 0.13 - And on analogies:: - >>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) - and so on. - If you're finished training a model (i.e. no more updates, only querying), then switch to the :mod:`gensim.models.KeyedVectors` instance in wv - >>> word_vectors = model.wv >>> del model - to trim unneeded model memory = use much less RAM. - Note that there is a :mod:`gensim.models.phrases` module which lets you automatically detect phrases longer than one word. Using phrases, you can learn a word2vec model where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`: - >>> bigram_transformer = gensim.models.Phrases(sentences) >>> model = Word2Vec(bigram_transformer[sentences], size=100, ...) - .. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. .. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. @@ -419,10 +386,10 @@ class Word2Vec(BaseWordEmbeddingsModel): and `Word2VecKeyedVectors.load_word2vec_format()`. 
""" - def __init__(self, sentences=None, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, + def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, null_word=0, - trim_rule=None, sorted_vocab=1, compute_loss=False, callbacks=()): + sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=()): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. @@ -502,29 +469,18 @@ def __init__(self, sentences=None, size=None, vector_size=100, alpha=0.025, wind >>> model = Word2Vec(sentences, min_count=1) >>> say_vector = model['say'] # get vector for word """ - - - if iter is not None: - warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - epochs = iter - - if size is not None: - warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - vector_size = size - - self.vector_size = vector_size - self.epochs = epochs + self.callbacks = callbacks self.load = call_on_class_only - self.wv = Word2VecKeyedVectors(vector_size) + self.wv = Word2VecKeyedVectors(size) self.vocabulary = Word2VecVocab( max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, sorted_vocab=bool(sorted_vocab), null_word=null_word) - self.trainables = Word2VecTrainables(seed=seed, vector_size=vector_size, hashfxn=hashfxn) + self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn) super(Word2Vec, self).__init__( - sentences=sentences, workers=workers, vector_size=vector_size, epochs=epochs, callbacks=callbacks, + sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss, fast_version=FAST_VERSION) @@ -556,18 +512,15 @@ def train(self, sentences, total_examples=None, total_words=None, queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.) - To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate progress-percentage logging, either total_examples (count of sentences) or total_words (count of raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab()`, the count of examples in that corpus will be available in the model's :attr:`corpus_count` property). - To avoid common mistakes around the model's ability to do multiple training passes itself, an explicit `epochs` argument **MUST** be provided. In the common and recommended case, where :meth:`~gensim.models.word2vec.Word2Vec.train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. 
- Parameters ---------- sentences : iterable of iterables @@ -596,7 +549,6 @@ def train(self, sentences, total_examples=None, total_words=None, If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. - Examples -------- >>> from gensim.models import Word2Vec @@ -605,7 +557,6 @@ def train(self, sentences, total_examples=None, total_words=None, >>> model = Word2Vec(min_count=1) >>> model.build_vocab(sentences) >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) - """ return super(Word2Vec, self).train( @@ -617,20 +568,15 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor """Score the log probability for a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of unicode strings. This does not change the fitted model in any way (see Word2Vec.train() for that). - We have currently only implemented score for the hierarchical softmax scheme, so you need to have run word2vec with hs=1 and negative=0 for this to work. - Note that you should specify total_sentences; we'll run into problems if you ask to score more than this number of sentences but it is inefficient to set the value too high. - See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of how to use such scores in document classification. - .. [#taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics. .. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb - Parameters ---------- sentences : iterable of iterables @@ -646,7 +592,6 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor Multiplier for size of queue (number of workers * queue_factor). report_delay : float Seconds to wait before reporting progress. - """ if FAST_VERSION < 0: warnings.warn( @@ -763,20 +708,16 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut given, where it intersects with the current vocabulary. (No words are added to the existing vocabulary, but intersecting words adopt the file's weights, and non-intersecting words are left alone.) - Parameters ---------- fname : str The file path used to save the vectors in - binary : bool If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. - lockf : float Lock-factor value to be set for any imported word-vectors; the default value of 0.0 prevents further updating of the vector during subsequent training. Use 1.0 to allow further training updates of merged vectors. - """ overlap_count = 0 logger.info("loading projection weights from %s", fname) @@ -834,19 +775,16 @@ def __contains__(self, word): def predict_output_word(self, context_words_list, topn=10): """Report the probability distribution of the center word given the context words as input to the trained model. 
- Parameters ---------- context_words_list : :obj: `list` of :obj: `str` List of context words topn: int Return `topn` words and their probabilities - Returns ------- :obj: `list` of :obj: `tuple` `topn` length list of tuples of (word, probability) - """ if not self.negative: raise RuntimeError( @@ -921,12 +859,10 @@ def delete_temporary_training_data(self, replace_word_vectors_with_normalized=Fa def save(self, *args, **kwargs): """Save the model. This saved model can be loaded again using :func:`~gensim.models.word2vec.Word2Vec.load`, which supports online training and getting vectors for vocabulary words. - Parameters ---------- fname : str Path to the file. - """ # don't bother storing the cached normalized vectors, recalculable table kwargs['ignore'] = kwargs.get('ignore', ['vectors_norm', 'cum_table']) @@ -963,12 +899,10 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False): @classmethod def load(cls, *args, **kwargs): """Loads a previously saved `Word2Vec` model. Also see `save()`. - Parameters ---------- fname : str Path to the saved file. - Returns ------- :obj: `~gensim.models.word2vec.Word2Vec` @@ -1042,16 +976,11 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): """ `source` can be either a string or a file object. Clip the file to the first `limit` lines (or not clipped if limit is None, the default). - Example:: - sentences = LineSentence('myfile.txt') - Or for compressed files:: - sentences = LineSentence('compressed_text.txt.bz2') sentences = LineSentence('compressed_text.txt.gz') - """ self.source = source self.max_sentence_length = max_sentence_length @@ -1084,23 +1013,17 @@ class PathLineSentences(object): """Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename. The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. Any file not ending with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories. - The format of files (either text, or compressed text files) in the path is one sentence = one line, with words already preprocessed and separated by whitespace. - """ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): """ `source` should be a path to a directory (as a string) where all files can be opened by the LineSentence class. Each file will be read up to `limit` lines (or not clipped if limit is None, the default). - Example:: - sentences = PathLineSentences(os.getcwd() + '\\corpus\\') - The files in the directory should be either text files, .bz2 files, or .gz files. - """ self.source = source self.max_sentence_length = max_sentence_length @@ -1193,15 +1116,12 @@ def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, tr min_count=None, sample=None, dry_run=False): """Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample` (controlling the downsampling of more-frequent words). - Calling with `dry_run=True` will only simulate the provided settings and report the size of the retained vocabulary, effective corpus length, and estimated memory requirements. Results are both printed via logging and returned as a dict. - Delete the raw vocabulary after the scaling is done to free up RAM, unless `keep_raw_vocab` is set. 
- """ min_count = min_count or self.min_count sample = sample or self.sample @@ -1337,7 +1257,6 @@ def add_null_word(self, wv): def create_binary_tree(self, wv): """Create a binary Huffman tree using stored vocabulary word counts. Frequent words will have shorter binary codes. Called internally from `build_vocab()`. - """ logger.info("constructing a huffman tree from %i words", len(wv.vocab)) @@ -1370,12 +1289,10 @@ def create_binary_tree(self, wv): def make_cum_table(self, wv, power=0.75, domain=2**31 - 1): """Create a cumulative-distribution table using stored vocabulary word counts for drawing random words in the negative-sampling training routines. - To draw a word index, choose a random integer up to the maximum value in the table (cum_table[-1]), then finding that integer's sorted insertion point (as if by bisect_left or ndarray.searchsorted()). That insertion point is the drawn index, coming up in proportion equal to the increment at that slot. - Called internally from 'build_vocab()'. """ vocab_size = len(wv.index2word) From 74d1c59c2f43882f0af5bf213abad81bc58e3feb Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 12:31:13 +0530 Subject: [PATCH 21/41] Update doc2vec.py --- gensim/models/doc2vec.py | 54 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index ec912608d6..ebf2793f59 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -1,4 +1,3 @@ - #!/usr/bin/env python # -*- coding: utf-8 -*- # @@ -10,16 +9,27 @@ """ Deep learning via the distributed memory and distributed bag of words models from [1]_, using either hierarchical softmax or negative sampling [2]_ [3]_. See [#tutorial]_ + **Make sure you have a C compiler before installing gensim, to use optimized (compiled) doc2vec training** (70x speedup [blog]_). + Initialize a model with e.g.:: + >>> model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4) + Persist a model to disk with:: + >>> model.save(fname) >>> model = Doc2Vec.load(fname) # you can continue training with the loaded model! + If you're finished training a model (=no more updates, only querying), you can do + >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True): + to trim unneeded model memory = use (much) less RAM. + + + .. [1] Quoc Le and Tomas Mikolov. Distributed Representations of Sentences and Documents. http://arxiv.org/pdf/1405.4053v2.pdf .. [2] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. @@ -27,8 +37,12 @@ .. [3] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. .. [blog] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/ + .. [#tutorial] Doc2vec in gensim tutorial, https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb + + + """ import logging @@ -225,7 +239,9 @@ class TaggedDocument(namedtuple('TaggedDocument', 'words tags')): and `tags` (a list of tokens). Tags may be one or more unicode string tokens, but typical practice (which will also be most memory-efficient) is for the tags list to include a unique integer id as the only tag. + Replaces "sentence as a list of words" from Word2Vec. 
+ """ def __str__(self): @@ -241,7 +257,9 @@ class LabeledSentence(TaggedDocument): class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')): """A string document tag discovered during the initial vocabulary scan. (The document-vector equivalent of a Vocab object.) + Will not be used if all presented document tags are ints. + The offset is only the true index into the doctags_syn0/doctags_syn0_lockf if-and-only-if no raw-int tags were used. If any raw-int tags were used, string Doctag vectors begin at index (max_rawint + 1), so the true index is @@ -260,6 +278,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), **kwargs): """Initialize the model from an iterable of `documents`. Each document is a TaggedDocument object that will be used for training. + Parameters ---------- documents : iterable of iterables @@ -267,9 +286,11 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 consider an iterable that streams the documents directly from disk/network. If you don't supply `documents`, the model is left uninitialized -- use if you plan to initialize it in some other way. + dm : int {1,0} Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used. Otherwise, `distributed bag of words` (PV-DBOW) is employed. + size : int Dimensionality of the feature vectors. window : int @@ -328,8 +349,9 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 of the model. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. + """ - + if 'sentences' in kwargs: raise DeprecationWarning( "Parameter 'sentences' was renamed to 'documents', and will be removed in 4.0.0, " @@ -445,15 +467,18 @@ def train(self, documents, total_examples=None, total_words=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=()): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). The `documents` iterable can be simply a list of TaggedDocument elements. + To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate progress-percentage logging, either total_examples (count of sentences) or total_words (count of raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab()`, the count of examples in that corpus will be available in the model's :attr:`corpus_count` property). + To avoid common mistakes around the model's ability to do multiple training passes itself, an explicit `epochs` argument **MUST** be provided. In the common and recommended case, where :meth:`~gensim.models.word2vec.Word2Vec.train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. + Parameters ---------- documents : iterable of iterables @@ -497,6 +522,7 @@ def estimated_lookup_memory(self): def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): """ Infer a vector for given post-bulk training document. + Parameters ---------- doc_words : :obj: `list` of :obj: `str` @@ -507,10 +533,12 @@ def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): Learning rate will linearly drop to `min_alpha` as training progresses. steps : int Number of times to train the new document. 
+ Returns ------- :obj: `numpy.ndarray` Returns the inferred vector for the new document. + """ doctag_vectors, doctag_locks = self.trainables.get_doctag_trainables(doc_words, self.docvecs.vector_size) doctag_indexes = [0] @@ -581,6 +609,7 @@ def __str__(self): def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True): """Discard parameters that are used in training and score. Use if you're sure you're done training a model. + Parameters ---------- keep_doctags_vectors : bool @@ -588,7 +617,8 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen in this case you can't to use docvecs's most_similar, similarity etc. methods. keep_inference : bool Set `keep_inference` to False if you don't want to store parameters that is used for infer_vector method - """ + + """ if not keep_inference: if hasattr(self.trainables, 'syn1'): del self.trainables.syn1 @@ -605,7 +635,8 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False): """Store the input-hidden weight matrix in the same format used by the original C word2vec-tool, for compatibility. - Parameters + + Parameters ---------- fname : str The file path used to save the vectors in. @@ -620,6 +651,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* Optional file path used to save the vocabulary binary : bool If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. + """ total_vec = len(self.wv.vocab) + len(self.docvecs) write_first_line = False @@ -640,11 +672,14 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* def init_sims(self, replace=False): """ Precompute L2-normalized vectors. + If `replace` is set, forget the original vectors and only keep the normalized ones = saves lots of memory! + Note that you **cannot continue training or inference** after doing a replace. The model becomes effectively read-only = you can call `most_similar`, `similarity` etc., but not `train` or `infer_vector`. + """ return self.docvecs.init_sims(replace=replace) @@ -667,6 +702,7 @@ def estimate_memory(self, vocab_size=None, report=None): def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence is a iterable of iterables (can simply be a list of unicode strings too). + Parameters ---------- documents : iterable of iterables @@ -705,6 +741,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No Build vocabulary from a dictionary of word frequencies. Build model vocabulary from a passed dictionary that contains (word,word count). Words must be of type unicode strings. + Parameters ---------- word_freq : dict @@ -723,6 +760,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No of the model. update : bool If true, the new provided words in `word_freq` dict will be added to model's vocab. + Examples -------- >>> from gensim.models.word2vec import Word2Vec @@ -902,17 +940,23 @@ def __iter__(self): class TaggedLineDocument(object): """Simple format: one document = one line = one TaggedDocument object. 
- Words are expected to be already preprocessed and separated by whitespace, + + Words are expected to be already preprocessed and separated by whitespace, tags are constructed automatically from the document line number.""" def __init__(self, source): """ `source` can be either a string (filename) or a file object. + Example:: + documents = TaggedLineDocument('myfile.txt') + Or for compressed files:: + documents = TaggedLineDocument('compressed_text.txt.bz2') documents = TaggedLineDocument('compressed_text.txt.gz') + """ self.source = source From 9b8aa9151ab4e14763bf1af0d8c210803409703c Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 13:11:29 +0530 Subject: [PATCH 22/41] Update doc2vec.py --- gensim/models/doc2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index ebf2793f59..8c2d166fc8 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -636,7 +636,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* """Store the input-hidden weight matrix in the same format used by the original C word2vec-tool, for compatibility. - Parameters + Parameters ---------- fname : str The file path used to save the vectors in. From 7bc192dd67c708843872527a3a417d067c6415c5 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 13:17:35 +0530 Subject: [PATCH 23/41] Update word2vec.py --- gensim/models/word2vec.py | 80 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 1ca8fd0af3..f51b4cd25f 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -7,59 +7,92 @@ """Produce word vectors with deep learning via word2vec's "skip-gram and CBOW models", using either hierarchical softmax or negative sampling [1]_ [2]_. + NOTE: There are more ways to get word vectors in Gensim than just Word2Vec. See FastText and wrappers for VarEmbed and WordRank. + The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/ and extended with additional functionality. + For a blog tutorial on gensim word2vec, with an interactive web app trained on GoogleNews, visit http://radimrehurek.com/2014/02/word2vec-tutorial/ + **Make sure you have a C compiler before installing gensim, to use optimized (compiled) word2vec training** (70x speedup compared to plain NumPy implementation [3]_). + Initialize a model with e.g.:: + >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) + Persist a model to disk with:: + >>> model.save(fname) >>> model = Word2Vec.load(fname) # you can continue training with the loaded model! + The word vectors are stored in a KeyedVectors instance in model.wv. This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec:: + >>> model.wv['computer'] # numpy vector of a word array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) + The word vectors can also be instantiated from an existing file on disk in the word2vec C format as a KeyedVectors instance. 
+ NOTE: It is impossible to continue training the vectors loaded from the C format because hidden weights, vocabulary frequency and the binary tree is missing:: + >>> from gensim.models import KeyedVectors >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format + + You can perform various NLP word tasks with the model. Some of them are already built-in:: + >>> model.wv.most_similar(positive=['woman', 'king'], negative=['man']) [('queen', 0.50882536), ...] + >>> model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) [('queen', 0.71382287), ...] + + >>> model.wv.doesnt_match("breakfast cereal dinner lunch".split()) 'cereal' + >>> model.wv.similarity('woman', 'man') 0.73723527 + Probability of a text under the model:: + >>> model.score(["The fox jumped over a lazy dog".split()]) 0.2158356 + Correlation with human opinion on word similarity:: + >>> model.wv.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv')) 0.51, 0.62, 0.13 + And on analogies:: + >>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) + and so on. + If you're finished training a model (i.e. no more updates, only querying), then switch to the :mod:`gensim.models.KeyedVectors` instance in wv + >>> word_vectors = model.wv >>> del model + to trim unneeded model memory = use much less RAM. + Note that there is a :mod:`gensim.models.phrases` module which lets you automatically detect phrases longer than one word. Using phrases, you can learn a word2vec model where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`: + >>> bigram_transformer = gensim.models.Phrases(sentences) >>> model = Word2Vec(bigram_transformer[sentences], size=100, ...) + .. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. .. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. @@ -378,12 +411,15 @@ def score_cbow_pair(model, word, l1): class Word2Vec(BaseWordEmbeddingsModel): """Class for training, using and evaluating neural networks described in https://code.google.com/p/word2vec/ + If you're finished training a model (=no more updates, only querying) then switch to the :mod:`gensim.models.KeyedVectors` instance in wv + The model can be stored/loaded via its :meth:`~gensim.models.word2vec.Word2Vec.save()` and :meth:`~gensim.models.word2vec.Word2Vec.load()` methods, or stored/loaded in a format compatible with the original word2vec implementation via `wv.save_word2vec_format()` and `Word2VecKeyedVectors.load_word2vec_format()`. + """ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, @@ -393,6 +429,7 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. + Parameters ---------- sentences : iterable of iterables @@ -402,6 +439,7 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it in some other way. 
+ sg : int {1, 0} Defines the training algorithm. If 1, skip-gram is employed; otherwise, CBOW is used. size : int @@ -460,14 +498,17 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. + Examples -------- Initialize and train a `Word2Vec` model + >>> from gensim.models import Word2Vec >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] >>> >>> model = Word2Vec(sentences, min_count=1) >>> say_vector = model['say'] # get vector for word + """ self.callbacks = callbacks @@ -512,15 +553,18 @@ def train(self, sentences, total_examples=None, total_words=None, queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.) + To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate progress-percentage logging, either total_examples (count of sentences) or total_words (count of raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab()`, the count of examples in that corpus will be available in the model's :attr:`corpus_count` property). + To avoid common mistakes around the model's ability to do multiple training passes itself, an explicit `epochs` argument **MUST** be provided. In the common and recommended case, where :meth:`~gensim.models.word2vec.Word2Vec.train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. + Parameters ---------- sentences : iterable of iterables @@ -549,6 +593,7 @@ def train(self, sentences, total_examples=None, total_words=None, If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. + Examples -------- >>> from gensim.models import Word2Vec @@ -557,6 +602,7 @@ def train(self, sentences, total_examples=None, total_words=None, >>> model = Word2Vec(min_count=1) >>> model.build_vocab(sentences) >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) + """ return super(Word2Vec, self).train( @@ -568,15 +614,20 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor """Score the log probability for a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of unicode strings. This does not change the fitted model in any way (see Word2Vec.train() for that). + We have currently only implemented score for the hierarchical softmax scheme, so you need to have run word2vec with hs=1 and negative=0 for this to work. + Note that you should specify total_sentences; we'll run into problems if you ask to score more than this number of sentences but it is inefficient to set the value too high. + See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of how to use such scores in document classification. + .. [#taddy] Taddy, Matt. 
Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics. .. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb + Parameters ---------- sentences : iterable of iterables @@ -592,6 +643,7 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor Multiplier for size of queue (number of workers * queue_factor). report_delay : float Seconds to wait before reporting progress. + """ if FAST_VERSION < 0: warnings.warn( @@ -708,16 +760,20 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut given, where it intersects with the current vocabulary. (No words are added to the existing vocabulary, but intersecting words adopt the file's weights, and non-intersecting words are left alone.) + Parameters ---------- fname : str The file path used to save the vectors in + binary : bool If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. + lockf : float Lock-factor value to be set for any imported word-vectors; the default value of 0.0 prevents further updating of the vector during subsequent training. Use 1.0 to allow further training updates of merged vectors. + """ overlap_count = 0 logger.info("loading projection weights from %s", fname) @@ -775,16 +831,19 @@ def __contains__(self, word): def predict_output_word(self, context_words_list, topn=10): """Report the probability distribution of the center word given the context words as input to the trained model. + Parameters ---------- context_words_list : :obj: `list` of :obj: `str` List of context words topn: int Return `topn` words and their probabilities + Returns ------- :obj: `list` of :obj: `tuple` `topn` length list of tuples of (word, probability) + """ if not self.negative: raise RuntimeError( @@ -859,10 +918,12 @@ def delete_temporary_training_data(self, replace_word_vectors_with_normalized=Fa def save(self, *args, **kwargs): """Save the model. This saved model can be loaded again using :func:`~gensim.models.word2vec.Word2Vec.load`, which supports online training and getting vectors for vocabulary words. + Parameters ---------- fname : str Path to the file. + """ # don't bother storing the cached normalized vectors, recalculable table kwargs['ignore'] = kwargs.get('ignore', ['vectors_norm', 'cum_table']) @@ -899,10 +960,12 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False): @classmethod def load(cls, *args, **kwargs): """Loads a previously saved `Word2Vec` model. Also see `save()`. + Parameters ---------- fname : str Path to the saved file. + Returns ------- :obj: `~gensim.models.word2vec.Word2Vec` @@ -976,11 +1039,16 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): """ `source` can be either a string or a file object. Clip the file to the first `limit` lines (or not clipped if limit is None, the default). + Example:: + sentences = LineSentence('myfile.txt') + Or for compressed files:: + sentences = LineSentence('compressed_text.txt.bz2') sentences = LineSentence('compressed_text.txt.gz') + """ self.source = source self.max_sentence_length = max_sentence_length @@ -1013,17 +1081,23 @@ class PathLineSentences(object): """Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename. The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. 
Any file not ending with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories. + The format of files (either text, or compressed text files) in the path is one sentence = one line, with words already preprocessed and separated by whitespace. + """ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): """ `source` should be a path to a directory (as a string) where all files can be opened by the LineSentence class. Each file will be read up to `limit` lines (or not clipped if limit is None, the default). + Example:: + sentences = PathLineSentences(os.getcwd() + '\\corpus\\') + The files in the directory should be either text files, .bz2 files, or .gz files. + """ self.source = source self.max_sentence_length = max_sentence_length @@ -1116,12 +1190,15 @@ def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, tr min_count=None, sample=None, dry_run=False): """Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample` (controlling the downsampling of more-frequent words). + Calling with `dry_run=True` will only simulate the provided settings and report the size of the retained vocabulary, effective corpus length, and estimated memory requirements. Results are both printed via logging and returned as a dict. + Delete the raw vocabulary after the scaling is done to free up RAM, unless `keep_raw_vocab` is set. + """ min_count = min_count or self.min_count sample = sample or self.sample @@ -1257,6 +1334,7 @@ def add_null_word(self, wv): def create_binary_tree(self, wv): """Create a binary Huffman tree using stored vocabulary word counts. Frequent words will have shorter binary codes. Called internally from `build_vocab()`. + """ logger.info("constructing a huffman tree from %i words", len(wv.vocab)) @@ -1289,10 +1367,12 @@ def create_binary_tree(self, wv): def make_cum_table(self, wv, power=0.75, domain=2**31 - 1): """Create a cumulative-distribution table using stored vocabulary word counts for drawing random words in the negative-sampling training routines. + To draw a word index, choose a random integer up to the maximum value in the table (cum_table[-1]), then finding that integer's sorted insertion point (as if by bisect_left or ndarray.searchsorted()). That insertion point is the drawn index, coming up in proportion equal to the increment at that slot. + Called internally from 'build_vocab()'. """ vocab_size = len(wv.index2word) From c6d73f6576b97defb5e30ebbb2d838fef3f7a1c6 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 13:28:42 +0530 Subject: [PATCH 24/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 443c6801ae..36eeaba67a 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -58,7 +58,8 @@ def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count= self.trim_rule = trim_rule self.sorted_vocab = sorted_vocab self.batch_words = batch_words - def fit(self, X, y=None): + + def fit(self, X, y=None): """ Fit the model according to the given training data. 
Calls gensim.models.Word2Vec From f06c65352abc6532742e41440bfa40e0f2210c2a Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 13:52:04 +0530 Subject: [PATCH 25/41] Update d2vmodel.py --- gensim/sklearn_api/d2vmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 45bf2750ea..d18158e2f6 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -68,7 +68,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.sorted_vocab = sorted_vocab self.batch_words = batch_words - def fit(self, X, y=None): + def fit(self, X, y=None): """ Fit the model according to the given training data. Calls gensim.models.Doc2Vec From 3dad3458a7f2a98dfe09401fc3ffb25ea66ae7b5 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 14:09:28 +0530 Subject: [PATCH 26/41] Update test_sklearn_api.py --- gensim/test/test_sklearn_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index f28bf6817b..2521bf13fe 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -679,7 +679,7 @@ def testConsistencyWithGensimModel(self): def testPipeline(self): numpy.random.seed(0) # set fixed seed to get similar values everytime - model = W2VTransformer(min_count=1,vector_size=10) + model = W2VTransformer(min_count=1, vector_size=10) model.fit(w2v_texts) class_dict = {'mathematics': 1, 'physics': 0} From 3148a1753e39dfd6b0b710f52b8c94122d4e98fc Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 14:16:20 +0530 Subject: [PATCH 27/41] Update d2vmodel.py --- gensim/sklearn_api/d2vmodel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index d18158e2f6..c20ab77818 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -24,9 +24,9 @@ class D2VTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, - docvecs_mapfile=None, comment=None, trim_rule=None, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, - hashfxn=hash, iter=None, epochs=5, sorted_vocab=1, batch_words=10000): + docvecs_mapfile=None, comment=None, trim_rule=None, size=None, vector_size=100, alpha=0.025, window=5, + min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, + cbow_mean=1, hashfxn=hash, iter=None, epochs=5, sorted_vocab=1, batch_words=10000): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. 
""" From 9ac44f93416259f5b4ebba3f22c61bfd5b64316a Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 14:17:26 +0530 Subject: [PATCH 28/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 36eeaba67a..11ad6cea25 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -24,9 +24,9 @@ class W2VTransformer(TransformerMixin, BaseEstimator): Base Word2Vec module """ - def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, - workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=10000): + def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, + seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, + null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000): """ Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details. """ From e90fcf6f36a04b4cb9d26814efa43123a83d623a Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 14:50:57 +0530 Subject: [PATCH 29/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 11ad6cea25..f0774fdd7f 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -40,6 +40,7 @@ def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count= self.gensim_model = None self.vector_size = vector_size + self.size = vector_size self.alpha = alpha self.window = window self.min_count = min_count From 9686738687de5f2dcf0aa8915531997d95727574 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 14:51:02 +0530 Subject: [PATCH 30/41] Update test_sklearn_api.py --- gensim/test/test_sklearn_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 2521bf13fe..482ae37bce 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -646,7 +646,7 @@ def testModelNotFitted(self): class TestWord2VecWrapper(unittest.TestCase): def setUp(self): numpy.random.seed(0) - self.model = W2VTransformer(min_count=0, seed=42, vector_size=10) + self.model = W2VTransformer(min_count=0, seed=42, size=10) self.model.fit(texts) def testTransform(self): @@ -665,7 +665,7 @@ def testTransform(self): def testConsistencyWithGensimModel(self): # training a W2VTransformer - self.model = W2VTransformer(min_count=0, seed=42, vector_size=10) + self.model = W2VTransformer(min_count=0, seed=42, size=10) self.model.fit(texts) # training a Gensim Word2Vec model with the same params @@ -679,7 +679,7 @@ def testConsistencyWithGensimModel(self): def testPipeline(self): numpy.random.seed(0) # set fixed seed to get similar values everytime - model = W2VTransformer(min_count=1, vector_size=10) + model = W2VTransformer(min_count=1, size=10) model.fit(w2v_texts) class_dict = {'mathematics': 1, 'physics': 0} @@ -724,7 +724,7 @@ def testPersistence(self): self.assertTrue(passed) def testModelNotFitted(self): - w2vmodel_wrapper = W2VTransformer(min_count=0, seed=42, vector_size=10) + w2vmodel_wrapper = 
W2VTransformer(min_count=0, seed=42, size=10) word = texts[0][0] self.assertRaises(NotFittedError, w2vmodel_wrapper.transform, word) From 33961ed5d5948354512e8a675ccd6ea06c0305da Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 14:52:19 +0530 Subject: [PATCH 31/41] Update d2vmodel.py --- gensim/sklearn_api/d2vmodel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index c20ab77818..cbe074377f 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -52,6 +52,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 # attributes associated with gensim.models.Word2Vec self.vector_size = vector_size + self.size = vector_size self.alpha = alpha self.window = window self.min_count = min_count From a5fb143e7110b7af0e6444781d411e5164c780c2 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 15:50:18 +0530 Subject: [PATCH 32/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index f0774fdd7f..c0d2cd35ba 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -25,8 +25,8 @@ class W2VTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, - seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, - null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000): + seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, + epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000): """ Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details. """ From d1c2d5a3ec059367b605596f453e3008413bbe17 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 15:57:42 +0530 Subject: [PATCH 33/41] Update d2vmodel.py --- gensim/sklearn_api/d2vmodel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index cbe074377f..6c7313b24a 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -90,7 +90,6 @@ def fit(self, X, y=None): ) return self - def transform(self, docs): """ Return the vector representations for the input documents. From 7246e37f4139e6d489154a47d0ddf41f32837607 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 16:43:52 +0530 Subject: [PATCH 34/41] Update d2vmodel.py --- gensim/sklearn_api/d2vmodel.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 6c7313b24a..3326703225 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -29,16 +29,13 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 cbow_mean=1, hashfxn=hash, iter=None, epochs=5, sorted_vocab=1, batch_words=10000): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. 
- """ - + """ if iter is not None: warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") epochs = iter - if size is not None: warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") vector_size = size - self.gensim_model = None self.dm_mean = dm_mean self.dm = dm From b254009b806cc709f30dd129b253475bcd1fd5e2 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 16:43:56 +0530 Subject: [PATCH 35/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index c0d2cd35ba..fa45d6b3d0 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -33,11 +33,9 @@ def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count= if iter is not None: warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") epochs = iter - if size is not None: warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - vector_size = size - + vector_size = size self.gensim_model = None self.vector_size = vector_size self.size = vector_size From 750522935fd1a6df66bd0d37edfc393984762863 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 17:19:08 +0530 Subject: [PATCH 36/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index fa45d6b3d0..484ce35428 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -35,7 +35,7 @@ def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count= epochs = iter if size is not None: warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - vector_size = size + vector_size = size self.gensim_model = None self.vector_size = vector_size self.size = vector_size From e6036496f934d45aeb172bbeb79c3e53f8e68065 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 18:00:03 +0530 Subject: [PATCH 37/41] Update d2vmodel.py --- gensim/sklearn_api/d2vmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 3326703225..b6eb3b9b04 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -29,7 +29,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 cbow_mean=1, hashfxn=hash, iter=None, epochs=5, sorted_vocab=1, batch_words=10000): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. 
- """ + """ if iter is not None: warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") epochs = iter From 61dfef07961746842246d8a7bf54ddd4fc0ad307 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sun, 11 Mar 2018 20:18:35 +0530 Subject: [PATCH 38/41] Update test_sklearn_api.py --- gensim/test/test_sklearn_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 482ae37bce..ed5516df37 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -646,7 +646,7 @@ def testModelNotFitted(self): class TestWord2VecWrapper(unittest.TestCase): def setUp(self): numpy.random.seed(0) - self.model = W2VTransformer(min_count=0, seed=42, size=10) + self.model = W2VTransformer(size=10, min_count=0, seed=42) self.model.fit(texts) def testTransform(self): @@ -665,7 +665,7 @@ def testTransform(self): def testConsistencyWithGensimModel(self): # training a W2VTransformer - self.model = W2VTransformer(min_count=0, seed=42, size=10) + self.model = W2VTransformer(size=10, min_count=0, seed=42) self.model.fit(texts) # training a Gensim Word2Vec model with the same params @@ -679,7 +679,7 @@ def testConsistencyWithGensimModel(self): def testPipeline(self): numpy.random.seed(0) # set fixed seed to get similar values everytime - model = W2VTransformer(min_count=1, size=10) + model = W2VTransformer(size=10, min_count=1) model.fit(w2v_texts) class_dict = {'mathematics': 1, 'physics': 0} @@ -724,7 +724,7 @@ def testPersistence(self): self.assertTrue(passed) def testModelNotFitted(self): - w2vmodel_wrapper = W2VTransformer(min_count=0, seed=42, size=10) + w2vmodel_wrapper = W2VTransformer(size=10, min_count=0, seed=42) word = texts[0][0] self.assertRaises(NotFittedError, w2vmodel_wrapper.transform, word) From 52d2945d68d3f2bed0e577a152bf74c7505b4a42 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Mon, 12 Mar 2018 12:28:04 +0530 Subject: [PATCH 39/41] Update doc2vec.py --- gensim/models/doc2vec.py | 66 ++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 8c2d166fc8..f57694273d 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -239,9 +239,9 @@ class TaggedDocument(namedtuple('TaggedDocument', 'words tags')): and `tags` (a list of tokens). Tags may be one or more unicode string tokens, but typical practice (which will also be most memory-efficient) is for the tags list to include a unique integer id as the only tag. - + Replaces "sentence as a list of words" from Word2Vec. - + """ def __str__(self): @@ -257,9 +257,9 @@ class LabeledSentence(TaggedDocument): class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')): """A string document tag discovered during the initial vocabulary scan. (The document-vector equivalent of a Vocab object.) - + Will not be used if all presented document tags are ints. - + The offset is only the true index into the doctags_syn0/doctags_syn0_lockf if-and-only-if no raw-int tags were used. If any raw-int tags were used, string Doctag vectors begin at index (max_rawint + 1), so the true index is @@ -278,7 +278,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), **kwargs): """Initialize the model from an iterable of `documents`. 
Each document is a TaggedDocument object that will be used for training. - + Parameters ---------- documents : iterable of iterables @@ -286,11 +286,11 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 consider an iterable that streams the documents directly from disk/network. If you don't supply `documents`, the model is left uninitialized -- use if you plan to initialize it in some other way. - + dm : int {1,0} Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used. Otherwise, `distributed bag of words` (PV-DBOW) is employed. - + size : int Dimensionality of the feature vectors. window : int @@ -349,9 +349,9 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 of the model. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. - + """ - + if 'sentences' in kwargs: raise DeprecationWarning( "Parameter 'sentences' was renamed to 'documents', and will be removed in 4.0.0, " @@ -467,18 +467,18 @@ def train(self, documents, total_examples=None, total_words=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=()): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). The `documents` iterable can be simply a list of TaggedDocument elements. - + To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate progress-percentage logging, either total_examples (count of sentences) or total_words (count of raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab()`, the count of examples in that corpus will be available in the model's :attr:`corpus_count` property). - + To avoid common mistakes around the model's ability to do multiple training passes itself, an explicit `epochs` argument **MUST** be provided. In the common and recommended case, where :meth:`~gensim.models.word2vec.Word2Vec.train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. - + Parameters ---------- documents : iterable of iterables @@ -522,7 +522,7 @@ def estimated_lookup_memory(self): def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): """ Infer a vector for given post-bulk training document. - + Parameters ---------- doc_words : :obj: `list` of :obj: `str` @@ -533,12 +533,12 @@ def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): Learning rate will linearly drop to `min_alpha` as training progresses. steps : int Number of times to train the new document. - + Returns ------- :obj: `numpy.ndarray` Returns the inferred vector for the new document. - + """ doctag_vectors, doctag_locks = self.trainables.get_doctag_trainables(doc_words, self.docvecs.vector_size) doctag_indexes = [0] @@ -609,7 +609,7 @@ def __str__(self): def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True): """Discard parameters that are used in training and score. Use if you're sure you're done training a model. - + Parameters ---------- keep_doctags_vectors : bool @@ -617,8 +617,8 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen in this case you can't to use docvecs's most_similar, similarity etc. methods. 
keep_inference : bool Set `keep_inference` to False if you don't want to store parameters that is used for infer_vector method - - """ + + """ if not keep_inference: if hasattr(self.trainables, 'syn1'): del self.trainables.syn1 @@ -635,7 +635,7 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False): """Store the input-hidden weight matrix in the same format used by the original C word2vec-tool, for compatibility. - + Parameters ---------- fname : str @@ -651,7 +651,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* Optional file path used to save the vocabulary binary : bool If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. - + """ total_vec = len(self.wv.vocab) + len(self.docvecs) write_first_line = False @@ -672,14 +672,14 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* def init_sims(self, replace=False): """ Precompute L2-normalized vectors. - + If `replace` is set, forget the original vectors and only keep the normalized ones = saves lots of memory! - + Note that you **cannot continue training or inference** after doing a replace. The model becomes effectively read-only = you can call `most_similar`, `similarity` etc., but not `train` or `infer_vector`. - + """ return self.docvecs.init_sims(replace=replace) @@ -702,7 +702,7 @@ def estimate_memory(self, vocab_size=None, report=None): def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence is a iterable of iterables (can simply be a list of unicode strings too). - + Parameters ---------- documents : iterable of iterables @@ -741,7 +741,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No Build vocabulary from a dictionary of word frequencies. Build model vocabulary from a passed dictionary that contains (word,word count). Words must be of type unicode strings. - + Parameters ---------- word_freq : dict @@ -760,7 +760,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No of the model. update : bool If true, the new provided words in `word_freq` dict will be added to model's vocab. - + Examples -------- >>> from gensim.models.word2vec import Word2Vec @@ -940,23 +940,23 @@ def __iter__(self): class TaggedLineDocument(object): """Simple format: one document = one line = one TaggedDocument object. - - Words are expected to be already preprocessed and separated by whitespace, + + Words are expected to be already preprocessed and separated by whitespace, tags are constructed automatically from the document line number.""" def __init__(self, source): """ `source` can be either a string (filename) or a file object. 
- + Example:: - + documents = TaggedLineDocument('myfile.txt') - + Or for compressed files:: - + documents = TaggedLineDocument('compressed_text.txt.bz2') documents = TaggedLineDocument('compressed_text.txt.gz') - + """ self.source = source From 09b5dc7a9d78a379605c87bd09cc60dcc1054c9d Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Mon, 12 Mar 2018 12:28:08 +0530 Subject: [PATCH 40/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 484ce35428..317842ee07 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -10,7 +10,6 @@ Follows scikit-learn API conventions """ -import warnings import numpy as np import six from sklearn.base import TransformerMixin, BaseEstimator @@ -24,21 +23,14 @@ class W2VTransformer(TransformerMixin, BaseEstimator): Base Word2Vec module """ - def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, - seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, - epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000): + def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, + workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=10000): """ Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details. """ - if iter is not None: - warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - epochs = iter - if size is not None: - warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - vector_size = size self.gensim_model = None - self.vector_size = vector_size - self.size = vector_size + self.size = size self.alpha = alpha self.window = window self.min_count = min_count @@ -52,23 +44,23 @@ def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count= self.negative = negative self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn - self.epochs = epochs + self.iter = iter self.null_word = null_word self.trim_rule = trim_rule self.sorted_vocab = sorted_vocab self.batch_words = batch_words - + def fit(self, X, y=None): """ Fit the model according to the given training data. 
Calls gensim.models.Word2Vec """ self.gensim_model = models.Word2Vec( - sentences=X, size=self.vector_size, alpha=self.alpha, + sentences=X, size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, - hashfxn=self.hashfxn, iter=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, + hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word, trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self From 341a91f5658813e2a9470dbf7b57105f3bf0ad30 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Mon, 12 Mar 2018 12:44:57 +0530 Subject: [PATCH 41/41] Update d2vmodel.py --- gensim/sklearn_api/d2vmodel.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index b6eb3b9b04..f3f9f53133 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -8,7 +8,7 @@ Scikit learn interface for gensim for easy use of gensim with scikit-learn Follows scikit-learn API conventions """ -import warnings + import numpy as np from six import string_types from sklearn.base import TransformerMixin, BaseEstimator @@ -24,18 +24,12 @@ class D2VTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, - docvecs_mapfile=None, comment=None, trim_rule=None, size=None, vector_size=100, alpha=0.025, window=5, - min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, - cbow_mean=1, hashfxn=hash, iter=None, epochs=5, sorted_vocab=1, batch_words=10000): + docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, + hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. """ - if iter is not None: - warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - epochs = iter - if size is not None: - warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - vector_size = size self.gensim_model = None self.dm_mean = dm_mean self.dm = dm @@ -48,8 +42,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.trim_rule = trim_rule # attributes associated with gensim.models.Word2Vec - self.vector_size = vector_size - self.size = vector_size + self.size = size self.alpha = alpha self.window = window self.min_count = min_count @@ -62,10 +55,10 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.negative = negative self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn - self.epochs = epochs + self.iter = iter self.sorted_vocab = sorted_vocab self.batch_words = batch_words - + def fit(self, X, y=None): """ Fit the model according to the given training data. 
@@ -79,11 +72,11 @@ def fit(self, X, y=None): documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, size=self.vector_size, alpha=self.alpha, window=self.window, + trim_rule=self.trim_rule, vector_size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, - epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words + epochs=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self