From 2ee6ab85292971ccdcbaccbff23606a25be27597 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Wed, 28 Feb 2018 21:02:56 +0530 Subject: [PATCH 01/41] fix 'iter' and 'size' warnings --- gensim/sklearn_api/d2vmodel.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 1e3bf61d7d..f4c7afb662 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -9,6 +9,7 @@ Follows scikit-learn API conventions """ +import warnings import numpy as np from six import string_types from sklearn.base import TransformerMixin, BaseEstimator @@ -26,10 +27,18 @@ class D2VTransformer(TransformerMixin, BaseEstimator): def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, - hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000): + hashfxn=hash, iter=5,sorted_vocab=1, batch_words=10000 ,**kwargs): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. """ + if 'iter' in kwargs: + warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") + kwargs['epochs'] = kwargs['iter'] + + if 'size' in kwargs: + warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") + kwargs['vector_size'] = kwargs['size'] + self.gensim_model = None self.dm_mean = dm_mean self.dm = dm @@ -42,7 +51,9 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.trim_rule = trim_rule # attributes associated with gensim.models.Word2Vec + self.size = size + self.vector_size = vector_size self.alpha = alpha self.window = window self.min_count = min_count @@ -56,6 +67,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn self.iter = iter + self.epochs = epochs self.sorted_vocab = sorted_vocab self.batch_words = batch_words @@ -72,11 +84,11 @@ def fit(self, X, y=None): documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, size=self.size, alpha=self.alpha, window=self.window, + trim_rule=self.trim_rule, size=self.size, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, - iter=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words + iter=self.iter,epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self From 6301be686c57dac26cfd582ffe2a11b0d0e38e66 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Wed, 28 Feb 2018 21:13:34 +0530 Subject: [PATCH 02/41] Revert "fix 'iter' and 'size' warnings" --- gensim/sklearn_api/d2vmodel.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index f4c7afb662..1e3bf61d7d 100644 --- 
a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -9,7 +9,6 @@ Follows scikit-learn API conventions """ -import warnings import numpy as np from six import string_types from sklearn.base import TransformerMixin, BaseEstimator @@ -27,18 +26,10 @@ class D2VTransformer(TransformerMixin, BaseEstimator): def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, - hashfxn=hash, iter=5,sorted_vocab=1, batch_words=10000 ,**kwargs): + hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. """ - if 'iter' in kwargs: - warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - kwargs['epochs'] = kwargs['iter'] - - if 'size' in kwargs: - warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - kwargs['vector_size'] = kwargs['size'] - self.gensim_model = None self.dm_mean = dm_mean self.dm = dm @@ -51,9 +42,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.trim_rule = trim_rule # attributes associated with gensim.models.Word2Vec - self.size = size - self.vector_size = vector_size self.alpha = alpha self.window = window self.min_count = min_count @@ -67,7 +56,6 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn self.iter = iter - self.epochs = epochs self.sorted_vocab = sorted_vocab self.batch_words = batch_words @@ -84,11 +72,11 @@ def fit(self, X, y=None): documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, size=self.size, vector_size=self.vector_size, alpha=self.alpha, window=self.window, + trim_rule=self.trim_rule, size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, - iter=self.iter,epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words + iter=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self From 5b9d2ffe321ca3ac89409807a6300aa53929978a Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Wed, 28 Feb 2018 21:22:03 +0530 Subject: [PATCH 03/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 317842ee07..9bc89c6a88 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -10,6 +10,7 @@ Follows scikit-learn API conventions """ +import warnings import numpy as np import six from sklearn.base import TransformerMixin, BaseEstimator @@ -25,12 +26,21 @@ class W2VTransformer(TransformerMixin, BaseEstimator): def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, 
sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=10000): + trim_rule=None, sorted_vocab=1, batch_words=10000,**kwargs): """ Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details. """ + if iter is not None: + warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") + kwargs['epochs'] = iter + + if size is not None: + warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") + kwargs['vector_size'] = size + self.gensim_model = None self.size = size + self.vector_size = kwargs['vector_size'] self.alpha = alpha self.window = window self.min_count = min_count @@ -45,6 +55,7 @@ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size= self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn self.iter = iter + self.epochs = kwargs['epochs'] self.null_word = null_word self.trim_rule = trim_rule self.sorted_vocab = sorted_vocab @@ -56,11 +67,11 @@ def fit(self, X, y=None): Calls gensim.models.Word2Vec """ self.gensim_model = models.Word2Vec( - sentences=X, size=self.size, alpha=self.alpha, + sentences=X, size=self.size, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, - hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word, trim_rule=self.trim_rule, + hashfxn=self.hashfxn, iter=self.iter, epochs=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self From d2733ff0e116997320e0af10e9521a4bf9a7dbc8 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Wed, 28 Feb 2018 21:30:52 +0530 Subject: [PATCH 04/41] fix 'iter' and 'size' warnings --- gensim/sklearn_api/d2vmodel.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 1e3bf61d7d..d01744fea4 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -26,10 +26,19 @@ class D2VTransformer(TransformerMixin, BaseEstimator): def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, - hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000): + hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000,**kwargs): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. 
""" + + if iter is not None: + warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") + kwargs['epochs'] = iter + + if size is not None: + warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") + kwargs['vector_size'] = size + self.gensim_model = None self.dm_mean = dm_mean self.dm = dm @@ -43,6 +52,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 # attributes associated with gensim.models.Word2Vec self.size = size + self.vector_size = kwargs['vector_size'] self.alpha = alpha self.window = window self.min_count = min_count @@ -56,6 +66,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn self.iter = iter + self.epochs = kwargs['epochs'] self.sorted_vocab = sorted_vocab self.batch_words = batch_words @@ -72,11 +83,11 @@ def fit(self, X, y=None): documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, size=self.size, alpha=self.alpha, window=self.window, + trim_rule=self.trim_rule, size=self.size, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, - iter=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words + iter=self.iter, epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self From 21934e10f19ec65a3f167454aaf25de339fc69f8 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Wed, 28 Feb 2018 21:48:45 +0530 Subject: [PATCH 05/41] Fix 'iter' and 'size' warnings --- gensim/sklearn_api/d2vmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index d01744fea4..628c55783a 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -8,7 +8,7 @@ Scikit learn interface for gensim for easy use of gensim with scikit-learn Follows scikit-learn API conventions """ - +import warnings import numpy as np from six import string_types from sklearn.base import TransformerMixin, BaseEstimator From 428d428d35e2068e5738f907d0857eb60ff7f572 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Wed, 28 Feb 2018 22:27:32 +0530 Subject: [PATCH 06/41] fixed deprecated argument warnings --- gensim/sklearn_api/w2vmodel.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 9bc89c6a88..f189f07d7d 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -24,8 +24,8 @@ class W2VTransformer(TransformerMixin, BaseEstimator): Base Word2Vec module """ - def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, - workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + def __init__(self, size=None, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, + workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, null_word=0, 
trim_rule=None, sorted_vocab=1, batch_words=10000,**kwargs): """ Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details. @@ -33,10 +33,14 @@ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size= if iter is not None: warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") kwargs['epochs'] = iter + if iter is None and 'epochs' not in kwargs: + kwargs['epochs'] = 5 if size is not None: warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") kwargs['vector_size'] = size + if size is None and 'vector_size' not in kwargs: + kwargs['vector_size'] = 100 self.gensim_model = None self.size = size From 0a0e1d67b46d23c9644917aec0dc3b3bcef4d530 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Wed, 28 Feb 2018 22:30:17 +0530 Subject: [PATCH 07/41] fixed deprecated argument warnings --- gensim/sklearn_api/d2vmodel.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 628c55783a..02f34ae5c3 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -24,9 +24,9 @@ class D2VTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, - docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, + docvecs_mapfile=None, comment=None, trim_rule=None, size=None, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, - hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000,**kwargs): + hashfxn=hash, iter=None, sorted_vocab=1, batch_words=10000,**kwargs): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. 
""" @@ -34,10 +34,14 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 if iter is not None: warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") kwargs['epochs'] = iter + if iter is None and 'epochs' not in kwargs: + kwargs['epochs'] = 5 if size is not None: warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") kwargs['vector_size'] = size + if size is None and 'vector_size' not in kwargs: + kwargs['vector_size'] = 100 self.gensim_model = None self.dm_mean = dm_mean From c79bdfbe93339029a99c55a85e78451b8dcf3ada Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 10:37:36 +0530 Subject: [PATCH 08/41] Update test_sklearn_api.py --- gensim/test/test_sklearn_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index ed5516df37..7f9381bf64 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -646,7 +646,7 @@ def testModelNotFitted(self): class TestWord2VecWrapper(unittest.TestCase): def setUp(self): numpy.random.seed(0) - self.model = W2VTransformer(size=10, min_count=0, seed=42) + self.model = W2VTransformer(vector_size=10, min_count=0, seed=42) self.model.fit(texts) def testTransform(self): @@ -665,7 +665,7 @@ def testTransform(self): def testConsistencyWithGensimModel(self): # training a W2VTransformer - self.model = W2VTransformer(size=10, min_count=0, seed=42) + self.model = W2VTransformer(vector_size=10, min_count=0, seed=42) self.model.fit(texts) # training a Gensim Word2Vec model with the same params @@ -679,7 +679,7 @@ def testConsistencyWithGensimModel(self): def testPipeline(self): numpy.random.seed(0) # set fixed seed to get similar values everytime - model = W2VTransformer(size=10, min_count=1) + model = W2VTransformer(vector_size=10, min_count=1) model.fit(w2v_texts) class_dict = {'mathematics': 1, 'physics': 0} @@ -724,7 +724,7 @@ def testPersistence(self): self.assertTrue(passed) def testModelNotFitted(self): - w2vmodel_wrapper = W2VTransformer(size=10, min_count=0, seed=42) + w2vmodel_wrapper = W2VTransformer(vector_size=10, min_count=0, seed=42) word = texts[0][0] self.assertRaises(NotFittedError, w2vmodel_wrapper.transform, word) From da76d3c6b7d24df4bb55dde94fc4bb5243687e98 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 10:42:16 +0530 Subject: [PATCH 09/41] fix deprecated arguments --- gensim/test/test_sklearn_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 7f9381bf64..f28bf6817b 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -646,7 +646,7 @@ def testModelNotFitted(self): class TestWord2VecWrapper(unittest.TestCase): def setUp(self): numpy.random.seed(0) - self.model = W2VTransformer(vector_size=10, min_count=0, seed=42) + self.model = W2VTransformer(min_count=0, seed=42, vector_size=10) self.model.fit(texts) def testTransform(self): @@ -665,7 +665,7 @@ def testTransform(self): def testConsistencyWithGensimModel(self): # training a W2VTransformer - self.model = W2VTransformer(vector_size=10, min_count=0, seed=42) + self.model = W2VTransformer(min_count=0, seed=42, vector_size=10) self.model.fit(texts) # training a Gensim Word2Vec model with the same params @@ -679,7 +679,7 @@ def testConsistencyWithGensimModel(self): def 
testPipeline(self): numpy.random.seed(0) # set fixed seed to get similar values everytime - model = W2VTransformer(vector_size=10, min_count=1) + model = W2VTransformer(min_count=1,vector_size=10) model.fit(w2v_texts) class_dict = {'mathematics': 1, 'physics': 0} @@ -724,7 +724,7 @@ def testPersistence(self): self.assertTrue(passed) def testModelNotFitted(self): - w2vmodel_wrapper = W2VTransformer(vector_size=10, min_count=0, seed=42) + w2vmodel_wrapper = W2VTransformer(min_count=0, seed=42, vector_size=10) word = texts[0][0] self.assertRaises(NotFittedError, w2vmodel_wrapper.transform, word) From bbdb0d454a7627f115e5cbb15a185a187facb00e Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 22:55:08 +0530 Subject: [PATCH 10/41] fix deprecated argument warnings --- gensim/sklearn_api/w2vmodel.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index f189f07d7d..36ba96595f 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -24,27 +24,22 @@ class W2VTransformer(TransformerMixin, BaseEstimator): Base Word2Vec module """ - def __init__(self, size=None, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, - workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=10000,**kwargs): + def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, + workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=10000): """ Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details. """ if iter is not None: warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - kwargs['epochs'] = iter - if iter is None and 'epochs' not in kwargs: - kwargs['epochs'] = 5 + epochs = iter if size is not None: warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - kwargs['vector_size'] = size - if size is None and 'vector_size' not in kwargs: - kwargs['vector_size'] = 100 + vector_size = size self.gensim_model = None - self.size = size - self.vector_size = kwargs['vector_size'] + self.vector_size = vector_size self.alpha = alpha self.window = window self.min_count = min_count @@ -58,24 +53,22 @@ def __init__(self, size=None, alpha=0.025, window=5, min_count=5, max_vocab_size self.negative = negative self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn - self.iter = iter - self.epochs = kwargs['epochs'] + self.epochs = epochs self.null_word = null_word self.trim_rule = trim_rule self.sorted_vocab = sorted_vocab self.batch_words = batch_words - - def fit(self, X, y=None): + def fit(self, X, y=None): """ Fit the model according to the given training data. 
Calls gensim.models.Word2Vec """ self.gensim_model = models.Word2Vec( - sentences=X, size=self.size, vector_size=self.vector_size, alpha=self.alpha, + sentences=X, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, - hashfxn=self.hashfxn, iter=self.iter, epochs=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, + hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self From 31d3729ddc038c5a09738ca9e92ee62c71ce9d74 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 22:57:41 +0530 Subject: [PATCH 11/41] fix deprecated argument warnings --- gensim/sklearn_api/d2vmodel.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 02f34ae5c3..30ea3e0991 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -24,25 +24,21 @@ class D2VTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, - docvecs_mapfile=None, comment=None, trim_rule=None, size=None, alpha=0.025, window=5, min_count=5, + docvecs_mapfile=None, comment=None, trim_rule=None, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, - hashfxn=hash, iter=None, sorted_vocab=1, batch_words=10000,**kwargs): + hashfxn=hash, iter=None, epochs=5, sorted_vocab=1, batch_words=10000): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. """ if iter is not None: warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - kwargs['epochs'] = iter - if iter is None and 'epochs' not in kwargs: - kwargs['epochs'] = 5 + epochs = iter if size is not None: warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - kwargs['vector_size'] = size - if size is None and 'vector_size' not in kwargs: - kwargs['vector_size'] = 100 - + vector_size = size + self.gensim_model = None self.dm_mean = dm_mean self.dm = dm @@ -55,8 +51,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.trim_rule = trim_rule # attributes associated with gensim.models.Word2Vec - self.size = size - self.vector_size = kwargs['vector_size'] + self.vector_size = vector_size self.alpha = alpha self.window = window self.min_count = min_count @@ -69,12 +64,11 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.negative = negative self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn - self.iter = iter - self.epochs = kwargs['epochs'] + self.epochs = epochs self.sorted_vocab = sorted_vocab self.batch_words = batch_words - - def fit(self, X, y=None): + + def fit(self, X, y=None): """ Fit the model according to the given training data.
Calls gensim.models.Doc2Vec @@ -87,14 +81,15 @@ def fit(self, X, y=None): documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, size=self.size, vector_size=self.vector_size, alpha=self.alpha, window=self.window, + trim_rule=self.trim_rule, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, - iter=self.iter, epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words + epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self + def transform(self, docs): """ Return the vector representations for the input documents. From bbcd0e548df0f321a720e4cd7bffabfdafb751d9 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 23:00:27 +0530 Subject: [PATCH 12/41] fix deprecated argument warnings --- gensim/models/word2vec.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index f51b4cd25f..bf4dcefd89 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -411,25 +411,21 @@ def score_cbow_pair(model, word, l1): class Word2Vec(BaseWordEmbeddingsModel): """Class for training, using and evaluating neural networks described in https://code.google.com/p/word2vec/ - If you're finished training a model (=no more updates, only querying) then switch to the :mod:`gensim.models.KeyedVectors` instance in wv - The model can be stored/loaded via its :meth:`~gensim.models.word2vec.Word2Vec.save()` and :meth:`~gensim.models.word2vec.Word2Vec.load()` methods, or stored/loaded in a format compatible with the original word2vec implementation via `wv.save_word2vec_format()` and `Word2VecKeyedVectors.load_word2vec_format()`. - """ - def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, + def __init__( sentences=None, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=()): + sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, null_word=0, + trim_rule=None, sorted_vocab=1, compute_loss=False, callbacks=()): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. - Parameters ---------- sentences : iterable of iterables The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it in some other way. - sg : int {1, 0} Defines the training algorithm. If 1, skip-gram is employed; otherwise, CBOW is used.
size : int @@ -498,30 +493,38 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. - Examples -------- Initialize and train a `Word2Vec` model - >>> from gensim.models import Word2Vec >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] >>> >>> model = Word2Vec(sentences, min_count=1) >>> say_vector = model['say'] # get vector for word - """ - + + + if iter is not None: + warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") + epochs = iter + + if size is not None: + warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") + vector_size = size + + self.vector_size = vector_size + self.epochs = epochs self.callbacks = callbacks self.load = call_on_class_only - self.wv = Word2VecKeyedVectors(size) + self.wv = Word2VecKeyedVectors(vector_size) self.vocabulary = Word2VecVocab( max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, sorted_vocab=bool(sorted_vocab), null_word=null_word) - self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn) + self.trainables = Word2VecTrainables(seed=seed, vector_size=vector_size, hashfxn=hashfxn) super(Word2Vec, self).__init__( - sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks, + sentences=sentences, workers=workers, vector_size=vector_size, epochs=epochs, callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss, fast_version=FAST_VERSION) From 734d5c56a5c89e656f28b2ad1460e2b3dd95bdbd Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 23:02:38 +0530 Subject: [PATCH 13/41] fix deprecated argumet warnings --- gensim/models/doc2vec.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index f57694273d..d860163f8b 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -278,7 +278,6 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), **kwargs): """Initialize the model from an iterable of `documents`. Each document is a TaggedDocument object that will be used for training. - Parameters ---------- documents : iterable of iterables @@ -286,11 +285,9 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 consider an iterable that streams the documents directly from disk/network. If you don't supply `documents`, the model is left uninitialized -- use if you plan to initialize it in some other way. - dm : int {1,0} Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used. Otherwise, `distributed bag of words` (PV-DBOW) is employed. - size : int Dimensionality of the feature vectors. window : int @@ -349,7 +346,6 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 of the model. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. 
- """ if 'sentences' in kwargs: @@ -404,7 +400,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 self.train( documents, total_examples=self.corpus_count, epochs=self.epochs, start_alpha=self.alpha, end_alpha=self.min_alpha, callbacks=callbacks) - + @property def dm(self): """int {1,0} : `dm=1` indicates 'distributed memory' (PV-DM) else From 7cfbfe194cb11761604366f51f64e32c95c3f6b5 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 23:04:30 +0530 Subject: [PATCH 14/41] fix deprecated argument warnings From be108c1e60c3f5ecaa10aa84404a4cd1ea1dfc5b Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 23:04:52 +0530 Subject: [PATCH 15/41] fix deprecated argument warnings From e4f7ad715dfabc58f4281760fe4c27389c09eb9f Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Thu, 1 Mar 2018 23:27:04 +0530 Subject: [PATCH 16/41] fix deprecated arguments --- gensim/models/word2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index bf4dcefd89..e4f658ca53 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -419,7 +419,7 @@ class Word2Vec(BaseWordEmbeddingsModel): and `Word2VecKeyedVectors.load_word2vec_format()`. """ - def __init__( sentences=None, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, + def __init__(self, sentences=None, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, compute_loss=False, callbacks=()): From 02fdf706121b81305138f666cf9b294a95a50601 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 11:18:48 +0530 Subject: [PATCH 17/41] fix deprecated arguments --- gensim/sklearn_api/d2vmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 30ea3e0991..45bf2750ea 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -81,7 +81,7 @@ def fit(self, X, y=None): documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, vector_size=self.vector_size, alpha=self.alpha, window=self.window, + trim_rule=self.trim_rule, size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, From 477b3814303d1d2263ddbba62d97437ac2b261e0 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 11:21:12 +0530 Subject: [PATCH 18/41] fix deprecated arguments --- gensim/sklearn_api/w2vmodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 36ba96595f..443c6801ae 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -64,11 +64,11 @@ def fit(self, X, y=None): Calls gensim.models.Word2Vec """ self.gensim_model = models.Word2Vec( - sentences=X, vector_size=self.vector_size, alpha=self.alpha, + sentences=X, size=self.vector_size, 
alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, - hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, + hashfxn=self.hashfxn, iter=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self From 36993ef8c4ac495cd56882cbc3443cf3c3952df1 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 11:28:42 +0530 Subject: [PATCH 19/41] Update doc2vec.py --- gensim/models/doc2vec.py | 44 ++-------------------------------------- 1 file changed, 2 insertions(+), 42 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index d860163f8b..ec912608d6 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -1,3 +1,4 @@ + #!/usr/bin/env python # -*- coding: utf-8 -*- # @@ -9,27 +10,16 @@ """ Deep learning via the distributed memory and distributed bag of words models from [1]_, using either hierarchical softmax or negative sampling [2]_ [3]_. See [#tutorial]_ - **Make sure you have a C compiler before installing gensim, to use optimized (compiled) doc2vec training** (70x speedup [blog]_). - Initialize a model with e.g.:: - >>> model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4) - Persist a model to disk with:: - >>> model.save(fname) >>> model = Doc2Vec.load(fname) # you can continue training with the loaded model! - If you're finished training a model (=no more updates, only querying), you can do - >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True): - to trim unneeded model memory = use (much) less RAM. - - - .. [1] Quoc Le and Tomas Mikolov. Distributed Representations of Sentences and Documents. http://arxiv.org/pdf/1405.4053v2.pdf .. [2] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. @@ -37,12 +27,8 @@ .. [3] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. .. [blog] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/ - .. [#tutorial] Doc2vec in gensim tutorial, https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb - - - """ import logging @@ -239,9 +225,7 @@ class TaggedDocument(namedtuple('TaggedDocument', 'words tags')): and `tags` (a list of tokens). Tags may be one or more unicode string tokens, but typical practice (which will also be most memory-efficient) is for the tags list to include a unique integer id as the only tag. - Replaces "sentence as a list of words" from Word2Vec. - """ def __str__(self): @@ -257,9 +241,7 @@ class LabeledSentence(TaggedDocument): class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')): """A string document tag discovered during the initial vocabulary scan. (The document-vector equivalent of a Vocab object.) - Will not be used if all presented document tags are ints. - The offset is only the true index into the doctags_syn0/doctags_syn0_lockf if-and-only-if no raw-int tags were used. 
If any raw-int tags were used, string Doctag vectors begin at index (max_rawint + 1), so the true index is @@ -400,7 +382,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 self.train( documents, total_examples=self.corpus_count, epochs=self.epochs, start_alpha=self.alpha, end_alpha=self.min_alpha, callbacks=callbacks) - + @property def dm(self): """int {1,0} : `dm=1` indicates 'distributed memory' (PV-DM) else @@ -463,18 +445,15 @@ def train(self, documents, total_examples=None, total_words=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=()): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). The `documents` iterable can be simply a list of TaggedDocument elements. - To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate progress-percentage logging, either total_examples (count of sentences) or total_words (count of raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab()`, the count of examples in that corpus will be available in the model's :attr:`corpus_count` property). - To avoid common mistakes around the model's ability to do multiple training passes itself, an explicit `epochs` argument **MUST** be provided. In the common and recommended case, where :meth:`~gensim.models.word2vec.Word2Vec.train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. - Parameters ---------- documents : iterable of iterables @@ -518,7 +497,6 @@ def estimated_lookup_memory(self): def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): """ Infer a vector for given post-bulk training document. - Parameters ---------- doc_words : :obj: `list` of :obj: `str` @@ -529,12 +507,10 @@ def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): Learning rate will linearly drop to `min_alpha` as training progresses. steps : int Number of times to train the new document. - Returns ------- :obj: `numpy.ndarray` Returns the inferred vector for the new document. - """ doctag_vectors, doctag_locks = self.trainables.get_doctag_trainables(doc_words, self.docvecs.vector_size) doctag_indexes = [0] @@ -605,7 +581,6 @@ def __str__(self): def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True): """Discard parameters that are used in training and score. Use if you're sure you're done training a model. - Parameters ---------- keep_doctags_vectors : bool @@ -613,7 +588,6 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen in this case you can't to use docvecs's most_similar, similarity etc. methods. keep_inference : bool Set `keep_inference` to False if you don't want to store parameters that is used for infer_vector method - """ if not keep_inference: if hasattr(self.trainables, 'syn1'): @@ -631,7 +605,6 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False): """Store the input-hidden weight matrix in the same format used by the original C word2vec-tool, for compatibility. 
- Parameters ---------- fname : str @@ -647,7 +620,6 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* Optional file path used to save the vocabulary binary : bool If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. - """ total_vec = len(self.wv.vocab) + len(self.docvecs) write_first_line = False @@ -668,14 +640,11 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* def init_sims(self, replace=False): """ Precompute L2-normalized vectors. - If `replace` is set, forget the original vectors and only keep the normalized ones = saves lots of memory! - Note that you **cannot continue training or inference** after doing a replace. The model becomes effectively read-only = you can call `most_similar`, `similarity` etc., but not `train` or `infer_vector`. - """ return self.docvecs.init_sims(replace=replace) @@ -698,7 +667,6 @@ def estimate_memory(self, vocab_size=None, report=None): def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence is a iterable of iterables (can simply be a list of unicode strings too). - Parameters ---------- documents : iterable of iterables @@ -737,7 +705,6 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No Build vocabulary from a dictionary of word frequencies. Build model vocabulary from a passed dictionary that contains (word,word count). Words must be of type unicode strings. - Parameters ---------- word_freq : dict @@ -756,7 +723,6 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No of the model. update : bool If true, the new provided words in `word_freq` dict will be added to model's vocab. - Examples -------- >>> from gensim.models.word2vec import Word2Vec @@ -936,23 +902,17 @@ def __iter__(self): class TaggedLineDocument(object): """Simple format: one document = one line = one TaggedDocument object. - Words are expected to be already preprocessed and separated by whitespace, tags are constructed automatically from the document line number.""" def __init__(self, source): """ `source` can be either a string (filename) or a file object. - Example:: - documents = TaggedLineDocument('myfile.txt') - Or for compressed files:: - documents = TaggedLineDocument('compressed_text.txt.bz2') documents = TaggedLineDocument('compressed_text.txt.gz') - """ self.source = source From 055f043a14b439efad306be24b86845f90f43d98 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 11:37:36 +0530 Subject: [PATCH 20/41] Update word2vec.py --- gensim/models/word2vec.py | 97 +++------------------------------------ 1 file changed, 7 insertions(+), 90 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index e4f658ca53..1ca8fd0af3 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -7,92 +7,59 @@ """Produce word vectors with deep learning via word2vec's "skip-gram and CBOW models", using either hierarchical softmax or negative sampling [1]_ [2]_. - NOTE: There are more ways to get word vectors in Gensim than just Word2Vec. See FastText and wrappers for VarEmbed and WordRank. - The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/ and extended with additional functionality. 
- For a blog tutorial on gensim word2vec, with an interactive web app trained on GoogleNews, visit http://radimrehurek.com/2014/02/word2vec-tutorial/ - **Make sure you have a C compiler before installing gensim, to use optimized (compiled) word2vec training** (70x speedup compared to plain NumPy implementation [3]_). - Initialize a model with e.g.:: - >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) - Persist a model to disk with:: - >>> model.save(fname) >>> model = Word2Vec.load(fname) # you can continue training with the loaded model! - The word vectors are stored in a KeyedVectors instance in model.wv. This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec:: - >>> model.wv['computer'] # numpy vector of a word array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) - The word vectors can also be instantiated from an existing file on disk in the word2vec C format as a KeyedVectors instance. - NOTE: It is impossible to continue training the vectors loaded from the C format because hidden weights, vocabulary frequency and the binary tree is missing:: - >>> from gensim.models import KeyedVectors >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format - - You can perform various NLP word tasks with the model. Some of them are already built-in:: - >>> model.wv.most_similar(positive=['woman', 'king'], negative=['man']) [('queen', 0.50882536), ...] - >>> model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) [('queen', 0.71382287), ...] - - >>> model.wv.doesnt_match("breakfast cereal dinner lunch".split()) 'cereal' - >>> model.wv.similarity('woman', 'man') 0.73723527 - Probability of a text under the model:: - >>> model.score(["The fox jumped over a lazy dog".split()]) 0.2158356 - Correlation with human opinion on word similarity:: - >>> model.wv.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv')) 0.51, 0.62, 0.13 - And on analogies:: - >>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) - and so on. - If you're finished training a model (i.e. no more updates, only querying), then switch to the :mod:`gensim.models.KeyedVectors` instance in wv - >>> word_vectors = model.wv >>> del model - to trim unneeded model memory = use much less RAM. - Note that there is a :mod:`gensim.models.phrases` module which lets you automatically detect phrases longer than one word. Using phrases, you can learn a word2vec model where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`: - >>> bigram_transformer = gensim.models.Phrases(sentences) >>> model = Word2Vec(bigram_transformer[sentences], size=100, ...) - .. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. .. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. @@ -419,10 +386,10 @@ class Word2Vec(BaseWordEmbeddingsModel): and `Word2VecKeyedVectors.load_word2vec_format()`. 
""" - def __init__(self, sentences=None, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, + def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, null_word=0, - trim_rule=None, sorted_vocab=1, compute_loss=False, callbacks=()): + sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=()): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. @@ -502,29 +469,18 @@ def __init__(self, sentences=None, size=None, vector_size=100, alpha=0.025, wind >>> model = Word2Vec(sentences, min_count=1) >>> say_vector = model['say'] # get vector for word """ - - - if iter is not None: - warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - epochs = iter - - if size is not None: - warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - vector_size = size - - self.vector_size = vector_size - self.epochs = epochs + self.callbacks = callbacks self.load = call_on_class_only - self.wv = Word2VecKeyedVectors(vector_size) + self.wv = Word2VecKeyedVectors(size) self.vocabulary = Word2VecVocab( max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, sorted_vocab=bool(sorted_vocab), null_word=null_word) - self.trainables = Word2VecTrainables(seed=seed, vector_size=vector_size, hashfxn=hashfxn) + self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn) super(Word2Vec, self).__init__( - sentences=sentences, workers=workers, vector_size=vector_size, epochs=epochs, callbacks=callbacks, + sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss, fast_version=FAST_VERSION) @@ -556,18 +512,15 @@ def train(self, sentences, total_examples=None, total_words=None, queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.) - To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate progress-percentage logging, either total_examples (count of sentences) or total_words (count of raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab()`, the count of examples in that corpus will be available in the model's :attr:`corpus_count` property). - To avoid common mistakes around the model's ability to do multiple training passes itself, an explicit `epochs` argument **MUST** be provided. In the common and recommended case, where :meth:`~gensim.models.word2vec.Word2Vec.train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. 
- Parameters ---------- sentences : iterable of iterables @@ -596,7 +549,6 @@ def train(self, sentences, total_examples=None, total_words=None, If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. - Examples -------- >>> from gensim.models import Word2Vec @@ -605,7 +557,6 @@ def train(self, sentences, total_examples=None, total_words=None, >>> model = Word2Vec(min_count=1) >>> model.build_vocab(sentences) >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) - """ return super(Word2Vec, self).train( @@ -617,20 +568,15 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor """Score the log probability for a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of unicode strings. This does not change the fitted model in any way (see Word2Vec.train() for that). - We have currently only implemented score for the hierarchical softmax scheme, so you need to have run word2vec with hs=1 and negative=0 for this to work. - Note that you should specify total_sentences; we'll run into problems if you ask to score more than this number of sentences but it is inefficient to set the value too high. - See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of how to use such scores in document classification. - .. [#taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics. .. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb - Parameters ---------- sentences : iterable of iterables @@ -646,7 +592,6 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor Multiplier for size of queue (number of workers * queue_factor). report_delay : float Seconds to wait before reporting progress. - """ if FAST_VERSION < 0: warnings.warn( @@ -763,20 +708,16 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut given, where it intersects with the current vocabulary. (No words are added to the existing vocabulary, but intersecting words adopt the file's weights, and non-intersecting words are left alone.) - Parameters ---------- fname : str The file path used to save the vectors in - binary : bool If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. - lockf : float Lock-factor value to be set for any imported word-vectors; the default value of 0.0 prevents further updating of the vector during subsequent training. Use 1.0 to allow further training updates of merged vectors. - """ overlap_count = 0 logger.info("loading projection weights from %s", fname) @@ -834,19 +775,16 @@ def __contains__(self, word): def predict_output_word(self, context_words_list, topn=10): """Report the probability distribution of the center word given the context words as input to the trained model. 
- Parameters ---------- context_words_list : :obj: `list` of :obj: `str` List of context words topn: int Return `topn` words and their probabilities - Returns ------- :obj: `list` of :obj: `tuple` `topn` length list of tuples of (word, probability) - """ if not self.negative: raise RuntimeError( @@ -921,12 +859,10 @@ def delete_temporary_training_data(self, replace_word_vectors_with_normalized=Fa def save(self, *args, **kwargs): """Save the model. This saved model can be loaded again using :func:`~gensim.models.word2vec.Word2Vec.load`, which supports online training and getting vectors for vocabulary words. - Parameters ---------- fname : str Path to the file. - """ # don't bother storing the cached normalized vectors, recalculable table kwargs['ignore'] = kwargs.get('ignore', ['vectors_norm', 'cum_table']) @@ -963,12 +899,10 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False): @classmethod def load(cls, *args, **kwargs): """Loads a previously saved `Word2Vec` model. Also see `save()`. - Parameters ---------- fname : str Path to the saved file. - Returns ------- :obj: `~gensim.models.word2vec.Word2Vec` @@ -1042,16 +976,11 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): """ `source` can be either a string or a file object. Clip the file to the first `limit` lines (or not clipped if limit is None, the default). - Example:: - sentences = LineSentence('myfile.txt') - Or for compressed files:: - sentences = LineSentence('compressed_text.txt.bz2') sentences = LineSentence('compressed_text.txt.gz') - """ self.source = source self.max_sentence_length = max_sentence_length @@ -1084,23 +1013,17 @@ class PathLineSentences(object): """Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename. The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. Any file not ending with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories. - The format of files (either text, or compressed text files) in the path is one sentence = one line, with words already preprocessed and separated by whitespace. - """ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): """ `source` should be a path to a directory (as a string) where all files can be opened by the LineSentence class. Each file will be read up to `limit` lines (or not clipped if limit is None, the default). - Example:: - sentences = PathLineSentences(os.getcwd() + '\\corpus\\') - The files in the directory should be either text files, .bz2 files, or .gz files. - """ self.source = source self.max_sentence_length = max_sentence_length @@ -1193,15 +1116,12 @@ def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, tr min_count=None, sample=None, dry_run=False): """Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample` (controlling the downsampling of more-frequent words). - Calling with `dry_run=True` will only simulate the provided settings and report the size of the retained vocabulary, effective corpus length, and estimated memory requirements. Results are both printed via logging and returned as a dict. - Delete the raw vocabulary after the scaling is done to free up RAM, unless `keep_raw_vocab` is set. 
- """ min_count = min_count or self.min_count sample = sample or self.sample @@ -1337,7 +1257,6 @@ def add_null_word(self, wv): def create_binary_tree(self, wv): """Create a binary Huffman tree using stored vocabulary word counts. Frequent words will have shorter binary codes. Called internally from `build_vocab()`. - """ logger.info("constructing a huffman tree from %i words", len(wv.vocab)) @@ -1370,12 +1289,10 @@ def create_binary_tree(self, wv): def make_cum_table(self, wv, power=0.75, domain=2**31 - 1): """Create a cumulative-distribution table using stored vocabulary word counts for drawing random words in the negative-sampling training routines. - To draw a word index, choose a random integer up to the maximum value in the table (cum_table[-1]), then finding that integer's sorted insertion point (as if by bisect_left or ndarray.searchsorted()). That insertion point is the drawn index, coming up in proportion equal to the increment at that slot. - Called internally from 'build_vocab()'. """ vocab_size = len(wv.index2word) From 74d1c59c2f43882f0af5bf213abad81bc58e3feb Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 12:31:13 +0530 Subject: [PATCH 21/41] Update doc2vec.py --- gensim/models/doc2vec.py | 54 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index ec912608d6..ebf2793f59 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -1,4 +1,3 @@ - #!/usr/bin/env python # -*- coding: utf-8 -*- # @@ -10,16 +9,27 @@ """ Deep learning via the distributed memory and distributed bag of words models from [1]_, using either hierarchical softmax or negative sampling [2]_ [3]_. See [#tutorial]_ + **Make sure you have a C compiler before installing gensim, to use optimized (compiled) doc2vec training** (70x speedup [blog]_). + Initialize a model with e.g.:: + >>> model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4) + Persist a model to disk with:: + >>> model.save(fname) >>> model = Doc2Vec.load(fname) # you can continue training with the loaded model! + If you're finished training a model (=no more updates, only querying), you can do + >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True): + to trim unneeded model memory = use (much) less RAM. + + + .. [1] Quoc Le and Tomas Mikolov. Distributed Representations of Sentences and Documents. http://arxiv.org/pdf/1405.4053v2.pdf .. [2] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. @@ -27,8 +37,12 @@ .. [3] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. .. [blog] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/ + .. [#tutorial] Doc2vec in gensim tutorial, https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb + + + """ import logging @@ -225,7 +239,9 @@ class TaggedDocument(namedtuple('TaggedDocument', 'words tags')): and `tags` (a list of tokens). Tags may be one or more unicode string tokens, but typical practice (which will also be most memory-efficient) is for the tags list to include a unique integer id as the only tag. + Replaces "sentence as a list of words" from Word2Vec. 
+ """ def __str__(self): @@ -241,7 +257,9 @@ class LabeledSentence(TaggedDocument): class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')): """A string document tag discovered during the initial vocabulary scan. (The document-vector equivalent of a Vocab object.) + Will not be used if all presented document tags are ints. + The offset is only the true index into the doctags_syn0/doctags_syn0_lockf if-and-only-if no raw-int tags were used. If any raw-int tags were used, string Doctag vectors begin at index (max_rawint + 1), so the true index is @@ -260,6 +278,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), **kwargs): """Initialize the model from an iterable of `documents`. Each document is a TaggedDocument object that will be used for training. + Parameters ---------- documents : iterable of iterables @@ -267,9 +286,11 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 consider an iterable that streams the documents directly from disk/network. If you don't supply `documents`, the model is left uninitialized -- use if you plan to initialize it in some other way. + dm : int {1,0} Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used. Otherwise, `distributed bag of words` (PV-DBOW) is employed. + size : int Dimensionality of the feature vectors. window : int @@ -328,8 +349,9 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 of the model. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. + """ - + if 'sentences' in kwargs: raise DeprecationWarning( "Parameter 'sentences' was renamed to 'documents', and will be removed in 4.0.0, " @@ -445,15 +467,18 @@ def train(self, documents, total_examples=None, total_words=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=()): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). The `documents` iterable can be simply a list of TaggedDocument elements. + To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate progress-percentage logging, either total_examples (count of sentences) or total_words (count of raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab()`, the count of examples in that corpus will be available in the model's :attr:`corpus_count` property). + To avoid common mistakes around the model's ability to do multiple training passes itself, an explicit `epochs` argument **MUST** be provided. In the common and recommended case, where :meth:`~gensim.models.word2vec.Word2Vec.train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. + Parameters ---------- documents : iterable of iterables @@ -497,6 +522,7 @@ def estimated_lookup_memory(self): def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): """ Infer a vector for given post-bulk training document. + Parameters ---------- doc_words : :obj: `list` of :obj: `str` @@ -507,10 +533,12 @@ def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): Learning rate will linearly drop to `min_alpha` as training progresses. steps : int Number of times to train the new document. 
+ Returns ------- :obj: `numpy.ndarray` Returns the inferred vector for the new document. + """ doctag_vectors, doctag_locks = self.trainables.get_doctag_trainables(doc_words, self.docvecs.vector_size) doctag_indexes = [0] @@ -581,6 +609,7 @@ def __str__(self): def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True): """Discard parameters that are used in training and score. Use if you're sure you're done training a model. + Parameters ---------- keep_doctags_vectors : bool @@ -588,7 +617,8 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen in this case you can't to use docvecs's most_similar, similarity etc. methods. keep_inference : bool Set `keep_inference` to False if you don't want to store parameters that is used for infer_vector method - """ + + """ if not keep_inference: if hasattr(self.trainables, 'syn1'): del self.trainables.syn1 @@ -605,7 +635,8 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False): """Store the input-hidden weight matrix in the same format used by the original C word2vec-tool, for compatibility. - Parameters + + Parameters ---------- fname : str The file path used to save the vectors in. @@ -620,6 +651,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* Optional file path used to save the vocabulary binary : bool If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. + """ total_vec = len(self.wv.vocab) + len(self.docvecs) write_first_line = False @@ -640,11 +672,14 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* def init_sims(self, replace=False): """ Precompute L2-normalized vectors. + If `replace` is set, forget the original vectors and only keep the normalized ones = saves lots of memory! + Note that you **cannot continue training or inference** after doing a replace. The model becomes effectively read-only = you can call `most_similar`, `similarity` etc., but not `train` or `infer_vector`. + """ return self.docvecs.init_sims(replace=replace) @@ -667,6 +702,7 @@ def estimate_memory(self, vocab_size=None, report=None): def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence is a iterable of iterables (can simply be a list of unicode strings too). + Parameters ---------- documents : iterable of iterables @@ -705,6 +741,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No Build vocabulary from a dictionary of word frequencies. Build model vocabulary from a passed dictionary that contains (word,word count). Words must be of type unicode strings. + Parameters ---------- word_freq : dict @@ -723,6 +760,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No of the model. update : bool If true, the new provided words in `word_freq` dict will be added to model's vocab. + Examples -------- >>> from gensim.models.word2vec import Word2Vec @@ -902,17 +940,23 @@ def __iter__(self): class TaggedLineDocument(object): """Simple format: one document = one line = one TaggedDocument object. 
- Words are expected to be already preprocessed and separated by whitespace, + + Words are expected to be already preprocessed and separated by whitespace, tags are constructed automatically from the document line number.""" def __init__(self, source): """ `source` can be either a string (filename) or a file object. + Example:: + documents = TaggedLineDocument('myfile.txt') + Or for compressed files:: + documents = TaggedLineDocument('compressed_text.txt.bz2') documents = TaggedLineDocument('compressed_text.txt.gz') + """ self.source = source From 9b8aa9151ab4e14763bf1af0d8c210803409703c Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 13:11:29 +0530 Subject: [PATCH 22/41] Update doc2vec.py --- gensim/models/doc2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index ebf2793f59..8c2d166fc8 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -636,7 +636,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* """Store the input-hidden weight matrix in the same format used by the original C word2vec-tool, for compatibility. - Parameters + Parameters ---------- fname : str The file path used to save the vectors in. From 7bc192dd67c708843872527a3a417d067c6415c5 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 13:17:35 +0530 Subject: [PATCH 23/41] Update word2vec.py --- gensim/models/word2vec.py | 80 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 1ca8fd0af3..f51b4cd25f 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -7,59 +7,92 @@ """Produce word vectors with deep learning via word2vec's "skip-gram and CBOW models", using either hierarchical softmax or negative sampling [1]_ [2]_. + NOTE: There are more ways to get word vectors in Gensim than just Word2Vec. See FastText and wrappers for VarEmbed and WordRank. + The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/ and extended with additional functionality. + For a blog tutorial on gensim word2vec, with an interactive web app trained on GoogleNews, visit http://radimrehurek.com/2014/02/word2vec-tutorial/ + **Make sure you have a C compiler before installing gensim, to use optimized (compiled) word2vec training** (70x speedup compared to plain NumPy implementation [3]_). + Initialize a model with e.g.:: + >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) + Persist a model to disk with:: + >>> model.save(fname) >>> model = Word2Vec.load(fname) # you can continue training with the loaded model! + The word vectors are stored in a KeyedVectors instance in model.wv. This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec:: + >>> model.wv['computer'] # numpy vector of a word array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) + The word vectors can also be instantiated from an existing file on disk in the word2vec C format as a KeyedVectors instance. 
+ NOTE: It is impossible to continue training the vectors loaded from the C format because hidden weights, vocabulary frequency and the binary tree is missing:: + >>> from gensim.models import KeyedVectors >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format + + You can perform various NLP word tasks with the model. Some of them are already built-in:: + >>> model.wv.most_similar(positive=['woman', 'king'], negative=['man']) [('queen', 0.50882536), ...] + >>> model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) [('queen', 0.71382287), ...] + + >>> model.wv.doesnt_match("breakfast cereal dinner lunch".split()) 'cereal' + >>> model.wv.similarity('woman', 'man') 0.73723527 + Probability of a text under the model:: + >>> model.score(["The fox jumped over a lazy dog".split()]) 0.2158356 + Correlation with human opinion on word similarity:: + >>> model.wv.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv')) 0.51, 0.62, 0.13 + And on analogies:: + >>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) + and so on. + If you're finished training a model (i.e. no more updates, only querying), then switch to the :mod:`gensim.models.KeyedVectors` instance in wv + >>> word_vectors = model.wv >>> del model + to trim unneeded model memory = use much less RAM. + Note that there is a :mod:`gensim.models.phrases` module which lets you automatically detect phrases longer than one word. Using phrases, you can learn a word2vec model where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`: + >>> bigram_transformer = gensim.models.Phrases(sentences) >>> model = Word2Vec(bigram_transformer[sentences], size=100, ...) + .. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. .. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. @@ -378,12 +411,15 @@ def score_cbow_pair(model, word, l1): class Word2Vec(BaseWordEmbeddingsModel): """Class for training, using and evaluating neural networks described in https://code.google.com/p/word2vec/ + If you're finished training a model (=no more updates, only querying) then switch to the :mod:`gensim.models.KeyedVectors` instance in wv + The model can be stored/loaded via its :meth:`~gensim.models.word2vec.Word2Vec.save()` and :meth:`~gensim.models.word2vec.Word2Vec.load()` methods, or stored/loaded in a format compatible with the original word2vec implementation via `wv.save_word2vec_format()` and `Word2VecKeyedVectors.load_word2vec_format()`. + """ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, @@ -393,6 +429,7 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. + Parameters ---------- sentences : iterable of iterables @@ -402,6 +439,7 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it in some other way. 
+ sg : int {1, 0} Defines the training algorithm. If 1, skip-gram is employed; otherwise, CBOW is used. size : int @@ -460,14 +498,17 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. + Examples -------- Initialize and train a `Word2Vec` model + >>> from gensim.models import Word2Vec >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] >>> >>> model = Word2Vec(sentences, min_count=1) >>> say_vector = model['say'] # get vector for word + """ self.callbacks = callbacks @@ -512,15 +553,18 @@ def train(self, sentences, total_examples=None, total_words=None, queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.) + To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate progress-percentage logging, either total_examples (count of sentences) or total_words (count of raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab()`, the count of examples in that corpus will be available in the model's :attr:`corpus_count` property). + To avoid common mistakes around the model's ability to do multiple training passes itself, an explicit `epochs` argument **MUST** be provided. In the common and recommended case, where :meth:`~gensim.models.word2vec.Word2Vec.train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. + Parameters ---------- sentences : iterable of iterables @@ -549,6 +593,7 @@ def train(self, sentences, total_examples=None, total_words=None, If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. + Examples -------- >>> from gensim.models import Word2Vec @@ -557,6 +602,7 @@ def train(self, sentences, total_examples=None, total_words=None, >>> model = Word2Vec(min_count=1) >>> model.build_vocab(sentences) >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) + """ return super(Word2Vec, self).train( @@ -568,15 +614,20 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor """Score the log probability for a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of unicode strings. This does not change the fitted model in any way (see Word2Vec.train() for that). + We have currently only implemented score for the hierarchical softmax scheme, so you need to have run word2vec with hs=1 and negative=0 for this to work. + Note that you should specify total_sentences; we'll run into problems if you ask to score more than this number of sentences but it is inefficient to set the value too high. + See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of how to use such scores in document classification. + .. [#taddy] Taddy, Matt. 
Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics. .. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb + Parameters ---------- sentences : iterable of iterables @@ -592,6 +643,7 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor Multiplier for size of queue (number of workers * queue_factor). report_delay : float Seconds to wait before reporting progress. + """ if FAST_VERSION < 0: warnings.warn( @@ -708,16 +760,20 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut given, where it intersects with the current vocabulary. (No words are added to the existing vocabulary, but intersecting words adopt the file's weights, and non-intersecting words are left alone.) + Parameters ---------- fname : str The file path used to save the vectors in + binary : bool If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. + lockf : float Lock-factor value to be set for any imported word-vectors; the default value of 0.0 prevents further updating of the vector during subsequent training. Use 1.0 to allow further training updates of merged vectors. + """ overlap_count = 0 logger.info("loading projection weights from %s", fname) @@ -775,16 +831,19 @@ def __contains__(self, word): def predict_output_word(self, context_words_list, topn=10): """Report the probability distribution of the center word given the context words as input to the trained model. + Parameters ---------- context_words_list : :obj: `list` of :obj: `str` List of context words topn: int Return `topn` words and their probabilities + Returns ------- :obj: `list` of :obj: `tuple` `topn` length list of tuples of (word, probability) + """ if not self.negative: raise RuntimeError( @@ -859,10 +918,12 @@ def delete_temporary_training_data(self, replace_word_vectors_with_normalized=Fa def save(self, *args, **kwargs): """Save the model. This saved model can be loaded again using :func:`~gensim.models.word2vec.Word2Vec.load`, which supports online training and getting vectors for vocabulary words. + Parameters ---------- fname : str Path to the file. + """ # don't bother storing the cached normalized vectors, recalculable table kwargs['ignore'] = kwargs.get('ignore', ['vectors_norm', 'cum_table']) @@ -899,10 +960,12 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False): @classmethod def load(cls, *args, **kwargs): """Loads a previously saved `Word2Vec` model. Also see `save()`. + Parameters ---------- fname : str Path to the saved file. + Returns ------- :obj: `~gensim.models.word2vec.Word2Vec` @@ -976,11 +1039,16 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): """ `source` can be either a string or a file object. Clip the file to the first `limit` lines (or not clipped if limit is None, the default). + Example:: + sentences = LineSentence('myfile.txt') + Or for compressed files:: + sentences = LineSentence('compressed_text.txt.bz2') sentences = LineSentence('compressed_text.txt.gz') + """ self.source = source self.max_sentence_length = max_sentence_length @@ -1013,17 +1081,23 @@ class PathLineSentences(object): """Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename. The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. 
Any file not ending with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories. + The format of files (either text, or compressed text files) in the path is one sentence = one line, with words already preprocessed and separated by whitespace. + """ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): """ `source` should be a path to a directory (as a string) where all files can be opened by the LineSentence class. Each file will be read up to `limit` lines (or not clipped if limit is None, the default). + Example:: + sentences = PathLineSentences(os.getcwd() + '\\corpus\\') + The files in the directory should be either text files, .bz2 files, or .gz files. + """ self.source = source self.max_sentence_length = max_sentence_length @@ -1116,12 +1190,15 @@ def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, tr min_count=None, sample=None, dry_run=False): """Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample` (controlling the downsampling of more-frequent words). + Calling with `dry_run=True` will only simulate the provided settings and report the size of the retained vocabulary, effective corpus length, and estimated memory requirements. Results are both printed via logging and returned as a dict. + Delete the raw vocabulary after the scaling is done to free up RAM, unless `keep_raw_vocab` is set. + """ min_count = min_count or self.min_count sample = sample or self.sample @@ -1257,6 +1334,7 @@ def add_null_word(self, wv): def create_binary_tree(self, wv): """Create a binary Huffman tree using stored vocabulary word counts. Frequent words will have shorter binary codes. Called internally from `build_vocab()`. + """ logger.info("constructing a huffman tree from %i words", len(wv.vocab)) @@ -1289,10 +1367,12 @@ def create_binary_tree(self, wv): def make_cum_table(self, wv, power=0.75, domain=2**31 - 1): """Create a cumulative-distribution table using stored vocabulary word counts for drawing random words in the negative-sampling training routines. + To draw a word index, choose a random integer up to the maximum value in the table (cum_table[-1]), then finding that integer's sorted insertion point (as if by bisect_left or ndarray.searchsorted()). That insertion point is the drawn index, coming up in proportion equal to the increment at that slot. + Called internally from 'build_vocab()'. """ vocab_size = len(wv.index2word) From c6d73f6576b97defb5e30ebbb2d838fef3f7a1c6 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 13:28:42 +0530 Subject: [PATCH 24/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 443c6801ae..36eeaba67a 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -58,7 +58,8 @@ def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count= self.trim_rule = trim_rule self.sorted_vocab = sorted_vocab self.batch_words = batch_words - def fit(self, X, y=None): + + def fit(self, X, y=None): """ Fit the model according to the given training data. 
Calls gensim.models.Word2Vec From f06c65352abc6532742e41440bfa40e0f2210c2a Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 13:52:04 +0530 Subject: [PATCH 25/41] Update d2vmodel.py --- gensim/sklearn_api/d2vmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 45bf2750ea..d18158e2f6 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -68,7 +68,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.sorted_vocab = sorted_vocab self.batch_words = batch_words - def fit(self, X, y=None): + def fit(self, X, y=None): """ Fit the model according to the given training data. Calls gensim.models.Doc2Vec From 3dad3458a7f2a98dfe09401fc3ffb25ea66ae7b5 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 14:09:28 +0530 Subject: [PATCH 26/41] Update test_sklearn_api.py --- gensim/test/test_sklearn_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index f28bf6817b..2521bf13fe 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -679,7 +679,7 @@ def testConsistencyWithGensimModel(self): def testPipeline(self): numpy.random.seed(0) # set fixed seed to get similar values everytime - model = W2VTransformer(min_count=1,vector_size=10) + model = W2VTransformer(min_count=1, vector_size=10) model.fit(w2v_texts) class_dict = {'mathematics': 1, 'physics': 0} From 3148a1753e39dfd6b0b710f52b8c94122d4e98fc Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 14:16:20 +0530 Subject: [PATCH 27/41] Update d2vmodel.py --- gensim/sklearn_api/d2vmodel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index d18158e2f6..c20ab77818 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -24,9 +24,9 @@ class D2VTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, - docvecs_mapfile=None, comment=None, trim_rule=None, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, - hashfxn=hash, iter=None, epochs=5, sorted_vocab=1, batch_words=10000): + docvecs_mapfile=None, comment=None, trim_rule=None, size=None, vector_size=100, alpha=0.025, window=5, + min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, + cbow_mean=1, hashfxn=hash, iter=None, epochs=5, sorted_vocab=1, batch_words=10000): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. 
""" From 9ac44f93416259f5b4ebba3f22c61bfd5b64316a Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 14:17:26 +0530 Subject: [PATCH 28/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 36eeaba67a..11ad6cea25 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -24,9 +24,9 @@ class W2VTransformer(TransformerMixin, BaseEstimator): Base Word2Vec module """ - def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, - workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=10000): + def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, + seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, + null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000): """ Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details. """ From e90fcf6f36a04b4cb9d26814efa43123a83d623a Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 14:50:57 +0530 Subject: [PATCH 29/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 11ad6cea25..f0774fdd7f 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -40,6 +40,7 @@ def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count= self.gensim_model = None self.vector_size = vector_size + self.size = vector_size self.alpha = alpha self.window = window self.min_count = min_count From 9686738687de5f2dcf0aa8915531997d95727574 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 14:51:02 +0530 Subject: [PATCH 30/41] Update test_sklearn_api.py --- gensim/test/test_sklearn_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 2521bf13fe..482ae37bce 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -646,7 +646,7 @@ def testModelNotFitted(self): class TestWord2VecWrapper(unittest.TestCase): def setUp(self): numpy.random.seed(0) - self.model = W2VTransformer(min_count=0, seed=42, vector_size=10) + self.model = W2VTransformer(min_count=0, seed=42, size=10) self.model.fit(texts) def testTransform(self): @@ -665,7 +665,7 @@ def testTransform(self): def testConsistencyWithGensimModel(self): # training a W2VTransformer - self.model = W2VTransformer(min_count=0, seed=42, vector_size=10) + self.model = W2VTransformer(min_count=0, seed=42, size=10) self.model.fit(texts) # training a Gensim Word2Vec model with the same params @@ -679,7 +679,7 @@ def testConsistencyWithGensimModel(self): def testPipeline(self): numpy.random.seed(0) # set fixed seed to get similar values everytime - model = W2VTransformer(min_count=1, vector_size=10) + model = W2VTransformer(min_count=1, size=10) model.fit(w2v_texts) class_dict = {'mathematics': 1, 'physics': 0} @@ -724,7 +724,7 @@ def testPersistence(self): self.assertTrue(passed) def testModelNotFitted(self): - w2vmodel_wrapper = W2VTransformer(min_count=0, seed=42, vector_size=10) + w2vmodel_wrapper = 
W2VTransformer(min_count=0, seed=42, size=10) word = texts[0][0] self.assertRaises(NotFittedError, w2vmodel_wrapper.transform, word) From 33961ed5d5948354512e8a675ccd6ea06c0305da Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 14:52:19 +0530 Subject: [PATCH 31/41] Update d2vmodel.py --- gensim/sklearn_api/d2vmodel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index c20ab77818..cbe074377f 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -52,6 +52,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 # attributes associated with gensim.models.Word2Vec self.vector_size = vector_size + self.size = vector_size self.alpha = alpha self.window = window self.min_count = min_count From a5fb143e7110b7af0e6444781d411e5164c780c2 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 15:50:18 +0530 Subject: [PATCH 32/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index f0774fdd7f..c0d2cd35ba 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -25,8 +25,8 @@ class W2VTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, - seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, - null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000): + seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, + epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000): """ Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details. """ From d1c2d5a3ec059367b605596f453e3008413bbe17 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 15:57:42 +0530 Subject: [PATCH 33/41] Update d2vmodel.py --- gensim/sklearn_api/d2vmodel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index cbe074377f..6c7313b24a 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -90,7 +90,6 @@ def fit(self, X, y=None): ) return self - def transform(self, docs): """ Return the vector representations for the input documents. From 7246e37f4139e6d489154a47d0ddf41f32837607 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 16:43:52 +0530 Subject: [PATCH 34/41] Update d2vmodel.py --- gensim/sklearn_api/d2vmodel.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 6c7313b24a..3326703225 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -29,16 +29,13 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 cbow_mean=1, hashfxn=hash, iter=None, epochs=5, sorted_vocab=1, batch_words=10000): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. 
- """ - + """ if iter is not None: warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") epochs = iter - if size is not None: warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") vector_size = size - self.gensim_model = None self.dm_mean = dm_mean self.dm = dm From b254009b806cc709f30dd129b253475bcd1fd5e2 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 16:43:56 +0530 Subject: [PATCH 35/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index c0d2cd35ba..fa45d6b3d0 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -33,11 +33,9 @@ def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count= if iter is not None: warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") epochs = iter - if size is not None: warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - vector_size = size - + vector_size = size self.gensim_model = None self.vector_size = vector_size self.size = vector_size From 750522935fd1a6df66bd0d37edfc393984762863 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 17:19:08 +0530 Subject: [PATCH 36/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index fa45d6b3d0..484ce35428 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -35,7 +35,7 @@ def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count= epochs = iter if size is not None: warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - vector_size = size + vector_size = size self.gensim_model = None self.vector_size = vector_size self.size = vector_size From e6036496f934d45aeb172bbeb79c3e53f8e68065 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sat, 10 Mar 2018 18:00:03 +0530 Subject: [PATCH 37/41] Update d2vmodel.py --- gensim/sklearn_api/d2vmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 3326703225..b6eb3b9b04 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -29,7 +29,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 cbow_mean=1, hashfxn=hash, iter=None, epochs=5, sorted_vocab=1, batch_words=10000): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. 
- """ + """ if iter is not None: warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") epochs = iter From 61dfef07961746842246d8a7bf54ddd4fc0ad307 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Sun, 11 Mar 2018 20:18:35 +0530 Subject: [PATCH 38/41] Update test_sklearn_api.py --- gensim/test/test_sklearn_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 482ae37bce..ed5516df37 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -646,7 +646,7 @@ def testModelNotFitted(self): class TestWord2VecWrapper(unittest.TestCase): def setUp(self): numpy.random.seed(0) - self.model = W2VTransformer(min_count=0, seed=42, size=10) + self.model = W2VTransformer(size=10, min_count=0, seed=42) self.model.fit(texts) def testTransform(self): @@ -665,7 +665,7 @@ def testTransform(self): def testConsistencyWithGensimModel(self): # training a W2VTransformer - self.model = W2VTransformer(min_count=0, seed=42, size=10) + self.model = W2VTransformer(size=10, min_count=0, seed=42) self.model.fit(texts) # training a Gensim Word2Vec model with the same params @@ -679,7 +679,7 @@ def testConsistencyWithGensimModel(self): def testPipeline(self): numpy.random.seed(0) # set fixed seed to get similar values everytime - model = W2VTransformer(min_count=1, size=10) + model = W2VTransformer(size=10, min_count=1) model.fit(w2v_texts) class_dict = {'mathematics': 1, 'physics': 0} @@ -724,7 +724,7 @@ def testPersistence(self): self.assertTrue(passed) def testModelNotFitted(self): - w2vmodel_wrapper = W2VTransformer(min_count=0, seed=42, size=10) + w2vmodel_wrapper = W2VTransformer(size=10, min_count=0, seed=42) word = texts[0][0] self.assertRaises(NotFittedError, w2vmodel_wrapper.transform, word) From 52d2945d68d3f2bed0e577a152bf74c7505b4a42 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Mon, 12 Mar 2018 12:28:04 +0530 Subject: [PATCH 39/41] Update doc2vec.py --- gensim/models/doc2vec.py | 66 ++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 8c2d166fc8..f57694273d 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -239,9 +239,9 @@ class TaggedDocument(namedtuple('TaggedDocument', 'words tags')): and `tags` (a list of tokens). Tags may be one or more unicode string tokens, but typical practice (which will also be most memory-efficient) is for the tags list to include a unique integer id as the only tag. - + Replaces "sentence as a list of words" from Word2Vec. - + """ def __str__(self): @@ -257,9 +257,9 @@ class LabeledSentence(TaggedDocument): class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')): """A string document tag discovered during the initial vocabulary scan. (The document-vector equivalent of a Vocab object.) - + Will not be used if all presented document tags are ints. - + The offset is only the true index into the doctags_syn0/doctags_syn0_lockf if-and-only-if no raw-int tags were used. If any raw-int tags were used, string Doctag vectors begin at index (max_rawint + 1), so the true index is @@ -278,7 +278,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), **kwargs): """Initialize the model from an iterable of `documents`. 
Each document is a TaggedDocument object that will be used for training. - + Parameters ---------- documents : iterable of iterables @@ -286,11 +286,11 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 consider an iterable that streams the documents directly from disk/network. If you don't supply `documents`, the model is left uninitialized -- use if you plan to initialize it in some other way. - + dm : int {1,0} Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used. Otherwise, `distributed bag of words` (PV-DBOW) is employed. - + size : int Dimensionality of the feature vectors. window : int @@ -349,9 +349,9 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 of the model. callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec` List of callbacks that need to be executed/run at specific stages during training. - + """ - + if 'sentences' in kwargs: raise DeprecationWarning( "Parameter 'sentences' was renamed to 'documents', and will be removed in 4.0.0, " @@ -467,18 +467,18 @@ def train(self, documents, total_examples=None, total_words=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=()): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). The `documents` iterable can be simply a list of TaggedDocument elements. - + To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate progress-percentage logging, either total_examples (count of sentences) or total_words (count of raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab()`, the count of examples in that corpus will be available in the model's :attr:`corpus_count` property). - + To avoid common mistakes around the model's ability to do multiple training passes itself, an explicit `epochs` argument **MUST** be provided. In the common and recommended case, where :meth:`~gensim.models.word2vec.Word2Vec.train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. - + Parameters ---------- documents : iterable of iterables @@ -522,7 +522,7 @@ def estimated_lookup_memory(self): def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): """ Infer a vector for given post-bulk training document. - + Parameters ---------- doc_words : :obj: `list` of :obj: `str` @@ -533,12 +533,12 @@ def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): Learning rate will linearly drop to `min_alpha` as training progresses. steps : int Number of times to train the new document. - + Returns ------- :obj: `numpy.ndarray` Returns the inferred vector for the new document. - + """ doctag_vectors, doctag_locks = self.trainables.get_doctag_trainables(doc_words, self.docvecs.vector_size) doctag_indexes = [0] @@ -609,7 +609,7 @@ def __str__(self): def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True): """Discard parameters that are used in training and score. Use if you're sure you're done training a model. - + Parameters ---------- keep_doctags_vectors : bool @@ -617,8 +617,8 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen in this case you can't to use docvecs's most_similar, similarity etc. methods. 
keep_inference : bool Set `keep_inference` to False if you don't want to store parameters that is used for infer_vector method - - """ + + """ if not keep_inference: if hasattr(self.trainables, 'syn1'): del self.trainables.syn1 @@ -635,7 +635,7 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inferen def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False): """Store the input-hidden weight matrix in the same format used by the original C word2vec-tool, for compatibility. - + Parameters ---------- fname : str @@ -651,7 +651,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* Optional file path used to save the vocabulary binary : bool If True, the data wil be saved in binary word2vec format, else it will be saved in plain text. - + """ total_vec = len(self.wv.vocab) + len(self.docvecs) write_first_line = False @@ -672,14 +672,14 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* def init_sims(self, replace=False): """ Precompute L2-normalized vectors. - + If `replace` is set, forget the original vectors and only keep the normalized ones = saves lots of memory! - + Note that you **cannot continue training or inference** after doing a replace. The model becomes effectively read-only = you can call `most_similar`, `similarity` etc., but not `train` or `infer_vector`. - + """ return self.docvecs.init_sims(replace=replace) @@ -702,7 +702,7 @@ def estimate_memory(self, vocab_size=None, report=None): def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence is a iterable of iterables (can simply be a list of unicode strings too). - + Parameters ---------- documents : iterable of iterables @@ -741,7 +741,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No Build vocabulary from a dictionary of word frequencies. Build model vocabulary from a passed dictionary that contains (word,word count). Words must be of type unicode strings. - + Parameters ---------- word_freq : dict @@ -760,7 +760,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No of the model. update : bool If true, the new provided words in `word_freq` dict will be added to model's vocab. - + Examples -------- >>> from gensim.models.word2vec import Word2Vec @@ -940,23 +940,23 @@ def __iter__(self): class TaggedLineDocument(object): """Simple format: one document = one line = one TaggedDocument object. - - Words are expected to be already preprocessed and separated by whitespace, + + Words are expected to be already preprocessed and separated by whitespace, tags are constructed automatically from the document line number.""" def __init__(self, source): """ `source` can be either a string (filename) or a file object. 
- + Example:: - + documents = TaggedLineDocument('myfile.txt') - + Or for compressed files:: - + documents = TaggedLineDocument('compressed_text.txt.bz2') documents = TaggedLineDocument('compressed_text.txt.gz') - + """ self.source = source From 09b5dc7a9d78a379605c87bd09cc60dcc1054c9d Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Mon, 12 Mar 2018 12:28:08 +0530 Subject: [PATCH 40/41] Update w2vmodel.py --- gensim/sklearn_api/w2vmodel.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 484ce35428..317842ee07 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -10,7 +10,6 @@ Follows scikit-learn API conventions """ -import warnings import numpy as np import six from sklearn.base import TransformerMixin, BaseEstimator @@ -24,21 +23,14 @@ class W2VTransformer(TransformerMixin, BaseEstimator): Base Word2Vec module """ - def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, - seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, - epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000): + def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, + workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=10000): """ Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details. """ - if iter is not None: - warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - epochs = iter - if size is not None: - warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - vector_size = size self.gensim_model = None - self.vector_size = vector_size - self.size = vector_size + self.size = size self.alpha = alpha self.window = window self.min_count = min_count @@ -52,23 +44,23 @@ def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count= self.negative = negative self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn - self.epochs = epochs + self.iter = iter self.null_word = null_word self.trim_rule = trim_rule self.sorted_vocab = sorted_vocab self.batch_words = batch_words - + def fit(self, X, y=None): """ Fit the model according to the given training data. 
Calls gensim.models.Word2Vec """ self.gensim_model = models.Word2Vec( - sentences=X, size=self.vector_size, alpha=self.alpha, + sentences=X, size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, - hashfxn=self.hashfxn, iter=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, + hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word, trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self From 341a91f5658813e2a9470dbf7b57105f3bf0ad30 Mon Sep 17 00:00:00 2001 From: Mritunjay Mohitesh Date: Mon, 12 Mar 2018 12:44:57 +0530 Subject: [PATCH 41/41] Update d2vmodel.py --- gensim/sklearn_api/d2vmodel.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index b6eb3b9b04..f3f9f53133 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -8,7 +8,7 @@ Scikit learn interface for gensim for easy use of gensim with scikit-learn Follows scikit-learn API conventions """ -import warnings + import numpy as np from six import string_types from sklearn.base import TransformerMixin, BaseEstimator @@ -24,18 +24,12 @@ class D2VTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, - docvecs_mapfile=None, comment=None, trim_rule=None, size=None, vector_size=100, alpha=0.025, window=5, - min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, - cbow_mean=1, hashfxn=hash, iter=None, epochs=5, sorted_vocab=1, batch_words=10000): + docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, + hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000): """ Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details. """ - if iter is not None: - warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.") - epochs = iter - if size is not None: - warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.") - vector_size = size self.gensim_model = None self.dm_mean = dm_mean self.dm = dm @@ -48,8 +42,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.trim_rule = trim_rule # attributes associated with gensim.models.Word2Vec - self.vector_size = vector_size - self.size = vector_size + self.size = size self.alpha = alpha self.window = window self.min_count = min_count @@ -62,10 +55,10 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.negative = negative self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn - self.epochs = epochs + self.iter = iter self.sorted_vocab = sorted_vocab self.batch_words = batch_words - + def fit(self, X, y=None): """ Fit the model according to the given training data. 
@@ -79,11 +72,11 @@ def fit(self, X, y=None): documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, size=self.vector_size, alpha=self.alpha, window=self.window, + trim_rule=self.trim_rule, vector_size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, - epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words + epochs=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self