Fix deprecated parameters in D2VTransformer and W2VTransformer. Fix #1937 #1945

Merged Mar 12, 2018 (45 commits)

Changes from 20 commits

Commits (45)
2ee6ab8
fix 'iter' and 'size' warnings
morsdor Feb 28, 2018
2e0afad
Merge pull request #1 from MritunjayMohitesh/MritunjayMohitesh-patch-1
morsdor Feb 28, 2018
6301be6
Revert "fix 'iter' and 'size' warnings"
morsdor Feb 28, 2018
77307c3
Merge pull request #2 from MritunjayMohitesh/revert-1-MritunjayMohite…
morsdor Feb 28, 2018
5b9d2ff
Update w2vmodel.py
morsdor Feb 28, 2018
d2733ff
fix 'iter' and 'size' warnings
morsdor Feb 28, 2018
92fc07b
Merge pull request #3 from MritunjayMohitesh/MritunjayMohitesh-patch-2
morsdor Feb 28, 2018
66c6bf1
Merge pull request #4 from MritunjayMohitesh/MritunjayMohitesh-patch-1
morsdor Feb 28, 2018
21934e1
Fix 'iter' and 'size' warnings
morsdor Feb 28, 2018
428d428
fixed deprecated argument warnings
morsdor Feb 28, 2018
0a0e1d6
fixed deprecated argument warnings
morsdor Feb 28, 2018
c79bdfb
Update test_sklearn_api.py
morsdor Mar 1, 2018
da76d3c
fix deprecated arguments
morsdor Mar 1, 2018
bbdb0d4
fix deprecated argument warnings
morsdor Mar 1, 2018
31d3729
fix deprecated argumet warnings
morsdor Mar 1, 2018
bbcd0e5
fix deprecated argumet warnings
morsdor Mar 1, 2018
734d5c5
fix deprecated argumet warnings
morsdor Mar 1, 2018
7cfbfe1
fix deprecated argument warnings
morsdor Mar 1, 2018
be108c1
fix deprecated argument warnings
morsdor Mar 1, 2018
e4f7ad7
fix deprecated arguments
morsdor Mar 1, 2018
02fdf70
fix deprecated arguments
morsdor Mar 10, 2018
477b381
fix deprecated arguments
morsdor Mar 10, 2018
36993ef
Update doc2vec.py
morsdor Mar 10, 2018
055f043
Update word2vec.py
morsdor Mar 10, 2018
74d1c59
Update doc2vec.py
morsdor Mar 10, 2018
9b8aa91
Update doc2vec.py
morsdor Mar 10, 2018
7bc192d
Update word2vec.py
morsdor Mar 10, 2018
c6d73f6
Update w2vmodel.py
morsdor Mar 10, 2018
f06c653
Update d2vmodel.py
morsdor Mar 10, 2018
3dad345
Update test_sklearn_api.py
morsdor Mar 10, 2018
3148a17
Update d2vmodel.py
morsdor Mar 10, 2018
9ac44f9
Update w2vmodel.py
morsdor Mar 10, 2018
e90fcf6
Update w2vmodel.py
morsdor Mar 10, 2018
9686738
Update test_sklearn_api.py
morsdor Mar 10, 2018
33961ed
Update d2vmodel.py
morsdor Mar 10, 2018
a5fb143
Update w2vmodel.py
morsdor Mar 10, 2018
d1c2d5a
Update d2vmodel.py
morsdor Mar 10, 2018
7246e37
Update d2vmodel.py
morsdor Mar 10, 2018
b254009
Update w2vmodel.py
morsdor Mar 10, 2018
7505229
Update w2vmodel.py
morsdor Mar 10, 2018
e603649
Update d2vmodel.py
morsdor Mar 10, 2018
61dfef0
Update test_sklearn_api.py
morsdor Mar 11, 2018
52d2945
Update doc2vec.py
morsdor Mar 12, 2018
09b5dc7
Update w2vmodel.py
morsdor Mar 12, 2018
341a91f
Update d2vmodel.py
morsdor Mar 12, 2018
6 changes: 1 addition & 5 deletions gensim/models/doc2vec.py
@@ -278,19 +278,16 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0
docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), **kwargs):
"""Initialize the model from an iterable of `documents`. Each document is a
TaggedDocument object that will be used for training.

Parameters
----------
documents : iterable of iterables
The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora,
consider an iterable that streams the documents directly from disk/network.
If you don't supply `documents`, the model is left uninitialized -- use if
you plan to initialize it in some other way.

dm : int {1,0}
Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used.
Otherwise, `distributed bag of words` (PV-DBOW) is employed.

size : int
Dimensionality of the feature vectors.
window : int
@@ -349,7 +346,6 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0
of the model.
callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
List of callbacks that need to be executed/run at specific stages during training.

"""

if 'sentences' in kwargs:
@@ -404,7 +400,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0
self.train(
documents, total_examples=self.corpus_count, epochs=self.epochs,
start_alpha=self.alpha, end_alpha=self.min_alpha, callbacks=callbacks)

Contributor comment:

This file is not relevant to the PR, please remove it from the PR.

@property
def dm(self):
"""int {1,0} : `dm=1` indicates 'distributed memory' (PV-DM) else
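The `if 'sentences' in kwargs:` fragment above suggests Doc2Vec applies the same rename-and-warn pattern for its `sentences` -> `documents` transition. A minimal, hypothetical reduction of that guard (the real one sits outside the visible hunks, so this is an assumption, not the actual gensim code):

import warnings

class Doc2VecLike(object):
    # Hypothetical stand-in illustrating the guard hinted at above;
    # not the actual gensim class.
    def __init__(self, documents=None, **kwargs):
        if 'sentences' in kwargs:
            warnings.warn("The parameter `sentences` is deprecated, use `documents` instead.")
            documents = kwargs.pop('sentences')
        self.documents = documents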
33 changes: 18 additions & 15 deletions gensim/models/word2vec.py
@@ -411,25 +411,21 @@ def score_cbow_pair(model, word, l1):

class Word2Vec(BaseWordEmbeddingsModel):
"""Class for training, using and evaluating neural networks described in https://code.google.com/p/word2vec/

If you're finished training a model (=no more updates, only querying)
then switch to the :mod:`gensim.models.KeyedVectors` instance in wv

The model can be stored/loaded via its :meth:`~gensim.models.word2vec.Word2Vec.save()` and
:meth:`~gensim.models.word2vec.Word2Vec.load()` methods, or stored/loaded in a format
compatible with the original word2vec implementation via `wv.save_word2vec_format()`
and `Word2VecKeyedVectors.load_word2vec_format()`.

"""

def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
def __init__(self, sentences=None, size=None, vector_size=100, alpha=0.025, window=5, min_count=5,
Contributor comment:

These changes are not relevant either, please remove them.

max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=()):
sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, null_word=0,
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=()):
"""
Initialize the model from an iterable of `sentences`. Each sentence is a
list of words (unicode strings) that will be used for training.

Parameters
----------
sentences : iterable of iterables
@@ -439,7 +435,6 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it
in some other way.

sg : int {1, 0}
Defines the training algorithm. If 1, skip-gram is employed; otherwise, CBOW is used.
size : int
@@ -498,30 +493,38 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`.
callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
List of callbacks that need to be executed/run at specific stages during training.

Examples
--------
Initialize and train a `Word2Vec` model

>>> from gensim.models import Word2Vec
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>>
>>> model = Word2Vec(sentences, min_count=1)
>>> say_vector = model['say'] # get vector for word

"""



if iter is not None:
warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.")
epochs = iter

if size is not None:
warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.")
vector_size = size

self.vector_size = vector_size
self.epochs = epochs
self.callbacks = callbacks
self.load = call_on_class_only

self.wv = Word2VecKeyedVectors(size)
self.wv = Word2VecKeyedVectors(vector_size)
self.vocabulary = Word2VecVocab(
max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
sorted_vocab=bool(sorted_vocab), null_word=null_word)
self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn)
self.trainables = Word2VecTrainables(seed=seed, vector_size=vector_size, hashfxn=hashfxn)

super(Word2Vec, self).__init__(
sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks,
sentences=sentences, workers=workers, vector_size=vector_size, epochs=epochs, callbacks=callbacks,
batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed,
hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss,
fast_version=FAST_VERSION)
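Taken together, the shims above keep old call sites working while steering users to the new names. A minimal caller-side sketch, assuming the constructor as patched in this hunk (values are illustrative):

>>> import warnings
>>> from gensim.models import Word2Vec
>>>
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>> with warnings.catch_warnings(record=True) as caught:
...     warnings.simplefilter("always")
...     old = Word2Vec(sentences, size=10, iter=2, min_count=1)  # warns for both `size` and `iter`
>>> new = Word2Vec(sentences, vector_size=10, epochs=2, min_count=1)  # warning-free path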
28 changes: 19 additions & 9 deletions gensim/sklearn_api/d2vmodel.py
@@ -8,7 +8,7 @@
Scikit learn interface for gensim for easy use of gensim with scikit-learn
Follows scikit-learn API conventions
"""

import warnings
import numpy as np
from six import string_types
from sklearn.base import TransformerMixin, BaseEstimator
@@ -24,12 +24,21 @@ class D2VTransformer(TransformerMixin, BaseEstimator):
"""

def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None,
docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5,
docvecs_mapfile=None, comment=None, trim_rule=None, size=None, vector_size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1,
hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000):
hashfxn=hash, iter=None, epochs=5, sorted_vocab=1, batch_words=10000):
"""
Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details.
"""

if iter is not None:
warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.")
epochs = iter

if size is not None:
warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.")
vector_size = size

self.gensim_model = None
self.dm_mean = dm_mean
self.dm = dm
@@ -42,7 +51,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1
self.trim_rule = trim_rule

# attributes associated with gensim.models.Word2Vec
self.size = size
self.vector_size = vector_size
self.alpha = alpha
self.window = window
self.min_count = min_count
@@ -55,11 +64,11 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1
self.negative = negative
self.cbow_mean = int(cbow_mean)
self.hashfxn = hashfxn
self.iter = iter
self.epochs = epochs
self.sorted_vocab = sorted_vocab
self.batch_words = batch_words

def fit(self, X, y=None):
"""
Fit the model according to the given training data.
Calls gensim.models.Doc2Vec
@@ -72,14 +81,15 @@ def fit(self, X, y=None):
documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm,
dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count,
docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment,
trim_rule=self.trim_rule, size=self.size, alpha=self.alpha, window=self.window,
trim_rule=self.trim_rule, vector_size=self.vector_size, alpha=self.alpha, window=self.window,
min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample,
seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs,
negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn,
iter=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words
epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words
)
return self

def transform(self, docs):
"""
Return the vector representations for the input documents.
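For reference, a minimal usage sketch of the patched transformer under the new names (values are illustrative; per the fit body above, each token list is wrapped in a TaggedDocument, and transform returns one vector per input document):

>>> from gensim.sklearn_api import D2VTransformer
>>>
>>> docs = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>> model = D2VTransformer(vector_size=10, min_count=1, seed=42)
>>> vectors = model.fit(docs).transform(docs)  # one 10-dim row per document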
24 changes: 16 additions & 8 deletions gensim/sklearn_api/w2vmodel.py
@@ -10,6 +10,7 @@
Follows scikit-learn API conventions
"""

import warnings
import numpy as np
import six
from sklearn.base import TransformerMixin, BaseEstimator
@@ -23,14 +24,22 @@ class W2VTransformer(TransformerMixin, BaseEstimator):
Base Word2Vec module
"""

def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1,
workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
def __init__(self, size=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1,
workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=None, epochs=5, null_word=0,
trim_rule=None, sorted_vocab=1, batch_words=10000):
"""
Sklearn wrapper for Word2Vec model. See gensim.models.Word2Vec for parameter details.
"""
if iter is not None:
Contributor comment:

Does it make sense to rename the arguments here? You can simply pass the correct variants to the "original" class and that is enough to avoid the deprecation problems. (See the sketch after this file's diff.)

CC: @manneshiva

warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.")
epochs = iter

if size is not None:
warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.")
vector_size = size

self.gensim_model = None
self.size = size
self.vector_size = vector_size
self.alpha = alpha
self.window = window
self.min_count = min_count
@@ -44,23 +53,22 @@ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=
self.negative = negative
self.cbow_mean = int(cbow_mean)
self.hashfxn = hashfxn
self.iter = iter
self.epochs = epochs
self.null_word = null_word
self.trim_rule = trim_rule
self.sorted_vocab = sorted_vocab
self.batch_words = batch_words

def fit(self, X, y=None):
"""
Fit the model according to the given training data.
Calls gensim.models.Word2Vec
"""
self.gensim_model = models.Word2Vec(
sentences=X, size=self.size, alpha=self.alpha,
sentences=X, vector_size=self.vector_size, alpha=self.alpha,
window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size,
sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha,
sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean,
hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word, trim_rule=self.trim_rule,
hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule,
sorted_vocab=self.sorted_vocab, batch_words=self.batch_words
)
return self
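Regarding the contributor comment above (pass the correct variants to the "original" class instead of renaming the wrapper's arguments), a hypothetical sketch of that alternative, assuming the Word2Vec signature patched earlier in this diff: the wrapper's public `size`/`iter` API stays untouched, and the translation happens only where the underlying model is built.

from gensim import models
from sklearn.base import BaseEstimator, TransformerMixin

class W2VTransformerAlt(BaseEstimator, TransformerMixin):
    # Hypothetical variant, not part of this PR: old public names kept.
    def __init__(self, size=100, iter=5, min_count=5, seed=1):
        self.size = size
        self.iter = iter
        self.min_count = min_count
        self.seed = seed

    def fit(self, X, y=None):
        # Translate to the new model-level names here, so no deprecation
        # warning fires and the wrapper's signature never changes.
        self.gensim_model = models.Word2Vec(
            sentences=X, vector_size=self.size, epochs=self.iter,
            min_count=self.min_count, seed=self.seed)
        return self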
8 changes: 4 additions & 4 deletions gensim/test/test_sklearn_api.py
@@ -646,7 +646,7 @@ def testModelNotFitted(self):
class TestWord2VecWrapper(unittest.TestCase):
def setUp(self):
numpy.random.seed(0)
self.model = W2VTransformer(size=10, min_count=0, seed=42)
self.model = W2VTransformer(min_count=0, seed=42, vector_size=10)
self.model.fit(texts)

def testTransform(self):
@@ -665,7 +665,7 @@ def testTransform(self):

def testConsistencyWithGensimModel(self):
# training a W2VTransformer
self.model = W2VTransformer(size=10, min_count=0, seed=42)
self.model = W2VTransformer(min_count=0, seed=42, vector_size=10)
self.model.fit(texts)

# training a Gensim Word2Vec model with the same params
@@ -679,7 +679,7 @@ def testConsistencyWithGensimModel(self):

def testPipeline(self):
numpy.random.seed(0) # set fixed seed to get similar values everytime
model = W2VTransformer(size=10, min_count=1)
model = W2VTransformer(min_count=1, vector_size=10)
model.fit(w2v_texts)

class_dict = {'mathematics': 1, 'physics': 0}
@@ -724,7 +724,7 @@ def testPersistence(self):
self.assertTrue(passed)

def testModelNotFitted(self):
w2vmodel_wrapper = W2VTransformer(size=10, min_count=0, seed=42)
w2vmodel_wrapper = W2VTransformer(min_count=0, seed=42, vector_size=10)
word = texts[0][0]
self.assertRaises(NotFittedError, w2vmodel_wrapper.transform, word)

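One gap in the diff as shown: the tests exercise only the new names, and nothing asserts the shim itself. A possible companion test, hypothetical and not part of this PR, reusing the test module's existing unittest and W2VTransformer imports:

import warnings

class TestDeprecatedArguments(unittest.TestCase):
    def testSizeStillAcceptedButWarns(self):
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            model = W2VTransformer(size=10, min_count=0, seed=42)
        self.assertTrue(any("deprecated" in str(w.message) for w in caught))
        self.assertEqual(model.vector_size, 10)  # old value mapped through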