Get rid of most warnings in testing #2191

Merged
merged 16 commits on Sep 23, 2018
CHANGELOG.md (4 additions, 4 deletions)
@@ -8,10 +8,10 @@ Changes
New training mode for `*2Vec` models (word2vec, doc2vec, fasttext) that allows model training to scale linearly with the number of cores (full GIL elimination). The result of our Google Summer of Code 2018 project by Dmitry Persiyanov.

**Benchmark**
- - Dataset: full English Wikipedia
- - Cloud: GCE
- - CPU: Intel(R) Xeon(R) CPU @ 2.30GHz 32 cores
- - BLAS: libblas3 (3.7.1-3ubuntu2)
+ - Dataset: `full English Wikipedia`
+ - Cloud: `GCE`
+ - CPU: `Intel(R) Xeon(R) CPU @ 2.30GHz 32 cores`
+ - BLAS: `MKL`


| Model | Queue-based version [sec] | File-based version [sec] | speed up | Accuracy (queue-based) | Accuracy (file-based) |
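The file-based mode benchmarked above is driven by the new `corpus_file` parameter, which also appears in the `fasttext.py` signatures further down this diff. A minimal sketch, assuming gensim 3.6+ and a plain-text corpus in `LineSentence` format (one sentence per line, tokens separated by whitespace); the file name and worker count are illustrative:

>>> from gensim.models import Word2Vec
>>>
>>> # 'enwiki.txt' is a hypothetical LineSentence-format file, not a real path
>>> model = Word2Vec(corpus_file='enwiki.txt', workers=32)  # scales with cores
>>> vector = model.wv['example']  # trained vectors live on model.wv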
gensim/models/deprecated/doc2vec.py (2 additions, 2 deletions)
@@ -91,7 +91,7 @@ def load_old_doc2vec(*args, **kwargs):
'dm_tag_count': old_model.dm_tag_count,
'docvecs_mapfile': old_model.__dict__.get('docvecs_mapfile', None),
'comment': old_model.__dict__.get('comment', None),
- 'size': old_model.vector_size,
+ 'vector_size': old_model.vector_size,
'alpha': old_model.alpha,
'window': old_model.window,
'min_count': old_model.min_count,
@@ -104,7 +104,7 @@ def load_old_doc2vec(*args, **kwargs):
'negative': old_model.negative,
'cbow_mean': old_model.cbow_mean,
'hashfxn': old_model.hashfxn,
- 'iter': old_model.iter,
+ 'epochs': old_model.iter,
'sorted_vocab': old_model.__dict__.get('sorted_vocab', 1),
'batch_words': old_model.__dict__.get('batch_words', MAX_WORDS_IN_BATCH),
'compute_loss': old_model.__dict__.get('compute_loss', None)
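The two renames that `load_old_doc2vec` maps here (`size` to `vector_size`, `iter` to `epochs`) mirror the keyword arguments of the current `Doc2Vec` class. A hedged sketch with a toy corpus and illustrative parameter values:

>>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument
>>>
>>> documents = [TaggedDocument(words=['cat', 'say', 'meow'], tags=[0])]
>>> model = Doc2Vec(documents, vector_size=10, epochs=5, min_count=1)  # not size=/iter=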
gensim/models/fasttext.py (8 additions, 8 deletions)
@@ -363,8 +363,8 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>>
>>> model = FastText(sentences, min_count=1)
- >>> say_vector = model['say'] # get vector for word
- >>> of_vector = model['of'] # get vector for out-of-vocab word
+ >>> say_vector = model.wv['say'] # get vector for word
+ >>> of_vector = model.wv['of'] # get vector for out-of-vocab word

"""
self.load = call_on_class_only
@@ -380,7 +380,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha
sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent)
self.trainables = FastTextTrainables(
vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn)
- self.wv.bucket = self.bucket
+ self.wv.bucket = self.trainables.bucket

super(FastText, self).__init__(
sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=size, epochs=iter,
@@ -487,10 +487,10 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p
>>>
>>> model = FastText(min_count=1)
>>> model.build_vocab(sentences_1)
- >>> model.train(sentences_1, total_examples=model.corpus_count, epochs=model.iter)
+ >>> model.train(sentences_1, total_examples=model.corpus_count, epochs=model.epochs)
>>>
>>> model.build_vocab(sentences_2, update=True)
- >>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.iter)
+ >>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.epochs)

"""
if update:
@@ -519,11 +519,11 @@ def _clear_post_train(self):
def estimate_memory(self, vocab_size=None, report=None):
vocab_size = vocab_size or len(self.wv.vocab)
vec_size = self.vector_size * np.dtype(np.float32).itemsize
- l1_size = self.layer1_size * np.dtype(np.float32).itemsize
+ l1_size = self.trainables.layer1_size * np.dtype(np.float32).itemsize
report = report or {}
report['vocab'] = len(self.wv.vocab) * (700 if self.hs else 500)
report['syn0_vocab'] = len(self.wv.vocab) * vec_size
- num_buckets = self.bucket
+ num_buckets = self.trainables.bucket
if self.hs:
report['syn1'] = len(self.wv.vocab) * l1_size
if self.negative:
@@ -657,7 +657,7 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor
>>>
>>> model = FastText(min_count=1)
>>> model.build_vocab(sentences)
- >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
+ >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

"""
super(FastText, self).train(
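The recurring change in this file, `model[...]` becoming `model.wv[...]` and `model.iter` becoming `model.epochs`, reflects that word lookups, including out-of-vocabulary ones composed from character n-gram buckets, belong to the `KeyedVectors` object. A small sketch with toy sentences:

>>> from gensim.models import FastText
>>>
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>> model = FastText(sentences, min_count=1)
>>> oov_vector = model.wv['woofing']  # unseen word, composed from n-grams
>>> 'woofing' in model.wv.vocab  # yet it is not in the vocabulary itself
False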
gensim/models/translation_matrix.py (2 additions, 2 deletions)
@@ -136,12 +136,12 @@ def build(cls, lang_vec, lexicon=None):
# if the lexicon is not provided, use all of the KeyedVectors' words as the default
for item in lexicon:
words.append(item)
- mat.append(lang_vec.syn0[lang_vec.vocab[item].index])
+ mat.append(lang_vec.vectors[lang_vec.vocab[item].index])

else:
for item in lang_vec.vocab.keys():
words.append(item)
- mat.append(lang_vec.syn0[lang_vec.vocab[item].index])
+ mat.append(lang_vec.vectors[lang_vec.vocab[item].index])

return Space(mat, words)

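Both hunks above are the same attribute rename: the raw embedding matrix on `KeyedVectors` is now `vectors` rather than `syn0`. A hedged sketch of the access pattern used in `build`, assuming `lang_vec` is any loaded `KeyedVectors` instance (the file name is hypothetical):

>>> from gensim.models import KeyedVectors
>>>
>>> lang_vec = KeyedVectors.load('source_lang.kv')  # hypothetical saved model
>>> row = lang_vec.vectors[lang_vec.vocab['word'].index]  # was lang_vec.syn0[...]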
gensim/models/wrappers/dtmmodel.py (3 additions, 3 deletions)
@@ -464,9 +464,9 @@ def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted
for time in chosen_times:
for i in chosen_topics:
if formatted:
- topic = self.print_topic(i, time, num_words=num_words)
+ topic = self.print_topic(i, time, topn=num_words)
else:
- topic = self.show_topic(i, time, num_words=num_words)
+ topic = self.show_topic(i, time, topn=num_words)
shown.append(topic)
return shown

@@ -529,7 +529,7 @@ def print_topic(self, topicid, time, topn=10, num_words=None):
warnings.warn("The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead.")
topn = num_words

- return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, topn)])
+ return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, topn=topn)])

def dtm_vis(self, corpus, time):
"""Get data specified by pyLDAvis format.
gensim/models/wrappers/varembed.py (4 additions, 4 deletions)
@@ -95,14 +95,14 @@ def load_word_embeddings(self, word_embeddings, word_to_ix):
counts[word] = counts.get(word, 0) + 1
self.vocab_size = len(counts)
self.vector_size = word_embeddings.shape[1]
- self.syn0 = np.zeros((self.vocab_size, self.vector_size))
+ self.vectors = np.zeros((self.vocab_size, self.vector_size))
self.index2word = [None] * self.vocab_size
logger.info("Corpus has %i words", len(self.vocab))
for word_id, word in enumerate(counts):
self.vocab[word] = Vocab(index=word_id, count=counts[word])
- self.syn0[word_id] = word_embeddings[word_to_ix[word]]
+ self.vectors[word_id] = word_embeddings[word_to_ix[word]]
self.index2word[word_id] = word
- assert((len(self.vocab), self.vector_size) == self.syn0.shape)
+ assert((len(self.vocab), self.vector_size) == self.vectors.shape)
logger.info("Loaded matrix of %d size and %d dimensions", self.vocab_size, self.vector_size)

def add_morphemes_to_embeddings(self, morfessor_model, morpho_embeddings, morpho_to_ix):
@@ -125,5 +125,5 @@ def add_morphemes_to_embeddings(self, morfessor_model, morpho_embeddings, morpho
for m in morfessor_model.viterbi_segment(word)[0]
]
).sum(axis=0)
- self.syn0[self.vocab[word].index] += morpheme_embedding
+ self.vectors[self.vocab[word].index] += morpheme_embedding
logger.info("Added morphemes to word vectors")
gensim/similarities/docsim.py (1 addition, 1 deletion)
@@ -1051,7 +1051,7 @@ def get_similarities(self, query):
result = []
for qidx in range(n_queries):
# Compute similarity for each query.
- qresult = [self.w2v_model.wmdistance(document, query[qidx]) for document in self.corpus]
+ qresult = [self.w2v_model.wv.wmdistance(document, query[qidx]) for document in self.corpus]
qresult = numpy.array(qresult)
qresult = 1. / (1. + qresult) # Convert distance to similarity; larger distances give smaller similarities.

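`wmdistance` likewise moved onto the `KeyedVectors` object, hence `self.w2v_model.wv.wmdistance(...)`. A minimal sketch with toy sentences (Word Mover's Distance additionally requires the `pyemd` package at call time):

>>> from gensim.models import Word2Vec
>>>
>>> model = Word2Vec([["cat", "say", "meow"], ["dog", "say", "woof"]], min_count=1)
>>> distance = model.wv.wmdistance(['cat', 'meow'], ['dog', 'woof'])
>>> similarity = 1. / (1. + distance)  # the same transform get_similarities applies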
gensim/similarities/index.py (1 addition, 1 deletion)
@@ -174,7 +174,7 @@ def build_from_keyedvectors(self):
"""Build an Annoy index using word vectors from a KeyedVectors model."""

self.model.init_sims()
- return self._build_from_model(self.model.syn0norm, self.model.index2word, self.model.vector_size)
+ return self._build_from_model(self.model.vectors_norm, self.model.index2word, self.model.vector_size)

def _build_from_model(self, vectors, labels, num_features):
index = AnnoyIndex(num_features)
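`init_sims()` fills the renamed `vectors_norm` attribute (formerly `syn0norm`) with unit-length rows aligned to `index2word`, which is what `build_from_keyedvectors` hands to Annoy. A short self-contained sketch on a toy model:

>>> from gensim.models import Word2Vec
>>>
>>> model = Word2Vec([["cat", "say", "meow"], ["dog", "say", "woof"]], min_count=1)
>>> model.wv.init_sims()  # populates vectors_norm
>>> model.wv.vectors_norm.shape == model.wv.vectors.shape
True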
gensim/sklearn_api/ftmodel.py (1 addition, 1 deletion)
@@ -220,5 +220,5 @@ def transform(self, words):
# The input is an array of arrays
if isinstance(words, six.string_types):
words = [words]
- vectors = [self.gensim_model[word] for word in words]
+ vectors = [self.gensim_model.wv[word] for word in words]
return np.reshape(np.array(vectors), (len(words), self.size))
gensim/sklearn_api/w2vmodel.py (1 addition, 1 deletion)
@@ -173,7 +173,7 @@ def transform(self, words):
# The input is an array of arrays
if isinstance(words, six.string_types):
words = [words]
- vectors = [self.gensim_model[word] for word in words]
+ vectors = [self.gensim_model.wv[word] for word in words]
return np.reshape(np.array(vectors), (len(words), self.size))

def partial_fit(self, X):
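The same `.wv` indirection fixes both sklearn wrappers above. A hedged sketch using `W2VTransformer` with a toy corpus and illustrative parameters; `transform` returns one row per input word:

>>> from gensim.sklearn_api import W2VTransformer
>>>
>>> wrapper = W2VTransformer(size=10, min_count=1)
>>> vectors = wrapper.fit([["cat", "say", "meow"], ["dog", "say", "woof"]]).transform(['cat', 'dog'])
>>> vectors.shape  # vectors are read through gensim_model.wv
(2, 10)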
gensim/test/test_api.py (1 addition, 1 deletion)
@@ -47,7 +47,7 @@ def test_load_model(self):
base_dir, "__testing_word2vec-matrix-synopsis", "__testing_word2vec-matrix-synopsis.gz"
)
model = api.load("__testing_word2vec-matrix-synopsis")
- vector_dead_calc = model["dead"]
+ vector_dead_calc = model.wv["dead"]
self.assertTrue(np.allclose(vector_dead, vector_dead_calc))
shutil.rmtree(base_dir)
self.assertEqual(api.load("__testing_word2vec-matrix-synopsis", return_path=True), dataset_path)