Skip to content

Commit

Permalink
Merge branch 'develop' into fix-xml
Browse files Browse the repository at this point in the history
  • Loading branch information
piskvorky committed Apr 24, 2020
2 parents a6247af + d70b129 commit 585b0c0
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 55 deletions.
20 changes: 12 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
gensim – Topic Modelling in Python
==================================

[![Build Status](https://travis-ci.org/RaRe-Technologies/gensim.svg?branch=develop)](https://travis-ci.org/RaRe-Technologies/gensim)
[![GitHub release](https://img.shields.io/github/release/rare-technologies/gensim.svg?maxAge=3600)](https://github.com/RaRe-Technologies/gensim/releases)
[![Conda-forge Build](https://anaconda.org/conda-forge/gensim/badges/version.svg)](https://anaconda.org/conda-forge/gensim)
[![Wheel](https://img.shields.io/pypi/wheel/gensim.svg)](https://pypi.python.org/pypi/gensim)
[![DOI](https://zenodo.org/badge/DOI/10.13140/2.1.2393.1847.svg)](https://doi.org/10.13140/2.1.2393.1847)
[![Mailing List](https://img.shields.io/badge/-Mailing%20List-brightgreen.svg)](https://groups.google.com/forum/#!forum/gensim)
[![Gitter](https://img.shields.io/badge/gitter-join%20chat%20%E2%86%92-09a3d5.svg)](https://gitter.im/RaRe-Technologies/gensim)
[![Follow](https://img.shields.io/twitter/follow/gensim_py.svg?style=social&label=Follow)](https://twitter.com/gensim_py)
<!--
The following image URLs are obfuscated, i.e. proxied and cached through
Google, because of GitHub's proxying issues. See:
https://github.com/RaRe-Technologies/gensim/issues/2805
-->

[![Build Status](https://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?container=focus&refresh=3600&url=https%3A%2F%2Ftravis-ci.org%2FRaRe-Technologies%2Fgensim.svg%3Fbranch%3Ddevelop)](https://travis-ci.org/RaRe-Technologies/gensim)
[![GitHub release](https://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?container=focus&refresh=3600&url=https%3A%2F%2Fimg.shields.io%2Fgithub%2Frelease%2Frare-technologies%2Fgensim.svg%3FmaxAge%3D3600)](https://github.com/RaRe-Technologies/gensim/releases)
[![Downloads](https://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?container=focus&refresh=86400&url=https%3A%2F%2Fimg.shields.io%2Fpypi%2Fdm%2Fgensim%3Fcolor%3Dblue)](https://pepy.tech/project/gensim/month)
[![DOI](https://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?container=focus&refresh=604800&url=https%3A%2F%2Fzenodo.org%2Fbadge%2FDOI%2F10.13140%2F2.1.2393.1847.svg)](https://doi.org/10.13140/2.1.2393.1847)
[![Mailing List](https://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?container=focus&refresh=604800&url=https%3A%2F%2Fimg.shields.io%2Fbadge%2F-Mailing%2520List-blue.svg)](https://groups.google.com/forum/#!forum/gensim)
[![Follow](https://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?container=focus&refresh=86400&url=https%3A%2F%2Fimg.shields.io%2Ftwitter%2Ffollow%2Fgensim_py.svg%3Fstyle%3Dsocial%26style%3Dflat%26logo%3Dtwitter%26label%3DFollow%26color%3Dblue)](https://twitter.com/gensim_py)

Gensim is a Python library for *topic modelling*, *document indexing*
and *similarity retrieval* with large corpora. Target audience is the
Expand Down
74 changes: 39 additions & 35 deletions gensim/test/test_fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,12 @@
logger = logging.getLogger(__name__)

IS_WIN32 = (os.name == "nt") and (struct.calcsize('P') * 8 == 32)

MAX_WORDVEC_COMPONENT_DIFFERENCE = 1.0e-10

# Limit the size of FastText ngram buckets, for RAM reasons.
# See https://github.com/RaRe-Technologies/gensim/issues/2790
BUCKET = 5000

FT_HOME = os.environ.get("FT_HOME")
FT_CMD = os.path.join(FT_HOME, "fasttext") if FT_HOME else None

Expand Down Expand Up @@ -67,7 +70,7 @@ def setUp(self):
self.test_new_model_file = datapath('lee_fasttext_new.bin')

def test_training(self):
model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
model.build_vocab(sentences)
self.model_sanity(model)

Expand All @@ -87,7 +90,7 @@ def test_training(self):
self.assertEqual(sims, sims2)

# build vocab and train in one step; must be the same as above
model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
self.models_equal(model, model2)

# verify oov-word vector retrieval
Expand All @@ -99,7 +102,7 @@ def test_training(self):

def testFastTextTrainParameters(self):

model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
model.build_vocab(sentences=sentences)

self.assertRaises(TypeError, model.train, corpus_file=11111)
Expand All @@ -112,7 +115,7 @@ def test_training_fromfile(self):
with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
utils.save_as_line_sentence(sentences, corpus_file)

model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
model.build_vocab(corpus_file=corpus_file)
self.model_sanity(model)

Expand Down Expand Up @@ -151,10 +154,9 @@ def models_equal(self, model, model2):
most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0]
self.assertTrue(np.allclose(model.wv[most_common_word], model2.wv[most_common_word]))

@unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_persistence(self):
tmpf = get_tmpfile('gensim_fasttext.tst')
model = FT_gensim(sentences, min_count=1)
model = FT_gensim(sentences, min_count=1, bucket=BUCKET)
model.save(tmpf)
self.models_equal(model, FT_gensim.load(tmpf))
# test persistence of the KeyedVectors of a model
Expand All @@ -169,7 +171,7 @@ def test_persistence_fromfile(self):
utils.save_as_line_sentence(sentences, corpus_file)

tmpf = get_tmpfile('gensim_fasttext.tst')
model = FT_gensim(corpus_file=corpus_file, min_count=1)
model = FT_gensim(corpus_file=corpus_file, min_count=1, bucket=BUCKET)
model.save(tmpf)
self.models_equal(model, FT_gensim.load(tmpf))
# test persistence of the KeyedVectors of a model
Expand All @@ -179,10 +181,9 @@ def test_persistence_fromfile(self):
self.assertTrue(np.allclose(wv.vectors_ngrams, loaded_wv.vectors_ngrams))
self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))

@unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_norm_vectors_not_saved(self):
tmpf = get_tmpfile('gensim_fasttext.tst')
model = FT_gensim(sentences, min_count=1)
model = FT_gensim(sentences, min_count=1, bucket=BUCKET)
model.init_sims()
model.save(tmpf)
loaded_model = FT_gensim.load(tmpf)
Expand Down Expand Up @@ -406,7 +407,7 @@ def test_cbow_hs_training(self):
model_gensim = FT_gensim(
size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
sorted_vocab=1, workers=1, min_alpha=0.0)
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
Expand Down Expand Up @@ -435,7 +436,7 @@ def test_cbow_hs_training_fromfile(self):
model_gensim = FT_gensim(
size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
sorted_vocab=1, workers=1, min_alpha=0.0)
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)

lee_data = LineSentence(datapath('lee_background.cor'))
utils.save_as_line_sentence(lee_data, corpus_file)
Expand Down Expand Up @@ -468,7 +469,7 @@ def test_sg_hs_training(self):
model_gensim = FT_gensim(
size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
sorted_vocab=1, workers=1, min_alpha=0.0)
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
Expand Down Expand Up @@ -497,7 +498,7 @@ def test_sg_hs_training_fromfile(self):
model_gensim = FT_gensim(
size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
sorted_vocab=1, workers=1, min_alpha=0.0)
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
utils.save_as_line_sentence(lee_data, corpus_file)
Expand Down Expand Up @@ -530,7 +531,7 @@ def test_cbow_neg_training(self):
model_gensim = FT_gensim(
size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
sorted_vocab=1, workers=1, min_alpha=0.0)
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
Expand Down Expand Up @@ -559,7 +560,7 @@ def test_cbow_neg_training_fromfile(self):
model_gensim = FT_gensim(
size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
sorted_vocab=1, workers=1, min_alpha=0.0)
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
utils.save_as_line_sentence(lee_data, corpus_file)
Expand Down Expand Up @@ -592,7 +593,7 @@ def test_sg_neg_training(self):
model_gensim = FT_gensim(
size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
sorted_vocab=1, workers=1, min_alpha=0.0)
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
Expand Down Expand Up @@ -621,7 +622,7 @@ def test_sg_neg_training_fromfile(self):
model_gensim = FT_gensim(
size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
sorted_vocab=1, workers=1, min_alpha=0.0)
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)

lee_data = LineSentence(datapath('lee_background.cor'))
utils.save_as_line_sentence(lee_data, corpus_file)
Expand Down Expand Up @@ -650,7 +651,7 @@ def test_sg_neg_training_fromfile(self):
self.assertGreaterEqual(overlap_count, 2)

def test_online_learning(self):
model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0, bucket=BUCKET)
self.assertTrue(len(model_hs.wv.vocab), 12)
self.assertTrue(model_hs.wv.vocab['graph'].count, 3)
model_hs.build_vocab(new_sentences, update=True) # update vocab
Expand All @@ -664,7 +665,8 @@ def test_online_learning_fromfile(self):
utils.save_as_line_sentence(sentences, corpus_file)
utils.save_as_line_sentence(new_sentences, new_corpus_file)

model_hs = FT_gensim(corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0)
model_hs = FT_gensim(
corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0, bucket=BUCKET)
self.assertTrue(len(model_hs.wv.vocab), 12)
self.assertTrue(model_hs.wv.vocab['graph'].count, 3)
model_hs.build_vocab(corpus_file=new_corpus_file, update=True) # update vocab
Expand All @@ -674,7 +676,7 @@ def test_online_learning_fromfile(self):

def test_online_learning_after_save(self):
tmpf = get_tmpfile('gensim_fasttext.tst')
model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5, bucket=BUCKET)
model_neg.save(tmpf)
model_neg = FT_gensim.load(tmpf)
self.assertTrue(len(model_neg.wv.vocab), 12)
Expand All @@ -689,7 +691,8 @@ def test_online_learning_after_save_fromfile(self):
utils.save_as_line_sentence(new_sentences, new_corpus_file)

tmpf = get_tmpfile('gensim_fasttext.tst')
model_neg = FT_gensim(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5)
model_neg = FT_gensim(
corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5, bucket=BUCKET)
model_neg.save(tmpf)
model_neg = FT_gensim.load(tmpf)
self.assertTrue(len(model_neg.wv.vocab), 12)
Expand Down Expand Up @@ -720,33 +723,30 @@ def online_sanity(self, model):
sim = model.wv.n_similarity(['war'], ['terrorism'])
self.assertLess(0., sim)

@unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_sg_hs_online(self):
model = FT_gensim(sg=1, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1)
model = FT_gensim(sg=1, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1, bucket=BUCKET)
self.online_sanity(model)

@unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_sg_neg_online(self):
model = FT_gensim(sg=1, window=2, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=1)
model = FT_gensim(sg=1, window=2, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=1, bucket=BUCKET)
self.online_sanity(model)

@unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_cbow_hs_online(self):
model = FT_gensim(
sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1
sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1,
bucket=BUCKET,
)
self.online_sanity(model)

@unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def test_cbow_neg_online(self):
model = FT_gensim(
sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=5,
min_count=5, iter=1, seed=42, workers=1, sample=0
min_count=5, iter=1, seed=42, workers=1, sample=0, bucket=BUCKET
)
self.online_sanity(model)

def test_get_vocab_word_vecs(self):
model = FT_gensim(size=10, min_count=1, seed=42)
model = FT_gensim(size=10, min_count=1, seed=42, bucket=BUCKET)
model.build_vocab(sentences)
original_syn0_vocab = np.copy(model.wv.vectors_vocab)
model.wv.adjust_vectors()
Expand All @@ -755,7 +755,7 @@ def test_get_vocab_word_vecs(self):
def test_persistence_word2vec_format(self):
"""Test storing/loading the model in word2vec format."""
tmpf = get_tmpfile('gensim_fasttext_w2v_format.tst')
model = FT_gensim(sentences, min_count=1, size=10)
model = FT_gensim(sentences, min_count=1, size=10, bucket=BUCKET)
model.wv.save_word2vec_format(tmpf, binary=True)
loaded_model_kv = Word2VecKeyedVectors.load_word2vec_format(tmpf, binary=True)
self.assertEqual(len(model.wv.vocab), len(loaded_model_kv.vocab))
Expand All @@ -769,7 +769,7 @@ def test_bucket_ngrams(self):
self.assertEqual(model.wv.vectors_ngrams.shape, (20, 10))

def test_estimate_memory(self):
model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3)
model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3, bucket=BUCKET)
model.build_vocab(sentences)
report = model.estimate_memory()
self.assertEqual(report['vocab'], 2800)
Expand All @@ -780,6 +780,7 @@ def test_estimate_memory(self):
self.assertEqual(report['buckets_word'], 640)
self.assertEqual(report['total'], 6160)

@unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
def testLoadOldModel(self):
"""Test loading fasttext models from previous version"""

Expand Down Expand Up @@ -835,7 +836,7 @@ def test_cbow_hs_against_wrapper(self):

model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
sorted_vocab=1, workers=1, min_alpha=0.0)
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
Expand All @@ -856,7 +857,7 @@ def test_sg_hs_against_wrapper(self):

model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
sorted_vocab=1, workers=1, min_alpha=0.0)
sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)

lee_data = LineSentence(datapath('lee_background.cor'))
model_gensim.build_vocab(lee_data)
Expand Down Expand Up @@ -1334,6 +1335,7 @@ def _check_roundtrip(self, sg):
"hs": 1,
"negative": 5,
"seed": 42,
"bucket": BUCKET,
"workers": 1}

with temporary_file("roundtrip_model_to_model.bin") as fpath:
Expand Down Expand Up @@ -1387,6 +1389,7 @@ def _check_roundtrip_file_file(self, sg):
"min_count": 1,
"hs": 1,
"negative": 0,
"bucket": BUCKET,
"seed": 42,
"workers": 1}

Expand Down Expand Up @@ -1486,6 +1489,7 @@ def _check_load_fasttext_format(self, sg):
"min_count": 1,
"hs": 1,
"negative": 5,
"bucket": BUCKET,
"seed": 42,
"workers": 1}

Expand Down
2 changes: 1 addition & 1 deletion gensim/test/test_nmf.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def testTransform(self):
vec = matutils.sparse2full(transformed, 2)
expected = [0.35023746, 0.64976251]
# must contain the same values, up to re-ordering
self.assertTrue(np.allclose(sorted(vec), sorted(expected), rtol=1e-4))
self.assertTrue(np.allclose(sorted(vec), sorted(expected), rtol=1e-3))

def testTopTopics(self):
top_topics = self.model.top_topics(common_corpus)
Expand Down
4 changes: 2 additions & 2 deletions gensim/test/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,7 @@ def __iter__(self):
for line in infile:
yield line.lower().strip().split()

model = FastText(LeeReader(datapath('lee.cor')))
model = FastText(LeeReader(datapath('lee.cor')), bucket=5000)
model.init_sims()
index = self.indexer(model, 10)

Expand Down Expand Up @@ -733,7 +733,7 @@ def __iter__(self):
for line in infile:
yield line.lower().strip().split()

model = FastText(LeeReader(datapath('lee.cor')))
model = FastText(LeeReader(datapath('lee.cor')), bucket=5000)
model.init_sims()
index = self.indexer(model)

Expand Down
Loading

0 comments on commit 585b0c0

Please sign in to comment.