Merge branch 'develop' into fix-xml

piskvorky · Apr 24, 2020 · 585b0c0 · 585b0c0
2 parents a6247af + d70b129
commit 585b0c0
Show file tree

Hide file tree

Showing 7 changed files with 68 additions and 55 deletions.
diff --git a/README.md b/README.md
@@ -1,14 +1,18 @@
 gensim – Topic Modelling in Python
 ==================================
 
-[![Build Status](https://travis-ci.org/RaRe-Technologies/gensim.svg?branch=develop)](https://travis-ci.org/RaRe-Technologies/gensim)
-[![GitHub release](https://img.shields.io/github/release/rare-technologies/gensim.svg?maxAge=3600)](https://github.com/RaRe-Technologies/gensim/releases)
-[![Conda-forge Build](https://anaconda.org/conda-forge/gensim/badges/version.svg)](https://anaconda.org/conda-forge/gensim)
-[![Wheel](https://img.shields.io/pypi/wheel/gensim.svg)](https://pypi.python.org/pypi/gensim)
-[![DOI](https://zenodo.org/badge/DOI/10.13140/2.1.2393.1847.svg)](https://doi.org/10.13140/2.1.2393.1847)
-[![Mailing List](https://img.shields.io/badge/-Mailing%20List-brightgreen.svg)](https://groups.google.com/forum/#!forum/gensim)
-[![Gitter](https://img.shields.io/badge/gitter-join%20chat%20%E2%86%92-09a3d5.svg)](https://gitter.im/RaRe-Technologies/gensim)
-[![Follow](https://img.shields.io/twitter/follow/gensim_py.svg?style=social&label=Follow)](https://twitter.com/gensim_py)
+<!--
+The following image URLs are obfuscated = proxied and cached through
+Google because of Github's proxying issues. See:
+https://github.com/RaRe-Technologies/gensim/issues/2805
+-->
+
+[![Build Status](https://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?container=focus&refresh=3600&url=https%3A%2F%2Ftravis-ci.org%2FRaRe-Technologies%2Fgensim.svg%3Fbranch%3Ddevelop)](https://travis-ci.org/RaRe-Technologies/gensim)
+[![GitHub release](https://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?container=focus&refresh=3600&url=https%3A%2F%2Fimg.shields.io%2Fgithub%2Frelease%2Frare-technologies%2Fgensim.svg%3FmaxAge%3D3600)](https://github.com/RaRe-Technologies/gensim/releases)
+[![Downloads](https://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?container=focus&refresh=86400&url=https%3A%2F%2Fimg.shields.io%2Fpypi%2Fdm%2Fgensim%3Fcolor%3Dblue)](https://pepy.tech/project/gensim/month)
+[![DOI](https://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?container=focus&refresh=604800&url=https%3A%2F%2Fzenodo.org%2Fbadge%2FDOI%2F10.13140%2F2.1.2393.1847.svg)](https://doi.org/10.13140/2.1.2393.1847)
+[![Mailing List](https://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?container=focus&refresh=604800&url=https%3A%2F%2Fimg.shields.io%2Fbadge%2F-Mailing%2520List-blue.svg)](https://groups.google.com/forum/#!forum/gensim)
+[![Follow](https://images1-focus-opensocial.googleusercontent.com/gadgets/proxy?container=focus&refresh=86400&url=https%3A%2F%2Fimg.shields.io%2Ftwitter%2Ffollow%2Fgensim_py.svg%3Fstyle%3Dsocial%26style%3Dflat%26logo%3Dtwitter%26label%3DFollow%26color%3Dblue)](https://twitter.com/gensim_py)
 
 Gensim is a Python library for *topic modelling*, *document indexing*
 and *similarity retrieval* with large corpora. Target audience is the

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
@@ -33,9 +33,12 @@
 logger = logging.getLogger(__name__)
 
 IS_WIN32 = (os.name == "nt") and (struct.calcsize('P') * 8 == 32)
-
 MAX_WORDVEC_COMPONENT_DIFFERENCE = 1.0e-10
 
+# Limit the size of FastText ngram buckets, for RAM reasons.
+# See https://github.com/RaRe-Technologies/gensim/issues/2790
+BUCKET = 5000
+
 FT_HOME = os.environ.get("FT_HOME")
 FT_CMD = os.path.join(FT_HOME, "fasttext") if FT_HOME else None
 
@@ -67,7 +70,7 @@ def setUp(self):
         self.test_new_model_file = datapath('lee_fasttext_new.bin')
 
     def test_training(self):
-        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
+        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
         model.build_vocab(sentences)
         self.model_sanity(model)
 
@@ -87,7 +90,7 @@ def test_training(self):
         self.assertEqual(sims, sims2)
 
         # build vocab and train in one step; must be the same as above
-        model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
+        model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
         self.models_equal(model, model2)
 
         # verify oov-word vector retrieval
@@ -99,7 +102,7 @@ def test_training(self):
 
     def testFastTextTrainParameters(self):
 
-        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
+        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
         model.build_vocab(sentences=sentences)
 
         self.assertRaises(TypeError, model.train, corpus_file=11111)
@@ -112,7 +115,7 @@ def test_training_fromfile(self):
         with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
             utils.save_as_line_sentence(sentences, corpus_file)
 
-            model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
+            model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1, bucket=BUCKET)
             model.build_vocab(corpus_file=corpus_file)
             self.model_sanity(model)
 
@@ -151,10 +154,9 @@ def models_equal(self, model, model2):
         most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0]
         self.assertTrue(np.allclose(model.wv[most_common_word], model2.wv[most_common_word]))
 
-    @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
     def test_persistence(self):
         tmpf = get_tmpfile('gensim_fasttext.tst')
-        model = FT_gensim(sentences, min_count=1)
+        model = FT_gensim(sentences, min_count=1, bucket=BUCKET)
         model.save(tmpf)
         self.models_equal(model, FT_gensim.load(tmpf))
         #  test persistence of the KeyedVectors of a model
@@ -169,7 +171,7 @@ def test_persistence_fromfile(self):
             utils.save_as_line_sentence(sentences, corpus_file)
 
             tmpf = get_tmpfile('gensim_fasttext.tst')
-            model = FT_gensim(corpus_file=corpus_file, min_count=1)
+            model = FT_gensim(corpus_file=corpus_file, min_count=1, bucket=BUCKET)
             model.save(tmpf)
             self.models_equal(model, FT_gensim.load(tmpf))
             #  test persistence of the KeyedVectors of a model
@@ -179,10 +181,9 @@ def test_persistence_fromfile(self):
             self.assertTrue(np.allclose(wv.vectors_ngrams, loaded_wv.vectors_ngrams))
             self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
 
-    @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
     def test_norm_vectors_not_saved(self):
         tmpf = get_tmpfile('gensim_fasttext.tst')
-        model = FT_gensim(sentences, min_count=1)
+        model = FT_gensim(sentences, min_count=1, bucket=BUCKET)
         model.init_sims()
         model.save(tmpf)
         loaded_model = FT_gensim.load(tmpf)
@@ -406,7 +407,7 @@ def test_cbow_hs_training(self):
         model_gensim = FT_gensim(
             size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
             min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-            sorted_vocab=1, workers=1, min_alpha=0.0)
+            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
 
         lee_data = LineSentence(datapath('lee_background.cor'))
         model_gensim.build_vocab(lee_data)
@@ -435,7 +436,7 @@ def test_cbow_hs_training_fromfile(self):
             model_gensim = FT_gensim(
                 size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
                 min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                sorted_vocab=1, workers=1, min_alpha=0.0)
+                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)
 
             lee_data = LineSentence(datapath('lee_background.cor'))
             utils.save_as_line_sentence(lee_data, corpus_file)
@@ -468,7 +469,7 @@ def test_sg_hs_training(self):
         model_gensim = FT_gensim(
             size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
             min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-            sorted_vocab=1, workers=1, min_alpha=0.0)
+            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
 
         lee_data = LineSentence(datapath('lee_background.cor'))
         model_gensim.build_vocab(lee_data)
@@ -497,7 +498,7 @@ def test_sg_hs_training_fromfile(self):
             model_gensim = FT_gensim(
                 size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
                 min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                sorted_vocab=1, workers=1, min_alpha=0.0)
+                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
 
             lee_data = LineSentence(datapath('lee_background.cor'))
             utils.save_as_line_sentence(lee_data, corpus_file)
@@ -530,7 +531,7 @@ def test_cbow_neg_training(self):
         model_gensim = FT_gensim(
             size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5,
             min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-            sorted_vocab=1, workers=1, min_alpha=0.0)
+            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
 
         lee_data = LineSentence(datapath('lee_background.cor'))
         model_gensim.build_vocab(lee_data)
@@ -559,7 +560,7 @@ def test_cbow_neg_training_fromfile(self):
             model_gensim = FT_gensim(
                 size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5,
                 min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                sorted_vocab=1, workers=1, min_alpha=0.0)
+                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
 
             lee_data = LineSentence(datapath('lee_background.cor'))
             utils.save_as_line_sentence(lee_data, corpus_file)
@@ -592,7 +593,7 @@ def test_sg_neg_training(self):
         model_gensim = FT_gensim(
             size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
             min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-            sorted_vocab=1, workers=1, min_alpha=0.0)
+            sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)
 
         lee_data = LineSentence(datapath('lee_background.cor'))
         model_gensim.build_vocab(lee_data)
@@ -621,7 +622,7 @@ def test_sg_neg_training_fromfile(self):
             model_gensim = FT_gensim(
                 size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
                 min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                sorted_vocab=1, workers=1, min_alpha=0.0)
+                sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET * 4)
 
             lee_data = LineSentence(datapath('lee_background.cor'))
             utils.save_as_line_sentence(lee_data, corpus_file)
@@ -650,7 +651,7 @@ def test_sg_neg_training_fromfile(self):
             self.assertGreaterEqual(overlap_count, 2)
 
     def test_online_learning(self):
-        model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
+        model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0, bucket=BUCKET)
         self.assertTrue(len(model_hs.wv.vocab), 12)
         self.assertTrue(model_hs.wv.vocab['graph'].count, 3)
         model_hs.build_vocab(new_sentences, update=True)  # update vocab
@@ -664,7 +665,8 @@ def test_online_learning_fromfile(self):
             utils.save_as_line_sentence(sentences, corpus_file)
             utils.save_as_line_sentence(new_sentences, new_corpus_file)
 
-            model_hs = FT_gensim(corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0)
+            model_hs = FT_gensim(
+                corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0, bucket=BUCKET)
             self.assertTrue(len(model_hs.wv.vocab), 12)
             self.assertTrue(model_hs.wv.vocab['graph'].count, 3)
             model_hs.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
@@ -674,7 +676,7 @@ def test_online_learning_fromfile(self):
 
     def test_online_learning_after_save(self):
         tmpf = get_tmpfile('gensim_fasttext.tst')
-        model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
+        model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5, bucket=BUCKET)
         model_neg.save(tmpf)
         model_neg = FT_gensim.load(tmpf)
         self.assertTrue(len(model_neg.wv.vocab), 12)
@@ -689,7 +691,8 @@ def test_online_learning_after_save_fromfile(self):
             utils.save_as_line_sentence(new_sentences, new_corpus_file)
 
             tmpf = get_tmpfile('gensim_fasttext.tst')
-            model_neg = FT_gensim(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5)
+            model_neg = FT_gensim(
+                corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5, bucket=BUCKET)
             model_neg.save(tmpf)
             model_neg = FT_gensim.load(tmpf)
             self.assertTrue(len(model_neg.wv.vocab), 12)
@@ -720,33 +723,30 @@ def online_sanity(self, model):
         sim = model.wv.n_similarity(['war'], ['terrorism'])
         self.assertLess(0., sim)
 
-    @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
     def test_sg_hs_online(self):
-        model = FT_gensim(sg=1, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1)
+        model = FT_gensim(sg=1, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1, bucket=BUCKET)
         self.online_sanity(model)
 
-    @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
     def test_sg_neg_online(self):
-        model = FT_gensim(sg=1, window=2, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=1)
+        model = FT_gensim(sg=1, window=2, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=1, bucket=BUCKET)
         self.online_sanity(model)
 
-    @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
     def test_cbow_hs_online(self):
         model = FT_gensim(
-            sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1
+            sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1,
+            bucket=BUCKET,
         )
         self.online_sanity(model)
 
-    @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
     def test_cbow_neg_online(self):
         model = FT_gensim(
             sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=5,
-            min_count=5, iter=1, seed=42, workers=1, sample=0
+            min_count=5, iter=1, seed=42, workers=1, sample=0, bucket=BUCKET
         )
         self.online_sanity(model)
 
     def test_get_vocab_word_vecs(self):
-        model = FT_gensim(size=10, min_count=1, seed=42)
+        model = FT_gensim(size=10, min_count=1, seed=42, bucket=BUCKET)
         model.build_vocab(sentences)
         original_syn0_vocab = np.copy(model.wv.vectors_vocab)
         model.wv.adjust_vectors()
@@ -755,7 +755,7 @@ def test_get_vocab_word_vecs(self):
     def test_persistence_word2vec_format(self):
         """Test storing/loading the model in word2vec format."""
         tmpf = get_tmpfile('gensim_fasttext_w2v_format.tst')
-        model = FT_gensim(sentences, min_count=1, size=10)
+        model = FT_gensim(sentences, min_count=1, size=10, bucket=BUCKET)
         model.wv.save_word2vec_format(tmpf, binary=True)
         loaded_model_kv = Word2VecKeyedVectors.load_word2vec_format(tmpf, binary=True)
         self.assertEqual(len(model.wv.vocab), len(loaded_model_kv.vocab))
@@ -769,7 +769,7 @@ def test_bucket_ngrams(self):
         self.assertEqual(model.wv.vectors_ngrams.shape, (20, 10))
 
     def test_estimate_memory(self):
-        model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3)
+        model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3, bucket=BUCKET)
         model.build_vocab(sentences)
         report = model.estimate_memory()
         self.assertEqual(report['vocab'], 2800)
@@ -780,6 +780,7 @@ def test_estimate_memory(self):
         self.assertEqual(report['buckets_word'], 640)
         self.assertEqual(report['total'], 6160)
 
+    @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32")
     def testLoadOldModel(self):
         """Test loading fasttext models from previous version"""
 
@@ -835,7 +836,7 @@ def test_cbow_hs_against_wrapper(self):
 
         model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0,
                                  min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                                 sorted_vocab=1, workers=1, min_alpha=0.0)
+                                 sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
 
         lee_data = LineSentence(datapath('lee_background.cor'))
         model_gensim.build_vocab(lee_data)
@@ -856,7 +857,7 @@ def test_sg_hs_against_wrapper(self):
 
         model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
                                  min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
-                                 sorted_vocab=1, workers=1, min_alpha=0.0)
+                                 sorted_vocab=1, workers=1, min_alpha=0.0, bucket=BUCKET)
 
         lee_data = LineSentence(datapath('lee_background.cor'))
         model_gensim.build_vocab(lee_data)
@@ -1334,6 +1335,7 @@ def _check_roundtrip(self, sg):
             "hs": 1,
             "negative": 5,
             "seed": 42,
+            "bucket": BUCKET,
             "workers": 1}
 
         with temporary_file("roundtrip_model_to_model.bin") as fpath:
@@ -1387,6 +1389,7 @@ def _check_roundtrip_file_file(self, sg):
             "min_count": 1,
             "hs": 1,
             "negative": 0,
+            "bucket": BUCKET,
             "seed": 42,
             "workers": 1}
 
@@ -1486,6 +1489,7 @@ def _check_load_fasttext_format(self, sg):
             "min_count": 1,
             "hs": 1,
             "negative": 5,
+            "bucket": BUCKET,
             "seed": 42,
             "workers": 1}
 

diff --git a/gensim/test/test_nmf.py b/gensim/test/test_nmf.py
@@ -98,7 +98,7 @@ def testTransform(self):
         vec = matutils.sparse2full(transformed, 2)
         expected = [0.35023746, 0.64976251]
         # must contain the same values, up to re-ordering
-        self.assertTrue(np.allclose(sorted(vec), sorted(expected), rtol=1e-4))
+        self.assertTrue(np.allclose(sorted(vec), sorted(expected), rtol=1e-3))
 
     def testTopTopics(self):
         top_topics = self.model.top_topics(common_corpus)

diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py
@@ -570,7 +570,7 @@ def __iter__(self):
                     for line in infile:
                         yield line.lower().strip().split()
 
-        model = FastText(LeeReader(datapath('lee.cor')))
+        model = FastText(LeeReader(datapath('lee.cor')), bucket=5000)
         model.init_sims()
         index = self.indexer(model, 10)
 
@@ -733,7 +733,7 @@ def __iter__(self):
                     for line in infile:
                         yield line.lower().strip().split()
 
-        model = FastText(LeeReader(datapath('lee.cor')))
+        model = FastText(LeeReader(datapath('lee.cor')), bucket=5000)
         model.init_sims()
         index = self.indexer(model)