From 574134e1a49411375d9a599079231934506cd867 Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Tue, 26 Dec 2017 03:10:29 +0200
Subject: [PATCH 1/8] minor style refactoring and comment fixes in accordance
 with PEP8

---
 gensim/corpora/wikicorpus.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 0c1c229bac..7148b90884 100755
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -3,6 +3,7 @@
 #
 # Copyright (C) 2010 Radim Rehurek
 # Copyright (C) 2012 Lars Buitinck
+# Copyright (C) 2018 Emmanouil Stergiadis
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

@@ -56,8 +57,8 @@
 RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE)  # table formatting
 RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE)  # table cell formatting
 RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)  # categories
-# Remove File and Image template
-RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
+RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)  # Remove File and Image template
+
 # MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that
 # ought to be ignored
@@ -332,11 +333,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         self.token_min_len = token_min_len
         self.token_max_len = token_max_len
         self.lower = lower
-
-        if dictionary is None:
-            self.dictionary = Dictionary(self.get_texts())
-        else:
-            self.dictionary = dictionary
+        self.dictionary = dictionary or Dictionary(self.get_texts())

     def get_texts(self):
         """
@@ -344,7 +341,7 @@ def get_texts(self):
         of tokens.

         Only articles of sufficient length are returned (short articles & redirects
-        etc are ignored). This is control by `article_min_tokens` on the class instance.
+        etc are ignored). This is controlled by `article_min_tokens` on the class instance.

         Note that this iterates over the **texts**; if you want vectors, just use
         the standard corpus interface instead of this function::
@@ -380,6 +377,7 @@ def get_texts(self):
                     yield (tokens, (pageid, title))
                 else:
                     yield tokens
+
         except KeyboardInterrupt:
             logger.warn(
                 "user terminated iteration over Wikipedia corpus after %i documents with %i positions "

From 952e8d5011a100bb30da98a8c631a7e3b63cc19e Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Tue, 26 Dec 2017 03:10:30 +0200
Subject: [PATCH 2/8] Created test data in legitimate compressed XML format
 (.xml.bz2) for the WikiCorpus class.

* Used the same raw data as the other sources (9 articles).
* Added various wiki markup to test the parsing regular expressions
---
 gensim/test/test_data/testcorpus.xml.bz2 | Bin 0 -> 1404 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 gensim/test/test_data/testcorpus.xml.bz2

diff --git a/gensim/test/test_data/testcorpus.xml.bz2 b/gensim/test/test_data/testcorpus.xml.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..064b9ad1e9c9704d9e1204a4c4691f8561a400bd
GIT binary patch
literal 1404
zcmV-?1%vuRT4*^jL0KkKSqzed&j1DL-+)vyQHTHkKgM3#zyJUAU4Gj$#fso@SO*Fz}WDE$z(Sk68
MwkE)<0efs!enF&2*lBXFoQ;z01)FQO*Fz}WDE$z(Sk68MwkE)DoB$EO#?=a2xT;7
zGb#F0LrjlQdYeh36Bz<(Lm3e#&-!k&=Tp-HOoS6EkR?Qf*$PENmp)I9F(u*he803X
z!x;bw2*R^}EGr^u$&i2u2Xv7g3{{T|CXBQdIuIQxg;0fZNk3qSL=atevLnwj${+}(
ztdyV(!vF#^THHEPFA%78@}kN}xJ*8abXoO>PrWTv_GF3?9e>r*2UOt=OE5h{({_w%&4;X!r^ux*;
zHf_zn3-@yD$`+VL9FVm7Axxp9Emwtoyq>QFKR9QO-8g3qA<2Q=454hHhTd3LR-BpZ
z)6t2-86zvh9BLG(c}UI}>xNc_ta%}vI73xK9(ZfZ98s#!!wXW-!wz_C*N$BgfT|#H(Q&htFN=Bp`1MVVgEZiJ<6v|
zYOcEX()I1Gx_b1sb&W06_VI_V9Z(jTlw~xXbo#<6UlA%-iLcLcc
zSsbvmEeeILWGWa#jN?G-eRTrZ@47&M~LsFki2DjQ+_A-*4lKg=lerwCWhquEdO^!)f>
zpE!P2k)zHZjsBQa;Hmy$P0-sdGTHl~sT#@qT7CzYk52yBYK9&ftLj753rp1fVf|PH
zbHnC&KI{t9zq=jXq-KB+XR6kq
zR5rG?L(0<{!WLRJ~zLK#Eo2<+^IK~VGPKVIq8qc^5~Q1?TZ(B{eHUJ;=mqoywUp_Kkxklznmd^(|<
zWM+moLg;_EcCNo#3ncsDcyuoD&4=*4m+

From: Manos Stergiadis
Date: Tue, 26 Dec 2017 03:10:31 +0200
Subject: [PATCH 3/8] Added test class for the WikiCorpus source.

* Following the same inheritance schema as in the source: TestWikiCorpus >
  TestTextCorpus > CorpusTestCase.
* Testing methods are overridden where necessary to reflect logic changes.
* All existing functionality is tested (markup handling, minimum article
  length, etc.)
---
 gensim/test/test_corpora.py | 71 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 70 insertions(+), 1 deletion(-)

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 4ddc16e0cf..33f28dfbd7 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -11,6 +11,7 @@
 from __future__ import unicode_literals

 import codecs
+import bz2
 import itertools
 import logging
 import os.path
@@ -18,9 +19,10 @@
 import unittest

 import numpy as np
+from xml.etree.cElementTree import ParseError

 from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
-                            ucicorpus, malletcorpus, textcorpus, indexedcorpus)
+                            ucicorpus, malletcorpus, textcorpus, indexedcorpus, wikicorpus)
 from gensim.interfaces import TransformedCorpus
 from gensim.utils import to_unicode
 from gensim.test.utils import datapath, get_tmpfile
@@ -400,6 +402,73 @@ def test_indexing(self):
         pass


+class TestWikiCorpus(TestTextCorpus):
+    def setUp(self):
+        self.corpus_class = wikicorpus.WikiCorpus
+        self.file_extension = '.xml.bz2'
+        self.fname = datapath('testcorpus.'
+                              + self.file_extension.lstrip('.'))
+
+    def test_default_preprocessing(self):
+        expected = ['computer', 'human', 'interface']
+        corpus = self.corpus_class(self.fname, article_min_tokens=0)
+        first_text = corpus.get_texts().next()
+        self.assertEqual(expected, first_text)
+
+    def test_len(self):
+
+        def test_with_limit(article_min_tokens, expected_articles):
+            corpus = self.corpus_class(self.fname, article_min_tokens=article_min_tokens)
+            all_articles = corpus.get_texts()
+            assert (len(list(all_articles)) == expected_articles)
+
+        test_with_limit(0, 9)
+        test_with_limit(100000, 0)
+
+    def test_load_with_metadata(self):
+        corpus = self.corpus_class(self.fname, article_min_tokens=0)
+        corpus.metadata = True
+        self.assertEqual(len(corpus), 9)
+
+        docs = list(corpus)
+        self.assertEqual(len(docs), 9)
+
+        for i, docmeta in enumerate(docs):
+            doc, metadata = docmeta
+            article_no = i + 1  # Counting IDs from 1
+            self.assertEqual(metadata[0], str(article_no))
+            self.assertEqual(metadata[1], 'Article%d' % article_no)
+
+    def test_load(self):
+        corpus = self.corpus_class(self.fname, article_min_tokens=0)
+
+        docs = list(corpus)
+        # the deerwester corpus always has nine documents
+        self.assertEqual(len(docs), 9)
+
+    def test_empty_input(self):
+        tmpf = get_tmpfile('emptycorpus.xml.bz2')
+        content = bz2.compress(b'')  # Explicit string to byte conversion needed in Python 3
+        fh = open(tmpf, "wb")
+        fh.write(content)
+        fh.close()
+
+        with self.assertRaises(ParseError):
+            corpus = self.corpus_class(tmpf)
+            del corpus  # Needed to suppress tox warning
+
+    def test_sample_text(self):
+        # Cannot instantiate WikiCorpus from lines
+        pass
+
+    def test_sample_text_length(self):
+        # Cannot instantiate WikiCorpus from lines
+        pass
+
+    def test_sample_text_seed(self):
+        # Cannot instantiate WikiCorpus from lines
+        pass
+
+
 class TestTextDirectoryCorpus(unittest.TestCase):

     def write_one_level(self, *args):

From 836c3c2431ff07bd1661e551f2f940a9d2b0fd69 Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Thu, 28 Dec 2017 21:16:26 +0200
Subject: [PATCH 4/8] Fix Python 3 compatibility for the generator next method

---
 gensim/test/test_corpora.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 33f28dfbd7..bc279de1ed 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -411,7 +411,7 @@ def setUp(self):
     def test_default_preprocessing(self):
         expected = ['computer', 'human', 'interface']
         corpus = self.corpus_class(self.fname, article_min_tokens=0)
-        first_text = corpus.get_texts().next()
+        first_text = next(corpus.get_texts())
         self.assertEqual(expected, first_text)

     def test_len(self):

From 43a48f5f9abde0ceba96636e9efd1347a0265789 Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Sat, 30 Dec 2017 20:26:55 +0200
Subject: [PATCH 5/8] Code review corrections

---
 gensim/test/test_corpora.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index bc279de1ed..c3433f8195 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -415,14 +415,15 @@ def test_default_preprocessing(self):
         self.assertEqual(expected, first_text)

     def test_len(self):
+        # When there is no min_token limit, all 9 articles must be registered.
+        corpus = self.corpus_class(self.fname, article_min_tokens=0)
+        all_articles = corpus.get_texts()
+        assert (len(list(all_articles)) == 9)

-        def test_with_limit(article_min_tokens, expected_articles):
-            corpus = self.corpus_class(self.fname, article_min_tokens=article_min_tokens)
-            all_articles = corpus.get_texts()
-            assert (len(list(all_articles)) == expected_articles)
-
-        test_with_limit(0, 9)
-        test_with_limit(100000, 0)
+        # With a huge min_token limit, all articles should be filtered out.
+        corpus = self.corpus_class(self.fname, article_min_tokens=100000)
+        all_articles = corpus.get_texts()
+        assert (len(list(all_articles)) == 0)

     def test_load_with_metadata(self):
         corpus = self.corpus_class(self.fname, article_min_tokens=0)
@@ -446,11 +447,14 @@ def test_load(self):
         self.assertEqual(len(docs), 9)

     def test_empty_input(self):
+        """
+        Empty compressed input raises ParseError
+        """
         tmpf = get_tmpfile('emptycorpus.xml.bz2')
-        content = bz2.compress(b'')  # Explicit string to byte conversion needed in Python 3
-        fh = open(tmpf, "wb")
-        fh.write(content)
-        fh.close()
+        content = bz2.compress(''.encode())  # Explicit string to byte conversion needed in Python 3
+
+        with open(tmpf, "wb") as fh:
+            fh.write(content)

         with self.assertRaises(ParseError):
             corpus = self.corpus_class(tmpf)

From 8b7a1d585001a17fcff32f41570ebafb400ab354 Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Sat, 30 Dec 2017 20:29:39 +0200
Subject: [PATCH 6/8] Moved WikiCorpus tests from test/test_wikicorpus.py into
 their own class within test_corpora.py.

* Adapted all old tests to the new class.
* The current test class schema ensures that WikiCorpus also passes the tests
  defined in its parents.
* Deleted test_wikicorpus.py since it is now redundant.
---
 gensim/test/test_corpora.py    |  72 +++++++++++++++++-
 gensim/test/test_wikicorpus.py | 135 ---------------------------------
 2 files changed, 71 insertions(+), 136 deletions(-)
 delete mode 100644 gensim/test/test_wikicorpus.py

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index c3433f8195..20bf28385f 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -401,12 +401,12 @@ def test_serialize_compressed(self):
     def test_indexing(self):
         pass

-
 class TestWikiCorpus(TestTextCorpus):
     def setUp(self):
         self.corpus_class = wikicorpus.WikiCorpus
         self.file_extension = '.xml.bz2'
         self.fname = datapath('testcorpus.'
                               + self.file_extension.lstrip('.'))
+        self.enwiki = datapath('enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2')

     def test_default_preprocessing(self):
         expected = ['computer', 'human', 'interface']
@@ -460,6 +460,76 @@ def test_empty_input(self):
             corpus = self.corpus_class(tmpf)
             del corpus  # Needed to suppress tox warning

+    def test_unicode_element(self):
+        """
+        First unicode article in this sample is
+        1) папа
+        """
+        bgwiki = datapath('bgwiki-latest-pages-articles-shortened.xml.bz2')
+        corpus = self.corpus_class(bgwiki)
+        texts = corpus.get_texts()
+        self.assertTrue(u'папа' in next(texts))
+
+    def test_lower_case_set_true(self):
+        """
+        Set the parameter lower to True and check that the upper case 'Anarchism' token doesn't exist
+        """
+        corpus = self.corpus_class(self.enwiki, processes=1, lower=True, lemmatize=False)
+        row = corpus.get_texts()
+        list_tokens = next(row)
+        self.assertTrue(u'Anarchism' not in list_tokens)
+        self.assertTrue(u'anarchism' in list_tokens)
+
+    def test_lower_case_set_false(self):
+        """
+        Set the parameter lower to False and check that the upper case 'Anarchism' token exists
+        """
+        corpus = self.corpus_class(self.enwiki, processes=1, lower=False, lemmatize=False)
+        row = corpus.get_texts()
+        list_tokens = next(row)
+        self.assertTrue(u'Anarchism' in list_tokens)
+        self.assertTrue(u'anarchism' in list_tokens)
+
+    def test_min_token_len_not_set(self):
+        """
+        Don't set the parameter token_min_len and check that 'a' as a token doesn't exist.
+        Default token_min_len=2
+        """
+        corpus = self.corpus_class(self.enwiki, processes=1, lemmatize=False)
+        self.assertTrue(u'a' not in next(corpus.get_texts()))
+
+    def test_min_token_len_set(self):
+        """
+        Set the parameter token_min_len to 1 and check that 'a' as a token exists
+        """
+        corpus = self.corpus_class(self.enwiki, processes=1, token_min_len=1, lemmatize=False)
+        self.assertTrue(u'a' in next(corpus.get_texts()))
+
+    def test_max_token_len_not_set(self):
+        """
+        Don't set the parameter token_max_len and check that 'collectivization' as a token doesn't exist.
+        Default token_max_len=15
+        """
+        corpus = self.corpus_class(self.enwiki, processes=1, lemmatize=False)
+        self.assertTrue(u'collectivization' not in next(corpus.get_texts()))
+
+    def test_max_token_len_set(self):
+        """
+        Set the parameter token_max_len to 16 and check that 'collectivization' as a token exists
+        """
+        corpus = self.corpus_class(self.enwiki, processes=1, token_max_len=16, lemmatize=False)
+        self.assertTrue(u'collectivization' in next(corpus.get_texts()))
+
+    # TODO: sporadic failure to be investigated
+    # def test_get_texts_returns_generator_of_lists(self):
+    #
+    #     corpus = self.corpus_class(self.fname)
+    #     l = corpus.get_texts()
+    #     self.assertEqual(type(l), types.GeneratorType)
+    #     first = next(l)
+    #     self.assertEqual(type(first), list)
+    #     self.assertTrue(isinstance(first[0], bytes) or isinstance(first[0], str))
+
     def test_sample_text(self):
         # Cannot instantiate WikiCorpus from lines
         pass
diff --git a/gensim/test/test_wikicorpus.py b/gensim/test/test_wikicorpus.py
deleted file mode 100644
index e7b7b14011..0000000000
--- a/gensim/test/test_wikicorpus.py
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2010 Radim Rehurek
-# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
-
-"""
-Automated tests for checking the WikiCorpus
-"""
-
-
-import logging
-import unittest
-
-from gensim.corpora.wikicorpus import WikiCorpus
-from gensim import utils
-from gensim.test.utils
import datapath - -FILENAME = 'enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2' -FILENAME_U = 'bgwiki-latest-pages-articles-shortened.xml.bz2' - -logger = logging.getLogger(__name__) - - -def custom_tokeiner(content, token_min_len=2, token_max_len=15, lower=True): - return [ - utils.to_unicode(token.lower()) if lower else utils.to_unicode(token) for token in content.split() - if token_min_len <= len(token) <= token_max_len and not token.startswith('_') - ] - - -class TestWikiCorpus(unittest.TestCase): - - # #TODO: sporadic failure to be investigated - # def test_get_texts_returns_generator_of_lists(self): - # logger.debug("Current Python Version is %s", str(sys.version_info)) - # if sys.version_info < (2, 7, 0): - # return - # - # wc = WikiCorpus(datapath(FILENAME)) - # l = wc.get_texts() - # self.assertEqual(type(l), types.GeneratorType) - # first = next(l) - # self.assertEqual(type(first), list) - # self.assertTrue(isinstance(first[0], bytes) or isinstance(first[0], str)) - - def test_first_element(self): - """ - First two articles in this sample are - 1) anarchism - 2) autism - """ - wc = WikiCorpus(datapath(FILENAME), processes=1) - - texts = wc.get_texts() - self.assertTrue(u'anarchism' in next(texts)) - self.assertTrue(u'autism' in next(texts)) - - def test_unicode_element(self): - """ - First unicode article in this sample is - 1) папа - """ - wc = WikiCorpus(datapath(FILENAME_U), processes=1) - - texts = wc.get_texts() - self.assertTrue(u'папа' in next(texts)) - - def test_lower_case_set_true(self): - """ - set the parameter lower to True and check that upper case 'Anarchism' token doesnt exist - """ - wc = WikiCorpus(datapath(FILENAME), processes=1, lower=True, lemmatize=False) - row = wc.get_texts() - list_tokens = next(row) - self.assertTrue(u'Anarchism' not in list_tokens) - self.assertTrue(u'anarchism' in list_tokens) - - def test_lower_case_set_false(self): - """ - set the parameter lower to False and check that upper case Anarchism' token exist - """ - wc = WikiCorpus(datapath(FILENAME), processes=1, lower=False, lemmatize=False) - row = wc.get_texts() - list_tokens = next(row) - self.assertTrue(u'Anarchism' in list_tokens) - self.assertTrue(u'anarchism' in list_tokens) - - def test_min_token_len_not_set(self): - """ - don't set the parameter token_min_len and check that 'a' as a token doesn't exists - default token_min_len=2 - """ - wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False) - self.assertTrue(u'a' not in next(wc.get_texts())) - - def test_min_token_len_set(self): - """ - set the parameter token_min_len to 1 and check that 'a' as a token exists - """ - wc = WikiCorpus(datapath(FILENAME), processes=1, token_min_len=1, lemmatize=False) - self.assertTrue(u'a' in next(wc.get_texts())) - - def test_max_token_len_not_set(self): - """ - don't set the parameter token_max_len and check that 'collectivisation' as a token doesn't exists - default token_max_len=15 - """ - wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False) - self.assertTrue(u'collectivization' not in next(wc.get_texts())) - - def test_max_token_len_set(self): - """ - set the parameter token_max_len to 16 and check that 'collectivisation' as a token exists - """ - wc = WikiCorpus(datapath(FILENAME), processes=1, token_max_len=16, lemmatize=False) - self.assertTrue(u'collectivization' in next(wc.get_texts())) - - def test_custom_tokenizer(self): - """ - define a custom tokenizer function and use it - """ - wc = WikiCorpus(datapath(FILENAME), processes=1, 
lemmatize=False, tokenizer_func=custom_tokeiner,
-                        token_max_len=16, token_min_len=1, lower=False)
-        row = wc.get_texts()
-        list_tokens = next(row)
-        self.assertTrue(u'Anarchism' in list_tokens)
-        self.assertTrue(u'collectivization' in list_tokens)
-        self.assertTrue(u'a' in list_tokens)
-        self.assertTrue(u'i.e.' in list_tokens)
-
-
-if __name__ == '__main__':
-    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
-    unittest.main()

From b5976c4f95ae3ea8d084946943ba381837c92bad Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Wed, 10 Jan 2018 21:20:10 +0100
Subject: [PATCH 7/8] Discarded the empty input test for the WikiCorpus since
 an empty file is not legitimate XML

---
 gensim/test/test_corpora.py | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 20bf28385f..2d4b369831 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -11,7 +11,6 @@
 from __future__ import unicode_literals

 import codecs
-import bz2
 import itertools
 import logging
 import os.path
@@ -19,7 +18,6 @@
 import unittest

 import numpy as np
-from xml.etree.cElementTree import ParseError

 from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
                             ucicorpus, malletcorpus, textcorpus, indexedcorpus, wikicorpus)
@@ -401,6 +399,7 @@ def test_serialize_compressed(self):
     def test_indexing(self):
         pass

+
 class TestWikiCorpus(TestTextCorpus):
     def setUp(self):
         self.corpus_class = wikicorpus.WikiCorpus
@@ -446,20 +445,6 @@ def test_load(self):
         # the deerwester corpus always has nine documents
         self.assertEqual(len(docs), 9)

-    def test_empty_input(self):
-        """
-        Empty compressed input raises ParseError
-        """
-        tmpf = get_tmpfile('emptycorpus.xml.bz2')
-        content = bz2.compress(''.encode())  # Explicit string to byte conversion needed in Python 3
-
-        with open(tmpf, "wb") as fh:
-            fh.write(content)
-
-        with self.assertRaises(ParseError):
-            corpus = self.corpus_class(tmpf)
-            del corpus  # Needed to suppress tox warning
-
     def test_unicode_element(self):
         """
         First unicode article in this sample is
@@ -542,6 +527,9 @@ def test_sample_text_seed(self):
         # Cannot instantiate WikiCorpus from lines
         pass

+    def test_empty_input(self):
+        pass
+

 class TestTextDirectoryCorpus(unittest.TestCase):

From 78f28705008226dbf3b1204733e14b61897bc38a Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Thu, 11 Jan 2018 11:10:32 +0100
Subject: [PATCH 8/8] Added 2 more tests

---
 gensim/test/test_corpora.py | 40 ++++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 2d4b369831..f330dbd271 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -400,6 +400,15 @@ def test_indexing(self):
         pass


+# Needed for test_custom_tokenizer in the TestWikiCorpus class.
+# Cannot be nested due to serialization (it must be picklable for multiprocessing).
+def custom_tokenizer(content, token_min_len=2, token_max_len=15, lower=True):
+    return [
+        to_unicode(token.lower()) if lower else to_unicode(token) for token in content.split()
+        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
+    ]
+
+
 class TestWikiCorpus(TestTextCorpus):
     def setUp(self):
         self.corpus_class = wikicorpus.WikiCorpus
@@ -445,6 +454,18 @@ def test_load(self):
         # the deerwester corpus always has nine documents
         self.assertEqual(len(docs), 9)

+    def test_first_element(self):
+        """
+        First two articles in this sample are
+        1) anarchism
+        2) autism
+        """
+        corpus = self.corpus_class(self.enwiki, processes=1)
+
+        texts = corpus.get_texts()
+        self.assertTrue(u'anarchism' in next(texts))
+        self.assertTrue(u'autism' in next(texts))
+
     def test_unicode_element(self):
         """
         First unicode article in this sample is
@@ -455,6 +476,19 @@ def test_unicode_element(self):
         texts = corpus.get_texts()
         self.assertTrue(u'папа' in next(texts))

+    def test_custom_tokenizer(self):
+        """
+        Define a custom tokenizer function and use it
+        """
+        wc = self.corpus_class(self.enwiki, processes=1, lemmatize=False, tokenizer_func=custom_tokenizer,
+                               token_max_len=16, token_min_len=1, lower=False)
+        row = wc.get_texts()
+        list_tokens = next(row)
+        self.assertTrue(u'Anarchism' in list_tokens)
+        self.assertTrue(u'collectivization' in list_tokens)
+        self.assertTrue(u'a' in list_tokens)
+        self.assertTrue(u'i.e.' in list_tokens)
+
     def test_lower_case_set_true(self):
         """
         Set the parameter lower to True and check that the upper case 'Anarchism' token doesn't exist
@@ -505,10 +539,9 @@ def test_max_token_len_set(self):
         corpus = self.corpus_class(self.enwiki, processes=1, token_max_len=16, lemmatize=False)
         self.assertTrue(u'collectivization' in next(corpus.get_texts()))

-    # TODO: sporadic failure to be investigated
+    # #TODO: sporadic failure to be investigated
     # def test_get_texts_returns_generator_of_lists(self):
-    #
-    #     corpus = self.corpus_class(self.fname)
+    #     corpus = self.corpus_class(self.enwiki)
     #     l = corpus.get_texts()
     #     self.assertEqual(type(l), types.GeneratorType)
     #     first = next(l)
@@ -528,6 +561,7 @@ def test_sample_text_seed(self):
         # Cannot instantiate WikiCorpus from lines
         pass

     def test_empty_input(self):
+        # An empty file is not legitimate XML
         pass
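
Usage note: a minimal sketch of the WikiCorpus behaviour exercised by the
tests in this series. The dump filename is a placeholder for a local file
such as the testcorpus.xml.bz2 fixture added in PATCH 2/8; the
article_min_tokens and lemmatize parameters and the metadata flag are the
ones touched by PATCH 1/8 and the tests above.

    from gensim.corpora.wikicorpus import WikiCorpus

    # Stream tokenized articles straight from a bz2-compressed wiki dump.
    # article_min_tokens=0 keeps even the shortest articles, as in the tests.
    corpus = WikiCorpus('testcorpus.xml.bz2', article_min_tokens=0, lemmatize=False)
    for tokens in corpus.get_texts():
        print(tokens[:5])  # each article is a list of unicode tokens

    # With metadata enabled, get_texts() yields (tokens, (pageid, title))
    # pairs instead, per the `yield (tokens, (pageid, title))` branch in PATCH 1/8.
    corpus.metadata = True
    for tokens, (pageid, title) in corpus.get_texts():
        print(pageid, title, len(tokens))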