piskvorky · menshikh-iv · Jan 11, 2018 · Dec 26, 2017 · Dec 26, 2017 · Dec 26, 2017
diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
@@ -3,6 +3,7 @@
 #
 # Copyright (C) 2010 Radim Rehurek <[email protected]>
 # Copyright (C) 2012 Lars Buitinck <[email protected]>
+# Copyright (C) 2018 Emmanouil Stergiadis <[email protected]>
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
 
@@ -56,8 +57,8 @@
 RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE)  # table formatting
 RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE)  # table cell formatting
 RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)  # categories
-# Remove File and Image template
-RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
+RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage:)[^]]*(\]\])', re.UNICODE)  # File:/Image: namespace links only
+
 
 # MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that
 # ought to be ignored
@@ -332,19 +333,15 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         self.token_min_len = token_min_len
         self.token_max_len = token_max_len
         self.lower = lower
-
-        if dictionary is None:
-            self.dictionary = Dictionary(self.get_texts())
-        else:
-            self.dictionary = dictionary
+        self.dictionary = Dictionary(self.get_texts()) if dictionary is None else dictionary  # keep empty dicts
 
     def get_texts(self):
         """
         Iterate over the dump, returning text version of each article as a list
         of tokens.
 
         Only articles of sufficient length are returned (short articles & redirects
-        etc are ignored). This is control by `article_min_tokens` on the class instance.
+        etc are ignored). This is controlled by `article_min_tokens` on the class instance.
 
         Note that this iterates over the **texts**; if you want vectors, just use
         the standard corpus interface instead of this function::
@@ -380,6 +377,7 @@ def get_texts(self):
                         yield (tokens, (pageid, title))
                     else:
                         yield tokens
+
         except KeyboardInterrupt:
             logger.warn(
                 "user terminated iteration over Wikipedia corpus after %i documents with %i positions "

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
@@ -11,16 +11,18 @@
 from __future__ import unicode_literals
 
 import codecs
+import bz2
 import itertools
 import logging
 import os.path
 import tempfile
 import unittest
 
 import numpy as np
+from xml.etree.cElementTree import ParseError
 
 from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
-                            ucicorpus, malletcorpus, textcorpus, indexedcorpus)
+                            ucicorpus, malletcorpus, textcorpus, indexedcorpus, wikicorpus)
 from gensim.interfaces import TransformedCorpus
 from gensim.utils import to_unicode
 from gensim.test.utils import datapath, get_tmpfile
@@ -400,6 +402,73 @@ def test_indexing(self):
         pass
 
 
+class TestWikiCorpus(TestTextCorpus):
+    """Run the corpus tests against WikiCorpus, fed from the bz2-compressed XML test dump."""
+
+    def setUp(self):
+        self.corpus_class = wikicorpus.WikiCorpus
+        self.file_extension = '.xml.bz2'
+        self.fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
+
+    def test_default_preprocessing(self):
+        expected = ['computer', 'human', 'interface']
+        corpus = self.corpus_class(self.fname, article_min_tokens=0)
+        first_text = next(corpus.get_texts())
+        self.assertEqual(expected, first_text)
+
+    def test_len(self):
+        """The number of yielded articles must respect `article_min_tokens`."""
+        def check_article_count(article_min_tokens, expected_articles):
+            corpus = self.corpus_class(self.fname, article_min_tokens=article_min_tokens)
+            # assertEqual, unlike a bare assert, is not stripped by `python -O` and reports both values
+            self.assertEqual(len(list(corpus.get_texts())), expected_articles)
+
+        check_article_count(0, 9)
+        check_article_count(100000, 0)
+
+    def test_load_with_metadata(self):
+        corpus = self.corpus_class(self.fname, article_min_tokens=0)
+        corpus.metadata = True
+        self.assertEqual(len(corpus), 9)
+
+        docs = list(corpus)
+        self.assertEqual(len(docs), 9)
+
+        # With metadata enabled each item is a (document, (pageid, title)) pair; page IDs count from 1
+        for article_no, (_, metadata) in enumerate(docs, start=1):
+            self.assertEqual(metadata[0], str(article_no))
+            self.assertEqual(metadata[1], 'Article%d' % article_no)
+
+    def test_load(self):
+        corpus = self.corpus_class(self.fname, article_min_tokens=0)
+
+        docs = list(corpus)
+        # the deerwester corpus always has nine documents
+        self.assertEqual(len(docs), 9)
+
+    def test_empty_input(self):
+        tmpf = get_tmpfile('emptycorpus.xml.bz2')
+        # bz2.compress() needs bytes on Python 3, hence the explicit b'' literal
+        with open(tmpf, "wb") as fh:  # context manager closes the handle even if the write fails
+            fh.write(bz2.compress(b''))
+
+        # The instance is deliberately not bound to a name: no unused variable to flag or delete
+        with self.assertRaises(ParseError):
+            self.corpus_class(tmpf)
+
+    def test_sample_text(self):
+        # Cannot instantiate WikiCorpus from lines
+        pass
+
+    def test_sample_text_length(self):
+        # Cannot instantiate WikiCorpus from lines
+        pass
+
+    def test_sample_text_seed(self):
+        # Cannot instantiate WikiCorpus from lines
+        pass
+
+
 class TestTextDirectoryCorpus(unittest.TestCase):
 
     def write_one_level(self, *args):

diff --git a/gensim/test/test_data/testcorpus.xml.bz2 b/gensim/test/test_data/testcorpus.xml.bz2