Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test and refactor WikiCorpus #1821

Merged
merged 9 commits into from
Jan 11, 2018
14 changes: 6 additions & 8 deletions gensim/corpora/wikicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#
# Copyright (C) 2010 Radim Rehurek <[email protected]>
# Copyright (C) 2012 Lars Buitinck <[email protected]>
# Copyright (C) 2018 Emmanouil Stergiadis <[email protected]>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


Expand Down Expand Up @@ -56,8 +57,8 @@
RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting
RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting
RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) # categories
# Remove File and Image template
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) # Remove File and Image template


# MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that
# ought to be ignored
Expand Down Expand Up @@ -332,19 +333,15 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
self.token_min_len = token_min_len
self.token_max_len = token_max_len
self.lower = lower

if dictionary is None:
self.dictionary = Dictionary(self.get_texts())
else:
self.dictionary = dictionary
self.dictionary = dictionary or Dictionary(self.get_texts())

def get_texts(self):
"""
Iterate over the dump, returning text version of each article as a list
of tokens.

Only articles of sufficient length are returned (short articles & redirects
etc are ignored). This is control by `article_min_tokens` on the class instance.
etc are ignored). This is controlled by `article_min_tokens` on the class instance.

Note that this iterates over the **texts**; if you want vectors, just use
the standard corpus interface instead of this function::
Expand Down Expand Up @@ -380,6 +377,7 @@ def get_texts(self):
yield (tokens, (pageid, title))
else:
yield tokens

except KeyboardInterrupt:
logger.warn(
"user terminated iteration over Wikipedia corpus after %i documents with %i positions "
Expand Down
71 changes: 70 additions & 1 deletion gensim/test/test_corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,18 @@
from __future__ import unicode_literals

import codecs
import bz2
import itertools
import logging
import os.path
import tempfile
import unittest

import numpy as np
from xml.etree.cElementTree import ParseError

from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
ucicorpus, malletcorpus, textcorpus, indexedcorpus)
ucicorpus, malletcorpus, textcorpus, indexedcorpus, wikicorpus)
from gensim.interfaces import TransformedCorpus
from gensim.utils import to_unicode
from gensim.test.utils import datapath, get_tmpfile
Expand Down Expand Up @@ -400,6 +402,73 @@ def test_indexing(self):
pass


class TestWikiCorpus(TestTextCorpus):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One of your tests "stuck" in Travis, please check, what's a problem and fix https://travis-ci.org/RaRe-Technologies/gensim/jobs/322641221#L828

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, question: why you need TextTextCorpus here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am inheriting so that we make sure the WikiCorpus class passes not only its own tests but also the one's defined for the TextCorpus class (since WikiCorpus inherits from TextCorpus). Else we can inherit from CorpusTestCase or even from unittest.Testcase

def setUp(self):
self.corpus_class = wikicorpus.WikiCorpus
self.file_extension = '.xml.bz2'
self.fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))

def test_default_preprocessing(self):
expected = ['computer', 'human', 'interface']
corpus = self.corpus_class(self.fname, article_min_tokens=0)
first_text = next(corpus.get_texts())
self.assertEqual(expected, first_text)

def test_len(self):

def test_with_limit(article_min_tokens, expected_articles):
corpus = self.corpus_class(self.fname, article_min_tokens=article_min_tokens)
all_articles = corpus.get_texts()
assert (len(list(all_articles)) == expected_articles)

test_with_limit(0, 9)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

better to make it in "cycle" (instead of defining test in test)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok I will change that

test_with_limit(100000, 0)

def test_load_with_metadata(self):
corpus = self.corpus_class(self.fname, article_min_tokens=0)
corpus.metadata = True
self.assertEqual(len(corpus), 9)

docs = list(corpus)
self.assertEqual(len(docs), 9)

for i, docmeta in enumerate(docs):
doc, metadata = docmeta
article_no = i + 1 # Counting IDs from 1
self.assertEqual(metadata[0], str(article_no))
self.assertEqual(metadata[1], 'Article%d' % article_no)

def test_load(self):
corpus = self.corpus_class(self.fname, article_min_tokens=0)

docs = list(corpus)
# the deerwester corpus always has nine documents
self.assertEqual(len(docs), 9)

def test_empty_input(self):
tmpf = get_tmpfile('emptycorpus.xml.bz2')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better use with temporary_file('emptycorpus.xml.bz2'): ... from gensim.test.utils here (because we have auto-cleanup in this function.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to follow the same style as in the base test classes but you are right, context managers are safer.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have a lot of "dirty" code in the codebase, cleanup slow but go forward.

content = bz2.compress(b'') # Explicit string to byte conversion needed in python 3
fh = open(tmpf, "wb")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of open / close please use with open(...)

fh.write(content)
fh.close()

with self.assertRaises(ParseError):
corpus = self.corpus_class(tmpf)
del corpus # Needed to supress tox warning

def test_sample_text(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably better to skip this test (not silently pass)., what do you think @steremma?

Copy link
Contributor Author

@steremma steremma Jan 11, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is that this test overrides the one defined in TestTextCorpus. If we just skip it the parent definition will be called and it will fail because we plain text is not legit XML (nor is it compressed). In that sense passing it silently practically ignores it. The same idea is followed in the tests:

    def test_save(self):
        pass

    def test_serialize(self):
        pass

    def test_serialize_compressed(self):
        pass

    def test_indexing(self):
        pass

of TestTextCorpus for these 4 tests defined in the parent class CorpusTestCase. So I think its better to keep it as it is.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Best variant - change class hierarchy and interfaces, we should'nt give "useless" methods from parent class (change from TestWikiCorpus -> TestTextCorpus to
TestTextCorpus -> BaseTestTextCorpus <- TestWikiCorpus), but right now this isn't really needed, stay current variant with "pass".

# Cannot instantiate WikiCorpus from lines
pass

def test_sample_text_length(self):
# Cannot instantiate WikiCorpus from lines
pass

def test_sample_text_seed(self):
# Cannot instantiate WikiCorpus from lines
pass


class TestTextDirectoryCorpus(unittest.TestCase):

def write_one_level(self, *args):
Expand Down
Binary file added gensim/test/test_data/testcorpus.xml.bz2
Binary file not shown.