-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Test and refactor WikiCorpus #1821
Changes from 4 commits
574134e
952e8d5
7ddce6c
836c3c2
43a48f5
8b7a1d5
eeea748
b5976c4
78f2870
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
# | ||
# Copyright (C) 2010 Radim Rehurek <[email protected]> | ||
# Copyright (C) 2012 Lars Buitinck <[email protected]> | ||
# Copyright (C) 2018 Emmanouil Stergiadis <[email protected]> | ||
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html | ||
|
||
|
||
|
@@ -56,8 +57,8 @@ | |
RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting | ||
RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting | ||
RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) # categories | ||
# Remove File and Image template | ||
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) | ||
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) # Remove File and Image template | ||
|
||
|
||
# MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that | ||
# ought to be ignored | ||
|
@@ -332,19 +333,15 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction | |
self.token_min_len = token_min_len | ||
self.token_max_len = token_max_len | ||
self.lower = lower | ||
|
||
if dictionary is None: | ||
self.dictionary = Dictionary(self.get_texts()) | ||
else: | ||
self.dictionary = dictionary | ||
self.dictionary = dictionary or Dictionary(self.get_texts()) | ||
|
||
def get_texts(self): | ||
""" | ||
Iterate over the dump, returning text version of each article as a list | ||
of tokens. | ||
|
||
Only articles of sufficient length are returned (short articles & redirects | ||
etc are ignored). This is control by `article_min_tokens` on the class instance. | ||
etc are ignored). This is controlled by `article_min_tokens` on the class instance. | ||
|
||
Note that this iterates over the **texts**; if you want vectors, just use | ||
the standard corpus interface instead of this function:: | ||
|
@@ -380,6 +377,7 @@ def get_texts(self): | |
yield (tokens, (pageid, title)) | ||
else: | ||
yield tokens | ||
|
||
except KeyboardInterrupt: | ||
logger.warn( | ||
"user terminated iteration over Wikipedia corpus after %i documents with %i positions " | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,16 +11,18 @@ | |
from __future__ import unicode_literals | ||
|
||
import codecs | ||
import bz2 | ||
import itertools | ||
import logging | ||
import os.path | ||
import tempfile | ||
import unittest | ||
|
||
import numpy as np | ||
from xml.etree.cElementTree import ParseError | ||
|
||
from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus, | ||
ucicorpus, malletcorpus, textcorpus, indexedcorpus) | ||
ucicorpus, malletcorpus, textcorpus, indexedcorpus, wikicorpus) | ||
from gensim.interfaces import TransformedCorpus | ||
from gensim.utils import to_unicode | ||
from gensim.test.utils import datapath, get_tmpfile | ||
|
@@ -400,6 +402,73 @@ def test_indexing(self): | |
pass | ||
|
||
|
||
class TestWikiCorpus(TestTextCorpus): | ||
def setUp(self): | ||
self.corpus_class = wikicorpus.WikiCorpus | ||
self.file_extension = '.xml.bz2' | ||
self.fname = datapath('testcorpus.' + self.file_extension.lstrip('.')) | ||
|
||
def test_default_preprocessing(self): | ||
expected = ['computer', 'human', 'interface'] | ||
corpus = self.corpus_class(self.fname, article_min_tokens=0) | ||
first_text = next(corpus.get_texts()) | ||
self.assertEqual(expected, first_text) | ||
|
||
def test_len(self): | ||
|
||
def test_with_limit(article_min_tokens, expected_articles): | ||
corpus = self.corpus_class(self.fname, article_min_tokens=article_min_tokens) | ||
all_articles = corpus.get_texts() | ||
assert (len(list(all_articles)) == expected_articles) | ||
|
||
test_with_limit(0, 9) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. better to make it in "cycle" (instead of defining test in test) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok I will change that |
||
test_with_limit(100000, 0) | ||
|
||
def test_load_with_metadata(self): | ||
corpus = self.corpus_class(self.fname, article_min_tokens=0) | ||
corpus.metadata = True | ||
self.assertEqual(len(corpus), 9) | ||
|
||
docs = list(corpus) | ||
self.assertEqual(len(docs), 9) | ||
|
||
for i, docmeta in enumerate(docs): | ||
doc, metadata = docmeta | ||
article_no = i + 1 # Counting IDs from 1 | ||
self.assertEqual(metadata[0], str(article_no)) | ||
self.assertEqual(metadata[1], 'Article%d' % article_no) | ||
|
||
def test_load(self): | ||
corpus = self.corpus_class(self.fname, article_min_tokens=0) | ||
|
||
docs = list(corpus) | ||
# the deerwester corpus always has nine documents | ||
self.assertEqual(len(docs), 9) | ||
|
||
def test_empty_input(self): | ||
tmpf = get_tmpfile('emptycorpus.xml.bz2') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried to follow the same style as in the base test classes but you are right, context managers are safer. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have a lot of "dirty" code in the codebase, cleanup slow but go forward. |
||
content = bz2.compress(b'') # Explicit string to byte conversion needed in python 3 | ||
fh = open(tmpf, "wb") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of |
||
fh.write(content) | ||
fh.close() | ||
|
||
with self.assertRaises(ParseError): | ||
corpus = self.corpus_class(tmpf) | ||
del corpus # Needed to supress tox warning | ||
|
||
def test_sample_text(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably better to skip this test (not silently pass)., what do you think @steremma? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The problem is that this test overrides the one defined in
of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Best variant - change class hierarchy and interfaces, we should'nt give "useless" methods from parent class (change from |
||
# Cannot instantiate WikiCorpus from lines | ||
pass | ||
|
||
def test_sample_text_length(self): | ||
# Cannot instantiate WikiCorpus from lines | ||
pass | ||
|
||
def test_sample_text_seed(self): | ||
# Cannot instantiate WikiCorpus from lines | ||
pass | ||
|
||
|
||
class TestTextDirectoryCorpus(unittest.TestCase): | ||
|
||
def write_one_level(self, *args): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One of your tests "stuck" in Travis, please check, what's a problem and fix https://travis-ci.org/RaRe-Technologies/gensim/jobs/322641221#L828
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, question: why you need
TextTextCorpus
here?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am inheriting so that we make sure the
WikiCorpus
class passes not only its own tests but also the one's defined for theTextCorpus
class (sinceWikiCorpus
inherits fromTextCorpus
). Else we can inherit fromCorpusTestCase
or even fromunittest.Testcase