-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Test and refactor WikiCorpus #1821
Changes from 8 commits
574134e
952e8d5
7ddce6c
836c3c2
43a48f5
8b7a1d5
eeea748
b5976c4
78f2870
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
# | ||
# Copyright (C) 2010 Radim Rehurek <[email protected]> | ||
# Copyright (C) 2012 Lars Buitinck <[email protected]> | ||
# Copyright (C) 2018 Emmanouil Stergiadis <[email protected]> | ||
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html | ||
|
||
|
||
|
@@ -56,8 +57,8 @@ | |
RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting | ||
RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting | ||
RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) # categories | ||
# Remove File and Image template | ||
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) | ||
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) # Remove File and Image template | ||
|
||
|
||
# MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that | ||
# ought to be ignored | ||
|
@@ -332,19 +333,15 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction | |
self.token_min_len = token_min_len | ||
self.token_max_len = token_max_len | ||
self.lower = lower | ||
|
||
if dictionary is None: | ||
self.dictionary = Dictionary(self.get_texts()) | ||
else: | ||
self.dictionary = dictionary | ||
self.dictionary = dictionary or Dictionary(self.get_texts()) | ||
|
||
def get_texts(self): | ||
""" | ||
Iterate over the dump, returning text version of each article as a list | ||
of tokens. | ||
|
||
Only articles of sufficient length are returned (short articles & redirects | ||
etc are ignored). This is control by `article_min_tokens` on the class instance. | ||
etc are ignored). This is controlled by `article_min_tokens` on the class instance. | ||
|
||
Note that this iterates over the **texts**; if you want vectors, just use | ||
the standard corpus interface instead of this function:: | ||
|
@@ -380,6 +377,7 @@ def get_texts(self): | |
yield (tokens, (pageid, title)) | ||
else: | ||
yield tokens | ||
|
||
except KeyboardInterrupt: | ||
logger.warn( | ||
"user terminated iteration over Wikipedia corpus after %i documents with %i positions " | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,7 @@ | |
import numpy as np | ||
|
||
from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus, | ||
ucicorpus, malletcorpus, textcorpus, indexedcorpus) | ||
ucicorpus, malletcorpus, textcorpus, indexedcorpus, wikicorpus) | ||
from gensim.interfaces import TransformedCorpus | ||
from gensim.utils import to_unicode | ||
from gensim.test.utils import datapath, get_tmpfile | ||
|
@@ -400,6 +400,137 @@ def test_indexing(self): | |
pass | ||
|
||
|
||
class TestWikiCorpus(TestTextCorpus): | ||
def setUp(self): | ||
self.corpus_class = wikicorpus.WikiCorpus | ||
self.file_extension = '.xml.bz2' | ||
self.fname = datapath('testcorpus.' + self.file_extension.lstrip('.')) | ||
self.enwiki = datapath('enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2') | ||
|
||
def test_default_preprocessing(self): | ||
expected = ['computer', 'human', 'interface'] | ||
corpus = self.corpus_class(self.fname, article_min_tokens=0) | ||
first_text = next(corpus.get_texts()) | ||
self.assertEqual(expected, first_text) | ||
|
||
def test_len(self): | ||
# When there is no min_token limit all 9 articles must be registered. | ||
corpus = self.corpus_class(self.fname, article_min_tokens=0) | ||
all_articles = corpus.get_texts() | ||
assert (len(list(all_articles)) == 9) | ||
|
||
# With a huge min_token limit, all articles should be filtered out. | ||
corpus = self.corpus_class(self.fname, article_min_tokens=100000) | ||
all_articles = corpus.get_texts() | ||
assert (len(list(all_articles)) == 0) | ||
|
||
def test_load_with_metadata(self): | ||
corpus = self.corpus_class(self.fname, article_min_tokens=0) | ||
corpus.metadata = True | ||
self.assertEqual(len(corpus), 9) | ||
|
||
docs = list(corpus) | ||
self.assertEqual(len(docs), 9) | ||
|
||
for i, docmeta in enumerate(docs): | ||
doc, metadata = docmeta | ||
article_no = i + 1 # Counting IDs from 1 | ||
self.assertEqual(metadata[0], str(article_no)) | ||
self.assertEqual(metadata[1], 'Article%d' % article_no) | ||
|
||
def test_load(self): | ||
corpus = self.corpus_class(self.fname, article_min_tokens=0) | ||
|
||
docs = list(corpus) | ||
# the deerwester corpus always has nine documents | ||
self.assertEqual(len(docs), 9) | ||
|
||
def test_unicode_element(self): | ||
""" | ||
First unicode article in this sample is | ||
1) папа | ||
""" | ||
bgwiki = datapath('bgwiki-latest-pages-articles-shortened.xml.bz2') | ||
corpus = self.corpus_class(bgwiki) | ||
texts = corpus.get_texts() | ||
self.assertTrue(u'папа' in next(texts)) | ||
|
||
def test_lower_case_set_true(self): | ||
""" | ||
Set the parameter lower to True and check that upper case 'Anarchism' token doesn't exist | ||
""" | ||
corpus = self.corpus_class(self.enwiki, processes=1, lower=True, lemmatize=False) | ||
row = corpus.get_texts() | ||
list_tokens = next(row) | ||
self.assertTrue(u'Anarchism' not in list_tokens) | ||
self.assertTrue(u'anarchism' in list_tokens) | ||
|
||
def test_lower_case_set_false(self): | ||
""" | ||
Set the parameter lower to False and check that upper case 'Anarchism' token exists | ||
""" | ||
corpus = self.corpus_class(self.enwiki, processes=1, lower=False, lemmatize=False) | ||
row = corpus.get_texts() | ||
list_tokens = next(row) | ||
self.assertTrue(u'Anarchism' in list_tokens) | ||
self.assertTrue(u'anarchism' in list_tokens) | ||
|
||
def test_min_token_len_not_set(self): | ||
""" | ||
Don't set the parameter token_min_len and check that 'a' as a token doesn't exist | ||
Default token_min_len=2 | ||
""" | ||
corpus = self.corpus_class(self.enwiki, processes=1, lemmatize=False) | ||
self.assertTrue(u'a' not in next(corpus.get_texts())) | ||
|
||
def test_min_token_len_set(self): | ||
""" | ||
Set the parameter token_min_len to 1 and check that 'a' as a token exists | ||
""" | ||
corpus = self.corpus_class(self.enwiki, processes=1, token_min_len=1, lemmatize=False) | ||
self.assertTrue(u'a' in next(corpus.get_texts())) | ||
|
||
def test_max_token_len_not_set(self): | ||
""" | ||
Don't set the parameter token_max_len and check that 'collectivization' as a token doesn't exist | ||
Default token_max_len=15 | ||
""" | ||
corpus = self.corpus_class(self.enwiki, processes=1, lemmatize=False) | ||
self.assertTrue(u'collectivization' not in next(corpus.get_texts())) | ||
|
||
def test_max_token_len_set(self): | ||
""" | ||
Set the parameter token_max_len to 16 and check that 'collectivization' as a token exists | ||
""" | ||
corpus = self.corpus_class(self.enwiki, processes=1, token_max_len=16, lemmatize=False) | ||
self.assertTrue(u'collectivization' in next(corpus.get_texts())) | ||
|
||
# TODO: sporadic failure to be investigated | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's a problem here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This one exists commented on test_wikicorpus, I don't know why. So I decided to keep it in case we want to look into it in the future. The comment says that the test some times fails unpredictably and indeed it fails on my machine. Do we keep it as comment or completely delete it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At least - leave in place. |
||
# def test_get_texts_returns_generator_of_lists(self): | ||
# | ||
# corpus = self.corpus_class(self.fname) | ||
# l = corpus.get_texts() | ||
# self.assertEqual(type(l), types.GeneratorType) | ||
# first = next(l) | ||
# self.assertEqual(type(first), list) | ||
# self.assertTrue(isinstance(first[0], bytes) or isinstance(first[0], str)) | ||
|
||
def test_sample_text(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably better to skip this test (not silently pass)., what do you think @steremma? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The problem is that this test overrides the one defined in
of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Best variant - change class hierarchy and interfaces, we shouldn't give "useless" methods from parent class (change from |
||
# Cannot instantiate WikiCorpus from lines | ||
pass | ||
|
||
def test_sample_text_length(self): | ||
# Cannot instantiate WikiCorpus from lines | ||
pass | ||
|
||
def test_sample_text_seed(self): | ||
# Cannot instantiate WikiCorpus from lines | ||
pass | ||
|
||
def test_empty_input(self): | ||
pass | ||
|
||
|
||
class TestTextDirectoryCorpus(unittest.TestCase): | ||
|
||
def write_one_level(self, *args): | ||
|
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One of your tests "stuck" in Travis, please check, what's a problem and fix https://travis-ci.org/RaRe-Technologies/gensim/jobs/322641221#L828
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, question: why you need
TestTextCorpus
here?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am inheriting so that we make sure the
WikiCorpus
class passes not only its own tests but also the one's defined for theTextCorpus
class (sinceWikiCorpus
inherits fromTextCorpus
). Else we can inherit fromCorpusTestCase
or even fromunittest.Testcase