From 574134e1a49411375d9a599079231934506cd867 Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Tue, 26 Dec 2017 03:10:29 +0200
Subject: [PATCH 1/8] minor style refactoring and comment fixes in accordance
 with PEP8

---
 gensim/corpora/wikicorpus.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 0c1c229bac..7148b90884 100755
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -3,6 +3,7 @@
 #
 # Copyright (C) 2010 Radim Rehurek
 # Copyright (C) 2012 Lars Buitinck
+# Copyright (C) 2018 Emmanouil Stergiadis
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

@@ -56,8 +57,8 @@
 RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE)  # table formatting
 RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE)  # table cell formatting
 RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE)  # categories
-# Remove File and Image template
-RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
+RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)  # Remove File and Image template
+
 # MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that
 # ought to be ignored
@@ -332,11 +333,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         self.token_min_len = token_min_len
         self.token_max_len = token_max_len
         self.lower = lower
-
-        if dictionary is None:
-            self.dictionary = Dictionary(self.get_texts())
-        else:
-            self.dictionary = dictionary
+        self.dictionary = dictionary or Dictionary(self.get_texts())

     def get_texts(self):
         """
@@ -344,7 +341,7 @@ def get_texts(self):
         of tokens.

         Only articles of sufficient length are returned (short articles & redirects
-        etc are ignored). This is control by `article_min_tokens` on the class instance.
+        etc are ignored). This is controlled by `article_min_tokens` on the class instance.

         Note that this iterates over the **texts**; if you want vectors, just use
         the standard corpus interface instead of this function::
@@ -380,6 +377,7 @@ def get_texts(self):
                     yield (tokens, (pageid, title))
                 else:
                     yield tokens
+
         except KeyboardInterrupt:
             logger.warn(
                 "user terminated iteration over Wikipedia corpus after %i documents with %i positions "

From 952e8d5011a100bb30da98a8c631a7e3b63cc19e Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Tue, 26 Dec 2017 03:10:30 +0200
Subject: [PATCH 2/8] Created test data in legitimate compressed XML format
 (.xml.bz2) for the WikiCorpus class.

* Used the same raw data as the other sources (9 articles).
* Added various wiki markup to test the parsing regular expressions
---
 gensim/test/test_data/testcorpus.xml.bz2 | Bin 0 -> 1404 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 gensim/test/test_data/testcorpus.xml.bz2

diff --git a/gensim/test/test_data/testcorpus.xml.bz2 b/gensim/test/test_data/testcorpus.xml.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..064b9ad1e9c9704d9e1204a4c4691f8561a400bd
GIT binary patch
literal 1404
zcmV-?1%vuRT4*^jL0KkKSqzed&j1DL-+)vyQHTHkKgM3#zyJUAU4Gj$#fso@SO*Fz}WDE$z(Sk68
MwkE)<0efs!enF&2*lBXFoQ;z01)FQO*Fz}WDE$z(Sk68MwkE)DoB$EO#?=a2xT;7
zGb#F0LrjlQdYeh36Bz<(Lm3e#&-!k&=Tp-HOoS6EkR?Qf*$PENmp)I9F(u*he803X
z!x;bw2*R^}EGr^u$&i2u2Xv7g3{{T|CXBQdIuIQxg;0fZNk3qSL=atevLnwj${+}(
ztdyV(!vF#^THHEPFA%78@}kN}xJ*8abXoO>PrWTv_GF3?9e>r*2UOt=OE5h{({_w%&4;X!r^ux*;
zHf_zn3-@yD$`+VL9FVm7Axxp9Emwtoyq>QFKR9QO-8g3qA<2Q=454hHhTd3LR-BpZ
z)6t2-86zvh9BLG(c}UI}>xNc_ta%}vI73xK9(ZfZ98s#!!wXW-!wz_C*N$BgfT|#H(Q&htFN=Bp`1MVVgEZiJ<6v|
zYOcEX()I1Gx_b1sb&W06_VI_V9Z(jTlw~xXbo#<6UlA%-iLcLcc
zSsbvmEeeILWGWa#jN?G-eRTrZ@47&M~LsFki2DjQ+_A-*4lKg=lerwCWhquEdO^!)f>
zpE!P2k)zHZjsBQa;Hmy$P0-sdGTHl~sT#@qT7CzYk52yBYK9&ftLj753rp1fVf|PH
zbHnC&KI{t9zq=jXq-KB+XR6kq
zR5rG?L(0<{!WLRJ~zLK#Eo2<+^IK~VGPKVIq8qc^5~Q1?TZ(B{eHUJ;=mqoywUp_Kkxklznmd^(|<
zWM+moLg;_EcCNo#3ncsDcyuoD&4=*4m+

From: Manos Stergiadis
Date: Tue, 26 Dec 2017 03:10:31 +0200
Subject: [PATCH 3/8] Added test class for the WikiCorpus source.

* Following the same inheritance schema as in the source: TestWikiCorpus >
  TestTextCorpus > CorpusTestCase.
* Testing methods are overridden where necessary to reflect logic changes.
* All existing functionality is tested (markup handling, minimum article
  length, etc.)
---
 gensim/test/test_corpora.py | 71 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 70 insertions(+), 1 deletion(-)

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 4ddc16e0cf..33f28dfbd7 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -11,6 +11,7 @@
 from __future__ import unicode_literals

 import codecs
+import bz2
 import itertools
 import logging
 import os.path
@@ -18,9 +19,10 @@
 import unittest

 import numpy as np
+from xml.etree.cElementTree import ParseError

 from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
-                            ucicorpus, malletcorpus, textcorpus, indexedcorpus)
+                            ucicorpus, malletcorpus, textcorpus, indexedcorpus, wikicorpus)
 from gensim.interfaces import TransformedCorpus
 from gensim.utils import to_unicode
 from gensim.test.utils import datapath, get_tmpfile
@@ -400,6 +402,73 @@ def test_indexing(self):
         pass


+class TestWikiCorpus(TestTextCorpus):
+    def setUp(self):
+        self.corpus_class = wikicorpus.WikiCorpus
+        self.file_extension = '.xml.bz2'
+        self.fname = datapath('testcorpus.'
+                              + self.file_extension.lstrip('.'))
+
+    def test_default_preprocessing(self):
+        expected = ['computer', 'human', 'interface']
+        corpus = self.corpus_class(self.fname, article_min_tokens=0)
+        first_text = corpus.get_texts().next()
+        self.assertEqual(expected, first_text)
+
+    def test_len(self):
+
+        def test_with_limit(article_min_tokens, expected_articles):
+            corpus = self.corpus_class(self.fname, article_min_tokens=article_min_tokens)
+            all_articles = corpus.get_texts()
+            assert (len(list(all_articles)) == expected_articles)
+
+        test_with_limit(0, 9)
+        test_with_limit(100000, 0)
+
+    def test_load_with_metadata(self):
+        corpus = self.corpus_class(self.fname, article_min_tokens=0)
+        corpus.metadata = True
+        self.assertEqual(len(corpus), 9)
+
+        docs = list(corpus)
+        self.assertEqual(len(docs), 9)
+
+        for i, docmeta in enumerate(docs):
+            doc, metadata = docmeta
+            article_no = i + 1  # Counting IDs from 1
+            self.assertEqual(metadata[0], str(article_no))
+            self.assertEqual(metadata[1], 'Article%d' % article_no)
+
+    def test_load(self):
+        corpus = self.corpus_class(self.fname, article_min_tokens=0)
+
+        docs = list(corpus)
+        # the deerwester corpus always has nine documents
+        self.assertEqual(len(docs), 9)
+
+    def test_empty_input(self):
+        tmpf = get_tmpfile('emptycorpus.xml.bz2')
+        content = bz2.compress(b'')  # Explicit string to byte conversion needed in Python 3
+        fh = open(tmpf, "wb")
+        fh.write(content)
+        fh.close()
+
+        with self.assertRaises(ParseError):
+            corpus = self.corpus_class(tmpf)
+            del corpus  # Needed to suppress tox warning
+
+    def test_sample_text(self):
+        # Cannot instantiate WikiCorpus from lines
+        pass
+
+    def test_sample_text_length(self):
+        # Cannot instantiate WikiCorpus from lines
+        pass
+
+    def test_sample_text_seed(self):
+        # Cannot instantiate WikiCorpus from lines
+        pass
+
+
 class TestTextDirectoryCorpus(unittest.TestCase):

     def write_one_level(self, *args):

From 836c3c2431ff07bd1661e551f2f940a9d2b0fd69 Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Thu, 28 Dec 2017 21:16:26 +0200
Subject: [PATCH 4/8] Fix Python 3 compatibility for the generator next method

---
 gensim/test/test_corpora.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 33f28dfbd7..bc279de1ed 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -411,7 +411,7 @@ def setUp(self):
     def test_default_preprocessing(self):
         expected = ['computer', 'human', 'interface']
         corpus = self.corpus_class(self.fname, article_min_tokens=0)
-        first_text = corpus.get_texts().next()
+        first_text = next(corpus.get_texts())
         self.assertEqual(expected, first_text)

     def test_len(self):

From 43a48f5f9abde0ceba96636e9efd1347a0265789 Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Sat, 30 Dec 2017 20:26:55 +0200
Subject: [PATCH 5/8] Code review corrections

---
 gensim/test/test_corpora.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index bc279de1ed..c3433f8195 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -415,14 +415,15 @@ def test_default_preprocessing(self):
         self.assertEqual(expected, first_text)

     def test_len(self):
+        # When there is no min_token limit, all 9 articles must be registered.
+        corpus = self.corpus_class(self.fname, article_min_tokens=0)
+        all_articles = corpus.get_texts()
+        assert (len(list(all_articles)) == 9)

-        def test_with_limit(article_min_tokens, expected_articles):
-            corpus = self.corpus_class(self.fname, article_min_tokens=article_min_tokens)
-            all_articles = corpus.get_texts()
-            assert (len(list(all_articles)) == expected_articles)
-
-        test_with_limit(0, 9)
-        test_with_limit(100000, 0)
+        # With a huge min_token limit, all articles should be filtered out.
+        corpus = self.corpus_class(self.fname, article_min_tokens=100000)
+        all_articles = corpus.get_texts()
+        assert (len(list(all_articles)) == 0)

     def test_load_with_metadata(self):
         corpus = self.corpus_class(self.fname, article_min_tokens=0)
@@ -446,11 +447,14 @@ def test_load(self):
         self.assertEqual(len(docs), 9)

     def test_empty_input(self):
+        """
+        Empty compressed input raises ParseError
+        """
         tmpf = get_tmpfile('emptycorpus.xml.bz2')
-        content = bz2.compress(b'')  # Explicit string to byte conversion needed in Python 3
-        fh = open(tmpf, "wb")
-        fh.write(content)
-        fh.close()
+        content = bz2.compress(''.encode())  # Explicit string to byte conversion needed in Python 3
+
+        with open(tmpf, "wb") as fh:
+            fh.write(content)

         with self.assertRaises(ParseError):
             corpus = self.corpus_class(tmpf)

From 8b7a1d585001a17fcff32f41570ebafb400ab354 Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Sat, 30 Dec 2017 20:29:39 +0200
Subject: [PATCH 6/8] Moved WikiCorpus tests from test/test_wikicorpus.py into
 their own class within test_corpora.py.

* Adapted all old tests to the new class.
* The current test class schema ensures that WikiCorpus also passes the tests
  defined in its parents.
* Deleted test_wikicorpus.py since it is now redundant.
---
 gensim/test/test_corpora.py    |  72 +++++++++++++++++-
 gensim/test/test_wikicorpus.py | 135 ---------------------------------
 2 files changed, 71 insertions(+), 136 deletions(-)
 delete mode 100644 gensim/test/test_wikicorpus.py

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index c3433f8195..20bf28385f 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -401,12 +401,12 @@ def test_serialize_compressed(self):
     def test_indexing(self):
         pass

-
 class TestWikiCorpus(TestTextCorpus):
     def setUp(self):
         self.corpus_class = wikicorpus.WikiCorpus
         self.file_extension = '.xml.bz2'
         self.fname = datapath('testcorpus.'
                               + self.file_extension.lstrip('.'))
+        self.enwiki = datapath('enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2')

     def test_default_preprocessing(self):
         expected = ['computer', 'human', 'interface']
@@ -460,6 +460,76 @@ def test_empty_input(self):
             corpus = self.corpus_class(tmpf)
             del corpus  # Needed to suppress tox warning

+    def test_unicode_element(self):
+        """
+        First unicode article in this sample is
+        1) папа
+        """
+        bgwiki = datapath('bgwiki-latest-pages-articles-shortened.xml.bz2')
+        corpus = self.corpus_class(bgwiki)
+        texts = corpus.get_texts()
+        self.assertTrue(u'папа' in next(texts))
+
+    def test_lower_case_set_true(self):
+        """
+        Set the parameter lower to True and check that the upper case 'Anarchism' token doesn't exist
+        """
+        corpus = self.corpus_class(self.enwiki, processes=1, lower=True, lemmatize=False)
+        row = corpus.get_texts()
+        list_tokens = next(row)
+        self.assertTrue(u'Anarchism' not in list_tokens)
+        self.assertTrue(u'anarchism' in list_tokens)
+
+    def test_lower_case_set_false(self):
+        """
+        Set the parameter lower to False and check that the upper case 'Anarchism' token exists
+        """
+        corpus = self.corpus_class(self.enwiki, processes=1, lower=False, lemmatize=False)
+        row = corpus.get_texts()
+        list_tokens = next(row)
+        self.assertTrue(u'Anarchism' in list_tokens)
+        self.assertTrue(u'anarchism' in list_tokens)
+
+    def test_min_token_len_not_set(self):
+        """
+        Don't set the parameter token_min_len and check that 'a' as a token doesn't exist.
+        Default token_min_len=2
+        """
+        corpus = self.corpus_class(self.enwiki, processes=1, lemmatize=False)
+        self.assertTrue(u'a' not in next(corpus.get_texts()))
+
+    def test_min_token_len_set(self):
+        """
+        Set the parameter token_min_len to 1 and check that 'a' as a token exists
+        """
+        corpus = self.corpus_class(self.enwiki, processes=1, token_min_len=1, lemmatize=False)
+        self.assertTrue(u'a' in next(corpus.get_texts()))
+
+    def test_max_token_len_not_set(self):
+        """
+        Don't set the parameter token_max_len and check that 'collectivization' as a token doesn't exist.
+        Default token_max_len=15
+        """
+        corpus = self.corpus_class(self.enwiki, processes=1, lemmatize=False)
+        self.assertTrue(u'collectivization' not in next(corpus.get_texts()))
+
+    def test_max_token_len_set(self):
+        """
+        Set the parameter token_max_len to 16 and check that 'collectivization' as a token exists
+        """
+        corpus = self.corpus_class(self.enwiki, processes=1, token_max_len=16, lemmatize=False)
+        self.assertTrue(u'collectivization' in next(corpus.get_texts()))
+
+    # TODO: sporadic failure to be investigated
+    # def test_get_texts_returns_generator_of_lists(self):
+    #
+    #     corpus = self.corpus_class(self.fname)
+    #     l = corpus.get_texts()
+    #     self.assertEqual(type(l), types.GeneratorType)
+    #     first = next(l)
+    #     self.assertEqual(type(first), list)
+    #     self.assertTrue(isinstance(first[0], bytes) or isinstance(first[0], str))
+
     def test_sample_text(self):
         # Cannot instantiate WikiCorpus from lines
         pass
diff --git a/gensim/test/test_wikicorpus.py b/gensim/test/test_wikicorpus.py
deleted file mode 100644
index e7b7b14011..0000000000
--- a/gensim/test/test_wikicorpus.py
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2010 Radim Rehurek
-# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
-
-"""
-Automated tests for checking the WikiCorpus
-"""
-
-
-import logging
-import unittest
-
-from gensim.corpora.wikicorpus import WikiCorpus
-from gensim import utils
-from gensim.test.utils
import datapath - -FILENAME = 'enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2' -FILENAME_U = 'bgwiki-latest-pages-articles-shortened.xml.bz2' - -logger = logging.getLogger(__name__) - - -def custom_tokeiner(content, token_min_len=2, token_max_len=15, lower=True): - return [ - utils.to_unicode(token.lower()) if lower else utils.to_unicode(token) for token in content.split() - if token_min_len <= len(token) <= token_max_len and not token.startswith('_') - ] - - -class TestWikiCorpus(unittest.TestCase): - - # #TODO: sporadic failure to be investigated - # def test_get_texts_returns_generator_of_lists(self): - # logger.debug("Current Python Version is %s", str(sys.version_info)) - # if sys.version_info < (2, 7, 0): - # return - # - # wc = WikiCorpus(datapath(FILENAME)) - # l = wc.get_texts() - # self.assertEqual(type(l), types.GeneratorType) - # first = next(l) - # self.assertEqual(type(first), list) - # self.assertTrue(isinstance(first[0], bytes) or isinstance(first[0], str)) - - def test_first_element(self): - """ - First two articles in this sample are - 1) anarchism - 2) autism - """ - wc = WikiCorpus(datapath(FILENAME), processes=1) - - texts = wc.get_texts() - self.assertTrue(u'anarchism' in next(texts)) - self.assertTrue(u'autism' in next(texts)) - - def test_unicode_element(self): - """ - First unicode article in this sample is - 1) папа - """ - wc = WikiCorpus(datapath(FILENAME_U), processes=1) - - texts = wc.get_texts() - self.assertTrue(u'папа' in next(texts)) - - def test_lower_case_set_true(self): - """ - set the parameter lower to True and check that upper case 'Anarchism' token doesnt exist - """ - wc = WikiCorpus(datapath(FILENAME), processes=1, lower=True, lemmatize=False) - row = wc.get_texts() - list_tokens = next(row) - self.assertTrue(u'Anarchism' not in list_tokens) - self.assertTrue(u'anarchism' in list_tokens) - - def test_lower_case_set_false(self): - """ - set the parameter lower to False and check that upper case Anarchism' token exist - """ - wc = WikiCorpus(datapath(FILENAME), processes=1, lower=False, lemmatize=False) - row = wc.get_texts() - list_tokens = next(row) - self.assertTrue(u'Anarchism' in list_tokens) - self.assertTrue(u'anarchism' in list_tokens) - - def test_min_token_len_not_set(self): - """ - don't set the parameter token_min_len and check that 'a' as a token doesn't exists - default token_min_len=2 - """ - wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False) - self.assertTrue(u'a' not in next(wc.get_texts())) - - def test_min_token_len_set(self): - """ - set the parameter token_min_len to 1 and check that 'a' as a token exists - """ - wc = WikiCorpus(datapath(FILENAME), processes=1, token_min_len=1, lemmatize=False) - self.assertTrue(u'a' in next(wc.get_texts())) - - def test_max_token_len_not_set(self): - """ - don't set the parameter token_max_len and check that 'collectivisation' as a token doesn't exists - default token_max_len=15 - """ - wc = WikiCorpus(datapath(FILENAME), processes=1, lemmatize=False) - self.assertTrue(u'collectivization' not in next(wc.get_texts())) - - def test_max_token_len_set(self): - """ - set the parameter token_max_len to 16 and check that 'collectivisation' as a token exists - """ - wc = WikiCorpus(datapath(FILENAME), processes=1, token_max_len=16, lemmatize=False) - self.assertTrue(u'collectivization' in next(wc.get_texts())) - - def test_custom_tokenizer(self): - """ - define a custom tokenizer function and use it - """ - wc = WikiCorpus(datapath(FILENAME), processes=1, 
lemmatize=False, tokenizer_func=custom_tokeiner,
-                        token_max_len=16, token_min_len=1, lower=False)
-        row = wc.get_texts()
-        list_tokens = next(row)
-        self.assertTrue(u'Anarchism' in list_tokens)
-        self.assertTrue(u'collectivization' in list_tokens)
-        self.assertTrue(u'a' in list_tokens)
-        self.assertTrue(u'i.e.' in list_tokens)
-
-
-if __name__ == '__main__':
-    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
-    unittest.main()

From b5976c4f95ae3ea8d084946943ba381837c92bad Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Wed, 10 Jan 2018 21:20:10 +0100
Subject: [PATCH 7/8] Discarded the empty input test for the WikiCorpus since
 an empty file is not legitimate XML

---
 gensim/test/test_corpora.py | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 20bf28385f..2d4b369831 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -11,7 +11,6 @@
 from __future__ import unicode_literals

 import codecs
-import bz2
 import itertools
 import logging
 import os.path
@@ -19,7 +18,6 @@
 import unittest

 import numpy as np
-from xml.etree.cElementTree import ParseError

 from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
                             ucicorpus, malletcorpus, textcorpus, indexedcorpus, wikicorpus)
@@ -401,6 +399,7 @@ def test_serialize_compressed(self):
     def test_indexing(self):
         pass

+
 class TestWikiCorpus(TestTextCorpus):
     def setUp(self):
         self.corpus_class = wikicorpus.WikiCorpus
@@ -446,20 +445,6 @@ def test_load(self):
         # the deerwester corpus always has nine documents
         self.assertEqual(len(docs), 9)

-    def test_empty_input(self):
-        """
-        Empty compressed input raises ParseError
-        """
-        tmpf = get_tmpfile('emptycorpus.xml.bz2')
-        content = bz2.compress(''.encode())  # Explicit string to byte conversion needed in Python 3
-
-        with open(tmpf, "wb") as fh:
-            fh.write(content)
-
-        with self.assertRaises(ParseError):
-            corpus = self.corpus_class(tmpf)
-            del corpus  # Needed to suppress tox warning
-
     def test_unicode_element(self):
         """
         First unicode article in this sample is
@@ -542,6 +527,9 @@ def test_sample_text_seed(self):
         # Cannot instantiate WikiCorpus from lines
         pass

+    def test_empty_input(self):
+        pass
+

 class TestTextDirectoryCorpus(unittest.TestCase):

From 78f28705008226dbf3b1204733e14b61897bc38a Mon Sep 17 00:00:00 2001
From: Manos Stergiadis
Date: Thu, 11 Jan 2018 11:10:32 +0100
Subject: [PATCH 8/8] Added 2 more tests

---
 gensim/test/test_corpora.py | 40 ++++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 2d4b369831..f330dbd271 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -400,6 +400,15 @@ def test_indexing(self):
         pass


+# Needed for test_custom_tokenizer in the TestWikiCorpus class.
+# Cannot be nested due to serialization (it must be picklable for multiprocessing).
+def custom_tokenizer(content, token_min_len=2, token_max_len=15, lower=True):
+    return [
+        to_unicode(token.lower()) if lower else to_unicode(token) for token in content.split()
+        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
+    ]
+
+
 class TestWikiCorpus(TestTextCorpus):
     def setUp(self):
         self.corpus_class = wikicorpus.WikiCorpus
@@ -445,6 +454,18 @@ def test_load(self):
         # the deerwester corpus always has nine documents
         self.assertEqual(len(docs), 9)

+    def test_first_element(self):
+        """
+        First two articles in this sample are
+        1) anarchism
+        2) autism
+        """
+        corpus = self.corpus_class(self.enwiki, processes=1)
+
+        texts = corpus.get_texts()
+        self.assertTrue(u'anarchism' in next(texts))
+        self.assertTrue(u'autism' in next(texts))
+
     def test_unicode_element(self):
         """
         First unicode article in this sample is
@@ -455,6 +476,19 @@ def test_unicode_element(self):
         texts = corpus.get_texts()
         self.assertTrue(u'папа' in next(texts))

+    def test_custom_tokenizer(self):
+        """
+        Define a custom tokenizer function and use it
+        """
+        wc = self.corpus_class(self.enwiki, processes=1, lemmatize=False, tokenizer_func=custom_tokenizer,
+                               token_max_len=16, token_min_len=1, lower=False)
+        row = wc.get_texts()
+        list_tokens = next(row)
+        self.assertTrue(u'Anarchism' in list_tokens)
+        self.assertTrue(u'collectivization' in list_tokens)
+        self.assertTrue(u'a' in list_tokens)
+        self.assertTrue(u'i.e.' in list_tokens)
+
     def test_lower_case_set_true(self):
         """
         Set the parameter lower to True and check that the upper case 'Anarchism' token doesn't exist
@@ -505,10 +539,9 @@ def test_max_token_len_set(self):
         corpus = self.corpus_class(self.enwiki, processes=1, token_max_len=16, lemmatize=False)
         self.assertTrue(u'collectivization' in next(corpus.get_texts()))

-    # TODO: sporadic failure to be investigated
+    # #TODO: sporadic failure to be investigated
     # def test_get_texts_returns_generator_of_lists(self):
-    #
-    #     corpus = self.corpus_class(self.fname)
+    #     corpus = self.corpus_class(self.enwiki)
     #     l = corpus.get_texts()
     #     self.assertEqual(type(l), types.GeneratorType)
     #     first = next(l)
@@ -528,6 +561,7 @@ def test_sample_text_seed(self):
         # Cannot instantiate WikiCorpus from lines
         pass

     def test_empty_input(self):
+        # An empty file is not legitimate XML
         pass
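
Usage note: a minimal sketch of the WikiCorpus behaviour exercised by the
tests in this series. The dump filename is a placeholder for a local file
such as the testcorpus.xml.bz2 fixture added in PATCH 2/8; the
article_min_tokens and lemmatize parameters and the metadata flag are the
ones touched by PATCH 1/8 and the tests above.

    from gensim.corpora.wikicorpus import WikiCorpus

    # Stream tokenized articles straight from a bz2-compressed wiki dump.
    # article_min_tokens=0 keeps even the shortest articles, as in the tests.
    corpus = WikiCorpus('testcorpus.xml.bz2', article_min_tokens=0, lemmatize=False)
    for tokens in corpus.get_texts():
        print(tokens[:5])  # each article is a list of unicode tokens

    # With metadata enabled, get_texts() yields (tokens, (pageid, title))
    # pairs instead, per the `yield (tokens, (pageid, title))` branch in PATCH 1/8.
    corpus.metadata = True
    for tokens, (pageid, title) in corpus.get_texts():
        print(pageid, title, len(tokens))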