Skip to content

Commit

Permalink
Merge pull request #1333 from alekol/issue_1326
Browse files (browse the repository at this point in the history)
Fixing PR 1326 and providing some tests for unicode wiki corpora
  • Loading branch information
menshikh-iv authored May 27, 2017
2 parents 370646b + bb9dd96 commit ade3e02
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 6 deletions.
2 changes: 1 addition & 1 deletion gensim/corpora/wikicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def tokenize(content):
"""
# TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
return [
token.encode('utf8') for token in utils.tokenize(content, lower=True, errors='ignore')
utils.to_unicode(token) for token in utils.tokenize(content, lower=True, errors='ignore')
if 2 <= len(token) <= 15 and not token.startswith('_')
]

Expand Down
Binary file not shown.
18 changes: 13 additions & 5 deletions gensim/test/test_wikicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
FILENAME = 'enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2'
FILENAME_U = 'bgwiki-latest-pages-articles-shortened.xml.bz2'

logger = logging.getLogger(__name__)

Expand All @@ -45,14 +46,21 @@ def test_first_element(self):
1) anarchism
2) autism
"""
if sys.version_info < (2, 7, 0):
return
wc = WikiCorpus(datapath(FILENAME))
wc = WikiCorpus(datapath(FILENAME), processes=1)

l = wc.get_texts()
self.assertTrue(b"anarchism" in next(l))
self.assertTrue(b"autism" in next(l))
self.assertTrue(u'anarchism' in next(l))
self.assertTrue(u'autism' in next(l))

def test_unicode_element(self):
"""
First unicode article in this sample is
1) папа
"""
wc = WikiCorpus(datapath(FILENAME_U), processes=1)

l = wc.get_texts()
self.assertTrue(u'папа' in next(l))

if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
Expand Down

0 comments on commit ade3e02

Please sign in to comment.