Skip to content

Commit

Permalink
Added encoding='utf-8' keyword argument to TextDirectoryCorpus. Used smart_open to replace builtin open. (#3317)
Browse files Browse the repository at this point in the history
  • Loading branch information
Ziang Ren authored Apr 15, 2022
1 parent cea25a3 commit edaeee9
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions gensim/corpora/textcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
"""


from __future__ import with_statement

import logging
Expand All @@ -50,6 +49,8 @@
)
from gensim.utils import deaccent, simple_tokenize

from smart_open import open

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -399,7 +400,7 @@ class TextDirectoryCorpus(TextCorpus):
"""

def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_depth=None,
pattern=None, exclude_pattern=None, lines_are_documents=False, **kwargs):
pattern=None, exclude_pattern=None, lines_are_documents=False, encoding='utf-8', **kwargs):
"""
Parameters
Expand All @@ -423,6 +424,8 @@ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_dept
Regex to use for file name exclusion, all files matching this pattern will be ignored.
lines_are_documents : bool, optional
If True - each line is considered a document, otherwise - each file is one document.
encoding : str, optional
Encoding used to read the specified file or files in the specified directory.
kwargs: keyword arguments passed through to the `TextCorpus` constructor.
See :meth:`gensim.corpora.textcorpus.TextCorpus.__init__` docstring for more details on these.
Expand All @@ -432,6 +435,7 @@ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_dept
self.pattern = pattern
self.exclude_pattern = exclude_pattern
self.lines_are_documents = lines_are_documents
self.encoding = encoding
super(TextDirectoryCorpus, self).__init__(input, dictionary, metadata, **kwargs)

@property
Expand Down Expand Up @@ -510,7 +514,7 @@ def getstream(self):
"""
num_texts = 0
for path in self.iter_filepaths():
with open(path, 'rt') as f:
with open(path, 'rt', encoding=self.encoding) as f:
if self.lines_are_documents:
for line in f:
yield line.strip()
Expand Down

0 comments on commit edaeee9

Please sign in to comment.