From 9654995e6082c6fe6532857448841e30ce05ea35 Mon Sep 17 00:00:00 2001 From: will Date: Mon, 11 May 2015 11:03:29 -0400 Subject: [PATCH] fall back to the English stopwords in case stopwords for the detected language are not available --- goose/text.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/goose/text.py b/goose/text.py index 4008d62b..ee4e6883 100644 --- a/goose/text.py +++ b/goose/text.py @@ -94,8 +94,12 @@ def __init__(self, language='en'): # TODO replace 'x' with class # to generate dynamic path for file to load if not language in self._cached_stop_words: - path = os.path.join('text', 'stopwords-%s.txt' % language) - self._cached_stop_words[language] = set(FileHelper.loadResourceFile(path).splitlines()) + try: + path = os.path.join('text', 'stopwords-%s.txt' % language) + self._cached_stop_words[language] = set(FileHelper.loadResourceFile(path).splitlines()) + except Exception: + path = os.path.join('text', 'stopwords-en.txt') + self._cached_stop_words[language] = set(FileHelper.loadResourceFile(path).splitlines()) self.STOP_WORDS = self._cached_stop_words[language] def remove_punctuation(self, content):