diff --git a/src/corporacreator/corpora.py b/src/corporacreator/corpora.py index 6b89946..364594c 100644 --- a/src/corporacreator/corpora.py +++ b/src/corporacreator/corpora.py @@ -14,7 +14,7 @@ def common_wrapper(sentence, up_votes, down_votes): is_valid, sentence = common(sentence) - if False == is_valid: + if is_valid is False: up_votes = 0 down_votes = 2 return pd.Series([sentence, up_votes, down_votes]) @@ -52,7 +52,7 @@ def create(self): raise argparse.ArgumentTypeError("ERROR: You have requested languages which do not exist in clips.tsv") else: locales = corpora_data.locale.unique() - + for locale in locales: _logger.info("Selecting %s corpus data..." % locale) corpus_data = corpora_data.loc[ diff --git a/src/corporacreator/corpus.py b/src/corporacreator/corpus.py index 68efa33..01beb02 100644 --- a/src/corporacreator/corpus.py +++ b/src/corporacreator/corpus.py @@ -21,7 +21,7 @@ class Corpus: Attributes: args ([str]): Command line parameters as list of strings - locale (str): Locale of this :class:`corporacreator.Corpus` + locale (str): Locale of this :class:`corporacreator.Corpus` corpus_data (:class:`pandas.DataFrame`): `pandas.DataFrame` Containing the corpus data """ @@ -52,7 +52,7 @@ def _preprocessor_wrapper(self, client_id, sentence, up_votes, down_votes): preprocessors, self.locale.replace("-", ""), self._preprocessor_default ) # Get locale specific preprocessor sentence = preprocessor(client_id, sentence) - if None == sentence or not sentence.strip(): + if sentence is None or not sentence.strip(): up_votes = 0 down_votes = 2 return pd.Series([sentence, up_votes, down_votes]) diff --git a/src/corporacreator/preprocessors/cy.py b/src/corporacreator/preprocessors/cy.py index cab7e40..0128731 100644 --- a/src/corporacreator/preprocessors/cy.py +++ b/src/corporacreator/preprocessors/cy.py @@ -11,9 +11,9 @@ def cy(client_id, sentence): # TODO: geiriau Saesneg / English inspired/pronunced words: # wallace, celsius, ddiesel, wicipedia, william, chiswell, f., h. - sentence = sentence.replace("’", "'") # fix apostrophes + sentence = sentence.replace("’", "'") # fix apostrophes sentence = sentence.replace("wwna", "wna") sentence = sentence.replace(" siwr ", " siŵr ") - sentence = sentence.replace("\\tungellog"," ungellog") - + sentence = sentence.replace("\\tungellog", " ungellog") + return sentence diff --git a/src/corporacreator/preprocessors/de.py b/src/corporacreator/preprocessors/de.py index a599ab3..b362be2 100644 --- a/src/corporacreator/preprocessors/de.py +++ b/src/corporacreator/preprocessors/de.py @@ -2,31 +2,32 @@ QUOTE_PATTERN = re.compile(r'^\"{3}(.*)\"{2}(.*)\"{1}$') QUOTE_PATTERN_2 = re.compile(r'^\"{1}(.*)\"{2}(.*)\"{2}(.*)\"{1}$') QUOTE_PATTERN_3 = re.compile(r'^\"{1}(.*)\"{1}$') - + + def _change_multi_quotes(sentence): - """Changes all quotes from patterns like + """Changes all quotes from patterns like [\"""content""content"] to ["content"content] or ["content""content""content"] to [content"content"content] or ["content" to content] - + Args: sentence (str): Sentence to be cleaned up. - + Returns: (str): Cleaned up sentence. Returns the sentence 'as-is', if matching did not work as expected """ - matches = QUOTE_PATTERN.match(sentence) # pattern: \"\"\"content\"\"content\" - matches2 = QUOTE_PATTERN_2.match(sentence) # pattern: \"content\"\"content\"\"content\" - matches3 = QUOTE_PATTERN_3.match(sentence) # patter: \"content\" - - if matches != None and matches.lastindex == 2: + matches = QUOTE_PATTERN.match(sentence) # pattern: \"\"\"content\"\"content\" + matches2 = QUOTE_PATTERN_2.match(sentence) # pattern: \"content\"\"content\"\"content\" + matches3 = QUOTE_PATTERN_3.match(sentence) # patter: \"content\" + + if matches is not None and matches.lastindex == 2: return "\"{}\"{}".format(matches.group(1), matches.group(2)) - elif matches2 != None and matches2.lastindex == 3: + elif matches2 is not None and matches2.lastindex == 3: return "{}\"{}\"{}".format(matches2.group(1), matches2.group(2), matches2.group(3)) - elif matches3 != None and matches3.lastindex == 1: + elif matches3 is not None and matches3.lastindex == 1: return "{}".format(matches3.group(1)) - + return sentence diff --git a/src/corporacreator/tool.py b/src/corporacreator/tool.py index 7057115..ec34819 100644 --- a/src/corporacreator/tool.py +++ b/src/corporacreator/tool.py @@ -8,6 +8,7 @@ _logger = logging.getLogger(__name__) + def main(args): """Main entry point allowing external calls