From b29233a80707a20517f4b7c147b8f1161c181fba Mon Sep 17 00:00:00 2001
From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com>
Date: Tue, 8 Aug 2023 15:24:37 +0200
Subject: [PATCH] Generate alternative typos with a translation table (#2985)

---
Reviewer note (not part of the commit message; text in this region is
discarded by `git am`):
  * Dictionary-entry parsing (strip, split off the trailing ",reason",
    store a `Misspelling`) moves out of `build_dict()` into the new
    `add_misspelling()` helper.
  * For each `alt_chars` pair whose first character occurs in a dictionary
    key, `build_dict()` additionally registers the key and its fix with
    that character translated (U+0027 -> U+2019), unless the translated
    key is listed in `ignore_words`.

 codespell_lib/_codespell.py       | 44 ++++++++++++++++++++++---------
 codespell_lib/tests/test_basic.py |  6 +++++
 2 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 2e23b9acd6..1fe8c6306c 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -36,6 +36,9 @@
     "(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|"
     "\\b[\\w.%+-]+@[\\w.-]+\\b)"
 )
+# Pass all misspellings through this translation table to generate
+# alternative misspellings and fixes.
+alt_chars = (("'", "’"),)
 encodings = ("utf-8", "iso-8859-1")
 USAGE = """
 \t%prog [OPTIONS] [file1 file2 ... fileN]
@@ -622,31 +625,46 @@ def build_ignore_words(filename: str, ignore_words: Set[str]) -> None:
             ignore_words.add(line.strip())
 
 
+def add_misspelling(
+    key: str,
+    data: str,
+    misspellings: Dict[str, Misspelling],
+) -> None:
+    data = data.strip()
+
+    if "," in data:
+        fix = False
+        data, reason = data.rsplit(",", 1)
+        reason = reason.lstrip()
+    else:
+        fix = True
+        reason = ""
+
+    misspellings[key] = Misspelling(data, fix, reason)
+
+
 def build_dict(
     filename: str,
     misspellings: Dict[str, Misspelling],
     ignore_words: Set[str],
 ) -> None:
     with open(filename, encoding="utf-8") as f:
+        translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
         for line in f:
             [key, data] = line.split("->")
             # TODO for now, convert both to lower. Someday we can maybe add
             # support for fixing caps.
             key = key.lower()
             data = data.lower()
-            if key in ignore_words:
-                continue
-            data = data.strip()
-
-            if "," in data:
-                fix = False
-                data, reason = data.rsplit(",", 1)
-                reason = reason.lstrip()
-            else:
-                fix = True
-                reason = ""
-
-            misspellings[key] = Misspelling(data, fix, reason)
+            if key not in ignore_words:
+                add_misspelling(key, data, misspellings)
+            # generate alternative misspellings/fixes
+            for x, table in translate_tables:
+                if x in key:
+                    alt_key = key.translate(table)
+                    alt_data = data.translate(table)
+                    if alt_key not in ignore_words:
+                        add_misspelling(alt_key, alt_data, misspellings)
 
 
 def is_hidden(filename: str, check_hidden: bool) -> bool:
diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py
index 473b72fef4..c2a1e80c5b 100644
--- a/codespell_lib/tests/test_basic.py
+++ b/codespell_lib/tests/test_basic.py
@@ -168,6 +168,12 @@ def test_default_word_parsing(
         f.write("`abandonned`\n")
     assert cs.main(fname) == 1, "bad"
 
+    fname = tmp_path / "apostrophe"
+    fname.write_text("woudn't\n", encoding="utf-8")  # U+0027 (')
+    assert cs.main(fname) == 1, "misspelling containing typewriter apostrophe U+0027"
+    fname.write_text("woudn’t\n", encoding="utf-8")  # U+2019 (’)
+    assert cs.main(fname) == 1, "misspelling containing typographic apostrophe U+2019"
+
 
 def test_bad_glob(
     tmp_path: Path,