Skip to content

Commit

Permalink
Generate alternative typos with a translation table (#2985)
Browse files Browse the repository at this point in the history
  • Loading branch information
DimitriPapadopoulos authored Aug 8, 2023
1 parent 0b0c8f8 commit b29233a
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 13 deletions.
44 changes: 31 additions & 13 deletions codespell_lib/_codespell.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
"(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|"
"\\b[\\w.%+-]+@[\\w.-]+\\b)"
)
# Pass all misspellings through this translation table to generate
# alternative misspellings and fixes.
# Each entry is a pair of equal-length strings (chars, replacements);
# build_dict() feeds every pair to str.maketrans() and applies the table
# to dictionary entries whose key contains `chars`. The single pair here
# maps the typewriter apostrophe U+0027 to the typographic one U+2019.
alt_chars = (("'", "’"),)
encodings = ("utf-8", "iso-8859-1")
USAGE = """
\t%prog [OPTIONS] [file1 file2 ... fileN]
Expand Down Expand Up @@ -622,31 +625,46 @@ def build_ignore_words(filename: str, ignore_words: Set[str]) -> None:
ignore_words.add(line.strip())


def add_misspelling(
    key: str,
    data: str,
    misspellings: Dict[str, Misspelling],
) -> None:
    """Parse one dictionary entry and record it in *misspellings*.

    *data* is stripped of surrounding whitespace first. If it contains a
    comma, it is split at the LAST comma: the text before it is stored as
    the entry data, the text after it (leading whitespace removed) is
    stored as the reason, and the fix flag is False. Without a comma, the
    whole stripped text is the entry data, the reason is empty and the fix
    flag is True.

    Args:
        key: The dictionary key under which the entry is stored.
        data: Raw right-hand side of the dictionary line.
        misspellings: Mapping updated in place; an existing entry for
            *key* is overwritten.
    """
    entry = data.strip()
    # rpartition yields an empty separator when there is no comma at all.
    candidates, comma, trailing = entry.rpartition(",")

    if not comma:
        misspellings[key] = Misspelling(entry, True, "")
        return

    misspellings[key] = Misspelling(candidates, False, trailing.lstrip())


def build_dict(
    filename: str,
    misspellings: Dict[str, Misspelling],
    ignore_words: Set[str],
) -> None:
    """Load a misspelling dictionary file into *misspellings*.

    Each line of the file is expected to have the form ``key->data``.
    Both sides are lower-cased, and the entry is stored via
    add_misspelling() unless the key is listed in *ignore_words*.

    For every (chars, replacements) pair in ``alt_chars`` whose ``chars``
    occurs in the key, an alternative entry is generated by translating
    both key and data, and stored unless the translated key is ignored.
    The alternative is considered independently of the original key, so
    ignoring one spelling does not suppress the other.

    Args:
        filename: Path of the UTF-8 encoded dictionary file.
        misspellings: Mapping updated in place with the parsed entries.
        ignore_words: Keys that must not be added to *misspellings*.

    Raises:
        ValueError: If a line does not split into exactly two parts
            around ``->`` (raised by the unpacking below).
    """
    # The tables depend only on alt_chars, so build them once up front.
    translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
    with open(filename, encoding="utf-8") as f:
        for line in f:
            [key, data] = line.split("->")
            # TODO for now, convert both to lower. Someday we can maybe add
            # support for fixing caps.
            key = key.lower()
            data = data.lower()
            if key not in ignore_words:
                add_misspelling(key, data, misspellings)
            # generate alternative misspellings/fixes
            for x, table in translate_tables:
                if x in key:
                    alt_key = key.translate(table)
                    alt_data = data.translate(table)
                    if alt_key not in ignore_words:
                        add_misspelling(alt_key, alt_data, misspellings)


def is_hidden(filename: str, check_hidden: bool) -> bool:
Expand Down
6 changes: 6 additions & 0 deletions codespell_lib/tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,12 @@ def test_default_word_parsing(
f.write("`abandonned`\n")
assert cs.main(fname) == 1, "bad"

fname = tmp_path / "apostrophe"
fname.write_text("woudn't\n", encoding="utf-8") # U+0027 (')
assert cs.main(fname) == 1, "misspelling containing typewriter apostrophe U+0027"
fname.write_text("woudn’t\n", encoding="utf-8") # U+2019 (’)
assert cs.main(fname) == 1, "misspelling containing typographic apostrophe U+2019"


def test_bad_glob(
tmp_path: Path,
Expand Down

0 comments on commit b29233a

Please sign in to comment.