Skip to content

Commit

Permalink
Simplify autocorrection Python code.
Browse files Browse the repository at this point in the history
Some style changes to hopefully address lint errors that came up in
qmk/qmk_firmware#15699.

* Break up `parse_file()` into several functions.
* Avoid using `lambda`.
* Better error message if node link exceeds 64KB limit.
  • Loading branch information
getreuer committed Jan 9, 2022
1 parent af565ed commit c28e526
Showing 1 changed file with 82 additions and 59 deletions.
141 changes: 82 additions & 59 deletions features/make_autocorrection_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@

import sys
import textwrap
from typing import Any, Dict, List, Tuple
from typing import Any, Dict, Iterator, List, Tuple

try:
from english_words import english_words_lower_alpha_set as CORRECT_WORDS
Expand All @@ -58,6 +58,7 @@
KC_A = 4
KC_SPC = 0x2c


def parse_file(file_name: str) -> List[Tuple[str, str]]:
"""Parses autocorrections dictionary file.
Expand All @@ -74,62 +75,30 @@ def parse_file(file_name: str) -> List[Tuple[str, str]]:

autocorrections = []
typos = set()
line_number = 0
for line in open(file_name, 'rt'):
line_number += 1
line = line.strip()
if line and line[0] != '#':
# Parse syntax "typo -> correction", using strip to ignore indenting.
tokens = [token.strip() for token in line.split('->', 1)]
if len(tokens) != 2 or not tokens[0]:
print(f'Error:{line_number}: Invalid syntax: "{line}"')
for line_number, typo, correction in parse_file_lines(file_name):
if typo in typos:
print(f'Warning:{line_number}: Ignoring duplicate typo: "{typo}"')
continue

# Check that `typo` is valid.
if not(all([ord('a') <= ord(c) <= ord('z') or c == ':' for c in typo])):
print(f'Error:{line_number}: Typo "{typo}" has '
'characters other than a-z and :.')
sys.exit(1)
for other_typo in typos:
if typo in other_typo or other_typo in typo:
print(f'Error:{line_number}: Typos may not be substrings of one '
f'another, otherwise the longer typo would never trigger: '
f'"{typo}" vs. "{other_typo}".')
sys.exit(1)
if len(typo) < 5:
print(f'Warning:{line_number}: It is suggested that typos are at '
f'least 5 characters long to avoid false triggers: "{typo}"')

typo, correction = tokens
typo = typo.lower() # Force typos to lowercase.
typo = typo.replace(' ', ':')
check_typo_against_dictionary(typo)

if typo in typos:
print(f'Warning:{line_number}: Ignoring duplicate typo: "{typo}"')
continue

# Check that `typo` is valid.
if not(all([ord('a') <= ord(c) <= ord('z') or c == ':' for c in typo])):
print(f'Error:{line_number}: Typo "{typo}" has '
'characters other than a-z and :.')
sys.exit(1)
for other_typo in typos:
if typo in other_typo or other_typo in typo:
print(f'Error:{line_number}: Typos may not be substrings of one '
f'another, otherwise the longer typo would never trigger: '
f'"{typo}" vs. "{other_typo}".')
sys.exit(1)
if len(typo) < 5:
print(f'Warning:{line_number}: It is suggested that typos are at '
f'least 5 characters long to avoid false triggers: "{typo}"')

if typo.startswith(':') and typo.endswith(':'):
if typo[1:-1] in CORRECT_WORDS:
print(f'Warning:{line_number}: Typo "{typo}" is a correctly spelled '
'dictionary word.')
elif typo.startswith(':') and not typo.endswith(':'):
for word in CORRECT_WORDS:
if word.startswith(typo[1:]):
print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger '
f'on correctly spelled word "{word}".')
elif not typo.startswith(':') and typo.endswith(':'):
for word in CORRECT_WORDS:
if word.endswith(typo[:-1]):
print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger '
f'on correctly spelled word "{word}".')
elif not typo.startswith(':') and not typo.endswith(':'):
for word in CORRECT_WORDS:
if typo in word:
print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger '
f'on correctly spelled word "{word}".')

autocorrections.append((typo, correction))
typos.add(typo)
autocorrections.append((typo, correction))
typos.add(typo)

return autocorrections

Expand All @@ -152,6 +121,51 @@ def make_trie(autocorrections: List[Tuple[str, str]]) -> Dict[str, Any]:
return trie


def parse_file_lines(file_name: str) -> Iterator[Tuple[int, str, str]]:
"""Parses lines read from `file_name` into typo-correction pairs."""

line_number = 0
for line in open(file_name, 'rt'):
line_number += 1
line = line.strip()
if line and line[0] != '#':
# Parse syntax "typo -> correction", using strip to ignore indenting.
tokens = [token.strip() for token in line.split('->', 1)]
if len(tokens) != 2 or not tokens[0]:
print(f'Error:{line_number}: Invalid syntax: "{line}"')
sys.exit(1)

typo, correction = tokens
typo = typo.lower() # Force typos to lowercase.
typo = typo.replace(' ', ':')

yield line_number, typo, correction


def check_typo_against_dictionary(typo: str) -> None:
"""Checks `typo` against English dictionary words."""

if typo.startswith(':') and typo.endswith(':'):
if typo[1:-1] in CORRECT_WORDS:
print(f'Warning:{line_number}: Typo "{typo}" is a correctly spelled '
'dictionary word.')
elif typo.startswith(':') and not typo.endswith(':'):
for word in CORRECT_WORDS:
if word.startswith(typo[1:]):
print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger '
f'on correctly spelled word "{word}".')
elif not typo.startswith(':') and typo.endswith(':'):
for word in CORRECT_WORDS:
if word.endswith(typo[:-1]):
print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger '
f'on correctly spelled word "{word}".')
elif not typo.startswith(':') and not typo.endswith(':'):
for word in CORRECT_WORDS:
if typo in word:
print(f'Warning:{line_number}: Typo "{typo}" would falsely trigger '
f'on correctly spelled word "{word}".')


def serialize_trie(autocorrections: List[Tuple[str, str]],
trie: Dict[str, Any]) -> List[int]:
"""Serializes trie and correction data in a form readable by the C code.
Expand Down Expand Up @@ -209,9 +223,6 @@ def kc_code(c):
else:
raise ValueError(f'Invalid character: {c}')

encode_link = lambda link: [link['byte_offset'] & 255,
link['byte_offset'] >> 8]

if not e['links']: # Handle a leaf table entry.
return e['data']
elif len(e['links']) == 1: # Handle a chain table entry.
Expand All @@ -226,11 +237,20 @@ def kc_code(c):
for e in table: # To encode links, first compute byte offset of each entry.
e['byte_offset'] = byte_offset
byte_offset += len(serialize(e))
assert 0 <= byte_offset <= 0xffff

return [b for e in table for b in serialize(e)] # Serialize final table.


def encode_link(link: Dict[str, Any]) -> List[int]:
"""Encodes a node link as two bytes."""
byte_offset = link['byte_offset']
if not (0 <= byte_offset <= 0xffff):
print('Error: The autocorrection table is too large, a node link exceeds '
'64KB limit. Try reducing the autocorrection dict to fewer entries.')
sys.exit(1)
return [byte_offset & 255, byte_offset >> 8]


def write_generated_code(autocorrections: List[Tuple[str, str]],
data: List[int],
file_name: str) -> None:
Expand All @@ -242,7 +262,10 @@ def write_generated_code(autocorrections: List[Tuple[str, str]],
file_name: String, path of the output C file.
"""
assert all(0 <= b <= 255 for b in data)
typo_len = lambda e: len(e[0])

def typo_len(e: Tuple[str, str]) -> int:
return len(e[0])

min_typo = min(autocorrections, key=typo_len)[0]
max_typo = max(autocorrections, key=typo_len)[0]
generated_code = ''.join([
Expand Down

0 comments on commit c28e526

Please sign in to comment.