Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix case handling for various capitalization issues #2478

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 45 additions & 7 deletions codespell_lib/_codespell.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,10 +454,10 @@ def build_dict(filename, misspellings, ignore_words):
with codecs.open(filename, mode='r', encoding='utf-8') as f:
for line in f:
[key, data] = line.split('->')
# TODO for now, convert both to lower. Someday we can maybe add
# support for fixing caps.
# Convert key to lower case.
# Do not modify data to lower case. Leave it as per dictionary.
key = key.lower()
data = data.lower()
# data = data.lower()
if key in ignore_words:
continue
data = data.strip()
Expand Down Expand Up @@ -493,13 +493,51 @@ def is_text_file(filename):
return True


def is_camel_case_word(input_word):
    """Return True if *input_word* is one token written in mixed case.

    A word qualifies when it is neither entirely lower case nor entirely
    upper case and contains no underscore, hyphen or space.  Note that a
    plain capitalized word such as ``Skype`` therefore also qualifies,
    as do ``MongoDB`` and ``eBay``.
    """
    has_mixed_case = (input_word != input_word.lower()
                      and input_word != input_word.upper())
    # Any separator means this is not a single word (this also rejects
    # words that still carry surrounding spaces).
    return has_mixed_case and not any(sep in input_word for sep in '_- ')


def is_camel_case_string(input_string):
    """Return True if any comma-separated piece of *input_string* is
    reported as mixed case by is_camel_case_word().

    NOTE: the pieces are not stripped.  In a list written as
    ``"skip, Skype, skipped"`` every piece after the first keeps its
    leading space, and is_camel_case_word() rejects anything containing a
    space, so only the first alternative (or one that directly follows a
    comma) can match.  Callers currently depend on this behaviour.
    """
    return any(is_camel_case_word(word) for word in input_string.split(','))


def fix_case(word, fixword):
    """Return *fixword* with its case adapted to the misspelled *word*.

    *fixword* may hold several comma-separated alternatives.  Rules, in
    order of precedence:

    1. *fixword* is entirely upper case (abbreviation, acronym): use it as
       written in the dictionary.  E.g. ``asscii->ASCII``.
    2. *word* is capitalized and *fixword* is either entirely lower case
       or not reported as camelCase by is_camel_case_string(): capitalize
       each alternative.  E.g. ``Skipt`` with ``skip, Skype, skipped``
       gives ``Skip, Skype, Skipped``.
    3. *word* is entirely upper case: upper-case *fixword*.
       E.g. ``MANGODB`` with ``MongoDB`` gives ``MONGODB``.
    4. Anything else (lower case, camelCase, mixed): use *fixword* as
       written in the dictionary.  E.g. ``mysql->MySQL``.
    """
    if fixword == fixword.upper():
        return fixword
    if word == word.capitalize() and (
            fixword == fixword.lower()
            or not is_camel_case_string(fixword)):
        # Capitalize each alternative on its own.  str.title() is not
        # suitable here: it upper-cases after every non-letter, turning
        # "doesn't" into "Doesn'T" and "e-mail" into "E-Mail".
        return ', '.join(w.strip().capitalize() for w in fixword.split(','))
    if word == word.upper():
        return fixword.upper()
    return fixword


Expand Down
299 changes: 259 additions & 40 deletions codespell_lib/tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,61 +393,280 @@ def test_case_handling(tmpdir, capsys):
assert f.read().decode('utf-8') == 'this has an ASCII error'


def _helper_test_case_handling(tmpdir, capsys, dict_entry, bad_input,
                               expected_output, reason):
    """Check the fix suggested for one input against a one-entry dictionary.

    Writes *dict_entry* to ``dictionary.txt`` and *bad_input* to ``bad.txt``
    inside *tmpdir*, runs ``cs.main`` with ``-D`` pointing at that
    dictionary, and asserts that *expected_output* appears in the second
    item of the returned tuple.

    When *reason* is true, the text ``' reason'`` is appended to the
    dictionary line and must show up in the output as well.  Presumably
    *dict_entry* has to end with a comma in that case so the appended text
    is parsed as a reason -- confirm against the dictionary format.

    *capsys* is accepted for signature compatibility and is not used here.
    """
    d = str(tmpdir)

    entry = dict_entry + (' reason' if reason else '')
    with open(op.join(d, 'dictionary.txt'), 'w') as f:
        f.write(entry + '\n')
        dictionary_name = f.name

    with open(op.join(d, 'bad.txt'), 'w') as f:
        f.write(bad_input + '\n')
    # f is closed here; only its name is needed for the command line.
    _, stdout, _ = cs.main('-D', dictionary_name, f.name, std=True)
    assert expected_output in stdout
    # the reason, if any, must not be modified
    if reason:
        assert 'reason' in stdout


def test_case_handling_in_fix_case(tmpdir, capsys):
    """Test various case handling in fix_case() function.

    Every group is ``(dict_entry, reason, [(bad_input, expected), ...])``.
    Each pair is checked in order through _helper_test_case_handling(),
    reusing the same *tmpdir* (its files are overwritten on every call).
    """
    groups = [
        # Typical: misspelled and multiple suggested words are all lower
        # case in the dictionary.  Capitalization must be consistent for
        # all suggested words.
        ('adoptor->adopter, adaptor,', False, [
            ('early adoptor', 'adopter, adaptor'),
            ('Early Adoptor', 'Adopter, Adaptor'),
            ('EARLY ADOPTOR', 'ADOPTER, ADAPTOR'),
            ('EaRlY AdOpToR', 'adopter, adaptor'),
        ]),
        # Same again with a reason: it must not be modified.
        ('adoptor->adopter, adaptor,', True, [
            ('early adoptor', 'adopter, adaptor'),
            ('Early Adoptor', 'Adopter, Adaptor'),
            ('EARLY ADOPTOR', 'ADOPTER, ADAPTOR'),
            ('EaRlY AdOpToR', 'adopter, adaptor'),
        ]),
        # Abbreviation, acronym, initialism: suggested word is upper case
        # in the dictionary.
        ('asscii->ASCII', False, [
            ('asscii', 'ASCII'),
            ('Asscii', 'ASCII'),
            ('AssCii', 'ASCII'),
            ('ASSCII', 'ASCII'),
        ]),
        # Proper nouns: misspelled word is lower case in the dictionary.
        ('austrailia->Australia', False, [
            ('austrailia', 'Australia'),
            ('Austrailia', 'Australia'),
            ('AustRailia', 'Australia'),
            ('AUSTRAILIA', 'AUSTRALIA'),
        ]),
        # Proper nouns, brand names: misspelled word is capitalized in the
        # dictionary.
        ('Micosoft->Microsoft', False, [
            ('micosoft', 'Microsoft'),
            ('Micosoft', 'Microsoft'),
            ('MicoSoft', 'Microsoft'),
            ('MICOSOFT', 'MICROSOFT'),
        ]),
        # Typical single: misspelled and suggested word are both lower
        # case in the dictionary.
        ('pinapple->pineapple', False, [
            ('pinapple', 'pineapple'),
            ('Pinapple', 'Pineapple'),
        ]),
        # Typical multiple: misspelled and multiple suggested words are
        # all lower case in the dictionary.
        ('uspported->supported, unsupported,', False, [
            ('uspported', 'supported, unsupported'),
            ('Uspported', 'Supported, Unsupported'),
            ('USPPORTED', 'SUPPORTED, UNSUPPORTED'),
        ]),
        # Typical multiple & mix: misspelled word is lower case; suggested
        # words are a mix of lower case and capitalized.
        ('skipt->skip, Skype, skipped,', False, [
            ('skipt', 'skip, Skype, skipped'),
            ('Skipt', 'Skip, Skype, Skipped'),
            ('SKIPT', 'SKIP, SKYPE, SKIPPED'),
        ]),
        # CamelCase basic: suggested word is CamelCase in the dictionary.
        ('lesstiff->LessTif', False, [
            ('lesstiff', 'LessTif'),
            ('lessTiff', 'LessTif'),
            ('Lesstiff', 'LessTif'),
            ('LessTiff', 'LessTif'),
            ('LESSTIFF', 'LESSTIF'),
        ]),
        # CamelCase brand names: suggested word is CamelCase in the
        # dictionary.
        ('mangodb->MongoDB', False, [
            ('mangodb', 'MongoDB'),
            ('mangoDb', 'MongoDB'),
            ('mangoDB', 'MongoDB'),
            ('Mangodb', 'MongoDB'),
            ('MangoDb', 'MongoDB'),
            ('MangoDB', 'MongoDB'),
        ]),
        # CamelCase brand names starting with a lower-case letter.
        ('ebya->eBay', False, [
            ('ebya', 'eBay'),
            ('eBya', 'eBay'),
            ('Ebya', 'eBay'),
            ('EBya', 'eBay'),
            ('EBYA', 'EBAY'),
        ]),
        # Special CamelCase, brand names: the "misspelled" word has the
        # correct spelling but the wrong case.  For custom dictionaries
        # only.
        ('mariadb->MariaDB', False, [
            ('mariadb', 'MariaDB'),
            ('mariaDb', 'MariaDB'),
            ('mariaDB', 'MariaDB'),
            ('Mariadb', 'MariaDB'),
            ('MariaDb', 'MariaDB'),
            ('MariaDB', 'MariaDB'),
        ]),
        # Special CamelCase, brand names: correct spelling but wrong case,
        # with both CamelCase and lower-case suggestions.  For custom
        # dictionaries only.
        ('mysql->MySQL, mysql,', False, [
            ('mysql', 'MySQL, mysql'),
            ('mySql', 'MySQL, mysql'),
            ('mySQL', 'MySQL, mysql'),
            ('Mysql', 'MySQL, mysql'),
            ('MySql', 'MySQL, mysql'),
            ('MySQL', 'MySQL, mysql'),
        ]),
    ]
    for dict_entry, reason, checks in groups:
        for bad_input, expected_output in checks:
            _helper_test_case_handling(tmpdir, capsys, dict_entry,
                                       bad_input, expected_output,
                                       reason=reason)


def test_context(tmpdir, capsys):
Expand Down