From 018f0c64657afcebe1c98726243799e6fa10d15e Mon Sep 17 00:00:00 2001 From: jonmeow <46229924+jonmeow@users.noreply.github.com> Date: Wed, 8 Jul 2020 09:19:27 -0700 Subject: [PATCH 1/7] Add --ignore-regex for URI/email handling. This is for issue #676, where typos are found in actually-okay URIs/emails. Because these are closer to names in context, this ignores them. Mechanically, this erases the URI/email text before the word regex is applied. --- codespell_lib/_codespell.py | 31 ++++++++++++--- codespell_lib/tests/test_basic.py | 64 +++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 5 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index f52b840c07..4c66a87632 100755 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -28,6 +28,8 @@ import textwrap word_regex_def = u"[\\w\\-'’`]+" +# Matches common URIs and email addresses, in that order. +ignore_word_regex_def = r"(?:(?:https?|ftp|smtp):\/\/([\w-]+\.)+\w{2,}(?:/(?:[\w:/?#\[\]@!$&'()*+,;=.~-]*/?)*)?|[\w.%+-]+@[\w.-]+\.[a-z]{2,})" # noqa: E501 encodings = ('utf-8', 'iso-8859-1') USAGE = """ \t%prog [OPTIONS] [file1 file2 ... fileN] @@ -273,6 +275,11 @@ def parse_options(args): 'to include (when "-D -" or no "-D" is passed). ' 'Current options are:' + builtin_opts + '\n' 'The default is %(default)r.') + parser.add_argument('--ignore-regex', + action='store', type=str, + help='regular expression which is used to find words ' + 'to ignore. Matches URIs and emails by default. ' + 'Can be disabled by setting to "^$".') parser.add_argument('-I', '--ignore-words', action='append', metavar='FILE', help='file that contains words which will be ignored ' @@ -489,8 +496,13 @@ def print_context(lines, index, context): print('%s %s' % ('>' if i == index else ':', lines[i].rstrip())) +def extract_words(text, word_regex, ignore_word_regex): + interesting_text = ignore_word_regex.sub(' ', text) + return word_regex.findall(interesting_text) + + def parse_file(filename, colors, summary, misspellings, exclude_lines, - file_opener, word_regex, context, options): + file_opener, word_regex, ignore_word_regex, context, options): bad_count = 0 lines = None changed = False @@ -501,7 +513,7 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines, lines = f.readlines() else: if options.check_filenames: - for word in word_regex.findall(filename): + for word in extract_words(filename, word_regex, ignore_word_regex): lword = word.lower() if lword not in misspellings: continue @@ -555,7 +567,7 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines, fixed_words = set() asked_for = set() - for word in word_regex.findall(line): + for word in extract_words(line, word_regex, ignore_word_regex): lword = word.lower() if lword in misspellings: context_shown = False @@ -662,6 +674,14 @@ def main(*args): (word_regex, err), file=sys.stderr) parser.print_help() return EX_USAGE + ignore_word_regex = options.ignore_regex or ignore_word_regex_def + try: + ignore_word_regex = re.compile(ignore_word_regex) + except re.error as err: + print("ERROR: invalid regular expression \"%s\" (%s)" % + (ignore_word_regex, err), file=sys.stderr) + parser.print_help() + return EX_USAGE ignore_words_files = options.ignore_words or [] ignore_words = set() @@ -770,7 +790,8 @@ def main(*args): continue bad_count += parse_file( fname, colors, summary, misspellings, exclude_lines, - file_opener, word_regex, context, options) + file_opener, word_regex, ignore_word_regex, context, + options) # skip (relative) directories dirs[:] = [dir_ for dir_ in dirs if not glob_match.match(dir_)] @@ -778,7 +799,7 @@ def main(*args): else: bad_count += parse_file( filename, colors, summary, misspellings, exclude_lines, - file_opener, word_regex, context, options) + file_opener, word_regex, ignore_word_regex, context, options) if summary: print("\n-------8<-------\nSUMMARY:") diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index 4107b6ce1c..8a4a1eb0f0 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -455,6 +455,70 @@ def test_context(tmpdir, capsys): assert 'ERROR' in lines[0] +def test_uri(tmpdir, capsys): + """Test ignore regex functionality for URIs.""" + d = str(tmpdir) + + # Ignoring text in path. + with open(op.join(d, 'uri.txt'), 'w') as f: + f.write('# Please see http://example.com/abandonned for info\n') + assert cs.main(f.name) == 0 + # Same is a typo with ignores disabled. + assert cs.main(f.name, '--ignore-regex=^$') == 1 + + # Test a different protocol. + with open(op.join(d, 'uri.txt'), 'w') as f: + f.write('# Please see https://example.com/abandonned for info\n') + assert cs.main(f.name) == 0 + + # Ignoring text in path ending with /. + with open(op.join(d, 'uri.txt'), 'w') as f: + f.write('# Please see http://example.com/abandonned/ for info\n') + assert cs.main(f.name) == 0 + + # Ignoring text in domain. + with open(op.join(d, 'uri.txt'), 'w') as f: + f.write('# Please see http://abandonned.com/example for info\n') + assert cs.main(f.name) == 0 + + # Ignoring text in anchor. + with open(op.join(d, 'uri.txt'), 'w') as f: + f.write('# Please see http://example.com/ex#abandonned for info\n') + assert cs.main(f.name) == 0 + + # Typo because there's no protocol. + with open(op.join(d, 'uri.txt'), 'w') as f: + f.write('# Please see example.com/abandonned for info\n') + assert cs.main(f.name) == 1 + + # Typo because there aren't enough domain parts. + with open(op.join(d, 'uri.txt'), 'w') as f: + f.write('# Please see http://abandonned for info\n') + assert cs.main(f.name) == 1 + + +def test_email(tmpdir, capsys): + """Test ignore regex functionality for emails.""" + d = str(tmpdir) + + # Ignoring text in username. + with open(op.join(d, 'email.txt'), 'w') as f: + f.write('# Please contact abandonned@example.com for info\n') + assert cs.main(f.name) == 0 + # Same is a typo with ignores disabled. + assert cs.main(f.name, '--ignore-regex=^$') == 1 + + # Ignoring text in domain. + with open(op.join(d, 'email.txt'), 'w') as f: + f.write('# Please contact example@abandonned.com for info\n') + assert cs.main(f.name) == 0 + + # Typo because there's no TLD for an email. + with open(op.join(d, 'email.txt'), 'w') as f: + f.write('# Please contact abandonned@example for info\n') + assert cs.main(f.name) == 1 + + @contextlib.contextmanager def FakeStdin(text): if sys.version[0] == '2': From 3508ed6b5f1903f68cc5c4ae5a137915e1b0ed2f Mon Sep 17 00:00:00 2001 From: jonmeow <46229924+jonmeow@users.noreply.github.com> Date: Fri, 10 Jul 2020 15:09:27 -0700 Subject: [PATCH 2/7] More --ignore-regex tests --- codespell_lib/tests/test_basic.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index 8a4a1eb0f0..ad476104de 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -455,6 +455,26 @@ def test_context(tmpdir, capsys): assert 'ERROR' in lines[0] +def test_ignore_regex_flag(tmpdir, capsys): + """Test ignore regex flag functionality.""" + d = str(tmpdir) + + # Invalid regex. + code, stdout, _ = cs.main('--ignore-regex=(', std=True) + assert code == EX_USAGE + assert 'usage:' in stdout + + # Empty regex matches everything. + with open(op.join(d, 'flag.txt'), 'w') as f: + f.write('# Please see http://example.com/abandonned for info\n') + assert cs.main(f.name, '--ignore-regex=^$') == 1 + + # Custom ignore. + with open(op.join(d, 'flag.txt'), 'w') as f: + f.write('abandonned\n') + assert cs.main(f.name, '--ignore-regex=abandonned') == 0 + + def test_uri(tmpdir, capsys): """Test ignore regex functionality for URIs.""" d = str(tmpdir) @@ -463,8 +483,6 @@ def test_uri(tmpdir, capsys): with open(op.join(d, 'uri.txt'), 'w') as f: f.write('# Please see http://example.com/abandonned for info\n') assert cs.main(f.name) == 0 - # Same is a typo with ignores disabled. - assert cs.main(f.name, '--ignore-regex=^$') == 1 # Test a different protocol. with open(op.join(d, 'uri.txt'), 'w') as f: @@ -505,8 +523,6 @@ def test_email(tmpdir, capsys): with open(op.join(d, 'email.txt'), 'w') as f: f.write('# Please contact abandonned@example.com for info\n') assert cs.main(f.name) == 0 - # Same is a typo with ignores disabled. - assert cs.main(f.name, '--ignore-regex=^$') == 1 # Ignoring text in domain. with open(op.join(d, 'email.txt'), 'w') as f: From 9d6f0051b2cd97d04f192657ef55680de601b236 Mon Sep 17 00:00:00 2001 From: jonmeow <46229924+jonmeow@users.noreply.github.com> Date: Fri, 24 Jul 2020 12:41:17 -0700 Subject: [PATCH 3/7] Switch to ignore-regex as a simple flag --- codespell_lib/_codespell.py | 34 ++++++++------- codespell_lib/tests/test_basic.py | 69 ++----------------------------- 2 files changed, 23 insertions(+), 80 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 4c66a87632..bb733fada6 100755 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -28,8 +28,6 @@ import textwrap word_regex_def = u"[\\w\\-'’`]+" -# Matches common URIs and email addresses, in that order. -ignore_word_regex_def = r"(?:(?:https?|ftp|smtp):\/\/([\w-]+\.)+\w{2,}(?:/(?:[\w:/?#\[\]@!$&'()*+,;=.~-]*/?)*)?|[\w.%+-]+@[\w.-]+\.[a-z]{2,})" # noqa: E501 encodings = ('utf-8', 'iso-8859-1') USAGE = """ \t%prog [OPTIONS] [file1 file2 ... fileN] @@ -277,9 +275,11 @@ def parse_options(args): 'The default is %(default)r.') parser.add_argument('--ignore-regex', action='store', type=str, - help='regular expression which is used to find words ' - 'to ignore. Matches URIs and emails by default. ' - 'Can be disabled by setting to "^$".') + help='regular expression which is used to find ' + 'patterns to ignore by treating as whitespace. ' + 'When writing regexes, consider ensuring there ' + 'is boundary non-word chars, e.g., ' + '"\\Wmatch\\W". Defaults to empty/disabled.') parser.add_argument('-I', '--ignore-words', action='append', metavar='FILE', help='file that contains words which will be ignored ' @@ -497,8 +497,9 @@ def print_context(lines, index, context): def extract_words(text, word_regex, ignore_word_regex): - interesting_text = ignore_word_regex.sub(' ', text) - return word_regex.findall(interesting_text) + if ignore_word_regex: + text = ignore_word_regex.sub(' ', text) + return word_regex.findall(text) def parse_file(filename, colors, summary, misspellings, exclude_lines, @@ -674,14 +675,17 @@ def main(*args): (word_regex, err), file=sys.stderr) parser.print_help() return EX_USAGE - ignore_word_regex = options.ignore_regex or ignore_word_regex_def - try: - ignore_word_regex = re.compile(ignore_word_regex) - except re.error as err: - print("ERROR: invalid regular expression \"%s\" (%s)" % - (ignore_word_regex, err), file=sys.stderr) - parser.print_help() - return EX_USAGE + + if options.ignore_regex: + try: + ignore_word_regex = re.compile(options.ignore_regex) + except re.error as err: + print("ERROR: invalid regular expression \"%s\" (%s)" % + (options.ignore_regex, err), file=sys.stderr) + parser.print_help() + return EX_USAGE + else: + ignore_word_regex = None ignore_words_files = options.ignore_words or [] ignore_words = set() diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index ad476104de..e9e76bee80 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -464,75 +464,14 @@ def test_ignore_regex_flag(tmpdir, capsys): assert code == EX_USAGE assert 'usage:' in stdout - # Empty regex matches everything. with open(op.join(d, 'flag.txt'), 'w') as f: f.write('# Please see http://example.com/abandonned for info\n') + # Non-matching regex results in nothing being ignored. assert cs.main(f.name, '--ignore-regex=^$') == 1 - - # Custom ignore. - with open(op.join(d, 'flag.txt'), 'w') as f: - f.write('abandonned\n') + # A word can be ignored. assert cs.main(f.name, '--ignore-regex=abandonned') == 0 - - -def test_uri(tmpdir, capsys): - """Test ignore regex functionality for URIs.""" - d = str(tmpdir) - - # Ignoring text in path. - with open(op.join(d, 'uri.txt'), 'w') as f: - f.write('# Please see http://example.com/abandonned for info\n') - assert cs.main(f.name) == 0 - - # Test a different protocol. - with open(op.join(d, 'uri.txt'), 'w') as f: - f.write('# Please see https://example.com/abandonned for info\n') - assert cs.main(f.name) == 0 - - # Ignoring text in path ending with /. - with open(op.join(d, 'uri.txt'), 'w') as f: - f.write('# Please see http://example.com/abandonned/ for info\n') - assert cs.main(f.name) == 0 - - # Ignoring text in domain. - with open(op.join(d, 'uri.txt'), 'w') as f: - f.write('# Please see http://abandonned.com/example for info\n') - assert cs.main(f.name) == 0 - - # Ignoring text in anchor. - with open(op.join(d, 'uri.txt'), 'w') as f: - f.write('# Please see http://example.com/ex#abandonned for info\n') - assert cs.main(f.name) == 0 - - # Typo because there's no protocol. - with open(op.join(d, 'uri.txt'), 'w') as f: - f.write('# Please see example.com/abandonned for info\n') - assert cs.main(f.name) == 1 - - # Typo because there aren't enough domain parts. - with open(op.join(d, 'uri.txt'), 'w') as f: - f.write('# Please see http://abandonned for info\n') - assert cs.main(f.name) == 1 - - -def test_email(tmpdir, capsys): - """Test ignore regex functionality for emails.""" - d = str(tmpdir) - - # Ignoring text in username. - with open(op.join(d, 'email.txt'), 'w') as f: - f.write('# Please contact abandonned@example.com for info\n') - assert cs.main(f.name) == 0 - - # Ignoring text in domain. - with open(op.join(d, 'email.txt'), 'w') as f: - f.write('# Please contact example@abandonned.com for info\n') - assert cs.main(f.name) == 0 - - # Typo because there's no TLD for an email. - with open(op.join(d, 'email.txt'), 'w') as f: - f.write('# Please contact abandonned@example for info\n') - assert cs.main(f.name) == 1 + # Ignoring part of the word can result in odd behavior. + assert cs.main(f.name, '--ignore-regex=nn') == 0 @contextlib.contextmanager From da4ae3e54ca45db817b2043d60ffcd15adcefb52 Mon Sep 17 00:00:00 2001 From: jonmeow <46229924+jonmeow@users.noreply.github.com> Date: Mon, 27 Jul 2020 11:26:15 -0700 Subject: [PATCH 4/7] Addressing comments --- codespell_lib/_codespell.py | 4 ++-- codespell_lib/tests/test_basic.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index bb733fada6..4fe8acc6d0 100755 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -671,7 +671,7 @@ def main(*args): try: word_regex = re.compile(word_regex) except re.error as err: - print("ERROR: invalid regular expression \"%s\" (%s)" % + print("ERROR: invalid --regex \"%s\" (%s)" % (word_regex, err), file=sys.stderr) parser.print_help() return EX_USAGE @@ -680,7 +680,7 @@ def main(*args): try: ignore_word_regex = re.compile(options.ignore_regex) except re.error as err: - print("ERROR: invalid regular expression \"%s\" (%s)" % + print("ERROR: invalid --ignore-regex \"%s\" (%s)" % (options.ignore_regex, err), file=sys.stderr) parser.print_help() return EX_USAGE diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index e9e76bee80..c9ee90414a 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -2,6 +2,7 @@ from __future__ import print_function +import argparse import contextlib import inspect import os @@ -466,6 +467,8 @@ def test_ignore_regex_flag(tmpdir, capsys): with open(op.join(d, 'flag.txt'), 'w') as f: f.write('# Please see http://example.com/abandonned for info\n') + # Test file has 1 invalid entry, and it's not ignored by default. + assert cs.main(f.name) == 1 # Non-matching regex results in nothing being ignored. assert cs.main(f.name, '--ignore-regex=^$') == 1 # A word can be ignored. From 69ece679a511a20722e0a6eab9846f4030b4cb36 Mon Sep 17 00:00:00 2001 From: jonmeow <46229924+jonmeow@users.noreply.github.com> Date: Mon, 27 Jul 2020 11:29:26 -0700 Subject: [PATCH 5/7] Remove accidental argparse import --- codespell_lib/tests/test_basic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index c9ee90414a..e151ce1de6 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -2,7 +2,6 @@ from __future__ import print_function -import argparse import contextlib import inspect import os From 5c608e018d13a01bced993d0e93fd50a5f85a3ae Mon Sep 17 00:00:00 2001 From: jonmeow <46229924+jonmeow@users.noreply.github.com> Date: Thu, 6 Aug 2020 11:36:22 -0700 Subject: [PATCH 6/7] Add tests for empty ignore-regex --- codespell_lib/tests/test_basic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index e151ce1de6..e0b8f8aecd 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -468,6 +468,9 @@ def test_ignore_regex_flag(tmpdir, capsys): f.write('# Please see http://example.com/abandonned for info\n') # Test file has 1 invalid entry, and it's not ignored by default. assert cs.main(f.name) == 1 + # An empty regex is the default value, and nothing is ignored. + assert cs.main(f.name, '--ignore-regex=') == 1 + assert cs.main(f.name, '--ignore-regex=""') == 1 # Non-matching regex results in nothing being ignored. assert cs.main(f.name, '--ignore-regex=^$') == 1 # A word can be ignored. From 5ae1b91fae8f439fcaab2d0ed513612125a541e2 Mon Sep 17 00:00:00 2001 From: jonmeow <46229924+jonmeow@users.noreply.github.com> Date: Mon, 10 Aug 2020 15:21:56 -0700 Subject: [PATCH 7/7] More tests --- codespell_lib/_codespell.py | 2 +- codespell_lib/tests/test_basic.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 4fe8acc6d0..83ebaa4e01 100755 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -278,7 +278,7 @@ def parse_options(args): help='regular expression which is used to find ' 'patterns to ignore by treating as whitespace. ' 'When writing regexes, consider ensuring there ' - 'is boundary non-word chars, e.g., ' + 'are boundary non-word chars, e.g., ' '"\\Wmatch\\W". Defaults to empty/disabled.') parser.add_argument('-I', '--ignore-words', action='append', metavar='FILE', diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index e0b8f8aecd..876e24ce93 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -478,6 +478,15 @@ def test_ignore_regex_flag(tmpdir, capsys): # Ignoring part of the word can result in odd behavior. assert cs.main(f.name, '--ignore-regex=nn') == 0 + with open(op.join(d, 'flag.txt'), 'w') as f: + f.write('abandonned donn\n') + # Test file has 2 invalid entries. + assert cs.main(f.name) == 2 + # Ignoring donn breaks them both. + assert cs.main(f.name, '--ignore-regex=donn') == 0 + # Adding word breaks causes only one to be ignored. + assert cs.main(f.name, r'--ignore-regex=\Wdonn\W') == 1 + @contextlib.contextmanager def FakeStdin(text):