From 018f0c64657afcebe1c98726243799e6fa10d15e Mon Sep 17 00:00:00 2001
From: jonmeow <46229924+jonmeow@users.noreply.github.com>
Date: Wed, 8 Jul 2020 09:19:27 -0700
Subject: [PATCH 1/7] Add --ignore-regex for URI/email handling.

This addresses issue #676, where typos are reported inside URIs and email addresses that are actually fine. Because such text reads more like a name than ordinary prose in context, this change ignores it.

Mechanically, this erases the URI/email text before the word regex is applied.
---
 codespell_lib/_codespell.py       | 31 ++++++++++++---
 codespell_lib/tests/test_basic.py | 64 +++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+), 5 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index f52b840c07..4c66a87632 100755
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -28,6 +28,8 @@
 import textwrap
 
 word_regex_def = u"[\\w\\-'’`]+"
+# Matches common URIs and email addresses, in that order.
+ignore_word_regex_def = r"(?:(?:https?|ftp|smtp):\/\/([\w-]+\.)+\w{2,}(?:/(?:[\w:/?#\[\]@!$&'()*+,;=.~-]*/?)*)?|[\w.%+-]+@[\w.-]+\.[a-z]{2,})"  # noqa: E501
 encodings = ('utf-8', 'iso-8859-1')
 USAGE = """
 \t%prog [OPTIONS] [file1 file2 ... fileN]
@@ -273,6 +275,11 @@ def parse_options(args):
                         'to include (when "-D -" or no "-D" is passed). '
                         'Current options are:' + builtin_opts + '\n'
                         'The default is %(default)r.')
+    parser.add_argument('--ignore-regex',
+                        action='store', type=str,
+                        help='regular expression which is used to find words '
+                             'to ignore. Matches URIs and emails by default. '
+                             'Can be disabled by setting to "^$".')
     parser.add_argument('-I', '--ignore-words',
                         action='append', metavar='FILE',
                         help='file that contains words which will be ignored '
@@ -489,8 +496,13 @@ def print_context(lines, index, context):
             print('%s %s' % ('>' if i == index else ':', lines[i].rstrip()))
 
 
+def extract_words(text, word_regex, ignore_word_regex):
+    interesting_text = ignore_word_regex.sub(' ', text)
+    return word_regex.findall(interesting_text)
+
+
 def parse_file(filename, colors, summary, misspellings, exclude_lines,
-               file_opener, word_regex, context, options):
+               file_opener, word_regex, ignore_word_regex, context, options):
     bad_count = 0
     lines = None
     changed = False
@@ -501,7 +513,7 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines,
         lines = f.readlines()
     else:
         if options.check_filenames:
-            for word in word_regex.findall(filename):
+            for word in extract_words(filename, word_regex, ignore_word_regex):
                 lword = word.lower()
                 if lword not in misspellings:
                     continue
@@ -555,7 +567,7 @@ def parse_file(filename, colors, summary, misspellings, exclude_lines,
         fixed_words = set()
         asked_for = set()
 
-        for word in word_regex.findall(line):
+        for word in extract_words(line, word_regex, ignore_word_regex):
             lword = word.lower()
             if lword in misspellings:
                 context_shown = False
@@ -662,6 +674,14 @@ def main(*args):
               (word_regex, err), file=sys.stderr)
         parser.print_help()
         return EX_USAGE
+    ignore_word_regex = options.ignore_regex or ignore_word_regex_def
+    try:
+        ignore_word_regex = re.compile(ignore_word_regex)
+    except re.error as err:
+        print("ERROR: invalid regular expression \"%s\" (%s)" %
+              (ignore_word_regex, err), file=sys.stderr)
+        parser.print_help()
+        return EX_USAGE
 
     ignore_words_files = options.ignore_words or []
     ignore_words = set()
@@ -770,7 +790,8 @@ def main(*args):
                         continue
                     bad_count += parse_file(
                         fname, colors, summary, misspellings, exclude_lines,
-                        file_opener, word_regex, context, options)
+                        file_opener, word_regex, ignore_word_regex, context,
+                        options)
 
                 # skip (relative) directories
                 dirs[:] = [dir_ for dir_ in dirs if not glob_match.match(dir_)]
@@ -778,7 +799,7 @@ def main(*args):
         else:
             bad_count += parse_file(
                 filename, colors, summary, misspellings, exclude_lines,
-                file_opener, word_regex, context, options)
+                file_opener, word_regex, ignore_word_regex, context, options)
 
     if summary:
         print("\n-------8<-------\nSUMMARY:")
diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py
index 4107b6ce1c..8a4a1eb0f0 100644
--- a/codespell_lib/tests/test_basic.py
+++ b/codespell_lib/tests/test_basic.py
@@ -455,6 +455,70 @@ def test_context(tmpdir, capsys):
     assert 'ERROR' in lines[0]
 
 
+def test_uri(tmpdir, capsys):
+    """Test ignore regex functionality for URIs."""
+    d = str(tmpdir)
+
+    # Ignoring text in path.
+    with open(op.join(d, 'uri.txt'), 'w') as f:
+        f.write('# Please see http://example.com/abandonned for info\n')
+    assert cs.main(f.name) == 0
+    # Same is a typo with ignores disabled.
+    assert cs.main(f.name, '--ignore-regex=^$') == 1
+
+    # Test a different protocol.
+    with open(op.join(d, 'uri.txt'), 'w') as f:
+        f.write('# Please see https://example.com/abandonned for info\n')
+    assert cs.main(f.name) == 0
+
+    # Ignoring text in path ending with /.
+    with open(op.join(d, 'uri.txt'), 'w') as f:
+        f.write('# Please see http://example.com/abandonned/ for info\n')
+    assert cs.main(f.name) == 0
+
+    # Ignoring text in domain.
+    with open(op.join(d, 'uri.txt'), 'w') as f:
+        f.write('# Please see http://abandonned.com/example for info\n')
+    assert cs.main(f.name) == 0
+
+    # Ignoring text in anchor.
+    with open(op.join(d, 'uri.txt'), 'w') as f:
+        f.write('# Please see http://example.com/ex#abandonned for info\n')
+    assert cs.main(f.name) == 0
+
+    # Typo because there's no protocol.
+    with open(op.join(d, 'uri.txt'), 'w') as f:
+        f.write('# Please see example.com/abandonned for info\n')
+    assert cs.main(f.name) == 1
+
+    # Typo because there aren't enough domain parts.
+    with open(op.join(d, 'uri.txt'), 'w') as f:
+        f.write('# Please see http://abandonned for info\n')
+    assert cs.main(f.name) == 1
+
+
+def test_email(tmpdir, capsys):
+    """Test ignore regex functionality for emails."""
+    d = str(tmpdir)
+
+    # Ignoring text in username.
+    with open(op.join(d, 'email.txt'), 'w') as f:
+        f.write('# Please contact abandonned@example.com for info\n')
+    assert cs.main(f.name) == 0
+    # Same is a typo with ignores disabled.
+    assert cs.main(f.name, '--ignore-regex=^$') == 1
+
+    # Ignoring text in domain.
+    with open(op.join(d, 'email.txt'), 'w') as f:
+        f.write('# Please contact example@abandonned.com for info\n')
+    assert cs.main(f.name) == 0
+
+    # Typo because there's no TLD for an email.
+    with open(op.join(d, 'email.txt'), 'w') as f:
+        f.write('# Please contact abandonned@example for info\n')
+    assert cs.main(f.name) == 1
+
+
 @contextlib.contextmanager
 def FakeStdin(text):
     if sys.version[0] == '2':

From 3508ed6b5f1903f68cc5c4ae5a137915e1b0ed2f Mon Sep 17 00:00:00 2001
From: jonmeow <46229924+jonmeow@users.noreply.github.com>
Date: Fri, 10 Jul 2020 15:09:27 -0700
Subject: [PATCH 2/7] More --ignore-regex tests

---
 codespell_lib/tests/test_basic.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py
index 8a4a1eb0f0..ad476104de 100644
--- a/codespell_lib/tests/test_basic.py
+++ b/codespell_lib/tests/test_basic.py
@@ -455,6 +455,26 @@ def test_context(tmpdir, capsys):
     assert 'ERROR' in lines[0]
 
 
+def test_ignore_regex_flag(tmpdir, capsys):
+    """Test ignore regex flag functionality."""
+    d = str(tmpdir)
+
+    # Invalid regex.
+    code, stdout, _ = cs.main('--ignore-regex=(', std=True)
+    assert code == EX_USAGE
+    assert 'usage:' in stdout
+
+    # Empty regex matches everything.
+    with open(op.join(d, 'flag.txt'), 'w') as f:
+        f.write('# Please see http://example.com/abandonned for info\n')
+    assert cs.main(f.name, '--ignore-regex=^$') == 1
+
+    # Custom ignore.
+    with open(op.join(d, 'flag.txt'), 'w') as f:
+        f.write('abandonned\n')
+    assert cs.main(f.name, '--ignore-regex=abandonned') == 0
+
+
 def test_uri(tmpdir, capsys):
     """Test ignore regex functionality for URIs."""
     d = str(tmpdir)
@@ -463,8 +483,6 @@ def test_uri(tmpdir, capsys):
     with open(op.join(d, 'uri.txt'), 'w') as f:
         f.write('# Please see http://example.com/abandonned for info\n')
     assert cs.main(f.name) == 0
-    # Same is a typo with ignores disabled.
-    assert cs.main(f.name, '--ignore-regex=^$') == 1
 
     # Test a different protocol.
     with open(op.join(d, 'uri.txt'), 'w') as f:
@@ -505,8 +523,6 @@ def test_email(tmpdir, capsys):
     with open(op.join(d, 'email.txt'), 'w') as f:
         f.write('# Please contact abandonned@example.com for info\n')
     assert cs.main(f.name) == 0
-    # Same is a typo with ignores disabled.
-    assert cs.main(f.name, '--ignore-regex=^$') == 1
 
     # Ignoring text in domain.
     with open(op.join(d, 'email.txt'), 'w') as f:

From 9d6f0051b2cd97d04f192657ef55680de601b236 Mon Sep 17 00:00:00 2001
From: jonmeow <46229924+jonmeow@users.noreply.github.com>
Date: Fri, 24 Jul 2020 12:41:17 -0700
Subject: [PATCH 3/7] Switch to ignore-regex as a simple flag

---
 codespell_lib/_codespell.py       | 34 ++++++++-------
 codespell_lib/tests/test_basic.py | 69 ++-----------------------------
 2 files changed, 23 insertions(+), 80 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 4c66a87632..bb733fada6 100755
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -28,8 +28,6 @@
 import textwrap
 
 word_regex_def = u"[\\w\\-'’`]+"
-# Matches common URIs and email addresses, in that order.
-ignore_word_regex_def = r"(?:(?:https?|ftp|smtp):\/\/([\w-]+\.)+\w{2,}(?:/(?:[\w:/?#\[\]@!$&'()*+,;=.~-]*/?)*)?|[\w.%+-]+@[\w.-]+\.[a-z]{2,})"  # noqa: E501
 encodings = ('utf-8', 'iso-8859-1')
 USAGE = """
 \t%prog [OPTIONS] [file1 file2 ... fileN]
@@ -277,9 +275,11 @@ def parse_options(args):
                         'The default is %(default)r.')
     parser.add_argument('--ignore-regex',
                         action='store', type=str,
-                        help='regular expression which is used to find words '
-                             'to ignore. Matches URIs and emails by default. '
-                             'Can be disabled by setting to "^$".')
+                        help='regular expression which is used to find '
+                             'patterns to ignore by treating as whitespace. '
+                             'When writing regexes, consider ensuring there '
+                             'is boundary non-word chars, e.g., '
+                             '"\\Wmatch\\W". Defaults to empty/disabled.')
     parser.add_argument('-I', '--ignore-words',
                         action='append', metavar='FILE',
                         help='file that contains words which will be ignored '
@@ -497,8 +497,9 @@ def print_context(lines, index, context):
 
 
 def extract_words(text, word_regex, ignore_word_regex):
-    interesting_text = ignore_word_regex.sub(' ', text)
-    return word_regex.findall(interesting_text)
+    if ignore_word_regex:
+        text = ignore_word_regex.sub(' ', text)
+    return word_regex.findall(text)
 
 
 def parse_file(filename, colors, summary, misspellings, exclude_lines,
@@ -674,14 +675,17 @@ def main(*args):
               (word_regex, err), file=sys.stderr)
         parser.print_help()
         return EX_USAGE
-    ignore_word_regex = options.ignore_regex or ignore_word_regex_def
-    try:
-        ignore_word_regex = re.compile(ignore_word_regex)
-    except re.error as err:
-        print("ERROR: invalid regular expression \"%s\" (%s)" %
-              (ignore_word_regex, err), file=sys.stderr)
-        parser.print_help()
-        return EX_USAGE
+
+    if options.ignore_regex:
+        try:
+            ignore_word_regex = re.compile(options.ignore_regex)
+        except re.error as err:
+            print("ERROR: invalid regular expression \"%s\" (%s)" %
+                  (options.ignore_regex, err), file=sys.stderr)
+            parser.print_help()
+            return EX_USAGE
+    else:
+        ignore_word_regex = None
 
     ignore_words_files = options.ignore_words or []
     ignore_words = set()
diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py
index ad476104de..e9e76bee80 100644
--- a/codespell_lib/tests/test_basic.py
+++ b/codespell_lib/tests/test_basic.py
@@ -464,75 +464,14 @@ def test_ignore_regex_flag(tmpdir, capsys):
     assert code == EX_USAGE
     assert 'usage:' in stdout
 
-    # Empty regex matches everything.
     with open(op.join(d, 'flag.txt'), 'w') as f:
         f.write('# Please see http://example.com/abandonned for info\n')
+    # Non-matching regex results in nothing being ignored.
     assert cs.main(f.name, '--ignore-regex=^$') == 1
-
-    # Custom ignore.
-    with open(op.join(d, 'flag.txt'), 'w') as f:
-        f.write('abandonned\n')
+    # A word can be ignored.
     assert cs.main(f.name, '--ignore-regex=abandonned') == 0
-
-
-def test_uri(tmpdir, capsys):
-    """Test ignore regex functionality for URIs."""
-    d = str(tmpdir)
-
-    # Ignoring text in path.
-    with open(op.join(d, 'uri.txt'), 'w') as f:
-        f.write('# Please see http://example.com/abandonned for info\n')
-    assert cs.main(f.name) == 0
-
-    # Test a different protocol.
-    with open(op.join(d, 'uri.txt'), 'w') as f:
-        f.write('# Please see https://example.com/abandonned for info\n')
-    assert cs.main(f.name) == 0
-
-    # Ignoring text in path ending with /.
-    with open(op.join(d, 'uri.txt'), 'w') as f:
-        f.write('# Please see http://example.com/abandonned/ for info\n')
-    assert cs.main(f.name) == 0
-
-    # Ignoring text in domain.
-    with open(op.join(d, 'uri.txt'), 'w') as f:
-        f.write('# Please see http://abandonned.com/example for info\n')
-    assert cs.main(f.name) == 0
-
-    # Ignoring text in anchor.
-    with open(op.join(d, 'uri.txt'), 'w') as f:
-        f.write('# Please see http://example.com/ex#abandonned for info\n')
-    assert cs.main(f.name) == 0
-
-    # Typo because there's no protocol.
-    with open(op.join(d, 'uri.txt'), 'w') as f:
-        f.write('# Please see example.com/abandonned for info\n')
-    assert cs.main(f.name) == 1
-
-    # Typo because there aren't enough domain parts.
-    with open(op.join(d, 'uri.txt'), 'w') as f:
-        f.write('# Please see http://abandonned for info\n')
-    assert cs.main(f.name) == 1
-
-
-def test_email(tmpdir, capsys):
-    """Test ignore regex functionality for emails."""
-    d = str(tmpdir)
-
-    # Ignoring text in username.
-    with open(op.join(d, 'email.txt'), 'w') as f:
-        f.write('# Please contact abandonned@example.com for info\n')
-    assert cs.main(f.name) == 0
-
-    # Ignoring text in domain.
-    with open(op.join(d, 'email.txt'), 'w') as f:
-        f.write('# Please contact example@abandonned.com for info\n')
-    assert cs.main(f.name) == 0
-
-    # Typo because there's no TLD for an email.
-    with open(op.join(d, 'email.txt'), 'w') as f:
-        f.write('# Please contact abandonned@example for info\n')
-    assert cs.main(f.name) == 1
+    # Ignoring part of the word can result in odd behavior.
+    assert cs.main(f.name, '--ignore-regex=nn') == 0
 
 
 @contextlib.contextmanager

From da4ae3e54ca45db817b2043d60ffcd15adcefb52 Mon Sep 17 00:00:00 2001
From: jonmeow <46229924+jonmeow@users.noreply.github.com>
Date: Mon, 27 Jul 2020 11:26:15 -0700
Subject: [PATCH 4/7] Addressing comments

---
 codespell_lib/_codespell.py       | 4 ++--
 codespell_lib/tests/test_basic.py | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index bb733fada6..4fe8acc6d0 100755
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -671,7 +671,7 @@ def main(*args):
     try:
         word_regex = re.compile(word_regex)
     except re.error as err:
-        print("ERROR: invalid regular expression \"%s\" (%s)" %
+        print("ERROR: invalid --regex \"%s\" (%s)" %
               (word_regex, err), file=sys.stderr)
         parser.print_help()
         return EX_USAGE
@@ -680,7 +680,7 @@ def main(*args):
         try:
             ignore_word_regex = re.compile(options.ignore_regex)
         except re.error as err:
-            print("ERROR: invalid regular expression \"%s\" (%s)" %
+            print("ERROR: invalid --ignore-regex \"%s\" (%s)" %
                   (options.ignore_regex, err), file=sys.stderr)
             parser.print_help()
             return EX_USAGE
diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py
index e9e76bee80..c9ee90414a 100644
--- a/codespell_lib/tests/test_basic.py
+++ b/codespell_lib/tests/test_basic.py
@@ -2,6 +2,7 @@
 
 from __future__ import print_function
 
+import argparse
 import contextlib
 import inspect
 import os
@@ -466,6 +467,8 @@ def test_ignore_regex_flag(tmpdir, capsys):
 
     with open(op.join(d, 'flag.txt'), 'w') as f:
         f.write('# Please see http://example.com/abandonned for info\n')
+    # Test file has 1 invalid entry, and it's not ignored by default.
+    assert cs.main(f.name) == 1
     # Non-matching regex results in nothing being ignored.
     assert cs.main(f.name, '--ignore-regex=^$') == 1
     # A word can be ignored.

From 69ece679a511a20722e0a6eab9846f4030b4cb36 Mon Sep 17 00:00:00 2001
From: jonmeow <46229924+jonmeow@users.noreply.github.com>
Date: Mon, 27 Jul 2020 11:29:26 -0700
Subject: [PATCH 5/7] Remove accidental argparse import

---
 codespell_lib/tests/test_basic.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py
index c9ee90414a..e151ce1de6 100644
--- a/codespell_lib/tests/test_basic.py
+++ b/codespell_lib/tests/test_basic.py
@@ -2,7 +2,6 @@
 
 from __future__ import print_function
 
-import argparse
 import contextlib
 import inspect
 import os

From 5c608e018d13a01bced993d0e93fd50a5f85a3ae Mon Sep 17 00:00:00 2001
From: jonmeow <46229924+jonmeow@users.noreply.github.com>
Date: Thu, 6 Aug 2020 11:36:22 -0700
Subject: [PATCH 6/7] Add tests for empty ignore-regex

---
 codespell_lib/tests/test_basic.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py
index e151ce1de6..e0b8f8aecd 100644
--- a/codespell_lib/tests/test_basic.py
+++ b/codespell_lib/tests/test_basic.py
@@ -468,6 +468,9 @@ def test_ignore_regex_flag(tmpdir, capsys):
         f.write('# Please see http://example.com/abandonned for info\n')
     # Test file has 1 invalid entry, and it's not ignored by default.
     assert cs.main(f.name) == 1
+    # An empty regex is the default value, and nothing is ignored.
+    assert cs.main(f.name, '--ignore-regex=') == 1
+    assert cs.main(f.name, '--ignore-regex=""') == 1
     # Non-matching regex results in nothing being ignored.
     assert cs.main(f.name, '--ignore-regex=^$') == 1
     # A word can be ignored.

From 5ae1b91fae8f439fcaab2d0ed513612125a541e2 Mon Sep 17 00:00:00 2001
From: jonmeow <46229924+jonmeow@users.noreply.github.com>
Date: Mon, 10 Aug 2020 15:21:56 -0700
Subject: [PATCH 7/7] More tests

---
 codespell_lib/_codespell.py       | 2 +-
 codespell_lib/tests/test_basic.py | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 4fe8acc6d0..83ebaa4e01 100755
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -278,7 +278,7 @@ def parse_options(args):
                         help='regular expression which is used to find '
                              'patterns to ignore by treating as whitespace. '
                              'When writing regexes, consider ensuring there '
-                             'is boundary non-word chars, e.g., '
+                             'are boundary non-word chars, e.g., '
                              '"\\Wmatch\\W". Defaults to empty/disabled.')
     parser.add_argument('-I', '--ignore-words',
                         action='append', metavar='FILE',
diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py
index e0b8f8aecd..876e24ce93 100644
--- a/codespell_lib/tests/test_basic.py
+++ b/codespell_lib/tests/test_basic.py
@@ -478,6 +478,15 @@ def test_ignore_regex_flag(tmpdir, capsys):
     # Ignoring part of the word can result in odd behavior.
     assert cs.main(f.name, '--ignore-regex=nn') == 0
 
+    with open(op.join(d, 'flag.txt'), 'w') as f:
+        f.write('abandonned donn\n')
+    # Test file has 2 invalid entries.
+    assert cs.main(f.name) == 2
+    # Ignoring donn breaks them both.
+    assert cs.main(f.name, '--ignore-regex=donn') == 0
+    # Adding word breaks causes only one to be ignored.
+    assert cs.main(f.name, r'--ignore-regex=\Wdonn\W') == 1
+
 
 @contextlib.contextmanager
 def FakeStdin(text):