codespell_lib/_codespell.py

#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see
# https://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
"""
Copyright (C) 2010-2011  Lucas De Marchi <lucas.de.marchi@gmail.com>
Copyright (C) 2011  ProFUSION embedded systems
"""

import argparse
import configparser
import ctypes
import fnmatch
import itertools
import os
import re
import sys
import textwrap
from typing import (
    Any,
    Dict,
    Iterable,
    List,
    Match,
    Optional,
    Pattern,
    Sequence,
    Set,
    TextIO,
    Tuple,
)

if sys.platform == "win32":
    from ctypes import wintypes

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
    STD_OUTPUT_HANDLE = wintypes.HANDLE(-11)

from ._spellchecker import Misspelling, build_dict
from ._text_util import fix_case

# autogenerated by setuptools_scm
from ._version import (  # type: ignore[import-not-found]
    __version__ as VERSION,  # noqa: N812
)

word_regex_def = r"[\w\-'’]+"  # noqa: RUF001
# While we want to treat characters like ( or " as okay for a starting break,
# these may occur unescaped in URIs, and so we are more restrictive on the
# endpoint.  Emails are more restrictive, so the endpoint remains flexible.
uri_regex_def = (
    "(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|"
    "\\b[\\w.%+-]+@[\\w.-]+\\b)"
)
inline_ignore_regex = re.compile(r"[^\w\s]\s?codespell:ignore\b(\s+(?P<words>[\w,]*))?")
USAGE = """
\t%prog [OPTIONS] [file1 file2 ... fileN]
"""

supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
supported_languages = supported_languages_en

# Users might want to link this file into /usr/local/bin, so we resolve the
# symbolic link path to the real path if necessary.
_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
_builtin_dictionaries = (
    # name, desc, name, err in aspell, correction in aspell, \
    # err dictionary array, rep dictionary array
    # The arrays must contain the names of aspell dictionaries
    # The aspell tests here aren't the ideal state, but the None's are
    # realistic for obscure words
    ("clear", "for unambiguous errors", "", False, None, supported_languages_en, None),
    (
        "rare",
        "for rare (but valid) words that are likely to be errors",
        "_rare",
        None,
        None,
        None,
        None,
    ),
    (
        "informal",
        "for making informal words more formal",
        "_informal",
        True,
        True,
        supported_languages_en,
        supported_languages_en,
    ),
    (
        "usage",
        "for replacing phrasing with recommended terms",
        "_usage",
        None,
        None,
        None,
        None,
    ),
    (
        "code",
        "for words from code and/or mathematics that are likely to be typos in other contexts (such as uint)",  # noqa: E501
        "_code",
        None,
        None,
        None,
        None,
    ),
    (
        "names",
        "for valid proper names that might be typos",
        "_names",
        None,
        None,
        None,
        None,
    ),
    (
        "en-GB_to_en-US",
        "for corrections from en-GB to en-US",
        "_en-GB_to_en-US",
        True,
        True,
        ("en_GB",),
        ("en_US",),
    ),
)
_builtin_default = "clear,rare"

# docs say os.EX_USAGE et al. are only available on Unix systems, so to be safe
# we protect and just use the values they are on macOS and Linux
EX_OK = 0
EX_USAGE = 64
EX_DATAERR = 65
EX_CONFIG = 78

# OPTIONS:
#
# ARGUMENTS:
#    dict_filename       The file containing the dictionary of misspellings.
#                        If set to '-', it will be read from stdin
#    file1 .. fileN      Files to check spelling


class QuietLevels:
    NONE = 0
    ENCODING = 1
    BINARY_FILE = 2
    DISABLED_FIXES = 4
    NON_AUTOMATIC_FIXES = 8
    FIXES = 16
    CONFIG_FILES = 32


class GlobMatch:
    def __init__(self, pattern: List[str]) -> None:
        self.pattern_list: List[str] = pattern

    def match(self, filename: str) -> bool:
        return any(fnmatch.fnmatch(filename, p) for p in self.pattern_list)


class TermColors:
    def __init__(self) -> None:
        self.FILE = "\033[33m"
        self.WWORD = "\033[31m"
        self.FWORD = "\033[32m"
        self.DISABLE = "\033[0m"

    def disable(self) -> None:
        self.FILE = ""
        self.WWORD = ""
        self.FWORD = ""
        self.DISABLE = ""


class Summary:
    def __init__(self) -> None:
        self.summary: Dict[str, int] = {}

    def update(self, wrongword: str) -> None:
        if wrongword in self.summary:
            self.summary[wrongword] += 1
        else:
            self.summary[wrongword] = 1

    def __str__(self) -> str:
        keys = list(self.summary.keys())
        keys.sort()

        return "\n".join(
            [f"{key}{self.summary.get(key):{15 - len(key)}}" for key in keys]
        )


class FileOpener:
    def __init__(
        self,
        use_chardet: bool,
        quiet_level: int,
        ignore_multiline_regex: Optional[Pattern[str]],
    ) -> None:
        self.use_chardet = use_chardet
        if use_chardet:
            self.init_chardet()
        self.quiet_level = quiet_level
        self.ignore_multiline_regex = ignore_multiline_regex

    def init_chardet(self) -> None:
        try:
            from chardet.universaldetector import UniversalDetector
        except ImportError as e:
            msg = (
                "There's no chardet installed to import from. "
                "Please, install it and check your PYTHONPATH "
                "environment variable"
            )
            raise ImportError(msg) from e

        self.encdetector = UniversalDetector()

    def open(self, filename: str) -> Tuple[List[str], str]:
        if self.use_chardet:
            return self.open_with_chardet(filename)
        return self.open_with_internal(filename)

    def open_with_chardet(self, filename: str) -> Tuple[List[str], str]:
        self.encdetector.reset()
        with open(filename, "rb") as fb:
            for line in fb:
                self.encdetector.feed(line)
                if self.encdetector.done:
                    break
        self.encdetector.close()
        encoding = self.encdetector.result["encoding"]

        try:
            f = open(filename, encoding=encoding, newline="")
        except UnicodeDecodeError:
            print(f"ERROR: Could not detect encoding: {filename}", file=sys.stderr)
            raise
        except LookupError:
            print(
                f"ERROR: Don't know how to handle encoding {encoding}: {filename}",
                file=sys.stderr,
            )
            raise
        else:
            lines = self.get_lines(f)
            f.close()

        return lines, f.encoding

    def open_with_internal(self, filename: str) -> Tuple[List[str], str]:
        encoding = None
        first_try = True
        for encoding in ("utf-8", "iso-8859-1"):
            if first_try:
                first_try = False
            elif not self.quiet_level & QuietLevels.ENCODING:
                print(f'WARNING: Trying next encoding "{encoding}"', file=sys.stderr)
            with open(filename, encoding=encoding, newline="") as f:
                try:
                    lines = self.get_lines(f)
                except UnicodeDecodeError:
                    if not self.quiet_level & QuietLevels.ENCODING:
                        print(
                            f'WARNING: Cannot decode file using encoding "{encoding}": '
                            f"{filename}",
                            file=sys.stderr,
                        )
                else:
                    break
        else:
            # reading with encoding "iso-8859-1" cannot fail with UnicodeDecodeError
            msg = "Unknown encoding"
            raise RuntimeError(msg)  # pragma: no cover

        return lines, encoding

    def get_lines(self, f: TextIO) -> List[str]:
        if self.ignore_multiline_regex:
            text = f.read()
            pos = 0
            text2 = ""
            for m in re.finditer(self.ignore_multiline_regex, text):
                text2 += text[pos : m.start()]
                # Replace with blank lines so line numbers are unchanged.
                text2 += "\n" * m.group().count("\n")
                pos = m.end()
            text2 += text[pos:]
            lines = text2.split("\n")
        else:
            lines = f.readlines()
        return lines


# -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-


# If someday this breaks, we can just switch to using RawTextHelpFormatter,
# but it has the disadvantage of not wrapping our long lines.


class NewlineHelpFormatter(argparse.HelpFormatter):
    """Help formatter that preserves newlines and deals with lists."""

    def _split_lines(self, text: str, width: int) -> List[str]:
        parts = text.split("\n")
        out = []
        for part in parts:
            # Eventually we could allow others...
            indent_start = "- "
            offset = len(indent_start) if part.startswith(indent_start) else 0
            part = part[offset:]
            part = self._whitespace_matcher.sub(" ", part).strip()
            parts = textwrap.wrap(part, width - offset)
            parts = [" " * offset + p for p in parts]
            if offset:
                parts[0] = indent_start + parts[0][offset:]
            out.extend(parts)
        return out


def _toml_to_parseconfig(toml_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a dict read from a TOML file to the parseconfig.read_dict() format."""
    return {
        k: "" if v is True else ",".join(v) if isinstance(v, list) else v
        for k, v in toml_dict.items()
        if v is not False
    }


def _supports_ansi_colors() -> bool:
    if sys.platform == "win32":
        # Windows Terminal enables ANSI escape codes by default. In other cases
        # it is disabled.
        # See https://ss64.com/nt/syntax-ansi.html for more information.
        kernel32 = ctypes.WinDLL("kernel32")

        # fmt: off
        kernel32.GetConsoleMode.argtypes = (
            wintypes.HANDLE,   # _In_  hConsoleHandle
            wintypes.LPDWORD,  # _Out_ lpMode
        )
        # fmt: on
        kernel32.GetConsoleMode.restype = wintypes.BOOL

        mode = wintypes.DWORD()
        handle = kernel32.GetStdHandle(STD_OUTPUT_HANDLE)
        if not kernel32.GetConsoleMode(handle, ctypes.byref(mode)):
            # TODO: print a warning with the error message on stderr?
            return False

        return (mode.value & ENABLE_VIRTUAL_TERMINAL_PROCESSING) != 0
    elif sys.platform == "wasi":
        # WASI disables ANSI escape codes for security reasons.
        # See https://github.com/WebAssembly/WASI/issues/162.
        return False
    elif sys.stdout.isatty():
        return True

    return False


def parse_options(
    args: Sequence[str],
) -> Tuple[argparse.Namespace, argparse.ArgumentParser, List[str]]:
    parser = argparse.ArgumentParser(formatter_class=NewlineHelpFormatter)

    parser.set_defaults(colors=_supports_ansi_colors())
    parser.add_argument("--version", action="version", version=VERSION)

    parser.add_argument(
        "-d",
        "--disable-colors",
        action="store_false",
        dest="colors",
        help="disable colors, even when printing to terminal",
    )
    parser.add_argument(
        "-c",
        "--enable-colors",
        action="store_true",
        dest="colors",
        help="enable colors, even when not printing to terminal",
    )

    parser.add_argument(
        "-w",
        "--write-changes",
        action="store_true",
        default=False,
        help="write changes in place if possible",
    )

    parser.add_argument(
        "-D",
        "--dictionary",
        action="append",
        help="comma-separated list of custom dictionary files that "
        "contain spelling corrections. If this flag is not specified "
        'or equals "-" then the default dictionary is used.',
    )
    builtin_opts = "\n- ".join(
        [""] + [f"{d[0]!r} {d[1]}" for d in _builtin_dictionaries]
    )
    parser.add_argument(
        "--builtin",
        dest="builtin",
        default=_builtin_default,
        metavar="BUILTIN-LIST",
        help="comma-separated list of builtin dictionaries "
        'to include (when "-D -" or no "-D" is passed). '
        "Current options are:" + builtin_opts + "\n"
        "The default is %(default)r.",
    )
    parser.add_argument(
        "--ignore-regex",
        action="store",
        type=str,
        help="regular expression that is used to find "
        "patterns to ignore by treating as whitespace. "
        "When writing regular expressions, consider "
        "ensuring there are boundary non-word chars, "
        'e.g., "\\bmatch\\b". Defaults to '
        "empty/disabled.",
    )
    parser.add_argument(
        "--ignore-multiline-regex",
        action="store",
        type=str,
        help="regular expression that is used to ignore "
        "text that may span multi-line regions. "
        "The regex is run with re.DOTALL. For example to "
        "allow skipping of regions of Python code using "
        "begin/end comments one could use: "
        "--ignore-multiline-regex "
        "'# codespell:ignore-begin *\\n.*# codespell:ignore-end *\\n'. "
        "Defaults to empty/disabled.",
    )
    parser.add_argument(
        "-I",
        "--ignore-words",
        action="append",
        metavar="FILES",
        help="comma-separated list of files that contain "
        "words to be ignored by codespell. Files must contain "
        "1 word per line. Words are case sensitive based on "
        "how they are written in the dictionary file.",
    )
    parser.add_argument(
        "-L",
        "--ignore-words-list",
        action="append",
        metavar="WORDS",
        help="comma-separated list of words to be ignored "
        "by codespell. Words are case sensitive based on "
        "how they are written in the dictionary file.",
    )
    parser.add_argument(
        "--uri-ignore-words-list",
        action="append",
        metavar="WORDS",
        help="comma-separated list of words to be ignored "
        "by codespell in URIs and emails only. Words are "
        "case sensitive based on how they are written in "
        'the dictionary file. If set to "*", all '
        "misspelling in URIs and emails will be ignored.",
    )
    parser.add_argument(
        "-r",
        "--regex",
        action="store",
        type=str,
        help="regular expression that is used to find words. "
        "By default any alphanumeric character, the "
        "underscore, the hyphen, and the apostrophe are "
        "used to build words. This option cannot be "
        "specified together with --write-changes.",
    )
    parser.add_argument(
        "--uri-regex",
        action="store",
        type=str,
        help="regular expression that is used to find URIs "
        "and emails. A default expression is provided.",
    )
    parser.add_argument(
        "-s",
        "--summary",
        action="store_true",
        default=False,
        help="print summary of fixes",
    )

    parser.add_argument(
        "--count",
        action="store_true",
        default=False,
        help="print the number of errors as the last line of stderr",
    )

    parser.add_argument(
        "-S",
        "--skip",
        action="append",
        help="comma-separated list of files to skip. It "
        "accepts globs as well. E.g.: if you want "
        "codespell to skip .eps and .txt files, "
        'you\'d give "*.eps,*.txt" to this option.',
    )

    parser.add_argument(
        "-x",
        "--exclude-file",
        action="append",
        type=str,
        metavar="FILES",
        help="ignore whole lines that match those in "
        "the comma-separated list of files EXCLUDE. "
        "The lines in these files should match the "
        "to-be-excluded lines exactly",
    )

    parser.add_argument(
        "-i",
        "--interactive",
        action="store",
        type=int,
        default=0,
        choices=range(0, 4),
        help="set interactive mode when writing changes:\n"
        "- 0: no interactivity.\n"
        "- 1: ask for confirmation.\n"
        "- 2: ask user to choose one fix when more than one is available.\n"
        "- 3: both 1 and 2",
        metavar="MODE",
    )

    parser.add_argument(
        "-q",
        "--quiet-level",
        action="store",
        type=int,
        default=34,
        choices=range(0, 64),
        help="bitmask that allows suppressing messages:\n"
        "- 0: print all messages.\n"
        "- 1: disable warnings about wrong encoding.\n"
        "- 2: disable warnings about binary files.\n"
        "- 4: omit warnings about automatic fixes that were disabled in the dictionary.\n"  # noqa: E501
        "- 8: don't print anything for non-automatic fixes.\n"
        "- 16: don't print the list of fixed files.\n"
        "- 32: don't print configuration files.\n"
        "As usual with bitmasks, these levels can be "
        "combined; e.g. use 3 for levels 1+2, 7 for "
        "1+2+4, 23 for 1+2+4+16, etc. "
        "The default mask is %(default)s.",
        metavar="LEVEL",
    )

    parser.add_argument(
        "-e",
        "--hard-encoding-detection",
        action="store_true",
        default=False,
        help="use chardet to detect the encoding of each "
        "file. This can slow down codespell, but is more "
        "reliable in detecting encodings other than "
        "utf-8, iso8859-1, and ascii.",
    )

    parser.add_argument(
        "-f",
        "--check-filenames",
        action="store_true",
        default=False,
        help="check file names as well",
    )

    parser.add_argument(
        "-H",
        "--check-hidden",
        action="store_true",
        default=False,
        help='check hidden files and directories (those starting with ".") as well.',
    )
    parser.add_argument(
        "-A",
        "--after-context",
        type=int,
        metavar="LINES",
        help="print LINES of trailing context",
    )
    parser.add_argument(
        "-B",
        "--before-context",
        type=int,
        metavar="LINES",
        help="print LINES of leading context",
    )
    parser.add_argument(
        "-C",
        "--context",
        type=int,
        metavar="LINES",
        help="print LINES of surrounding context",
    )
    parser.add_argument(
        "--stdin-single-line",
        action="store_true",
        help="output just a single line for each misspelling in stdin mode",
    )
    parser.add_argument("--config", type=str, help="path to config file.")
    parser.add_argument("--toml", type=str, help="path to a pyproject.toml file.")
    parser.add_argument("files", nargs="*", help="files or directories to check")

    # Parse command line options.
    options = parser.parse_args(list(args))

    # Load config files and look for ``codespell`` options.
    cfg_files = ["setup.cfg", ".codespellrc"]
    if options.config:
        cfg_files.append(options.config)
    config = configparser.ConfigParser(interpolation=None)

    # Read toml before other config files.
    toml_files = []
    tomllib_raise_error = False
    if os.path.isfile("pyproject.toml"):
        toml_files.append("pyproject.toml")
    if options.toml:
        toml_files.append(options.toml)
        tomllib_raise_error = True
    if toml_files:
        if sys.version_info >= (3, 11):
            import tomllib
        else:
            try:
                import tomli as tomllib  # type: ignore[no-redef]
            except ImportError as e:
                if tomllib_raise_error:
                    msg = (
                        f"tomllib or tomli are required to read pyproject.toml "
                        f"but could not be imported, got: {e}"
                    )
                    raise ImportError(msg) from None
                tomllib = None  # type: ignore[assignment]
        if tomllib is not None:
            for toml_file in toml_files:
                with open(toml_file, "rb") as f:
                    data = tomllib.load(f).get("tool", {})
                if "codespell" in data:
                    data["codespell"] = _toml_to_parseconfig(data["codespell"])
                config.read_dict(data)

    # Collect which config files are going to be used
    used_cfg_files = []
    for cfg_file in cfg_files:
        _cfg = configparser.ConfigParser()
        _cfg.read(cfg_file)
        if _cfg.has_section("codespell"):
            used_cfg_files.append(cfg_file)

    # Use config files
    config.read(used_cfg_files)
    if config.has_section("codespell"):
        # Build a "fake" argv list using option name and value.
        cfg_args = []
        for key in config["codespell"]:
            # Add option as arg.
            cfg_args.append(f"--{key}")
            # If value is blank, skip.
            val = config["codespell"][key]
            if val:
                cfg_args.append(val)

        # Parse config file options.
        options = parser.parse_args(cfg_args)

        # Re-parse command line options to override config.
        options = parser.parse_args(list(args), namespace=options)

    if not options.files:
        options.files.append(".")

    return options, parser, used_cfg_files


def process_ignore_words(
    words: Iterable[str], ignore_words: Set[str], ignore_words_cased: Set[str]
) -> None:
    for word in words:
        word = word.strip()
        if word == word.lower():
            ignore_words.add(word)
        else:
            ignore_words_cased.add(word)


def parse_ignore_words_option(
    ignore_words_option: List[str],
) -> Tuple[Set[str], Set[str]]:
    ignore_words: Set[str] = set()
    ignore_words_cased: Set[str] = set()
    if ignore_words_option:
        for comma_separated_words in ignore_words_option:
            process_ignore_words(
                (word.strip() for word in comma_separated_words.split(",")),
                ignore_words,
                ignore_words_cased,
            )
    return (ignore_words, ignore_words_cased)


def build_exclude_hashes(filename: str, exclude_lines: Set[str]) -> None:
    with open(filename, encoding="utf-8") as f:
        exclude_lines.update(line.rstrip() for line in f)


def build_ignore_words(
    filename: str, ignore_words: Set[str], ignore_words_cased: Set[str]
) -> None:
    with open(filename, encoding="utf-8") as f:
        process_ignore_words(
            (line.strip() for line in f), ignore_words, ignore_words_cased
        )


def is_hidden(filename: str, check_hidden: bool) -> bool:
    bfilename = os.path.basename(filename)

    return bfilename not in ("", ".", "..") and (
        not check_hidden and bfilename[0] == "."
    )


def is_text_file(filename: str) -> bool:
    with open(filename, mode="rb") as f:
        s = f.read(1024)
    return b"\x00" not in s


def ask_for_word_fix(
    line: str,
    match: Match[str],
    misspelling: Misspelling,
    interactivity: int,
    colors: TermColors,
) -> Tuple[bool, str]:
    wrongword = match.group()
    if interactivity <= 0:
        return misspelling.fix, fix_case(wrongword, misspelling.data)

    line_ui = (
        f"{line[:match.start()]}"
        f"{colors.WWORD}{wrongword}{colors.DISABLE}"
        f"{line[match.end():]}"
    )

    if misspelling.fix and interactivity & 1:
        r = ""
        fixword = fix_case(wrongword, misspelling.data)
        while not r:
            print(f"{line_ui}\t{wrongword} ==> {fixword} (Y/n) ", end="", flush=True)
            r = sys.stdin.readline().strip().upper()
            if not r:
                r = "Y"
            if r not in ("Y", "N"):
                print("Say 'y' or 'n'")
                r = ""

        if r == "N":
            misspelling.fix = False

    elif (interactivity & 2) and not misspelling.reason:
        # if it is not disabled, i.e. it just has more than one possible fix,
        # we ask the user which word to use

        r = ""
        opt = [w.strip() for w in misspelling.data.split(",")]
        while not r:
            print(f"{line_ui} Choose an option (blank for none): ", end="")
            for i, o in enumerate(opt):
                fixword = fix_case(wrongword, o)
                print(f" {i}) {fixword}", end="")
            print(": ", end="", flush=True)

            n = sys.stdin.readline().strip()
            if not n:
                break

            try:
                i = int(n)
                r = opt[i]
            except (ValueError, IndexError):
                print("Not a valid option\n")

        if r:
            misspelling.fix = True
            misspelling.data = r

    return misspelling.fix, fix_case(wrongword, misspelling.data)


def print_context(
    lines: List[str],
    index: int,
    context: Tuple[int, int],
) -> None:
    # context = (context_before, context_after)
    for i in range(index - context[0], index + context[1] + 1):
        if 0 <= i < len(lines):
            print(f"{'>' if i == index else ':'} {lines[i].rstrip()}")


def _ignore_word_sub(
    text: str,
    ignore_word_regex: Optional[Pattern[str]],
) -> str:
    if ignore_word_regex:
        text = ignore_word_regex.sub(" ", text)
    return text


def extract_words(
    text: str,
    word_regex: Pattern[str],
    ignore_word_regex: Optional[Pattern[str]],
) -> List[str]:
    return word_regex.findall(_ignore_word_sub(text, ignore_word_regex))


def extract_words_iter(
    text: str,
    word_regex: Pattern[str],
    ignore_word_regex: Optional[Pattern[str]],
) -> List[Match[str]]:
    return list(word_regex.finditer(_ignore_word_sub(text, ignore_word_regex)))


def apply_uri_ignore_words(
    check_matches: List[Match[str]],
    line: str,
    word_regex: Pattern[str],
    ignore_word_regex: Optional[Pattern[str]],
    uri_regex: Pattern[str],
    uri_ignore_words: Set[str],
) -> List[Match[str]]:
    if not uri_ignore_words:
        return check_matches
    for uri in uri_regex.findall(line):
        for uri_word in extract_words(uri, word_regex, ignore_word_regex):
            if uri_word in uri_ignore_words:
                # determine/remove only the first among matches
                for i, match in enumerate(check_matches):
                    if match.group() == uri_word:
                        check_matches = check_matches[:i] + check_matches[i + 1 :]
                        break
    return check_matches


def parse_file(
    filename: str,
    colors: TermColors,
    summary: Optional[Summary],
    misspellings: Dict[str, Misspelling],
    ignore_words_cased: Set[str],
    exclude_lines: Set[str],
    file_opener: FileOpener,
    word_regex: Pattern[str],
    ignore_word_regex: Optional[Pattern[str]],
    uri_regex: Pattern[str],
    uri_ignore_words: Set[str],
    context: Optional[Tuple[int, int]],
    options: argparse.Namespace,
) -> int:
    bad_count = 0
    lines = None
    changed = False

    if filename == "-":
        f = sys.stdin
        encoding = "utf-8"
        lines = f.readlines()
    else:
        if options.check_filenames:
            for word in extract_words(filename, word_regex, ignore_word_regex):
                if word in ignore_words_cased:
                    continue
                lword = word.lower()
                if lword not in misspellings:
                    continue
                fix = misspellings[lword].fix
                fixword = fix_case(word, misspellings[lword].data)

                if summary and fix:
                    summary.update(lword)

                cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
                cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
                crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"

                reason = misspellings[lword].reason
                if reason:
                    if options.quiet_level & QuietLevels.DISABLED_FIXES:
                        continue
                    creason = f"  | {colors.FILE}{reason}{colors.DISABLE}"
                else:
                    if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
                        continue
                    creason = ""

                bad_count += 1

                print(f"{cfilename}: {cwrongword} ==> {crightword}{creason}")

        # ignore irregular files
        if not os.path.isfile(filename):
            return bad_count

        try:
            text = is_text_file(filename)
        except PermissionError as e:
            print(f"WARNING: {e.strerror}: {filename}", file=sys.stderr)
            return bad_count
        except OSError:
            return bad_count

        if not text:
            if not options.quiet_level & QuietLevels.BINARY_FILE:
                print(f"WARNING: Binary file: {filename}", file=sys.stderr)
            return bad_count
        try:
            lines, encoding = file_opener.open(filename)
        except OSError:
            return bad_count

    for i, line in enumerate(lines):
        if line.rstrip() in exclude_lines:
            continue

        extra_words_to_ignore = set()
        match = inline_ignore_regex.search(line)
        if match:
            extra_words_to_ignore = set(
                filter(None, (match.group("words") or "").split(","))
            )
            if not extra_words_to_ignore:
                continue

        fixed_words = set()
        asked_for = set()

        # If all URI spelling errors will be ignored, erase any URI before
        # extracting words. Otherwise, apply ignores after extracting words.
        # This ensures that if a URI ignore word occurs both inside a URI and
        # outside, it will still be a spelling error.
        if "*" in uri_ignore_words:
            line = uri_regex.sub(" ", line)
        check_matches = extract_words_iter(line, word_regex, ignore_word_regex)
        if "*" not in uri_ignore_words:
            check_matches = apply_uri_ignore_words(
                check_matches,
                line,
                word_regex,
                ignore_word_regex,
                uri_regex,
                uri_ignore_words,
            )
        for match in check_matches:
            word = match.group()
            if word in ignore_words_cased:
                continue
            lword = word.lower()
            if lword in misspellings and lword not in extra_words_to_ignore:
                # Sometimes we find a 'misspelling' which is actually a valid word
                # preceded by a string escape sequence.  Ignore such cases as
                # they're usually false alarms; see issue #17 among others.
                char_before_idx = match.start() - 1
                if (
                    char_before_idx >= 0
                    and line[char_before_idx] == "\\"
                    # bell, backspace, formfeed, newline, carriage-return, tab, vtab.
                    and word.startswith(("a", "b", "f", "n", "r", "t", "v"))
                    and lword[1:] not in misspellings
                ):
                    continue

                context_shown = False
                fix = misspellings[lword].fix
                fixword = fix_case(word, misspellings[lword].data)

                if options.interactive and lword not in asked_for:
                    if context is not None:
                        context_shown = True
                        print_context(lines, i, context)
                    fix, fixword = ask_for_word_fix(
                        lines[i],
                        match,
                        misspellings[lword],
                        options.interactive,
                        colors=colors,
                    )
                    asked_for.add(lword)

                if summary and fix:
                    summary.update(lword)

                if word in fixed_words:  # can skip because of re.sub below
                    continue

                if options.write_changes and fix:
                    changed = True
                    lines[i] = re.sub(rf"\b{word}\b", fixword, lines[i])
                    fixed_words.add(word)
                    continue

                # otherwise warning was explicitly set by interactive mode
                if (
                    options.interactive & 2
                    and not fix
                    and not misspellings[lword].reason
                ):
                    continue

                cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
                cline = f"{colors.FILE}{i + 1}{colors.DISABLE}"
                cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
                crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"

                reason = misspellings[lword].reason
                if reason:
                    if options.quiet_level & QuietLevels.DISABLED_FIXES:
                        continue
                    creason = f"  | {colors.FILE}{reason}{colors.DISABLE}"
                else:
                    if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
                        continue
                    creason = ""

                # If we get to this point (uncorrected error) we should change
                # our bad_count and thus return value
                bad_count += 1

                if (not context_shown) and (context is not None):
                    print_context(lines, i, context)
                if filename != "-":
                    print(
                        f"{cfilename}:{cline}: {cwrongword} "
                        f"==> {crightword}{creason}"
                    )
                elif options.stdin_single_line:
                    print(f"{cline}: {cwrongword} ==> {crightword}{creason}")
                else:
                    print(
                        f"{cline}: {line.strip()}\n\t{cwrongword} "
                        f"==> {crightword}{creason}"
                    )

    if changed:
        if filename == "-":
            print("---")
            for line in lines:
                print(line, end="")
        else:
            if not options.quiet_level & QuietLevels.FIXES:
                print(
                    f"{colors.FWORD}FIXED:{colors.DISABLE} {filename}",
                    file=sys.stderr,
                )
            with open(filename, "w", encoding=encoding, newline="") as f:
                f.writelines(lines)
    return bad_count


def flatten_clean_comma_separated_arguments(
    arguments: Iterable[str],
) -> List[str]:
    """
    >>> flatten_clean_comma_separated_arguments(["a, b ,\n c, d,", "e"])
    ['a', 'b', 'c', 'd', 'e']
    >>> flatten_clean_comma_separated_arguments([])
    []
    """
    return [
        item.strip() for argument in arguments for item in argument.split(",") if item
    ]


def _script_main() -> int:
    """Wrap to main() for setuptools."""
    try:
        return main(*sys.argv[1:])
    except KeyboardInterrupt:
        # User has typed CTRL+C
        sys.stdout.write("\n")
        return 130


def _usage_error(parser: argparse.ArgumentParser, message: str) -> int:
    parser.print_usage()
    print(message, file=sys.stderr)
    return EX_USAGE


def main(*args: str) -> int:
    """Contains flow control"""
    try:
        options, parser, used_cfg_files = parse_options(args)
    except configparser.Error as e:
        print(
            f"ERROR: ill-formed config file: {e.message}",
            file=sys.stderr,
        )
        return EX_CONFIG

    # Report used config files
    if not options.quiet_level & QuietLevels.CONFIG_FILES:
        if len(used_cfg_files) > 0:
            print("Used config files:")
        for ifile, cfg_file in enumerate(used_cfg_files, start=1):
            print(f"    {ifile}: {cfg_file}")

    if options.interactive > 0:
        options.write_changes = True

    if options.regex and options.write_changes:
        return _usage_error(
            parser,
            "ERROR: --write-changes cannot be used together with --regex",
        )
    word_regex = options.regex or word_regex_def
    try:
        word_regex = re.compile(word_regex)
    except re.error as e:
        return _usage_error(
            parser,
            f'ERROR: invalid --regex "{word_regex}" ({e})',
        )

    if options.ignore_regex:
        try:
            ignore_word_regex = re.compile(options.ignore_regex)
        except re.error as e:
            return _usage_error(
                parser,
                f'ERROR: invalid --ignore-regex "{options.ignore_regex}" ({e})',
            )
    else:
        ignore_word_regex = None

    if options.ignore_multiline_regex:
        try:
            ignore_multiline_regex = re.compile(
                options.ignore_multiline_regex, re.DOTALL
            )
        except re.error as e:
            return _usage_error(
                parser,
                f"ERROR: invalid --ignore-multiline-regex "
                f'"{options.ignore_multiline_regex}" ({e})',
            )
    else:
        ignore_multiline_regex = None

    ignore_words, ignore_words_cased = parse_ignore_words_option(
        options.ignore_words_list
    )
    if options.ignore_words:
        ignore_words_files = flatten_clean_comma_separated_arguments(
            options.ignore_words
        )
        for ignore_words_file in ignore_words_files:
            if not os.path.isfile(ignore_words_file):
                return _usage_error(
                    parser,
                    f"ERROR: cannot find ignore-words file: {ignore_words_file}",
                )
            build_ignore_words(ignore_words_file, ignore_words, ignore_words_cased)

    uri_regex = options.uri_regex or uri_regex_def
    try:
        uri_regex = re.compile(uri_regex)
    except re.error as e:
        return _usage_error(
            parser,
            f'ERROR: invalid --uri-regex "{uri_regex}" ({e})',
        )

    uri_ignore_words = set(
        itertools.chain(*parse_ignore_words_option(options.uri_ignore_words_list))
    )

    dictionaries = flatten_clean_comma_separated_arguments(options.dictionary or ["-"])

    use_dictionaries = []
    for dictionary in dictionaries:
        if dictionary == "-":
            # figure out which builtin dictionaries to use
            use = sorted(set(options.builtin.split(",")))
            for u in use:
                for builtin in _builtin_dictionaries:
                    if builtin[0] == u:
                        use_dictionaries.append(
                            os.path.join(_data_root, f"dictionary{builtin[2]}.txt")
                        )
                        break
                else:
                    return _usage_error(
                        parser,
                        f"ERROR: Unknown builtin dictionary: {u}",
                    )
        else:
            if not os.path.isfile(dictionary):
                return _usage_error(
                    parser,
                    f"ERROR: cannot find dictionary file: {dictionary}",
                )
            use_dictionaries.append(dictionary)
    misspellings: Dict[str, Misspelling] = {}
    for dictionary in use_dictionaries:
        build_dict(dictionary, misspellings, ignore_words)
    colors = TermColors()
    if not options.colors:
        colors.disable()

    summary = Summary() if options.summary else None

    context = None
    if options.context is not None:
        if (options.before_context is not None) or (options.after_context is not None):
            return _usage_error(
                parser,
                "ERROR: --context/-C cannot be used together with "
                "--context-before/-B or --context-after/-A",
            )
        context_both = max(0, options.context)
        context = (context_both, context_both)
    elif (options.before_context is not None) or (options.after_context is not None):
        context_before = 0
        context_after = 0
        if options.before_context is not None:
            context_before = max(0, options.before_context)
        if options.after_context is not None:
            context_after = max(0, options.after_context)
        context = (context_before, context_after)

    exclude_lines: Set[str] = set()
    if options.exclude_file:
        exclude_files = flatten_clean_comma_separated_arguments(options.exclude_file)
        for exclude_file in exclude_files:
            build_exclude_hashes(exclude_file, exclude_lines)

    file_opener = FileOpener(
        options.hard_encoding_detection,
        options.quiet_level,
        ignore_multiline_regex,
    )

    glob_match = GlobMatch(
        flatten_clean_comma_separated_arguments(options.skip) if options.skip else []
    )
    try:
        glob_match.match("/random/path")  # does not need a real path
    except re.error:
        return _usage_error(
            parser,
            "ERROR: --skip/-S has been fed an invalid glob, "
            "try escaping special characters",
        )

    bad_count = 0
    for filename in sorted(options.files):
        # ignore hidden files
        if is_hidden(filename, options.check_hidden):
            continue

        if os.path.isdir(filename):
            for root, dirs, files in os.walk(filename):
                if glob_match.match(root):  # skip (absolute) directories
                    dirs.clear()
                    continue
                if is_hidden(root, options.check_hidden):  # dir itself hidden
                    continue
                for file_ in sorted(files):
                    # ignore hidden files in directories
                    if is_hidden(file_, options.check_hidden):
                        continue
                    if glob_match.match(file_):  # skip files
                        continue
                    fname = os.path.join(root, file_)
                    if glob_match.match(fname):  # skip paths
                        continue
                    bad_count += parse_file(
                        fname,
                        colors,
                        summary,
                        misspellings,
                        ignore_words_cased,
                        exclude_lines,
                        file_opener,
                        word_regex,
                        ignore_word_regex,
                        uri_regex,
                        uri_ignore_words,
                        context,
                        options,
                    )

                # skip (relative) directories
                dirs[:] = [
                    dir_
                    for dir_ in dirs
                    if not glob_match.match(dir_)
                    and not is_hidden(dir_, options.check_hidden)
                ]

        elif not glob_match.match(filename):  # skip files
            bad_count += parse_file(
                filename,
                colors,
                summary,
                misspellings,
                ignore_words_cased,
                exclude_lines,
                file_opener,
                word_regex,
                ignore_word_regex,
                uri_regex,
                uri_ignore_words,
                context,
                options,
            )

    if summary:
        print("\n-------8<-------\nSUMMARY:")
        print(summary)
    if options.count:
        print(bad_count, file=sys.stderr)
    return EX_DATAERR if bad_count else EX_OK