From 96bd93071f9e6ce5e7fd5e5c41f234d302c8611b Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Sun, 12 May 2024 21:33:55 -0400 Subject: [PATCH] fix: rework exclusion parsing to fix #1779 --- CHANGES.rst | 4 ++ coverage/parser.py | 118 ++++++++++++++++++----------------------- coverage/python.py | 6 ++- lab/parser.py | 22 +++++--- tests/test_coverage.py | 4 +- tests/test_parser.py | 51 ++++++++++-------- 6 files changed, 103 insertions(+), 102 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 9a61b6657..600d21ee8 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -23,6 +23,9 @@ upgrading your version of coverage.py. Unreleased ---------- +- Fix: nested matches of exclude patterns could exclude too much code, as + reported in `issue 1779`_. This is now fixed. + - In the HTML report, the filter term and "hide covered" checkbox settings are remembered between viewings, thanks to `Daniel Diniz `_. @@ -30,6 +33,7 @@ Unreleased .. _pull 1776: https://github.com/nedbat/coveragepy/pull/1776 +.. _issue 1779: https://github.com/nedbat/coveragepy/issues/1779 .. scriv-start-here diff --git a/coverage/parser.py b/coverage/parser.py index 7842b36c9..2b4e60aa8 100644 --- a/coverage/parser.py +++ b/coverage/parser.py @@ -25,7 +25,7 @@ from coverage.bytecode import code_objects from coverage.debug import short_stack from coverage.exceptions import NoSource, NotPython -from coverage.misc import join_regex, nice_pair +from coverage.misc import nice_pair from coverage.phystokens import generate_tokens from coverage.types import TArc, TLineNo @@ -62,8 +62,8 @@ def __init__( self.exclude = exclude - # The text lines of the parsed code. - self.lines: list[str] = self.text.split("\n") + # The parsed AST of the text. + self._ast_root: ast.AST | None = None # The normalized line numbers of the statements in the code. Exclusions # are taken into account, and statements are adjusted to their first @@ -101,19 +101,16 @@ def __init__( self._all_arcs: set[TArc] | None = None self._missing_arc_fragments: TArcFragments | None = None - @functools.lru_cache() - def lines_matching(self, *regexes: str) -> set[TLineNo]: - """Find the lines matching one of a list of regexes. + def lines_matching(self, regex: str) -> set[TLineNo]: + """Find the lines matching a regex. - Returns a set of line numbers, the lines that contain a match for one - of the regexes in `regexes`. The entire line needn't match, just a - part of it. + Returns a set of line numbers, the lines that contain a match for + `regex`. The entire line needn't match, just a part of it. """ - combined = join_regex(regexes) - regex_c = re.compile(combined) + regex_c = re.compile(regex) matches = set() - for i, ltext in enumerate(self.lines, start=1): + for i, ltext in enumerate(self.text.split("\n"), start=1): if regex_c.search(ltext): matches.add(self._multiline.get(i, i)) return matches @@ -127,26 +124,18 @@ def _raw_parse(self) -> None: # Find lines which match an exclusion pattern. if self.exclude: self.raw_excluded = self.lines_matching(self.exclude) + self.excluded = set(self.raw_excluded) - # Tokenize, to find excluded suites, to find docstrings, and to find - # multi-line statements. - - # The last token seen. Start with INDENT to get module docstrings - prev_toktype: int = token.INDENT # The current number of indents. indent: int = 0 # An exclusion comment will exclude an entire clause at this indent. exclude_indent: int = 0 # Are we currently excluding lines? excluding: bool = False - # Are we excluding decorators now? - excluding_decorators: bool = False # The line number of the first line in a multi-line statement. first_line: int = 0 # Is the file empty? empty: bool = True - # Is this the first token on a line? - first_on_line: bool = True # Parenthesis (and bracket) nesting level. nesting: int = 0 @@ -162,42 +151,22 @@ def _raw_parse(self) -> None: indent += 1 elif toktype == token.DEDENT: indent -= 1 - elif toktype == token.NAME: - if ttext == "class": - # Class definitions look like branches in the bytecode, so - # we need to exclude them. The simplest way is to note the - # lines with the "class" keyword. - self.raw_classdefs.add(slineno) elif toktype == token.OP: if ttext == ":" and nesting == 0: should_exclude = ( - self.raw_excluded.intersection(range(first_line, elineno + 1)) - or excluding_decorators + self.excluded.intersection(range(first_line, elineno + 1)) ) if not excluding and should_exclude: # Start excluding a suite. We trigger off of the colon # token so that the #pragma comment will be recognized on # the same line as the colon. - self.raw_excluded.add(elineno) + self.excluded.add(elineno) exclude_indent = indent excluding = True - excluding_decorators = False - elif ttext == "@" and first_on_line: - # A decorator. - if elineno in self.raw_excluded: - excluding_decorators = True - if excluding_decorators: - self.raw_excluded.add(elineno) elif ttext in "([{": nesting += 1 elif ttext in ")]}": nesting -= 1 - elif toktype == token.STRING: - if prev_toktype == token.INDENT: - # Strings that are first on an indented line are docstrings. - # (a trick from trace.py in the stdlib.) This works for - # 99.9999% of cases. - self.raw_docstrings.update(range(slineno, elineno+1)) elif toktype == token.NEWLINE: if first_line and elineno != first_line: # We're at the end of a line, and we've ended on a @@ -206,7 +175,6 @@ def _raw_parse(self) -> None: for l in range(first_line, elineno+1): self._multiline[l] = first_line first_line = 0 - first_on_line = True if ttext.strip() and toktype != tokenize.COMMENT: # A non-white-space token. @@ -218,10 +186,7 @@ def _raw_parse(self) -> None: if excluding and indent <= exclude_indent: excluding = False if excluding: - self.raw_excluded.add(elineno) - first_on_line = False - - prev_toktype = toktype + self.excluded.add(elineno) # Find the starts of the executable statements. if not empty: @@ -234,6 +199,34 @@ def _raw_parse(self) -> None: if env.PYBEHAVIOR.module_firstline_1 and self._multiline: self._multiline[1] = min(self.raw_statements) + self.excluded = self.first_lines(self.excluded) + + # AST lets us find classes, docstrings, and decorator-affected + # functions and classes. + assert self._ast_root is not None + for node in ast.walk(self._ast_root): + # Find class definitions. + if isinstance(node, ast.ClassDef): + self.raw_classdefs.add(node.lineno) + # Find docstrings. + if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef, ast.Module)): + if node.body: + first = node.body[0] + if ( + isinstance(first, ast.Expr) + and isinstance(first.value, ast.Constant) + and isinstance(first.value.value, str) + ): + self.raw_docstrings.update( + range(first.lineno, cast(int, first.end_lineno) + 1) + ) + # Exclusions carry from decorators and signatures to the bodies of + # functions and classes. + if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)): + first_line = min((d.lineno for d in node.decorator_list), default=node.lineno) + if self.excluded.intersection(range(first_line, node.lineno + 1)): + self.excluded.update(range(first_line, cast(int, node.end_lineno) + 1)) + @functools.lru_cache(maxsize=1000) def first_line(self, lineno: TLineNo) -> TLineNo: """Return the first line number of the statement including `lineno`.""" @@ -268,19 +261,14 @@ def parse_source(self) -> None: """ try: + self._ast_root = ast.parse(self.text) self._raw_parse() - except (tokenize.TokenError, IndentationError, SyntaxError) as err: - if hasattr(err, "lineno"): - lineno = err.lineno # IndentationError - else: - lineno = err.args[1][0] # TokenError + except (IndentationError, SyntaxError) as err: raise NotPython( f"Couldn't parse '{self.filename}' as Python source: " + - f"{err.args[0]!r} at line {lineno}", + f"{err.args[0]!r} at line {err.lineno}", ) from err - self.excluded = self.first_lines(self.raw_excluded) - ignore = self.excluded | self.raw_docstrings starts = self.raw_statements - ignore self.statements = self.first_lines(starts) - ignore @@ -303,7 +291,8 @@ def _analyze_ast(self) -> None: `_all_arcs` is the set of arcs in the code. """ - aaa = AstArcAnalyzer(self.text, self.raw_statements, self._multiline) + assert self._ast_root is not None + aaa = AstArcAnalyzer(self._ast_root, self.raw_statements, self._multiline) aaa.analyze() self._all_arcs = set() @@ -403,14 +392,9 @@ def __init__( self.code = code else: assert filename is not None - try: - self.code = compile(text, filename, "exec", dont_inherit=True) - except SyntaxError as synerr: - raise NotPython( - "Couldn't parse '%s' as Python source: '%s' at line %d" % ( - filename, synerr.msg, synerr.lineno or 0, - ), - ) from synerr + # We only get here if earlier ast parsing succeeded, so no need to + # catch errors. + self.code = compile(text, filename, "exec", dont_inherit=True) def child_parsers(self) -> Iterable[ByteParser]: """Iterate over all the code objects nested within this one. @@ -685,11 +669,11 @@ class AstArcAnalyzer: def __init__( self, - text: str, + root_node: ast.AST, statements: set[TLineNo], multiline: dict[TLineNo, TLineNo], ) -> None: - self.root_node = ast.parse(text) + self.root_node = root_node # TODO: I think this is happening in too many places. self.statements = {multiline.get(l, l) for l in statements} self.multiline = multiline diff --git a/coverage/python.py b/coverage/python.py index 089c27ea6..4ac241257 100644 --- a/coverage/python.py +++ b/coverage/python.py @@ -206,8 +206,10 @@ def translate_arcs(self, arcs: Iterable[TArc]) -> set[TArc]: def no_branch_lines(self) -> set[TLineNo]: assert self.coverage is not None no_branch = self.parser.lines_matching( - join_regex(self.coverage.config.partial_list), - join_regex(self.coverage.config.partial_always_list), + join_regex( + self.coverage.config.partial_list + + self.coverage.config.partial_always_list + ) ) return no_branch diff --git a/lab/parser.py b/lab/parser.py index 73c414245..b9edc4f5c 100644 --- a/lab/parser.py +++ b/lab/parser.py @@ -80,7 +80,7 @@ def one_file(self, options, filename): if options.dis: print("Main code:") - disassemble(pyparser) + disassemble(pyparser.text) arcs = pyparser.arcs() @@ -95,8 +95,8 @@ def one_file(self, options, filename): exit_counts = pyparser.exit_counts() - for lineno, ltext in enumerate(pyparser.lines, start=1): - marks = [' ', ' ', ' ', ' ', ' '] + for lineno, ltext in enumerate(pyparser.text.splitlines(), start=1): + marks = [' '] * 6 a = ' ' if lineno in pyparser.raw_statements: marks[0] = '-' @@ -110,7 +110,13 @@ def one_file(self, options, filename): if lineno in pyparser.raw_classdefs: marks[3] = 'C' if lineno in pyparser.raw_excluded: - marks[4] = 'x' + marks[4] = 'X' + elif lineno in pyparser.excluded: + marks[4] = '×' + if lineno in pyparser._multiline.values(): + marks[5] = 'o' + elif lineno in pyparser._multiline.keys(): + marks[5] = '.' if arc_chars: a = arc_chars[lineno].ljust(arc_width) @@ -173,13 +179,13 @@ def all_code_objects(code): yield code -def disassemble(pyparser): +def disassemble(text): """Disassemble code, for ad-hoc experimenting.""" - code = compile(pyparser.text, "", "exec", dont_inherit=True) + code = compile(text, "", "exec", dont_inherit=True) for code_obj in all_code_objects(code): - if pyparser.text: - srclines = pyparser.text.splitlines() + if text: + srclines = text.splitlines() else: srclines = None print("\n%s: " % code_obj) diff --git a/tests/test_coverage.py b/tests/test_coverage.py index 81a24b69e..d50971120 100644 --- a/tests/test_coverage.py +++ b/tests/test_coverage.py @@ -664,12 +664,12 @@ def test_module_docstring(self) -> None: [2, 3], ) self.check_coverage("""\ - # Start with a comment, because it changes the behavior(!?) + # Start with a comment, even though it doesn't change the behavior. '''I am a module docstring.''' a = 3 b = 4 """, - [2, 3, 4], + [3, 4], ) diff --git a/tests/test_parser.py b/tests/test_parser.py index eaaf4ae0a..0e208bbdb 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -124,30 +124,15 @@ def foo(): """) assert parser.exit_counts() == { 1:1, 2:1, 3:1, 6:1 } - def test_indentation_error(self) -> None: - msg = ( - "Couldn't parse '' as Python source: " + - "'unindent does not match any outer indentation level.*' at line 3" - ) - with pytest.raises(NotPython, match=msg): - _ = self.parse_text("""\ - 0 spaces - 2 - 1 - """) - - def test_token_error(self) -> None: - submsgs = [ - r"EOF in multi-line string", # before 3.12.0b1 - r"unterminated triple-quoted string literal .detected at line 1.", # after 3.12.0b1 - ] - msg = ( - r"Couldn't parse '' as Python source: '" - + r"(" + "|".join(submsgs) + ")" - + r"' at line 1" - ) + @pytest.mark.parametrize("text", [ + pytest.param("0 spaces\n 2\n 1", id="bad_indent"), + pytest.param("'''", id="string_eof"), + pytest.param("$hello", id="dollar"), + ]) + def test_not_python(self, text: str) -> None: + msg = r"Couldn't parse '' as Python source: '.*' at line \d+" with pytest.raises(NotPython, match=msg): - _ = self.parse_text("'''") + _ = self.parse_text(text) def test_empty_decorated_function(self) -> None: parser = self.parse_text("""\ @@ -740,6 +725,10 @@ def g(): assert parser.raw_statements == raw_statements assert parser.statements == set() + @pytest.mark.xfail( + env.PYPY and env.PYVERSION[:2] == (3, 8), + reason="AST doesn't mark end of classes correctly", + ) def test_class_decorator_pragmas(self) -> None: parser = self.parse_text("""\ class Foo(object): @@ -754,6 +743,22 @@ def __init__(self): assert parser.raw_statements == {1, 2, 3, 5, 6, 7, 8} assert parser.statements == {1, 2, 3} + def test_over_exclusion_bug1779(self) -> None: + # https://github.com/nedbat/coveragepy/issues/1779 + parser = self.parse_text("""\ + import abc + + class MyProtocol: # nocover 3 + @abc.abstractmethod # nocover 4 + def my_method(self) -> int: + ... # 6 + + def function() -> int: + return 9 + """) + assert parser.raw_statements == {1, 3, 4, 5, 6, 8, 9} + assert parser.statements == {1, 8, 9} + class ParserMissingArcDescriptionTest(PythonParserTestBase): """Tests for PythonParser.missing_arc_description."""