diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 27ff644ca6dc1c..effcd8b0c94df0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -43,6 +43,7 @@ jobs: - "!crates/ruff_dev/**" - "!crates/ruff_shrinking/**" - scripts/* + - .github/workflows/ci.yaml formatter: - Cargo.toml @@ -57,6 +58,7 @@ jobs: - crates/ruff_python_parser/** - crates/ruff_dev/** - scripts/* + - .github/workflows/ci.yaml cargo-fmt: name: "cargo fmt" @@ -182,7 +184,7 @@ jobs: - cargo-test-linux - determine_changes # Only runs on pull requests, since that is the only we way we can find the base version for comparison. - if: github.event_name == 'pull_request' && needs.determine_changes.outputs.linter == 'true' + if: github.event_name == 'pull_request' steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v4 @@ -190,27 +192,48 @@ jobs: python-version: ${{ env.PYTHON_VERSION }} - uses: actions/download-artifact@v3 - name: Download Ruff binary + name: Download comparison Ruff binary id: ruff-target with: name: ruff path: target/debug - uses: dawidd6/action-download-artifact@v2 - name: Download base results + name: Download baseline Ruff binary with: name: ruff branch: ${{ github.event.pull_request.base.ref }} check_artifacts: true - - name: Run ecosystem check + - name: Install ruff-ecosystem + run: | + pip install ./python/ruff-ecosystem + + - name: Run `ruff check` ecosystem check + if: ${{ needs.determine_changes.outputs.linter == 'true' }} run: | # Make executable, since artifact download doesn't preserve this - chmod +x ruff ${{ steps.ruff-target.outputs.download-path }}/ruff + chmod +x ./ruff ${{ steps.ruff-target.outputs.download-path }}/ruff + + ruff-ecosystem check ./ruff ${{ steps.ruff-target.outputs.download-path }}/ruff --cache ./checkouts --output-format markdown | tee ecosystem-result-check - scripts/check_ecosystem.py ruff ${{ steps.ruff-target.outputs.download-path }}/ruff | tee ecosystem-result - cat ecosystem-result > 
$GITHUB_STEP_SUMMARY + cat ecosystem-result-check > $GITHUB_STEP_SUMMARY + cat ecosystem-result-check > ecosystem-result + echo "" >> ecosystem-result + - name: Run `ruff format` ecosystem check + if: ${{ needs.determine_changes.outputs.formatter == 'true' }} + run: | + # Make executable, since artifact download doesn't preserve this + chmod +x ./ruff ${{ steps.ruff-target.outputs.download-path }}/ruff + + ruff-ecosystem format ./ruff ${{ steps.ruff-target.outputs.download-path }}/ruff --cache ./checkouts --output-format markdown | tee ecosystem-result-format + + cat ecosystem-result-format > $GITHUB_STEP_SUMMARY + cat ecosystem-result-format >> ecosystem-result + + - name: Export pull request number + run: | echo ${{ github.event.number }} > pr-number - uses: actions/upload-artifact@v3 diff --git a/python/ruff-ecosystem/README.md b/python/ruff-ecosystem/README.md new file mode 100644 index 00000000000000..abc6be7e57ca1c --- /dev/null +++ b/python/ruff-ecosystem/README.md @@ -0,0 +1,46 @@ +# ruff-ecosystem + +Compare lint and format results for two different ruff versions (e.g. main and a PR) on real world projects. + +## Installation + +From the Ruff project root, install with `pip`: + +```shell +pip install -e ./python/ruff-ecosystem +``` + +## Usage + +```shell +ruff-ecosystem +``` + +Note executable paths may be absolute, relative to the current working directory, or will be looked up in the +current Python environment and PATH. 
+ +Run `ruff check` ecosystem checks comparing your debug build to your system Ruff: + +```shell +ruff-ecosystem check ruff "./target/debug/ruff" +``` + +Run `ruff format` ecosystem checks comparing your debug build to your system Ruff: + +```shell +ruff-ecosystem format ruff "./target/debug/ruff" +``` + +## Development + +When developing, it can be useful to set the `--pdb` flag to drop into a debugger on failure: + +```shell +ruff-ecosystem check ruff "./target/debug/ruff" --pdb +``` + +You can also provide a path to cache checkouts to speed up repeated runs: + +```shell +ruff-ecosystem check ruff "./target/debug/ruff" --cache ./repos +``` diff --git a/python/ruff-ecosystem/pyproject.toml b/python/ruff-ecosystem/pyproject.toml new file mode 100644 index 00000000000000..8bff7a17060b8d --- /dev/null +++ b/python/ruff-ecosystem/pyproject.toml @@ -0,0 +1,15 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "ruff-ecosystem" +version = "0.0.0" +dependencies = ["unidiff==0.7.5"] + +[project.scripts] +ruff-ecosystem = "ruff_ecosystem.cli:entrypoint" + +[tool.ruff] +extend-select = ["I"] +preview = true diff --git a/python/ruff-ecosystem/ruff_ecosystem/__init__.py b/python/ruff-ecosystem/ruff_ecosystem/__init__.py new file mode 100644 index 00000000000000..883ecb9c74b23f --- /dev/null +++ b/python/ruff-ecosystem/ruff_ecosystem/__init__.py @@ -0,0 +1,3 @@ +import logging + +logger = logging.getLogger("ruff-ecosystem") diff --git a/python/ruff-ecosystem/ruff_ecosystem/__main__.py b/python/ruff-ecosystem/ruff_ecosystem/__main__.py new file mode 100644 index 00000000000000..bc1e7d2c79373d --- /dev/null +++ b/python/ruff-ecosystem/ruff_ecosystem/__main__.py @@ -0,0 +1,8 @@ +""" +Enables usage with `python -m ruff_ecosystem` +""" + +import ruff_ecosystem.cli + +if __name__ == "__main__": + ruff_ecosystem.cli.entrypoint() diff --git a/python/ruff-ecosystem/ruff_ecosystem/check.py b/python/ruff-ecosystem/ruff_ecosystem/check.py 
new file mode 100644 index 00000000000000..becf357cee0f48 --- /dev/null +++ b/python/ruff-ecosystem/ruff_ecosystem/check.py @@ -0,0 +1,565 @@ +""" +Execution, comparison, and summary of `ruff check` ecosystem checks. +""" +from __future__ import annotations + +import asyncio +import dataclasses +import re +import time +from asyncio import create_subprocess_exec +from collections import Counter +from dataclasses import dataclass, field +from pathlib import Path +from subprocess import PIPE +from typing import TYPE_CHECKING, Iterable, Iterator, Self, Sequence + +from ruff_ecosystem import logger +from ruff_ecosystem.markdown import ( + markdown_details, + markdown_plus_minus, + markdown_project_section, +) +from ruff_ecosystem.types import ( + Comparison, + Diff, + Result, + RuffError, + Serializable, +) + +if TYPE_CHECKING: + from ruff_ecosystem.projects import ClonedRepository, Project + + +# Matches lines that are summaries rather than diagnostics +CHECK_SUMMARY_LINE_RE = re.compile(r"^(Found \d+ error.*)|(.* fixable with .*)$") + +# Parses a diagnostic line (in a diff) to retrieve path and line number +CHECK_DIFF_LINE_RE = re.compile( + r"^(?P
[+-]) (?P(?P[^:]+):(?P\d+):\d+:) (?P.*)$",
+)
+
+CHECK_DIAGNOSTIC_LINE_RE = re.compile(
+    r"^(?P[+-])? ?(?P.*): (?P[A-Z]{1,4}[0-9]{3,4})(?P \[\*\])? (?P.*)"
+)
+
+CHECK_VIOLATION_FIX_INDICATOR = " [*]"
+
+GITHUB_MAX_COMMENT_LENGTH = 65536  # characters
+
+
+def markdown_check_result(result: Result) -> str:
+    """
+    Render a `ruff check` ecosystem check result as markdown.
+    """
+    # Calculate the total number of rule changes
+    all_rule_changes = RuleChanges()
+    project_diffs = {
+        project: CheckDiff.from_simple_diff(comparison.diff)
+        for project, comparison in result.completed
+    }
+    project_rule_changes: dict[Project, RuleChanges] = {}
+    for project, diff in project_diffs.items():
+        project_rule_changes[project] = changes = RuleChanges.from_diff(diff)
+        all_rule_changes.update(changes)
+
+    lines = []
+    total_removed = all_rule_changes.total_removed_violations()
+    total_added = all_rule_changes.total_added_violations()
+    total_added_fixes = all_rule_changes.total_added_fixes()
+    total_removed_fixes = all_rule_changes.total_removed_fixes()
+    total_changes = (
+        total_added + total_removed + total_added_fixes + total_removed_fixes
+    )
+    error_count = len(result.errored)
+    total_affected_rules = len(all_rule_changes.rule_codes())
+
+    if total_affected_rules == 0 and error_count == 0:
+        return "\u2705 ecosystem check detected no linter changes."
+
+    # Summarize the total changes
+    if total_affected_rules == 0:
+        # Only errors
+        s = "s" if error_count != 1 else ""
+        lines.append(
+            f"\u2139\ufe0f ecosystem check **encountered linter errors**. (no lint changes; {error_count} project error{s})"
+        )
+    else:
+        change_summary = (
+            f"{markdown_plus_minus(total_added, total_removed)} violations, "
+            f"{markdown_plus_minus(total_added_fixes, total_removed_fixes)} fixes "
+            f"in {len(result.completed)} projects"
+        )
+        if error_count:
+            s = "s" if error_count != 1 else ""
+            change_summary += f"; {error_count} project error{s}"
+        lines.append(
+            f"\u2139\ufe0f ecosystem check **detected linter changes**. ({change_summary})"
+        )
+    lines.append("")
+
+    # Limit the number of items displayed per rule to between 5 and 50
+    max_display_per_rule = max(
+        5,
+        # Calculate the number of affected rules that we would display to increase
+        # the maximum when fewer rules are affected
+        50 // total_affected_rules,
+    )
+
+    # Display per project changes
+    for project, comparison in result.completed:
+        # TODO: This is not a performant way to check the length but the whole
+        #       GitHub comment length issue needs to be addressed anyway
+        if len(" ".join(lines)) > GITHUB_MAX_COMMENT_LENGTH // 3:
+            lines.append("")
+            lines.append(
+                "_... Truncated remaining completed projected reports due to GitHub comment length restrictions_"
+            )
+            lines.append("")
+            break
+
+        if not comparison.diff:
+            continue  # Skip empty diffs
+
+        diff = project_diffs[project]
+        rule_changes = project_rule_changes[project]
+        project_removed_violations = rule_changes.total_removed_violations()
+        project_added_violations = rule_changes.total_added_violations()
+        project_added_fixes = rule_changes.total_added_fixes()
+        project_removed_fixes = rule_changes.total_removed_fixes()
+        project_changes = (
+            project_added_violations
+            + project_removed_violations
+            + project_added_fixes
+            + project_removed_fixes
+        )
+
+        # Limit the number of items displayed per project to between 10 and 50
+        # based on the number of total changes present in this project
+        max_display_per_project = max(10, int((project_changes / total_changes) * 50))
+
+        # Display the diff
+        displayed_changes_per_rule = Counter()
+        displayed_changes = 0
+
+        # Wrap with <pre> for code-styling with support for links
+        diff_lines = ["<pre>"]
+        for line in diff.parsed_lines:
+            rule_code = line.rule_code
+
+            # Limit the number of changes we'll show per rule code
+            if displayed_changes_per_rule[rule_code] > max_display_per_rule:
+                continue
+
+            diff_lines.append(
+                add_permalink_to_diagnostic_line(comparison.repo, line.to_string())
+            )
+
+            displayed_changes_per_rule[rule_code] += 1
+            displayed_changes += 1
+
+            # If we just reached the maximum... display an omission line
+            if displayed_changes_per_rule[rule_code] > max_display_per_rule:
+                hidden_count = (
+                    rule_changes.added_violations[rule_code]
+                    + rule_changes.removed_violations[rule_code]
+                    + rule_changes.added_fixes[rule_code]
+                    + rule_changes.removed_fixes[rule_code]
+                    - max_display_per_rule
+                )
+                diff_lines.append(
+                    f"... {hidden_count} additional changes omitted for rule {rule_code}"
+                )
+
+            if displayed_changes >= max_display_per_project:
+                break
+
+        if project_changes > max_display_per_project:
+            hidden_count = project_changes - displayed_changes
+            diff_lines.append(
+                f"... {hidden_count} additional changes omitted for project"
+            )
+
+        diff_lines.append("
") + + title = ( + f"+{project_added_violations} " + f"-{project_removed_violations} violations, " + f"+{project_added_fixes} " + f"-{project_removed_fixes} fixes" + ) + + lines.extend( + markdown_project_section( + title=title, + content=diff_lines, + options=project.check_options, + project=project, + ) + ) + + for project, error in result.errored: + lines.extend( + markdown_project_section( + title="error", + content=f"```\n{error}```", + options=project.check_options, + project=project, + ) + ) + + # Display a summary table of changed rules + if all_rule_changes: + table_lines = [] + table_lines.append( + "| code | total | + violation | - violation | + fix | - fix |" + ) + table_lines.append("| ---- | ------- | --------- | -------- | ----- | ---- |") + for rule, total in sorted( + all_rule_changes.total_changes_by_rule(), + key=lambda item: item[1], # Sort by the total changes + reverse=True, + ): + added_violations, removed_violations, added_fixes, removed_fixes = ( + all_rule_changes.added_violations[rule], + all_rule_changes.removed_violations[rule], + all_rule_changes.added_fixes[rule], + all_rule_changes.removed_fixes[rule], + ) + table_lines.append( + f"| {rule} | {total} | {added_violations} | {removed_violations} " + f"| {added_fixes} | {removed_fixes} |" + ) + + lines.extend( + markdown_details( + summary=f"Changes by rule ({total_affected_rules} rules affected)", + preface="", + content=table_lines, + ) + ) + + return "\n".join(lines) + + +@dataclass(frozen=True) +class RuleChanges: + """ + The number of additions and removals by rule code. + + While the attributes are frozen to avoid accidentally changing the value of an attribute, + the counters themselves are mutable and this class can be mutated with `+` and `update`. 
+ """ + + added_violations: Counter = field(default_factory=Counter) + removed_violations: Counter = field(default_factory=Counter) + added_fixes: Counter = field(default_factory=Counter) + removed_fixes: Counter = field(default_factory=Counter) + + def rule_codes(self) -> set[str]: + return ( + set(self.added_violations.keys()) + .union(self.removed_violations.keys()) + .union(self.added_fixes.keys()) + .union(self.removed_fixes.keys()) + ) + + def __add__(self, other: Self) -> Self: + if not isinstance(other, type(self)): + return NotImplemented + + new = type(self)() + new.update(self) + new.update(other) + return new + + def update(self, other: Self) -> Self: + self.added_violations.update(other.added_violations) + self.removed_violations.update(other.removed_violations) + self.added_fixes.update(other.added_fixes) + self.removed_fixes.update(other.removed_fixes) + return self + + def total_added_violations(self) -> int: + return sum(self.added_violations.values()) + + def total_removed_violations(self) -> int: + return sum(self.removed_violations.values()) + + def total_added_fixes(self) -> int: + return sum(self.added_fixes.values()) + + def total_removed_fixes(self) -> int: + return sum(self.removed_fixes.values()) + + def total_changes_by_rule(self) -> Iterator[tuple[str, int]]: + """ + Yields the sum of changes for each rule + """ + totals = Counter() + totals.update(self.added_violations) + totals.update(self.removed_violations) + totals.update(self.added_fixes) + totals.update(self.removed_fixes) + yield from totals.items() + + @classmethod + def from_diff(cls: type[Self], diff: CheckDiff) -> Self: + """ + Parse a diff from `ruff check` to determine the additions and removals for each rule + """ + rule_changes = cls() + + for line in diff.parsed_lines: + if line.is_added: + if line in diff.fix_only_lines: + if line.fix_available: + rule_changes.added_fixes[line.rule_code] += 1 + else: + rule_changes.removed_fixes[line.rule_code] += 1 + else: + 
rule_changes.added_violations[line.rule_code] += 1 + elif line.is_removed: + if line in diff.fix_only_lines: + if line.fix_available: + rule_changes.removed_fixes[line.rule_code] += 1 + else: + rule_changes.added_fixes[line.rule_code] += 1 + else: + rule_changes.removed_violations[line.rule_code] += 1 + + return rule_changes + + def __bool__(self): + return bool( + self.added_violations + or self.removed_violations + or self.added_fixes + or self.removed_fixes + ) + + +@dataclass(frozen=True) +class DiagnosticLine: + is_added: bool | None + is_removed: bool | None + fix_available: bool + rule_code: str + location: str + message: str + + def to_string(self) -> str: + """ + Construct the line from the components + """ + line = "" + if self.is_added: + line += "+ " + elif self.is_removed: + line += "- " + line += f"{self.location}: {self.rule_code} " + if self.fix_available: + line += "[*] " + line += self.message + return line + + def with_fix_available(self) -> DiagnosticLine: + return DiagnosticLine(**{**dataclasses.asdict(self), "fix_available": True}) + + def without_fix_available(self) -> DiagnosticLine: + return DiagnosticLine(**{**dataclasses.asdict(self), "fix_available": False}) + + def without_diff(self) -> DiagnosticLine: + return DiagnosticLine( + **{**dataclasses.asdict(self), "is_added": None, "is_removed": None} + ) + + @classmethod + def try_from_string(cls: type[Self], line: str) -> Self | None: + """ + Parse the rule code from a diagnostic line string + """ + match = CHECK_DIAGNOSTIC_LINE_RE.match(line) + + if match is None: + # Handle case where there are no regex match e.g. 
+ # + "?application=AIRFLOW&authenticator=TEST_AUTH&role=TEST_ROLE&warehouse=TEST_WAREHOUSE" # noqa: E501, ERA001 + # Which was found in local testing + return None + + match_items = match.groupdict() + + return cls( + location=match_items["location"], + is_removed=match_items.get("diff") == "-", + is_added=match_items.get("diff") == "+", + fix_available=match_items.get("fixable") is not None, + rule_code=match_items["code"], + message=match_items["message"], + ) + + +class CheckDiff(Diff): + """ + Extends the normal diff with diagnostic parsing + """ + + def __init__( + self, + lines: Iterable[str], + parsed_lines: list[DiagnosticLine], + fix_only_lines: set[DiagnosticLine], + ) -> None: + self.parsed_lines = parsed_lines + self.fix_only_lines = fix_only_lines + super().__init__(lines) + + @classmethod + def from_simple_diff(cls, diff: Diff) -> CheckDiff: + """ + Parse a simple diff to include check-specific analyses. + """ + # Drop unchanged lines + diff = diff.without_unchanged_lines() + + # Sort without account for the leading + / - + sorted_lines = list(sorted(diff, key=lambda line: line[2:])) + + # Parse the lines, drop lines that cannot be parsed + parsed_lines: list[DiagnosticLine] = list( + filter( + None, + (DiagnosticLine.try_from_string(line) for line in sorted_lines), + ) + ) + + # Calculate which lines only changed fix availability + fix_only: set[DiagnosticLine] = set() + + # TODO(zanieb): There has to be a cleaner way to express this logic + # We check if each added line is available in the removed set with fix + # availability toggled and vice-versa + for line in parsed_lines: + other_set = diff.removed if line.is_added else diff.added + toggled = ( + line.without_fix_available() + if line.fix_available + else line.with_fix_available() + ) + if toggled.without_diff().to_string() in other_set: + fix_only.add(line) + + return CheckDiff( + lines=sorted_lines, parsed_lines=parsed_lines, fix_only_lines=fix_only + ) + + +def 
add_permalink_to_diagnostic_line(repo: ClonedRepository, line: str) -> str: + match = CHECK_DIFF_LINE_RE.match(line) + if match is None: + return line + + pre, inner, path, lnum, post = match.groups() + url = repo.url_for(path, int(lnum)) + return f"{pre} {inner} {post}" + + +async def compare_check( + ruff_baseline_executable: Path, + ruff_comparison_executable: Path, + options: CheckOptions, + cloned_repo: ClonedRepository, +) -> Comparison: + async with asyncio.TaskGroup() as tg: + baseline_task = tg.create_task( + ruff_check( + executable=ruff_baseline_executable.resolve(), + path=cloned_repo.path, + name=cloned_repo.fullname, + options=options, + ), + ) + comparison_task = tg.create_task( + ruff_check( + executable=ruff_comparison_executable.resolve(), + path=cloned_repo.path, + name=cloned_repo.fullname, + options=options, + ), + ) + + baseline_output, comparison_output = ( + baseline_task.result(), + comparison_task.result(), + ) + + diff = Diff.from_pair(baseline_output, comparison_output) + + return Comparison(diff=diff, repo=cloned_repo) + + +async def ruff_check( + *, executable: Path, path: Path, name: str, options: CheckOptions +) -> Sequence[str]: + """Run the given ruff binary against the specified path.""" + logger.debug(f"Checking {name} with {executable}") + ruff_args = options.to_cli_args() + + start = time.time() + proc = await create_subprocess_exec( + executable.absolute(), + *ruff_args, + ".", + stdout=PIPE, + stderr=PIPE, + cwd=path, + ) + result, err = await proc.communicate() + end = time.time() + + logger.debug(f"Finished checking {name} with {executable} in {end - start:.2f}s") + + if proc.returncode != 0: + raise RuffError(err.decode("utf8")) + + # Strip summary lines so the diff is only diagnostic lines + lines = [ + line + for line in result.decode("utf8").splitlines() + if not CHECK_SUMMARY_LINE_RE.match(line) + ] + + return lines + + +@dataclass(frozen=True) +class CheckOptions(Serializable): + """ + Ruff check options + """ + + 
select: str = "" + ignore: str = "" + exclude: str = "" + + # Generating fixes is slow and verbose + show_fixes: bool = False + + # Limit the number of reported lines per rule + max_lines_per_rule: int | None = 50 + + def markdown(self) -> str: + return f"select {self.select} ignore {self.ignore} exclude {self.exclude}" + + def to_cli_args(self) -> list[str]: + args = ["check", "--no-cache", "--exit-zero"] + if self.select: + args.extend(["--select", self.select]) + if self.ignore: + args.extend(["--ignore", self.ignore]) + if self.exclude: + args.extend(["--exclude", self.exclude]) + if self.show_fixes: + args.extend(["--show-fixes", "--ecosystem-ci"]) + return args diff --git a/python/ruff-ecosystem/ruff_ecosystem/cli.py b/python/ruff-ecosystem/ruff_ecosystem/cli.py new file mode 100644 index 00000000000000..348bab300d603a --- /dev/null +++ b/python/ruff-ecosystem/ruff_ecosystem/cli.py @@ -0,0 +1,166 @@ +import argparse +import asyncio +import logging +import os +import shutil +import sys +import sysconfig +import tempfile +from contextlib import nullcontext +from pathlib import Path +from signal import SIGINT, SIGTERM + +from ruff_ecosystem import logger +from ruff_ecosystem.defaults import DEFAULT_TARGETS +from ruff_ecosystem.main import OutputFormat, main +from ruff_ecosystem.projects import RuffCommand + + +def excepthook(type, value, tb): + if hasattr(sys, "ps1") or not sys.stderr.isatty(): + # we are in interactive mode or we don't have a tty so call the default + sys.__excepthook__(type, value, tb) + else: + import pdb + import traceback + + traceback.print_exception(type, value, tb) + print() + pdb.post_mortem(tb) + + +def entrypoint(): + args = parse_args() + + if args.pdb: + sys.excepthook = excepthook + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + # Use a temporary directory for caching if no cache is specified + cache_context = ( + tempfile.TemporaryDirectory() if not args.cache 
else nullcontext(args.cache) + ) + + ruff_baseline = args.ruff_baseline + if not args.ruff_baseline.exists(): + ruff_baseline = get_executable_path(str(args.ruff_baseline)) + if not ruff_baseline: + print( + f"Could not find ruff baseline executable: {args.ruff_baseline}", + sys.stderr, + ) + exit(1) + logger.info( + "Resolved baseline executable %s to %s", args.ruff_baseline, ruff_baseline + ) + + ruff_comparison = args.ruff_comparison + if not args.ruff_comparison.exists(): + ruff_comparison = get_executable_path(str(args.ruff_comparison)) + if not ruff_comparison: + print( + f"Could not find ruff comparison executable: {args.ruff_comparison}", + sys.stderr, + ) + exit(1) + logger.info( + "Resolved comparison executable %s to %s", + args.ruff_comparison, + ruff_comparison, + ) + + with cache_context as cache: + loop = asyncio.get_event_loop() + main_task = asyncio.ensure_future( + main( + command=RuffCommand(args.ruff_command), + ruff_baseline_executable=ruff_baseline, + ruff_comparison_executable=ruff_comparison, + targets=DEFAULT_TARGETS, + format=OutputFormat(args.output_format), + project_dir=Path(cache), + raise_on_failure=args.pdb, + ) + ) + # https://stackoverflow.com/a/58840987/3549270 + for signal in [SIGINT, SIGTERM]: + loop.add_signal_handler(signal, main_task.cancel) + try: + loop.run_until_complete(main_task) + finally: + loop.close() + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Check two versions of ruff against a corpus of open-source code.", + ) + + # TODO: Support non-default `--targets` + # parser.add_argument( + # "--targets", + # type=Path, + # help=( + # "Optional JSON files to use over the default repositories. " + # "Supports both github_search_*.jsonl and known-github-tomls.jsonl." 
+ # ), + # ) + parser.add_argument( + "--cache", + type=Path, + help="Location for caching cloned repositories", + ) + parser.add_argument( + "--output-format", + choices=[option.name for option in OutputFormat], + default="json", + help="Location for caching cloned repositories", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable debug logging", + ) + parser.add_argument( + "--pdb", + action="store_true", + help="Enable debugging on failure", + ) + parser.add_argument( + "ruff_command", + choices=[option.name for option in RuffCommand], + help="The Ruff command to test", + ) + parser.add_argument( + "ruff_baseline", + type=Path, + ) + parser.add_argument( + "ruff_comparison", + type=Path, + ) + + return parser.parse_args() + + +def get_executable_path(name: str) -> Path | None: + # Add suffix for Windows executables + name += ".exe" if sys.platform == "win32" and not name.endswith(".exe") else "" + + path = os.path.join(sysconfig.get_path("scripts"), name) + + # The executable in the current interpreter's scripts directory. + if os.path.exists(path): + return Path(path) + + # The executable in the global environment. 
+ environment_path = shutil.which(name) + if environment_path: + return Path(environment_path) + + return None diff --git a/python/ruff-ecosystem/ruff_ecosystem/defaults.py b/python/ruff-ecosystem/ruff_ecosystem/defaults.py new file mode 100644 index 00000000000000..16db0627153561 --- /dev/null +++ b/python/ruff-ecosystem/ruff_ecosystem/defaults.py @@ -0,0 +1,73 @@ +""" +Default projects for ecosystem checks +""" +from ruff_ecosystem.projects import CheckOptions, FormatOptions, Project, Repository + +# TODO(zanieb): Consider exporting this as JSON and loading from there instead +DEFAULT_TARGETS = [ + Project(repo=Repository(owner="DisnakeDev", name="disnake", ref="master")), + Project(repo=Repository(owner="PostHog", name="HouseWatch", ref="main")), + Project(repo=Repository(owner="RasaHQ", name="rasa", ref="main")), + Project(repo=Repository(owner="Snowflake-Labs", name="snowcli", ref="main")), + Project(repo=Repository(owner="aiven", name="aiven-client", ref="main")), + Project(repo=Repository(owner="alteryx", name="featuretools", ref="main")), + Project( + repo=Repository(owner="apache", name="airflow", ref="main"), + check_options=CheckOptions(select="ALL"), + ), + Project(repo=Repository(owner="aws", name="aws-sam-cli", ref="develop")), + Project(repo=Repository(owner="bloomberg", name="pytest-memray", ref="main")), + Project( + repo=Repository(owner="bokeh", name="bokeh", ref="branch-3.3"), + check_options=CheckOptions(select="ALL"), + ), + Project(repo=Repository(owner="commaai", name="openpilot", ref="master")), + Project( + repo=Repository(owner="demisto", name="content", ref="master"), + format_options=FormatOptions( + # Syntax errors in this file + exclude="Packs/ThreatQ/Integrations/ThreatQ/ThreatQ.py" + ), + ), + Project(repo=Repository(owner="docker", name="docker-py", ref="main")), + Project(repo=Repository(owner="freedomofpress", name="securedrop", ref="develop")), + Project(repo=Repository(owner="fronzbot", name="blinkpy", ref="dev")), + 
Project(repo=Repository(owner="ibis-project", name="ibis", ref="master")), + Project(repo=Repository(owner="ing-bank", name="probatus", ref="main")), + Project(repo=Repository(owner="jrnl-org", name="jrnl", ref="develop")), + Project(repo=Repository(owner="latchbio", name="latch", ref="main")), + Project(repo=Repository(owner="lnbits", name="lnbits", ref="main")), + Project(repo=Repository(owner="milvus-io", name="pymilvus", ref="master")), + Project(repo=Repository(owner="mlflow", name="mlflow", ref="master")), + Project(repo=Repository(owner="model-bakers", name="model_bakery", ref="main")), + Project(repo=Repository(owner="pandas-dev", name="pandas", ref="main")), + Project(repo=Repository(owner="prefecthq", name="prefect", ref="main")), + Project(repo=Repository(owner="pypa", name="build", ref="main")), + Project(repo=Repository(owner="pypa", name="cibuildwheel", ref="main")), + Project(repo=Repository(owner="pypa", name="pip", ref="main")), + Project(repo=Repository(owner="pypa", name="setuptools", ref="main")), + Project(repo=Repository(owner="python", name="mypy", ref="master")), + Project( + repo=Repository( + owner="python", + name="typeshed", + ref="main", + ), + check_options=CheckOptions(select="PYI"), + ), + Project(repo=Repository(owner="python-poetry", name="poetry", ref="master")), + Project(repo=Repository(owner="reflex-dev", name="reflex", ref="main")), + Project(repo=Repository(owner="rotki", name="rotki", ref="develop")), + Project(repo=Repository(owner="scikit-build", name="scikit-build", ref="main")), + Project( + repo=Repository(owner="scikit-build", name="scikit-build-core", ref="main") + ), + Project(repo=Repository(owner="sphinx-doc", name="sphinx", ref="master")), + Project(repo=Repository(owner="spruceid", name="siwe-py", ref="main")), + Project(repo=Repository(owner="tiangolo", name="fastapi", ref="master")), + Project(repo=Repository(owner="yandex", name="ch-backup", ref="main")), + Project( + repo=Repository(owner="zulip", 
name="zulip", ref="main"), + check_options=CheckOptions(select="ALL"), + ), +] diff --git a/python/ruff-ecosystem/ruff_ecosystem/format.py b/python/ruff-ecosystem/ruff_ecosystem/format.py new file mode 100644 index 00000000000000..686515521e544a --- /dev/null +++ b/python/ruff-ecosystem/ruff_ecosystem/format.py @@ -0,0 +1,195 @@ +""" +Execution, comparison, and summary of `ruff format` ecosystem checks. +""" + +from __future__ import annotations + +import time +from asyncio import create_subprocess_exec +from dataclasses import dataclass +from pathlib import Path +from subprocess import PIPE +from typing import TYPE_CHECKING, Sequence + +from unidiff import PatchSet + +from ruff_ecosystem import logger +from ruff_ecosystem.markdown import markdown_project_section +from ruff_ecosystem.types import Comparison, Diff, Result, RuffError + +if TYPE_CHECKING: + from ruff_ecosystem.projects import ClonedRepository + + +def markdown_format_result(result: Result) -> str: + """ + Render a `ruff format` ecosystem check result as markdown. + """ + lines = [] + total_lines_removed = total_lines_added = 0 + total_files_modified = 0 + error_count = len(result.errored) + patch_sets = [] + + for project, comparison in result.completed: + total_lines_added += comparison.diff.lines_added + total_lines_removed += comparison.diff.lines_removed + + patch_set = PatchSet("\n".join(comparison.diff.lines)) + patch_sets.append(patch_set) + total_files_modified += len(patch_set.modified_files) + + if total_lines_removed == 0 and total_lines_added == 0 and error_count == 0: + return "\u2705 ecosystem check detected no format changes." + + # Summarize the total changes + if total_lines_added == 0 and total_lines_added == 0: + # Only errors + s = "s" if error_count != 1 else "" + lines.append( + f"\u2139\ufe0f ecosystem check **encountered format errors**. 
(no format changes; {error_count} project error{s})" + ) + else: + s = "s" if total_files_modified != 1 else "" + changes = f"+{total_lines_added} -{total_lines_removed} lines in {total_files_modified} file{s} in {len(result.completed)} projects" + if error_count: + s = "s" if error_count != 1 else "" + changes += f"; {error_count} project error{s}" + + lines.append( + f"\u2139\ufe0f ecosystem check **detected format changes**. ({changes})" + ) + + lines.append("") + + # Then per-project changes + for (project, comparison), patch_set in zip(result.completed, patch_sets): + if not comparison.diff: + continue # Skip empty diffs + + files = len(patch_set.modified_files) + s = "s" if files != 1 else "" + title = f"+{comparison.diff.lines_added} -{comparison.diff.lines_removed} lines across {files} file{s}" + + lines.extend( + markdown_project_section( + title=title, + content=format_patchset(patch_set, comparison.repo), + options=project.format_options, + project=project, + ) + ) + + for project, error in result.errored: + lines.extend( + markdown_project_section( + title="error", + content=f"```\n{error}```", + options=project.format_options, + project=project, + ) + ) + + return "\n".join(lines) + + +def format_patchset(patch_set: PatchSet, repo: ClonedRepository) -> str: + """ + Convert a patchset to markdown, adding permalinks to the start of each hunk. + """ + lines = [] + for file_patch in patch_set: + for hunk in file_patch: + # Note: When used for `format` checks, the line number is not exact because + # we formatted the repository for a baseline; we can't know the exact + # line number in the original + # source file. 
+ hunk_link = repo.url_for(file_patch.path, hunk.source_start) + hunk_lines = str(hunk).splitlines() + + # Add a link before the hunk + link_title = file_patch.path + "~L" + str(hunk.source_start) + lines.append(f"{link_title}") + + # Wrap the contents of the hunk in a diff code block + lines.append("```diff") + lines.extend(hunk_lines[1:]) + lines.append("```") + + return "\n".join(lines) + + +async def compare_format( + ruff_baseline_executable: Path, + ruff_comparison_executable: Path, + options: FormatOptions, + cloned_repo: ClonedRepository, +): + # Run format without diff to get the baseline + await ruff_format( + executable=ruff_baseline_executable.resolve(), + path=cloned_repo.path, + name=cloned_repo.fullname, + options=options, + ) + # Then get the diff from stdout + diff = await ruff_format( + executable=ruff_comparison_executable.resolve(), + path=cloned_repo.path, + name=cloned_repo.fullname, + options=options, + diff=True, + ) + + return Comparison(diff=Diff(diff), repo=cloned_repo) + + +async def ruff_format( + *, + executable: Path, + path: Path, + name: str, + options: FormatOptions, + diff: bool = False, +) -> Sequence[str]: + """Run the given ruff binary against the specified path.""" + logger.debug(f"Formatting {name} with {executable}") + ruff_args = options.to_cli_args() + + if diff: + ruff_args.append("--diff") + + start = time.time() + proc = await create_subprocess_exec( + executable.absolute(), + *ruff_args, + ".", + stdout=PIPE, + stderr=PIPE, + cwd=path, + ) + result, err = await proc.communicate() + end = time.time() + + logger.debug(f"Finished formatting {name} with {executable} in {end - start:.2f}s") + + if proc.returncode not in [0, 1]: + raise RuffError(err.decode("utf8")) + + lines = result.decode("utf8").splitlines() + return lines + + +@dataclass(frozen=True) +class FormatOptions: + """ + Ruff format options. 
+ """ + + exclude: str = "" + + def to_cli_args(self) -> list[str]: + args = ["format"] + if self.exclude: + args.extend(["--exclude", self.exclude]) + return args diff --git a/python/ruff-ecosystem/ruff_ecosystem/main.py b/python/ruff-ecosystem/ruff_ecosystem/main.py new file mode 100644 index 00000000000000..b44a5caf34d4be --- /dev/null +++ b/python/ruff-ecosystem/ruff_ecosystem/main.py @@ -0,0 +1,144 @@ +import asyncio +import dataclasses +import json +from enum import Enum +from pathlib import Path +from typing import Awaitable, TypeVar + +from ruff_ecosystem import logger +from ruff_ecosystem.check import compare_check, markdown_check_result +from ruff_ecosystem.format import compare_format, markdown_format_result +from ruff_ecosystem.projects import ( + Project, + RuffCommand, +) +from ruff_ecosystem.types import Comparison, Result, Serializable + +T = TypeVar("T") +GITHUB_MAX_COMMENT_LENGTH = 65536 + + +class OutputFormat(Enum): + markdown = "markdown" + json = "json" + + +async def main( + command: RuffCommand, + ruff_baseline_executable: Path, + ruff_comparison_executable: Path, + targets: list[Project], + project_dir: Path, + format: OutputFormat, + max_parallelism: int = 50, + raise_on_failure: bool = False, +) -> None: + logger.debug("Using command %s", command.value) + logger.debug("Using baseline executable at %s", ruff_baseline_executable) + logger.debug("Using comparison executable at %s", ruff_comparison_executable) + logger.debug("Using checkout_dir directory %s", project_dir) + logger.debug("Checking %s targets", len(targets)) + + # Limit parallelism to avoid high memory consumption + semaphore = asyncio.Semaphore(max_parallelism) + + async def limited_parallelism(coroutine: Awaitable[T]) -> T: + async with semaphore: + return await coroutine + + comparisons: list[Exception | Comparison] = await asyncio.gather( + *[ + limited_parallelism( + clone_and_compare( + command, + ruff_baseline_executable, + ruff_comparison_executable, + target, + 
project_dir, + ) + ) + for target in targets + ], + return_exceptions=not raise_on_failure, + ) + comparisons_by_target = dict(zip(targets, comparisons, strict=True)) + + # Split comparisons into errored / completed + errored, completed = [], [] + for target, comparison in comparisons_by_target.items(): + if isinstance(comparison, Exception): + errored.append((target, comparison)) + else: + completed.append((target, comparison)) + + result = Result(completed=completed, errored=errored) + + match format: + case OutputFormat.json: + print(json.dumps(result, indent=4, cls=JSONEncoder)) + case OutputFormat.markdown: + match command: + case RuffCommand.check: + print(markdown_check_result(result)) + case RuffCommand.format: + print(markdown_format_result(result)) + case _: + raise ValueError(f"Unknown target Ruff command {command}") + case _: + raise ValueError(f"Unknown output format {format}") + + return None + + +async def clone_and_compare( + command: RuffCommand, + ruff_baseline_executable: Path, + ruff_comparison_executable: Path, + target: Project, + project_dir: Path, +) -> Comparison: + """Check a specific repository against two versions of ruff.""" + assert ":" not in target.repo.owner + assert ":" not in target.repo.name + + match command: + case RuffCommand.check: + compare, options = ( + compare_check, + target.check_options, + ) + case RuffCommand.format: + compare, options = ( + compare_format, + target.format_options, + ) + case _: + raise ValueError(f"Unknown target Ruff command {command}") + + checkout_dir = project_dir.joinpath(f"{target.repo.owner}:{target.repo.name}") + cloned_repo = await target.repo.clone(checkout_dir) + + try: + return await compare( + ruff_baseline_executable, + ruff_comparison_executable, + options, + cloned_repo, + ) + except ExceptionGroup as e: + raise e.exceptions[0] from e + + +class JSONEncoder(json.JSONEncoder): + def default(self, o): + if isinstance(o, Serializable): + return o.jsonable() + if 
dataclasses.is_dataclass(o): + return dataclasses.asdict(o) + if isinstance(o, set): + return tuple(o) + if isinstance(o, Path): + return str(o) + if isinstance(o, Exception): + return str(o) + return super().default(o) diff --git a/python/ruff-ecosystem/ruff_ecosystem/markdown.py b/python/ruff-ecosystem/ruff_ecosystem/markdown.py new file mode 100644 index 00000000000000..80f11f0cb29504 --- /dev/null +++ b/python/ruff-ecosystem/ruff_ecosystem/markdown.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from ruff_ecosystem.projects import Project + + +def markdown_project_section( + title: str, content: str | list[str], options: object, project: Project +) -> list[str]: + return markdown_details( + summary=f'{project.repo.fullname} ({title})', + # Show the command used for the check + preface="
<code>ruff " + " ".join(options.to_cli_args()) + "</code>
", + content=content, + ) + + +def markdown_plus_minus(added: int, removed: int) -> str: + # TODO(zanieb): GitHub does not support coloring with it seems like the only + # way is to use LateX `${\text{\color{green}+10 \color{red}-10}}$` but + # it renders so ugly it's not worth doing yet + return f"+{added} -{removed}" + + +def markdown_details(summary: str, preface: str, content: str | list[str]): + lines = [] + lines.append(f"
<details><summary>{summary}</summary>")
+    lines.append("<p>")
+    lines.append(preface)
+    lines.append("</p>")
+    lines.append("<p>")
+    lines.append("")
+
+    if isinstance(content, str):
+        lines.append(content)
+    else:
+        lines.extend(content)
+
+    lines.append("")
+    lines.append("</p>")
+    lines.append("</details>
") + return lines diff --git a/python/ruff-ecosystem/ruff_ecosystem/projects.py b/python/ruff-ecosystem/ruff_ecosystem/projects.py new file mode 100644 index 00000000000000..deb140856c1089 --- /dev/null +++ b/python/ruff-ecosystem/ruff_ecosystem/projects.py @@ -0,0 +1,168 @@ +""" +Abstractions and utilities for working with projects to run ecosystem checks on. +""" + +from __future__ import annotations + +from asyncio import create_subprocess_exec +from dataclasses import dataclass, field +from enum import Enum +from pathlib import Path +from subprocess import PIPE +from typing import Self + +from ruff_ecosystem import logger +from ruff_ecosystem.check import CheckOptions +from ruff_ecosystem.format import FormatOptions +from ruff_ecosystem.types import Serializable + + +@dataclass(frozen=True) +class Project(Serializable): + """ + An ecosystem target + """ + + repo: Repository + check_options: CheckOptions = field(default_factory=lambda: CheckOptions()) + format_options: FormatOptions = field(default_factory=lambda: FormatOptions()) + + +class RuffCommand(Enum): + check = "check" + format = "format" + + +class ProjectSetupError(Exception): + """An error setting up a project.""" + + +@dataclass(frozen=True) +class Repository(Serializable): + """ + A remote GitHub repository. 
+ """ + + owner: str + name: str + ref: str | None + + @property + def fullname(self) -> str: + return f"{self.owner}/{self.name}" + + @property + def url(self: Self) -> str: + return f"https://github.com/{self.owner}/{self.name}" + + async def clone(self: Self, checkout_dir: Path) -> ClonedRepository: + """ + Shallow clone this repository + """ + if checkout_dir.exists(): + logger.debug(f"Reusing {self.owner}:{self.name}") + + if self.ref: + logger.debug(f"Checking out ref {self.ref}") + process = await create_subprocess_exec( + *["git", "checkout", "-f", self.ref], + cwd=checkout_dir, + env={"GIT_TERMINAL_PROMPT": "0"}, + stdout=PIPE, + stderr=PIPE, + ) + if await process.wait() != 0: + _, stderr = await process.communicate() + raise ProjectSetupError( + f"Failed to checkout {self.ref}: {stderr.decode()}" + ) + + return await ClonedRepository.from_path(checkout_dir, self) + + logger.debug(f"Cloning {self.owner}:{self.name} to {checkout_dir}") + command = [ + "git", + "clone", + "--config", + "advice.detachedHead=false", + "--quiet", + "--depth", + "1", + "--no-tags", + ] + if self.ref: + command.extend(["--branch", self.ref]) + + command.extend( + [ + f"https://github.com/{self.owner}/{self.name}", + str(checkout_dir), + ], + ) + + process = await create_subprocess_exec( + *command, env={"GIT_TERMINAL_PROMPT": "0"} + ) + + status_code = await process.wait() + + logger.debug( + f"Finished cloning {self.fullname} with status {status_code}", + ) + return await ClonedRepository.from_path(checkout_dir, self) + + +@dataclass(frozen=True) +class ClonedRepository(Repository, Serializable): + """ + A cloned GitHub repository, which includes the hash of the current commit. + """ + + commit_hash: str + path: Path + + def url_for( + self: Self, + path: str, + line_number: int | None = None, + end_line_number: int | None = None, + ) -> str: + """ + Return the remote GitHub URL for the given path in this repository. 
+ """ + url = f"https://github.com/{self.owner}/{self.name}/blob/{self.commit_hash}/{path}" + if line_number: + url += f"#L{line_number}" + if end_line_number: + url += f"-L{end_line_number}" + return url + + @property + def url(self: Self) -> str: + return f"https://github.com/{self.owner}/{self.name}@{self.commit_hash}" + + @classmethod + async def from_path(cls, path: Path, repo: Repository): + return cls( + name=repo.name, + owner=repo.owner, + ref=repo.ref, + path=path, + commit_hash=await cls._get_head_commit(path), + ) + + @staticmethod + async def _get_head_commit(checkout_dir: Path) -> str: + """ + Return the commit sha for the repository in the checkout directory. + """ + process = await create_subprocess_exec( + *["git", "rev-parse", "HEAD"], + cwd=checkout_dir, + stdout=PIPE, + ) + stdout, _ = await process.communicate() + if await process.wait() != 0: + raise ProjectSetupError(f"Failed to retrieve commit sha at {checkout_dir}") + + return stdout.decode().strip() diff --git a/python/ruff-ecosystem/ruff_ecosystem/types.py b/python/ruff-ecosystem/ruff_ecosystem/types.py new file mode 100644 index 00000000000000..e2a36d8f86176e --- /dev/null +++ b/python/ruff-ecosystem/ruff_ecosystem/types.py @@ -0,0 +1,93 @@ +from __future__ import annotations + +import abc +import dataclasses +import difflib +from dataclasses import dataclass, is_dataclass +from typing import TYPE_CHECKING, Any, Generator, Iterable, Sequence + +if TYPE_CHECKING: + from ruff_ecosystem.projects import ClonedRepository, Project + + +class Serializable(abc.ABC): + """ + Allows serialization of content by casting to a JSON-compatible type. 
+ """ + + def jsonable(self) -> Any: + # Default implementation for dataclasses + if is_dataclass(self) and not isinstance(self, type): + return dataclasses.asdict(self) + + raise NotImplementedError() + + +class Diff(Serializable): + def __init__(self, lines: Iterable[str], leading_spaces: int = 0) -> None: + self.lines = list(lines) + + # Compute added and removed lines once + self.added = list( + line[2:] + for line in self.lines + if line.startswith("+" + " " * leading_spaces) + ) + self.removed = list( + line[2:] + for line in self.lines + if line.startswith("-" + " " * leading_spaces) + ) + + def __bool__(self) -> bool: + return bool(self.added or self.removed) + + def __iter__(self) -> Generator[str, None, None]: + yield from self.lines + + @property + def lines_added(self): + return len(self.added) + + @property + def lines_removed(self): + return len(self.removed) + + @classmethod + def from_pair(cls, baseline: Sequence[str], comparison: Sequence[str]): + """ + Construct a diff from before and after. + """ + return cls(difflib.ndiff(baseline, comparison), leading_spaces=1) + + def without_unchanged_lines(self) -> Diff: + return Diff( + line for line in self.lines if line.startswith("+") or line.startswith("-") + ) + + def jsonable(self) -> Any: + return self.lines + + +@dataclass(frozen=True) +class Result(Serializable): + """ + The result of an ecosystem check for a collection of projects. + """ + + errored: list[tuple[Project, Exception]] + completed: list[tuple[Project, Comparison]] + + +@dataclass(frozen=True) +class Comparison(Serializable): + """ + The result of a completed ecosystem comparison for a single project. 
+ """ + + diff: Diff + repo: ClonedRepository + + +class RuffError(Exception): + """An error reported by Ruff.""" diff --git a/scripts/check_ecosystem.py b/scripts/check_ecosystem.py index 1b173076c1d54a..f45fae776c3e88 100755 --- a/scripts/check_ecosystem.py +++ b/scripts/check_ecosystem.py @@ -1,5 +1,9 @@ #!/usr/bin/env python3 -"""Check two versions of ruff against a corpus of open-source code. +""" +**DEPRECATED** This script is being replaced by the ruff-ecosystem package. + + +Check two versions of ruff against a corpus of open-source code. Example usage: