Improve performance of Owners rule (#16563)

Possibly will close #16122. @stuhood found in #16122 (comment) that we were spending substantial time in Rust parsing the same globs in the stateless function `matches_filespec`. This improves the situation by introducing `FilespecsMatcher`, which allows us to persist the globs parsing over time. Crucially, `FilespecsMatcher` eagerly parses the globs in its constructor, unlike `PathGlobs` currently being lazy. Then, we use a `@memoized_method` on each `SourcesField` instance to avoid ever creating the matcher more than necessary. Before: ``` ❯ hyperfine -w 2 -r 10 './pants --no-pantsd --no-buildsense-enable --no-anonymous-telemetry-enabled dependencies ::' Time (mean ± σ): 6.562 s ± 0.153 s [User: 7.354 s, System: 1.516 s] Range (min … max): 6.420 s … 6.878 s 10 runs ``` After: ``` ❯ hyperfine -w 2 -r 10 './pants --no-pantsd --no-buildsense-enable --no-anonymous-telemetry-enabled dependencies ::' Time (mean ± σ): 6.460 s ± 0.065 s [User: 7.301 s, System: 1.447 s] Range (min … max): 6.340 s … 6.576 s 10 runs ``` (Note, we expect the performance to be better in repos with the shape described at #16122 (comment)) ## Relationship to `PathGlobs` We also considered changing `PathGlobs` to eagerly parse, and then defining `PathGlobs.matches()`, rather than introducing the new type `FilespecsMatcher`. I believe `FilespecsMatcher` is a clearer API: it is only used for in-memory checks if globs match paths, whereas `PathGlobs` is used for on-disk. Notably, the Rust code only overlaps in how the exclude patterns are ignored. So, making this split allows us to simplify `PathGlobs`. A downside of having separate types is it's harder to share the duplicate parsing of the exclude patterns. This seems fine, given that the include patterns are separate code paths. (See #16564 for improving `PathGlobs` to also eagerly parse.)
pantsbuild · Aug 18, 2022 · d55493f · d55493f
1 parent 8f39837
commit d55493f
Show file tree

Hide file tree

Showing 16 changed files with 174 additions and 137 deletions.
diff --git a/docs/markdown/Writing Plugins/common-plugin-tasks/plugin-upgrade-guide.md b/docs/markdown/Writing Plugins/common-plugin-tasks/plugin-upgrade-guide.md
@@ -65,6 +65,15 @@ your tool.
 However, keep the `UnionRule` the same, i.e. with the first argument still
 `GenerateToolLockfileSentinel`.
 
+### `matches_filespec()` replaced by `FilespecMatcher`
+
+Instead, use `FilespecMatcher(includes=[], excludes=[]).matches(paths: Sequence[str])` from
+`pants.source.filespec`.
+
+The functionality is the same, but can have better performance because we don't need to parse the
+same globs each time `.matches()` is called. When possible, reuse the same `FilespecMatcher` object
+to get these performance benefits.
+
 2.13
 ----
 

diff --git a/src/python/pants/backend/java/goals/tailor.py b/src/python/pants/backend/java/goals/tailor.py
@@ -25,7 +25,7 @@
 from pants.engine.rules import collect_rules, rule
 from pants.engine.target import Target
 from pants.engine.unions import UnionRule
-from pants.source.filespec import Filespec, matches_filespec
+from pants.source.filespec import FilespecMatcher
 from pants.util.logging import LogLevel
 
 
@@ -36,10 +36,8 @@ class PutativeJavaTargetsRequest(PutativeTargetsRequest):
 
 def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
     """Returns a dict of target type -> files that belong to targets of that type."""
-    tests_filespec = Filespec(includes=list(JavaTestsGeneratorSourcesField.default))
-    test_filenames = set(
-        matches_filespec(tests_filespec, paths=[os.path.basename(path) for path in paths])
-    )
+    tests_filespec_matcher = FilespecMatcher(JavaTestsGeneratorSourcesField.default, ())
+    test_filenames = set(tests_filespec_matcher.matches([os.path.basename(path) for path in paths]))
     test_files = {path for path in paths if os.path.basename(path) in test_filenames}
     sources_files = set(paths) - test_files
     return {JunitTestsGeneratorTarget: test_files, JavaSourcesGeneratorTarget: sources_files}

diff --git a/src/python/pants/backend/kotlin/goals/tailor.py b/src/python/pants/backend/kotlin/goals/tailor.py
@@ -25,7 +25,7 @@
 from pants.engine.rules import collect_rules, rule
 from pants.engine.target import Target
 from pants.engine.unions import UnionRule
-from pants.source.filespec import Filespec, matches_filespec
+from pants.source.filespec import FilespecMatcher
 from pants.util.logging import LogLevel
 
 
@@ -36,12 +36,12 @@ class PutativeKotlinTargetsRequest(PutativeTargetsRequest):
 
 def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
     """Returns a dict of target type -> files that belong to targets of that type."""
-    junit_filespec = Filespec(includes=list(KotlinJunitTestsGeneratorSourcesField.default))
+    junit_filespec_matcher = FilespecMatcher(KotlinJunitTestsGeneratorSourcesField.default, ())
     junit_files = {
         path
         for path in paths
         if os.path.basename(path)
-        in set(matches_filespec(junit_filespec, paths=[os.path.basename(path) for path in paths]))
+        in set(junit_filespec_matcher.matches([os.path.basename(path) for path in paths]))
     }
     sources_files = set(paths) - junit_files
     return {

diff --git a/src/python/pants/backend/python/goals/tailor.py b/src/python/pants/backend/python/goals/tailor.py
@@ -39,7 +39,7 @@
 from pants.engine.rules import collect_rules, rule, rule_helper
 from pants.engine.target import Target, UnexpandedTargets
 from pants.engine.unions import UnionRule
-from pants.source.filespec import Filespec, matches_filespec
+from pants.source.filespec import FilespecMatcher
 from pants.source.source_root import SourceRootsRequest, SourceRootsResult
 from pants.util.logging import LogLevel
 
@@ -53,13 +53,13 @@ class PutativePythonTargetsRequest(PutativeTargetsRequest):
 
 def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
     """Returns a dict of target type -> files that belong to targets of that type."""
-    tests_filespec = Filespec(includes=list(PythonTestsGeneratingSourcesField.default))
-    test_utils_filespec = Filespec(includes=list(PythonTestUtilsGeneratingSourcesField.default))
+    tests_filespec_matcher = FilespecMatcher(PythonTestsGeneratingSourcesField.default, ())
+    test_utils_filespec_matcher = FilespecMatcher(PythonTestUtilsGeneratingSourcesField.default, ())
 
     path_to_file_name = {path: os.path.basename(path) for path in paths}
-    test_file_names = set(matches_filespec(tests_filespec, paths=path_to_file_name.values()))
+    test_file_names = set(tests_filespec_matcher.matches(list(path_to_file_name.values())))
     test_util_file_names = set(
-        matches_filespec(test_utils_filespec, paths=path_to_file_name.values())
+        test_utils_filespec_matcher.matches(list(path_to_file_name.values()))
     )
 
     test_files = {

diff --git a/src/python/pants/backend/scala/goals/tailor.py b/src/python/pants/backend/scala/goals/tailor.py
@@ -27,7 +27,7 @@
 from pants.engine.rules import collect_rules, rule
 from pants.engine.target import Target
 from pants.engine.unions import UnionRule
-from pants.source.filespec import Filespec, matches_filespec
+from pants.source.filespec import FilespecMatcher
 from pants.util.logging import LogLevel
 
 
@@ -38,21 +38,19 @@ class PutativeScalaTargetsRequest(PutativeTargetsRequest):
 
 def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
     """Returns a dict of target type -> files that belong to targets of that type."""
-    scalatest_filespec = Filespec(includes=list(ScalatestTestsGeneratorSourcesField.default))
-    junit_filespec = Filespec(includes=list(ScalaJunitTestsGeneratorSourcesField.default))
+    scalatest_filespec_matcher = FilespecMatcher(ScalatestTestsGeneratorSourcesField.default, ())
+    junit_filespec_matcher = FilespecMatcher(ScalaJunitTestsGeneratorSourcesField.default, ())
     scalatest_files = {
         path
         for path in paths
         if os.path.basename(path)
-        in set(
-            matches_filespec(scalatest_filespec, paths=[os.path.basename(path) for path in paths])
-        )
+        in set(scalatest_filespec_matcher.matches([os.path.basename(path) for path in paths]))
     }
     junit_files = {
         path
         for path in paths
         if os.path.basename(path)
-        in set(matches_filespec(junit_filespec, paths=[os.path.basename(path) for path in paths]))
+        in set(junit_filespec_matcher.matches([os.path.basename(path) for path in paths]))
     }
     sources_files = set(paths) - scalatest_files - junit_files
     return {

diff --git a/src/python/pants/backend/shell/tailor.py b/src/python/pants/backend/shell/tailor.py
@@ -25,7 +25,7 @@
 from pants.engine.rules import collect_rules, rule
 from pants.engine.target import Target
 from pants.engine.unions import UnionRule
-from pants.source.filespec import Filespec, matches_filespec
+from pants.source.filespec import FilespecMatcher
 from pants.util.logging import LogLevel
 
 
@@ -36,10 +36,8 @@ class PutativeShellTargetsRequest(PutativeTargetsRequest):
 
 def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
     """Returns a dict of target type -> files that belong to targets of that type."""
-    tests_filespec = Filespec(includes=list(Shunit2TestsGeneratorSourcesField.default))
-    test_filenames = set(
-        matches_filespec(tests_filespec, paths=[os.path.basename(path) for path in paths])
-    )
+    tests_filespec_matcher = FilespecMatcher(Shunit2TestsGeneratorSourcesField.default, ())
+    test_filenames = set(tests_filespec_matcher.matches([os.path.basename(path) for path in paths]))
     test_files = {path for path in paths if os.path.basename(path) in test_filenames}
     sources_files = set(paths) - test_files
     return {Shunit2TestsGeneratorTarget: test_files, ShellSourcesGeneratorTarget: sources_files}

diff --git a/src/python/pants/core/goals/tailor.py b/src/python/pants/core/goals/tailor.py
@@ -43,7 +43,7 @@
 from pants.engine.unions import UnionMembership, union
 from pants.option.global_options import GlobalOptions
 from pants.option.option_types import BoolOption, DictOption, StrListOption, StrOption
-from pants.source.filespec import Filespec, matches_filespec
+from pants.source.filespec import FilespecMatcher
 from pants.util.docutil import bin_name, doc_url
 from pants.util.frozendict import FrozenDict
 from pants.util.logging import LogLevel
@@ -362,8 +362,8 @@ def alias_for(self, standard_type: str) -> str | None:
     def validate_build_file_name(self, build_file_patterns: tuple[str, ...]) -> None:
         """Check that the specified BUILD file name works with the repository's BUILD file
         patterns."""
-        filespec = Filespec(includes=list(build_file_patterns))
-        if not bool(matches_filespec(filespec, paths=[self.build_file_name])):
+        filespec_matcher = FilespecMatcher(build_file_patterns, ())
+        if not bool(filespec_matcher.matches([self.build_file_name])):
             raise ValueError(
                 f"The option `[{self.options_scope}].build_file_name` is set to "
                 f"`{self.build_file_name}`, which is not compatible with "
@@ -375,13 +375,14 @@ def validate_build_file_name(self, build_file_patterns: tuple[str, ...]) -> None
     def filter_by_ignores(
         self, putative_targets: Iterable[PutativeTarget], build_file_ignores: tuple[str, ...]
     ) -> Iterator[PutativeTarget]:
-        ignore_paths_filespec = Filespec(includes=[*self.ignore_paths, *build_file_ignores])
+        ignore_paths_filespec_matcher = FilespecMatcher(
+            (*self.ignore_paths, *build_file_ignores), ()
+        )
         for ptgt in putative_targets:
             is_ignored_file = bool(
-                matches_filespec(
-                    ignore_paths_filespec,
-                    paths=[os.path.join(ptgt.path, self.build_file_name)],
-                )
+                ignore_paths_filespec_matcher.matches(
+                    [os.path.join(ptgt.path, self.build_file_name)]
+                ),
             )
             if is_ignored_file:
                 continue

diff --git a/src/python/pants/core/goals/tailor_test.py b/src/python/pants/core/goals/tailor_test.py
@@ -34,7 +34,7 @@
 from pants.engine.rules import Get, QueryRule, rule
 from pants.engine.target import MultipleSourcesField, Target
 from pants.engine.unions import UnionRule
-from pants.source.filespec import Filespec, matches_filespec
+from pants.source.filespec import FilespecMatcher
 from pants.testutil.option_util import create_goal_subsystem
 from pants.testutil.pytest_util import no_exception
 from pants.testutil.rule_runner import RuleRunner
@@ -79,11 +79,9 @@ async def find_fortran_targets(
     all_fortran_files = await Get(Paths, PathGlobs, req.path_globs("*.f90"))
     unowned_shell_files = set(all_fortran_files.files) - set(all_owned_sources)
 
-    tests_filespec = Filespec(includes=list(FortranTestsSources.default))
+    tests_filespec_matcher = FilespecMatcher(FortranTestsSources.default, ())
     test_filenames = set(
-        matches_filespec(
-            tests_filespec, paths=[os.path.basename(path) for path in unowned_shell_files]
-        )
+        tests_filespec_matcher.matches([os.path.basename(path) for path in unowned_shell_files])
     )
     test_files = {path for path in unowned_shell_files if os.path.basename(path) in test_filenames}
     sources_files = set(unowned_shell_files) - test_files

diff --git a/src/python/pants/engine/internals/graph.py b/src/python/pants/engine/internals/graph.py
@@ -91,7 +91,6 @@
     OwnersNotFoundBehavior,
     UnmatchedBuildFileGlobs,
 )
-from pants.source.filespec import matches_filespec
 from pants.util.docutil import bin_name, doc_url
 from pants.util.frozendict import FrozenDict
 from pants.util.logging import LogLevel
@@ -786,7 +785,7 @@ def create_live_and_deleted_gets(
 
         for candidate_tgt, bfa in zip(candidate_tgts, build_file_addresses):
             matching_files = set(
-                matches_filespec(candidate_tgt.get(SourcesField).filespec, paths=sources_set)
+                candidate_tgt.get(SourcesField).filespec_matcher.matches(list(sources_set))
             )
             # Also consider secondary ownership, meaning it's not a `SourcesField` field with
             # primary ownership, but the target still should match the file. We can't use
@@ -798,7 +797,7 @@ def create_live_and_deleted_gets(
             )
             for secondary_owner_field in secondary_owner_fields:
                 matching_files.update(
-                    matches_filespec(secondary_owner_field.filespec, paths=sources_set)
+                    *secondary_owner_field.filespec_matcher.matches(list(sources_set))
                 )
             if not matching_files and not (
                 owners_request.match_if_owning_build_file_included_in_sources

diff --git a/src/python/pants/engine/internals/native_engine.pyi b/src/python/pants/engine/internals/native_engine.pyi
@@ -8,7 +8,6 @@ from typing import Any, Generic, Iterable, Sequence, TextIO, Tuple, TypeVar, ove
 
 from typing_extensions import Protocol
 
-from pants.engine.fs import PathGlobs
 from pants.engine.internals.scheduler import Workunit, _PathGlobsAndRootCollection
 from pants.engine.internals.session import SessionValues
 from pants.engine.process import InteractiveProcessResult
@@ -149,16 +148,19 @@ class RemovePrefix:
     def __hash__(self) -> int: ...
     def __repr__(self) -> str: ...
 
+class FilespecMatcher:
+    def __init__(self, includes: Sequence[str], excludes: Sequence[str]) -> None: ...
+    def __eq__(self, other: FilespecMatcher | Any) -> bool: ...
+    def __hash__(self) -> int: ...
+    def __repr__(self) -> str: ...
+    def matches(self, paths: Sequence[str]) -> list[str]: ...
+
 EMPTY_DIGEST: Digest
 EMPTY_FILE_DIGEST: FileDigest
 EMPTY_SNAPSHOT: Snapshot
 
 def default_cache_path() -> str: ...
 
-# TODO: Really, `paths` should be `Sequence[str]`. Fix and update call sites so that we don't
-#  cast to `tuple()` when not necessary.
-def match_path_globs(path_globs: PathGlobs, paths: tuple[str, ...]) -> str: ...
-
 # ------------------------------------------------------------------------------
 # Workunits
 # ------------------------------------------------------------------------------

diff --git a/src/python/pants/engine/target.py b/src/python/pants/engine/target.py
@@ -51,7 +51,7 @@
 )
 from pants.engine.unions import UnionMembership, UnionRule, union
 from pants.option.global_options import UnmatchedBuildFileGlobs
-from pants.source.filespec import Filespec
+from pants.source.filespec import Filespec, FilespecMatcher
 from pants.util.collections import ensure_list, ensure_str_list
 from pants.util.dirutil import fast_relpath
 from pants.util.docutil import bin_name, doc_url
@@ -2071,7 +2071,7 @@ def path_globs(self, unmatched_build_file_globs: UnmatchedBuildFileGlobs) -> Pat
             ),
         )
 
-    @property
+    @memoized_property
     def filespec(self) -> Filespec:
         """The original globs, returned in the Filespec dict format.
 
@@ -2089,6 +2089,12 @@ def filespec(self) -> Filespec:
             result["excludes"] = excludes
         return result
 
+    @memoized_property
+    def filespec_matcher(self) -> FilespecMatcher:
+        # Note: memoized because parsing the globs is expensive:
+        # https://github.com/pantsbuild/pants/issues/16122
+        return FilespecMatcher(self.filespec["includes"], self.filespec.get("excludes", []))
+
 
 class MultipleSourcesField(SourcesField, StringSequenceField):
     """The `sources: list[str]` field.
@@ -2408,6 +2414,12 @@ def filespec(self) -> Filespec:
         the build root.
         """
 
+    @memoized_property
+    def filespec_matcher(self) -> FilespecMatcher:
+        # Note: memoized because parsing the globs is expensive:
+        # https://github.com/pantsbuild/pants/issues/16122
+        return FilespecMatcher(self.filespec["includes"], self.filespec.get("excludes", []))
+
 
 def targets_with_sources_types(
     sources_types: Iterable[type[SourcesField]],

diff --git a/src/python/pants/source/filespec.py b/src/python/pants/source/filespec.py
@@ -7,8 +7,10 @@
 
 from typing_extensions import TypedDict
 
-from pants.engine.fs import PathGlobs
-from pants.engine.internals import native_engine
+from pants.base.deprecated import deprecated
+from pants.engine.internals.native_engine import (
+    FilespecMatcher as FilespecMatcher,  # explicit re-export
+)
 
 
 class _IncludesDict(TypedDict, total=True):
@@ -26,11 +28,7 @@ class Filespec(_IncludesDict, total=False):
     excludes: list[str]
 
 
+@deprecated("2.15.0.dev0", "Use `FilespecMatcher().matches()` instead", start_version="2.14.0.dev5")
 def matches_filespec(spec: Filespec, *, paths: Iterable[str]) -> tuple[str, ...]:
-    include_patterns = spec["includes"]
-    exclude_patterns = [f"!{e}" for e in spec.get("excludes", [])]
-    return tuple(
-        native_engine.match_path_globs(
-            PathGlobs((*include_patterns, *exclude_patterns)), tuple(paths)
-        )
-    )
+    matcher = FilespecMatcher(spec["includes"], spec.get("excludes", []))
+    return tuple(matcher.matches(tuple(paths)))
diff --git a/src/python/pants/source/filespec_test.py b/src/python/pants/source/filespec_test.py
@@ -6,7 +6,7 @@
 import pytest
 
 from pants.engine.fs import PathGlobs, Snapshot
-from pants.source.filespec import matches_filespec
+from pants.source.filespec import FilespecMatcher, matches_filespec
 from pants.testutil.rule_runner import RuleRunner
 
 
@@ -19,7 +19,9 @@ def assert_rule_match(
     rule_runner: RuleRunner, glob: str, paths: Tuple[str, ...], *, should_match: bool
 ) -> None:
     # Confirm in-memory behavior.
-    matched_filespec = matches_filespec({"includes": [glob]}, paths=paths)
+    matched_filespec = tuple(FilespecMatcher([glob], ()).matches(paths))
+    deprecated_matched_filespec = matches_filespec({"includes": [glob]}, paths=paths)
+    assert matched_filespec == deprecated_matched_filespec
     if should_match:
         assert matched_filespec == paths
     else: