Skip to content

Commit

Permalink
Improve performance of Owners rule (Cherry-pick of #16563) (#16597)
Browse files Browse the repository at this point in the history
Possibly will close #16122.

@stuhood found in #16122 (comment) that we were spending substantial time in Rust parsing the same globs in the stateless function `matches_filespec`.

This improves the situation by introducing `FilespecsMatcher`, which allows us to persist the globs parsing over time. Crucially, `FilespecsMatcher` eagerly parses the globs in its constructor, unlike `PathGlobs` currently being lazy. Then, we  use a `@memoized_method` on each `SourcesField` instance to avoid ever creating the matcher more than necessary.

Before:

```
❯ hyperfine -w 2 -r 10 './pants --no-pantsd --no-buildsense-enable --no-anonymous-telemetry-enabled dependencies ::'
  Time (mean ± σ):      6.562 s ±  0.153 s    [User: 7.354 s, System: 1.516 s]
  Range (min … max):    6.420 s …  6.878 s    10 runs
```

After:

```
❯ hyperfine -w 2 -r 10 './pants --no-pantsd --no-buildsense-enable --no-anonymous-telemetry-enabled dependencies ::'
  Time (mean ± σ):      6.460 s ±  0.065 s    [User: 7.301 s, System: 1.447 s]
  Range (min … max):    6.340 s …  6.576 s    10 runs
```

(Note, we expect the performance to be better in repos with the shape described at #16122 (comment))

## Relationship to `PathGlobs`

We also considered changing `PathGlobs` to eagerly parse, and then defining `PathGlobs.matches()`, rather than introducing the new type `FilespecsMatcher`.

I believe `FilespecsMatcher` is a clearer API: it is only used for in-memory checks if globs match paths, whereas `PathGlobs` is used for on-disk.

Notably, the Rust code only overlaps in how the exclude patterns are ignored. So, making this split allows us to simplify `PathGlobs`.

A downside of having separate types is it's harder to share the duplicate parsing of the exclude patterns. This seems fine, given that the include patterns are separate code paths.

(See #16564 for improving `PathGlobs` to also eagerly parse.)
  • Loading branch information
Eric-Arellano authored Aug 22, 2022
1 parent 6a0c3af commit 5d80edb
Show file tree
Hide file tree
Showing 15 changed files with 165 additions and 137 deletions.
8 changes: 3 additions & 5 deletions src/python/pants/backend/java/goals/tailor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from pants.engine.rules import collect_rules, rule
from pants.engine.target import Target
from pants.engine.unions import UnionRule
from pants.source.filespec import Filespec, matches_filespec
from pants.source.filespec import FilespecMatcher
from pants.util.logging import LogLevel


Expand All @@ -36,10 +36,8 @@ class PutativeJavaTargetsRequest(PutativeTargetsRequest):

def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
"""Returns a dict of target type -> files that belong to targets of that type."""
tests_filespec = Filespec(includes=list(JavaTestsGeneratorSourcesField.default))
test_filenames = set(
matches_filespec(tests_filespec, paths=[os.path.basename(path) for path in paths])
)
tests_filespec_matcher = FilespecMatcher(JavaTestsGeneratorSourcesField.default, ())
test_filenames = set(tests_filespec_matcher.matches([os.path.basename(path) for path in paths]))
test_files = {path for path in paths if os.path.basename(path) in test_filenames}
sources_files = set(paths) - test_files
return {JunitTestsGeneratorTarget: test_files, JavaSourcesGeneratorTarget: sources_files}
Expand Down
6 changes: 3 additions & 3 deletions src/python/pants/backend/kotlin/goals/tailor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from pants.engine.rules import collect_rules, rule
from pants.engine.target import Target
from pants.engine.unions import UnionRule
from pants.source.filespec import Filespec, matches_filespec
from pants.source.filespec import FilespecMatcher
from pants.util.logging import LogLevel


Expand All @@ -36,12 +36,12 @@ class PutativeKotlinTargetsRequest(PutativeTargetsRequest):

def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
"""Returns a dict of target type -> files that belong to targets of that type."""
junit_filespec = Filespec(includes=list(KotlinJunitTestsGeneratorSourcesField.default))
junit_filespec_matcher = FilespecMatcher(KotlinJunitTestsGeneratorSourcesField.default, ())
junit_files = {
path
for path in paths
if os.path.basename(path)
in set(matches_filespec(junit_filespec, paths=[os.path.basename(path) for path in paths]))
in set(junit_filespec_matcher.matches([os.path.basename(path) for path in paths]))
}
sources_files = set(paths) - junit_files
return {
Expand Down
10 changes: 5 additions & 5 deletions src/python/pants/backend/python/goals/tailor.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from pants.engine.rules import collect_rules, rule, rule_helper
from pants.engine.target import Target, UnexpandedTargets
from pants.engine.unions import UnionRule
from pants.source.filespec import Filespec, matches_filespec
from pants.source.filespec import FilespecMatcher
from pants.source.source_root import SourceRootsRequest, SourceRootsResult
from pants.util.logging import LogLevel

Expand All @@ -53,13 +53,13 @@ class PutativePythonTargetsRequest(PutativeTargetsRequest):

def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
"""Returns a dict of target type -> files that belong to targets of that type."""
tests_filespec = Filespec(includes=list(PythonTestsGeneratingSourcesField.default))
test_utils_filespec = Filespec(includes=list(PythonTestUtilsGeneratingSourcesField.default))
tests_filespec_matcher = FilespecMatcher(PythonTestsGeneratingSourcesField.default, ())
test_utils_filespec_matcher = FilespecMatcher(PythonTestUtilsGeneratingSourcesField.default, ())

path_to_file_name = {path: os.path.basename(path) for path in paths}
test_file_names = set(matches_filespec(tests_filespec, paths=path_to_file_name.values()))
test_file_names = set(tests_filespec_matcher.matches(list(path_to_file_name.values())))
test_util_file_names = set(
matches_filespec(test_utils_filespec, paths=path_to_file_name.values())
test_utils_filespec_matcher.matches(list(path_to_file_name.values()))
)

test_files = {
Expand Down
12 changes: 5 additions & 7 deletions src/python/pants/backend/scala/goals/tailor.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from pants.engine.rules import collect_rules, rule
from pants.engine.target import Target
from pants.engine.unions import UnionRule
from pants.source.filespec import Filespec, matches_filespec
from pants.source.filespec import FilespecMatcher
from pants.util.logging import LogLevel


Expand All @@ -38,21 +38,19 @@ class PutativeScalaTargetsRequest(PutativeTargetsRequest):

def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
"""Returns a dict of target type -> files that belong to targets of that type."""
scalatest_filespec = Filespec(includes=list(ScalatestTestsGeneratorSourcesField.default))
junit_filespec = Filespec(includes=list(ScalaJunitTestsGeneratorSourcesField.default))
scalatest_filespec_matcher = FilespecMatcher(ScalatestTestsGeneratorSourcesField.default, ())
junit_filespec_matcher = FilespecMatcher(ScalaJunitTestsGeneratorSourcesField.default, ())
scalatest_files = {
path
for path in paths
if os.path.basename(path)
in set(
matches_filespec(scalatest_filespec, paths=[os.path.basename(path) for path in paths])
)
in set(scalatest_filespec_matcher.matches([os.path.basename(path) for path in paths]))
}
junit_files = {
path
for path in paths
if os.path.basename(path)
in set(matches_filespec(junit_filespec, paths=[os.path.basename(path) for path in paths]))
in set(junit_filespec_matcher.matches([os.path.basename(path) for path in paths]))
}
sources_files = set(paths) - scalatest_files - junit_files
return {
Expand Down
8 changes: 3 additions & 5 deletions src/python/pants/backend/shell/tailor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from pants.engine.rules import collect_rules, rule
from pants.engine.target import Target
from pants.engine.unions import UnionRule
from pants.source.filespec import Filespec, matches_filespec
from pants.source.filespec import FilespecMatcher
from pants.util.logging import LogLevel


Expand All @@ -36,10 +36,8 @@ class PutativeShellTargetsRequest(PutativeTargetsRequest):

def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
"""Returns a dict of target type -> files that belong to targets of that type."""
tests_filespec = Filespec(includes=list(Shunit2TestsGeneratorSourcesField.default))
test_filenames = set(
matches_filespec(tests_filespec, paths=[os.path.basename(path) for path in paths])
)
tests_filespec_matcher = FilespecMatcher(Shunit2TestsGeneratorSourcesField.default, ())
test_filenames = set(tests_filespec_matcher.matches([os.path.basename(path) for path in paths]))
test_files = {path for path in paths if os.path.basename(path) in test_filenames}
sources_files = set(paths) - test_files
return {Shunit2TestsGeneratorTarget: test_files, ShellSourcesGeneratorTarget: sources_files}
Expand Down
17 changes: 9 additions & 8 deletions src/python/pants/core/goals/tailor.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from pants.engine.unions import UnionMembership, union
from pants.option.global_options import GlobalOptions
from pants.option.option_types import BoolOption, DictOption, StrListOption, StrOption
from pants.source.filespec import Filespec, matches_filespec
from pants.source.filespec import FilespecMatcher
from pants.util.docutil import bin_name, doc_url
from pants.util.frozendict import FrozenDict
from pants.util.logging import LogLevel
Expand Down Expand Up @@ -367,8 +367,8 @@ def alias_for(self, standard_type: str) -> str | None:
def validate_build_file_name(self, build_file_patterns: tuple[str, ...]) -> None:
"""Check that the specified BUILD file name works with the repository's BUILD file
patterns."""
filespec = Filespec(includes=list(build_file_patterns))
if not bool(matches_filespec(filespec, paths=[self.build_file_name])):
filespec_matcher = FilespecMatcher(build_file_patterns, ())
if not bool(filespec_matcher.matches([self.build_file_name])):
raise ValueError(
f"The option `[{self.options_scope}].build_file_name` is set to "
f"`{self.build_file_name}`, which is not compatible with "
Expand All @@ -380,13 +380,14 @@ def validate_build_file_name(self, build_file_patterns: tuple[str, ...]) -> None
def filter_by_ignores(
self, putative_targets: Iterable[PutativeTarget], build_file_ignores: tuple[str, ...]
) -> Iterator[PutativeTarget]:
ignore_paths_filespec = Filespec(includes=[*self.ignore_paths, *build_file_ignores])
ignore_paths_filespec_matcher = FilespecMatcher(
(*self.ignore_paths, *build_file_ignores), ()
)
for ptgt in putative_targets:
is_ignored_file = bool(
matches_filespec(
ignore_paths_filespec,
paths=[os.path.join(ptgt.path, self.build_file_name)],
)
ignore_paths_filespec_matcher.matches(
[os.path.join(ptgt.path, self.build_file_name)]
),
)
if is_ignored_file:
continue
Expand Down
8 changes: 3 additions & 5 deletions src/python/pants/core/goals/tailor_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from pants.engine.rules import Get, QueryRule, rule
from pants.engine.target import MultipleSourcesField, Target
from pants.engine.unions import UnionRule
from pants.source.filespec import Filespec, matches_filespec
from pants.source.filespec import FilespecMatcher
from pants.testutil.option_util import create_goal_subsystem
from pants.testutil.pytest_util import no_exception
from pants.testutil.rule_runner import RuleRunner
Expand Down Expand Up @@ -79,11 +79,9 @@ async def find_fortran_targets(
all_fortran_files = await Get(Paths, PathGlobs, req.path_globs("*.f90"))
unowned_shell_files = set(all_fortran_files.files) - set(all_owned_sources)

tests_filespec = Filespec(includes=list(FortranTestsSources.default))
tests_filespec_matcher = FilespecMatcher(FortranTestsSources.default, ())
test_filenames = set(
matches_filespec(
tests_filespec, paths=[os.path.basename(path) for path in unowned_shell_files]
)
tests_filespec_matcher.matches([os.path.basename(path) for path in unowned_shell_files])
)
test_files = {path for path in unowned_shell_files if os.path.basename(path) in test_filenames}
sources_files = set(unowned_shell_files) - test_files
Expand Down
5 changes: 2 additions & 3 deletions src/python/pants/engine/internals/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@
OwnersNotFoundBehavior,
UnmatchedBuildFileGlobs,
)
from pants.source.filespec import matches_filespec
from pants.util.docutil import bin_name, doc_url
from pants.util.frozendict import FrozenDict
from pants.util.logging import LogLevel
Expand Down Expand Up @@ -787,7 +786,7 @@ def create_live_and_deleted_gets(

for candidate_tgt, bfa in zip(candidate_tgts, build_file_addresses):
matching_files = set(
matches_filespec(candidate_tgt.get(SourcesField).filespec, paths=sources_set)
candidate_tgt.get(SourcesField).filespec_matcher.matches(list(sources_set))
)
# Also consider secondary ownership, meaning it's not a `SourcesField` field with
# primary ownership, but the target still should match the file. We can't use
Expand All @@ -799,7 +798,7 @@ def create_live_and_deleted_gets(
)
for secondary_owner_field in secondary_owner_fields:
matching_files.update(
matches_filespec(secondary_owner_field.filespec, paths=sources_set)
*secondary_owner_field.filespec_matcher.matches(list(sources_set))
)
if not matching_files and bfa.rel_path not in sources_set:
continue
Expand Down
12 changes: 7 additions & 5 deletions src/python/pants/engine/internals/native_engine.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ from typing import Any, Generic, Iterable, Sequence, TextIO, Tuple, TypeVar, ove

from typing_extensions import Protocol

from pants.engine.fs import PathGlobs
from pants.engine.internals.scheduler import Workunit, _PathGlobsAndRootCollection
from pants.engine.internals.session import SessionValues
from pants.engine.process import InteractiveProcessResult
Expand Down Expand Up @@ -149,16 +148,19 @@ class RemovePrefix:
def __hash__(self) -> int: ...
def __repr__(self) -> str: ...

class FilespecMatcher:
def __init__(self, includes: Sequence[str], excludes: Sequence[str]) -> None: ...
def __eq__(self, other: FilespecMatcher | Any) -> bool: ...
def __hash__(self) -> int: ...
def __repr__(self) -> str: ...
def matches(self, paths: Sequence[str]) -> list[str]: ...

EMPTY_DIGEST: Digest
EMPTY_FILE_DIGEST: FileDigest
EMPTY_SNAPSHOT: Snapshot

def default_cache_path() -> str: ...

# TODO: Really, `paths` should be `Sequence[str]`. Fix and update call sites so that we don't
# cast to `tuple()` when not necessary.
def match_path_globs(path_globs: PathGlobs, paths: tuple[str, ...]) -> str: ...

# ------------------------------------------------------------------------------
# Workunits
# ------------------------------------------------------------------------------
Expand Down
16 changes: 14 additions & 2 deletions src/python/pants/engine/target.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
)
from pants.engine.unions import UnionMembership, UnionRule, union
from pants.option.global_options import UnmatchedBuildFileGlobs
from pants.source.filespec import Filespec
from pants.source.filespec import Filespec, FilespecMatcher
from pants.util.collections import ensure_list, ensure_str_list
from pants.util.dirutil import fast_relpath
from pants.util.docutil import bin_name, doc_url
Expand Down Expand Up @@ -2063,7 +2063,7 @@ def path_globs(self, unmatched_build_file_globs: UnmatchedBuildFileGlobs) -> Pat
),
)

@property
@memoized_property
def filespec(self) -> Filespec:
"""The original globs, returned in the Filespec dict format.
Expand All @@ -2081,6 +2081,12 @@ def filespec(self) -> Filespec:
result["excludes"] = excludes
return result

@memoized_property
def filespec_matcher(self) -> FilespecMatcher:
# Note: memoized because parsing the globs is expensive:
# https://github.com/pantsbuild/pants/issues/16122
return FilespecMatcher(self.filespec["includes"], self.filespec.get("excludes", []))


class MultipleSourcesField(SourcesField, StringSequenceField):
"""The `sources: list[str]` field.
Expand Down Expand Up @@ -2400,6 +2406,12 @@ def filespec(self) -> Filespec:
the build root.
"""

@memoized_property
def filespec_matcher(self) -> FilespecMatcher:
# Note: memoized because parsing the globs is expensive:
# https://github.com/pantsbuild/pants/issues/16122
return FilespecMatcher(self.filespec["includes"], self.filespec.get("excludes", []))


def targets_with_sources_types(
sources_types: Iterable[type[SourcesField]],
Expand Down
16 changes: 7 additions & 9 deletions src/python/pants/source/filespec.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@

from typing_extensions import TypedDict

from pants.engine.fs import PathGlobs
from pants.engine.internals import native_engine
from pants.base.deprecated import deprecated
from pants.engine.internals.native_engine import (
FilespecMatcher as FilespecMatcher, # explicit re-export
)


class _IncludesDict(TypedDict, total=True):
Expand All @@ -26,11 +28,7 @@ class Filespec(_IncludesDict, total=False):
excludes: list[str]


@deprecated("2.15.0.dev0", "Use `FilespecMatcher().matches()` instead", start_version="2.14.0.dev5")
def matches_filespec(spec: Filespec, *, paths: Iterable[str]) -> tuple[str, ...]:
include_patterns = spec["includes"]
exclude_patterns = [f"!{e}" for e in spec.get("excludes", [])]
return tuple(
native_engine.match_path_globs(
PathGlobs((*include_patterns, *exclude_patterns)), tuple(paths)
)
)
matcher = FilespecMatcher(spec["includes"], spec.get("excludes", []))
return tuple(matcher.matches(tuple(paths)))
6 changes: 4 additions & 2 deletions src/python/pants/source/filespec_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pytest

from pants.engine.fs import PathGlobs, Snapshot
from pants.source.filespec import matches_filespec
from pants.source.filespec import FilespecMatcher, matches_filespec
from pants.testutil.rule_runner import RuleRunner


Expand All @@ -19,7 +19,9 @@ def assert_rule_match(
rule_runner: RuleRunner, glob: str, paths: Tuple[str, ...], *, should_match: bool
) -> None:
# Confirm in-memory behavior.
matched_filespec = matches_filespec({"includes": [glob]}, paths=paths)
matched_filespec = tuple(FilespecMatcher([glob], ()).matches(paths))
deprecated_matched_filespec = matches_filespec({"includes": [glob]}, paths=paths)
assert matched_filespec == deprecated_matched_filespec
if should_match:
assert matched_filespec == paths
else:
Expand Down
Loading

0 comments on commit 5d80edb

Please sign in to comment.