Skip to content

Commit

Permalink
Improve performance of Owners rule (#16563)
Browse files Browse the repository at this point in the history
Possibly will close #16122.

@stuhood found in #16122 (comment) that we were spending substantial time in Rust parsing the same globs in the stateless function `matches_filespec`.

This improves the situation by introducing `FilespecsMatcher`, which allows us to persist the globs parsing over time. Crucially, `FilespecsMatcher` eagerly parses the globs in its constructor, unlike `PathGlobs` currently being lazy. Then, we  use a `@memoized_method` on each `SourcesField` instance to avoid ever creating the matcher more than necessary.

Before:

```
❯ hyperfine -w 2 -r 10 './pants --no-pantsd --no-buildsense-enable --no-anonymous-telemetry-enabled dependencies ::'
  Time (mean ± σ):      6.562 s ±  0.153 s    [User: 7.354 s, System: 1.516 s]
  Range (min … max):    6.420 s …  6.878 s    10 runs
```

After:

```
❯ hyperfine -w 2 -r 10 './pants --no-pantsd --no-buildsense-enable --no-anonymous-telemetry-enabled dependencies ::'
  Time (mean ± σ):      6.460 s ±  0.065 s    [User: 7.301 s, System: 1.447 s]
  Range (min … max):    6.340 s …  6.576 s    10 runs
```

(Note, we expect the performance to be better in repos with the shape described at #16122 (comment))

## Relationship to `PathGlobs`

We also considered changing `PathGlobs` to eagerly parse, and then defining `PathGlobs.matches()`, rather than introducing the new type `FilespecsMatcher`.

I believe `FilespecsMatcher` is a clearer API: it is only used for in-memory checks if globs match paths, whereas `PathGlobs` is used for on-disk. 

Notably, the Rust code only overlaps in how the exclude patterns are ignored. So, making this split allows us to simplify `PathGlobs`.

A downside of having separate types is it's harder to share the duplicate parsing of the exclude patterns. This seems fine, given that the include patterns are separate code paths.

(See #16564 for improving `PathGlobs` to also eagerly parse.)
  • Loading branch information
Eric-Arellano authored Aug 18, 2022
1 parent 8f39837 commit d55493f
Show file tree
Hide file tree
Showing 16 changed files with 174 additions and 137 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,15 @@ your tool.
However, keep the `UnionRule` the same, i.e. with the first argument still
`GenerateToolLockfileSentinel`.

### `matches_filespec()` replaced by `FilespecMatcher`

Instead, use `FilespecMatcher(includes=[], excludes=[]).matches(paths: Sequence[str])` from
`pants.source.filespec`.

The functionality is the same, but can have better performance because we don't need to parse the
same globs each time `.matches()` is called. When possible, reuse the same `FilespecMatcher` object
to get these performance benefits.

2.13
----

Expand Down
8 changes: 3 additions & 5 deletions src/python/pants/backend/java/goals/tailor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from pants.engine.rules import collect_rules, rule
from pants.engine.target import Target
from pants.engine.unions import UnionRule
from pants.source.filespec import Filespec, matches_filespec
from pants.source.filespec import FilespecMatcher
from pants.util.logging import LogLevel


Expand All @@ -36,10 +36,8 @@ class PutativeJavaTargetsRequest(PutativeTargetsRequest):

def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
"""Returns a dict of target type -> files that belong to targets of that type."""
tests_filespec = Filespec(includes=list(JavaTestsGeneratorSourcesField.default))
test_filenames = set(
matches_filespec(tests_filespec, paths=[os.path.basename(path) for path in paths])
)
tests_filespec_matcher = FilespecMatcher(JavaTestsGeneratorSourcesField.default, ())
test_filenames = set(tests_filespec_matcher.matches([os.path.basename(path) for path in paths]))
test_files = {path for path in paths if os.path.basename(path) in test_filenames}
sources_files = set(paths) - test_files
return {JunitTestsGeneratorTarget: test_files, JavaSourcesGeneratorTarget: sources_files}
Expand Down
6 changes: 3 additions & 3 deletions src/python/pants/backend/kotlin/goals/tailor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from pants.engine.rules import collect_rules, rule
from pants.engine.target import Target
from pants.engine.unions import UnionRule
from pants.source.filespec import Filespec, matches_filespec
from pants.source.filespec import FilespecMatcher
from pants.util.logging import LogLevel


Expand All @@ -36,12 +36,12 @@ class PutativeKotlinTargetsRequest(PutativeTargetsRequest):

def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
"""Returns a dict of target type -> files that belong to targets of that type."""
junit_filespec = Filespec(includes=list(KotlinJunitTestsGeneratorSourcesField.default))
junit_filespec_matcher = FilespecMatcher(KotlinJunitTestsGeneratorSourcesField.default, ())
junit_files = {
path
for path in paths
if os.path.basename(path)
in set(matches_filespec(junit_filespec, paths=[os.path.basename(path) for path in paths]))
in set(junit_filespec_matcher.matches([os.path.basename(path) for path in paths]))
}
sources_files = set(paths) - junit_files
return {
Expand Down
10 changes: 5 additions & 5 deletions src/python/pants/backend/python/goals/tailor.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from pants.engine.rules import collect_rules, rule, rule_helper
from pants.engine.target import Target, UnexpandedTargets
from pants.engine.unions import UnionRule
from pants.source.filespec import Filespec, matches_filespec
from pants.source.filespec import FilespecMatcher
from pants.source.source_root import SourceRootsRequest, SourceRootsResult
from pants.util.logging import LogLevel

Expand All @@ -53,13 +53,13 @@ class PutativePythonTargetsRequest(PutativeTargetsRequest):

def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
"""Returns a dict of target type -> files that belong to targets of that type."""
tests_filespec = Filespec(includes=list(PythonTestsGeneratingSourcesField.default))
test_utils_filespec = Filespec(includes=list(PythonTestUtilsGeneratingSourcesField.default))
tests_filespec_matcher = FilespecMatcher(PythonTestsGeneratingSourcesField.default, ())
test_utils_filespec_matcher = FilespecMatcher(PythonTestUtilsGeneratingSourcesField.default, ())

path_to_file_name = {path: os.path.basename(path) for path in paths}
test_file_names = set(matches_filespec(tests_filespec, paths=path_to_file_name.values()))
test_file_names = set(tests_filespec_matcher.matches(list(path_to_file_name.values())))
test_util_file_names = set(
matches_filespec(test_utils_filespec, paths=path_to_file_name.values())
test_utils_filespec_matcher.matches(list(path_to_file_name.values()))
)

test_files = {
Expand Down
12 changes: 5 additions & 7 deletions src/python/pants/backend/scala/goals/tailor.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from pants.engine.rules import collect_rules, rule
from pants.engine.target import Target
from pants.engine.unions import UnionRule
from pants.source.filespec import Filespec, matches_filespec
from pants.source.filespec import FilespecMatcher
from pants.util.logging import LogLevel


Expand All @@ -38,21 +38,19 @@ class PutativeScalaTargetsRequest(PutativeTargetsRequest):

def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
"""Returns a dict of target type -> files that belong to targets of that type."""
scalatest_filespec = Filespec(includes=list(ScalatestTestsGeneratorSourcesField.default))
junit_filespec = Filespec(includes=list(ScalaJunitTestsGeneratorSourcesField.default))
scalatest_filespec_matcher = FilespecMatcher(ScalatestTestsGeneratorSourcesField.default, ())
junit_filespec_matcher = FilespecMatcher(ScalaJunitTestsGeneratorSourcesField.default, ())
scalatest_files = {
path
for path in paths
if os.path.basename(path)
in set(
matches_filespec(scalatest_filespec, paths=[os.path.basename(path) for path in paths])
)
in set(scalatest_filespec_matcher.matches([os.path.basename(path) for path in paths]))
}
junit_files = {
path
for path in paths
if os.path.basename(path)
in set(matches_filespec(junit_filespec, paths=[os.path.basename(path) for path in paths]))
in set(junit_filespec_matcher.matches([os.path.basename(path) for path in paths]))
}
sources_files = set(paths) - scalatest_files - junit_files
return {
Expand Down
8 changes: 3 additions & 5 deletions src/python/pants/backend/shell/tailor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from pants.engine.rules import collect_rules, rule
from pants.engine.target import Target
from pants.engine.unions import UnionRule
from pants.source.filespec import Filespec, matches_filespec
from pants.source.filespec import FilespecMatcher
from pants.util.logging import LogLevel


Expand All @@ -36,10 +36,8 @@ class PutativeShellTargetsRequest(PutativeTargetsRequest):

def classify_source_files(paths: Iterable[str]) -> dict[type[Target], set[str]]:
"""Returns a dict of target type -> files that belong to targets of that type."""
tests_filespec = Filespec(includes=list(Shunit2TestsGeneratorSourcesField.default))
test_filenames = set(
matches_filespec(tests_filespec, paths=[os.path.basename(path) for path in paths])
)
tests_filespec_matcher = FilespecMatcher(Shunit2TestsGeneratorSourcesField.default, ())
test_filenames = set(tests_filespec_matcher.matches([os.path.basename(path) for path in paths]))
test_files = {path for path in paths if os.path.basename(path) in test_filenames}
sources_files = set(paths) - test_files
return {Shunit2TestsGeneratorTarget: test_files, ShellSourcesGeneratorTarget: sources_files}
Expand Down
17 changes: 9 additions & 8 deletions src/python/pants/core/goals/tailor.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from pants.engine.unions import UnionMembership, union
from pants.option.global_options import GlobalOptions
from pants.option.option_types import BoolOption, DictOption, StrListOption, StrOption
from pants.source.filespec import Filespec, matches_filespec
from pants.source.filespec import FilespecMatcher
from pants.util.docutil import bin_name, doc_url
from pants.util.frozendict import FrozenDict
from pants.util.logging import LogLevel
Expand Down Expand Up @@ -362,8 +362,8 @@ def alias_for(self, standard_type: str) -> str | None:
def validate_build_file_name(self, build_file_patterns: tuple[str, ...]) -> None:
"""Check that the specified BUILD file name works with the repository's BUILD file
patterns."""
filespec = Filespec(includes=list(build_file_patterns))
if not bool(matches_filespec(filespec, paths=[self.build_file_name])):
filespec_matcher = FilespecMatcher(build_file_patterns, ())
if not bool(filespec_matcher.matches([self.build_file_name])):
raise ValueError(
f"The option `[{self.options_scope}].build_file_name` is set to "
f"`{self.build_file_name}`, which is not compatible with "
Expand All @@ -375,13 +375,14 @@ def validate_build_file_name(self, build_file_patterns: tuple[str, ...]) -> None
def filter_by_ignores(
self, putative_targets: Iterable[PutativeTarget], build_file_ignores: tuple[str, ...]
) -> Iterator[PutativeTarget]:
ignore_paths_filespec = Filespec(includes=[*self.ignore_paths, *build_file_ignores])
ignore_paths_filespec_matcher = FilespecMatcher(
(*self.ignore_paths, *build_file_ignores), ()
)
for ptgt in putative_targets:
is_ignored_file = bool(
matches_filespec(
ignore_paths_filespec,
paths=[os.path.join(ptgt.path, self.build_file_name)],
)
ignore_paths_filespec_matcher.matches(
[os.path.join(ptgt.path, self.build_file_name)]
),
)
if is_ignored_file:
continue
Expand Down
8 changes: 3 additions & 5 deletions src/python/pants/core/goals/tailor_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from pants.engine.rules import Get, QueryRule, rule
from pants.engine.target import MultipleSourcesField, Target
from pants.engine.unions import UnionRule
from pants.source.filespec import Filespec, matches_filespec
from pants.source.filespec import FilespecMatcher
from pants.testutil.option_util import create_goal_subsystem
from pants.testutil.pytest_util import no_exception
from pants.testutil.rule_runner import RuleRunner
Expand Down Expand Up @@ -79,11 +79,9 @@ async def find_fortran_targets(
all_fortran_files = await Get(Paths, PathGlobs, req.path_globs("*.f90"))
unowned_shell_files = set(all_fortran_files.files) - set(all_owned_sources)

tests_filespec = Filespec(includes=list(FortranTestsSources.default))
tests_filespec_matcher = FilespecMatcher(FortranTestsSources.default, ())
test_filenames = set(
matches_filespec(
tests_filespec, paths=[os.path.basename(path) for path in unowned_shell_files]
)
tests_filespec_matcher.matches([os.path.basename(path) for path in unowned_shell_files])
)
test_files = {path for path in unowned_shell_files if os.path.basename(path) in test_filenames}
sources_files = set(unowned_shell_files) - test_files
Expand Down
5 changes: 2 additions & 3 deletions src/python/pants/engine/internals/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@
OwnersNotFoundBehavior,
UnmatchedBuildFileGlobs,
)
from pants.source.filespec import matches_filespec
from pants.util.docutil import bin_name, doc_url
from pants.util.frozendict import FrozenDict
from pants.util.logging import LogLevel
Expand Down Expand Up @@ -786,7 +785,7 @@ def create_live_and_deleted_gets(

for candidate_tgt, bfa in zip(candidate_tgts, build_file_addresses):
matching_files = set(
matches_filespec(candidate_tgt.get(SourcesField).filespec, paths=sources_set)
candidate_tgt.get(SourcesField).filespec_matcher.matches(list(sources_set))
)
# Also consider secondary ownership, meaning it's not a `SourcesField` field with
# primary ownership, but the target still should match the file. We can't use
Expand All @@ -798,7 +797,7 @@ def create_live_and_deleted_gets(
)
for secondary_owner_field in secondary_owner_fields:
matching_files.update(
matches_filespec(secondary_owner_field.filespec, paths=sources_set)
*secondary_owner_field.filespec_matcher.matches(list(sources_set))
)
if not matching_files and not (
owners_request.match_if_owning_build_file_included_in_sources
Expand Down
12 changes: 7 additions & 5 deletions src/python/pants/engine/internals/native_engine.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ from typing import Any, Generic, Iterable, Sequence, TextIO, Tuple, TypeVar, ove

from typing_extensions import Protocol

from pants.engine.fs import PathGlobs
from pants.engine.internals.scheduler import Workunit, _PathGlobsAndRootCollection
from pants.engine.internals.session import SessionValues
from pants.engine.process import InteractiveProcessResult
Expand Down Expand Up @@ -149,16 +148,19 @@ class RemovePrefix:
def __hash__(self) -> int: ...
def __repr__(self) -> str: ...

class FilespecMatcher:
def __init__(self, includes: Sequence[str], excludes: Sequence[str]) -> None: ...
def __eq__(self, other: FilespecMatcher | Any) -> bool: ...
def __hash__(self) -> int: ...
def __repr__(self) -> str: ...
def matches(self, paths: Sequence[str]) -> list[str]: ...

EMPTY_DIGEST: Digest
EMPTY_FILE_DIGEST: FileDigest
EMPTY_SNAPSHOT: Snapshot

def default_cache_path() -> str: ...

# TODO: Really, `paths` should be `Sequence[str]`. Fix and update call sites so that we don't
# cast to `tuple()` when not necessary.
def match_path_globs(path_globs: PathGlobs, paths: tuple[str, ...]) -> str: ...

# ------------------------------------------------------------------------------
# Workunits
# ------------------------------------------------------------------------------
Expand Down
16 changes: 14 additions & 2 deletions src/python/pants/engine/target.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
)
from pants.engine.unions import UnionMembership, UnionRule, union
from pants.option.global_options import UnmatchedBuildFileGlobs
from pants.source.filespec import Filespec
from pants.source.filespec import Filespec, FilespecMatcher
from pants.util.collections import ensure_list, ensure_str_list
from pants.util.dirutil import fast_relpath
from pants.util.docutil import bin_name, doc_url
Expand Down Expand Up @@ -2071,7 +2071,7 @@ def path_globs(self, unmatched_build_file_globs: UnmatchedBuildFileGlobs) -> Pat
),
)

@property
@memoized_property
def filespec(self) -> Filespec:
"""The original globs, returned in the Filespec dict format.
Expand All @@ -2089,6 +2089,12 @@ def filespec(self) -> Filespec:
result["excludes"] = excludes
return result

@memoized_property
def filespec_matcher(self) -> FilespecMatcher:
# Note: memoized because parsing the globs is expensive:
# https://github.com/pantsbuild/pants/issues/16122
return FilespecMatcher(self.filespec["includes"], self.filespec.get("excludes", []))


class MultipleSourcesField(SourcesField, StringSequenceField):
"""The `sources: list[str]` field.
Expand Down Expand Up @@ -2408,6 +2414,12 @@ def filespec(self) -> Filespec:
the build root.
"""

@memoized_property
def filespec_matcher(self) -> FilespecMatcher:
# Note: memoized because parsing the globs is expensive:
# https://github.com/pantsbuild/pants/issues/16122
return FilespecMatcher(self.filespec["includes"], self.filespec.get("excludes", []))


def targets_with_sources_types(
sources_types: Iterable[type[SourcesField]],
Expand Down
16 changes: 7 additions & 9 deletions src/python/pants/source/filespec.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@

from typing_extensions import TypedDict

from pants.engine.fs import PathGlobs
from pants.engine.internals import native_engine
from pants.base.deprecated import deprecated
from pants.engine.internals.native_engine import (
FilespecMatcher as FilespecMatcher, # explicit re-export
)


class _IncludesDict(TypedDict, total=True):
Expand All @@ -26,11 +28,7 @@ class Filespec(_IncludesDict, total=False):
excludes: list[str]


@deprecated("2.15.0.dev0", "Use `FilespecMatcher().matches()` instead", start_version="2.14.0.dev5")
def matches_filespec(spec: Filespec, *, paths: Iterable[str]) -> tuple[str, ...]:
include_patterns = spec["includes"]
exclude_patterns = [f"!{e}" for e in spec.get("excludes", [])]
return tuple(
native_engine.match_path_globs(
PathGlobs((*include_patterns, *exclude_patterns)), tuple(paths)
)
)
matcher = FilespecMatcher(spec["includes"], spec.get("excludes", []))
return tuple(matcher.matches(tuple(paths)))
6 changes: 4 additions & 2 deletions src/python/pants/source/filespec_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pytest

from pants.engine.fs import PathGlobs, Snapshot
from pants.source.filespec import matches_filespec
from pants.source.filespec import FilespecMatcher, matches_filespec
from pants.testutil.rule_runner import RuleRunner


Expand All @@ -19,7 +19,9 @@ def assert_rule_match(
rule_runner: RuleRunner, glob: str, paths: Tuple[str, ...], *, should_match: bool
) -> None:
# Confirm in-memory behavior.
matched_filespec = matches_filespec({"includes": [glob]}, paths=paths)
matched_filespec = tuple(FilespecMatcher([glob], ()).matches(paths))
deprecated_matched_filespec = matches_filespec({"includes": [glob]}, paths=paths)
assert matched_filespec == deprecated_matched_filespec
if should_match:
assert matched_filespec == paths
else:
Expand Down
Loading

0 comments on commit d55493f

Please sign in to comment.