Add "workspace invalidation" sources support for shell / adhoc backen…

…ds (pantsbuild#21051) Add support for "workspace invalidation" sources for the `adhoc_tool` and `shell_command` target types. This support allows those targets to depend on the content of files in the repository without materializing those sources in the execution sandbox. This support is intended to be used in conjunction with the workspace environment where execution does not take place in a sandbox. The new field `workspace_invalidation_sources` on both target types is a list of globs into the repository. The digest of the referenced files will be inserted as an environment variable in the process executed (which makes it part of the process's cache key).
tdyas · Jun 17, 2024 · b2db430 · b2db430
1 parent 8114a0b
commit b2db430
Show file tree

Hide file tree

Showing 8 changed files with 161 additions and 8 deletions.
diff --git a/docs/docs/using-pants/environments.mdx b/docs/docs/using-pants/environments.mdx
@@ -257,6 +257,8 @@ Thus, Pants puts that burden on you, the Pants user, to ensure a process output
 
 If a process isn't reproducible, re-running a build from the same source code could fail unexpectedly, or give different output to an earlier build.
 
+You should use the `workspace_invalidation_sources` field available on the `adhoc_tool` and `shell_command` target types to inform Pants of what files should cause re-execution of the target's process if they change.
+
 :::
 
 The special environment name `__local_workspace__` can be used to select a matching `experimental_workspace_environment` based on its `compatible_platforms` attribute.

diff --git a/src/python/pants/backend/adhoc/adhoc_tool.py b/src/python/pants/backend/adhoc/adhoc_tool.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import logging
+import os
 
 from pants.backend.adhoc.target_types import (
     AdhocToolArgumentsField,
@@ -19,7 +20,9 @@
     AdhocToolStderrFilenameField,
     AdhocToolStdoutFilenameField,
     AdhocToolWorkdirField,
+    AdhocToolWorkspaceInvalidationSourcesField,
 )
+from pants.base.glob_match_error_behavior import GlobMatchErrorBehavior
 from pants.core.target_types import FileSourceField
 from pants.core.util_rules.adhoc_process_support import (
     AdhocProcessRequest,
@@ -30,7 +33,7 @@
 from pants.core.util_rules.adhoc_process_support import rules as adhoc_process_support_rules
 from pants.core.util_rules.environments import EnvironmentNameRequest, EnvironmentTarget
 from pants.engine.environment import EnvironmentName
-from pants.engine.fs import Digest, Snapshot
+from pants.engine.fs import Digest, PathGlobs, Snapshot
 from pants.engine.rules import Get, collect_rules, rule
 from pants.engine.target import GeneratedSources, GenerateSourcesRequest
 from pants.engine.unions import UnionRule
@@ -81,6 +84,18 @@ async def run_in_sandbox_request(
 
     cache_scope = env_target.default_cache_scope
 
+    workspace_invalidation_globs: PathGlobs | None = None
+    workspace_invalidation_sources = (
+        target.get(AdhocToolWorkspaceInvalidationSourcesField).value or ()
+    )
+    if workspace_invalidation_sources:
+        spec_path = target.address.spec_path
+        workspace_invalidation_globs = PathGlobs(
+            globs=(os.path.join(spec_path, glob) for glob in workspace_invalidation_sources),
+            glob_match_error_behavior=GlobMatchErrorBehavior.error,
+            description_of_origin=f"`{AdhocToolWorkspaceInvalidationSourcesField.alias}` for `adhoc_tool` target at `{target.address}`",
+        )
+
     process_request = AdhocProcessRequest(
         description=description,
         address=target.address,
@@ -99,6 +114,7 @@ async def run_in_sandbox_request(
         log_output=target[AdhocToolLogOutputField].value,
         capture_stderr_file=target[AdhocToolStderrFilenameField].value,
         capture_stdout_file=target[AdhocToolStdoutFilenameField].value,
+        workspace_invalidation_globs=workspace_invalidation_globs,
         cache_scope=cache_scope,
     )
 

diff --git a/src/python/pants/backend/adhoc/adhoc_tool_test.py b/src/python/pants/backend/adhoc/adhoc_tool_test.py
@@ -56,19 +56,27 @@ def rule_runner() -> PythonRuleRunner:
             PythonSourceTarget,
             LocalWorkspaceEnvironmentTarget,
         ],
+        isolated_local_store=True,
     )
     rule_runner.set_options([], env_inherit={"PATH"})
     return rule_runner
 
 
+def execute_adhoc_tool(
+    rule_runner: PythonRuleRunner,
+    address: Address,
+) -> GeneratedSources:
+    generator_type: type[GenerateSourcesRequest] = GenerateFilesFromAdhocToolRequest
+    target = rule_runner.get_target(address)
+    return rule_runner.request(GeneratedSources, [generator_type(EMPTY_SNAPSHOT, target)])
+
+
 def assert_adhoc_tool_result(
     rule_runner: PythonRuleRunner,
     address: Address,
     expected_contents: dict[str, str],
 ) -> None:
-    generator_type: type[GenerateSourcesRequest] = GenerateFilesFromAdhocToolRequest
-    target = rule_runner.get_target(address)
-    result = rule_runner.request(GeneratedSources, [generator_type(EMPTY_SNAPSHOT, target)])
+    result = execute_adhoc_tool(rule_runner, address)
     assert result.snapshot.files == tuple(expected_contents)
     contents = rule_runner.request(DigestContents, [result.snapshot.digest])
     for fc in contents:
@@ -334,3 +342,35 @@ def test_adhoc_tool_with_workspace_execution(rule_runner: PythonRuleRunner) -> N
     workspace_output_path = Path(rule_runner.build_root).joinpath("foo.txt")
     assert workspace_output_path.exists()
     assert workspace_output_path.read_text().strip() == "workspace"
+
+
+def test_adhoc_tool_workspace_invalidation_sources(rule_runner: PythonRuleRunner) -> None:
+    rule_runner.write_files(
+        {
+            "src/BUILD": dedent(
+                """\
+            system_binary(name="bash", binary_name="bash")
+            adhoc_tool(
+              name="cmd",
+              runnable=":bash",
+              # Use a random value so we can detect when re-execution occurs.
+              args=["-c", "echo $RANDOM > out.log"],
+              output_files=["out.log"],
+              workspace_invalidation_sources=['a-file'],
+            )
+            """
+            ),
+            "src/a-file": "",
+        }
+    )
+    address = Address("src", target_name="cmd")
+
+    # Re-executing the initial execution should be cached.
+    result1 = execute_adhoc_tool(rule_runner, address)
+    result2 = execute_adhoc_tool(rule_runner, address)
+    assert result1.snapshot == result2.snapshot
+
+    # Update the hash-only source file's content. The adhoc_tool should be re-executed now.
+    (Path(rule_runner.build_root) / "src" / "a-file").write_text("xyzzy")
+    result3 = execute_adhoc_tool(rule_runner, address)
+    assert result1.snapshot != result3.snapshot
diff --git a/src/python/pants/backend/adhoc/target_types.py b/src/python/pants/backend/adhoc/target_types.py
@@ -253,6 +253,27 @@ class AdhocToolOutputRootDirField(StringField):
     )
 
 
+class AdhocToolWorkspaceInvalidationSourcesField(StringSequenceField):
+    alias: ClassVar[str] = "workspace_invalidation_sources"
+    help = help_text(
+        """
+        Path globs for source files on which this target depends and for which any changes should cause
+        this target's process to be re-executed. Unlike ordinary dependencies, the files referenced by
+        `workspace_invalidation_sources` globs are not materialized into any execution sandbox
+        and are referenced solely for cache invalidation purposes.
+
+        Note: This field is intended to work with the in-workspace execution environment configured by
+        the `experimental_workspace_environment` target type. It should only be used when the configured
+        environment for a target is an `experimental_workspace_environment`.
+
+        Implementation: Pants computes a digest of all of the files referenced by the provided globs
+        and injects that digest into the process as an environment variable. Since environment variables
+        are part of the cache key for a process's execution, any changes to the referenced files will
+        change the digest and thus force re-execution of the process.
+        """
+    )
+
+
 class AdhocToolTarget(Target):
     alias: ClassVar[str] = "adhoc_tool"
     core_fields = (
@@ -272,6 +293,7 @@ class AdhocToolTarget(Target):
         AdhocToolOutputRootDirField,
         AdhocToolStdoutFilenameField,
         AdhocToolStderrFilenameField,
+        AdhocToolWorkspaceInvalidationSourcesField,
         EnvironmentField,
     )
     help = help_text(

diff --git a/src/python/pants/backend/shell/target_types.py b/src/python/pants/backend/shell/target_types.py
@@ -19,6 +19,7 @@
     AdhocToolRunnableDependenciesField,
     AdhocToolTimeoutField,
     AdhocToolWorkdirField,
+    AdhocToolWorkspaceInvalidationSourcesField,
 )
 from pants.backend.shell.subsystems.shell_setup import ShellSetup
 from pants.core.goals.test import RuntimePackageDependenciesField, TestTimeoutField
@@ -379,6 +380,10 @@ class ShellCommandNamedCachesField(AdhocToolNamedCachesField):
     pass
 
 
+class ShellCommandWorkspaceInvalidationSourcesField(AdhocToolWorkspaceInvalidationSourcesField):
+    pass
+
+
 class SkipShellCommandTestsField(BoolField):
     alias = "skip_tests"
     default = False
@@ -403,6 +408,7 @@ class ShellCommandTarget(Target):
         ShellCommandWorkdirField,
         ShellCommandNamedCachesField,
         ShellCommandOutputRootDirField,
+        ShellCommandWorkspaceInvalidationSourcesField,
         EnvironmentField,
     )
     help = help_text(

diff --git a/src/python/pants/backend/shell/util_rules/shell_command.py b/src/python/pants/backend/shell/util_rules/shell_command.py
@@ -4,6 +4,7 @@
 from __future__ import annotations
 
 import logging
+import os
 import shlex
 from dataclasses import dataclass
 
@@ -24,8 +25,10 @@
     ShellCommandTimeoutField,
     ShellCommandToolsField,
     ShellCommandWorkdirField,
+    ShellCommandWorkspaceInvalidationSourcesField,
 )
 from pants.backend.shell.util_rules.builtin import BASH_BUILTIN_COMMANDS
+from pants.base.glob_match_error_behavior import GlobMatchErrorBehavior
 from pants.core.goals.run import RunFieldSet, RunInSandboxBehavior, RunRequest
 from pants.core.target_types import FileSourceField
 from pants.core.util_rules.adhoc_process_support import (
@@ -41,7 +44,7 @@
 from pants.core.util_rules.environments import EnvironmentNameRequest, EnvironmentTarget
 from pants.core.util_rules.system_binaries import BashBinary, BinaryShims, BinaryShimsRequest
 from pants.engine.environment import EnvironmentName
-from pants.engine.fs import Digest, Snapshot
+from pants.engine.fs import Digest, PathGlobs, Snapshot
 from pants.engine.internals.native_engine import EMPTY_DIGEST
 from pants.engine.process import Process
 from pants.engine.rules import Get, collect_rules, rule
@@ -149,6 +152,18 @@ async def _prepare_process_request_from_target(
 
     cache_scope = env_target.default_cache_scope
 
+    workspace_invalidation_globs: PathGlobs | None = None
+    workspace_invalidation_sources = (
+        shell_command.get(ShellCommandWorkspaceInvalidationSourcesField).value or ()
+    )
+    if workspace_invalidation_sources:
+        spec_path = shell_command.address.spec_path
+        workspace_invalidation_globs = PathGlobs(
+            globs=(os.path.join(spec_path, glob) for glob in workspace_invalidation_sources),
+            glob_match_error_behavior=GlobMatchErrorBehavior.error,
+            description_of_origin=f"`{ShellCommandWorkspaceInvalidationSourcesField.alias}` for `shell_command` target at `{shell_command.address}`",
+        )
+
     return AdhocProcessRequest(
         description=description,
         address=shell_command.address,
@@ -167,6 +182,7 @@ async def _prepare_process_request_from_target(
         log_output=shell_command[ShellCommandLogOutputField].value,
         capture_stdout_file=None,
         capture_stderr_file=None,
+        workspace_invalidation_globs=workspace_invalidation_globs,
         cache_scope=cache_scope,
     )
 

diff --git a/src/python/pants/backend/shell/util_rules/shell_command_test.py b/src/python/pants/backend/shell/util_rules/shell_command_test.py
@@ -5,6 +5,7 @@
 
 import logging
 import shlex
+import time
 from pathlib import Path
 from textwrap import dedent
 
@@ -69,19 +70,27 @@ def rule_runner() -> RuleRunner:
             FilesGeneratorTarget,
             LocalWorkspaceEnvironmentTarget,
         ],
+        isolated_local_store=True,
     )
     rule_runner.set_options([], env_inherit={"PATH"})
     return rule_runner
 
 
+def execute_shell_command(
+    rule_runner: RuleRunner,
+    address: Address,
+) -> GeneratedSources:
+    generator_type: type[GenerateSourcesRequest] = GenerateFilesFromShellCommandRequest
+    target = rule_runner.get_target(address)
+    return rule_runner.request(GeneratedSources, [generator_type(EMPTY_SNAPSHOT, target)])
+
+
 def assert_shell_command_result(
     rule_runner: RuleRunner,
     address: Address,
     expected_contents: dict[str, str],
 ) -> None:
-    generator_type: type[GenerateSourcesRequest] = GenerateFilesFromShellCommandRequest
-    target = rule_runner.get_target(address)
-    result = rule_runner.request(GeneratedSources, [generator_type(EMPTY_SNAPSHOT, target)])
+    result = execute_shell_command(rule_runner, address)
     assert result.snapshot.files == tuple(expected_contents)
     contents = rule_runner.request(DigestContents, [result.snapshot.digest])
     for fc in contents:
@@ -871,3 +880,34 @@ def test_shell_command_with_workspace_execution(rule_runner: RuleRunner) -> None
     workspace_output_path = Path(rule_runner.build_root).joinpath("foo.txt")
     assert workspace_output_path.exists()
     assert workspace_output_path.read_text().strip() == "workspace"
+
+
+def test_shell_command_workspace_invalidation_sources(rule_runner: RuleRunner) -> None:
+    rule_runner.write_files(
+        {
+            "src/BUILD": dedent(
+                """\
+            shell_command(
+              name="cmd",
+              # Use a random value so we can detect when re-execution occurs.
+              command='echo $RANDOM > out.log',
+              output_files=["out.log"],
+              workspace_invalidation_sources=['a-file'],
+            )
+            """
+            ),
+            "src/a-file": "",
+        }
+    )
+    address = Address("src", target_name="cmd")
+
+    # Re-executing the initial execution should be cached.
+    result1 = execute_shell_command(rule_runner, address)
+    result2 = execute_shell_command(rule_runner, address)
+    assert result1.snapshot == result2.snapshot
+
+    # Update the hash-only source file's content. The shell_command should be re-executed now.
+    (Path(rule_runner.build_root) / "src" / "a-file").write_text("xyzzy")
+    time.sleep(0.100)
+    result3 = execute_shell_command(rule_runner, address)
+    assert result1.snapshot != result3.snapshot
diff --git a/src/python/pants/core/util_rules/adhoc_process_support.py b/src/python/pants/core/util_rules/adhoc_process_support.py
@@ -27,6 +27,7 @@
     Directory,
     FileContent,
     MergeDigests,
+    PathGlobs,
     Snapshot,
 )
 from pants.engine.internals.native_engine import AddressInput, RemovePrefix
@@ -71,6 +72,7 @@ class AdhocProcessRequest:
     log_output: bool
     capture_stdout_file: str | None
     capture_stderr_file: str | None
+    workspace_invalidation_globs: PathGlobs | None
     cache_scope: ProcessCacheScope | None = None
 
 
@@ -567,6 +569,15 @@ async def prepare_adhoc_process(
     if supplied_env_vars:
         command_env.update(supplied_env_vars)
 
+    # Compute the digest for any workspace invalidation sources and put the digest into the environment as a dummy variable
+    # so that the process produced by this rule will be invalidated if any of the referenced files change.
+    if request.workspace_invalidation_globs is not None:
+        workspace_invalidation_digest = await Get(
+            Digest, PathGlobs, request.workspace_invalidation_globs
+        )
+        digest_str = f"{workspace_invalidation_digest.fingerprint}-{workspace_invalidation_digest.serialized_bytes_length}"
+        command_env["__PANTS_WORKSPACE_INVALIDATION_SOURCES_DIGEST"] = digest_str
+
     input_snapshot = await Get(Snapshot, Digest, request.input_digest)
 
     if not working_directory or working_directory in input_snapshot.dirs:
-Original file line number
+Diff line change
@@ Expand Up @@
     If a process isn't reproducible, re-running a build from the same source code could fail unexpectedly, or give different output to an earlier build.
+    You should use the `workspace_invalidation_sources` field available on the `adhoc_tool` and `shell_command` target types to inform Pants of what files should cause re-execution of the target's process if they change.
     :::
     The special environment name `__local_workspace__` can be used to select a matching `experimental_workspace_environment` based on its `compatible_platforms` attribute.
@@ Expand Down @@