Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow plugins to add custom schema/authority URL handler rules #17898

Merged
merged 7 commits into from
Jan 4, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/python/pants/backend/url_handlers/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright 2022 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

python_sources()
Empty file.
4 changes: 4 additions & 0 deletions src/python/pants/backend/url_handlers/s3/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright 2022 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

python_sources()
Empty file.
94 changes: 94 additions & 0 deletions src/python/pants/backend/url_handlers/s3/register.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Copyright 2022 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).
import logging
from dataclasses import dataclass
from types import SimpleNamespace
from urllib.parse import urlparse

from pants.engine.download_file import URLDownloadHandler
from pants.engine.fs import Digest, NativeDownloadFile
from pants.engine.internals.selectors import Get
from pants.engine.rules import collect_rules, rule
from pants.engine.unions import UnionRule
from pants.util.strutil import softwrap

CONTENT_TYPE = "binary/octet-stream"


logger = logging.getLogger(__name__)


class DownloadS3URLHandler(URLDownloadHandler):
    """Handles s3:// URLs by signing the request with AWS credentials.

    NB: The union base declares the ClassVar `match_scheme` (singular "match"). The previous
    `matches_scheme` name was silently ignored, which left both match ClassVars as `None` —
    a double wildcard that routed *every* URL to this handler.
    """

    match_scheme = "s3"


@dataclass(frozen=True)
class AWSCredentials:
    """AWS credentials as resolved by botocore's credential-provider chain.

    Held as plain strings so the value is hashable/memoizable by the engine.
    """

    # The AWS access key ID (e.g. from AWS_ACCESS_KEY_ID or ~/.aws/credentials).
    access_key_id: str
    # The secret access key paired with `access_key_id`.
    secret_access_key: str


@rule
async def access_aws_credentials() -> AWSCredentials:
    """Load AWS credentials via botocore's standard credential-provider chain.

    `botocore` is not a Pants dependency; it must be supplied by the user through
    `[GLOBAL].plugins`. A missing import is surfaced with guidance before re-raising.
    """
    try:
        import botocore.credentials  # pants: no-infer-dep
        import botocore.session  # pants: no-infer-dep
    except ImportError:
        logger.warning(
            softwrap(
                """
                In order to resolve s3:// URLs, Pants must load AWS credentials. To do so, `botocore`
                must be importable in Pants' environment.

                To do that add an entry to `[GLOBAL].plugins` of a pip-resolvable package to download from PyPI.
                (E.g. `botocore == 1.29.39`). Note that the `botocore` package from PyPI at the time
                of writing is >70MB, so an alternate package providing the `botocore` modules may be
                advisable.
                """
            )
        )
        raise

    resolver = botocore.credentials.create_credential_resolver(botocore.session.get_session())
    loaded = resolver.load_credentials()
    return AWSCredentials(access_key_id=loaded.access_key, secret_access_key=loaded.secret_key)


@rule
async def download_s3_file(
    request: DownloadS3URLHandler, aws_credentials: AWSCredentials
) -> Digest:
    """Download an s3:// URL by signing the request and delegating to the native downloader.

    The s3:// URL is rewritten to the virtual-hosted-style HTTPS form
    (`https://<bucket>.s3.amazonaws.com/<key>`) and authenticated via headers.
    """
    import botocore.auth  # pants: no-infer-dep
    import botocore.credentials  # pants: no-infer-dep

    boto_creds = botocore.credentials.Credentials(
        aws_credentials.access_key_id, aws_credentials.secret_access_key
    )
    # NOTE(review): SigV3 is an unusual signing scheme for S3 (SigV4 is the current standard) —
    # confirm this actually authenticates against real buckets.
    auth = botocore.auth.SigV3Auth(boto_creds)
    # Only the `.headers` attribute is consumed below, so a SimpleNamespace stands in for a
    # full botocore request object.
    headers_container = SimpleNamespace(headers={})
    auth.add_auth(headers_container)

    parsed_url = urlparse(request.url)
    bucket = parsed_url.netloc
    # `path` keeps its leading slash, which the f-string URL below relies on.
    key = parsed_url.path

    digest = await Get(
        Digest,
        NativeDownloadFile(
            url=f"https://{bucket}.s3.amazonaws.com{key}",
            expected_digest=request.expected_digest,
            auth_headers=headers_container.headers,
        ),
    )
    return digest


def rules():
    """Register the s3:// handler as a `URLDownloadHandler` union member plus this module's rules."""
    registrations = [UnionRule(URLDownloadHandler, DownloadS3URLHandler)]
    registrations.extend(collect_rules())
    return registrations
85 changes: 85 additions & 0 deletions src/python/pants/engine/download_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Copyright 2022 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

from dataclasses import dataclass
from typing import ClassVar, Optional
from urllib.parse import urlparse

from pants.engine.fs import Digest, DownloadFile, NativeDownloadFile
from pants.engine.internals.native_engine import FileDigest
from pants.engine.internals.selectors import Get
from pants.engine.rules import collect_rules, rule
from pants.engine.unions import UnionMembership, union


@union
@dataclass(frozen=True)
class URLDownloadHandler:
    """Union base for custom URL handler.

    To register a custom URL handler:
    - Subclass this class and declare one or both of the ClassVars.
    - Declare a rule that takes in your class type and returns a `Digest`.
    - Register your union member in your `rules()`: `UnionRule(URLDownloadHandler, YourClass)`.

    Example:

        class S3DownloadHandler(URLDownloadHandler):
            match_scheme = "s3"

        @rule
        async def download_s3_file(request: S3DownloadHandler) -> Digest:
            # Lookup auth tokens, etc...
            # Ideally, download the file using `NativeDownloadFile()`
            return digest

        def rules():
            return [
                *collect_rules(),
                UnionRule(URLDownloadHandler, S3DownloadHandler),
            ]
    """

    match_scheme: ClassVar[Optional[str]] = None
    """The scheme to match (e.g. 'ftp' or 's3') or `None` to match all schemes.

    Note that 'http' and 'https' are two different schemes. In order to match either, you'll need to
    register both.
    """

    match_authority: ClassVar[Optional[str]] = None
    """The authority to match (e.g. 'pantsbuild.org' or 's3.amazonaws.com') or `None` to match all
    authorities.

    Note that the authority matches userinfo (e.g. '[email protected]' or 'me:[email protected]')
    as well as port (e.g. 'pantsbuild.org:80').
    """

    url: str
    expected_digest: FileDigest


@rule
async def download_file(
    request: DownloadFile,
    union_membership: UnionMembership,
) -> Digest:
    """Download a file, dispatching to the first registered handler matching the URL.

    Falls back to the native downloader when no `URLDownloadHandler` union member matches.
    """
    parsed_url = urlparse(request.url)
    handlers = union_membership.get(URLDownloadHandler)
    for handler in handlers:
        matches_scheme = handler.match_scheme is None or handler.match_scheme == parsed_url.scheme
        matches_authority = (
            handler.match_authority is None or handler.match_authority == parsed_url.netloc
        )
        # Both criteria must hold. A `None` ClassVar is a wildcard (always True), so combining
        # with `or` would incorrectly route URLs to handlers whose declared scheme or authority
        # does not match (e.g. a scheme-only handler would match every URL).
        if matches_scheme and matches_authority:
            digest = await Get(
                Digest, URLDownloadHandler, handler(request.url, request.expected_digest)
            )
            break
    else:
        digest = await Get(Digest, NativeDownloadFile(request.url, request.expected_digest))

    return digest


def rules():
    # Expose this module's rules (namely `download_file`) for engine registration.
    return collect_rules()
33 changes: 31 additions & 2 deletions src/python/pants/engine/fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from dataclasses import dataclass
from enum import Enum
from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union
from typing import TYPE_CHECKING, Iterable, Mapping, Optional, Tuple, Union

# Note: several of these types are re-exported as the public API of `engine/fs.py`.
from pants.base.glob_match_error_behavior import GlobMatchErrorBehavior as GlobMatchErrorBehavior
Expand All @@ -23,6 +23,7 @@
from pants.engine.internals.native_engine import RemovePrefix as RemovePrefix
from pants.engine.internals.native_engine import Snapshot as Snapshot
from pants.engine.rules import QueryRule
from pants.util.frozendict import FrozenDict
from pants.util.meta import frozen_after_init

if TYPE_CHECKING:
Expand Down Expand Up @@ -248,6 +249,34 @@ class DownloadFile:
expected_digest: FileDigest


@frozen_after_init
@dataclass(unsafe_hash=True)
class NativeDownloadFile:
    """Retrieve the contents of a file via an HTTP GET request or directly for local file:// URLs.

    This request is handled directly by the native engine without any additional coercion by plugins,
    and therefore should only be used in cases where the URL is known to be publicly accessible.
    Otherwise, callers should use `DownloadFile`.

    The auth_headers are part of this node's cache key for memoization (changing a header invalidates
    prior results) but are not part of the underlying cache key for the local/remote cache (changing
    a header won't re-download a file if the file was previously downloaded).
    """

    url: str
    expected_digest: FileDigest
    # NB: This mapping can be of any arbitrary headers, but should be limited to those required for
    # authorization.
    auth_headers: FrozenDict[str, str]

    def __init__(
        self, url: str, expected_digest: FileDigest, auth_headers: Mapping[str, str] | None = None
    ) -> None:
        self.url = url
        self.expected_digest = expected_digest
        # Normalize to an immutable FrozenDict so the dataclass hash (and thus memoization) is stable.
        self.auth_headers = FrozenDict(auth_headers or {})


@dataclass(frozen=True)
class Workspace(SideEffecting):
"""A handle for operations that mutate the local filesystem."""
Expand Down Expand Up @@ -300,7 +329,7 @@ def rules():
QueryRule(Digest, (PathGlobs,)),
QueryRule(Digest, (AddPrefix,)),
QueryRule(Digest, (RemovePrefix,)),
QueryRule(Digest, (DownloadFile,)),
QueryRule(Digest, (NativeDownloadFile,)),
QueryRule(Digest, (MergeDigests,)),
QueryRule(Digest, (DigestSubset,)),
QueryRule(DigestContents, (Digest,)),
Expand Down
4 changes: 2 additions & 2 deletions src/python/pants/engine/internals/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
DigestEntries,
DigestSubset,
Directory,
DownloadFile,
FileContent,
FileDigest,
FileEntry,
NativeDownloadFile,
PathGlobs,
PathGlobsAndRoot,
Paths,
Expand Down Expand Up @@ -157,7 +157,7 @@ def __init__(
path_globs=PathGlobs,
create_digest=CreateDigest,
digest_subset=DigestSubset,
download_file=DownloadFile,
native_download_file=NativeDownloadFile,
platform=Platform,
process=Process,
process_result=FallibleProcessResult,
Expand Down
3 changes: 2 additions & 1 deletion src/python/pants/init/engine_initializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from pants.build_graph.build_configuration import BuildConfiguration
from pants.core.util_rules import environments, system_binaries
from pants.core.util_rules.environments import determine_bootstrap_environment
from pants.engine import desktop, fs, process
from pants.engine import desktop, download_file, fs, process
from pants.engine.console import Console
from pants.engine.environment import EnvironmentName
from pants.engine.fs import PathGlobs, Snapshot, Workspace
Expand Down Expand Up @@ -272,6 +272,7 @@ def build_root_singleton() -> BuildRoot:
*fs.rules(),
*dep_rules.rules(),
*desktop.rules(),
*download_file.rules(),
*git_rules(),
*graph.rules(),
*specs_rules.rules(),
Expand Down
26 changes: 24 additions & 2 deletions src/rust/engine/src/downloads.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright 2021 Pants project contributors (see CONTRIBUTORS.md).
// Licensed under the Apache License, Version 2.0 (see LICENSE).

use std::collections::BTreeMap;
use std::io::{self, Write};
use std::pin::Pin;
use std::sync::Arc;
Expand All @@ -10,6 +11,7 @@ use bytes::{BufMut, Bytes};
use futures::stream::StreamExt;
use hashing::Digest;
use humansize::{file_size_opts, FileSize};
use reqwest::header::{HeaderMap, HeaderName};
use reqwest::Error;
use tokio_retry::strategy::{jitter, ExponentialBackoff};
use tokio_retry::RetryIf;
Expand Down Expand Up @@ -44,11 +46,21 @@ impl NetDownload {
async fn start(
core: &Arc<Core>,
url: Url,
auth_headers: BTreeMap<String, String>,
file_name: String,
) -> Result<NetDownload, StreamingError> {
let mut headers = HeaderMap::new();
for (k, v) in auth_headers.iter() {
thejcannon marked this conversation as resolved.
Show resolved Hide resolved
headers.insert(
HeaderName::from_bytes(k.as_bytes()).unwrap(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe avoid the .unwrap and return an error instead (using ? operator to just do it inline)? I.e., .map_err(|err| ...)?.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment for the following line.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In that case it'd be a dev mistake, so I'd rather crash-and-burn to make it obvious.

v.parse().unwrap(),
);
}

let response = core
.http_client
.get(url.clone())
.headers(headers)
.send()
.await
.map_err(|err| StreamingError::Retryable(format!("Error downloading file: {err}")))
Expand Down Expand Up @@ -127,6 +139,7 @@ impl StreamingDownload for FileDownload {
async fn attempt_download(
core: &Arc<Core>,
url: &Url,
auth_headers: &BTreeMap<String, String>,
file_name: String,
expected_digest: Digest,
) -> Result<(Digest, Bytes), StreamingError> {
Expand All @@ -144,7 +157,7 @@ async fn attempt_download(
}
Box::new(FileDownload::start(url.path(), file_name).await?)
} else {
Box::new(NetDownload::start(core, url.clone(), file_name).await?)
Box::new(NetDownload::start(core, url.clone(), auth_headers.clone(), file_name).await?)
}
};

Expand Down Expand Up @@ -195,6 +208,7 @@ async fn attempt_download(
pub async fn download(
core: Arc<Core>,
url: Url,
auth_headers: BTreeMap<String, String>,
file_name: String,
expected_digest: hashing::Digest,
) -> Result<(), String> {
Expand All @@ -215,7 +229,15 @@ pub async fn download(
let retry_strategy = ExponentialBackoff::from_millis(10).map(jitter).take(4);
RetryIf::spawn(
retry_strategy,
|| attempt_download(&core2, &url, file_name.clone(), expected_digest),
|| {
attempt_download(
&core2,
&url,
&auth_headers,
file_name.clone(),
expected_digest,
)
},
|err: &StreamingError| matches!(err, StreamingError::Retryable(_)),
)
.await
Expand Down
4 changes: 2 additions & 2 deletions src/rust/engine/src/externs/interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ impl PyTypes {
path_globs: &PyType,
create_digest: &PyType,
digest_subset: &PyType,
download_file: &PyType,
native_download_file: &PyType,
platform: &PyType,
process: &PyType,
process_result: &PyType,
Expand Down Expand Up @@ -215,7 +215,7 @@ impl PyTypes {
remove_prefix: TypeId::new(py.get_type::<externs::fs::PyRemovePrefix>()),
create_digest: TypeId::new(create_digest),
digest_subset: TypeId::new(digest_subset),
download_file: TypeId::new(download_file),
native_download_file: TypeId::new(native_download_file),
platform: TypeId::new(platform),
process: TypeId::new(process),
process_result: TypeId::new(process_result),
Expand Down
Loading