diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 9d35e3f97f..5d1d536704 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -26,5 +26,5 @@ ENV PYTHONDONTWRITEBYTECODE="1" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV HISTFILE="/home/coder/.cache/._bash_history" diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index a9d1b897bc..c1587fc548 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 3130ab8687..2eae34a99a 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index 70a8a37430..8693421bd5 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index 6b14f93e2d..041f6da4b1 100644 --- 
a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 1f8796e67f..b7947b9041 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -56,7 +56,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -68,7 +68,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -79,7 +79,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -100,7 +100,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 4499514060..613b135035 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -12,6 +12,7 @@ concurrency: jobs: pr-builder: needs: + - changed-files - checks - conda-cpp-build - conda-cpp-tests @@ -23,38 +24,71 @@ jobs: - wheel-python-build - wheel-python-tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 + if: always() + with: + needs: ${{ toJSON(needs) }} + changed-files: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12 + with: + files_yaml: | + test_cpp: + - '**' + - '!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!CONTRIBUTING.md' + - '!README.md' + - '!docs/**' + - '!notebooks/**' + - '!python/**' + test_notebooks: + - '**' + - '!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!CONTRIBUTING.md' + - '!README.md' + test_python: + - '**' + - '!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!CONTRIBUTING.md' + - '!README.md' + - '!docs/**' + - '!notebooks/**' checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: pull-request conda-cpp-tests: - needs: conda-cpp-build + needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: pull-request conda-python-tests: - needs: conda-python-build + needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -63,17 +97,18 @@ jobs: run_script: "ci/build_docs.sh" devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12 with: arch: '["amd64"]' cuda: '["12.5"]' build_command: | sccache -z; build-all --verbose; + python -c "import 
kvikio; print(kvikio.__version__)";
         sccache -s;
   wheel-cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
       build_type: pull-request
@@ -81,14 +116,15 @@
   wheel-python-build:
     needs: wheel-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
     with:
       build_type: pull-request
       script: ci/build_wheel_python.sh
   wheel-python-tests:
-    needs: wheel-python-build
+    needs: [wheel-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel.sh
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index ef093ee79d..edec9999fd 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -24,7 +24,7 @@
       sha: ${{ inputs.sha }}
   python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 639467d6aa..4f6db170bc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -62,6 +62,10 @@ repos:
           - cmakelang==0.6.13
         verbose: true
         require_serial: true
+        exclude: |
+          (?x)^(
+            cpp/cmake/Modules/FindCUDAToolkit[.]cmake$
+          )
       - id: cmake-lint
         name: cmake-lint
         entry: ./cpp/scripts/run-cmake-format.sh cmake-lint
@@ -73,6 +77,10 @@
           - cmakelang==0.6.13
         verbose: true
         require_serial: true
+        exclude: |
+          (?x)^(
+            cpp/cmake/Modules/FindCUDAToolkit[.]cmake$
+          )
   - repo: https://github.com/codespell-project/codespell
     rev: v2.2.4
     hooks:
@@ -90,10 +98,14 @@
           [.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$|
           CMakeLists[.]txt$|
           meta[.]yaml$
+        exclude: |
+          (?x)^(
+            cpp/cmake/Modules/FindCUDAToolkit[.]cmake$
+          )
       - id: verify-alpha-spec
         args: ["--fix", "--mode=release"]
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.13.11
+    rev: v1.16.0
     hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ef921bf18a..9d7a5e150b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,67 @@
+# kvikio 24.12.00 (11 Dec 2024)
+
+## 🚨 Breaking Changes
+
+- Use curl >=8.5.0 to align with conda-forge and avoid CVEs. ([#574](https://github.com/rapidsai/kvikio/pull/574)) [@bdice](https://github.com/bdice)
+- cufile version ([#565](https://github.com/rapidsai/kvikio/pull/565)) [@madsbk](https://github.com/madsbk)
+- Add a new KvikIO compatibility mode "AUTO" ([#547](https://github.com/rapidsai/kvikio/pull/547)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu)
+- Build KvikIO as a shared library ([#527](https://github.com/rapidsai/kvikio/pull/527)) [@madsbk](https://github.com/madsbk)
+- Small improvements ([#493](https://github.com/rapidsai/kvikio/pull/493)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu)
+
+## 🐛 Bug Fixes
+
+- Use curl >=8.5.0 to align with conda-forge and avoid CVEs. ([#574](https://github.com/rapidsai/kvikio/pull/574)) [@bdice](https://github.com/bdice)
+- cufile version ([#565](https://github.com/rapidsai/kvikio/pull/565)) [@madsbk](https://github.com/madsbk)
+- Fix the pytest error for async io ([#559](https://github.com/rapidsai/kvikio/pull/559)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu)
+- fix library-loading issues in editable installs ([#553](https://github.com/rapidsai/kvikio/pull/553)) [@jameslamb](https://github.com/jameslamb)
+- Backport `FindCUDAToolkit` from CMake 3.31 ([#550](https://github.com/rapidsai/kvikio/pull/550)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Fix exporting of include directories ([#540](https://github.com/rapidsai/kvikio/pull/540)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Moving details in file_handle.hpp to .cpp ([#539](https://github.com/rapidsai/kvikio/pull/539)) [@madsbk](https://github.com/madsbk)
+- Disallow cuda-python 12.6.1 and 11.8.4 ([#537](https://github.com/rapidsai/kvikio/pull/537)) [@bdice](https://github.com/bdice)
+- Fix case of find_package call ([#534](https://github.com/rapidsai/kvikio/pull/534)) [@vyasr](https://github.com/vyasr)
+- CurlHandle: fix error message handling ([#522](https://github.com/rapidsai/kvikio/pull/522)) [@madsbk](https://github.com/madsbk)
+- Don't use macros for cuda driver functions ([#516](https://github.com/rapidsai/kvikio/pull/516)) [@Jacobfaib](https://github.com/Jacobfaib)
+- Fix CUDA driver type stub definitions ([#511](https://github.com/rapidsai/kvikio/pull/511)) [@Jacobfaib](https://github.com/Jacobfaib)
+- Stop tagging wheels as arch-agnostic ([#507](https://github.com/rapidsai/kvikio/pull/507)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Include <utility> since we use std::exchange ([#505](https://github.com/rapidsai/kvikio/pull/505)) [@robertmaynard](https://github.com/robertmaynard)
+- Disabling curl tests doesn't now disable kvikio cpp tests ([#503](https://github.com/rapidsai/kvikio/pull/503)) [@robertmaynard](https://github.com/robertmaynard)
+- Visibility of static class methods ([#492](https://github.com/rapidsai/kvikio/pull/492)) [@madsbk](https://github.com/madsbk)
+
+## 📖 Documentation
+
+- to ([#532](https://github.com/rapidsai/kvikio/pull/532)) [@RichardScottOZ](https://github.com/RichardScottOZ)
+
+## 🚀 New Features
+
+- Add a new KvikIO compatibility mode "AUTO" ([#547](https://github.com/rapidsai/kvikio/pull/547)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu)
+- Upgrade nvcomp to 4.1.0.6 ([#525](https://github.com/rapidsai/kvikio/pull/525)) [@bdice](https://github.com/bdice)
+
+## 🛠️ Improvements
+
+- prefer wheel-provided libkvikio.so, use RTLD_LOCAL ([#551](https://github.com/rapidsai/kvikio/pull/551)) [@jameslamb](https://github.com/jameslamb)
+- enforce wheel size limits, README formatting in CI ([#548](https://github.com/rapidsai/kvikio/pull/548)) [@jameslamb](https://github.com/jameslamb)
+- remove WheelHelpers.cmake ([#545](https://github.com/rapidsai/kvikio/pull/545)) [@jameslamb](https://github.com/jameslamb)
+- Put a ceiling on cuda-python ([#543](https://github.com/rapidsai/kvikio/pull/543)) [@jameslamb](https://github.com/jameslamb)
+- Replace FindcuFile with upstream FindCUDAToolkit support ([#542](https://github.com/rapidsai/kvikio/pull/542)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Build KvikIO as a shared library ([#527](https://github.com/rapidsai/kvikio/pull/527)) [@madsbk](https://github.com/madsbk)
+- print sccache stats in builds ([#524](https://github.com/rapidsai/kvikio/pull/524)) [@jameslamb](https://github.com/jameslamb)
+- RemoteIO: use a pinned bounce buffer ([#519](https://github.com/rapidsai/kvikio/pull/519)) [@madsbk](https://github.com/madsbk)
+- Use registered strings for NVTX. Add more NVTX annotations. ([#518](https://github.com/rapidsai/kvikio/pull/518)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu)
+- Minor README fixes ([#517](https://github.com/rapidsai/kvikio/pull/517)) [@Jacobfaib](https://github.com/Jacobfaib)
+- Python bindings to `cuFileDriverOpen()` and `cuFileDriverClose()` ([#514](https://github.com/rapidsai/kvikio/pull/514)) [@madsbk](https://github.com/madsbk)
+- Add CUDA libs in Python Conda, Consolidate Conda CI installs & use `rapids-dask-dependency` ([#513](https://github.com/rapidsai/kvikio/pull/513)) [@jakirkham](https://github.com/jakirkham)
+- S3 benchmark: adding cudf-kvikio and cudf-fsspec ([#509](https://github.com/rapidsai/kvikio/pull/509)) [@madsbk](https://github.com/madsbk)
+- Use Cython's `array` to back `Py_ssize_t[::1]` ([#504](https://github.com/rapidsai/kvikio/pull/504)) [@jakirkham](https://github.com/jakirkham)
+- Mark all of `Array`'s `nogil` `cdef` functions as `noexcept` ([#502](https://github.com/rapidsai/kvikio/pull/502)) [@jakirkham](https://github.com/jakirkham)
+- Simplify `_to_string` encoding of Python `str`s ([#498](https://github.com/rapidsai/kvikio/pull/498)) [@jakirkham](https://github.com/jakirkham)
+- make conda installs in CI stricter ([#495](https://github.com/rapidsai/kvikio/pull/495)) [@jameslamb](https://github.com/jameslamb)
+- Small improvements ([#493](https://github.com/rapidsai/kvikio/pull/493)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu)
+- Prune workflows based on changed files ([#489](https://github.com/rapidsai/kvikio/pull/489)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Remote IO: S3 support ([#479](https://github.com/rapidsai/kvikio/pull/479)) [@madsbk](https://github.com/madsbk)
+- Use nvcomp wheel instead of bundling nvcomp ([#478](https://github.com/rapidsai/kvikio/pull/478)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Touch ups to `Array` ([#475](https://github.com/rapidsai/kvikio/pull/475)) [@jakirkham](https://github.com/jakirkham)
+- Remote IO: http support ([#464](https://github.com/rapidsai/kvikio/pull/464)) [@madsbk](https://github.com/madsbk)
+
 # kvikio 24.10.00 (9 Oct 2024)
 
 ## 🚨 Breaking Changes
diff --git a/README.md b/README.md
index e787f8a4e0..50e4328fbd 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,6 @@ KvikIO (pronounced "kuh-VICK-eye-oh", see [here](https://ordnet.dk/ddo_en/dict?q
 bindings to [cuFile](https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html), which enables
 [GPUDirect Storage (GDS)](https://developer.nvidia.com/blog/gpudirect-storage/).
 KvikIO also works efficiently when GDS isn't available and can read/write both host and device data seamlessly.
-The C++ library is header-only making it easy to include in [existing projects](https://github.com/rapidsai/kvikio/blob/HEAD/cpp/examples/downstream/).
 
 ### Features
@@ -70,9 +69,9 @@ if __name__ == "__main__":
 #### C++
 ```c++
 #include
+#include
 #include
 #include
-using namespace std;
 
 int main()
 {
@@ -85,12 +84,12 @@ int main()
   // Write `a` to file
   kvikio::FileHandle fw("test-file", "w");
-  size_t written = fw.write(a, size);
+  std::size_t written = fw.write(a, size);
   fw.close();
 
   // Read file into `b`
   kvikio::FileHandle fr("test-file", "r");
-  size_t read = fr.read(b, size);
+  std::size_t read = fr.read(b, size);
   fr.close();
 
   // Read file into `b` in parallel using 16 threads
@@ -98,8 +97,8 @@ int main()
   {
     // FileHandles have RAII semantics
     kvikio::FileHandle f("test-file", "r");
-    future future = f.pread(b_dev, sizeof(a), 0); // Non-blocking
-    size_t read = future.get(); // Blocking
+    std::future future = f.pread(b_dev, sizeof(a), 0); // Non-blocking
+    std::size_t read = future.get(); // Blocking
     // Notice, `f` closes automatically on destruction.
   }
 }
diff --git a/VERSION b/VERSION
index 7c7ba04436..af28c42b52 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.10.00
+24.12.00
diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index 27ea30176d..8fb3a35991 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -16,6 +16,10 @@ rapids-print-env
 rapids-logger "Begin cpp build"
 
 conda config --set path_conflict prevent
+sccache --zero-stats
+
 rapids-conda-retry mambabuild conda/recipes/libkvikio
+sccache --show-adv-stats
+
 rapids-upload-conda-to-s3 cpp
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index af1e23ae3c..ea408b6940 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -6,28 +6,22 @@ set -euo pipefail
 
 rapids-logger "Create test conda environment"
 
 . 
/opt/conda/etc/profile.d/conda.sh +rapids-logger "Downloading artifacts from previous jobs" +CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) + rapids-dependency-file-generator \ --output conda \ --file-key docs \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" \ + --prepend-channel "${CPP_CHANNEL}" --prepend-channel "${PYTHON_CHANNEL}" \ + | tee env.yaml -rapids-mamba-retry env create --yes -f env.yaml -n docs +rapids-mamba-retry env create -yq -f env.yaml -n docs conda activate docs rapids-print-env -rapids-logger "Downloading artifacts from previous jobs" -CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) -PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) - -rapids-mamba-retry install \ - --channel "${CPP_CHANNEL}" \ - --channel "${PYTHON_CHANNEL}" \ - kvikio libkvikio - -export RAPIDS_VERSION="$(rapids-version)" -export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" -export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" @@ -44,4 +38,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/kvikio/"html mv _html/* "${RAPIDS_DOCS_DIR}/kvikio/html" popd -rapids-upload-docs +RAPIDS_VERSION_NUMBER="$(rapids-version-major-minor)" rapids-upload-docs diff --git a/ci/build_python.sh b/ci/build_python.sh index 0b39b6c91f..7e0fc0bf93 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -18,8 +18,12 @@ rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) conda config --set path_conflict prevent +sccache --zero-stats + rapids-conda-retry mambabuild \ --channel "${CPP_CHANNEL}" \ conda/recipes/kvikio +sccache --show-adv-stats + rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh new file mode 100755 index 0000000000..b1ede832da --- /dev/null +++ b/ci/build_wheel.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +package_name=$1 +package_dir=$2 + +source rapids-configure-sccache +source rapids-date-string + +rapids-generate-version > ./VERSION + +cd "${package_dir}" + +sccache --zero-stats + +rapids-logger "Building '${package_name}' wheel" +python -m pip wheel \ + -w dist \ + -v \ + --no-deps \ + --disable-pip-version-check \ + . + +sccache --show-adv-stats diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh index 9893474da3..ca27717769 100755 --- a/ci/build_wheel_cpp.sh +++ b/ci/build_wheel_cpp.sh @@ -6,20 +6,36 @@ set -euo pipefail package_name="libkvikio" package_dir="python/libkvikio" -source rapids-configure-sccache -source rapids-date-string +rapids-logger "Generating build requirements" -rapids-generate-version > ./VERSION +rapids-dependency-file-generator \ + --output requirements \ + --file-key "py_build_${package_name}" \ + --file-key "py_rapids_build_${package_name}" \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};cuda_suffixed=true" \ +| tee /tmp/requirements-build.txt -cd "${package_dir}" +rapids-logger "Installing build requirements" +python -m pip install \ + -v \ + --prefer-binary \ + -r /tmp/requirements-build.txt -python -m pip install wheel -# libkvikio is a header-only C++ library with no Python code, so -# it is entirely platform-agnostic. We cannot use auditwheel for -# retagging since it has no extension modules, so we use `wheel` -# directly instead. -python -m pip wheel . 
-w dist -vvv --no-deps --disable-pip-version-check -python -m wheel tags --platform any dist/* --remove +# build with '--no-build-isolation', for better sccache hit rate +# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735) +export PIP_NO_BUILD_ISOLATION=0 + +export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" +./ci/build_wheel.sh "${package_name}" "${package_dir}" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp dist + +mkdir -p ${package_dir}/final_dist +python -m auditwheel repair \ + --exclude libnvcomp.so.4 \ + -w ${package_dir}/final_dist \ + ${package_dir}/dist/* + +./ci/validate_wheel.sh ${package_dir} final_dist + +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist" diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh index a75ef5b08b..088e8e8e8f 100755 --- a/ci/build_wheel_python.sh +++ b/ci/build_wheel_python.sh @@ -6,27 +6,25 @@ set -euo pipefail package_name="kvikio" package_dir="python/kvikio" -source rapids-configure-sccache -source rapids-date-string - RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -rapids-generate-version > ./VERSION - -CPP_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="libkvikio_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libkvikio_dist) - -cd "${package_dir}" - -# ensure 'kvikio' wheel builds always use the 'libkvikio' just built in the same CI run +# Ensure 'kvikio' wheel builds always use the 'libkvikio' just built in the same CI run # -# using env variable PIP_CONSTRAINT is necessary to ensure the constraints +# Using env variable PIP_CONSTRAINT is necessary to ensure the constraints # are used when creating the isolated build environment -echo "libkvikio-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo ${CPP_WHEELHOUSE}/libkvikio_*.whl)" > ./constraints.txt +RAPIDS_PY_WHEEL_NAME="libkvikio_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libkvikio_dist +echo "libkvikio-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libkvikio_dist/libkvikio_*.whl)" > /tmp/constraints.txt +export PIP_CONSTRAINT="/tmp/constraints.txt" + +export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" +./ci/build_wheel.sh "${package_name}" "${package_dir}" -PIP_CONSTRAINT="${PWD}/constraints.txt" \ - python -m pip wheel . 
-w dist -vvv --no-deps --disable-pip-version-check +python -m auditwheel repair \ + --exclude libkvikio.so \ + --exclude libnvcomp.so.4 \ + -w ${package_dir}/final_dist \ + ${package_dir}/dist/* -mkdir -p final_dist -python -m auditwheel repair -w final_dist dist/* +./ci/validate_wheel.sh ${package_dir} final_dist -RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 6b9458b273..102beaa2ba 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -36,7 +36,10 @@ function sed_runner() { echo "${NEXT_FULL_TAG}" > VERSION DEPENDENCIES=( + kvikio libkvikio + libkvikio-tests + rapids-dask-dependency ) for DEP in "${DEPENDENCIES[@]}"; do for FILE in dependencies.yaml conda/environments/*.yaml; do diff --git a/ci/run_pytests.sh b/ci/run_pytests.sh index b2c93dbe56..1a7edb5be5 100755 --- a/ci/run_pytests.sh +++ b/ci/run_pytests.sh @@ -6,4 +6,9 @@ set -euo pipefail # Support invoking run_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/kvikio -pytest --cache-clear --verbose "$@" tests +# If running CUDA 11.8 on arm64, we skip tests marked "cufile" since +# cuFile didn't support arm until 12.4 +[[ "${CUDA_VERSION}" == "11.8.0" && "${RUNNER_ARCH}" == "ARM64" ]] \ + && PYTEST_MARK=( -m 'not cufile' ) || PYTEST_MARK=() + +pytest --cache-clear --verbose "${PYTEST_MARK[@]}" "$@" tests diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index e7de3e68f2..ef7933f150 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -5,30 +5,28 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh +CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) + rapids-logger "Generate C++ testing dependencies" rapids-dependency-file-generator \ --output conda \ --file-key test_cpp \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee env.yaml + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" \ + --prepend-channel "${CPP_CHANNEL}" \ + | tee env.yaml -rapids-mamba-retry env create --yes -f env.yaml -n test +rapids-mamba-retry env create -qy -f env.yaml -n test # Temporarily allow unbound variables for conda activation. set +u conda activate test set -u -CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" -SUITEERROR=0 rapids-print-env -rapids-mamba-retry install \ - --channel "${CPP_CHANNEL}" \ - libkvikio libkvikio-tests - rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_python.sh b/ci/test_python.sh index fccbcb4728..df16f20f9f 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -8,34 +8,31 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ . 
/opt/conda/etc/profile.d/conda.sh +rapids-logger "Downloading artifacts from previous jobs" +CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) + rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ --output conda \ --file-key test_python \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" \ + --prepend-channel "${CPP_CHANNEL}" --prepend-channel "${PYTHON_CHANNEL}" \ + | tee env.yaml -rapids-mamba-retry env create --yes -f env.yaml -n test +rapids-mamba-retry env create -qy -f env.yaml -n test # Temporarily allow unbound variables for conda activation. set +u conda activate test set -u -rapids-logger "Downloading artifacts from previous jobs" -CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) -PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) - RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"} mkdir -p "${RAPIDS_TESTS_DIR}" "${RAPIDS_COVERAGE_DIR}" rapids-print-env -rapids-mamba-retry install \ - --channel "${CPP_CHANNEL}" \ - --channel "${PYTHON_CHANNEL}" \ - libkvikio kvikio - rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 94a31b04b6..46ae5cbcf2 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -4,9 +4,18 @@ set -eou pipefail RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -WHEELHOUSE="${PWD}/dist/" -RAPIDS_PY_WHEEL_NAME="kvikio_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python "${WHEELHOUSE}" -python -m pip install "$(echo ${WHEELHOUSE}/kvikio_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" +# Download and install the libkvikio and kvikio wheels built in the previous step +RAPIDS_PY_WHEEL_NAME="libkvikio_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist +RAPIDS_PY_WHEEL_NAME="kvikio_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist -python -m pytest ./python/kvikio/tests +python -m pip install -v \ + "$(echo ./dist/libkvikio_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ + "$(echo ./dist/kvikio_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" + +# If running CUDA 11.8 on arm64, we skip tests marked "cufile" since +# cuFile didn't support arm until 12.4 +[[ "${CUDA_VERSION}" == "11.8.0" && "${RUNNER_ARCH}" == "ARM64" ]] \ + && PYTEST_MARK=( -m 'not cufile' ) || PYTEST_MARK=() + +python -m pytest --cache-clear --verbose "${PYTEST_MARK[@]}" ./python/kvikio/tests diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh new file mode 100755 index 0000000000..5910a5c59f --- /dev/null +++ b/ci/validate_wheel.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. 
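
The two CI scripts above (`ci/run_pytests.sh` and `ci/test_wheel.sh`) skip tests marked `cufile` on CUDA 11.8 arm64 runners by expanding `PYTEST_MARK` to `-m 'not cufile'`. The sketch below illustrates how a pytest marker like that is typically registered and applied; the marker name comes from the scripts above, but the `conftest.py`/test layout shown here is a hypothetical illustration, not kvikio's actual test suite.

```python
# conftest.py -- hypothetical sketch of registering a "cufile" marker.
import pytest


def pytest_configure(config):
    # Registering the marker documents it and silences PytestUnknownMarkWarning.
    config.addinivalue_line(
        "markers", "cufile: tests that require the cuFile (GPUDirect Storage) library"
    )


# test_gds.py -- a test guarded by the marker.
@pytest.mark.cufile
def test_gds_read_write(tmp_path):
    # Placeholder body: only collected into a run when cuFile-backed IO is expected to work.
    assert tmp_path.exists()
```

Running `pytest -m 'not cufile'` then deselects every test carrying the marker, which is exactly what the `PYTEST_MARK` array expands to on CUDA 11.8 arm64.
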
+ +set -euo pipefail + +package_dir=$1 +wheel_dir_relative_path=$2 + +cd "${package_dir}" + +rapids-logger "validate packages with 'pydistcheck'" + +pydistcheck \ + --inspect \ + "$(echo ${wheel_dir_relative_path}/*.whl)" + +rapids-logger "validate packages with 'twine'" + +twine check \ + --strict \ + "$(echo ${wheel_dir_relative_path}/*.whl)" diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index 65ca39fa34..b94c2b8780 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -6,29 +6,33 @@ channels: - conda-forge - nvidia dependencies: +- boto3>=1.21.21 - c-compiler - cmake>=3.26.4,!=3.30.0 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask>=2022.05.2 - doxygen=1.9.1 - gcc_linux-aarch64=11.* +- libcurl>=8.5.0,<9.0a0 +- moto>=4.0.8 - ninja - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-aarch64=11.8 -- nvcomp==4.0.1 +- nvcomp==4.1.0.6 - packaging - pre-commit - pytest - pytest-cov - python>=3.10,<3.13 +- rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-dask-dependency==24.12.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index a020690e64..87d6fc1025 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -6,31 +6,35 @@ channels: - conda-forge - nvidia dependencies: +- boto3>=1.21.21 - c-compiler - cmake>=3.26.4,!=3.30.0 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask>=2022.05.2 - doxygen=1.9.1 - gcc_linux-64=11.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 +- libcurl>=8.5.0,<9.0a0 +- moto>=4.0.8 - ninja - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 -- nvcomp==4.0.1 +- nvcomp==4.1.0.6 - packaging - pre-commit - pytest - pytest-cov - python>=3.10,<3.13 +- rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-dask-dependency==24.12.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 31145241d7..1ace3210a8 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -6,29 +6,33 @@ channels: - conda-forge - nvidia dependencies: +- boto3>=1.21.21 - c-compiler - cmake>=3.26.4,!=3.30.0 - cuda-nvcc -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask>=2022.05.2 - doxygen=1.9.1 - gcc_linux-aarch64=11.* - libcufile-dev +- libcurl>=8.5.0,<9.0a0 +- moto>=4.0.8 - ninja - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc -- nvcomp==4.0.1 +- nvcomp==4.1.0.6 - packaging - pre-commit - pytest - pytest-cov - python>=3.10,<3.13 +- rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-dask-dependency==24.12.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 4d7d0be7c6..25b6a075de 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ 
b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -6,29 +6,33 @@ channels: - conda-forge - nvidia dependencies: +- boto3>=1.21.21 - c-compiler - cmake>=3.26.4,!=3.30.0 - cuda-nvcc -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask>=2022.05.2 - doxygen=1.9.1 - gcc_linux-64=11.* - libcufile-dev +- libcurl>=8.5.0,<9.0a0 +- moto>=4.0.8 - ninja - numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc -- nvcomp==4.0.1 +- nvcomp==4.1.0.6 - packaging - pre-commit - pytest - pytest-cov - python>=3.10,<3.13 +- rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-dask-dependency==24.12.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/recipes/kvikio/conda_build_config.yaml b/conda/recipes/kvikio/conda_build_config.yaml index 35cb51e2f9..776c2623e5 100644 --- a/conda/recipes/kvikio/conda_build_config.yaml +++ b/conda/recipes/kvikio/conda_build_config.yaml @@ -4,6 +4,9 @@ c_compiler_version: cxx_compiler_version: - 11 +cmake_version: + - ">=3.26.4,!=3.30.0" + cuda_compiler: - cuda-nvcc @@ -16,8 +19,18 @@ c_stdlib: c_stdlib_version: - "2.17" -cmake_version: - - ">=3.26.4,!=3.30.0" +# The CTK libraries below are missing from the conda-forge::cudatoolkit package +# for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages +# and the "*_run_*" version specifiers correspond to `11.x` packages. + +cuda11_libcufile_host_version: + - "1.4.0.31" + +cuda11_libcufile_run_version: + - ">=1.0.0.82,<=1.4.0.31" + +libcurl_version: + - "==8.5.0" nvcomp_version: - - "=4.0.1" + - "=4.1.0.6" diff --git a/conda/recipes/kvikio/meta.yaml b/conda/recipes/kvikio/meta.yaml index 4a352012e3..271712ac51 100644 --- a/conda/recipes/kvikio/meta.yaml +++ b/conda/recipes/kvikio/meta.yaml @@ -37,6 +37,8 @@ build: - {{ compiler('cuda11') }} {% else %} - {{ compiler('cuda') }} + - cuda-cudart-dev + - libcufile-dev # [linux] {% endif %} requirements: @@ -58,12 +60,17 @@ requirements: - cython >=3.0.0 {% if cuda_major == "11" %} - cudatoolkit + - libcufile {{ cuda11_libcufile_run_version }} # [linux64] + {% else %} + - cuda-cudart-dev + - libcufile-dev # [linux] {% endif %} - cuda-version ={{ cuda_version }} - nvcomp {{ nvcomp_version }} - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 - libkvikio ={{ version }} + - libcurl {{ libcurl_version }} run: - python - numpy >=1.23,<3.0a0 @@ -75,6 +82,10 @@ requirements: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} - cudatoolkit + - libcufile {{ cuda11_libcufile_run_version }} # [linux64] + {% else %} + - cuda-cudart + - libcufile # [linux] {% endif %} test: diff --git a/conda/recipes/libkvikio/conda_build_config.yaml b/conda/recipes/libkvikio/conda_build_config.yaml index 9cf2923599..bacf9b8273 100644 --- a/conda/recipes/libkvikio/conda_build_config.yaml +++ b/conda/recipes/libkvikio/conda_build_config.yaml @@ -28,3 +28,6 @@ cuda11_libcufile_host_version: cuda11_libcufile_run_version: - ">=1.0.0.82,<=1.4.0.31" + +libcurl_version: + - "==8.5.0" diff --git a/conda/recipes/libkvikio/meta.yaml b/conda/recipes/libkvikio/meta.yaml index 186c373f56..4019a55ec8 100644 --- a/conda/recipes/libkvikio/meta.yaml +++ b/conda/recipes/libkvikio/meta.yaml @@ -52,6 +52,7 @@ requirements: {% else %} - libcufile-dev # [linux] {% endif %} + - libcurl {{ libcurl_version }} outputs: - name: libkvikio @@ -74,6 +75,7 @@ outputs: - cmake {{ cmake_version }} host: - cuda-version ={{ cuda_version }} 
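
As an aside on the dependency changes above: the new upper bounds on `cuda-python` (`<=11.8.3` for CUDA 11, `<=12.6.0` for CUDA 12) implement the "Disallow cuda-python 12.6.1 and 11.8.4" fix listed in the changelog. The snippet below checks what those ranges admit using the `packaging` library; this is only an approximation, since conda's MatchSpec grammar is not identical to PEP 440.

```python
# Rough sanity check of the new cuda-python pins (PEP 440 semantics via `packaging`).
from packaging.specifiers import SpecifierSet
from packaging.version import Version

cuda11_pin = SpecifierSet(">=11.7.1,<12.0a0,<=11.8.3")
cuda12_pin = SpecifierSet(">=12.0,<13.0a0,<=12.6.0")

print(Version("11.8.3") in cuda11_pin)  # True  -- newest allowed CUDA 11 build
print(Version("11.8.4") in cuda11_pin)  # False -- explicitly disallowed (PR #537)
print(Version("12.6.0") in cuda12_pin)  # True
print(Version("12.6.1") in cuda12_pin)  # False -- explicitly disallowed (PR #537)
```
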
+ - libcurl {{ libcurl_version }} run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5990405b1c..a0639c5382 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -35,13 +35,23 @@ rapids_cmake_write_version_file(include/kvikio/version_config.hpp) # Set a default build type if none was specified rapids_cmake_build_type(Release) -# build options +# ################################################################################################## +# * build options ---------------------------------------------------------------------------------- + +option(BUILD_SHARED_LIBS "Build KvikIO shared library" ON) option(KvikIO_BUILD_EXAMPLES "Configure CMake to build examples" ON) option(KvikIO_BUILD_TESTS "Configure CMake to build tests" ON) +option(KvikIO_REMOTE_SUPPORT "Configure CMake to build with remote IO support" ON) +option(KvikIO_CUDA_SUPPORT "Configure CMake to build with CUDA support" ON) +option(KvikIO_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON) +# ################################################################################################## +# * conda environment ------------------------------------------------------------------------------ rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) -# find packages we depend on +# ################################################################################################## +# * dependencies ----------------------------------------------------------------------------------- + rapids_cpm_init() rapids_find_package( @@ -50,95 +60,129 @@ rapids_find_package( INSTALL_EXPORT_SET kvikio-exports ) -rapids_find_package( - CUDAToolkit - BUILD_EXPORT_SET kvikio-exports - INSTALL_EXPORT_SET kvikio-exports -) +if(KvikIO_REMOTE_SUPPORT) + include(cmake/thirdparty/get_libcurl.cmake) +endif() -if(CUDAToolkit_FOUND) +set(cuFile_FOUND 0) +if(KvikIO_CUDA_SUPPORT) rapids_find_package( - cuFile + CUDAToolkit REQUIRED BUILD_EXPORT_SET kvikio-exports INSTALL_EXPORT_SET kvikio-exports ) - if(NOT cuFile_FOUND) + include(cmake/thirdparty/get_nvtx.cmake) + + if(NOT TARGET CUDA::cuFile) message( WARNING "Cannot find cuFile - KvikIO will still work but won't use GPUDirect Storage (GDS)" ) else() - file(READ "${cuFile_INCLUDE_DIRS}/cufile.h" CUFILE_H_STR) - string(FIND "${CUFILE_H_STR}" "cuFileBatchIOSetUp" cuFileBatchIOSetUp_location) - if(cuFileBatchIOSetUp_location EQUAL "-1") - set(cuFile_BATCH_API_FOUND FALSE) - else() - set(cuFile_BATCH_API_FOUND TRUE) - endif() + set(cuFile_FOUND 1) + + # Check API support + try_compile( + cuFile_BATCH_API_FOUND SOURCE_FROM_CONTENT + batch.cpp + [[#include + int main() { + cuFileBatchIOSetUp(nullptr, 0); + return 0; + } + ]] + LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS} + OUTPUT_VARIABLE batch_output + ) message(STATUS "Found cuFile Batch API: ${cuFile_BATCH_API_FOUND}") - string(FIND "${CUFILE_H_STR}" "cuFileReadAsync" cuFileReadAsync_location) - if(cuFileReadAsync_location EQUAL "-1") - set(cuFile_STREAM_API_FOUND FALSE) - else() - set(cuFile_STREAM_API_FOUND TRUE) - endif() + try_compile( + cuFile_STREAM_API_FOUND SOURCE_FROM_CONTENT + stream.cpp + [[#include + int main() { + CUfileHandle_t fh; + CUstream stream; + cuFileReadAsync(fh, nullptr, nullptr, nullptr, nullptr, nullptr, stream); + return 0; + } + ]] + LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS} + OUTPUT_VARIABLE stream_output + ) message(STATUS "Found cuFile Stream API: ${cuFile_STREAM_API_FOUND}") + try_compile( + 
cuFile_VERSION_API_FOUND SOURCE_FROM_CONTENT + version.cpp + [[#include + int main() { + int version; + cuFileGetVersion(&version); + return 0; + } + ]] + LINK_LIBRARIES CUDA::cuFile rt ${CMAKE_DL_LIBS} + OUTPUT_VARIABLE version_output + ) + message(STATUS "Found cuFile Version API: ${cuFile_VERSION_API_FOUND}") endif() - - include(cmake/thirdparty/get_nvtx.cmake) endif() include(cmake/thirdparty/get_thread_pool.cmake) -# library targets -add_library(kvikio INTERFACE) -add_library(kvikio::kvikio ALIAS kvikio) +# ################################################################################################## +# * library targets -------------------------------------------------------------------------------- -# We enable CUDA and cuFile both here and in the FINAL_CODE_BLOCK export block. While the code block -# below (in FINAL_CODE_BLOCK) sets this information when KvikIO is imported from a -# kvikio-config.cmake file, this code block is intended to be used by projects that include KvikIO's -# source directory in their own CMake build. -# -# Normally we would just set the below without using $, and without the -# final_code_string, but in this case we want to conditionally set these things at import time, not -# export time, since KvikIO is a header-only library that can adapt to different build environments. - -# Enable CUDA in KvikIO -if(CUDAToolkit_FOUND) - if(CUDA_STATIC_RUNTIME) - target_link_libraries(kvikio INTERFACE $) - else() - target_link_libraries(kvikio INTERFACE $) - endif() - target_compile_definitions(kvikio INTERFACE $) -else() - message(WARNING "Building KvikIO without CUDA") -endif() +set(SOURCES "src/file_handle.cpp") -# Enable supported cuFile features in KvikIO -if(cuFile_FOUND) - target_link_libraries(kvikio INTERFACE $) - target_compile_definitions(kvikio INTERFACE $) - if(cuFile_BATCH_API_FOUND) - target_compile_definitions( - kvikio INTERFACE $ - ) - endif() - if(cuFile_STREAM_API_FOUND) - target_compile_definitions( - kvikio INTERFACE $ - ) - endif() +if(KvikIO_REMOTE_SUPPORT) + list(APPEND SOURCES "src/remote_handle.cpp") endif() +add_library(kvikio ${SOURCES}) + +# To avoid symbol conflicts when statically linking to libcurl.a (see get_libcurl.cmake) and its +# dependency OpenSSL, we exclude them when building libkvikio.so. This way, libkvikio.so will not +# expose any OpenSSL symbols that could conflict with downstream users like CPython that also links +# to (another version of) OpenSSL. +target_link_options(kvikio PRIVATE "LINKER:--exclude-libs,ALL") + +add_library(kvikio::kvikio ALIAS kvikio) + target_include_directories( - kvikio INTERFACE "$" - "$" + kvikio + PUBLIC "$" + "$:${CUDAToolkit_INCLUDE_DIRS}>>" + INTERFACE "$" ) + +# Notice, we do not link to cuda or cufile since KvikIO opens them manually using `dlopen()`. 
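
The CMake comment above notes that KvikIO does not link against the CUDA driver or cuFile at build time but opens them with `dlopen()` at runtime. The Python sketch below illustrates that pattern via `ctypes.CDLL`, which wraps `dlopen()` on Linux; it mirrors the idea only and is not KvikIO's actual loader, and the `libcufile.so.0` soname is an assumption about a typical CUDA installation.

```python
# Minimal sketch of dlopen-style runtime loading, the strategy the comment above describes.
import ctypes


def try_load_cufile():
    # "libcufile.so.0" is assumed here; adjust for your CUDA installation.
    try:
        return ctypes.CDLL("libcufile.so.0")
    except OSError:
        return None


cufile = try_load_cufile()
if cufile is None:
    print("cuFile unavailable -- fall back to plain POSIX IO (compatibility mode)")
else:
    print("cuFile loaded at runtime; no link-time dependency was needed")
```
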
target_link_libraries( - kvikio INTERFACE Threads::Threads BS::thread_pool ${CMAKE_DL_LIBS} - $ + kvikio + PUBLIC Threads::Threads BS::thread_pool ${CMAKE_DL_LIBS} $ + PRIVATE $ +) + +target_compile_definitions( + kvikio + PUBLIC $<$:KVIKIO_LIBCURL_FOUND> + $<$:KVIKIO_CUDA_FOUND> + $<$:KVIKIO_CUFILE_FOUND> + $<$:KVIKIO_CUFILE_BATCH_API_FOUND> + $<$:KVIKIO_CUFILE_STREAM_API_FOUND> + $<$:KVIKIO_CUFILE_VERSION_API_FOUND> ) -target_compile_features(kvikio INTERFACE cxx_std_17) + +set_target_properties( + kvikio + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON +) + +# ################################################################################################## +# * add examples ----------------------------------------------------------------------------------- # optionally build examples if(KvikIO_BUILD_EXAMPLES) @@ -159,77 +203,46 @@ if(CUDAToolkit_FOUND add_subdirectory(tests) endif() +# ################################################################################################## +# * install targets -------------------------------------------------------------------------------- + +rapids_cmake_install_lib_dir(lib_dir) include(CPack) +include(GNUInstallDirs) -# install export targets -install(TARGETS kvikio EXPORT kvikio-exports) -install(DIRECTORY include/kvikio/ DESTINATION include/kvikio) -install(FILES ${KvikIO_BINARY_DIR}/include/kvikio/version_config.hpp DESTINATION include/kvikio) +set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME kvikio) -include("${rapids-cmake-dir}/export/find_package_file.cmake") -rapids_export_find_package_file( - BUILD "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" EXPORT_SET kvikio-exports -) -rapids_export_find_package_file( - INSTALL "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" EXPORT_SET kvikio-exports +install( + TARGETS kvikio + DESTINATION ${lib_dir} + EXPORT kvikio-exports ) +install(DIRECTORY include/kvikio/ DESTINATION include/kvikio) +install(FILES ${KvikIO_BINARY_DIR}/include/kvikio/version_config.hpp DESTINATION include/kvikio) + set(doc_string [=[ -Provide targets for KvikIO: C++ bindings for cuFile. +Provide targets for KvikIO. 
]=] ) set(final_code_string - [=[ -get_property(already_set_kvikio DIRECTORY PROPERTY kvikio_already_set_defines SET) -if(NOT already_set_kvikio) - set_property(DIRECTORY PROPERTY kvikio_already_set_defines "ON") - - find_package(CUDAToolkit QUIET) - if(CUDAToolkit_FOUND) - if(CUDA_STATIC_RUNTIME) - target_link_libraries(kvikio::kvikio INTERFACE CUDA::cudart_static) - else() - target_link_libraries(kvikio::kvikio INTERFACE CUDA::cudart) - endif() - target_compile_definitions(kvikio::kvikio INTERFACE KVIKIO_CUDA_FOUND) - else() - message(WARNING "Building KvikIO without CUDA") - endif() - - # Find cuFile and determine which features are supported - find_package(cuFile QUIET) - if(NOT cuFile_FOUND) - message(WARNING "KvikIO: cuFile not found") - else() - file(READ "${cuFile_INCLUDE_DIRS}/cufile.h" CUFILE_H_STR) - string(FIND "${CUFILE_H_STR}" "cuFileBatchIOSetUp" cuFileBatchIOSetUp_location) - if(cuFileBatchIOSetUp_location EQUAL "-1") - set(cuFile_BATCH_API_FOUND FALSE) - else() - set(cuFile_BATCH_API_FOUND TRUE) - endif() - message(STATUS "KvikIO: Found cuFile Batch API: ${cuFile_BATCH_API_FOUND}") - string(FIND "${CUFILE_H_STR}" "cuFileReadAsync" cuFileReadAsync_location) - if(cuFileReadAsync_location EQUAL "-1") - set(cuFile_STREAM_API_FOUND FALSE) - else() - set(cuFile_STREAM_API_FOUND TRUE) - endif() - message(STATUS "KvikIO: Found cuFile Stream API: ${cuFile_STREAM_API_FOUND}") - endif() - - # Enable supported cuFile features in KvikIO - if(cuFile_FOUND) - target_link_libraries(kvikio::kvikio INTERFACE cufile::cuFile_interface) - target_compile_definitions(kvikio::kvikio INTERFACE KVIKIO_CUFILE_FOUND) - if(cuFile_BATCH_API_FOUND) - target_compile_definitions(kvikio::kvikio INTERFACE KVIKIO_CUFILE_BATCH_API_FOUND) - endif() - if(cuFile_STREAM_API_FOUND) - target_compile_definitions(kvikio::kvikio INTERFACE KVIKIO_CUFILE_STREAM_API_FOUND) - endif() + " +set(KvikIO_CUDA_SUPPORT [=[${KvikIO_CUDA_SUPPORT}]=]) +set(KvikIO_CUFILE_SUPPORT [=[${cuFile_FOUND}]=]) +" +) +string( + APPEND + final_code_string + [=[ +if(KvikIO_CUDA_SUPPORT) + find_package(CUDAToolkit REQUIRED QUIET) + target_include_directories(kvikio::kvikio INTERFACE ${CUDAToolkit_INCLUDE_DIRS}) + + if(KvikIO_CUFILE_SUPPORT AND NOT TARGET CUDA::cuFile) + message(FATAL_ERROR "Compiled with cuFile support but cuFile not found") endif() endif() ]=] diff --git a/cpp/cmake/Modules/FindCUDAToolkit.cmake b/cpp/cmake/Modules/FindCUDAToolkit.cmake new file mode 100644 index 0000000000..6f0272aa2d --- /dev/null +++ b/cpp/cmake/Modules/FindCUDAToolkit.cmake @@ -0,0 +1,1437 @@ +# CMake - Cross Platform Makefile Generator +# Copyright 2000-2024 Kitware, Inc. and Contributors +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Kitware, Inc. nor the names of Contributors +# may be used to endorse or promote products derived from this +# software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#[=======================================================================[.rst: +FindCUDAToolkit +--------------- + +.. versionadded:: 3.17 + +This script locates the NVIDIA CUDA toolkit and the associated libraries, but +does not require the ``CUDA`` language be enabled for a given project. This +module does not search for the NVIDIA CUDA Samples. + +.. versionadded:: 3.19 + QNX support. + +Search Behavior +^^^^^^^^^^^^^^^ + +The CUDA Toolkit search behavior uses the following order: + +1. If the ``CUDA`` language has been enabled we will use the directory + containing the compiler as the first search location for ``nvcc``. + +2. If the variable :variable:`CMAKE_CUDA_COMPILER _COMPILER>` or + the environment variable :envvar:`CUDACXX` is defined, it will be used + as the path to the ``nvcc`` executable. + +3. If the ``CUDAToolkit_ROOT`` cmake configuration variable (e.g., + ``-DCUDAToolkit_ROOT=/some/path``) *or* environment variable is defined, it + will be searched. If both an environment variable **and** a + configuration variable are specified, the *configuration* variable takes + precedence. + + The directory specified here must be such that the executable ``nvcc`` or + the appropriate ``version.txt`` or ``version.json`` file can be found + underneath the specified directory. + +4. If the CUDA_PATH environment variable is defined, it will be searched + for ``nvcc``. + +5. The user's path is searched for ``nvcc`` using :command:`find_program`. If + this is found, no subsequent search attempts are performed. Users are + responsible for ensuring that the first ``nvcc`` to show up in the path is + the desired path in the event that multiple CUDA Toolkits are installed. + +6. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is + used. No subsequent search attempts are performed. No default symbolic link + location exists for the Windows platform. + +7. The platform specific default install locations are searched. If exactly one + candidate is found, this is used. 
The default CUDA Toolkit install locations + searched are: + + +-------------+-------------------------------------------------------------+ + | Platform | Search Pattern | + +=============+=============================================================+ + | macOS | ``/Developer/NVIDIA/CUDA-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Other Unix | ``/usr/local/cuda-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Windows | ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y`` | + +-------------+-------------------------------------------------------------+ + + Where ``X.Y`` would be a specific version of the CUDA Toolkit, such as + ``/usr/local/cuda-9.0`` or + ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`` + + .. note:: + + When multiple CUDA Toolkits are installed in the default location of a + system (e.g., both ``/usr/local/cuda-9.0`` and ``/usr/local/cuda-10.0`` + exist but the ``/usr/local/cuda`` symbolic link does **not** exist), this + package is marked as **not** found. + + There are too many factors involved in making an automatic decision in + the presence of multiple CUDA Toolkits being installed. In this + situation, users are encouraged to either (1) set ``CUDAToolkit_ROOT`` or + (2) ensure that the correct ``nvcc`` executable shows up in ``$PATH`` for + :command:`find_program` to find. + +Arguments +^^^^^^^^^ + +``[]`` + The ``[]`` argument requests a version with which the package found + should be compatible. See :ref:`find_package version format ` + for more details. + +Options +^^^^^^^ + +``REQUIRED`` + If specified, configuration will error if a suitable CUDA Toolkit is not + found. + +``QUIET`` + If specified, the search for a suitable CUDA Toolkit will not produce any + messages. + +``EXACT`` + If specified, the CUDA Toolkit is considered found only if the exact + ``VERSION`` specified is recovered. + +Imported targets +^^^^^^^^^^^^^^^^ + +An :ref:`imported target ` named ``CUDA::toolkit`` is provided. + +This module defines :prop_tgt:`IMPORTED` targets for each +of the following libraries that are part of the CUDAToolkit: + +- :ref:`CUDA Runtime Library` +- :ref:`CUDA Driver Library` +- :ref:`cuBLAS` +- :ref:`cuDLA` +- :ref:`cuFile` +- :ref:`cuFFT` +- :ref:`cuRAND` +- :ref:`cuSOLVER` +- :ref:`cuSPARSE` +- :ref:`cuPTI` +- :ref:`NPP` +- :ref:`nvBLAS` +- :ref:`nvGRAPH` +- :ref:`nvJPEG` +- :ref:`nvidia-ML` +- :ref:`nvPTX Compiler` +- :ref:`nvRTC` +- :ref:`nvJitLink` +- :ref:`nvFatBin` +- :ref:`nvToolsExt` +- :ref:`nvtx3` +- :ref:`OpenCL` +- :ref:`cuLIBOS` + +.. _`cuda_toolkit_rt_lib`: + +CUDA Runtime Library +"""""""""""""""""""" + +The CUDA Runtime library (cudart) are what most applications will typically +need to link against to make any calls such as `cudaMalloc`, and `cudaFree`. + +Targets Created: + +- ``CUDA::cudart`` +- ``CUDA::cudart_static`` + +.. _`cuda_toolkit_driver_lib`: + +CUDA Driver Library +"""""""""""""""""""" + +The CUDA Driver library (cuda) are used by applications that use calls +such as `cuMemAlloc`, and `cuMemFree`. + +Targets Created: + +- ``CUDA::cuda_driver`` + +.. _`cuda_toolkit_cuBLAS`: + +cuBLAS +"""""" + +The `cuBLAS `_ library. + +Targets Created: + +- ``CUDA::cublas`` +- ``CUDA::cublas_static`` +- ``CUDA::cublasLt`` starting in CUDA 10.1 +- ``CUDA::cublasLt_static`` starting in CUDA 10.1 + +.. _`cuda_toolkit_cuDLA`: + +cuDLA +"""""" + +.. versionadded:: 3.27 + +The NVIDIA Tegra Deep Learning Accelerator `cuDLA `_ library. 
+ +Targets Created: + +- ``CUDA::cudla`` starting in CUDA 11.6 + +.. _`cuda_toolkit_cuFile`: + +cuFile +"""""" + +.. versionadded:: 3.25 + +The NVIDIA GPUDirect Storage `cuFile `_ library. + +Targets Created: + +- ``CUDA::cuFile`` starting in CUDA 11.4 +- ``CUDA::cuFile_static`` starting in CUDA 11.4 +- ``CUDA::cuFile_rdma`` starting in CUDA 11.4 +- ``CUDA::cuFile_rdma_static`` starting in CUDA 11.4 + +.. _`cuda_toolkit_cuFFT`: + +cuFFT +""""" + +The `cuFFT `_ library. + +Targets Created: + +- ``CUDA::cufft`` +- ``CUDA::cufftw`` +- ``CUDA::cufft_static`` +- ``CUDA::cufft_static_nocallback`` starting in CUDA 9.2, requires CMake 3.23+ +- ``CUDA::cufftw_static`` + +cuRAND +"""""" + +The `cuRAND `_ library. + +Targets Created: + +- ``CUDA::curand`` +- ``CUDA::curand_static`` + +.. _`cuda_toolkit_cuSOLVER`: + +cuSOLVER +"""""""" + +The `cuSOLVER `_ library. + +Targets Created: + +- ``CUDA::cusolver`` +- ``CUDA::cusolver_static`` + +.. _`cuda_toolkit_cuSPARSE`: + +cuSPARSE +"""""""" + +The `cuSPARSE `_ library. + +Targets Created: + +- ``CUDA::cusparse`` +- ``CUDA::cusparse_static`` + +.. _`cuda_toolkit_cupti`: + +cupti +""""" + +The `NVIDIA CUDA Profiling Tools Interface `_. + +Targets Created: + +- ``CUDA::cupti`` +- ``CUDA::cupti_static`` + +.. versionadded:: 3.27 + + - ``CUDA::nvperf_host`` starting in CUDA 10.2 + - ``CUDA::nvperf_host_static`` starting in CUDA 10.2 + - ``CUDA::nvperf_target`` starting in CUDA 10.2 + - ``CUDA::pcsamplingutil`` starting in CUDA 11.3 + +.. _`cuda_toolkit_NPP`: + +NPP +""" + +The `NPP `_ libraries. + +Targets Created: + +- `nppc`: + + - ``CUDA::nppc`` + - ``CUDA::nppc_static`` + +- `nppial`: Arithmetic and logical operation functions in `nppi_arithmetic_and_logical_operations.h` + + - ``CUDA::nppial`` + - ``CUDA::nppial_static`` + +- `nppicc`: Color conversion and sampling functions in `nppi_color_conversion.h` + + - ``CUDA::nppicc`` + - ``CUDA::nppicc_static`` + +- `nppicom`: JPEG compression and decompression functions in `nppi_compression_functions.h` + Removed starting in CUDA 11.0, use :ref:`nvJPEG` instead. + + - ``CUDA::nppicom`` + - ``CUDA::nppicom_static`` + +- `nppidei`: Data exchange and initialization functions in `nppi_data_exchange_and_initialization.h` + + - ``CUDA::nppidei`` + - ``CUDA::nppidei_static`` + +- `nppif`: Filtering and computer vision functions in `nppi_filter_functions.h` + + - ``CUDA::nppif`` + - ``CUDA::nppif_static`` + +- `nppig`: Geometry transformation functions found in `nppi_geometry_transforms.h` + + - ``CUDA::nppig`` + - ``CUDA::nppig_static`` + +- `nppim`: Morphological operation functions found in `nppi_morphological_operations.h` + + - ``CUDA::nppim`` + - ``CUDA::nppim_static`` + +- `nppist`: Statistics and linear transform in `nppi_statistics_functions.h` and `nppi_linear_transforms.h` + + - ``CUDA::nppist`` + - ``CUDA::nppist_static`` + +- `nppisu`: Memory support functions in `nppi_support_functions.h` + + - ``CUDA::nppisu`` + - ``CUDA::nppisu_static`` + +- `nppitc`: Threshold and compare operation functions in `nppi_threshold_and_compare_operations.h` + + - ``CUDA::nppitc`` + - ``CUDA::nppitc_static`` + +- `npps`: + + - ``CUDA::npps`` + - ``CUDA::npps_static`` + +.. _`cuda_toolkit_nvBLAS`: + +nvBLAS +"""""" + +The `nvBLAS `_ libraries. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvblas`` + +.. _`cuda_toolkit_nvGRAPH`: + +nvGRAPH +""""""" + +The `nvGRAPH `_ library. +Removed starting in CUDA 11.0 + +Targets Created: + +- ``CUDA::nvgraph`` +- ``CUDA::nvgraph_static`` + + +.. 
_`cuda_toolkit_nvJPEG`: + +nvJPEG +"""""" + +The `nvJPEG `_ library. +Introduced in CUDA 10. + +Targets Created: + +- ``CUDA::nvjpeg`` +- ``CUDA::nvjpeg_static`` + +.. _`cuda_toolkit_nvPTX`: + +nvPTX Compiler +"""""""""""""" + +.. versionadded:: 3.25 + +The `nvPTX `_ (PTX Compilation) library. +The PTX Compiler APIs are a set of APIs which can be used to compile a PTX program into GPU assembly code. +Introduced in CUDA 11.1 +This is a static library only. + +Targets Created: + +- ``CUDA::nvptxcompiler_static`` starting in CUDA 11.1 + +.. _`cuda_toolkit_nvRTC`: + +nvRTC +""""" + +The `nvRTC `_ (Runtime Compilation) library. + +Targets Created: + +- ``CUDA::nvrtc`` + +.. versionadded:: 3.26 + + - ``CUDA::nvrtc_builtins`` + - ``CUDA::nvrtc_static`` starting in CUDA 11.5 + - ``CUDA::nvrtc_builtins_static`` starting in CUDA 11.5 + +.. _`cuda_toolkit_nvjitlink`: + +nvJitLink +""""""""" + +The `nvJItLink `_ (Runtime LTO Linking) library. + +Targets Created: + +- ``CUDA::nvJitLink`` starting in CUDA 12.0 +- ``CUDA::nvJitLink_static`` starting in CUDA 12.0 + +.. _`cuda_toolkit_nvfatbin`: + +nvFatBin +""""""""" + +.. versionadded:: 3.30 + +The `nvFatBin `_ (Runtime fatbin creation) library. + +Targets Created: + +- ``CUDA::nvfatbin`` starting in CUDA 12.4 +- ``CUDA::nvfatbin_static`` starting in CUDA 12.4 + +.. _`cuda_toolkit_nvml`: + +nvidia-ML +""""""""" + +The `NVIDIA Management Library `_. + +Targets Created: + +- ``CUDA::nvml`` +- ``CUDA::nvml_static`` starting in CUDA 12.4 + +.. versionadded:: 3.31 + Added ``CUDA::nvml_static``. + +.. _`cuda_toolkit_nvToolsExt`: + +nvToolsExt +"""""""""" + +.. deprecated:: 3.25 With CUDA 10.0+, use :ref:`nvtx3 `. + +The `NVIDIA Tools Extension `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvToolsExt`` + +.. _`cuda_toolkit_nvtx3`: + +nvtx3 +""""" + +.. versionadded:: 3.25 + +The header-only `NVIDIA Tools Extension Library `_. +Introduced in CUDA 10.0. + +Targets created: + +- ``CUDA::nvtx3`` + +.. _`cuda_toolkit_opencl`: + +OpenCL +"""""" + +The `NVIDIA OpenCL Library `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::OpenCL`` + +.. _`cuda_toolkit_cuLIBOS`: + +cuLIBOS +""""""" + +The cuLIBOS library is a backend thread abstraction layer library which is +static only. The ``CUDA::cublas_static``, ``CUDA::cusparse_static``, +``CUDA::cufft_static``, ``CUDA::curand_static``, and (when implemented) NPP +libraries all automatically have this dependency linked. + +Target Created: + +- ``CUDA::culibos`` + +**Note**: direct usage of this target by consumers should not be necessary. + +.. _`cuda_toolkit_cuRAND`: + + + +Result variables +^^^^^^^^^^^^^^^^ + +``CUDAToolkit_FOUND`` + A boolean specifying whether or not the CUDA Toolkit was found. + +``CUDAToolkit_VERSION`` + The exact version of the CUDA Toolkit found (as reported by + ``nvcc --version``, ``version.txt``, or ``version.json``). + +``CUDAToolkit_VERSION_MAJOR`` + The major version of the CUDA Toolkit. + +``CUDAToolkit_VERSION_MINOR`` + The minor version of the CUDA Toolkit. + +``CUDAToolkit_VERSION_PATCH`` + The patch version of the CUDA Toolkit. + +``CUDAToolkit_BIN_DIR`` + The path to the CUDA Toolkit library directory that contains the CUDA + executable ``nvcc``. + +``CUDAToolkit_INCLUDE_DIRS`` + List of paths to all the CUDA Toolkit folders containing header files + required to compile a project linking against CUDA. + +``CUDAToolkit_LIBRARY_DIR`` + The path to the CUDA Toolkit library directory that contains the CUDA + Runtime library ``cudart``. 
+ +``CUDAToolkit_LIBRARY_ROOT`` + .. versionadded:: 3.18 + + The path to the CUDA Toolkit directory containing the nvvm directory and + either version.txt or version.json. + +``CUDAToolkit_TARGET_DIR`` + The path to the CUDA Toolkit directory including the target architecture + when cross-compiling. When not cross-compiling this will be equivalent to + the parent directory of ``CUDAToolkit_BIN_DIR``. + +``CUDAToolkit_NVCC_EXECUTABLE`` + The path to the NVIDIA CUDA compiler ``nvcc``. Note that this path may + **not** be the same as + :variable:`CMAKE_CUDA_COMPILER _COMPILER>`. ``nvcc`` must be + found to determine the CUDA Toolkit version as well as determining other + features of the Toolkit. This variable is set for the convenience of + modules that depend on this one. + + +#]=======================================================================] + +# NOTE: much of this was simply extracted from FindCUDA.cmake. + +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# Copyright (c) 2007-2009 +# Scientific Computing and Imaging Institute, University of Utah +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +############################################################################### + +function(_CUDAToolkit_build_include_dirs result_variable default_paths_variable) + set(content "${${default_paths_variable}}") + set(${result_variable} "${content}" PARENT_SCOPE) +endfunction() + +function(_CUDAToolkit_build_library_dirs result_variable default_paths_variable) + set(content "${${default_paths_variable}}") + set(${result_variable} "${content}" PARENT_SCOPE) +endfunction() + +# The toolkit is located during compiler detection for CUDA and stored in CMakeCUDACompiler.cmake as +# - CMAKE_CUDA_COMPILER_TOOLKIT_ROOT +# - CMAKE_CUDA_COMPILER_LIBRARY_ROOT +# - CMAKE_CUDA_COMPILER_LIBRARY_DIRECTORIES_FROM_IMPLICIT_LIBRARIES +# - CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES +# We compute the rest based on those here to avoid re-searching and to avoid finding a possibly +# different installation. 
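For orientation, the imported targets and result variables documented above are consumed from a project's own CMakeLists.txt roughly as in the following sketch (the `my_app` target and the runtime switch are illustrative assumptions, not taken from this diff; the `CUDA_STATIC_RUNTIME` branch simply mirrors the one the previous kvikio-config code used):

find_package(CUDAToolkit REQUIRED)  # provides CUDA::toolkit, CUDA::cudart, CUDA::cudart_static, ...

add_executable(my_app main.cpp)

# Pick the static or shared CUDA runtime (same switch as the old kvikio config code above).
if(CUDA_STATIC_RUNTIME)
  target_link_libraries(my_app PRIVATE CUDA::cudart_static)
else()
  target_link_libraries(my_app PRIVATE CUDA::cudart)
endif()

message(STATUS "CUDA Toolkit ${CUDAToolkit_VERSION} (nvcc in ${CUDAToolkit_BIN_DIR})")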
+if(CMAKE_CUDA_COMPILER_TOOLKIT_ROOT) + set(CUDAToolkit_ROOT_DIR "${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}") + set(CUDAToolkit_LIBRARY_ROOT "${CMAKE_CUDA_COMPILER_LIBRARY_ROOT}") + _CUDAToolkit_build_library_dirs(CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES) + _CUDAToolkit_build_include_dirs(CUDAToolkit_INCLUDE_DIRECTORIES CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES) + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_ROOT_DIR}/bin") + set(CUDAToolkit_NVCC_EXECUTABLE "${CUDAToolkit_BIN_DIR}/nvcc${CMAKE_EXECUTABLE_SUFFIX}") + set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_TOOLKIT_VERSION}") + + if(CUDAToolkit_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + endif() +else() + function(_CUDAToolkit_find_root_dir ) + cmake_parse_arguments(arg "COMPILER_PATHS" "" "SEARCH_PATHS;FIND_FLAGS" ${ARGN}) + + if(NOT CUDAToolkit_BIN_DIR) + if(arg_COMPILER_PATHS) + # need to find parent dir, since this could clang and not nvcc + if(EXISTS "${CMAKE_CUDA_COMPILER}") + get_filename_component(possible_nvcc_path "${CMAKE_CUDA_COMPILER}" PROGRAM PROGRAM_ARGS CUDAToolkit_compiler_args) + get_filename_component(possible_nvcc_path "${possible_nvcc_path}" DIRECTORY) + elseif(EXISTS "$ENV{CUDACXX}") + get_filename_component(possible_nvcc_path "$ENV{CUDACXX}" PROGRAM PROGRAM_ARGS CUDAToolkit_compiler_args) + get_filename_component(possible_nvcc_path "${possible_nvcc_path}" DIRECTORY) + endif() + if(possible_nvcc_path) + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + NO_DEFAULT_PATH + PATHS ${possible_nvcc_path} + ) + endif() + endif() + + if(NOT CUDAToolkit_SENTINEL_FILE) + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ${arg_SEARCH_PATHS} + ${arg_FIND_FLAGS} + ) + endif() + + if(NOT CUDAToolkit_NVCC_EXECUTABLE) + find_file(CUDAToolkit_SENTINEL_FILE + NAMES version.txt version.json + PATHS ${arg_SEARCH_PATHS} + NO_DEFAULT_PATH + ) + endif() + + if(EXISTS "${CUDAToolkit_NVCC_EXECUTABLE}") + # If NVCC exists then invoke it to find the toolkit location. + # This allows us to support wrapper scripts (e.g. 
ccache or colornvcc), CUDA Toolkit, + # NVIDIA HPC SDK, and distro's splayed layouts + execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "-v" "__cmake_determine_cuda" + OUTPUT_VARIABLE _CUDA_NVCC_OUT ERROR_VARIABLE _CUDA_NVCC_OUT) + message(CONFIGURE_LOG + "Executed nvcc to extract CUDAToolkit information:\n${_CUDA_NVCC_OUT}\n\n") + if(_CUDA_NVCC_OUT MATCHES "\\#\\$ TOP=([^\r\n]*)") + get_filename_component(CUDAToolkit_BIN_DIR "${CMAKE_MATCH_1}/bin" ABSOLUTE) + message(CONFIGURE_LOG + "Parsed CUDAToolkit nvcc location:\n${CUDAToolkit_BIN_DIR}\n\n") + else() + get_filename_component(CUDAToolkit_BIN_DIR "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY) + endif() + if(_CUDA_NVCC_OUT MATCHES "\\#\\$ INCLUDES=([^\r\n]*)") + separate_arguments(_nvcc_output NATIVE_COMMAND "${CMAKE_MATCH_1}") + foreach(line IN LISTS _nvcc_output) + string(REGEX REPLACE "^-I" "" line "${line}") + get_filename_component(line "${line}" ABSOLUTE) + list(APPEND _cmake_CUDAToolkit_include_directories "${line}") + endforeach() + message(CONFIGURE_LOG + "Parsed CUDAToolkit nvcc implicit include information:\n${_cmake_CUDAToolkit_include_directories}\n\n") + + set(_cmake_CUDAToolkit_include_directories "${_cmake_CUDAToolkit_include_directories}" CACHE INTERNAL "CUDAToolkit internal list of include directories") + endif() + if(_CUDA_NVCC_OUT MATCHES "\\#\\$ LIBRARIES=([^\r\n]*)") + include(${CMAKE_ROOT}/Modules/CMakeParseImplicitLinkInfo.cmake) + set(_nvcc_link_line "cuda-fake-ld ${CMAKE_MATCH_1}") + CMAKE_PARSE_IMPLICIT_LINK_INFO("${_nvcc_link_line}" + _cmake_CUDAToolkit_implicit_link_libs + _cmake_CUDAToolkit_implicit_link_directories + _cmake_CUDAToolkit_implicit_frameworks + _nvcc_log + "${CMAKE_CUDA_IMPLICIT_OBJECT_REGEX}" + LANGUAGE CUDA) + message(CONFIGURE_LOG + "Parsed CUDAToolkit nvcc implicit link information:\n${_nvcc_log}\n${_cmake_CUDAToolkit_implicit_link_directories}\n\n") + unset(_nvcc_link_line) + unset(_cmake_CUDAToolkit_implicit_link_libs) + unset(_cmake_CUDAToolkit_implicit_frameworks) + + set(_cmake_CUDAToolkit_implicit_link_directories "${_cmake_CUDAToolkit_implicit_link_directories}" CACHE INTERNAL "CUDAToolkit internal list of implicit link directories") + endif() + unset(_CUDA_NVCC_OUT) + + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "" FORCE) + mark_as_advanced(CUDAToolkit_BIN_DIR) + endif() + + if(CUDAToolkit_SENTINEL_FILE) + get_filename_component(CUDAToolkit_BIN_DIR ${CUDAToolkit_SENTINEL_FILE} DIRECTORY ABSOLUTE) + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}/bin") + + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "" FORCE) + mark_as_advanced(CUDAToolkit_BIN_DIR) + endif() + endif() + + if(DEFINED _cmake_CUDAToolkit_include_directories) + _CUDAToolkit_build_include_dirs(_cmake_CUDAToolkit_contents _cmake_CUDAToolkit_include_directories) + set(CUDAToolkit_INCLUDE_DIRECTORIES "${_cmake_CUDAToolkit_contents}" PARENT_SCOPE) + endif() + if(DEFINED _cmake_CUDAToolkit_implicit_link_directories) + _CUDAToolkit_build_library_dirs(_cmake_CUDAToolkit_contents _cmake_CUDAToolkit_implicit_link_directories) + set(CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES "${_cmake_CUDAToolkit_contents}" PARENT_SCOPE) + endif() + + if(CUDAToolkit_BIN_DIR) + get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) + set(CUDAToolkit_ROOT_DIR "${CUDAToolkit_ROOT_DIR}" PARENT_SCOPE) + endif() + + endfunction() + + function(_CUDAToolkit_find_version_file result_variable) + # We first check for a non-scattered installation to prefer it over a scattered installation. 
+ set(version_files version.txt version.json) + foreach(vf IN LISTS version_files) + if(CUDAToolkit_ROOT AND EXISTS "${CUDAToolkit_ROOT}/${vf}") + set(${result_variable} "${CUDAToolkit_ROOT}/${vf}" PARENT_SCOPE) + break() + elseif(CUDAToolkit_ROOT_DIR AND EXISTS "${CUDAToolkit_ROOT_DIR}/${vf}") + set(${result_variable} "${CUDAToolkit_ROOT_DIR}/${vf}" PARENT_SCOPE) + break() + elseif(CMAKE_SYSROOT_LINK AND EXISTS "${CMAKE_SYSROOT_LINK}/usr/lib/cuda/${vf}") + set(${result_variable} "${CMAKE_SYSROOT_LINK}/usr/lib/cuda/${vf}" PARENT_SCOPE) + break() + elseif(EXISTS "${CMAKE_SYSROOT}/usr/lib/cuda/${vf}") + set(${result_variable} "${CMAKE_SYSROOT}/usr/lib/cuda/${vf}" PARENT_SCOPE) + break() + endif() + endforeach() + endfunction() + + function(_CUDAToolkit_parse_version_file version_file) + if(version_file) + file(READ "${version_file}" file_conents) + cmake_path(GET version_file EXTENSION LAST_ONLY version_ext) + if(version_ext STREQUAL ".json") + string(JSON cuda_version_info GET "${file_conents}" "cuda" "version") + set(cuda_version_match_regex [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + elseif(version_ext STREQUAL ".txt") + set(cuda_version_info "${file_conents}") + set(cuda_version_match_regex [=[CUDA Version ([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + endif() + + if(cuda_version_info MATCHES "${cuda_version_match_regex}") + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}" PARENT_SCOPE) + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}" PARENT_SCOPE) + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}" PARENT_SCOPE) + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}" PARENT_SCOPE) + endif() + endif() + endfunction() + + # For NVCC we can easily deduce the SDK binary directory from the compiler path. + if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR AND CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") + get_filename_component(CUDAToolkit_BIN_DIR "${CMAKE_CUDA_COMPILER}" DIRECTORY) + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "") + # Try language provided path first. + _CUDAToolkit_find_root_dir(SEARCH_PATHS "${CUDAToolkit_BIN_DIR}" FIND_FLAGS NO_DEFAULT_PATH) + mark_as_advanced(CUDAToolkit_BIN_DIR) + endif() + + # Try user provided path + _CUDAToolkit_find_root_dir(COMPILER_PATHS) + if(NOT CUDAToolkit_ROOT_DIR AND CUDAToolkit_ROOT) + _CUDAToolkit_find_root_dir(SEARCH_PATHS "${CUDAToolkit_ROOT}" FIND_FLAGS PATH_SUFFIXES bin NO_DEFAULT_PATH) + endif() + if(NOT CUDAToolkit_ROOT_DIR) + _CUDAToolkit_find_root_dir(FIND_FLAGS PATHS ENV CUDA_PATH PATH_SUFFIXES bin) + endif() + + # If the user specified CUDAToolkit_ROOT but the toolkit could not be found, this is an error. + if(NOT CUDAToolkit_ROOT_DIR AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) + # Declare error messages now, print later depending on find_package args. 
+ set(fail_base "Could not find nvcc executable in path specified by") + set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") + set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}") + + if(CUDAToolkit_FIND_REQUIRED) + if(DEFINED CUDAToolkit_ROOT) + message(FATAL_ERROR ${cuda_root_fail}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) + message(FATAL_ERROR ${env_cuda_root_fail}) + endif() + else() + if(NOT CUDAToolkit_FIND_QUIETLY) + if(DEFINED CUDAToolkit_ROOT) + message(STATUS ${cuda_root_fail}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) + message(STATUS ${env_cuda_root_fail}) + endif() + endif() + set(CUDAToolkit_FOUND FALSE) + unset(fail_base) + unset(cuda_root_fail) + unset(env_cuda_root_fail) + return() + endif() + endif() + + # CUDAToolkit_ROOT cmake / env variable not specified, try platform defaults. + # + # - Linux: /usr/local/cuda-X.Y + # - macOS: /Developer/NVIDIA/CUDA-X.Y + # - Windows: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y + # + # We will also search the default symlink location /usr/local/cuda first since + # if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked + # directory is the desired location. + if(NOT CUDAToolkit_ROOT_DIR) + if(UNIX) + if(NOT APPLE) + set(platform_base "/usr/local/cuda-") + else() + set(platform_base "/Developer/NVIDIA/CUDA-") + endif() + else() + set(platform_base "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v") + endif() + + # Build out a descending list of possible cuda installations, e.g. + file(GLOB possible_paths "${platform_base}*") + # Iterate the glob results and create a descending list. + set(versions) + foreach(p ${possible_paths}) + # Extract version number from end of string + string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p}) + if(IS_DIRECTORY ${p} AND p_version) + list(APPEND versions ${p_version}) + endif() + endforeach() + + # Sort numerically in descending order, so we try the newest versions first. + list(SORT versions COMPARE NATURAL ORDER DESCENDING) + + # With a descending list of versions, populate possible paths to search. + set(search_paths) + foreach(v ${versions}) + list(APPEND search_paths "${platform_base}${v}") + endforeach() + + # Force the global default /usr/local/cuda to the front on Unix. + if(UNIX) + list(INSERT search_paths 0 "/usr/local/cuda") + endif() + + # Now search for the toolkit again using the platform default search paths. + _CUDAToolkit_find_root_dir(SEARCH_PATHS "${search_paths}" FIND_FLAGS PATH_SUFFIXES bin) + + # We are done with these variables now, cleanup for caller. + unset(platform_base) + unset(possible_paths) + unset(versions) + unset(search_paths) + + if(NOT CUDAToolkit_ROOT_DIR) + if(CUDAToolkit_FIND_REQUIRED) + message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.") + elseif(NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.") + endif() + + set(CUDAToolkit_FOUND FALSE) + return() + endif() + endif() + + _CUDAToolkit_find_version_file( _CUDAToolkit_version_file ) + if(_CUDAToolkit_version_file) + # CUDAToolkit_LIBRARY_ROOT contains the device library and version file. 
+ get_filename_component(CUDAToolkit_LIBRARY_ROOT "${_CUDAToolkit_version_file}" DIRECTORY ABSOLUTE) + endif() + unset(_CUDAToolkit_version_file) + + if(CUDAToolkit_NVCC_EXECUTABLE AND + CMAKE_CUDA_COMPILER_VERSION AND + CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) + # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value + # This if statement will always match, but is used to provide variables for MATCH 1,2,3... + if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_VERSION}") + endif() + elseif(CUDAToolkit_NVCC_EXECUTABLE) + # Compute the version by invoking nvcc + execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + endif() + unset(NVCC_OUT) + else() + _CUDAToolkit_find_version_file(version_file) + _CUDAToolkit_parse_version_file("${version_file}") + endif() +endif() + +# Find target directory when crosscompiling. +if(CMAKE_CROSSCOMPILING) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") + # Support for NVPACK + set(CUDAToolkit_TARGET_NAMES "armv7-linux-androideabi") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") + set(CUDAToolkit_TARGET_NAMES "armv7-linux-gnueabihf") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + if(ANDROID_ARCH_NAME STREQUAL "arm64") + set(CUDAToolkit_TARGET_NAMES "aarch64-linux-androideabi") + elseif (CMAKE_SYSTEM_NAME STREQUAL "QNX") + set(CUDAToolkit_TARGET_NAMES "aarch64-qnx") + else() + set(CUDAToolkit_TARGET_NAMES "aarch64-linux" "sbsa-linux") + endif() + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(CUDAToolkit_TARGET_NAMES "x86_64-linux") + endif() + + foreach(CUDAToolkit_TARGET_NAME IN LISTS CUDAToolkit_TARGET_NAMES) + if(EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + # add known CUDA target root path to the set of directories we search for programs, libraries and headers + list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}") + + # Mark that we need to pop the root search path changes after we have + # found all cuda libraries so that searches for our cross-compilation + # libraries work when another cuda sdk is in CMAKE_PREFIX_PATH or + # PATh + set(_CUDAToolkit_Pop_ROOT_PATH True) + break() + endif() + endforeach() +endif() + +# Determine windows search path suffix for libraries +if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") + if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64") + set(_CUDAToolkit_win_search_dirs lib/x64) + set(_CUDAToolkit_win_stub_search_dirs lib/x64/stubs) + endif() +endif() + +# If not already set we can simply use the toolkit root or it's a scattered installation. +if(NOT CUDAToolkit_TARGET_DIR) + # Not cross compiling + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}") + # Now that we have the real ROOT_DIR, find components inside it. + list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR}) + + # Mark that we need to pop the prefix path changes after we have + # found the cudart library. 
+ set(_CUDAToolkit_Pop_Prefix True) +endif() + + +# We don't need to verify the cuda_runtime header when we are using `nvcc` include paths +# as the compiler being enabled means the header was found +if(NOT CUDAToolkit_INCLUDE_DIRECTORIES) + # Otherwise use CUDAToolkit_TARGET_DIR to guess where the `cuda_runtime.h` is located + # On a scattered installation /usr, on a non-scattered something like /usr/local/cuda or /usr/local/cuda-10.2/targets/aarch64-linux. + if(EXISTS "${CUDAToolkit_TARGET_DIR}/include/cuda_runtime.h") + set(CUDAToolkit_INCLUDE_DIRECTORIES "${CUDAToolkit_TARGET_DIR}/include") + else() + message(STATUS "Unable to find cuda_runtime.h in \"${CUDAToolkit_TARGET_DIR}/include\" for CUDAToolkit_INCLUDE_DIRECTORIES.") + endif() +endif() + +# The NVHPC layout moves math library headers and libraries to a sibling directory and it could be nested under +# the version of the CUDA toolchain +# Create a separate variable so this directory can be selectively added to math targets. +find_path(CUDAToolkit_CUBLAS_INCLUDE_DIR cublas_v2.h PATHS + ${CUDAToolkit_INCLUDE_DIRECTORIES} + NO_DEFAULT_PATH) + +if(NOT CUDAToolkit_CUBLAS_INCLUDE_DIR) + file(REAL_PATH "${CUDAToolkit_TARGET_DIR}" CUDAToolkit_MATH_INCLUDE_DIR) + cmake_path(APPEND CUDAToolkit_MATH_INCLUDE_DIR "../../math_libs/") + if(EXISTS "${CUDAToolkit_MATH_INCLUDE_DIR}/${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/") + cmake_path(APPEND CUDAToolkit_MATH_INCLUDE_DIR "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/") + endif() + cmake_path(APPEND CUDAToolkit_MATH_INCLUDE_DIR "include") + cmake_path(NORMAL_PATH CUDAToolkit_MATH_INCLUDE_DIR) + + find_path(CUDAToolkit_CUBLAS_INCLUDE_DIR cublas_v2.h PATHS + ${CUDAToolkit_INCLUDE_DIRECTORIES} + ) + if(CUDAToolkit_CUBLAS_INCLUDE_DIR) + list(APPEND CUDAToolkit_INCLUDE_DIRECTORIES "${CUDAToolkit_CUBLAS_INCLUDE_DIR}") + endif() +endif() +unset(CUDAToolkit_CUBLAS_INCLUDE_DIR CACHE) +unset(CUDAToolkit_CUBLAS_INCLUDE_DIR) + +# Find the CUDA Runtime Library libcudart +find_library(CUDA_CUDART + NAMES cudart + PATHS ${CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES} + PATH_SUFFIXES lib64 ${_CUDAToolkit_win_search_dirs} +) +find_library(CUDA_CUDART + NAMES cudart + PATHS ${CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES} + PATH_SUFFIXES lib64/stubs ${_CUDAToolkit_win_stub_search_dirs} lib/stubs stubs +) + +if(NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Unable to find cudart library.") +endif() + +if(_CUDAToolkit_Pop_Prefix) + list(REMOVE_AT CMAKE_PREFIX_PATH -1) + unset(_CUDAToolkit_Pop_Prefix) +endif() + +#----------------------------------------------------------------------------- +# Perform version comparison and validate all required variables are set. 
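The `find_package_handle_standard_args` call that follows is what enforces a consumer's version constraint against `CUDAToolkit_VERSION`. A hedged consumer-side sketch (the 11.4 minimum is an arbitrary example, not a requirement stated anywhere in this diff):

find_package(CUDAToolkit 11.4 REQUIRED)  # configuration fails if only an older toolkit is found

# The result variables documented above are available alongside the imported targets.
message(STATUS "nvcc:     ${CUDAToolkit_NVCC_EXECUTABLE}")
message(STATUS "includes: ${CUDAToolkit_INCLUDE_DIRS}")
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0)
  message(STATUS "nvJitLink targets (CUDA::nvJitLink) should be defined")
endif()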
+include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) +find_package_handle_standard_args(CUDAToolkit + REQUIRED_VARS + CUDAToolkit_INCLUDE_DIRECTORIES + CUDA_CUDART + CUDAToolkit_BIN_DIR + VERSION_VAR + CUDAToolkit_VERSION +) + +unset(CUDAToolkit_ROOT_DIR) +mark_as_advanced(CUDA_CUDART + CUDAToolkit_NVCC_EXECUTABLE + CUDAToolkit_SENTINEL_FILE + ) + +#----------------------------------------------------------------------------- +# Construct result variables +if(CUDAToolkit_FOUND) + set(CUDAToolkit_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRECTORIES}") + get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) + + # Build search paths without any symlinks + file(REAL_PATH "${CUDAToolkit_LIBRARY_DIR}" _cmake_search_dir) + set(CUDAToolkit_LIBRARY_SEARCH_DIRS "${_cmake_search_dir}") + + # Detect we are in a splayed nvhpc toolkit layout and add extra + # search paths without symlinks + if(CUDAToolkit_LIBRARY_DIR MATCHES ".*/cuda/${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/lib64$") + # Search location for math_libs/ + block(SCOPE_FOR POLICIES) + cmake_policy(SET CMP0152 NEW) + file(REAL_PATH "${CUDAToolkit_LIBRARY_DIR}/../../../../../" _cmake_search_dir) + list(APPEND CUDAToolkit_LIBRARY_SEARCH_DIRS "${_cmake_search_dir}") + + # Search location for extras like cupti + file(REAL_PATH "${CUDAToolkit_LIBRARY_DIR}/../../../" _cmake_search_dir) + list(APPEND CUDAToolkit_LIBRARY_SEARCH_DIRS "${_cmake_search_dir}") + endblock() + endif() + + if(DEFINED CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES) + list(APPEND CUDAToolkit_LIBRARY_SEARCH_DIRS "${CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES}") + endif() + + # If no `CUDAToolkit_LIBRARY_ROOT` exists set it based on CUDAToolkit_LIBRARY_DIR + if(NOT DEFINED CUDAToolkit_LIBRARY_ROOT) + foreach(CUDAToolkit_search_loc IN LISTS CUDAToolkit_LIBRARY_DIR CUDAToolkit_BIN_DIR) + get_filename_component(CUDAToolkit_possible_lib_root "${CUDAToolkit_search_loc}" DIRECTORY ABSOLUTE) + if(EXISTS "${CUDAToolkit_possible_lib_root}/nvvm/") + set(CUDAToolkit_LIBRARY_ROOT "${CUDAToolkit_possible_lib_root}") + break() + endif() + endforeach() + unset(CUDAToolkit_search_loc) + unset(CUDAToolkit_possible_lib_root) + endif() +else() + # clear cache results when we fail + unset(_cmake_CUDAToolkit_implicit_link_directories CACHE) + unset(_cmake_CUDAToolkit_include_directories CACHE) + unset(CUDA_CUDART CACHE) + unset(CUDAToolkit_BIN_DIR CACHE) + unset(CUDAToolkit_NVCC_EXECUTABLE CACHE) + unset(CUDAToolkit_SENTINEL_FILE CACHE) +endif() +unset(CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES) +unset(CUDAToolkit_INCLUDE_DIRECTORIES) + +#----------------------------------------------------------------------------- +# Construct import targets +if(CUDAToolkit_FOUND) + + function(_CUDAToolkit_find_and_add_import_lib lib_name) + cmake_parse_arguments(arg "" "" "ALT;DEPS;EXTRA_PATH_SUFFIXES;EXTRA_INCLUDE_DIRS;ONLY_SEARCH_FOR" ${ARGN}) + + if(arg_ONLY_SEARCH_FOR) + set(search_names ${arg_ONLY_SEARCH_FOR}) + else() + set(search_names ${lib_name} ${arg_ALT}) + endif() + + find_library(CUDA_${lib_name}_LIBRARY + NAMES ${search_names} + HINTS ${CUDAToolkit_LIBRARY_SEARCH_DIRS} + ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 ${_CUDAToolkit_win_search_dirs} lib + # Support NVHPC splayed math library layout + math_libs/${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/lib64 + math_libs/lib64 + ${arg_EXTRA_PATH_SUFFIXES} + ) + # Don't try any stub directories until we have exhausted all other + # search locations. 
+ set(CUDA_IMPORT_PROPERTY IMPORTED_LOCATION) + set(CUDA_IMPORT_TYPE UNKNOWN) + if(NOT CUDA_${lib_name}_LIBRARY) + find_library(CUDA_${lib_name}_LIBRARY + NAMES ${search_names} + HINTS ${CUDAToolkit_LIBRARY_SEARCH_DIRS} + ENV CUDA_PATH + PATH_SUFFIXES lib64/stubs ${_CUDAToolkit_win_stub_search_dirs} lib/stubs stubs + ) + endif() + if(CUDA_${lib_name}_LIBRARY MATCHES "/stubs/" AND NOT CUDA_${lib_name}_LIBRARY MATCHES "\\.a$" AND NOT WIN32) + # Use a SHARED library with IMPORTED_IMPLIB, but not IMPORTED_LOCATION, + # to indicate that the stub is for linkers but not dynamic loaders. + # It will not contribute any RPATH entry. When encountered as + # a private transitive dependency of another shared library, + # it will be passed explicitly to linkers so they can find it + # even when the runtime library file does not exist on disk. + set(CUDA_IMPORT_PROPERTY IMPORTED_IMPLIB) + set(CUDA_IMPORT_TYPE SHARED) + endif() + + mark_as_advanced(CUDA_${lib_name}_LIBRARY) + + if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + add_library(CUDA::${lib_name} ${CUDA_IMPORT_TYPE} IMPORTED) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + if(DEFINED CUDAToolkit_MATH_INCLUDE_DIR) + string(FIND ${CUDA_${lib_name}_LIBRARY} "math_libs" math_libs) + if(NOT ${math_libs} EQUAL -1) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_MATH_INCLUDE_DIR}") + endif() + endif() + set_property(TARGET CUDA::${lib_name} PROPERTY ${CUDA_IMPORT_PROPERTY} "${CUDA_${lib_name}_LIBRARY}") + foreach(dep ${arg_DEPS}) + if(TARGET CUDA::${dep}) + target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) + endif() + endforeach() + if(arg_EXTRA_INCLUDE_DIRS) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${arg_EXTRA_INCLUDE_DIRS}") + endif() + endif() + endfunction() + + if(NOT TARGET CUDA::toolkit) + add_library(CUDA::toolkit IMPORTED INTERFACE) + target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + endif() + + # setup dependencies that are required for cudart/cudart_static when building + # on linux. These are generally only required when using the CUDA toolkit + # when CUDA language is disabled + if(NOT TARGET CUDA::cudart_static_deps) + add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) + if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER)) + find_package(Threads REQUIRED) + target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) + endif() + + if(UNIX AND NOT APPLE AND NOT (CMAKE_SYSTEM_NAME STREQUAL "QNX")) + # On Linux, you must link against librt when using the static cuda runtime. 
+ find_library(CUDAToolkit_rt_LIBRARY rt) + mark_as_advanced(CUDAToolkit_rt_LIBRARY) + if(NOT CUDAToolkit_rt_LIBRARY) + message(WARNING "Could not find librt library, needed by CUDA::cudart_static") + else() + target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY}) + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda DEPS cudart_static_deps) + _CUDAToolkit_find_and_add_import_lib(cudart DEPS cudart_static_deps) + _CUDAToolkit_find_and_add_import_lib(cudart_static DEPS cudart_static_deps) + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0.0) + _CUDAToolkit_find_and_add_import_lib(nvJitLink) + _CUDAToolkit_find_and_add_import_lib(nvJitLink_static DEPS cudart_static_deps) + endif() + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.4.0) + _CUDAToolkit_find_and_add_import_lib(nvfatbin DEPS cudart_static_deps) + _CUDAToolkit_find_and_add_import_lib(nvfatbin_static DEPS cudart_static_deps) + endif() + + _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library + foreach (cuda_lib cublasLt cufft nvjpeg) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS cudart_static_deps culibos) + endforeach() + foreach (cuda_lib curand nppc) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) + endforeach() + + _CUDAToolkit_find_and_add_import_lib(cusparse DEPS nvJitLink) + _CUDAToolkit_find_and_add_import_lib(cusparse_static DEPS nvJitLink_static culibos) + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.0.0) + # cublas depends on cublasLt + # https://docs.nvidia.com/cuda/archive/11.0/cublas#static-library + _CUDAToolkit_find_and_add_import_lib(cublas DEPS cublasLt culibos) + _CUDAToolkit_find_and_add_import_lib(cublas_static DEPS cublasLt_static culibos) + else() + _CUDAToolkit_find_and_add_import_lib(cublas DEPS culibos) + _CUDAToolkit_find_and_add_import_lib(cublas_static DEPS culibos) + endif() + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.4) + _CUDAToolkit_find_and_add_import_lib(cuFile ALT cufile DEPS culibos) + _CUDAToolkit_find_and_add_import_lib(cuFile_static ALT cufile_static DEPS culibos) + + _CUDAToolkit_find_and_add_import_lib(cuFile_rdma ALT cufile_rdma DEPS cuFile culibos) + _CUDAToolkit_find_and_add_import_lib(cuFile_rdma_static ALT cufile_rdma_static DEPS cuFile_static culibos) + endif() + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.6) + _CUDAToolkit_find_and_add_import_lib(cudla) + endif() + + + # cuFFTW depends on cuFFT + _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft) + _CUDAToolkit_find_and_add_import_lib(cufftw_static DEPS cufft_static) + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 9.2) + _CUDAToolkit_find_and_add_import_lib(cufft_static_nocallback DEPS culibos) + endif() + + # cuSOLVER depends on cuBLAS, and cuSPARSE + set(cusolver_deps cublas cusparse) + set(cusolver_static_deps cublas_static cusparse_static culibos) + if(CUDAToolkit_VERSION VERSION_GREATER 11.2.1) + # cusolver depends on libcusolver_metis and cublasLt + # https://docs.nvidia.com/cuda/archive/11.2.2/cusolver#link-dependency + list(APPEND cusolver_deps cublasLt) + _CUDAToolkit_find_and_add_import_lib(cusolver_metis_static ALT metis_static) # implementation detail static lib + list(APPEND cusolver_static_deps cusolver_metis_static cublasLt_static) + endif() + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.1.2) + # cusolver depends on liblapack_static.a starting with CUDA 10.1 update 
2, + # https://docs.nvidia.com/cuda/archive/11.5.0/cusolver#static-link-lapack + _CUDAToolkit_find_and_add_import_lib(cusolver_lapack_static ALT lapack_static) # implementation detail static lib + list(APPEND cusolver_static_deps cusolver_lapack_static) + endif() + _CUDAToolkit_find_and_add_import_lib(cusolver DEPS ${cusolver_deps}) + _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS ${cusolver_static_deps}) + unset(cusolver_deps) + unset(cusolver_static_deps) + + # nvGRAPH depends on cuRAND, and cuSOLVER. + _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) + _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) + + # Process the majority of the NPP libraries. + foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) + endforeach() + + find_path(CUDAToolkit_CUPTI_INCLUDE_DIR cupti.h PATHS + "${CUDAToolkit_ROOT_DIR}/extras/CUPTI/include" + ${CUDAToolkit_INCLUDE_DIRS} + PATH_SUFFIXES "../extras/CUPTI/include" + "../../../extras/CUPTI/include" + NO_DEFAULT_PATH) + mark_as_advanced(CUDAToolkit_CUPTI_INCLUDE_DIR) + + if(CUDAToolkit_CUPTI_INCLUDE_DIR) + set(_cmake_cupti_extra_paths extras/CUPTI/lib64/ + extras/CUPTI/lib/ + ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/) + _CUDAToolkit_find_and_add_import_lib(cupti + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + _CUDAToolkit_find_and_add_import_lib(cupti_static + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.2.0) + _CUDAToolkit_find_and_add_import_lib(nvperf_host + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + _CUDAToolkit_find_and_add_import_lib(nvperf_host_static + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + _CUDAToolkit_find_and_add_import_lib(nvperf_target + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + endif() + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.3.0) + _CUDAToolkit_find_and_add_import_lib(pcsamplingutil + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + endif() + endif() + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.1.0) + if(NOT TARGET CUDA::nvptxcompiler_static) + _CUDAToolkit_find_and_add_import_lib(nvptxcompiler_static) + if(TARGET CUDA::nvptxcompiler_static) + target_link_libraries(CUDA::nvptxcompiler_static INTERFACE CUDA::cudart_static_deps) + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(nvrtc_builtins ALT nvrtc-builtins) + _CUDAToolkit_find_and_add_import_lib(nvrtc) + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.5.0) + _CUDAToolkit_find_and_add_import_lib(nvrtc_builtins_static ALT nvrtc-builtins_static) + if(NOT TARGET CUDA::nvrtc_static) + _CUDAToolkit_find_and_add_import_lib(nvrtc_static DEPS nvrtc_builtins_static nvptxcompiler_static) + if(TARGET CUDA::nvrtc_static AND WIN32 AND NOT (BORLAND OR MINGW OR CYGWIN)) + target_link_libraries(CUDA::nvrtc_static INTERFACE Ws2_32.lib) + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) + _CUDAToolkit_find_and_add_import_lib(nvml_static ONLY_SEARCH_FOR libnvidia-ml.a 
libnvml.a) + + if(WIN32) + # nvtools can be installed outside the CUDA toolkit directory + # so prefer the NVTOOLSEXT_PATH windows only environment variable + # In addition on windows the most common name is nvToolsExt64_1 + find_library(CUDA_nvToolsExt_LIBRARY + NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt + PATHS ENV NVTOOLSEXT_PATH + ENV CUDA_PATH + PATH_SUFFIXES lib/x64 lib + ) + endif() + _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.0) + # nvToolsExt is deprecated since nvtx3 introduction. + # Warn only if the project requires a sufficiently new CMake to make migration possible. + if(TARGET CUDA::nvToolsExt AND CMAKE_MINIMUM_REQUIRED_VERSION VERSION_GREATER_EQUAL 3.25) + set_property(TARGET CUDA::nvToolsExt PROPERTY DEPRECATION "nvToolsExt has been superseded by nvtx3 since CUDA 10.0 and CMake 3.25. Use CUDA::nvtx3 and include instead.") + endif() + + # Header-only variant. Uses dlopen(). + if(NOT TARGET CUDA::nvtx3) + add_library(CUDA::nvtx3 INTERFACE IMPORTED) + target_include_directories(CUDA::nvtx3 SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_libraries(CUDA::nvtx3 INTERFACE ${CMAKE_DL_LIBS}) + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(OpenCL) +endif() + +if(_CUDAToolkit_Pop_ROOT_PATH) + list(REMOVE_AT CMAKE_FIND_ROOT_PATH 0) + unset(_CUDAToolkit_Pop_ROOT_PATH) +endif() + +unset(_CUDAToolkit_win_search_dirs) +unset(_CUDAToolkit_win_stub_search_dirs) diff --git a/cpp/cmake/Modules/FindcuFile.cmake b/cpp/cmake/Modules/FindcuFile.cmake deleted file mode 100644 index 1df4f12d23..0000000000 --- a/cpp/cmake/Modules/FindcuFile.cmake +++ /dev/null @@ -1,120 +0,0 @@ -# ============================================================================= -# Copyright (c) 2020-2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. - -#[=======================================================================[.rst: -FindcuFile ----------- - -Find cuFile headers and libraries. - -Imported Targets -^^^^^^^^^^^^^^^^ - -``cufile::cuFile`` - The cuFile library, if found. -``cufile::cuFileRDMA`` - The cuFile RDMA library, if found. - -Result Variables -^^^^^^^^^^^^^^^^ - -This will define the following variables in your project: - -``cuFile_FOUND`` - true if (the requested version of) cuFile is available. -``cuFile_VERSION`` - the version of cuFile. -``cuFile_LIBRARIES`` - the libraries to link against to use cuFile. -``cuFileRDMA_LIBRARIES`` - the libraries to link against to use cuFile RDMA. -``cuFile_INCLUDE_DIRS`` - where to find the cuFile headers. 
-``cuFile_COMPILE_OPTIONS`` - this should be passed to target_compile_options(), if the - target is not used for linking - -#]=======================================================================] - -# use pkg-config to get the directories and then use these values in the FIND_PATH() and -# FIND_LIBRARY() calls -find_package(PkgConfig QUIET) -pkg_check_modules(PKG_cuFile QUIET cuFile) - -set(cuFile_COMPILE_OPTIONS ${PKG_cuFile_CFLAGS_OTHER}) -set(cuFile_VERSION ${PKG_cuFile_VERSION}) - -# Find the location of the CUDA Toolkit -find_package(CUDAToolkit QUIET) -find_path( - cuFile_INCLUDE_DIR - NAMES cufile.h - HINTS ${PKG_cuFile_INCLUDE_DIRS} ${CUDAToolkit_INCLUDE_DIRS} -) - -find_library( - cuFile_LIBRARY - NAMES cufile - HINTS ${PKG_cuFile_LIBRARY_DIRS} ${CUDAToolkit_LIBRARY_DIR} -) - -find_library( - cuFileRDMA_LIBRARY - NAMES cufile_rdma - HINTS ${PKG_cuFile_LIBRARY_DIRS} ${CUDAToolkit_LIBRARY_DIR} -) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args( - cuFile - FOUND_VAR cuFile_FOUND - REQUIRED_VARS cuFile_LIBRARY cuFileRDMA_LIBRARY cuFile_INCLUDE_DIR - VERSION_VAR cuFile_VERSION -) - -if(cuFile_INCLUDE_DIR AND NOT TARGET cufile::cuFile_interface) - add_library(cufile::cuFile_interface INTERFACE IMPORTED GLOBAL) - target_include_directories( - cufile::cuFile_interface INTERFACE "$" - ) - target_compile_options(cufile::cuFile_interface INTERFACE "${cuFile_COMPILE_OPTIONS}") - target_compile_definitions(cufile::cuFile_interface INTERFACE CUFILE_FOUND) -endif() - -if(cuFile_FOUND AND NOT TARGET cufile::cuFile) - add_library(cufile::cuFile UNKNOWN IMPORTED GLOBAL) - set_target_properties( - cufile::cuFile - PROPERTIES IMPORTED_LOCATION "${cuFile_LIBRARY}" - INTERFACE_COMPILE_OPTIONS "${cuFile_COMPILE_OPTIONS}" - INTERFACE_INCLUDE_DIRECTORIES "${cuFile_INCLUDE_DIR}" - ) -endif() - -if(cuFile_FOUND AND NOT TARGET cufile::cuFileRDMA) - add_library(cufile::cuFileRDMA UNKNOWN IMPORTED GLOBAL) - set_target_properties( - cufile::cuFileRDMA - PROPERTIES IMPORTED_LOCATION "${cuFileRDMA_LIBRARY}" - INTERFACE_COMPILE_OPTIONS "${cuFile_COMPILE_OPTIONS}" - INTERFACE_INCLUDE_DIRECTORIES "${cuFile_INCLUDE_DIR}" - ) -endif() - -mark_as_advanced(cuFile_LIBRARY cuFileRDMA_LIBRARY cuFile_INCLUDE_DIR) - -if(cuFile_FOUND) - set(cuFile_LIBRARIES ${cuFile_LIBRARY}) - set(cuFileRDMA_LIBRARIES ${cuFileRDMA_LIBRARY}) - set(cuFile_INCLUDE_DIRS ${cuFile_INCLUDE_DIR}) -endif() diff --git a/cpp/cmake/thirdparty/get_libcurl.cmake b/cpp/cmake/thirdparty/get_libcurl.cmake new file mode 100644 index 0000000000..ab979b0cf1 --- /dev/null +++ b/cpp/cmake/thirdparty/get_libcurl.cmake @@ -0,0 +1,46 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds libcurl and sets any additional necessary environment variables. 
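The helper defined just below pins libcurl 8.5.0 through `rapids_cpm_find`, builds it as a static, position-independent library that is excluded from installation, and temporarily saves and restores the `BUILD_TESTING` cache entry to work around the linked curl issue. As a rough consumption sketch (the `kvikio` target name is an assumption for illustration; the diff only shows that the `libcurl` target is promoted to a global target and added to the kvikio export sets):

include(cmake/thirdparty/get_libcurl.cmake)  # defines and immediately invokes find_and_configure_libcurl()

# libcurl is built static with PIC, so it can be linked into libkvikio.so without
# shipping libcurl.a itself (see the EXCLUDE_FROM_ALL comment in the function below).
target_link_libraries(kvikio PRIVATE libcurl)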
+function(find_and_configure_libcurl) + include(${rapids-cmake-dir}/cpm/find.cmake) + + # Work around https://github.com/curl/curl/issues/15351 + if(DEFINED CACHE{BUILD_TESTING}) + set(CACHE_HAS_BUILD_TESTING $CACHE{BUILD_TESTING}) + endif() + + rapids_cpm_find( + CURL 8.5.0 + GLOBAL_TARGETS libcurl + BUILD_EXPORT_SET kvikio-exports + INSTALL_EXPORT_SET kvikio-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/curl/curl + GIT_TAG curl-8_5_0 + OPTIONS "BUILD_CURL_EXE OFF" "BUILD_SHARED_LIBS OFF" "BUILD_TESTING OFF" "CURL_USE_LIBPSL OFF" + "CURL_DISABLE_LDAP ON" "CMAKE_POSITION_INDEPENDENT_CODE ON" + EXCLUDE_FROM_ALL YES # Don't install libcurl.a (only needed when building libkvikio.so) + ) + if(DEFINED CACHE_HAS_BUILD_TESTING) + set(BUILD_TESTING + ${CACHE_HAS_BUILD_TESTING} + CACHE BOOL "" FORCE + ) + else() + unset(BUILD_TESTING CACHE) + endif() +endfunction() + +find_and_configure_libcurl() diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md index 21a33b1d45..22bab7c861 100644 --- a/cpp/doxygen/main_page.md +++ b/cpp/doxygen/main_page.md @@ -5,7 +5,7 @@ bindings to [cuFile](https://docs.nvidia.com/gpudirect-storage/api-reference-gui which enables [GPUDirect Storage (GDS)](https://developer.nvidia.com/blog/gpudirect-storage/). KvikIO also works efficiently when GDS isn't available and can read/write both host and device data seamlessly. -KvikIO C++ is a header-only library that is part of the [RAPIDS](https://rapids.ai/) suite of open-source software libraries for GPU-accelerated data science. +KvikIO C++ is part of the [RAPIDS](https://rapids.ai/) suite of open-source software libraries for GPU-accelerated data science. --- **Notice** this is the documentation for the C++ library. For the Python documentation, see under [kvikio](https://docs.rapids.ai/api/kvikio/nightly/). @@ -23,9 +23,7 @@ KvikIO C++ is a header-only library that is part of the [RAPIDS](https://rapids. ## Installation -KvikIO is a header-only library and as such doesn't need installation. -However, for convenience we release Conda packages that makes it easy -to include KvikIO in your CMake projects. +For convenience, we release Conda packages that make it easy to include KvikIO in your CMake projects. ### Conda/Mamba @@ -78,14 +76,19 @@ Then run the example: ## Runtime Settings #### Compatibility Mode (KVIKIO_COMPAT_MODE) -When KvikIO is running in compatibility mode, it doesn't load `libcufile.so`. Instead, reads and writes are done using POSIX. Notice, this is not the same as the compatibility mode in cuFile. That is cuFile can run in compatibility mode while KvikIO is not. +When KvikIO is running in compatibility mode, it doesn't load `libcufile.so`. Instead, reads and writes are done using POSIX. Note that this is not the same as the compatibility mode in cuFile: KvikIO may perform I/O in non-compatibility mode through the cuFile library while the cuFile library itself is configured to operate in its own compatibility mode. For more details, refer to [cuFile compatibility mode](https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html#cufile-compatibility-mode) and [cuFile environment variables](https://docs.nvidia.com/gpudirect-storage/troubleshooting-guide/index.html#environment-variables). -Set the environment variable `KVIKIO_COMPAT_MODE` to enable/disable compatibility mode.
By default, compatibility mode is enabled: +The environment variable `KVIKIO_COMPAT_MODE` has three options (case-insensitive): + - `ON` (aliases: `TRUE`, `YES`, `1`): Enable the compatibility mode. + - `OFF` (aliases: `FALSE`, `NO`, `0`): Disable the compatibility mode, and enforce cuFile I/O. GDS will be activated if the system requirements for cuFile are met and cuFile is properly configured. However, if the system is not suited for cuFile, I/O operations under the `OFF` option may error out, crash or hang. + - `AUTO`: Try cuFile I/O first, and fall back to POSIX I/O if the system requirements for cuFile are not met. + +Under `AUTO`, KvikIO falls back to the compatibility mode: - when `libcufile.so` cannot be found. - when running in Windows Subsystem for Linux (WSL). - when `/run/udev` isn't readable, which typically happens when running inside a docker image not launched with `--volume /run/udev:/run/udev:ro`. -This setting can also be controlled by `defaults::compat_mode()` and `defaults::compat_mode_reset()`. +This setting can also be programmatically controlled by `defaults::set_compat_mode()` and `defaults::compat_mode_reset()`. #### Thread Pool (KVIKIO_NTHREADS) diff --git a/cpp/examples/basic_io.cpp b/cpp/examples/basic_io.cpp index 3a4ab892ad..39bfc315cd 100644 --- a/cpp/examples/basic_io.cpp +++ b/cpp/examples/basic_io.cpp @@ -21,8 +21,8 @@ #include #include +#include #include -#include #include #include @@ -65,7 +65,7 @@ int main() check(cudaSetDevice(0) == cudaSuccess); cout << "KvikIO defaults: " << endl; - if (kvikio::defaults::compat_mode()) { + if (kvikio::defaults::is_compat_mode_preferred()) { cout << " Compatibility mode: enabled" << endl; } else { kvikio::DriverInitializer manual_init_driver; @@ -181,7 +181,7 @@ int main() cout << "Parallel POSIX read (" << kvikio::defaults::thread_pool_nthreads() << " threads): " << read << endl; } - if (kvikio::is_batch_and_stream_available() && !kvikio::defaults::compat_mode()) { + if (kvikio::is_batch_api_available() && !kvikio::defaults::is_compat_mode_preferred()) { std::cout << std::endl; Timer timer; // Here we use the batch API to read "/tmp/test-file" into `b_dev` by diff --git a/cpp/examples/basic_no_cuda.cpp b/cpp/examples/basic_no_cuda.cpp index 700e3e8be9..42ecb7142d 100644 --- a/cpp/examples/basic_no_cuda.cpp +++ b/cpp/examples/basic_no_cuda.cpp @@ -19,8 +19,8 @@ #include #include +#include #include -#include #include #include @@ -41,7 +41,7 @@ constexpr int LARGE_SIZE = 8 * SIZE; // LARGE SIZE to test partial s int main() { cout << "KvikIO defaults: " << endl; - if (kvikio::defaults::compat_mode()) { + if (kvikio::defaults::is_compat_mode_preferred()) { cout << " Compatibility mode: enabled" << endl; } else { kvikio::DriverInitializer manual_init_driver; diff --git a/cpp/examples/downstream/CMakeLists.txt b/cpp/examples/downstream/CMakeLists.txt index a80d0ba44f..5dddd30441 100644 --- a/cpp/examples/downstream/CMakeLists.txt +++ b/cpp/examples/downstream/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -28,6 +28,4 @@ include(cmake/get_kvikio.cmake) add_executable(downstream_example downstream_example.cpp) -# Notice, even though KvikIO is a header-only library, we link to it here. 
Linking to -# `kvikio::kvikio` makes CMake include the headers of KvikIO when building. target_link_libraries(downstream_example PRIVATE kvikio::kvikio) diff --git a/cpp/examples/downstream/cmake/get_kvikio.cmake b/cpp/examples/downstream/cmake/get_kvikio.cmake index 13f5bf50c2..057774d367 100644 --- a/cpp/examples/downstream/cmake/get_kvikio.cmake +++ b/cpp/examples/downstream/cmake/get_kvikio.cmake @@ -30,4 +30,4 @@ function(find_and_configure_kvikio MIN_VERSION) endfunction() -find_and_configure_kvikio("24.10") +find_and_configure_kvikio("24.12") diff --git a/cpp/examples/downstream/downstream_example.cpp b/cpp/examples/downstream/downstream_example.cpp index 269d50e9e1..87603908a1 100644 --- a/cpp/examples/downstream/downstream_example.cpp +++ b/cpp/examples/downstream/downstream_example.cpp @@ -1,7 +1,23 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include +#include #include -#include using namespace std; diff --git a/cpp/include/kvikio/batch.hpp b/cpp/include/kvikio/batch.hpp index 9c58a50b1d..7eebbd4df0 100644 --- a/cpp/include/kvikio/batch.hpp +++ b/cpp/include/kvikio/batch.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -118,7 +118,7 @@ class BatchHandle { std::vector io_batch_params; io_batch_params.reserve(operations.size()); for (const auto& op : operations) { - if (op.file_handle.is_compat_mode_on()) { + if (op.file_handle.is_compat_mode_preferred()) { throw CUfileException("Cannot submit a FileHandle opened in compatibility mode"); } diff --git a/cpp/include/kvikio/bounce_buffer.hpp b/cpp/include/kvikio/bounce_buffer.hpp index 8160be3c5c..498f1d6f5f 100644 --- a/cpp/include/kvikio/bounce_buffer.hpp +++ b/cpp/include/kvikio/bounce_buffer.hpp @@ -36,6 +36,7 @@ class AllocRetain { // The size of each allocation in `_free_allocs` std::size_t _size{defaults::bounce_buffer_size()}; + public: /** * @brief An host memory allocation */ @@ -56,6 +57,7 @@ class AllocRetain { Alloc& operator=(Alloc&& o) = delete; ~Alloc() noexcept { _manager->put(_alloc, _size); } void* get() noexcept { return _alloc; } + void* get(std::ptrdiff_t offset) noexcept { return static_cast(_alloc) + offset; } std::size_t size() noexcept { return _size; } }; @@ -67,6 +69,7 @@ class AllocRetain { // ~AllocRetain() noexcept = default; + private: /** * @brief Free all retained allocations * @@ -143,7 +146,7 @@ class AllocRetain { return _clear(); } - static AllocRetain& instance() + KVIKIO_EXPORT static AllocRetain& instance() { static AllocRetain _instance; return _instance; diff --git a/cpp/include/kvikio/buffer.hpp b/cpp/include/kvikio/buffer.hpp index c0aa7f9fbc..85c60b3f90 100644 --- a/cpp/include/kvikio/buffer.hpp +++ b/cpp/include/kvikio/buffer.hpp @@ -49,7 +49,7 @@ inline void buffer_register(const void* devPtr_base, int flags = 0, const std::vector& errors_to_ignore = std::vector()) { - if (defaults::compat_mode()) { return; } + if (defaults::is_compat_mode_preferred()) { return; } CUfileError_t status = cuFileAPI::instance().BufRegister(devPtr_base, size, flags); if (status.err != CU_FILE_SUCCESS) { // Check if `status.err` is in `errors_to_ignore` @@ -67,7 +67,7 @@ inline void buffer_register(const void* devPtr_base, */ inline void buffer_deregister(const void* devPtr_base) { - if (defaults::compat_mode()) { return; } + if (defaults::is_compat_mode_preferred()) { return; } CUFILE_TRY(cuFileAPI::instance().BufDeregister(devPtr_base)); } diff --git a/cpp/include/kvikio/cufile_config.hpp b/cpp/include/kvikio/cufile/config.hpp similarity index 100% rename from cpp/include/kvikio/cufile_config.hpp rename to cpp/include/kvikio/cufile/config.hpp diff --git a/cpp/include/kvikio/driver.hpp b/cpp/include/kvikio/cufile/driver.hpp similarity index 97% rename from cpp/include/kvikio/driver.hpp rename to cpp/include/kvikio/cufile/driver.hpp index 7d73f465aa..b609029a69 100644 --- a/cpp/include/kvikio/driver.hpp +++ b/cpp/include/kvikio/cufile/driver.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -45,7 +45,7 @@ inline void set_driver_flag(unsigned int& prop, unsigned int flag, bool val) noe class DriverInitializer { // Optional, if not used cuFiles opens the driver automatically public: - DriverInitializer() { CUFILE_TRY(cuFileAPI::instance().DriverOpen()); } + DriverInitializer() { cuFileAPI::instance().driver_open(); } DriverInitializer(DriverInitializer const&) = delete; DriverInitializer& operator=(DriverInitializer const&) = delete; @@ -55,7 +55,7 @@ class DriverInitializer { ~DriverInitializer() { try { - CUFILE_TRY(cuFileAPI::instance().DriverClose()); + cuFileAPI::instance().driver_close(); } catch (const CUfileException& e) { std::cerr << "Unable to close GDS file driver: "; std::cerr << e.what(); diff --git a/cpp/include/kvikio/defaults.hpp b/cpp/include/kvikio/defaults.hpp index c812c6e251..91071cbb28 100644 --- a/cpp/include/kvikio/defaults.hpp +++ b/cpp/include/kvikio/defaults.hpp @@ -13,6 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * @file + */ + #pragma once #include @@ -27,7 +32,48 @@ #include namespace kvikio { +/** + * @brief I/O compatibility mode. + */ +enum class CompatMode : uint8_t { + OFF, ///< Enforce cuFile I/O. GDS will be activated if the system requirements for cuFile are met + ///< and cuFile is properly configured. However, if the system is not suited for cuFile, I/O + ///< operations under the OFF option may error out, crash or hang. + ON, ///< Enforce POSIX I/O. + AUTO, ///< Try cuFile I/O first, and fall back to POSIX I/O if the system requirements for cuFile + ///< are not met. +}; + namespace detail { +/** + * @brief Parse a string into a CompatMode enum. + * + * @param compat_mode_str Compatibility mode in string format(case-insensitive). Valid values + * include: + * - `ON` (alias: `TRUE`, `YES`, `1`) + * - `OFF` (alias: `FALSE`, `NO`, `0`) + * - `AUTO` + * @return A CompatMode enum. + */ +inline CompatMode parse_compat_mode_str(std::string_view compat_mode_str) +{ + // Convert to lowercase + std::string tmp{compat_mode_str}; + std::transform( + tmp.begin(), tmp.end(), tmp.begin(), [](unsigned char c) { return std::tolower(c); }); + + CompatMode res{}; + if (tmp == "on" || tmp == "true" || tmp == "yes" || tmp == "1") { + res = CompatMode::ON; + } else if (tmp == "off" || tmp == "false" || tmp == "no" || tmp == "0") { + res = CompatMode::OFF; + } else if (tmp == "auto") { + res = CompatMode::AUTO; + } else { + throw std::invalid_argument("Unknown compatibility mode: " + std::string{tmp}); + } + return res; +} template T getenv_or(std::string_view env_var_name, T default_val) @@ -57,7 +103,14 @@ inline bool getenv_or(std::string_view env_var_name, bool default_val) } // Convert to lowercase std::string str{env_val}; - std::transform(str.begin(), str.end(), str.begin(), ::tolower); + // Special considerations regarding the case conversion: + // - std::tolower() is not an addressable function. Passing it to std::transform() as + // a function pointer, if the compile turns out successful, causes the program behavior + // "unspecified (possibly ill-formed)", hence the lambda. ::tolower() is addressable + // and does not have this problem, but the following item still applies. + // - To avoid UB in std::tolower() or ::tolower(), the character must be cast to unsigned char. 
+ std::transform( + str.begin(), str.end(), str.begin(), [](unsigned char c) { return std::tolower(c); }); // Trim whitespaces std::stringstream trimmer; trimmer << str; @@ -70,16 +123,24 @@ inline bool getenv_or(std::string_view env_var_name, bool default_val) std::string{env_val}); } +template <> +inline CompatMode getenv_or(std::string_view env_var_name, CompatMode default_val) +{ + auto* env_val = std::getenv(env_var_name.data()); + if (env_val == nullptr) { return default_val; } + return parse_compat_mode_str(env_val); +} + } // namespace detail /** - * @brief Singleton class of default values used thoughtout KvikIO. + * @brief Singleton class of default values used throughout KvikIO. * */ class defaults { private: BS::thread_pool _thread_pool{get_num_threads_from_env()}; - bool _compat_mode; + CompatMode _compat_mode; std::size_t _task_size; std::size_t _gds_threshold; std::size_t _bounce_buffer_size; @@ -97,13 +158,7 @@ class defaults { { // Determine the default value of `compat_mode` { - if (std::getenv("KVIKIO_COMPAT_MODE") != nullptr) { - // Setting `KVIKIO_COMPAT_MODE` take precedence - _compat_mode = detail::getenv_or("KVIKIO_COMPAT_MODE", false); - } else { - // If `KVIKIO_COMPAT_MODE` isn't set, we infer based on runtime environment - _compat_mode = !is_cufile_available(); - } + _compat_mode = detail::getenv_or("KVIKIO_COMPAT_MODE", CompatMode::AUTO); } // Determine the default value of `task_size` { @@ -133,7 +188,7 @@ class defaults { } } - static defaults* instance() + KVIKIO_EXPORT static defaults* instance() { static defaults _instance; return &_instance; @@ -156,19 +211,77 @@ class defaults { * - when `/run/udev` isn't readable, which typically happens when running inside a docker * image not launched with `--volume /run/udev:/run/udev:ro` * - * @return The boolean answer + * @return Compatibility mode. */ - [[nodiscard]] static bool compat_mode() { return instance()->_compat_mode; } + [[nodiscard]] static CompatMode compat_mode() { return instance()->_compat_mode; } /** - * @brief Reset the value of `kvikio::defaults::compat_mode()` + * @brief Reset the value of `kvikio::defaults::compat_mode()`. + * + * Changing the compatibility mode affects all the new FileHandles whose `compat_mode` argument is + * not explicitly set, but it never affects existing FileHandles. + * + * @param compat_mode Compatibility mode. + */ + static void compat_mode_reset(CompatMode compat_mode) { instance()->_compat_mode = compat_mode; } + + /** + * @brief Infer the `AUTO` compatibility mode from the system runtime. + * + * If the requested compatibility mode is `AUTO`, set the expected compatibility mode to + * `ON` or `OFF` by performing a system config check; otherwise, do nothing. Effectively, this + * function reduces the requested compatibility mode from three possible states + * (`ON`/`OFF`/`AUTO`) to two (`ON`/`OFF`) so as to determine the actual I/O path. This function + * is lightweight as the inferred result is cached. + */ + static CompatMode infer_compat_mode_if_auto(CompatMode compat_mode) + { + if (compat_mode == CompatMode::AUTO) { + static auto inferred_compat_mode_for_auto = []() -> CompatMode { + return is_cufile_available() ? CompatMode::OFF : CompatMode::ON; + }(); + return inferred_compat_mode_for_auto; + } + return compat_mode; + } + + /** + * @brief Given a requested compatibility mode, whether it is expected to reduce to `ON`. + * + * This function returns true if any of the two condition is satisfied: + * - The compatibility mode is `ON`. 
+ * - It is `AUTO` but inferred to be `ON`. + * + * Conceptually, the opposite of this function is whether requested compatibility mode is expected + * to be `OFF`, which would occur if any of the two condition is satisfied: + * - The compatibility mode is `OFF`. + * - It is `AUTO` but inferred to be `OFF`. + * + * @param compat_mode Compatibility mode. + * @return Boolean answer. + */ + static bool is_compat_mode_preferred(CompatMode compat_mode) + { + return compat_mode == CompatMode::ON || + (compat_mode == CompatMode::AUTO && + defaults::infer_compat_mode_if_auto(compat_mode) == CompatMode::ON); + } + + /** + * @brief Whether the global compatibility mode from class defaults is expected to be `ON`. + * + * This function returns true if any of the two condition is satisfied: + * - The compatibility mode is `ON`. + * - It is `AUTO` but inferred to be `ON`. * - * Changing compatibility mode, effects all new FileHandles that doesn't sets the - * `compat_mode` argument explicitly but it never effect existing FileHandles. + * Conceptually, the opposite of this function is whether the global compatibility mode is + * expected to be `OFF`, which would occur if any of the two condition is satisfied: + * - The compatibility mode is `OFF`. + * - It is `AUTO` but inferred to be `OFF`. * - * @param enable Whether to enable compatibility mode or not. + * @return Boolean answer. */ - static void compat_mode_reset(bool enable) { instance()->_compat_mode = enable; } + static bool is_compat_mode_preferred() { return is_compat_mode_preferred(compat_mode()); } /** * @brief Get the default thread pool. diff --git a/cpp/include/kvikio/error.hpp b/cpp/include/kvikio/error.hpp index e84ebd770c..2ecd37b0b3 100644 --- a/cpp/include/kvikio/error.hpp +++ b/cpp/include/kvikio/error.hpp @@ -45,8 +45,8 @@ struct CUfileException : public std::runtime_error { if (error != CUDA_SUCCESS) { \ const char* err_name = nullptr; \ const char* err_str = nullptr; \ - CUresult err_name_status = cudaAPI::instance().GetErrorName(error, &err_name); \ - CUresult err_str_status = cudaAPI::instance().GetErrorString(error, &err_str); \ + CUresult err_name_status = kvikio::cudaAPI::instance().GetErrorName(error, &err_name); \ + CUresult err_str_status = kvikio::cudaAPI::instance().GetErrorString(error, &err_str); \ if (err_name_status == CUDA_ERROR_INVALID_VALUE) { err_name = "unknown"; } \ if (err_str_status == CUDA_ERROR_INVALID_VALUE) { err_str = "unknown"; } \ throw(_exception_type){std::string{"CUDA error at: "} + __FILE__ + ":" + \ diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp index f84e792489..4880bd4f20 100644 --- a/cpp/include/kvikio/file_handle.hpp +++ b/cpp/include/kvikio/file_handle.hpp @@ -15,22 +15,17 @@ */ #pragma once -#include #include #include -#include #include #include -#include -#include -#include #include #include #include #include -#include +#include #include #include #include @@ -40,96 +35,6 @@ #include namespace kvikio { -namespace detail { - -/** - * @brief Parse open file flags given as a string and return oflags - * - * @param flags The flags - * @param o_direct Append O_DIRECT to the open flags - * @return oflags - * - * @throw std::invalid_argument if the specified flags are not supported. - * @throw std::invalid_argument if `o_direct` is true, but `O_DIRECT` is not supported. 
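As a quick illustration of the two static `defaults` helpers introduced above, here is a hedged sketch (the wrapper function name is made up; only names declared in `defaults.hpp` are used):

#include <kvikio/defaults.hpp>

// Returns true when a requested mode would take the POSIX path on this system.
bool takes_posix_path(kvikio::CompatMode requested)
{
  // AUTO is reduced once per process (the result is cached) based on whether cuFile is usable;
  // ON and OFF pass through unchanged.
  kvikio::CompatMode const effective = kvikio::defaults::infer_compat_mode_if_auto(requested);

  // The convenience predicate gives the same answer directly:
  // kvikio::defaults::is_compat_mode_preferred(requested)
  return effective == kvikio::CompatMode::ON;
}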
- */ -inline int open_fd_parse_flags(const std::string& flags, bool o_direct) -{ - int file_flags = -1; - if (flags.empty()) { throw std::invalid_argument("Unknown file open flag"); } - switch (flags[0]) { - case 'r': - file_flags = O_RDONLY; - if (flags[1] == '+') { file_flags = O_RDWR; } - break; - case 'w': - file_flags = O_WRONLY; - if (flags[1] == '+') { file_flags = O_RDWR; } - file_flags |= O_CREAT | O_TRUNC; - break; - case 'a': throw std::invalid_argument("Open flag 'a' isn't supported"); - default: throw std::invalid_argument("Unknown file open flag"); - } - file_flags |= O_CLOEXEC; - if (o_direct) { -#if defined(O_DIRECT) - file_flags |= O_DIRECT; -#else - throw std::invalid_argument("'o_direct' flag unsupported on this platform"); -#endif - } - return file_flags; -} - -/** - * @brief Open file using `open(2)` - * - * @param flags Open flags given as a string - * @param o_direct Append O_DIRECT to `flags` - * @param mode Access modes - * @return File descriptor - */ -inline int open_fd(const std::string& file_path, - const std::string& flags, - bool o_direct, - mode_t mode) -{ - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) - int fd = ::open(file_path.c_str(), open_fd_parse_flags(flags, o_direct), mode); - if (fd == -1) { throw std::system_error(errno, std::generic_category(), "Unable to open file"); } - return fd; -} - -/** - * @brief Get the flags of the file descriptor (see `open(2)`) - * - * @return Open flags - */ -[[nodiscard]] inline int open_flags(int fd) -{ - int ret = fcntl(fd, F_GETFL); // NOLINT(cppcoreguidelines-pro-type-vararg) - if (ret == -1) { - throw std::system_error(errno, std::generic_category(), "Unable to retrieve open flags"); - } - return ret; -} - -/** - * @brief Get file size from file descriptor `fstat(3)` - * - * @param file_descriptor Open file descriptor - * @return The number of bytes - */ -[[nodiscard]] inline std::size_t get_file_size(int file_descriptor) -{ - struct stat st {}; - int ret = fstat(file_descriptor, &st); - if (ret == -1) { - throw std::system_error(errno, std::generic_category(), "Unable to query file size"); - } - return static_cast(st.st_size); -} - -} // namespace detail /** * @brief Handle of an open file registered with cufile. @@ -142,10 +47,37 @@ class FileHandle { int _fd_direct_on{-1}; int _fd_direct_off{-1}; bool _initialized{false}; - bool _compat_mode{false}; + CompatMode _compat_mode{CompatMode::AUTO}; mutable std::size_t _nbytes{0}; // The size of the underlying file, zero means unknown. CUfileHandle_t _handle{}; + /** + * @brief Given a requested compatibility mode, whether it is expected to reduce to `ON` for + * asynchronous I/O. + * + * @param requested_compat_mode Requested compatibility mode. + * @return True if POSIX I/O fallback will be used; false for cuFile I/O. + * @exception std::runtime_error When the requested compatibility mode is `OFF`, but cuFile + * batch/stream library symbol is missing, or cuFile configuration file is missing. + */ + bool is_compat_mode_preferred_for_async(CompatMode requested_compat_mode) + { + if (defaults::is_compat_mode_preferred(requested_compat_mode)) { return true; } + + if (!is_stream_api_available()) { + if (requested_compat_mode == CompatMode::AUTO) { return true; } + throw std::runtime_error("Missing the cuFile stream api."); + } + + // When checking for availability, we also check if cuFile's config file exists. This is + // because even when the stream API is available, it doesn't work if no config file exists. 
+ if (config_path().empty()) { + if (requested_compat_mode == CompatMode::AUTO) { return true; } + throw std::runtime_error("Missing cuFile configuration file."); + } + return false; + } + public: static constexpr mode_t m644 = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH; FileHandle() noexcept = default; @@ -164,38 +96,12 @@ class FileHandle { * "a" -> "open for writing, appending to the end of file if it exists" * "+" -> "open for updating (reading and writing)" * @param mode Access modes (see `open(2)`). - * @param compat_mode Enable KvikIO's compatibility mode for this file. + * @param compat_mode Set KvikIO's compatibility mode for this file. */ FileHandle(const std::string& file_path, const std::string& flags = "r", mode_t mode = m644, - bool compat_mode = defaults::compat_mode()) - : _fd_direct_off{detail::open_fd(file_path, flags, false, mode)}, - _initialized{true}, - _compat_mode{compat_mode} - { - if (_compat_mode) { - return; // Nothing to do in compatibility mode - } - - // Try to open the file with the O_DIRECT flag. Fall back to compatibility mode, if it fails. - try { - _fd_direct_on = detail::open_fd(file_path, flags, true, mode); - } catch (const std::system_error&) { - _compat_mode = true; - } catch (const std::invalid_argument&) { - _compat_mode = true; - } - - // Create a cuFile handle, if not in compatibility mode - if (!_compat_mode) { - CUfileDescr_t desc{}; // It is important to set to zero! - desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-union-access) - desc.handle.fd = _fd_direct_on; - CUFILE_TRY(cuFileAPI::instance().HandleRegister(&_handle, &desc)); - } - } + CompatMode compat_mode = defaults::compat_mode()); /** * @brief FileHandle support move semantic but isn't copyable @@ -206,7 +112,7 @@ class FileHandle { : _fd_direct_on{std::exchange(o._fd_direct_on, -1)}, _fd_direct_off{std::exchange(o._fd_direct_off, -1)}, _initialized{std::exchange(o._initialized, false)}, - _compat_mode{std::exchange(o._compat_mode, false)}, + _compat_mode{std::exchange(o._compat_mode, CompatMode::AUTO)}, _nbytes{std::exchange(o._nbytes, 0)}, _handle{std::exchange(o._handle, CUfileHandle_t{})} { @@ -216,13 +122,18 @@ class FileHandle { _fd_direct_on = std::exchange(o._fd_direct_on, -1); _fd_direct_off = std::exchange(o._fd_direct_off, -1); _initialized = std::exchange(o._initialized, false); - _compat_mode = std::exchange(o._compat_mode, false); + _compat_mode = std::exchange(o._compat_mode, CompatMode::AUTO); _nbytes = std::exchange(o._nbytes, 0); _handle = std::exchange(o._handle, CUfileHandle_t{}); return *this; } ~FileHandle() noexcept { close(); } + /** + * @brief Whether the file is closed according to its initialization status. + * + * @return Boolean answer. + */ [[nodiscard]] bool closed() const noexcept { return !_initialized; } /** @@ -232,7 +143,8 @@ class FileHandle { { if (closed()) { return; } - if (!_compat_mode) { cuFileAPI::instance().HandleDeregister(_handle); } + if (!is_compat_mode_preferred()) { cuFileAPI::instance().HandleDeregister(_handle); } + _compat_mode = CompatMode::AUTO; ::close(_fd_direct_off); if (_fd_direct_on != -1) { ::close(_fd_direct_on); } _fd_direct_on = -1; @@ -244,14 +156,14 @@ class FileHandle { * @brief Get the underlying cuFile file handle * * The file handle must be open and not in compatibility mode i.e. - * both `.closed()` and `.is_compat_mode_on()` must be return false. + * both `closed()` and `is_compat_mode_preferred()` must be false. 
* * @return cuFile's file handle */ [[nodiscard]] CUfileHandle_t handle() { if (closed()) { throw CUfileException("File handle is closed"); } - if (_compat_mode) { + if (is_compat_mode_preferred()) { throw CUfileException("The underlying cuFile handle isn't available in compatibility mode"); } return _handle; @@ -277,7 +189,7 @@ class FileHandle { * * @return File descriptor */ - [[nodiscard]] int fd_open_flags() const { return detail::open_flags(_fd_direct_off); } + [[nodiscard]] int fd_open_flags() const; /** * @brief Get the file size @@ -286,12 +198,7 @@ class FileHandle { * * @return The number of bytes */ - [[nodiscard]] std::size_t nbytes() const - { - if (closed()) { return 0; } - if (_nbytes == 0) { _nbytes = detail::get_file_size(_fd_direct_off); } - return _nbytes; - } + [[nodiscard]] std::size_t nbytes() const; /** * @brief Reads specified bytes from the file into the device memory. @@ -329,12 +236,13 @@ class FileHandle { std::size_t devPtr_offset, bool sync_default_stream = true) { - if (_compat_mode) { - return posix_device_read(_fd_direct_off, devPtr_base, size, file_offset, devPtr_offset); + if (is_compat_mode_preferred()) { + return detail::posix_device_read( + _fd_direct_off, devPtr_base, size, file_offset, devPtr_offset); } if (sync_default_stream) { CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(nullptr)); } - KVIKIO_NVTX_FUNC_RANGE("cufileRead()", size); + KVIKIO_NVTX_SCOPED_RANGE("cufileRead()", size); ssize_t ret = cuFileAPI::instance().Read( _handle, devPtr_base, size, convert_size2off(file_offset), convert_size2off(devPtr_offset)); CUFILE_CHECK_BYTES_DONE(ret); @@ -380,12 +288,13 @@ class FileHandle { { _nbytes = 0; // Invalidate the computed file size - if (_compat_mode) { - return posix_device_write(_fd_direct_off, devPtr_base, size, file_offset, devPtr_offset); + if (is_compat_mode_preferred()) { + return detail::posix_device_write( + _fd_direct_off, devPtr_base, size, file_offset, devPtr_offset); } if (sync_default_stream) { CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(nullptr)); } - KVIKIO_NVTX_FUNC_RANGE("cufileWrite()", size); + KVIKIO_NVTX_SCOPED_RANGE("cufileWrite()", size); ssize_t ret = cuFileAPI::instance().Write( _handle, devPtr_base, size, convert_size2off(file_offset), convert_size2off(devPtr_offset)); if (ret == -1) { @@ -432,13 +341,15 @@ class FileHandle { std::size_t gds_threshold = defaults::gds_threshold(), bool sync_default_stream = true) { + KVIKIO_NVTX_MARKER("FileHandle::pread()", size); if (is_host_memory(buf)) { auto op = [this](void* hostPtr_base, std::size_t size, std::size_t file_offset, std::size_t hostPtr_offset) -> std::size_t { char* buf = static_cast(hostPtr_base) + hostPtr_offset; - return posix_host_read(_fd_direct_off, buf, size, file_offset, false); + return detail::posix_host_read( + _fd_direct_off, buf, size, file_offset); }; return parallel_io(op, buf, size, file_offset, task_size, 0); @@ -450,13 +361,13 @@ class FileHandle { if (size < gds_threshold) { auto task = [this, ctx, buf, size, file_offset]() -> std::size_t { PushAndPopContext c(ctx); - return posix_device_read(_fd_direct_off, buf, size, file_offset, 0); + return detail::posix_device_read(_fd_direct_off, buf, size, file_offset, 0); }; return std::async(std::launch::deferred, task); } // Let's synchronize once instead of in each task. 
- if (sync_default_stream && !_compat_mode) { + if (sync_default_stream && !is_compat_mode_preferred()) { PushAndPopContext c(ctx); CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(nullptr)); } @@ -507,13 +418,15 @@ class FileHandle { std::size_t gds_threshold = defaults::gds_threshold(), bool sync_default_stream = true) { + KVIKIO_NVTX_MARKER("FileHandle::pwrite()", size); if (is_host_memory(buf)) { auto op = [this](const void* hostPtr_base, std::size_t size, std::size_t file_offset, std::size_t hostPtr_offset) -> std::size_t { const char* buf = static_cast(hostPtr_base) + hostPtr_offset; - return posix_host_write(_fd_direct_off, buf, size, file_offset, false); + return detail::posix_host_write( + _fd_direct_off, buf, size, file_offset); }; return parallel_io(op, buf, size, file_offset, task_size, 0); @@ -525,13 +438,13 @@ class FileHandle { if (size < gds_threshold) { auto task = [this, ctx, buf, size, file_offset]() -> std::size_t { PushAndPopContext c(ctx); - return posix_device_write(_fd_direct_off, buf, size, file_offset, 0); + return detail::posix_device_write(_fd_direct_off, buf, size, file_offset, 0); }; return std::async(std::launch::deferred, task); } // Let's synchronize once instead of in each task. - if (sync_default_stream && !_compat_mode) { + if (sync_default_stream && !is_compat_mode_preferred()) { PushAndPopContext c(ctx); CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(nullptr)); } @@ -590,16 +503,14 @@ class FileHandle { ssize_t* bytes_read_p, CUstream stream) { - // When checking for availability, we also check if cuFile's config file exist. This is because - // even when the stream API is available, it doesn't work if no config file exist. - if (kvikio::is_batch_and_stream_available() && !_compat_mode && !config_path().empty()) { + if (is_compat_mode_preferred_for_async(_compat_mode)) { + CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); + *bytes_read_p = + static_cast(read(devPtr_base, *size_p, *file_offset_p, *devPtr_offset_p)); + } else { CUFILE_TRY(cuFileAPI::instance().ReadAsync( _handle, devPtr_base, size_p, file_offset_p, devPtr_offset_p, bytes_read_p, stream)); - return; } - CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); - *bytes_read_p = - static_cast(read(devPtr_base, *size_p, *file_offset_p, *devPtr_offset_p)); } /** @@ -682,16 +593,14 @@ class FileHandle { ssize_t* bytes_written_p, CUstream stream) { - // When checking for availability, we also check if cuFile's config file exist. This is because - // even when the stream API is available, it doesn't work if no config file exist. - if (kvikio::is_batch_and_stream_available() && !_compat_mode && !config_path().empty()) { + if (is_compat_mode_preferred_for_async(_compat_mode)) { + CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); + *bytes_written_p = + static_cast(write(devPtr_base, *size_p, *file_offset_p, *devPtr_offset_p)); + } else { CUFILE_TRY(cuFileAPI::instance().WriteAsync( _handle, devPtr_base, size_p, file_offset_p, devPtr_offset_p, bytes_written_p, stream)); - return; } - CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); - *bytes_written_p = - static_cast(write(devPtr_base, *size_p, *file_offset_p, *devPtr_offset_p)); } /** @@ -733,14 +642,35 @@ class FileHandle { } /** - * @brief Returns `true` if the compatibility mode has been enabled for this file. + * @brief Returns `true` if the compatibility mode is expected to be `ON` for this file. * * Compatibility mode can be explicitly enabled in object creation. 
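For example, a hedged sketch of pinning the mode at construction and checking which I/O path a handle will take (constructor signature as declared above; the function name and file paths are made up):

#include <kvikio/file_handle.hpp>

void per_file_mode_example()
{
  // Force POSIX I/O for this file regardless of the global default:
  kvikio::FileHandle posix_file("/tmp/output", "w", kvikio::FileHandle::m644, kvikio::CompatMode::ON);

  // Use the global default; the raw cuFile handle is only available when the mode reduces to OFF:
  kvikio::FileHandle f("/tmp/input", "r");
  if (!f.closed() && !f.is_compat_mode_preferred()) {
    CUfileHandle_t native = f.handle();
    static_cast<void>(native);
  }
}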
The mode is also enabled - * automatically, if file cannot be opened with the `O_DIRECT` flag. + * automatically, if file cannot be opened with the `O_DIRECT` flag, or if the system does not + * meet the requirements for the cuFile library under the `AUTO` compatibility mode. * - * @return compatibility mode state for the object + * @return Boolean answer. */ - [[nodiscard]] bool is_compat_mode_on() const noexcept { return _compat_mode; } + [[nodiscard]] bool is_compat_mode_preferred() const noexcept + { + return defaults::is_compat_mode_preferred(_compat_mode); + } + + /** + * @brief Returns `true` if the compatibility mode is expected to be `ON` for the asynchronous I/O + * on this file. + * + * For asynchronous I/O, the compatibility mode can be automatically enabled if the cuFile batch + * and stream symbols are missing, or if the cuFile configuration file is missing, or if + * `is_compat_mode_preferred()` returns true. + * + * @return Boolean answer. + */ + [[nodiscard]] bool is_compat_mode_preferred_for_async() const noexcept + { + static bool is_extra_symbol_available = is_stream_api_available(); + static bool is_config_path_empty = config_path().empty(); + return is_compat_mode_preferred() || !is_extra_symbol_available || is_config_path_empty; + } }; } // namespace kvikio diff --git a/cpp/include/kvikio/posix_io.hpp b/cpp/include/kvikio/posix_io.hpp index 9a28e06eec..4327a301ec 100644 --- a/cpp/include/kvikio/posix_io.hpp +++ b/cpp/include/kvikio/posix_io.hpp @@ -26,9 +26,23 @@ #include #include -namespace kvikio { +namespace kvikio::detail { -namespace detail { +/** + * @brief Type of the IO operation. + */ +enum class IOOperationType : uint8_t { + READ, ///< POSIX read. + WRITE, ///< POSIX write. +}; + +/** + * @brief Specifies whether all requested bytes are to be processed or not. + */ +enum class PartialIO : uint8_t { + YES, ///< POSIX read/write is called only once, which may not process all bytes requested. + NO, ///< POSIX read/write is called repeatedly until all requested bytes are processed. +}; /** * @brief Singleton class to retrieve a CUDA stream for device-host copying @@ -51,7 +65,7 @@ class StreamsByThread { // cuDevicePrimaryCtxReset() or cudaDeviceReset() before program termination. ~StreamsByThread() = default; - static CUstream get(CUcontext ctx, std::thread::id thd_id) + KVIKIO_EXPORT static CUstream get(CUcontext ctx, std::thread::id thd_id) { static StreamsByThread _instance; @@ -60,12 +74,14 @@ class StreamsByThread { auto key = std::make_pair(ctx, thd_id); // Create a new stream if `ctx` doesn't have one. - if (_instance._streams.find(key) == _instance._streams.end()) { + if (auto search = _instance._streams.find(key); search == _instance._streams.end()) { CUstream stream{}; CUDA_DRIVER_TRY(cudaAPI::instance().StreamCreate(&stream, CU_STREAM_DEFAULT)); _instance._streams[key] = stream; + return stream; + } else { + return search->second; } - return _instance._streams.at(key); } static CUstream get() @@ -84,29 +100,30 @@ class StreamsByThread { /** * @brief Read or write host memory to or from disk using POSIX * - * @tparam IsReadOperation Whether the operation is a read or a write + * @tparam Operation Whether the operation is a read or a write. + * @tparam PartialIOStatus Whether all requested data are processed or not. If `FULL`, all of + * `count` bytes are read or written. 
* @param fd File descriptor * @param buf Buffer to write * @param count Number of bytes to write * @param offset File offset - * @param partial If false, all of `count` bytes are read or written. * @return The number of bytes read or written (always gather than zero) */ -template -ssize_t posix_host_io(int fd, const void* buf, size_t count, off_t offset, bool partial) +template +ssize_t posix_host_io(int fd, const void* buf, size_t count, off_t offset) { off_t cur_offset = offset; size_t byte_remaining = count; char* buffer = const_cast(static_cast(buf)); while (byte_remaining > 0) { ssize_t nbytes = 0; - if constexpr (IsReadOperation) { + if constexpr (Operation == IOOperationType::READ) { nbytes = ::pread(fd, buffer, byte_remaining, cur_offset); } else { nbytes = ::pwrite(fd, buffer, byte_remaining, cur_offset); } if (nbytes == -1) { - const std::string name = IsReadOperation ? "pread" : "pwrite"; + const std::string name = Operation == IOOperationType::READ ? "pread" : "pwrite"; if (errno == EBADF) { throw CUfileException{std::string{"POSIX error on " + name + " at: "} + __FILE__ + ":" + KVIKIO_STRINGIFY(__LINE__) + ": Operation not permitted"}; @@ -114,13 +131,13 @@ ssize_t posix_host_io(int fd, const void* buf, size_t count, off_t offset, bool throw CUfileException{std::string{"POSIX error on " + name + " at: "} + __FILE__ + ":" + KVIKIO_STRINGIFY(__LINE__) + ": " + strerror(errno)}; } - if constexpr (IsReadOperation) { + if constexpr (Operation == IOOperationType::READ) { if (nbytes == 0) { throw CUfileException{std::string{"POSIX error on pread at: "} + __FILE__ + ":" + KVIKIO_STRINGIFY(__LINE__) + ": EOF"}; } } - if (partial) { return nbytes; } + if constexpr (PartialIOStatus == PartialIO::YES) { return nbytes; } buffer += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) cur_offset += nbytes; byte_remaining -= nbytes; @@ -131,7 +148,7 @@ ssize_t posix_host_io(int fd, const void* buf, size_t count, off_t offset, bool /** * @brief Read or write device memory to or from disk using POSIX * - * @tparam IsReadOperation Whether the operation is a read or a write + * @tparam Operation Whether the operation is a read or a write. * @param fd File descriptor * @param devPtr_base Device pointer to read or write to. * @param size Number of bytes to read or write. @@ -139,7 +156,7 @@ ssize_t posix_host_io(int fd, const void* buf, size_t count, off_t offset, bool * @param devPtr_offset Byte offset to the start of the device pointer. * @return Number of bytes read or written. 
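With the former `partial` flag now a template parameter, call sites select the behaviour at compile time. A sketch, assuming the template parameters are ordered `<IOOperationType, PartialIO>` as the `@tparam` documentation above suggests (the wrapper function name is made up):

#include <sys/types.h>
#include <cstddef>
#include <kvikio/posix_io.hpp>

ssize_t read_host_example(int fd, void* buf, std::size_t count, off_t offset)
{
  using namespace kvikio::detail;
  // Read exactly `count` bytes, retrying short reads until everything is processed:
  ssize_t const n_full = posix_host_io<IOOperationType::READ, PartialIO::NO>(fd, buf, count, offset);
  // Or issue a single ::pread() and accept a short read:
  // ssize_t const n_once = posix_host_io<IOOperationType::READ, PartialIO::YES>(fd, buf, count, offset);
  return n_full;
}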
*/ -template +template std::size_t posix_device_io(int fd, const void* devPtr_base, std::size_t size, @@ -158,15 +175,17 @@ std::size_t posix_device_io(int fd, while (byte_remaining > 0) { const off_t nbytes_requested = std::min(chunk_size2, byte_remaining); ssize_t nbytes_got = nbytes_requested; - if constexpr (IsReadOperation) { - nbytes_got = posix_host_io(fd, alloc.get(), nbytes_requested, cur_file_offset, true); + if constexpr (Operation == IOOperationType::READ) { + nbytes_got = posix_host_io( + fd, alloc.get(), nbytes_requested, cur_file_offset); CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(devPtr, alloc.get(), nbytes_got, stream)); CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); } else { // Is a write operation CUDA_DRIVER_TRY( cudaAPI::instance().MemcpyDtoHAsync(alloc.get(), devPtr, nbytes_requested, stream)); CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(stream)); - posix_host_io(fd, alloc.get(), nbytes_requested, cur_file_offset, false); + posix_host_io( + fd, alloc.get(), nbytes_requested, cur_file_offset); } cur_file_offset += nbytes_got; devPtr += nbytes_got; @@ -175,26 +194,26 @@ std::size_t posix_device_io(int fd, return size; } -} // namespace detail - /** * @brief Read from disk to host memory using POSIX * * If `size` or `file_offset` isn't aligned with `page_size` then * `fd` cannot have been opened with the `O_DIRECT` flag. * + * @tparam PartialIOStatus Whether all requested data are processed or not. If `FULL`, all of + * `count` bytes are read. * @param fd File descriptor * @param buf Base address of buffer in host memory. * @param size Size in bytes to read. * @param file_offset Offset in the file to read from. - * @param partial If false, all of `size` bytes are read. * @return Size of bytes that were successfully read. */ -inline std::size_t posix_host_read( - int fd, void* buf, std::size_t size, std::size_t file_offset, bool partial) +template +std::size_t posix_host_read(int fd, void* buf, std::size_t size, std::size_t file_offset) { - KVIKIO_NVTX_FUNC_RANGE("posix_host_read()", size); - return detail::posix_host_io(fd, buf, size, convert_size2off(file_offset), partial); + KVIKIO_NVTX_SCOPED_RANGE("posix_host_read()", size); + return detail::posix_host_io( + fd, buf, size, convert_size2off(file_offset)); } /** @@ -203,18 +222,20 @@ inline std::size_t posix_host_read( * If `size` or `file_offset` isn't aligned with `page_size` then * `fd` cannot have been opened with the `O_DIRECT` flag. * + * @tparam ioDataCompletionLevel Whether all requested data are processed or not. If `FULL`, all of + * `count` bytes are written. * @param fd File descriptor * @param buf Base address of buffer in host memory. * @param size Size in bytes to write. * @param file_offset Offset in the file to write to. - * @param partial If false, all of `size` bytes are written. * @return Size of bytes that were successfully read. 
*/ -inline std::size_t posix_host_write( - int fd, const void* buf, std::size_t size, std::size_t file_offset, bool partial) +template +std::size_t posix_host_write(int fd, const void* buf, std::size_t size, std::size_t file_offset) { - KVIKIO_NVTX_FUNC_RANGE("posix_host_write()", size); - return detail::posix_host_io(fd, buf, size, convert_size2off(file_offset), partial); + KVIKIO_NVTX_SCOPED_RANGE("posix_host_write()", size); + return detail::posix_host_io( + fd, buf, size, convert_size2off(file_offset)); } /** @@ -236,8 +257,9 @@ inline std::size_t posix_device_read(int fd, std::size_t file_offset, std::size_t devPtr_offset) { - KVIKIO_NVTX_FUNC_RANGE("posix_device_read()", size); - return detail::posix_device_io(fd, devPtr_base, size, file_offset, devPtr_offset); + KVIKIO_NVTX_SCOPED_RANGE("posix_device_read()", size); + return detail::posix_device_io( + fd, devPtr_base, size, file_offset, devPtr_offset); } /** @@ -259,8 +281,9 @@ inline std::size_t posix_device_write(int fd, std::size_t file_offset, std::size_t devPtr_offset) { - KVIKIO_NVTX_FUNC_RANGE("posix_device_write()", size); - return detail::posix_device_io(fd, devPtr_base, size, file_offset, devPtr_offset); + KVIKIO_NVTX_SCOPED_RANGE("posix_device_write()", size); + return detail::posix_device_io( + fd, devPtr_base, size, file_offset, devPtr_offset); } -} // namespace kvikio +} // namespace kvikio::detail diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp new file mode 100644 index 0000000000..e1b152b23c --- /dev/null +++ b/cpp/include/kvikio/remote_handle.hpp @@ -0,0 +1,446 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace kvikio { +namespace detail { + +/** + * @brief Bounce buffer in pinned host memory. + * + * @note Is not thread-safe. + */ +class BounceBufferH2D { + CUstream _stream; // The CUDA stream to use. + CUdeviceptr _dev; // The output device buffer. + AllocRetain::Alloc _host_buffer; // The host buffer to bounce data on. + std::ptrdiff_t _dev_offset{0}; // Number of bytes written to `_dev`. + std::ptrdiff_t _host_offset{0}; // Number of bytes written to `_host` (resets on flush). + + public: + /** + * @brief Create a bounce buffer for an output device buffer. + * + * @param stream The CUDA stream used throughout the lifetime of the bounce buffer. + * @param device_buffer The output device buffer (final destination of the data). + */ + BounceBufferH2D(CUstream stream, void* device_buffer) + : _stream{stream}, + _dev{convert_void2deviceptr(device_buffer)}, + _host_buffer{AllocRetain::instance().get()} + { + } + + /** + * @brief The bounce buffer if flushed to device on destruction. 
+ */ + ~BounceBufferH2D() noexcept + { + try { + flush(); + } catch (CUfileException const& e) { + std::cerr << "BounceBufferH2D error on final flush: "; + std::cerr << e.what(); + std::cerr << std::endl; + } + } + + private: + /** + * @brief Write host memory to the output device buffer. + * + * @param src The host memory source. + * @param size Number of bytes to write. + */ + void write_to_device(void const* src, std::size_t size) + { + if (size > 0) { + CUDA_DRIVER_TRY(cudaAPI::instance().MemcpyHtoDAsync(_dev + _dev_offset, src, size, _stream)); + CUDA_DRIVER_TRY(cudaAPI::instance().StreamSynchronize(_stream)); + _dev_offset += size; + } + } + + /** + * @brief Flush the bounce buffer by writing everything to the output device buffer. + */ + void flush() + { + write_to_device(_host_buffer.get(), _host_offset); + _host_offset = 0; + } + + public: + /** + * @brief Write host memory to the bounce buffer (also host memory). + * + * Only when the bounce buffer has been filled up is data copied to the output device buffer. + * + * @param data The host memory source. + * @param size Number of bytes to write. + */ + void write(char const* data, std::size_t size) + { + if (_host_buffer.size() - _host_offset < size) { // Not enough space left in the bounce buffer + flush(); + assert(_host_offset == 0); + } + if (_host_buffer.size() < size) { + // If still not enough space, we just copy the data to the device. This only happens when + // `defaults::bounce_buffer_size()` is smaller than 16kb thus no need to performance + // optimize for this case. + write_to_device(data, size); + } else if (size > 0) { + std::memcpy(_host_buffer.get(_host_offset), data, size); + _host_offset += size; + } + } +}; + +} // namespace detail + +class CurlHandle; // Prototype + +/** + * @brief Abstract base class for remote endpoints. + * + * In this context, an endpoint refers to a remote file using a specific communication protocol. + * + * Each communication protocol, such as HTTP or S3, needs to implement this ABC and implement + * its own ctor that takes communication protocol specific arguments. + */ +class RemoteEndpoint { + public: + /** + * @brief Set needed connection options on a curl handle. + * + * Subsequently, a call to `curl.perform()` should connect to the endpoint. + * + * @param curl The curl handle. + */ + virtual void setopt(CurlHandle& curl) = 0; + + /** + * @brief Get a description of this remote point instance. + * + * @returns A string description. + */ + virtual std::string str() const = 0; + + virtual ~RemoteEndpoint() = default; +}; + +/** + * @brief A remote endpoint using http. + */ +class HttpEndpoint : public RemoteEndpoint { + private: + std::string _url; + + public: + /** + * @brief Create an http endpoint from a url. + * + * @param url The full http url to the remote file. + */ + HttpEndpoint(std::string url) : _url{std::move(url)} {} + void setopt(CurlHandle& curl) override; + std::string str() const override { return _url; } + ~HttpEndpoint() override = default; +}; + +/** + * @brief A remote endpoint using AWS's S3 protocol. + */ +class S3Endpoint : public RemoteEndpoint { + private: + std::string _url; + std::string _aws_sigv4; + std::string _aws_userpwd; + + /** + * @brief Unwrap an optional parameter, obtaining a default from the environment. + * + * If not nullopt, the optional's value is returned. Otherwise, the environment + * variable `env_var` is used. If that also doesn't have a value: + * - if `err_msg` is empty, the empty string is returned. 
+ * - if `err_msg` is not empty, `std::invalid_argument(`err_msg`)` is thrown. + * + * @param value The value to unwrap. + * @param env_var The name of the environment variable to check if `value` isn't set. + * @param err_msg The error message to throw on error or the empty string. + * @return The parsed AWS argument or the empty string. + */ + static std::string unwrap_or_default(std::optional aws_arg, + std::string const& env_var, + std::string const& err_msg = "") + { + if (aws_arg.has_value()) { return std::move(*aws_arg); } + + char const* env = std::getenv(env_var.c_str()); + if (env == nullptr) { + if (err_msg.empty()) { return std::string(); } + throw std::invalid_argument(err_msg); + } + return std::string(env); + } + + public: + /** + * @brief Get url from a AWS S3 bucket and object name. + * + * @throws std::invalid_argument if no region is specified and no default region is + * specified in the environment. + * + * @param bucket_name The name of the S3 bucket. + * @param object_name The name of the S3 object. + * @param aws_region The AWS region, such as "us-east-1", to use. If nullopt, the value of the + * `AWS_DEFAULT_REGION` environment variable is used. + * @param aws_endpoint_url Overwrite the endpoint url (including the protocol part) by using + * the scheme: "//". If nullopt, the value of the + * `AWS_ENDPOINT_URL` environment variable is used. If this is also not set, the regular AWS + * url scheme is used: "https://.s3..amazonaws.com/". + */ + static std::string url_from_bucket_and_object(std::string const& bucket_name, + std::string const& object_name, + std::optional const& aws_region, + std::optional aws_endpoint_url) + { + auto const endpoint_url = unwrap_or_default(std::move(aws_endpoint_url), "AWS_ENDPOINT_URL"); + std::stringstream ss; + if (endpoint_url.empty()) { + auto const region = + unwrap_or_default(std::move(aws_region), + "AWS_DEFAULT_REGION", + "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); + // We default to the official AWS url scheme. + ss << "https://" << bucket_name << ".s3." << region << ".amazonaws.com/" << object_name; + } else { + ss << endpoint_url << "/" << bucket_name << "/" << object_name; + } + return ss.str(); + } + + /** + * @brief Given an url like "s3:///", return the name of the bucket and object. + * + * @throws std::invalid_argument if url is ill-formed or is missing the bucket or object name. + * + * @param s3_url S3 url. + * @return Pair of strings: [bucket-name, object-name]. + */ + [[nodiscard]] static std::pair parse_s3_url(std::string const& s3_url) + { + // Regular expression to match s3:/// + std::regex const pattern{R"(^s3://([^/]+)/(.+))", std::regex_constants::icase}; + std::smatch matches; + if (std::regex_match(s3_url, matches, pattern)) { return {matches[1].str(), matches[2].str()}; } + throw std::invalid_argument("Input string does not match the expected S3 URL format."); + } + + /** + * @brief Create a S3 endpoint from a url. + * + * @param url The full http url to the S3 file. NB: this should be an url starting with + * "http://" or "https://". If you have an S3 url of the form "s3:///", please + * use `S3Endpoint::parse_s3_url()` and `S3Endpoint::url_from_bucket_and_object() to convert it. + * @param aws_region The AWS region, such as "us-east-1", to use. If nullopt, the value of the + * `AWS_DEFAULT_REGION` environment variable is used. + * @param aws_access_key The AWS access key to use. If nullopt, the value of the + * `AWS_ACCESS_KEY_ID` environment variable is used. 
+ * @param aws_secret_access_key The AWS secret access key to use. If nullopt, the value of the + * `AWS_SECRET_ACCESS_KEY` environment variable is used. + */ + S3Endpoint(std::string url, + std::optional aws_region = std::nullopt, + std::optional aws_access_key = std::nullopt, + std::optional aws_secret_access_key = std::nullopt) + : _url{std::move(url)} + { + // Regular expression to match http[s]:// + std::regex pattern{R"(^https?://.*)", std::regex_constants::icase}; + if (!std::regex_search(_url, pattern)) { + throw std::invalid_argument("url must start with http:// or https://"); + } + + auto const region = + unwrap_or_default(std::move(aws_region), + "AWS_DEFAULT_REGION", + "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); + + auto const access_key = + unwrap_or_default(std::move(aws_access_key), + "AWS_ACCESS_KEY_ID", + "S3: must provide `aws_access_key` if AWS_ACCESS_KEY_ID isn't set."); + + auto const secret_access_key = unwrap_or_default( + std::move(aws_secret_access_key), + "AWS_SECRET_ACCESS_KEY", + "S3: must provide `aws_secret_access_key` if AWS_SECRET_ACCESS_KEY isn't set."); + + // Create the CURLOPT_AWS_SIGV4 option + { + std::stringstream ss; + ss << "aws:amz:" << region << ":s3"; + _aws_sigv4 = ss.str(); + } + // Create the CURLOPT_USERPWD option + // Notice, curl uses `secret_access_key` to generate a AWS V4 signature. It is NOT included + // in the http header. See + // + { + std::stringstream ss; + ss << access_key << ":" << secret_access_key; + _aws_userpwd = ss.str(); + } + } + + /** + * @brief Create a S3 endpoint from a bucket and object name. + * + * @param bucket_name The name of the S3 bucket. + * @param object_name The name of the S3 object. + * @param aws_region The AWS region, such as "us-east-1", to use. If nullopt, the value of the + * `AWS_DEFAULT_REGION` environment variable is used. + * @param aws_access_key The AWS access key to use. If nullopt, the value of the + * `AWS_ACCESS_KEY_ID` environment variable is used. + * @param aws_secret_access_key The AWS secret access key to use. If nullopt, the value of the + * `AWS_SECRET_ACCESS_KEY` environment variable is used. + * @param aws_endpoint_url Overwrite the endpoint url (including the protocol part) by using + * the scheme: "//". If nullopt, the value of the + * `AWS_ENDPOINT_URL` environment variable is used. If this is also not set, the regular AWS + * url scheme is used: "https://.s3..amazonaws.com/". + */ + S3Endpoint(std::string const& bucket_name, + std::string const& object_name, + std::optional aws_region = std::nullopt, + std::optional aws_access_key = std::nullopt, + std::optional aws_secret_access_key = std::nullopt, + std::optional aws_endpoint_url = std::nullopt) + : S3Endpoint(url_from_bucket_and_object( + bucket_name, object_name, aws_region, std::move(aws_endpoint_url)), + std::move(aws_region), + std::move(aws_access_key), + std::move(aws_secret_access_key)) + { + } + + void setopt(CurlHandle& curl) override; + std::string str() const override { return _url; } + ~S3Endpoint() override = default; +}; + +/** + * @brief Handle of remote file. + */ +class RemoteHandle { + private: + std::unique_ptr _endpoint; + std::size_t _nbytes; + + public: + /** + * @brief Create a new remote handle from an endpoint and a file size. + * + * @param endpoint Remote endpoint used for subsequent IO. + * @param nbytes The size of the remote file (in bytes). 
+ */ + RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) + : _endpoint{std::move(endpoint)}, _nbytes{nbytes} + { + } + + /** + * @brief Create a new remote handle from an endpoint (infers the file size). + * + * The file size is received from the remote server using `endpoint`. + * + * @param endpoint Remote endpoint used for subsequently IO. + */ + RemoteHandle(std::unique_ptr endpoint); + + // A remote handle is moveable but not copyable. + RemoteHandle(RemoteHandle&& o) = default; + RemoteHandle& operator=(RemoteHandle&& o) = default; + RemoteHandle(RemoteHandle const&) = delete; + RemoteHandle& operator=(RemoteHandle const&) = delete; + + /** + * @brief Get the file size. + * + * Note, this is very fast, no communication needed. + * + * @return The number of bytes. + */ + [[nodiscard]] std::size_t nbytes() const noexcept { return _nbytes; } + + /** + * @brief Get a const reference to the underlying remote endpoint. + * + * @return The remote endpoint. + */ + [[nodiscard]] RemoteEndpoint const& endpoint() const noexcept { return *_endpoint; } + + /** + * @brief Read from remote source into buffer (host or device memory). + * + * When reading into device memory, a bounce buffer is used to avoid many small memory + * copies to device. Use `kvikio::default::bounce_buffer_size_reset()` to set the size + * of this bounce buffer (default 16 MiB). + * + * @param buf Pointer to host or device memory. + * @param size Number of bytes to read. + * @param file_offset File offset in bytes. + * @return Number of bytes read, which is always `size`. + */ + std::size_t read(void* buf, std::size_t size, std::size_t file_offset = 0); + + /** + * @brief Read from remote source into buffer (host or device memory) in parallel. + * + * This API is a parallel async version of `.read()` that partitions the operation + * into tasks of size `task_size` for execution in the default thread pool. + * + * @param buf Pointer to host or device memory. + * @param size Number of bytes to read. + * @param file_offset File offset in bytes. + * @param task_size Size of each task in bytes. + * @return Future that on completion returns the size of bytes read, which is always `size`. + */ + std::future pread(void* buf, + std::size_t size, + std::size_t file_offset = 0, + std::size_t task_size = defaults::task_size()); +}; + +} // namespace kvikio diff --git a/cpp/include/kvikio/shim/cuda.hpp b/cpp/include/kvikio/shim/cuda.hpp index 5d42bd0dcb..606a618736 100644 --- a/cpp/include/kvikio/shim/cuda.hpp +++ b/cpp/include/kvikio/shim/cuda.hpp @@ -85,7 +85,7 @@ class cudaAPI { cudaAPI(cudaAPI const&) = delete; void operator=(cudaAPI const&) = delete; - static cudaAPI& instance() + KVIKIO_EXPORT static cudaAPI& instance() { static cudaAPI _instance; return _instance; diff --git a/cpp/include/kvikio/shim/cuda_h_wrapper.hpp b/cpp/include/kvikio/shim/cuda_h_wrapper.hpp index 0740c99f31..c0fd55e0c5 100644 --- a/cpp/include/kvikio/shim/cuda_h_wrapper.hpp +++ b/cpp/include/kvikio/shim/cuda_h_wrapper.hpp @@ -27,13 +27,24 @@ #else // If CUDA isn't defined, we define some of the data types here. -// Notice, this doesn't need to be ABI compatible with the CUDA definitions. +// Notice, the functions and constant values don't need to match the CUDA +// definitions, but the types *do*, since downstream libraries dlsym()-ing +// the symbols at runtime rely on accurate type definitions. If we mismatch +// here, then those libraries will get "mismatched type alias redefinition" +// errors when they include our headers. 
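Returning to the remote-file interface declared above, a hedged end-to-end sketch (the URL, bucket and object names are made up; only constructors and methods shown in `remote_handle.hpp` are used):

#include <memory>
#include <utility>
#include <vector>
#include <kvikio/remote_handle.hpp>

int main()
{
  auto ep = std::make_unique<kvikio::HttpEndpoint>("https://example.com/data.bin");
  kvikio::RemoteHandle remote{std::move(ep)};  // file size is queried from the server
  std::vector<char> buf(remote.nbytes());
  remote.read(buf.data(), buf.size());         // host or device buffer; device reads are bounce-buffered

  // For S3, credentials fall back to the AWS_* environment variables when not passed explicitly:
  // kvikio::S3Endpoint s3{"my-bucket", "my-object"};
  return 0;
}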
-using CUresult = int; +#if defined(_WIN64) || defined(__LP64__) +// Don't use uint64_t, we want to match the driver headers exactly using CUdeviceptr = unsigned long long; -using CUdevice = int; -using CUcontext = void*; -using CUstream = void*; +#else +using CUdeviceptr = unsigned int; +#endif +static_assert(sizeof(CUdeviceptr) == sizeof(void*)); + +using CUresult = int; +using CUdevice = int; +using CUcontext = struct CUctx_st*; +using CUstream = struct CUstream_st*; #define CUDA_ERROR_STUB_LIBRARY 0 #define CUDA_SUCCESS 0 diff --git a/cpp/include/kvikio/shim/cufile.hpp b/cpp/include/kvikio/shim/cufile.hpp index 354b435b6a..5194d45e74 100644 --- a/cpp/include/kvikio/shim/cufile.hpp +++ b/cpp/include/kvikio/shim/cufile.hpp @@ -16,8 +16,8 @@ #pragma once #include +#include -#include #include #include @@ -38,8 +38,6 @@ class cuFileAPI { decltype(cuFileWrite)* Write{nullptr}; decltype(cuFileBufRegister)* BufRegister{nullptr}; decltype(cuFileBufDeregister)* BufDeregister{nullptr}; - decltype(cuFileDriverOpen)* DriverOpen{nullptr}; - decltype(cuFileDriverClose)* DriverClose{nullptr}; decltype(cuFileDriverGetProperties)* DriverGetProperties{nullptr}; decltype(cuFileDriverSetPollMode)* DriverSetPollMode{nullptr}; decltype(cuFileDriverSetMaxCacheSize)* DriverSetMaxCacheSize{nullptr}; @@ -54,7 +52,16 @@ class cuFileAPI { decltype(cuFileStreamRegister)* StreamRegister{nullptr}; decltype(cuFileStreamDeregister)* StreamDeregister{nullptr}; - bool stream_available = false; + private: + // Don't call driver open and close directly, use `.driver_open()` and `.driver_close()`. + decltype(cuFileDriverOpen)* DriverOpen{nullptr}; + decltype(cuFileDriverClose)* DriverClose{nullptr}; + + // Don't call `GetVersion` directly, use `cuFileAPI::instance().version`. + decltype(cuFileGetVersion)* GetVersion{nullptr}; + + public: + int version{0}; private: #ifdef KVIKIO_CUFILE_FOUND @@ -84,46 +91,52 @@ class cuFileAPI { get_symbol(DriverSetMaxCacheSize, lib, KVIKIO_STRINGIFY(cuFileDriverSetMaxCacheSize)); get_symbol(DriverSetMaxPinnedMemSize, lib, KVIKIO_STRINGIFY(cuFileDriverSetMaxPinnedMemSize)); -#ifdef KVIKIO_CUFILE_BATCH_API_FOUND - get_symbol(BatchIOSetUp, lib, KVIKIO_STRINGIFY(cuFileBatchIOSetUp)); - get_symbol(BatchIOSubmit, lib, KVIKIO_STRINGIFY(cuFileBatchIOSubmit)); - get_symbol(BatchIOGetStatus, lib, KVIKIO_STRINGIFY(cuFileBatchIOGetStatus)); - get_symbol(BatchIOCancel, lib, KVIKIO_STRINGIFY(cuFileBatchIOCancel)); - get_symbol(BatchIODestroy, lib, KVIKIO_STRINGIFY(cuFileBatchIODestroy)); -#endif - -#ifdef KVIKIO_CUFILE_STREAM_API_FOUND - get_symbol(ReadAsync, lib, KVIKIO_STRINGIFY(cuFileReadAsync)); - get_symbol(WriteAsync, lib, KVIKIO_STRINGIFY(cuFileWriteAsync)); - get_symbol(StreamRegister, lib, KVIKIO_STRINGIFY(cuFileStreamRegister)); - get_symbol(StreamDeregister, lib, KVIKIO_STRINGIFY(cuFileStreamDeregister)); +#ifdef KVIKIO_CUFILE_VERSION_API_FOUND try { - void* s{}; - get_symbol(s, lib, "cuFileReadAsync"); - stream_available = true; - } catch (const std::runtime_error&) { + get_symbol(GetVersion, lib, KVIKIO_STRINGIFY(cuFileGetVersion)); + int ver; + CUfileError_t const error = GetVersion(&ver); + if (error.err == CU_FILE_SUCCESS) { version = ver; } + } catch (std::runtime_error const&) { } #endif - // cuFile is supposed to open and close the driver automatically but because of a bug in - // CUDA 11.8, it sometimes segfault. See . 
- CUfileError_t const error = DriverOpen(); - if (error.err != CU_FILE_SUCCESS) { - throw std::runtime_error(std::string{"cuFile error at: "} + __FILE__ + ":" + - KVIKIO_STRINGIFY(__LINE__) + ": " + - cufileop_status_error(error.err)); + // Some symbols were introduced in later versions, so version guards are required. + // Note: `version` is 0 for cuFile versions prior to v1.8 because `cuFileGetVersion` + // did not exist. As a result, the batch and stream APIs are not loaded in versions + // 1.6 and 1.7, respectively, even though they are available. This trade-off is made + // for improved robustness. + if (version >= 1060) { + get_symbol(BatchIOSetUp, lib, KVIKIO_STRINGIFY(cuFileBatchIOSetUp)); + get_symbol(BatchIOSubmit, lib, KVIKIO_STRINGIFY(cuFileBatchIOSubmit)); + get_symbol(BatchIOGetStatus, lib, KVIKIO_STRINGIFY(cuFileBatchIOGetStatus)); + get_symbol(BatchIOCancel, lib, KVIKIO_STRINGIFY(cuFileBatchIOCancel)); + get_symbol(BatchIODestroy, lib, KVIKIO_STRINGIFY(cuFileBatchIODestroy)); } + if (version >= 1070) { + get_symbol(ReadAsync, lib, KVIKIO_STRINGIFY(cuFileReadAsync)); + get_symbol(WriteAsync, lib, KVIKIO_STRINGIFY(cuFileWriteAsync)); + get_symbol(StreamRegister, lib, KVIKIO_STRINGIFY(cuFileStreamRegister)); + get_symbol(StreamDeregister, lib, KVIKIO_STRINGIFY(cuFileStreamDeregister)); + } + + // cuFile is supposed to open and close the driver automatically but + // because of a bug in cuFile v1.4 (CUDA v11.8) it sometimes segfaults: + // . + if (version < 1050) { driver_open(); } } + + // Notice, we have to close the driver at program exit (if we opened it) even though we are + // not allowed to call CUDA after main[1]. This is because, cuFile will segfault if the + // driver isn't closed on program exit i.e. we are doomed if we do, doomed if we don't, but + // this seems to be the lesser of two evils. + // [1] ~cuFileAPI() { - CUfileError_t const error = DriverClose(); - if (error.err != CU_FILE_SUCCESS) { - std::cerr << "Unable to close GDS file driver: " << cufileop_status_error(error.err) - << std::endl; - } + if (version < 1050) { driver_close(); } } #else - cuFileAPI() { throw std::runtime_error(CUFILE_ERRSTR(0)); } + cuFileAPI() { throw std::runtime_error("KvikIO not compiled with cuFile.h"); } #endif public: @@ -132,11 +145,38 @@ class cuFileAPI { cuFileAPI(cuFileAPI const&&) = delete; void operator=(cuFileAPI const&&) = delete; - static cuFileAPI& instance() + KVIKIO_EXPORT static cuFileAPI& instance() { static cuFileAPI _instance; return _instance; } + + /** + * @brief Open the cuFile driver + * + * cuFile allows multiple calls to `cufileDriverOpen()`, only the first call opens + * the driver, but every call should have a matching call to `cufileDriverClose()`. + */ + void driver_open() + { + CUfileError_t const error = DriverOpen(); + if (error.err != CU_FILE_SUCCESS) { + throw std::runtime_error(std::string{"Unable to open GDS file driver: "} + + cufileop_status_error(error.err)); + } + } + + /** + * @brief Close the cuFile driver + */ + void driver_close() + { + CUfileError_t const error = DriverClose(); + if (error.err != CU_FILE_SUCCESS) { + throw std::runtime_error(std::string{"Unable to close GDS file driver: "} + + cufileop_status_error(error.err)); + } + } }; /** @@ -174,25 +214,49 @@ inline bool is_cufile_available() } /** - * @brief Check if cuFile's batch and stream API is available + * @brief Get cufile version (or zero if older than v1.8). 
* - * Technically, the batch API is available in CUDA 12.1 but since there is no good - * way to check CUDA version using the driver API, we check for the existing of the - * `cuFileReadAsync` symbol, which is defined in CUDA 12.2+. + * The version is returned as (1000*major + 10*minor). E.g., cufile v1.8.0 would + * be represented by 1080. * - * @return The boolean answer + * Notice, this is not the version of the CUDA toolkit. cufile is part of the + * toolkit but follows its own version scheme. + * + * @return The version (1000*major + 10*minor) or zero if older than 1080. */ -#if defined(KVIKIO_CUFILE_STREAM_API_FOUND) && defined(KVIKIO_CUFILE_STREAM_API_FOUND) -inline bool is_batch_and_stream_available() +#ifdef KVIKIO_CUFILE_FOUND +inline int cufile_version() { try { - return is_cufile_available() && cuFileAPI::instance().stream_available; - } catch (const std::runtime_error&) { - return false; + return cuFileAPI::instance().version; + } catch (std::runtime_error const&) { + return 0; } } #else -constexpr bool is_batch_and_stream_available() { return false; } +constexpr int cufile_version() { return 0; } #endif +/** + * @brief Check if cuFile's batch API is available. + * + * Since `cuFileGetVersion()` first became available in cufile v1.8 (CTK v12.3), + * this function returns false for versions older than v1.8 even though the batch + * API became available in v1.6. + * + * @return The boolean answer + */ +inline bool is_batch_api_available() noexcept { return cufile_version() >= 1060; } + +/** + * @brief Check if cuFile's stream (async) API is available. + * + * Since `cuFileGetVersion()` first became available in cufile v1.8 (CTK v12.3), + * this function returns false for versions older than v1.8 even though the stream + * API became available in v1.7. + * + * @return The boolean answer + */ +inline bool is_stream_api_available() noexcept { return cufile_version() >= 1070; } + } // namespace kvikio diff --git a/cpp/include/kvikio/shim/cufile_h_wrapper.hpp b/cpp/include/kvikio/shim/cufile_h_wrapper.hpp index 33c3fee9a2..1c13d2d8a1 100644 --- a/cpp/include/kvikio/shim/cufile_h_wrapper.hpp +++ b/cpp/include/kvikio/shim/cufile_h_wrapper.hpp @@ -75,7 +75,7 @@ CUfileError_t cuFileDriverSetMaxPinnedMemSize(...); #endif -// If the Batch API isn't defined, we define some of the data types here. +// If some cufile APIs aren't defined, we define some of the data types here. // Notice, this doesn't need to be ABI compatible with the cufile definitions and // the lack of definitions is not a problem because the linker will never look for // these symbols because the "real" function calls are made through the shim instance. @@ -105,10 +105,13 @@ CUfileError_t cuFileBatchIOCancel(...); CUfileError_t cuFileBatchIODestroy(...); #endif -// If the Stream API isn't defined, we define some of the data types here. #ifndef KVIKIO_CUFILE_STREAM_API_FOUND CUfileError_t cuFileReadAsync(...); CUfileError_t cuFileWriteAsync(...); CUfileError_t cuFileStreamRegister(...); CUfileError_t cuFileStreamDeregister(...); #endif + +#ifndef KVIKIO_CUFILE_VERSION_API_FOUND +CUfileError_t cuFileGetVersion(...); +#endif diff --git a/cpp/include/kvikio/shim/libcurl.hpp b/cpp/include/kvikio/shim/libcurl.hpp new file mode 100644 index 0000000000..423eff9c60 --- /dev/null +++ b/cpp/include/kvikio/shim/libcurl.hpp @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
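A short sketch of how the cuFile version helpers above are intended to be consumed (the header path is an assumption; `cufile_version()`, `is_batch_api_available()`, `is_stream_api_available()`, `version`, and `driver_open()`/`driver_close()` are all introduced in this diff):

```cpp
// Hedged sketch: decode the (1000*major + 10*minor) encoding and gate
// optional cuFile features on it.
#include <cstdio>
#include <kvikio/shim/cufile.hpp>  // assumed header path

int main()
{
  int const v = kvikio::cufile_version();  // 0 if cuFile is missing or older than v1.8
  std::printf("cuFile v%d.%d (raw %d)\n", v / 1000, (v % 1000) / 10, v);

  if (kvikio::is_batch_api_available()) { std::printf("batch API loaded\n"); }
  if (kvikio::is_stream_api_available()) { std::printf("stream (async) API loaded\n"); }

  // For version >= 1050 the shim no longer opens the driver in its constructor,
  // so an application that wants it open up front can do so explicitly.
  if (kvikio::is_cufile_available() && v >= 1050) {
    auto& api = kvikio::cuFileAPI::instance();
    api.driver_open();   // cuFile reference-counts opens...
    api.driver_close();  // ...so every open needs a matching close.
  }
  return 0;
}
```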
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#ifndef KVIKIO_LIBCURL_FOUND +#error \ + "cannot include the remote IO API, please build KvikIO with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace kvikio { + +/** + * @brief Singleton class to initialize and cleanup the global state of libcurl + * + * Notice, libcurl allows the use of a singleton class: + * + * In a C++ module, it is common to deal with the global constant situation by defining a special + * class that represents the global constant environment of the module. A program always has exactly + * one object of the class, in static storage. That way, the program automatically calls the + * constructor of the object as the program starts up and the destructor as it terminates. As the + * author of this libcurl-using module, you can make the constructor call curl_global_init and the + * destructor call curl_global_cleanup and satisfy libcurl's requirements without your user having + * to think about it. (Caveat: If you are initializing libcurl from a Windows DLL you should not + * initialize it from DllMain or a static initializer because Windows holds the loader lock during + * that time and it could cause a deadlock.) + * + * Source . + */ +class LibCurl { + public: + // We hold a unique pointer to the raw curl handle and set `curl_easy_cleanup` as its Deleter. + using UniqueHandlePtr = std::unique_ptr>; + + private: + std::mutex _mutex{}; + // Curl handles free to be used. + std::vector _free_curl_handles{}; + + LibCurl() + { + CURLcode err = curl_global_init(CURL_GLOBAL_DEFAULT); + if (err != CURLE_OK) { + throw std::runtime_error("cannot initialize libcurl - errorcode: " + std::to_string(err)); + } + curl_version_info_data* ver = curl_version_info(::CURLVERSION_NOW); + if ((ver->features & CURL_VERSION_THREADSAFE) == 0) { + throw std::runtime_error("cannot initialize libcurl - built with thread safety disabled"); + } + } + ~LibCurl() noexcept + { + _free_curl_handles.clear(); + curl_global_cleanup(); + } + + public: + static LibCurl& instance() + { + static LibCurl _instance; + return _instance; + } + + /** + * @brief Returns a free curl handle if available. + */ + UniqueHandlePtr get_free_handle() + { + UniqueHandlePtr ret; + std::lock_guard const lock(_mutex); + if (!_free_curl_handles.empty()) { + ret = std::move(_free_curl_handles.back()); + _free_curl_handles.pop_back(); + } + return ret; + } + + /** + * @brief Returns a curl handle, create a new handle if none is available. + */ + UniqueHandlePtr get_handle() + { + // Check if we have a free handle available. + UniqueHandlePtr ret = get_free_handle(); + if (ret) { + curl_easy_reset(ret.get()); + } else { + // If not, we create a new handle. + CURL* raw_handle = curl_easy_init(); + if (raw_handle == nullptr) { + throw std::runtime_error("libcurl: call to curl_easy_init() failed"); + } + ret = UniqueHandlePtr(raw_handle, curl_easy_cleanup); + } + return ret; + } + + /** + * @brief Retain a curl handle for later use. 
+ */ + void retain_handle(UniqueHandlePtr handle) + { + std::lock_guard const lock(_mutex); + _free_curl_handles.push_back(std::move(handle)); + } +}; + +/** + * @brief Representation of a curl easy handle pointer and its operations. + * + * An instance is given a `LibCurl::UniqueHandlePtr` on creation, which is + * later retained on destruction. + */ +class CurlHandle { + private: + char _errbuf[CURL_ERROR_SIZE]; + LibCurl::UniqueHandlePtr _handle; + std::string _source_file; + std::string _source_line; + + public: + /** + * @brief Construct a new curl handle. + * + * Typically, do not call this directly instead use the `create_curl_handle()` macro. + * + * @param handle An unused curl easy handle pointer, which is retained on destruction. + * @param source_file Path of source file of the caller (for error messages). + * @param source_line Line of source file of the caller (for error messages). + */ + CurlHandle(LibCurl::UniqueHandlePtr handle, std::string source_file, std::string source_line) + : _handle{std::move(handle)}, + _source_file(std::move(source_file)), + _source_line(std::move(source_line)) + { + // Need CURLOPT_NOSIGNAL to support threading, see + // + setopt(CURLOPT_NOSIGNAL, 1L); + + // We always set CURLOPT_ERRORBUFFER to get better error messages. + _errbuf[0] = 0; // Set the error buffer as empty. + setopt(CURLOPT_ERRORBUFFER, _errbuf); + + // Make curl_easy_perform() fail when receiving HTTP code errors. + setopt(CURLOPT_FAILONERROR, 1L); + } + ~CurlHandle() noexcept { LibCurl::instance().retain_handle(std::move(_handle)); } + + /** + * @brief CurlHandle support is not movable or copyable. + */ + CurlHandle(CurlHandle const&) = delete; + CurlHandle& operator=(CurlHandle const&) = delete; + CurlHandle(CurlHandle&& o) = delete; + CurlHandle& operator=(CurlHandle&& o) = delete; + + /** + * @brief Get the underlying curl easy handle pointer. + */ + CURL* handle() noexcept { return _handle.get(); } + + /** + * @brief Set option for the curl handle. + * + * See for available options. + * + * @tparam VAL The type of the value. + * @param option The curl option to set. + */ + template + void setopt(CURLoption option, VAL value) + { + CURLcode err = curl_easy_setopt(handle(), option, value); + if (err != CURLE_OK) { + std::stringstream ss; + ss << "curl_easy_setopt() error near " << _source_file << ":" << _source_line; + ss << "(" << curl_easy_strerror(err) << ")"; + throw std::runtime_error(ss.str()); + } + } + + /** + * @brief Perform a blocking network transfer using previously set options. + * + * See . + */ + void perform() + { + // Perform the curl operation and check for errors. + CURLcode err = curl_easy_perform(handle()); + if (err != CURLE_OK) { + std::string msg(_errbuf); // We can do this because we always initialize `_errbuf` as empty. + std::stringstream ss; + ss << "curl_easy_perform() error near " << _source_file << ":" << _source_line; + if (msg.empty()) { + ss << "(" << curl_easy_strerror(err) << ")"; + } else { + ss << "(" << msg << ")"; + } + throw std::runtime_error(ss.str()); + } + } + + /** + * @brief Extract information from a curl handle. + * + * See for available options. + * + * @tparam OUTPUT The type of the output. + * @param output The output, which is used as-is: `curl_easy_getinfo(..., output)`. 
+ */ + template + void getinfo(CURLINFO info, OUTPUT* output) + { + CURLcode err = curl_easy_getinfo(handle(), info, output); + if (err != CURLE_OK) { + std::stringstream ss; + ss << "curl_easy_getinfo() error near " << _source_file << ":" << _source_line; + ss << "(" << curl_easy_strerror(err) << ")"; + throw std::runtime_error(ss.str()); + } + } +}; + +namespace detail { +/** + * @brief Fix Conda's manipulation of __FILE__. + * + * Conda manipulates the path information in its shared libraries[1] with the results that the + * C macro `__FILE__` might contain trailing `\0` chars. Normally, this isn't a problem because + * `__FILE__` is a `const char*` that are terminated by the first encounter of `\0`. However, when + * creating a `std::string` from a `char*`, the compiler might optimize the code such that the + * `std::string` is created from the full size of `__FILE__` including the trailing `\0` chars. + * + * The extra `\0` is problematic if `CurlHandle` later throws an exception to Cython since, while + * converting the exception to Python, Cython might truncate the error message. + * + * [1] + */ +__attribute__((noinline)) inline std::string fix_conda_file_path_hack(std::string filename) +{ + if (filename.data() != nullptr) { return std::string{filename.data()}; } + return std::string{}; +} +} // namespace detail + +/** + * @brief Create a new curl handle. + * + * @returns A `kvikio::CurlHandle` instance ready to be used. + */ +#define create_curl_handle() \ + kvikio::CurlHandle(kvikio::LibCurl::instance().get_handle(), \ + kvikio::detail::fix_conda_file_path_hack(__FILE__), \ + KVIKIO_STRINGIFY(__LINE__)) + +} // namespace kvikio diff --git a/cpp/include/kvikio/shim/utils.hpp b/cpp/include/kvikio/shim/utils.hpp index 7aaf78f4bd..7a3c439899 100644 --- a/cpp/include/kvikio/shim/utils.hpp +++ b/cpp/include/kvikio/shim/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,20 @@ namespace kvikio { +// Macros used for defining symbol visibility. +// Since KvikIO declares global default values in headers, we rely on the linker to disambiguate +// inline and static methods that have (or return) static references. To do this, the relevant +// function/method must have `__attribute__((visibility("default")))`. If not, then if KvikIO is +// used in two different DSOs, the function will appear twice, and there will be two static objects. +// See and . 
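As a sanity check on the `CurlHandle` + `create_curl_handle()` design above, a hedged sketch of the intended call pattern, mirroring what `RemoteHandle`'s size-inferring constructor does later in this diff (the header path and URL are assumptions):

```cpp
// Hedged sketch: fetch the Content-Length of a URL through the new wrapper.
#include <iostream>
#include <kvikio/shim/libcurl.hpp>  // assumed header path

int main()
{
  auto curl = create_curl_handle();  // borrows a pooled handle from LibCurl

  curl.setopt(CURLOPT_URL, "http://localhost:9000/myfile");  // placeholder URL
  curl.setopt(CURLOPT_NOBODY, 1L);          // HEAD-like request, no body
  curl.setopt(CURLOPT_FOLLOWLOCATION, 1L);  // follow redirects
  curl.perform();                           // throws std::runtime_error on failure

  curl_off_t content_length{};
  curl.getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &content_length);
  std::cout << "content-length: " << content_length << "\n";
  return 0;
}  // CurlHandle's destructor returns the handle to the pool
```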
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__MINGW32__) && !defined(__MINGW64__) +#define KVIKIO_EXPORT __attribute__((visibility("default"))) +#define KVIKIO_HIDDEN __attribute__((visibility("hidden"))) +#else +#define KVIKIO_EXPORT +#define KVIKIO_HIDDEN +#endif + #define KVIKIO_STRINGIFY_DETAIL(x) #x #define KVIKIO_STRINGIFY(x) KVIKIO_STRINGIFY_DETAIL(x) diff --git a/cpp/include/kvikio/stream.hpp b/cpp/include/kvikio/stream.hpp index 9c0564ab27..9eb9942b7a 100644 --- a/cpp/include/kvikio/stream.hpp +++ b/cpp/include/kvikio/stream.hpp @@ -16,12 +16,13 @@ #pragma once #include -#include #include +#include #include #include #include #include +#include namespace kvikio { diff --git a/cpp/include/kvikio/utils.hpp b/cpp/include/kvikio/utils.hpp index 3c77c1c853..3cad457ffa 100644 --- a/cpp/include/kvikio/utils.hpp +++ b/cpp/include/kvikio/utils.hpp @@ -32,20 +32,6 @@ #include #include -// Macros used for defining symbol visibility, only GLIBC is supported. -// Since KvikIO is header-only, we rely on the linker to disambiguate inline functions -// that have (or return) static references. To do this, the relevant function must have -// `__attribute__((visibility("default")))`. If not, then if KvikIO is used in two -// different DSOs, the function will appear twice, and there will be two static objects. -// See . -#if (defined(__GNUC__) || defined(__clang__)) && !defined(__MINGW32__) && !defined(__MINGW64__) -#define KVIKIO_EXPORT __attribute__((visibility("default"))) -#define KVIKIO_HIDDEN __attribute__((visibility("hidden"))) -#else -#define KVIKIO_EXPORT -#define KVIKIO_HIDDEN -#endif - namespace kvikio { // cuFile defines a page size to 4 KiB @@ -301,47 +287,104 @@ struct libkvikio_domain { static constexpr char const* name{"libkvikio"}; }; +// Macro to concatenate two tokens x and y. +#define KVIKIO_CONCAT_HELPER(x, y) x##y +#define KVIKIO_CONCAT(x, y) KVIKIO_CONCAT_HELPER(x, y) + +// Macro to create a static, registered string that will not have a name conflict with any +// registered string defined in the same scope. +#define KVIKIO_REGISTER_STRING(msg) \ + [](const char* a_msg) -> auto& { \ + static nvtx3::registered_string_in a_reg_str{a_msg}; \ + return a_reg_str; \ + }(msg) + // Macro overloads of KVIKIO_NVTX_FUNC_RANGE -#define KVIKIO_NVTX_FUNC_RANGE_1() NVTX3_FUNC_RANGE_IN(libkvikio_domain) -#define KVIKIO_NVTX_FUNC_RANGE_2(msg, val) \ - nvtx3::scoped_range_in _kvikio_nvtx_range \ - { \ - nvtx3::event_attributes \ - { \ - msg, nvtx3::payload { convert_to_64bit(val) } \ - } \ +#define KVIKIO_NVTX_FUNC_RANGE_IMPL() NVTX3_FUNC_RANGE_IN(libkvikio_domain) + +#define KVIKIO_NVTX_SCOPED_RANGE_IMPL(msg, val) \ + nvtx3::scoped_range_in KVIKIO_CONCAT(_kvikio_nvtx_range, __LINE__) \ + { \ + nvtx3::event_attributes \ + { \ + KVIKIO_REGISTER_STRING(msg), nvtx3::payload { convert_to_64bit(val) } \ + } \ } -#define GET_KVIKIO_NVTX_FUNC_RANGE_MACRO(_1, _2, NAME, ...) NAME + +#define KVIKIO_NVTX_MARKER_IMPL(msg, val) \ + nvtx3::mark_in( \ + nvtx3::event_attributes{KVIKIO_REGISTER_STRING(msg), nvtx3::payload{convert_to_64bit(val)}}) + #endif /** * @brief Convenience macro for generating an NVTX range in the `libkvikio` domain * from the lifetime of a function. * - * Takes two arguments (message, payload) or no arguments, in which case the name - * of the immediately enclosing function returned by `__func__` is used. + * Takes no argument. The name of the immediately enclosing function returned by `__func__` is used + * as the message. 
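Circling back to the `KVIKIO_EXPORT` macro moved into `shim/utils.hpp` above, a minimal sketch (names and header path are assumptions) of the pattern it exists for, namely inline accessors to static state that must stay unique across DSOs:

```cpp
// Hedged sketch: without default visibility, each shared library that inlines
// this accessor could end up with its own copy of the static object.
#include <kvikio/shim/utils.hpp>  // assumed header path for KVIKIO_EXPORT

struct Config {
  int nthreads{8};
};

KVIKIO_EXPORT inline Config& config_instance()
{
  static Config _config;  // must be unique process-wide
  return _config;
}

int main() { return config_instance().nthreads == 8 ? 0 : 1; }
```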
* * Example: * ``` - * void some_function1(){ - * KVIKIO_NVTX_FUNC_RANGE("my function", 42); - * ... - * } - * void some_function2(){ - * KVIKIO_NVTX_FUNC_RANGE(); // The name `some_function2` is used + * void some_function(){ + * KVIKIO_NVTX_FUNC_RANGE(); // The name `some_function` is used as the message * ... * } * ``` */ #ifdef KVIKIO_CUDA_FOUND -#define KVIKIO_NVTX_FUNC_RANGE(...) \ - GET_KVIKIO_NVTX_FUNC_RANGE_MACRO( \ - __VA_ARGS__, KVIKIO_NVTX_FUNC_RANGE_2, KVIKIO_NVTX_FUNC_RANGE_1) \ - (__VA_ARGS__) +#define KVIKIO_NVTX_FUNC_RANGE() KVIKIO_NVTX_FUNC_RANGE_IMPL() #else #define KVIKIO_NVTX_FUNC_RANGE(...) \ do { \ } while (0) #endif +/** + * @brief Convenience macro for generating an NVTX scoped range in the `libkvikio` domain to + * annotate a time duration. + * + * Takes two arguments (message, payload). + * + * Example: + * ``` + * void some_function(){ + * KVIKIO_NVTX_SCOPED_RANGE("my function", 42); + * ... + * } + * ``` + */ +#ifdef KVIKIO_CUDA_FOUND +#define KVIKIO_NVTX_SCOPED_RANGE(msg, val) KVIKIO_NVTX_SCOPED_RANGE_IMPL(msg, val) +#else +#define KVIKIO_NVTX_SCOPED_RANGE(msg, val) \ + do { \ + } while (0) +#endif + +/** + * @brief Convenience macro for generating an NVTX marker in the `libkvikio` domain to annotate a + * certain time point. + * + * Takes two arguments (message, payload). Use this macro to annotate asynchronous I/O operations, + * where the payload refers to the I/O size. + * + * Example: + * ``` + * std::future some_function(){ + * size_t io_size{2077}; + * KVIKIO_NVTX_MARKER("I/O operation", io_size); + * perform_async_io_operation(io_size); + * ... + * } + * ``` + */ +#ifdef KVIKIO_CUDA_FOUND +#define KVIKIO_NVTX_MARKER(message, payload) KVIKIO_NVTX_MARKER_IMPL(message, payload) +#else +#define KVIKIO_NVTX_MARKER(message, payload) \ + do { \ + } while (0) +#endif + } // namespace kvikio diff --git a/cpp/src/file_handle.cpp b/cpp/src/file_handle.cpp new file mode 100644 index 0000000000..2e0de2537b --- /dev/null +++ b/cpp/src/file_handle.cpp @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace kvikio { + +namespace { + +/** + * @brief Parse open file flags given as a string and return oflags + * + * @param flags The flags + * @param o_direct Append O_DIRECT to the open flags + * @return oflags + * + * @throw std::invalid_argument if the specified flags are not supported. + * @throw std::invalid_argument if `o_direct` is true, but `O_DIRECT` is not supported. 
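Putting the three NVTX macros above together, a hedged sketch of annotated code (header path assumed; the macros are primarily for KvikIO's own sources, reference `kvikio` helpers unqualified, and compile to no-ops when CUDA is not found):

```cpp
// Hedged sketch: combining the NVTX convenience macros defined above.
#include <cstddef>
#include <kvikio/utils.hpp>  // assumed header path

using namespace kvikio;  // the macros use convert_to_64bit/libkvikio_domain unqualified

std::size_t copy_chunk(std::size_t nbytes)
{
  KVIKIO_NVTX_FUNC_RANGE();  // range named "copy_chunk" for the function's lifetime
  KVIKIO_NVTX_MARKER("copy_chunk - submitted", nbytes);  // point event with payload
  {
    KVIKIO_NVTX_SCOPED_RANGE("copy_chunk - inner work", nbytes);  // timed sub-scope
    // ... do the actual work ...
  }
  return nbytes;
}

int main() { return copy_chunk(1024) == 1024 ? 0 : 1; }
```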
+ */ +int open_fd_parse_flags(const std::string& flags, bool o_direct) +{ + int file_flags = -1; + if (flags.empty()) { throw std::invalid_argument("Unknown file open flag"); } + switch (flags[0]) { + case 'r': + file_flags = O_RDONLY; + if (flags[1] == '+') { file_flags = O_RDWR; } + break; + case 'w': + file_flags = O_WRONLY; + if (flags[1] == '+') { file_flags = O_RDWR; } + file_flags |= O_CREAT | O_TRUNC; + break; + case 'a': throw std::invalid_argument("Open flag 'a' isn't supported"); + default: throw std::invalid_argument("Unknown file open flag"); + } + file_flags |= O_CLOEXEC; + if (o_direct) { +#if defined(O_DIRECT) + file_flags |= O_DIRECT; +#else + throw std::invalid_argument("'o_direct' flag unsupported on this platform"); +#endif + } + return file_flags; +} + +/** + * @brief Open file using `open(2)` + * + * @param flags Open flags given as a string + * @param o_direct Append O_DIRECT to `flags` + * @param mode Access modes + * @return File descriptor + */ +int open_fd(const std::string& file_path, const std::string& flags, bool o_direct, mode_t mode) +{ + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg) + int fd = ::open(file_path.c_str(), open_fd_parse_flags(flags, o_direct), mode); + if (fd == -1) { throw std::system_error(errno, std::generic_category(), "Unable to open file"); } + return fd; +} + +/** + * @brief Get the flags of the file descriptor (see `open(2)`) + * + * @return Open flags + */ +[[nodiscard]] int open_flags(int fd) +{ + int ret = fcntl(fd, F_GETFL); // NOLINT(cppcoreguidelines-pro-type-vararg) + if (ret == -1) { + throw std::system_error(errno, std::generic_category(), "Unable to retrieve open flags"); + } + return ret; +} + +/** + * @brief Get file size from file descriptor `fstat(3)` + * + * @param file_descriptor Open file descriptor + * @return The number of bytes + */ +[[nodiscard]] std::size_t get_file_size(int file_descriptor) +{ + struct stat st {}; + int ret = fstat(file_descriptor, &st); + if (ret == -1) { + throw std::system_error(errno, std::generic_category(), "Unable to query file size"); + } + return static_cast(st.st_size); +} + +} // namespace + +FileHandle::FileHandle(const std::string& file_path, + const std::string& flags, + mode_t mode, + CompatMode compat_mode) + : _fd_direct_off{open_fd(file_path, flags, false, mode)}, + _initialized{true}, + _compat_mode{compat_mode} +{ + if (is_compat_mode_preferred()) { + return; // Nothing to do in compatibility mode + } + + // Try to open the file with the O_DIRECT flag. Fall back to compatibility mode, if it fails. + try { + _fd_direct_on = open_fd(file_path, flags, true, mode); + } catch (const std::system_error&) { + _compat_mode = CompatMode::ON; + } catch (const std::invalid_argument&) { + _compat_mode = CompatMode::ON; + } + + // Create a cuFile handle, if not in compatibility mode + if (!is_compat_mode_preferred()) { + CUfileDescr_t desc{}; // It is important to set to zero! 
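The flag parsing above accepts a small subset of `fopen(3)`-style strings; a hedged sketch from the caller's side (file name and header path are placeholders):

```cpp
// Hedged sketch: the open-flag strings accepted by FileHandle via
// open_fd_parse_flags(). O_CLOEXEC is always appended; "a" and an empty
// string throw std::invalid_argument.
#include <kvikio/file_handle.hpp>  // assumed header path

int main()
{
  // "w"  -> O_WRONLY | O_CREAT | O_TRUNC   ("w+" -> O_RDWR | O_CREAT | O_TRUNC)
  kvikio::FileHandle writer("/tmp/kvikio-demo", "w");

  // "r"  -> O_RDONLY                        ("r+" -> O_RDWR)
  kvikio::FileHandle reader("/tmp/kvikio-demo", "r");

  return 0;
}
```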
+ desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-union-access) + desc.handle.fd = _fd_direct_on; + CUFILE_TRY(cuFileAPI::instance().HandleRegister(&_handle, &desc)); + } +} + +[[nodiscard]] int FileHandle::fd_open_flags() const { return open_flags(_fd_direct_off); } + +[[nodiscard]] std::size_t FileHandle::nbytes() const +{ + if (closed()) { return 0; } + if (_nbytes == 0) { _nbytes = get_file_size(_fd_direct_off); } + return _nbytes; +} + +} // namespace kvikio diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp new file mode 100644 index 0000000000..adcf56befc --- /dev/null +++ b/cpp/src/remote_handle.cpp @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace kvikio { + +void HttpEndpoint::setopt(CurlHandle& curl) { curl.setopt(CURLOPT_URL, _url.c_str()); } + +void S3Endpoint::setopt(CurlHandle& curl) +{ + curl.setopt(CURLOPT_URL, _url.c_str()); + curl.setopt(CURLOPT_AWS_SIGV4, _aws_sigv4.c_str()); + curl.setopt(CURLOPT_USERPWD, _aws_userpwd.c_str()); +} + +RemoteHandle::RemoteHandle(std::unique_ptr endpoint) +{ + auto curl = create_curl_handle(); + + endpoint->setopt(curl); + curl.setopt(CURLOPT_NOBODY, 1L); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); + curl.perform(); + curl_off_t cl; + curl.getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl); + if (cl < 0) { + throw std::runtime_error("cannot get size of " + endpoint->str() + + ", content-length not provided by the server"); + } + _nbytes = cl; + _endpoint = std::move(endpoint); +} + +namespace { + +/** + * @brief Context used by the "CURLOPT_WRITEFUNCTION" callbacks. + */ +struct CallbackContext { + char* buf; // Output buffer to read into. + std::size_t size; // Total number of bytes to read. + std::ptrdiff_t offset; // Offset into `buf` to start reading. + bool overflow_error; // Flag to indicate overflow. + CallbackContext(void* buf, std::size_t size) + : buf{static_cast(buf)}, size{size}, offset{0}, overflow_error{0} + { + } + detail::BounceBufferH2D* bounce_buffer{nullptr}; // Only used by callback_device_memory +}; + +/** + * @brief A "CURLOPT_WRITEFUNCTION" to copy downloaded data to the output host buffer. + * + * See . + * + * @param data Data downloaded by libcurl that is ready for consumption. + * @param size Size of each element in `nmemb`; size is always 1. + * @param nmemb Size of the data in `nmemb`. + * @param context A pointer to an instance of `CallbackContext`. 
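Tying the two `RemoteHandle` constructors together: the size-inferring constructor above costs one extra HEAD-style request, which the `nbytes` overload avoids, and device-memory reads can be tuned via the bounce-buffer size. A hedged sketch (header paths, URL, and the 32 MiB figure are assumptions):

```cpp
// Hedged sketch: skip the metadata round trip when the object size is already
// known, and enlarge the host-side bounce buffer used for device-memory reads.
#include <cstddef>
#include <memory>
#include <utility>

#include <kvikio/defaults.hpp>       // assumed header path
#include <kvikio/remote_handle.hpp>  // assumed header path

int main()
{
  std::size_t const known_size = 4096;  // e.g. taken from a catalog or bucket listing

  auto ep = std::make_unique<kvikio::HttpEndpoint>("http://localhost:9000/myfile");
  kvikio::RemoteHandle handle(std::move(ep), known_size);  // no request issued here

  // Device reads stage through a host-side bounce buffer (default 16 MiB).
  kvikio::defaults::bounce_buffer_size_reset(32u << 20);  // grow it to 32 MiB
  return handle.nbytes() == known_size ? 0 : 1;
}
```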
+ */ +inline std::size_t callback_host_memory(char* data, + std::size_t size, + std::size_t nmemb, + void* context) +{ + auto ctx = reinterpret_cast(context); + std::size_t const nbytes = size * nmemb; + if (ctx->size < ctx->offset + nbytes) { + ctx->overflow_error = true; + return CURL_WRITEFUNC_ERROR; + } + KVIKIO_NVTX_SCOPED_RANGE("RemoteHandle - callback_host_memory()", nbytes); + std::memcpy(ctx->buf + ctx->offset, data, nbytes); + ctx->offset += nbytes; + return nbytes; +} + +/** + * @brief A "CURLOPT_WRITEFUNCTION" to copy downloaded data to the output device buffer. + * + * See . + * + * @param data Data downloaded by libcurl that is ready for consumption. + * @param size Size of each element in `nmemb`; size is always 1. + * @param nmemb Size of the data in `nmemb`. + * @param context A pointer to an instance of `CallbackContext`. + */ +inline std::size_t callback_device_memory(char* data, + std::size_t size, + std::size_t nmemb, + void* context) +{ + auto ctx = reinterpret_cast(context); + std::size_t const nbytes = size * nmemb; + if (ctx->size < ctx->offset + nbytes) { + ctx->overflow_error = true; + return CURL_WRITEFUNC_ERROR; + } + KVIKIO_NVTX_SCOPED_RANGE("RemoteHandle - callback_device_memory()", nbytes); + + ctx->bounce_buffer->write(data, nbytes); + ctx->offset += nbytes; + return nbytes; +} +} // namespace + +std::size_t RemoteHandle::read(void* buf, std::size_t size, std::size_t file_offset) +{ + KVIKIO_NVTX_SCOPED_RANGE("RemoteHandle::read()", size); + + if (file_offset + size > _nbytes) { + std::stringstream ss; + ss << "cannot read " << file_offset << "+" << size << " bytes into a " << _nbytes + << " bytes file (" << _endpoint->str() << ")"; + throw std::invalid_argument(ss.str()); + } + bool const is_host_mem = is_host_memory(buf); + auto curl = create_curl_handle(); + _endpoint->setopt(curl); + + std::string const byte_range = + std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1); + curl.setopt(CURLOPT_RANGE, byte_range.c_str()); + + if (is_host_mem) { + curl.setopt(CURLOPT_WRITEFUNCTION, callback_host_memory); + } else { + curl.setopt(CURLOPT_WRITEFUNCTION, callback_device_memory); + } + CallbackContext ctx{buf, size}; + curl.setopt(CURLOPT_WRITEDATA, &ctx); + + try { + if (is_host_mem) { + curl.perform(); + } else { + PushAndPopContext c(get_context_from_pointer(buf)); + // We use a bounce buffer to avoid many small memory copies to device. Libcurl has a + // maximum chunk size of 16kb (`CURL_MAX_WRITE_SIZE`) but chunks are often much smaller. + detail::BounceBufferH2D bounce_buffer(detail::StreamsByThread::get(), buf); + ctx.bounce_buffer = &bounce_buffer; + curl.perform(); + } + } catch (std::runtime_error const& e) { + if (ctx.overflow_error) { + std::stringstream ss; + ss << "maybe the server doesn't support file ranges? 
[" << e.what() << "]"; + throw std::overflow_error(ss.str()); + } + throw; + } + return size; +} + +std::future RemoteHandle::pread(void* buf, + std::size_t size, + std::size_t file_offset, + std::size_t task_size) +{ + KVIKIO_NVTX_SCOPED_RANGE("RemoteHandle::pread()", size); + auto task = [this](void* devPtr_base, + std::size_t size, + std::size_t file_offset, + std::size_t devPtr_offset) -> std::size_t { + return read(static_cast(devPtr_base) + devPtr_offset, size, file_offset); + }; + return parallel_io(task, buf, size, file_offset, task_size, 0); +} + +} // namespace kvikio diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 429bd8b722..e9024795f5 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -12,9 +12,6 @@ # the License. # ============================================================================= -# ################################################################################################## -# enable testing ----------------------------------------------------------------------------------- -# ################################################################################################## enable_testing() include(rapids-test) @@ -32,7 +29,8 @@ set_target_properties( CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON ) -target_link_libraries(cpp_tests PRIVATE kvikio::kvikio GTest::gmock GTest::gtest) +target_link_libraries(cpp_tests PRIVATE kvikio::kvikio GTest::gmock GTest::gtest CUDA::cudart) + rapids_test_add( NAME cpp_tests COMMAND cpp_tests diff --git a/cpp/tests/test_basic_io.cpp b/cpp/tests/test_basic_io.cpp index 12ccb6d428..c884ec6230 100644 --- a/cpp/tests/test_basic_io.cpp +++ b/cpp/tests/test_basic_io.cpp @@ -15,29 +15,84 @@ */ #include - +#include "kvikio/defaults.hpp" #include "utils.hpp" using namespace kvikio::test; -TEST(BasicIO, write_read) +class BasicIOTest : public testing::Test { + protected: + void SetUp() override + { + TempDir tmp_dir{false}; + _filepath = tmp_dir.path() / "test"; + + _dev_a = std::move(DevBuffer::arange(100)); + _dev_b = std::move(DevBuffer::zero_like(_dev_a)); + } + + void TearDown() override {} + + std::filesystem::path _filepath; + DevBuffer _dev_a; + DevBuffer _dev_b; +}; + +TEST_F(BasicIOTest, write_read) { - TempDir tmp_dir{false}; - auto filepath = tmp_dir.path() / "test"; + { + kvikio::FileHandle f(_filepath, "w"); + auto nbytes = f.write(_dev_a.ptr, _dev_a.nbytes, 0, 0); + EXPECT_EQ(nbytes, _dev_a.nbytes); + } - auto dev_a = DevBuffer::arange(100); - auto dev_b = DevBuffer::zero_like(dev_a); + { + kvikio::FileHandle f(_filepath, "r"); + auto nbytes = f.read(_dev_b.ptr, _dev_b.nbytes, 0, 0); + EXPECT_EQ(nbytes, _dev_b.nbytes); + expect_equal(_dev_a, _dev_b); + } +} + +TEST_F(BasicIOTest, write_read_async) +{ + CUstream stream{}; + CUDA_DRIVER_TRY(kvikio::cudaAPI::instance().StreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + // Default compatibility mode (AUTO) { - kvikio::FileHandle f(filepath, "w"); - auto nbytes = f.write(dev_a.ptr, dev_a.nbytes, 0, 0); - EXPECT_EQ(nbytes, dev_a.nbytes); + kvikio::FileHandle f(_filepath, "w"); + auto stream_future = f.write_async(_dev_a.ptr, _dev_a.nbytes, 0, 0, stream); + auto nbytes = stream_future.check_bytes_done(); + EXPECT_EQ(nbytes, _dev_a.nbytes); } { - kvikio::FileHandle f(filepath, "r"); - auto nbytes = f.read(dev_b.ptr, dev_b.nbytes, 0, 0); - EXPECT_EQ(nbytes, dev_b.nbytes); - expect_equal(dev_a, dev_b); + kvikio::FileHandle f(_filepath, "r"); + auto stream_future = f.read_async(_dev_b.ptr, _dev_b.nbytes, 0, 0, stream); + auto nbytes = 
stream_future.check_bytes_done(); + EXPECT_EQ(nbytes, _dev_b.nbytes); + expect_equal(_dev_a, _dev_b); } + + // Explicitly set compatibility mode + std::array compat_modes{kvikio::CompatMode::AUTO, kvikio::CompatMode::ON}; + for (const auto& compat_mode : compat_modes) { + { + kvikio::FileHandle f(_filepath, "w", kvikio::FileHandle::m644, compat_mode); + auto stream_future = f.write_async(_dev_a.ptr, _dev_a.nbytes, 0, 0, stream); + auto nbytes = stream_future.check_bytes_done(); + EXPECT_EQ(nbytes, _dev_a.nbytes); + } + + { + kvikio::FileHandle f(_filepath, "r", kvikio::FileHandle::m644, compat_mode); + auto stream_future = f.read_async(_dev_b.ptr, _dev_b.nbytes, 0, 0, stream); + auto nbytes = stream_future.check_bytes_done(); + EXPECT_EQ(nbytes, _dev_b.nbytes); + expect_equal(_dev_a, _dev_b); + } + } + + CUDA_DRIVER_TRY(kvikio::cudaAPI::instance().StreamDestroy(stream)); } diff --git a/cpp/tests/test_defaults.cpp b/cpp/tests/test_defaults.cpp new file mode 100644 index 0000000000..c4a88775e4 --- /dev/null +++ b/cpp/tests/test_defaults.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +TEST(Defaults, parse_compat_mode_str) +{ + { + std::vector inputs{ + "ON", "on", "On", "TRUE", "true", "True", "YES", "yes", "Yes", "1"}; + for (const auto& input : inputs) { + EXPECT_EQ(kvikio::detail::parse_compat_mode_str(input), kvikio::CompatMode::ON); + } + } + + { + std::vector inputs{ + "OFF", "off", "oFf", "FALSE", "false", "False", "NO", "no", "No", "0"}; + for (const auto& input : inputs) { + EXPECT_EQ(kvikio::detail::parse_compat_mode_str(input), kvikio::CompatMode::OFF); + } + } + + { + std::vector inputs{"AUTO", "auto", "aUtO"}; + for (const auto& input : inputs) { + EXPECT_EQ(kvikio::detail::parse_compat_mode_str(input), kvikio::CompatMode::AUTO); + } + } + + { + std::vector inputs{"", "invalidOption", "11", "*&^Yes"}; + for (const auto& input : inputs) { + EXPECT_THROW(kvikio::detail::parse_compat_mode_str(input), std::invalid_argument); + } + } +} diff --git a/cpp/tests/utils.hpp b/cpp/tests/utils.hpp index 56a2cd5c45..1c671a82bc 100644 --- a/cpp/tests/utils.hpp +++ b/cpp/tests/utils.hpp @@ -110,10 +110,12 @@ class TempDir { */ class DevBuffer { public: - const std::size_t nelem; - const std::size_t nbytes; + std::size_t nelem; + std::size_t nbytes; void* ptr{nullptr}; + DevBuffer() : nelem{0}, nbytes{0} {}; + DevBuffer(std::size_t nelem) : nelem{nelem}, nbytes{nelem * sizeof(std::int64_t)} { KVIKIO_CHECK_CUDA(cudaMalloc(&ptr, nbytes)); @@ -123,6 +125,21 @@ class DevBuffer { KVIKIO_CHECK_CUDA(cudaMemcpy(ptr, host_buffer.data(), nbytes, cudaMemcpyHostToDevice)); } + DevBuffer(DevBuffer&& dev_buffer) noexcept + : nelem{std::exchange(dev_buffer.nelem, 0)}, + nbytes{std::exchange(dev_buffer.nbytes, 0)}, + ptr{std::exchange(dev_buffer.ptr, nullptr)} + { + } + + DevBuffer& operator=(DevBuffer&& dev_buffer) noexcept + { + nelem = std::exchange(dev_buffer.nelem, 0); + nbytes = 
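The `DevBuffer` move operations added just after this point (together with the new default constructor) exist so the test fixture can assign buffers in `SetUp()`. A hedged, generic sketch of the `std::exchange` idiom they rely on:

```cpp
// Hedged sketch: moving ownership must null out the source, otherwise both
// objects would release the same resource (for DevBuffer, a double cudaFree).
#include <utility>

struct Owned {
  void* ptr{nullptr};

  Owned() = default;
  explicit Owned(void* p) : ptr{p} {}

  Owned(Owned&& o) noexcept : ptr{std::exchange(o.ptr, nullptr)} {}
  Owned& operator=(Owned&& o) noexcept
  {
    ptr = std::exchange(o.ptr, nullptr);  // source is left empty, so its
    return *this;                         // destructor releases nothing
  }
  ~Owned() { /* release(ptr) would go here; releasing nullptr is a no-op */ }
};

int main()
{
  int x = 42;
  Owned a{&x};
  Owned b = std::move(a);  // ownership transferred, a.ptr is now nullptr
  return (a.ptr == nullptr && b.ptr == &x) ? 0 : 1;
}
```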
std::exchange(dev_buffer.nbytes, 0); + ptr = std::exchange(dev_buffer.ptr, nullptr); + return *this; + } + ~DevBuffer() noexcept { cudaFree(ptr); } [[nodiscard]] static DevBuffer arange(std::size_t nelem, std::int64_t start = 0) diff --git a/dependencies.yaml b/dependencies.yaml index 7abf22bdab..fdc29df8e0 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -13,6 +13,7 @@ files: - cuda - cuda_version - depends_on_cupy + - depends_on_nvcomp - docs - py_version - rapids_build_skbuild @@ -23,12 +24,14 @@ files: output: none includes: - cuda_version + - test_libkvikio - test_cpp test_python: output: none includes: - cuda_version - py_version + - test_kvikio - test_python checks: output: none @@ -41,29 +44,32 @@ files: - cuda_version - docs - py_version - py_build: + - test_kvikio + py_build_kvikio: output: pyproject pyproject_dir: python/kvikio extras: table: build-system includes: - rapids_build_skbuild - py_build_cpp_wheel: + py_build_libkvikio: output: pyproject pyproject_dir: python/libkvikio extras: table: build-system includes: - rapids_build_skbuild - py_run: + py_run_kvikio: output: pyproject pyproject_dir: python/kvikio extras: table: project includes: - depends_on_cupy + - depends_on_nvcomp + - depends_on_libkvikio - run - py_wheel_cpp: + py_rapids_build_libkvikio: output: pyproject pyproject_dir: python/libkvikio extras: @@ -71,7 +77,7 @@ files: key: requires includes: - build-universal - py_wheel_python: + py_rapids_build_kvikio: output: pyproject pyproject_dir: python/kvikio extras: @@ -108,6 +114,7 @@ dependencies: packages: - c-compiler - cxx-compiler + - libcurl>=8.5.0,<9.0a0 specific: - output_types: conda matrices: @@ -141,7 +148,7 @@ dependencies: common: - output_types: conda packages: &libkvikio_packages - - libkvikio==24.10.* + - libkvikio==24.12.*,>=0.0.0a0 specific: - output_types: [requirements, pyproject] matrices: @@ -149,12 +156,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==24.10.* + - libkvikio-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - libkvikio-cu11==24.10.* + - libkvikio-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: *libkvikio_packages} build-py-wrapper: common: @@ -199,10 +206,6 @@ dependencies: packages: - cuda-version=12.5 cuda: - common: - - output_types: conda - packages: - - nvcomp==4.0.1 specific: - output_types: conda matrices: @@ -264,6 +267,50 @@ dependencies: packages: &cupy_packages_cu11 - cupy-cuda11x>=12.0.0 - {matrix: null, packages: *cupy_packages_cu11} + depends_on_nvcomp: + common: + - output_types: conda + packages: + - nvcomp==4.1.0.6 + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + packages: + - nvidia-nvcomp-cu12==4.1.0.6 + - matrix: + cuda: "11.*" + packages: + - nvidia-nvcomp-cu11==4.1.0.6 + - matrix: + packages: + - nvidia-nvcomp==4.1.0.6 + depends_on_libkvikio: + common: + - output_types: conda + packages: + - &libkvikio_unsuffixed libkvikio==24.12.*,>=0.0.0a0 + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + # This index is needed for libkvikio-cu{11,12}. 
+ - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + cuda_suffixed: "true" + packages: + - libkvikio-cu12==24.12.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" + packages: + - libkvikio-cu11==24.12.*,>=0.0.0a0 + - {matrix: null, packages: [*libkvikio_unsuffixed]} docs: common: - output_types: [conda, requirements] @@ -314,6 +361,18 @@ dependencies: # See https://github.com/zarr-developers/numcodecs/pull/475 - numcodecs !=0.12.0 - packaging + test_libkvikio: + common: + - output_types: conda + packages: + - libkvikio==24.12.*,>=0.0.0a0 + - libkvikio-tests==24.12.*,>=0.0.0a0 + test_kvikio: + common: + - output_types: conda + packages: + - libkvikio==24.12.*,>=0.0.0a0 + - kvikio==24.12.*,>=0.0.0a0 test_cpp: common: - output_types: conda @@ -323,16 +382,24 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - &dask dask>=2022.05.2 + - rapids-dask-dependency==24.12.*,>=0.0.0a0 - pytest - pytest-cov + - rangehttpserver + - boto3>=1.21.21 + - output_types: [requirements, pyproject] + packages: + - moto[server]>=4.0.8 + - output_types: conda + packages: + - moto>=4.0.8 specific: - output_types: [conda, requirements, pyproject] matrices: - matrix: cuda: "12.*" packages: - - cuda-python>=12.0,<13.0a0 + - cuda-python>=12.0,<13.0a0,<=12.6.0 - matrix: # All CUDA 11 versions packages: - - cuda-python>=11.7.1,<12.0a0 + - cuda-python>=11.7.1,<12.0a0,<=11.8.3 diff --git a/docs/source/api.rst b/docs/source/api.rst index 4d19c09bbb..fd34367a00 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -18,6 +18,13 @@ Zarr .. autoclass:: GDSStore :members: +RemoteFile +---------- +.. currentmodule:: kvikio.remote_file + +.. autoclass:: RemoteFile + :members: + Defaults -------- .. currentmodule:: kvikio.defaults diff --git a/docs/source/index.rst b/docs/source/index.rst index 4dd491fd96..9e302b5f44 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -23,6 +23,7 @@ Contents install quickstart zarr + remote_file runtime_settings api genindex diff --git a/docs/source/remote_file.rst b/docs/source/remote_file.rst new file mode 100644 index 0000000000..ed6fe45b7b --- /dev/null +++ b/docs/source/remote_file.rst @@ -0,0 +1,11 @@ +Remote File +=========== + +KvikIO provides direct access to remote files. + + +Example +------- + +.. literalinclude:: ../../python/kvikio/examples/http_io.py + :language: python diff --git a/docs/source/runtime_settings.rst b/docs/source/runtime_settings.rst index 631ba0c937..be5508741e 100644 --- a/docs/source/runtime_settings.rst +++ b/docs/source/runtime_settings.rst @@ -3,15 +3,21 @@ Runtime Settings Compatibility Mode ``KVIKIO_COMPAT_MODE`` ----------------------------------------- -When KvikIO is running in compatibility mode, it doesn't load ``libcufile.so``. Instead, reads and writes are done using POSIX. Notice, this is not the same as the compatibility mode in cuFile. That is cuFile can run in compatibility mode while KvikIO is not. -Set the environment variable ``KVIKIO_COMPAT_MODE`` to enable/disable compatibility mode. By default, compatibility mode is enabled: +When KvikIO is running in compatibility mode, it doesn't load ``libcufile.so``. Instead, reads and writes are done using POSIX. Notice, this is not the same as the compatibility mode in cuFile. 
It is possible that KvikIO performs I/O in the non-compatibility mode by using the cuFile library, but the cuFile library itself is configured to operate in its own compatibility mode. For more details, refer to `cuFile compatibility mode `_ and `cuFile environment variables `_ . + +The environment variable ``KVIKIO_COMPAT_MODE`` has three options (case-insensitive): + + * ``ON`` (aliases: ``TRUE``, ``YES``, ``1``): Enable the compatibility mode. + * ``OFF`` (aliases: ``FALSE``, ``NO``, ``0``): Disable the compatibility mode, and enforce cuFile I/O. GDS will be activated if the system requirements for cuFile are met and cuFile is properly configured. However, if the system is not suited for cuFile, I/O operations under the ``OFF`` option may error out, crash or hang. + * ``AUTO``: Try cuFile I/O first, and fall back to POSIX I/O if the system requirements for cuFile are not met. + +Under ``AUTO``, KvikIO falls back to the compatibility mode: * when ``libcufile.so`` cannot be found. * when running in Windows Subsystem for Linux (WSL). * when ``/run/udev`` isn't readable, which typically happens when running inside a docker image not launched with ``--volume /run/udev:/run/udev:ro``. -This setting can also be controlled by :py:func:`kvikio.defaults.compat_mode`, :py:func:`kvikio.defaults.compat_mode_reset`, and :py:func:`kvikio.defaults.set_compat_mode`. - +This setting can also be programmatically controlled by :py:func:`kvikio.defaults.set_compat_mode` and :py:func:`kvikio.defaults.compat_mode_reset`. Thread Pool ``KVIKIO_NTHREADS`` ------------------------------- diff --git a/docs/source/zarr.rst b/docs/source/zarr.rst index 5b63ffd8b7..82e6186026 100644 --- a/docs/source/zarr.rst +++ b/docs/source/zarr.rst @@ -8,7 +8,7 @@ Zarr KvikIO provides a GPU backend to Zarr-Python that enables `GPUDirect Storage (GDS) `_ seamlessly. The following is an example of how to use the convenience function :py:meth:`kvikio.zarr.open_cupy_array` -to create a new Zarr array and how open an existing Zarr array. +to create a new Zarr array and how to open an existing Zarr array. .. literalinclude:: ../../python/kvikio/examples/zarr_cupy_nvcomp.py diff --git a/python/kvikio/CMakeLists.txt b/python/kvikio/CMakeLists.txt index 2b9278fcd1..6e54a5dff5 100644 --- a/python/kvikio/CMakeLists.txt +++ b/python/kvikio/CMakeLists.txt @@ -26,10 +26,9 @@ project( LANGUAGES CXX CUDA ) -# TODO: Should we symlink FindcuFile.cmake into python/cmake? find cuFile -include(../../cpp/cmake/Modules/FindcuFile.cmake) +option(USE_NVCOMP_RUNTIME_WHEEL "Use the nvcomp wheel at runtime instead of the system library" OFF) -find_package(KvikIO REQUIRED "${RAPIDS_VERSION}") +find_package(kvikio REQUIRED "${RAPIDS_VERSION}") find_package(CUDAToolkit REQUIRED) @@ -40,11 +39,4 @@ add_subdirectory(cmake) set(cython_lib_dir kvikio) -# It would be better to factor nvcomp out into its own wheel. Until that is available, we vendor it -# here. 
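Returning to the compatibility-mode documentation above: besides the `KVIKIO_COMPAT_MODE` environment variable and the Python helpers, the same switch is reachable from C++. A hedged sketch (header path assumed; the `CompatMode` enum, `defaults::compat_mode()` and `defaults::compat_mode_reset()` appear elsewhere in this diff):

```cpp
// Hedged sketch: selecting the compatibility mode programmatically instead of
// via the KVIKIO_COMPAT_MODE environment variable.
#include <kvikio/defaults.hpp>  // assumed header path

int main()
{
  // AUTO: try cuFile first and fall back to POSIX if the system isn't GDS-capable.
  kvikio::defaults::compat_mode_reset(kvikio::CompatMode::AUTO);

  // ON: never load libcufile.so; all I/O goes through POSIX.
  kvikio::defaults::compat_mode_reset(kvikio::CompatMode::ON);

  // OFF would enforce cuFile I/O and may fail outright on systems without GDS:
  // kvikio::defaults::compat_mode_reset(kvikio::CompatMode::OFF);

  return kvikio::defaults::compat_mode() == kvikio::CompatMode::ON ? 0 : 1;
}
```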
-install_aliased_imported_targets( - TARGETS nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp DESTINATION - ${cython_lib_dir}/_lib -) - add_subdirectory(kvikio/_lib) diff --git a/python/kvikio/cmake/CMakeLists.txt b/python/kvikio/cmake/CMakeLists.txt index fa94bc3f8e..d3882b5ab3 100644 --- a/python/kvikio/cmake/CMakeLists.txt +++ b/python/kvikio/cmake/CMakeLists.txt @@ -13,5 +13,3 @@ # ============================================================================= include(thirdparty/get_nvcomp.cmake) -# Needed for install_aliased_imported_targets -include(thirdparty/WheelHelpers.cmake) diff --git a/python/kvikio/cmake/thirdparty/WheelHelpers.cmake b/python/kvikio/cmake/thirdparty/WheelHelpers.cmake deleted file mode 100644 index 3abe98a064..0000000000 --- a/python/kvikio/cmake/thirdparty/WheelHelpers.cmake +++ /dev/null @@ -1,59 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= -include_guard(GLOBAL) - -# Making libraries available inside wheels by installing the associated targets. -function(install_aliased_imported_targets) - list(APPEND CMAKE_MESSAGE_CONTEXT "install_aliased_imported_targets") - - set(options "") - set(one_value "DESTINATION") - set(multi_value "TARGETS") - cmake_parse_arguments(_ "${options}" "${one_value}" "${multi_value}" ${ARGN}) - - message(VERBOSE "Installing targets '${__TARGETS}' into lib_dir '${__DESTINATION}'") - - foreach(target IN LISTS __TARGETS) - - if(NOT TARGET ${target}) - message(VERBOSE "No target named ${target}") - continue() - endif() - - get_target_property(alias_target ${target} ALIASED_TARGET) - if(alias_target) - set(target ${alias_target}) - endif() - - get_target_property(is_imported ${target} IMPORTED) - if(NOT is_imported) - # If the target isn't imported, install it into the wheel - install(TARGETS ${target} DESTINATION ${__DESTINATION}) - message(VERBOSE "install(TARGETS ${target} DESTINATION ${__DESTINATION})") - else() - # If the target is imported, make sure it's global - get_target_property(type ${target} TYPE) - if(${type} STREQUAL "UNKNOWN_LIBRARY") - install(FILES $ DESTINATION ${__DESTINATION}) - message(VERBOSE "install(FILES $ DESTINATION ${__DESTINATION})") - else() - install(IMPORTED_RUNTIME_ARTIFACTS ${target} DESTINATION ${__DESTINATION}) - message( - VERBOSE - "install(IMPORTED_RUNTIME_ARTIFACTS $ DESTINATION ${__DESTINATION})" - ) - endif() - endif() - endforeach() -endfunction() diff --git a/python/kvikio/cmake/thirdparty/get_nvcomp.cmake b/python/kvikio/cmake/thirdparty/get_nvcomp.cmake index 9361624c07..a2c6326e76 100644 --- a/python/kvikio/cmake/thirdparty/get_nvcomp.cmake +++ b/python/kvikio/cmake/thirdparty/get_nvcomp.cmake @@ -18,7 +18,11 @@ set(KVIKIO_USE_PROPRIETARY_BINARY ON) function(find_and_configure_nvcomp) include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - rapids_cpm_nvcomp(USE_PROPRIETARY_BINARY 
${KVIKIO_USE_PROPRIETARY_BINARY}) + set(export_args) + if(KvikIO_EXPORT_NVCOMP) + set(export_args BUILD_EXPORT_SET kvikio-exports INSTALL_EXPORT_SET kvikio-exports) + endif() + rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${KVIKIO_USE_PROPRIETARY_BINARY}) # Per-thread default stream if(TARGET nvcomp AND PER_THREAD_DEFAULT_STREAM) diff --git a/python/kvikio/examples/http_io.py b/python/kvikio/examples/http_io.py new file mode 100644 index 0000000000..26c9af1d44 --- /dev/null +++ b/python/kvikio/examples/http_io.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import pathlib +import tempfile + +import cupy +import numpy + +import kvikio +from kvikio.utils import LocalHttpServer + + +def main(tmpdir: pathlib.Path): + a = cupy.arange(100) + a.tofile(tmpdir / "myfile") + b = cupy.empty_like(a) + + # Start a local server that serves files in `tmpdir` + with LocalHttpServer(root_path=tmpdir) as server: + # Open remote file from a http url + with kvikio.RemoteFile.open_http(f"{server.url}/myfile") as f: + # KvikIO fetch the file size + assert f.nbytes() == a.nbytes + # Read the remote file into `b` as if it was a local file. + f.read(b) + assert all(a == b) + # We can also read into host memory seamlessly + a = cupy.asnumpy(a) + c = numpy.empty_like(a) + f.read(c) + assert all(a == c) + + +if __name__ == "__main__": + with tempfile.TemporaryDirectory() as tmpdir: + main(pathlib.Path(tmpdir)) diff --git a/python/kvikio/kvikio/__init__.py b/python/kvikio/kvikio/__init__.py index 883ac9e784..64aa95df5c 100644 --- a/python/kvikio/kvikio/__init__.py +++ b/python/kvikio/kvikio/__init__.py @@ -1,12 +1,26 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. -from kvikio._lib import driver_properties # type: ignore -from kvikio._version import __git_commit__, __version__ -from kvikio.cufile import CuFile +# If libkvikio was installed as a wheel, we must request it to load the library symbols. +# Otherwise, we assume that the library was installed in a system path that ld can find. +try: + import libkvikio +except ModuleNotFoundError: + pass +else: + libkvikio.load_library() + del libkvikio -# TODO: Wrap nicely, maybe as a dataclass? 
-DriverProperties = driver_properties.DriverProperties +from kvikio._lib.defaults import CompatMode # noqa: F401 +from kvikio._version import __git_commit__, __version__ +from kvikio.cufile import CuFile +from kvikio.remote_file import RemoteFile, is_remote_file_available -__all__ = ["__git_commit__", "__version__", "CuFile"] +__all__ = [ + "__git_commit__", + "__version__", + "CuFile", + "RemoteFile", + "is_remote_file_available", +] diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index c77d8e3df1..364699f7bd 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -13,12 +13,32 @@ # ============================================================================= # Set the list of Cython files to build, one .so per file -set(cython_modules arr.pyx buffer.pyx defaults.pyx driver_properties.pyx file_handle.pyx future.pyx +set(cython_modules arr.pyx buffer.pyx defaults.pyx cufile_driver.pyx file_handle.pyx future.pyx libnvcomp.pyx libnvcomp_ll.pyx ) +if(TARGET CURL::libcurl) + message(STATUS "Building remote_handle.pyx (libcurl found)") + list(APPEND cython_modules remote_handle.pyx) +else() + message( + STATUS + "Skipping remote_handle.pyx (please set KvikIO_REMOTE_SUPPORT=ON for remote file support)" + ) +endif() + rapids_cython_create_modules( CXX SOURCE_FILES "${cython_modules}" LINKED_LIBRARIES kvikio::kvikio nvcomp::nvcomp ) +if(USE_NVCOMP_RUNTIME_WHEEL) + set(rpaths "$ORIGIN/../../nvidia/nvcomp") + foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + set_property( + TARGET ${tgt} + PROPERTY INSTALL_RPATH ${rpaths} + APPEND + ) + endforeach() +endif() diff --git a/python/kvikio/kvikio/_lib/arr.pxd b/python/kvikio/kvikio/_lib/arr.pxd index a9d2b7e7f1..47bad21a3b 100644 --- a/python/kvikio/kvikio/_lib/arr.pxd +++ b/python/kvikio/kvikio/_lib/arr.pxd @@ -28,6 +28,9 @@ cdef class Array: cpdef Py_ssize_t _nbytes(self) +cpdef Array asarray(obj) + + cdef pair[uintptr_t, size_t] parse_buffer_argument( buf, size, bint accept_host_buffer ) except * diff --git a/python/kvikio/kvikio/_lib/arr.pyx b/python/kvikio/kvikio/_lib/arr.pyx index 793a414b17..950654db2c 100644 --- a/python/kvikio/kvikio/_lib/arr.pyx +++ b/python/kvikio/kvikio/_lib/arr.pyx @@ -4,13 +4,13 @@ # cython: language_level=3 -from cpython.array cimport array, newarrayobject from cpython.buffer cimport PyBuffer_IsContiguous +from cpython.mem cimport PyMem_Free, PyMem_Malloc from cpython.memoryview cimport PyMemoryView_FromObject, PyMemoryView_GET_BUFFER -from cpython.object cimport PyObject from cpython.ref cimport Py_INCREF from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM from cython cimport auto_pickle, boundscheck, initializedcheck, nonecheck, wraparound +from cython.view cimport array from libc.stdint cimport uintptr_t from libc.string cimport memcpy @@ -53,13 +53,14 @@ cdef dict itemsize_mapping = { } -cdef array array_Py_ssize_t = array("q") +cdef sizeof_Py_ssize_t = sizeof(Py_ssize_t) -cdef inline Py_ssize_t[::1] new_Py_ssize_t_array(Py_ssize_t n): - return newarrayobject( - (array_Py_ssize_t).ob_type, n, array_Py_ssize_t.ob_descr - ) +cdef Py_ssize_t[::1] new_Py_ssize_t_array(Py_ssize_t n): + cdef array a = array((n,), sizeof_Py_ssize_t, b"q", "c", False) + a.data = PyMem_Malloc(n * sizeof(Py_ssize_t)) + a.callback_free_data = PyMem_Free + return a @auto_pickle(False) @@ -236,7 +237,7 @@ cdef class Array: cdef inline bint _c_contiguous(Py_ssize_t itemsize, Py_ssize_t ndim, Py_ssize_t[::1] shape_mv, - Py_ssize_t[::1] 
strides_mv) nogil: + Py_ssize_t[::1] strides_mv) noexcept nogil: cdef Py_ssize_t i, s if strides_mv is not None: s = itemsize @@ -254,7 +255,7 @@ cdef inline bint _c_contiguous(Py_ssize_t itemsize, cdef inline bint _f_contiguous(Py_ssize_t itemsize, Py_ssize_t ndim, Py_ssize_t[::1] shape_mv, - Py_ssize_t[::1] strides_mv) nogil: + Py_ssize_t[::1] strides_mv) noexcept nogil: cdef Py_ssize_t i, s if strides_mv is not None: s = itemsize @@ -270,7 +271,7 @@ cdef inline bint _f_contiguous(Py_ssize_t itemsize, cdef inline bint _contiguous(Py_ssize_t itemsize, Py_ssize_t ndim, Py_ssize_t[::1] shape_mv, - Py_ssize_t[::1] strides_mv) nogil: + Py_ssize_t[::1] strides_mv) noexcept nogil: cdef bint r = _c_contiguous(itemsize, ndim, shape_mv, strides_mv) if not r: r = _f_contiguous(itemsize, ndim, shape_mv, strides_mv) @@ -283,15 +284,24 @@ cdef inline bint _contiguous(Py_ssize_t itemsize, @wraparound(False) cdef inline Py_ssize_t _nbytes(Py_ssize_t itemsize, Py_ssize_t ndim, - Py_ssize_t[::1] shape_mv) nogil: + Py_ssize_t[::1] shape_mv) noexcept nogil: cdef Py_ssize_t i, nbytes = itemsize for i in range(ndim): nbytes *= shape_mv[i] return nbytes -cpdef asarray(obj): + +cpdef Array asarray(obj): + """Coerce other objects to ``Array``. No-op for existing ``Array``s. + + Args: + obj: Object exposing the Python buffer protocol or ``__cuda_array_interface__`` + + Returns: + Array: An instance of the ``Array`` class + """ if isinstance(obj, Array): - return obj + return obj else: return Array(obj) diff --git a/python/kvikio/kvikio/_lib/driver_properties.pyx b/python/kvikio/kvikio/_lib/cufile_driver.pyx similarity index 84% rename from python/kvikio/kvikio/_lib/driver_properties.pyx rename to python/kvikio/kvikio/_lib/cufile_driver.pyx index 674ef14cde..0488eb3b20 100644 --- a/python/kvikio/kvikio/_lib/driver_properties.pyx +++ b/python/kvikio/kvikio/_lib/cufile_driver.pyx @@ -8,7 +8,25 @@ from libcpp cimport bool -cdef extern from "" nogil: +cdef extern from "" nogil: + cdef int cpp_libcufile_version "kvikio::cufile_version"() except + + cdef void cpp_driver_open "kvikio::cuFileAPI::instance().driver_open"() except + + cdef void cpp_driver_close "kvikio::cuFileAPI::instance().driver_close"() except + + + +def libcufile_version() -> int: + return cpp_libcufile_version() + + +def driver_open(): + cpp_driver_open() + + +def driver_close(): + cpp_driver_close() + + +cdef extern from "" nogil: cdef cppclass cpp_DriverProperties "kvikio::DriverProperties": cpp_DriverProperties() except + bool is_gds_available() except + diff --git a/python/kvikio/kvikio/_lib/defaults.pyx b/python/kvikio/kvikio/_lib/defaults.pyx index f59cad5cb4..9042069b74 100644 --- a/python/kvikio/kvikio/_lib/defaults.pyx +++ b/python/kvikio/kvikio/_lib/defaults.pyx @@ -4,13 +4,18 @@ # distutils: language = c++ # cython: language_level=3 +from libc.stdint cimport uint8_t from libcpp cimport bool -cdef extern from "" nogil: - bool cpp_compat_mode "kvikio::defaults::compat_mode"() except + +cdef extern from "" namespace "kvikio" nogil: + cpdef enum class CompatMode(uint8_t): + OFF = 0 + ON = 1 + AUTO = 2 + CompatMode cpp_compat_mode "kvikio::defaults::compat_mode"() except + void cpp_compat_mode_reset \ - "kvikio::defaults::compat_mode_reset"(bool enable) except + + "kvikio::defaults::compat_mode_reset"(CompatMode compat_mode) except + unsigned int cpp_thread_pool_nthreads \ "kvikio::defaults::thread_pool_nthreads"() except + void cpp_thread_pool_nthreads_reset \ @@ -25,12 +30,12 @@ cdef extern from "" nogil: 
"kvikio::defaults::bounce_buffer_size_reset"(size_t nbytes) except + -def compat_mode() -> bool: +def compat_mode() -> CompatMode: return cpp_compat_mode() -def compat_mode_reset(enable: bool) -> None: - cpp_compat_mode_reset(enable) +def compat_mode_reset(compat_mode: CompatMode) -> None: + cpp_compat_mode_reset(compat_mode) def thread_pool_nthreads() -> int: diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx new file mode 100644 index 0000000000..1e0b14acb9 --- /dev/null +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -0,0 +1,164 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +# distutils: language = c++ +# cython: language_level=3 + +from typing import Optional + +from cython.operator cimport dereference as deref +from libc.stdint cimport uintptr_t +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move, pair + +from kvikio._lib.arr cimport parse_buffer_argument +from kvikio._lib.future cimport IOFuture, _wrap_io_future, future + + +cdef extern from "" nogil: + cdef cppclass cpp_RemoteEndpoint "kvikio::RemoteEndpoint": + string str() except + + + cdef cppclass cpp_HttpEndpoint "kvikio::HttpEndpoint"(cpp_RemoteEndpoint): + cpp_HttpEndpoint(string url) except + + + cdef cppclass cpp_S3Endpoint "kvikio::S3Endpoint"(cpp_RemoteEndpoint): + cpp_S3Endpoint(string url) except + + cpp_S3Endpoint(string bucket_name, string object_name) except + + + pair[string, string] cpp_parse_s3_url \ + "kvikio::S3Endpoint::parse_s3_url"(string url) except + + + cdef cppclass cpp_RemoteHandle "kvikio::RemoteHandle": + cpp_RemoteHandle( + unique_ptr[cpp_RemoteEndpoint] endpoint, size_t nbytes + ) except + + cpp_RemoteHandle(unique_ptr[cpp_RemoteEndpoint] endpoint) except + + int nbytes() except + + const cpp_RemoteEndpoint& endpoint() except + + size_t read( + void* buf, + size_t size, + size_t file_offset + ) except + + future[size_t] pread( + void* buf, + size_t size, + size_t file_offset + ) except + + + +cdef string _to_string(str s): + """Convert Python object to a C++ string (if None, return the empty string)""" + if s is not None: + return s.encode() + else: + return string() + +# Helper function to cast an endpoint to its base class `RemoteEndpoint` +cdef extern from *: + """ + template + std::unique_ptr cast_to_remote_endpoint(T endpoint) + { + return std::move(endpoint); + } + """ + cdef unique_ptr[cpp_RemoteEndpoint] cast_to_remote_endpoint[T](T handle) except + + + +cdef class RemoteFile: + cdef unique_ptr[cpp_RemoteHandle] _handle + + @staticmethod + cdef RemoteFile _from_endpoint( + unique_ptr[cpp_RemoteEndpoint] ep, + nbytes: Optional[int], + ): + cdef RemoteFile ret = RemoteFile() + if nbytes is None: + ret._handle = make_unique[cpp_RemoteHandle](move(ep)) + return ret + cdef size_t n = nbytes + ret._handle = make_unique[cpp_RemoteHandle](move(ep), n) + return ret + + @staticmethod + def open_http( + url: str, + nbytes: Optional[int], + ): + return RemoteFile._from_endpoint( + cast_to_remote_endpoint( + make_unique[cpp_HttpEndpoint](_to_string(url)) + ), + nbytes + ) + + @staticmethod + def open_s3( + bucket_name: str, + object_name: str, + nbytes: Optional[int], + ): + return RemoteFile._from_endpoint( + cast_to_remote_endpoint( + make_unique[cpp_S3Endpoint]( + _to_string(bucket_name), _to_string(object_name) + ) + ), + nbytes + ) + + @staticmethod + def open_s3_from_http_url( + url: str, + nbytes: Optional[int], + ): + 
return RemoteFile._from_endpoint( + cast_to_remote_endpoint( + make_unique[cpp_S3Endpoint](_to_string(url)) + ), + nbytes + ) + + @staticmethod + def open_s3_from_s3_url( + url: str, + nbytes: Optional[int], + ): + cdef pair[string, string] bucket_and_object = cpp_parse_s3_url(_to_string(url)) + return RemoteFile._from_endpoint( + cast_to_remote_endpoint( + make_unique[cpp_S3Endpoint]( + bucket_and_object.first, bucket_and_object.second + ) + ), + nbytes + ) + + def __str__(self) -> str: + cdef string ep_str = deref(self._handle).endpoint().str() + return f'<{self.__class__.__name__} "{ep_str.decode()}">' + + def nbytes(self) -> int: + return deref(self._handle).nbytes() + + def read(self, buf, size: Optional[int], file_offset: int) -> int: + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + return deref(self._handle).read( + info.first, + info.second, + file_offset, + ) + + def pread(self, buf, size: Optional[int], file_offset: int) -> IOFuture: + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + return _wrap_io_future( + deref(self._handle).pread( + info.first, + info.second, + file_offset, + ) + ) diff --git a/python/kvikio/kvikio/benchmarks/http_io.py b/python/kvikio/kvikio/benchmarks/http_io.py new file mode 100644 index 0000000000..68d4643004 --- /dev/null +++ b/python/kvikio/kvikio/benchmarks/http_io.py @@ -0,0 +1,174 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import argparse +import contextlib +import pathlib +import statistics +import tempfile +import time +from functools import partial + +import cupy +import numpy +from dask.utils import format_bytes + +import kvikio +import kvikio.defaults +from kvikio.utils import LocalHttpServer + + +def run_numpy_like(args, xp): + src = numpy.arange(args.nelem, dtype=args.dtype) + src.tofile(args.server_root_path / "data") + dst = xp.empty_like(src) + url = f"{args.server_url}/data" + + def run() -> float: + t0 = time.perf_counter() + with kvikio.RemoteFile.open_http(url, nbytes=src.nbytes) as f: + res = f.read(dst) + t1 = time.perf_counter() + assert res == args.nbytes, f"IO mismatch, expected {args.nbytes} got {res}" + xp.testing.assert_array_equal(src, dst) + return t1 - t0 + + for _ in range(args.nruns): + yield run() + + +API = { + "cupy": partial(run_numpy_like, xp=cupy), + "numpy": partial(run_numpy_like, xp=numpy), +} + + +def main(args): + cupy.cuda.set_allocator(None) # Disable CuPy's default memory pool + cupy.arange(10) # Make sure CUDA is initialized + + kvikio.defaults.num_threads_reset(args.nthreads) + print("Roundtrip benchmark") + print("--------------------------------------") + print(f"nelem | {args.nelem} ({format_bytes(args.nbytes)})") + print(f"dtype | {args.dtype}") + print(f"nthreads | {args.nthreads}") + print(f"nruns | {args.nruns}") + print(f"server | {args.server}") + if args.server is None: + print("--------------------------------------") + print("WARNING: the bundled server is slow, ") + print("consider using --server.") + print("======================================") + + # Run each benchmark using the requested APIs + for api in args.api: + res = [] + for elapsed in API[api](args): + res.append(elapsed) + + def pprint_api_res(name, samples): + samples = [args.nbytes / s for s in samples] # Convert to throughput + mean = statistics.harmonic_mean(samples) if len(samples) > 1 else samples[0] + ret = f"{api}-{name}".ljust(18) + ret += f"| {format_bytes(mean).rjust(10)}/s".ljust(14) + if len(samples) > 
1: + stdev = statistics.stdev(samples) / mean * 100 + ret += " ± %5.2f %%" % stdev + ret += " (" + for sample in samples: + ret += f"{format_bytes(sample)}/s, " + ret = ret[:-2] + ")" # Replace trailing comma + return ret + + print(pprint_api_res("read", res)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="HTTP benchmark") + parser.add_argument( + "-n", + "--nelem", + metavar="NELEM", + default="1024", + type=int, + help="Number of elements (default: %(default)s).", + ) + parser.add_argument( + "--dtype", + metavar="DATATYPE", + default="float32", + type=numpy.dtype, + help="The data type of each element (default: %(default)s).", + ) + parser.add_argument( + "--nruns", + metavar="RUNS", + default=1, + type=int, + help="Number of runs per API (default: %(default)s).", + ) + parser.add_argument( + "-t", + "--nthreads", + metavar="THREADS", + default=1, + type=int, + help="Number of threads to use (default: %(default)s).", + ) + parser.add_argument( + "--server", + default=None, + help=( + "Connect to an external http server as opposed " + "to the bundled (very slow) HTTP server. " + "Remember to also set --server-root-path." + ), + ) + parser.add_argument( + "--server-root-path", + default=None, + help="Path to the root directory that `--server` serves (local path).", + ) + parser.add_argument( + "--bundled-server-lifetime", + metavar="SECONDS", + default=3600, + type=int, + help="Maximum lifetime of the bundled server (default: %(default)s).", + ) + parser.add_argument( + "--api", + metavar="API", + default=list(API.keys())[0], # defaults to the first API + nargs="+", + choices=tuple(API.keys()) + ("all",), + help="List of APIs to use {%(choices)s} (default: %(default)s).", + ) + args = parser.parse_args() + args.nbytes = args.nelem * args.dtype.itemsize + if "all" in args.api: + args.api = tuple(API.keys()) + + with contextlib.ExitStack() as context_stack: + if args.server is None: + # Create a tmp dir for the bundled server to serve + temp_dir = tempfile.TemporaryDirectory() + args.bundled_server_root_dir = pathlib.Path(temp_dir.name) + context_stack.enter_context(temp_dir) + + # Create the bundled server + bundled_server = LocalHttpServer( + root_path=args.bundled_server_root_dir, + range_support=True, + max_lifetime=args.bundled_server_lifetime, + ) + context_stack.enter_context(bundled_server) + args.server_url = bundled_server.url + args.server_root_path = args.bundled_server_root_dir + else: + args.server_url = args.server + if args.server_root_path is None: + raise ValueError("please set --server-root-path") + args.server_root_path = pathlib.Path(args.server_root_path) + main(args) diff --git a/python/kvikio/kvikio/benchmarks/s3_io.py b/python/kvikio/kvikio/benchmarks/s3_io.py new file mode 100644 index 0000000000..5e1846a1e5 --- /dev/null +++ b/python/kvikio/kvikio/benchmarks/s3_io.py @@ -0,0 +1,245 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
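# A small worked sketch (not part of the patch) of the statistics used by
# pprint_api_res() in the benchmark above: per-run elapsed times are turned
# into throughputs and summarized with a harmonic mean (the appropriate mean
# for rates over a fixed amount of work), with the spread reported as a
# percentage of that mean. The numbers below are made up.
import statistics

from dask.utils import format_bytes

nbytes = 4 * 1024**2  # bytes moved per run (example value)
elapsed = [0.41, 0.39, 0.44]  # seconds per run (example values)

samples = [nbytes / s for s in elapsed]  # throughput of each run
mean = statistics.harmonic_mean(samples)
stdev_pct = statistics.stdev(samples) / mean * 100
print(f"read | {format_bytes(mean)}/s ± {stdev_pct:5.2f} %")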
+ +import argparse +import contextlib +import multiprocessing +import os +import socket +import statistics +import sys +import time +from functools import partial +from typing import ContextManager +from urllib.parse import urlparse + +import boto3 +import cupy +import numpy +from dask.utils import format_bytes + +import kvikio +import kvikio.defaults + + +def get_local_port() -> int: + """Return an available port""" + sock = socket.socket() + sock.bind(("127.0.0.1", 0)) + port = sock.getsockname()[1] + sock.close() + return port + + +def start_s3_server(lifetime: int): + """Start a server and run it for `lifetime` seconds. + NB: to stop before `lifetime`, kill the process/thread running this function. + """ + from moto.server import ThreadedMotoServer + + # Silence the activity info from ThreadedMotoServer + sys.stderr = open(os.devnull, "w") + url = urlparse(os.environ["AWS_ENDPOINT_URL"]) + server = ThreadedMotoServer(ip_address=url.hostname, port=url.port) + server.start() + time.sleep(lifetime) + + +@contextlib.contextmanager +def local_s3_server(lifetime: int): + """Start a server and run it for `lifetime` seconds or kill it on context exit""" + # Use fake aws credentials + os.environ["AWS_ACCESS_KEY_ID"] = "foobar_key" + os.environ["AWS_SECRET_ACCESS_KEY"] = "foobar_secret" + os.environ["AWS_DEFAULT_REGION"] = "us-east-1" + p = multiprocessing.Process(target=start_s3_server, args=(lifetime,)) + p.start() + yield + p.kill() + + +def create_client_and_bucket(): + client = boto3.client("s3", endpoint_url=os.getenv("AWS_ENDPOINT_URL", None)) + try: + bucket_names = {bucket["Name"] for bucket in client.list_buckets()["Buckets"]} + if args.bucket not in bucket_names: + client.create_bucket(Bucket=args.bucket, ACL="public-read-write") + except Exception: + print( + "Problem accessing the S3 server? Using wrong credentials? Try setting " + "AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and/or AWS_ENDPOINT_URL. Also, " + "if the bucket doesn't exist, make sure you have the required permission. 
" + "Alternatively, use the bundled server `--use-bundled-server`:\n", + file=sys.stderr, + flush=True, + ) + raise + return client + + +def run_numpy_like(args, xp): + # Upload data to S3 server + data = numpy.arange(args.nelem, dtype=args.dtype) + recv = xp.empty_like(data) + + client = create_client_and_bucket() + client.put_object(Bucket=args.bucket, Key="data", Body=bytes(data)) + url = f"s3://{args.bucket}/data" + + def run() -> float: + t0 = time.perf_counter() + with kvikio.RemoteFile.open_s3_url(url) as f: + res = f.read(recv) + t1 = time.perf_counter() + assert res == args.nbytes, f"IO mismatch, expected {args.nbytes} got {res}" + xp.testing.assert_array_equal(data, recv) + return t1 - t0 + + for _ in range(args.nruns): + yield run() + + +def run_cudf(args, kvikio_remote_io: bool): + import cudf + + cudf.set_option("kvikio_remote_io", kvikio_remote_io) + url = f"s3://{args.bucket}/data" + + # Upload data to S3 server + create_client_and_bucket() + data = cupy.random.rand(args.nelem).astype(args.dtype) + df = cudf.DataFrame({"a": data}) + df.to_parquet(url) + + def run() -> float: + t0 = time.perf_counter() + cudf.read_parquet(url) + t1 = time.perf_counter() + return t1 - t0 + + for _ in range(args.nruns): + yield run() + + +API = { + "cupy": partial(run_numpy_like, xp=cupy), + "numpy": partial(run_numpy_like, xp=numpy), + "cudf-kvikio": partial(run_cudf, kvikio_remote_io=True), + "cudf-fsspec": partial(run_cudf, kvikio_remote_io=False), +} + + +def main(args): + cupy.cuda.set_allocator(None) # Disable CuPy's default memory pool + cupy.arange(10) # Make sure CUDA is initialized + + os.environ["KVIKIO_NTHREADS"] = str(args.nthreads) + kvikio.defaults.num_threads_reset(args.nthreads) + + print("Remote S3 benchmark") + print("--------------------------------------") + print(f"nelem | {args.nelem} ({format_bytes(args.nbytes)})") + print(f"dtype | {args.dtype}") + print(f"nthreads | {args.nthreads}") + print(f"nruns | {args.nruns}") + print(f"file | s3://{args.bucket}/data") + if args.use_bundled_server: + print("--------------------------------------") + print("Using the bundled local server is slow") + print("and can be misleading. 
Consider using") + print("a local MinIO or official S3 server.") + print("======================================") + + # Run each benchmark using the requested APIs + for api in args.api: + res = [] + for elapsed in API[api](args): + res.append(elapsed) + + def pprint_api_res(name, samples): + samples = [args.nbytes / s for s in samples] # Convert to throughput + mean = statistics.harmonic_mean(samples) if len(samples) > 1 else samples[0] + ret = f"{api}-{name}".ljust(18) + ret += f"| {format_bytes(mean).rjust(10)}/s".ljust(14) + if len(samples) > 1: + stdev = statistics.stdev(samples) / mean * 100 + ret += " ± %5.2f %%" % stdev + ret += " (" + for sample in samples: + ret += f"{format_bytes(sample)}/s, " + ret = ret[:-2] + ")" # Replace trailing comma + return ret + + print(pprint_api_res("read", res)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Roundtrip benchmark") + parser.add_argument( + "-n", + "--nelem", + metavar="NELEM", + default="1024", + type=int, + help="Number of elements (default: %(default)s).", + ) + parser.add_argument( + "--dtype", + metavar="DATATYPE", + default="float32", + type=numpy.dtype, + help="The data type of each element (default: %(default)s).", + ) + parser.add_argument( + "--nruns", + metavar="RUNS", + default=1, + type=int, + help="Number of runs per API (default: %(default)s).", + ) + parser.add_argument( + "-t", + "--nthreads", + metavar="THREADS", + default=1, + type=int, + help="Number of threads to use (default: %(default)s).", + ) + parser.add_argument( + "--use-bundled-server", + action="store_true", + help="Launch and use a local slow S3 server (ThreadedMotoServer).", + ) + parser.add_argument( + "--bundled-server-lifetime", + metavar="SECONDS", + default=3600, + type=int, + help="Maximum lifetime of the bundled server (default: %(default)s).", + ) + parser.add_argument( + "--bucket", + metavar="NAME", + default="kvikio-s3-benchmark", + type=str, + help="Name of the AWS S3 bucket to use (default: %(default)s).", + ) + parser.add_argument( + "--api", + metavar="API", + default="all", + nargs="+", + choices=tuple(API.keys()) + ("all",), + help="List of APIs to use {%(choices)s} (default: %(default)s).", + ) + args = parser.parse_args() + args.nbytes = args.nelem * args.dtype.itemsize + if "all" in args.api: + args.api = tuple(API.keys()) + + ctx: ContextManager = contextlib.nullcontext() + if args.use_bundled_server: + os.environ["AWS_ENDPOINT_URL"] = f"http://127.0.0.1:{get_local_port()}" + ctx = local_s3_server(args.bundled_server_lifetime) + with ctx: + main(args) diff --git a/python/kvikio/kvikio/benchmarks/utils.py b/python/kvikio/kvikio/benchmarks/utils.py index 69375b8c21..fa25c361a4 100644 --- a/python/kvikio/kvikio/benchmarks/utils.py +++ b/python/kvikio/kvikio/benchmarks/utils.py @@ -12,6 +12,7 @@ from dask.utils import format_bytes import kvikio +import kvikio.cufile_driver import kvikio.defaults @@ -26,7 +27,8 @@ def drop_vm_cache() -> None: def pprint_sys_info() -> None: """Pretty print system information""" - props = kvikio.DriverProperties() + version = kvikio.cufile_driver.libcufile_version() + props = kvikio.cufile_driver.DriverProperties() try: import pynvml @@ -40,6 +42,10 @@ def pprint_sys_info() -> None: gpu_name = f"{pynvml.nvmlDeviceGetName(dev)} (dev #0)" mem_total = format_bytes(pynvml.nvmlDeviceGetMemoryInfo(dev).total) bar1_total = format_bytes(pynvml.nvmlDeviceGetBAR1MemoryInfo(dev).bar1Total) + if version == (0, 0): + libcufile_version = "unknown (earlier than cuFile 1.8)" + else: + 
libcufile_version = f"{version[0]}.{version[1]}" gds_version = "N/A (Compatibility Mode)" if props.is_gds_available: gds_version = f"v{props.major_version}.{props.minor_version}" @@ -60,6 +66,7 @@ def pprint_sys_info() -> None: print(f"GPU | {gpu_name}") print(f"GPU Memory Total | {mem_total}") print(f"BAR1 Memory Total | {bar1_total}") + print(f"libcufile version | {libcufile_version}") print(f"GDS driver | {gds_version}") print(f"GDS config.json | {gds_config_json_path}") diff --git a/python/kvikio/kvikio/cufile_driver.py b/python/kvikio/kvikio/cufile_driver.py new file mode 100644 index 0000000000..fb32be347a --- /dev/null +++ b/python/kvikio/kvikio/cufile_driver.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import atexit +from typing import Tuple + +from kvikio._lib import cufile_driver # type: ignore + +# TODO: Wrap nicely, maybe as a dataclass? +# +DriverProperties = cufile_driver.DriverProperties + + +def libcufile_version() -> Tuple[int, int]: + """Get the libcufile version. + + Returns (0, 0) for cuFile versions prior to v1.8. + + Notes + ----- + This is not the version of the CUDA toolkit. cufile is part of the + toolkit but follows its own version scheme. + + Returns + ------- + The version as a tuple (MAJOR, MINOR). + """ + v = cufile_driver.libcufile_version() + # Convert the integer version like 1080 to (1, 8). + major, minor = divmod(v, 1000) + return (major, minor // 10) + + +def driver_open() -> None: + """Open the cuFile driver + + cuFile accepts multiple calls to `driver_open()`. Only the first call + opens the driver, but every call must have a matching call to + `driver_close()`. + + Normally, it is not required to open and close the cuFile driver since + it is done automatically. + + Raises + ------ + RuntimeError + If cuFile isn't available. + """ + return cufile_driver.driver_open() + + +def driver_close() -> None: + """Close the cuFile driver + + cuFile accepts multiple calls to `driver_open()`. Only the first call + opens the driver, but every call must have a matching call to + `driver_close()`. + + Raises + ------ + RuntimeError + If cuFile isn't available. + """ + return cufile_driver.driver_close() + + +def initialize() -> None: + """Open the cuFile driver and close it again at module exit + + Normally, it is not required to open and close the cuFile driver since + it is done automatically. + + Notes + ----- + Registers an atexit handler that calls :func:`driver_close`. + + Raises + ------ + RuntimeError + If cuFile isn't available. + """ + driver_open() + atexit.register(driver_close) diff --git a/python/kvikio/kvikio/defaults.py b/python/kvikio/kvikio/defaults.py index a0ff265873..9e959c1f74 100644 --- a/python/kvikio/kvikio/defaults.py +++ b/python/kvikio/kvikio/defaults.py @@ -7,7 +7,7 @@ import kvikio._lib.defaults -def compat_mode() -> bool: +def compat_mode() -> kvikio.CompatMode: """Check if KvikIO is running in compatibility mode. Notice, this is not the same as the compatibility mode in cuFile. That is, @@ -18,10 +18,11 @@ def compat_mode() -> bool: Set the environment variable `KVIKIO_COMPAT_MODE` to enable/disable compatibility mode. 
By default, compatibility mode is enabled: + - when `libcufile` cannot be found - when running in Windows Subsystem for Linux (WSL) - when `/run/udev` isn't readable, which typically happens when running inside - a docker image not launched with `--volume /run/udev:/run/udev:ro` + a docker image not launched with `--volume /run/udev:/run/udev:ro` Returns ------- @@ -31,32 +32,36 @@ def compat_mode() -> bool: return kvikio._lib.defaults.compat_mode() -def compat_mode_reset(enable: bool) -> None: +def compat_mode_reset(compatmode: kvikio.CompatMode) -> None: """Reset the compatibility mode. Use this function to enable/disable compatibility mode explicitly. Parameters ---------- - enable : bool - Set to True to enable and False to disable compatibility mode + compatmode : kvikio.CompatMode + Set to kvikio.CompatMode.ON to enable and kvikio.CompatMode.OFF to disable + compatibility mode, or kvikio.CompatMode.AUTO to let KvikIO determine: try + OFF first, and upon failure, fall back to ON. """ - kvikio._lib.defaults.compat_mode_reset(enable) + kvikio._lib.defaults.compat_mode_reset(compatmode) @contextlib.contextmanager -def set_compat_mode(enable: bool): +def set_compat_mode(compatmode: kvikio.CompatMode): """Context for resetting the compatibility mode. Parameters ---------- - enable : bool - Set to True to enable and False to disable compatibility mode + compatmode : kvikio.CompatMode + Set to kvikio.CompatMode.ON to enable and kvikio.CompatMode.OFF to disable + compatibility mode, or kvikio.CompatMode.AUTO to let KvikIO determine: try + OFF first, and upon failure, fall back to ON. """ num_threads_reset(get_num_threads()) # Sync all running threads old_value = compat_mode() try: - compat_mode_reset(enable) + compat_mode_reset(compatmode) yield finally: compat_mode_reset(old_value) diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py new file mode 100644 index 0000000000..f10f4b49f9 --- /dev/null +++ b/python/kvikio/kvikio/remote_file.py @@ -0,0 +1,201 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +from __future__ import annotations + +import functools +from typing import Optional + +from kvikio.cufile import IOFuture + + +@functools.cache +def is_remote_file_available() -> bool: + """Check if the remote module is available""" + try: + import kvikio._lib.remote_handle # noqa: F401 + except ImportError: + return False + else: + return True + + +@functools.cache +def _get_remote_module(): + """Get the remote module or raise an error""" + if not is_remote_file_available(): + raise RuntimeError( + "RemoteFile not available, please build KvikIO " + "with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" + ) + import kvikio._lib.remote_handle + + return kvikio._lib.remote_handle + + +class RemoteFile: + """File handle of a remote file.""" + + def __init__(self, handle): + """Create a remote file from a Cython handle. + + This constructor should not be called directly instead use a + factory method like `RemoteFile.open_http()` + + Parameters + ---------- + handle : kvikio._lib.remote_handle.RemoteFile + The Cython handle + """ + assert isinstance(handle, _get_remote_module().RemoteFile) + self._handle = handle + + @classmethod + def open_http( + cls, + url: str, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """Open a http file. + + Parameters + ---------- + url + URL to the remote file. + nbytes + The size of the file. If None, KvikIO will ask the server + for the file size. 
+ """ + return RemoteFile(_get_remote_module().RemoteFile.open_http(url, nbytes)) + + @classmethod + def open_s3( + cls, + bucket_name: str, + object_name: str, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """Open a AWS S3 file from a bucket name and object name. + + Please make sure to set the AWS environment variables: + - `AWS_DEFAULT_REGION` + - `AWS_ACCESS_KEY_ID` + - `AWS_SECRET_ACCESS_KEY` + + Additionally, to overwrite the AWS endpoint, set `AWS_ENDPOINT_URL`. + See + + Parameters + ---------- + bucket_name + The bucket name of the file. + object_name + The object name of the file. + nbytes + The size of the file. If None, KvikIO will ask the server + for the file size. + """ + return RemoteFile( + _get_remote_module().RemoteFile.open_s3(bucket_name, object_name, nbytes) + ) + + @classmethod + def open_s3_url( + cls, + url: str, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """Open a AWS S3 file from an URL. + + The `url` can take two forms: + - A full http url such as "http://127.0.0.1/my/file", or + - A S3 url such as "s3:///". + + Please make sure to set the AWS environment variables: + - `AWS_DEFAULT_REGION` + - `AWS_ACCESS_KEY_ID` + - `AWS_SECRET_ACCESS_KEY` + + Additionally, if `url` is a S3 url, it is possible to overwrite the AWS endpoint + by setting `AWS_ENDPOINT_URL`. + See + + Parameters + ---------- + url + Either a http url or a S3 url. + nbytes + The size of the file. If None, KvikIO will ask the server + for the file size. + """ + url = url.lower() + if url.startswith("http://") or url.startswith("https://"): + return RemoteFile( + _get_remote_module().RemoteFile.open_s3_from_http_url(url, nbytes) + ) + if url.startswith("s3://"): + return RemoteFile( + _get_remote_module().RemoteFile.open_s3_from_s3_url(url, nbytes) + ) + raise ValueError(f"Unsupported protocol: {url}") + + def close(self) -> None: + """Close the file""" + pass + + def __enter__(self) -> RemoteFile: + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + self.close() + + def __str__(self) -> str: + return str(self._handle) + + def nbytes(self) -> int: + """Get the file size. + + Note, this is very fast, no communication needed. + + Returns + ------- + The number of bytes. + """ + return self._handle.nbytes() + + def read(self, buf, size: Optional[int] = None, file_offset: int = 0) -> int: + """Read from remote source into buffer (host or device memory) in parallel. + + Parameters + ---------- + buf : buffer-like or array-like + Device or host buffer to read into. + size + Size in bytes to read. + file_offset + Offset in the file to read from. + + Returns + ------- + The size of bytes that were successfully read. + """ + return self.pread(buf, size, file_offset).get() + + def pread(self, buf, size: Optional[int] = None, file_offset: int = 0) -> IOFuture: + """Read from remote source into buffer (host or device memory) in parallel. + + Parameters + ---------- + buf : buffer-like or array-like + Device or host buffer to read into. + size + Size in bytes to read. + file_offset + Offset in the file to read from. + + Returns + ------- + Future that on completion returns the size of bytes that were successfully + read. + """ + return IOFuture(self._handle.pread(buf, size, file_offset)) diff --git a/python/kvikio/kvikio/utils.py b/python/kvikio/kvikio/utils.py new file mode 100644 index 0000000000..09a9f2062a --- /dev/null +++ b/python/kvikio/kvikio/utils.py @@ -0,0 +1,81 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
+ +import functools +import multiprocessing +import pathlib +import threading +import time +from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer + + +class LocalHttpServer: + """Local http server - slow but convenient""" + + @staticmethod + def _server( + queue: multiprocessing.Queue, + root_path: str, + range_support: bool, + max_lifetime: int, + ): + if range_support: + from RangeHTTPServer import RangeRequestHandler + + handler = RangeRequestHandler + else: + handler = SimpleHTTPRequestHandler + httpd = ThreadingHTTPServer( + ("127.0.0.1", 0), functools.partial(handler, directory=root_path) + ) + thread = threading.Thread(target=httpd.serve_forever) + thread.start() + queue.put(httpd.server_address) + time.sleep(max_lifetime) + print( + f"ThreadingHTTPServer shutting down because of timeout ({max_lifetime}sec)" + ) + + def __init__( + self, + root_path: str | pathlib.Path, + range_support: bool = True, + max_lifetime: int = 120, + ) -> None: + """Create a context that starts a local http server. + + Example + ------- + >>> with LocalHttpServer(root_path="/my/server/") as server: + ... with kvikio.RemoteFile.open_http(f"{server.url}/myfile") as f: + ... f.read(...) + + Parameters + ---------- + root_path + Path to the directory the server will serve. + range_support + Whether to support the ranges, required by `RemoteFile.open_http()`. + Depend on the `RangeHTTPServer` module (`pip install rangehttpserver`). + max_lifetime + Maximum lifetime of the server (in seconds). + """ + self.root_path = root_path + self.range_support = range_support + self.max_lifetime = max_lifetime + + def __enter__(self): + queue = multiprocessing.Queue() + self.process = multiprocessing.Process( + target=LocalHttpServer._server, + args=(queue, str(self.root_path), self.range_support, self.max_lifetime), + ) + self.process.start() + ip, port = queue.get() + self.ip = ip + self.port = port + self.url = f"http://{ip}:{port}" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.process.kill() diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index d7c215213c..4869a63fab 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -20,8 +20,10 @@ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ "cupy-cuda11x>=12.0.0", + "libkvikio==24.12.*,>=0.0.0a0", "numcodecs !=0.12.0", "numpy>=1.23,<3.0a0", + "nvidia-nvcomp==4.1.0.6", "packaging", "zarr", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -38,10 +40,13 @@ classifiers = [ [project.optional-dependencies] test = [ - "cuda-python>=11.7.1,<12.0a0", - "dask>=2022.05.2", + "boto3>=1.21.21", + "cuda-python>=11.7.1,<12.0a0,<=11.8.3", + "moto[server]>=4.0.8", "pytest", "pytest-cov", + "rangehttpserver", + "rapids-dask-dependency==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] @@ -114,7 +119,7 @@ matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.0", - "libkvikio==24.10.*", + "libkvikio==24.12.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
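# A stdlib-only sketch (not part of the patch) of the pattern LocalHttpServer
# above is built on: bind ThreadingHTTPServer to port 0 so the OS picks a free
# port, serve a chosen directory via functools.partial(handler, directory=...),
# and run the server on a background thread. The directory is a placeholder.
import functools
import threading
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer

httpd = ThreadingHTTPServer(
    ("127.0.0.1", 0),  # port 0: the OS assigns an ephemeral port
    functools.partial(SimpleHTTPRequestHandler, directory="/tmp"),  # placeholder
)
ip, port = httpd.server_address
print(f"serving on http://{ip}:{port}")

thread = threading.Thread(target=httpd.serve_forever, daemon=True)
thread.start()
# ...issue requests against the server...
httpd.shutdown()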
@@ -134,8 +139,20 @@ provider = "scikit_build_core.metadata.regex" input = "kvikio/VERSION" regex = "(?P<value>.*)" +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] filterwarnings = [ "error", "ignore:Jitify is performing a one-time only warm-up to populate the persistent cache", + "ignore::DeprecationWarning:botocore.*", +] +markers = [ + "cufile: tests to skip if cuFile isn't available e.g. run with `pytest -m 'not cufile'`" ] diff --git a/python/kvikio/tests/test_async_io.py b/python/kvikio/tests/test_async_io.py index 5ddc7536d8..2de4aef9c9 100644 --- a/python/kvikio/tests/test_async_io.py +++ b/python/kvikio/tests/test_async_io.py @@ -30,7 +30,11 @@ def test_read_write(tmp_path, size): assert f.raw_write_async(a, stream.ptr).check_bytes_done() == a.nbytes # Try to read file opened in write-only mode - with pytest.raises(RuntimeError, match="Operation not permitted"): + # POSIX read would yield the error "Operation not permitted" + # cuFile read would yield the error "unsupported file open flags" + with pytest.raises( + RuntimeError, match="Operation not permitted|unsupported file open flags" + ): # The exception is raised when we call the raw_read_async API. future_stream = f.raw_read_async(a, stream.ptr) future_stream.check_bytes_done() diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index 3bdaf6613e..307b0b258d 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -8,6 +8,8 @@ import pytest +import kvikio + benchmarks_path = ( Path(os.path.realpath(__file__)).parent.parent / "kvikio" / "benchmarks" ) @@ -78,3 +80,62 @@ def test_zarr_io(run_cmd, tmp_path, api): cwd=benchmarks_path, ) assert retcode == 0 + + +@pytest.mark.parametrize( + "api", + [ + "cupy", + "numpy", + ], +) +def test_http_io(run_cmd, api): + """Test benchmarks/http_io.py""" + + if not kvikio.is_remote_file_available(): + pytest.skip( + "RemoteFile not available, please build KvikIO " + "with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" + ) + retcode = run_cmd( + cmd=[ + sys.executable, + "http_io.py", + "-n", + "1000", + "--api", + api, + ], + cwd=benchmarks_path, + ) + assert retcode == 0 + + +@pytest.mark.parametrize( + "api", + [ + "cupy", + "numpy", + ], +) +def test_s3_io(run_cmd, api): + """Test benchmarks/s3_io.py""" + + if not kvikio.is_remote_file_available(): + pytest.skip( + "RemoteFile not available, please build KvikIO " + "with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" + ) + retcode = run_cmd( + cmd=[ + sys.executable, + "s3_io.py", + "-n", + "1000", + "--api", + api, + "--use-bundled-server", + ], + cwd=benchmarks_path, + ) + assert retcode == 0 diff --git a/python/kvikio/tests/test_cufile_driver.py b/python/kvikio/tests/test_cufile_driver.py new file mode 100644 index 0000000000..a1dc3a6454 --- /dev/null +++ b/python/kvikio/tests/test_cufile_driver.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms.
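# A usage sketch (not part of the patch) for the kvikio.cufile_driver module
# exercised by the new test_cufile_driver.py below. libcufile_version()
# decodes the integer reported by libcufile, e.g. 1080 -> divmod(1080, 1000)
# = (1, 80) -> (1, 8); (0, 0) means the library predates cuFile 1.8 or is
# unavailable.
import kvikio.cufile_driver

major, minor = kvikio.cufile_driver.libcufile_version()
if (major, minor) == (0, 0):
    print("libcufile: unknown (earlier than cuFile 1.8)")
else:
    print(f"libcufile: {major}.{minor}")

# Explicit open/close is optional (KvikIO manages the driver automatically);
# every driver_open() must be matched by a driver_close(), and both raise
# RuntimeError when cuFile isn't available, hence the guard.
try:
    kvikio.cufile_driver.driver_open()
    kvikio.cufile_driver.driver_close()
except RuntimeError:
    pass  # e.g. running without GDS/cuFile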
+ +import pytest + +import kvikio.cufile_driver + + +def test_version(): + major, minor = kvikio.cufile_driver.libcufile_version() + assert major >= 0 + assert minor >= 0 + + +@pytest.mark.cufile +def test_open_and_close(): + kvikio.cufile_driver.driver_open() + kvikio.cufile_driver.driver_close() diff --git a/python/kvikio/tests/test_defaults.py b/python/kvikio/tests/test_defaults.py index 39892a784d..d7048c418d 100644 --- a/python/kvikio/tests/test_defaults.py +++ b/python/kvikio/tests/test_defaults.py @@ -8,17 +8,19 @@ @pytest.mark.skipif( - kvikio.defaults.compat_mode(), + kvikio.defaults.compat_mode() == kvikio.CompatMode.ON, reason="cannot test `compat_mode` when already running in compatibility mode", ) def test_compat_mode(): """Test changing `compat_mode`""" before = kvikio.defaults.compat_mode() - with kvikio.defaults.set_compat_mode(True): - assert kvikio.defaults.compat_mode() - kvikio.defaults.compat_mode_reset(False) - assert not kvikio.defaults.compat_mode() + with kvikio.defaults.set_compat_mode(kvikio.CompatMode.ON): + assert kvikio.defaults.compat_mode() == kvikio.CompatMode.ON + kvikio.defaults.compat_mode_reset(kvikio.CompatMode.OFF) + assert kvikio.defaults.compat_mode() == kvikio.CompatMode.OFF + kvikio.defaults.compat_mode_reset(kvikio.CompatMode.AUTO) + assert kvikio.defaults.compat_mode() == kvikio.CompatMode.AUTO assert before == kvikio.defaults.compat_mode() diff --git a/python/kvikio/tests/test_examples.py b/python/kvikio/tests/test_examples.py index e9e1f83d08..07be1fc156 100644 --- a/python/kvikio/tests/test_examples.py +++ b/python/kvikio/tests/test_examples.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. import os @@ -7,6 +7,8 @@ import pytest +import kvikio + examples_path = Path(os.path.realpath(__file__)).parent / ".." / "examples" @@ -26,3 +28,16 @@ def test_zarr_cupy_nvcomp(tmp_path, monkeypatch): monkeypatch.syspath_prepend(str(examples_path)) import_module("zarr_cupy_nvcomp").main(tmp_path / "test-file") + + +def test_http_io(tmp_path, monkeypatch): + """Test examples/http_io.py""" + + if not kvikio.is_remote_file_available(): + pytest.skip( + "RemoteFile not available, please build KvikIO " + "with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" + ) + + monkeypatch.syspath_prepend(str(examples_path)) + import_module("http_io").main(tmp_path) diff --git a/python/kvikio/tests/test_http_io.py b/python/kvikio/tests/test_http_io.py new file mode 100644 index 0000000000..5c2c3888cd --- /dev/null +++ b/python/kvikio/tests/test_http_io.py @@ -0,0 +1,102 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
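# A usage sketch (not part of the patch) for the CompatMode enum that replaces
# the old boolean compatibility flag, mirroring test_defaults.py above. AUTO
# makes KvikIO try cuFile first and fall back to the POSIX path on failure.
import kvikio
import kvikio.defaults

print(kvikio.defaults.compat_mode())  # a kvikio.CompatMode value, not a bool

# Temporarily force compatibility mode on, restoring the old value on exit.
with kvikio.defaults.set_compat_mode(kvikio.CompatMode.ON):
    assert kvikio.defaults.compat_mode() == kvikio.CompatMode.ON

# Or reset it explicitly, e.g. back to automatic detection.
kvikio.defaults.compat_mode_reset(kvikio.CompatMode.AUTO)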
+ + +import numpy as np +import pytest + +import kvikio +import kvikio.defaults +from kvikio.utils import LocalHttpServer + +pytestmark = pytest.mark.skipif( + not kvikio.is_remote_file_available(), + reason=( + "RemoteFile not available, please build KvikIO " + "with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" + ), +) + + +@pytest.fixture +def http_server(request, tmpdir): + """Fixture to set up http server in separate process""" + range_support = True + if hasattr(request, "param"): + range_support = request.param.get("range_support", True) + + with LocalHttpServer(tmpdir, range_support, max_lifetime=60) as server: + yield server.url + + +def test_file_size(http_server, tmpdir): + a = np.arange(100) + a.tofile(tmpdir / "a") + with kvikio.RemoteFile.open_http(f"{http_server}/a") as f: + assert f.nbytes() == a.nbytes + + +@pytest.mark.parametrize("size", [10, 100, 1000]) +@pytest.mark.parametrize("nthreads", [1, 3]) +@pytest.mark.parametrize("tasksize", [99, 999]) +def test_read(http_server, tmpdir, xp, size, nthreads, tasksize): + a = xp.arange(size) + a.tofile(tmpdir / "a") + + with kvikio.defaults.set_num_threads(nthreads): + with kvikio.defaults.set_task_size(tasksize): + with kvikio.RemoteFile.open_http(f"{http_server}/a") as f: + assert f.nbytes() == a.nbytes + assert f"{http_server}/a" in str(f) + b = xp.empty_like(a) + assert f.read(b) == a.nbytes + xp.testing.assert_array_equal(a, b) + + +@pytest.mark.parametrize("nthreads", [1, 10]) +def test_large_read(http_server, tmpdir, xp, nthreads): + a = xp.arange(16_000_000) + a.tofile(tmpdir / "a") + + with kvikio.defaults.set_num_threads(nthreads): + with kvikio.RemoteFile.open_http(f"{http_server}/a") as f: + assert f.nbytes() == a.nbytes + assert f"{http_server}/a" in str(f) + b = xp.empty_like(a) + assert f.read(b) == a.nbytes + xp.testing.assert_array_equal(a, b) + + +def test_error_too_small_file(http_server, tmpdir, xp): + a = xp.arange(10, dtype="uint8") + b = xp.empty(100, dtype="uint8") + a.tofile(tmpdir / "a") + with kvikio.RemoteFile.open_http(f"{http_server}/a") as f: + assert f.nbytes() == a.nbytes + assert f"{http_server}/a" in str(f) + with pytest.raises( + ValueError, match=r"cannot read 0\+100 bytes into a 10 bytes file" + ): + f.read(b) + with pytest.raises( + ValueError, match=r"cannot read 100\+5 bytes into a 10 bytes file" + ): + f.read(b, size=5, file_offset=100) + + +@pytest.mark.parametrize("http_server", [{"range_support": False}], indirect=True) +def test_no_range_support(http_server, tmpdir, xp): + a = xp.arange(100, dtype="uint8") + a.tofile(tmpdir / "a") + b = xp.empty_like(a) + with kvikio.RemoteFile.open_http(f"{http_server}/a") as f: + assert f.nbytes() == a.nbytes + assert f"{http_server}/a" in str(f) + with pytest.raises( + OverflowError, match="maybe the server doesn't support file ranges?" + ): + f.read(b, size=10, file_offset=0) + with pytest.raises( + OverflowError, match="maybe the server doesn't support file ranges?" + ): + f.read(b, size=10, file_offset=10) diff --git a/python/kvikio/tests/test_s3_io.py b/python/kvikio/tests/test_s3_io.py new file mode 100644 index 0000000000..1f2bae95d0 --- /dev/null +++ b/python/kvikio/tests/test_s3_io.py @@ -0,0 +1,159 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
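# A usage sketch (not part of the patch) of the size/file_offset semantics
# exercised by test_http_io.py above. Partial reads are issued as HTTP range
# requests, which is why the tests need range_support=True (and the
# rangehttpserver package) in LocalHttpServer.
import pathlib
import tempfile

import numpy

import kvikio
from kvikio.utils import LocalHttpServer

with tempfile.TemporaryDirectory() as tmpdir:
    tmpdir = pathlib.Path(tmpdir)
    a = numpy.arange(100, dtype="uint8")
    a.tofile(tmpdir / "a")
    with LocalHttpServer(root_path=tmpdir, range_support=True) as server:
        with kvikio.RemoteFile.open_http(f"{server.url}/a") as f:
            b = numpy.empty(10, dtype="uint8")
            # Read bytes [20, 30) of the remote file into `b`.
            assert f.read(b, size=10, file_offset=20) == 10
            numpy.testing.assert_array_equal(a[20:30], b)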
+ +import multiprocessing as mp +import socket +import time +from contextlib import contextmanager + +import pytest + +import kvikio +import kvikio.defaults + +pytestmark = pytest.mark.skipif( + not kvikio.is_remote_file_available(), + reason=( + "RemoteFile not available, please build KvikIO " + "with libcurl (-DKvikIO_REMOTE_SUPPORT=ON)" + ), +) + +# Notice, we import boto and moto after the `is_remote_file_available` check. +import boto3 # noqa: E402 +import moto # noqa: E402 +import moto.server # noqa: E402 + + +@pytest.fixture(scope="session") +def endpoint_ip(): + return "127.0.0.1" + + +@pytest.fixture(scope="session") +def endpoint_port(): + # Return a free port per worker session. + sock = socket.socket() + sock.bind(("127.0.0.1", 0)) + port = sock.getsockname()[1] + sock.close() + return port + + +def start_s3_server(ip_address, port): + server = moto.server.ThreadedMotoServer(ip_address=ip_address, port=port) + server.start() + time.sleep(600) + print("ThreadedMotoServer shutting down because of timeout (10min)") + + +@pytest.fixture(scope="session") +def s3_base(endpoint_ip, endpoint_port): + """Fixture to set up moto server in separate process""" + with pytest.MonkeyPatch.context() as monkeypatch: + # Use fake aws credentials + monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key") + monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") + monkeypatch.setenv("AWS_DEFAULT_REGION", "us-east-1") + monkeypatch.setenv("AWS_ENDPOINT_URL", f"http://{endpoint_ip}:{endpoint_port}") + + p = mp.Process(target=start_s3_server, args=(endpoint_ip, endpoint_port)) + p.start() + yield f"http://{endpoint_ip}:{endpoint_port}" + p.kill() + + +@contextmanager +def s3_context(s3_base, bucket, files=None): + if files is None: + files = {} + client = boto3.client("s3", endpoint_url=s3_base) + client.create_bucket(Bucket=bucket, ACL="public-read-write") + for f, data in files.items(): + client.put_object(Bucket=bucket, Key=f, Body=data) + yield s3_base + for f, data in files.items(): + try: + client.delete_object(Bucket=bucket, Key=f) + except Exception: + pass + + +def test_read_access(s3_base): + bucket_name = "bucket" + object_name = "data" + data = b"file content" + with s3_context( + s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(data)} + ) as server_address: + with kvikio.RemoteFile.open_s3_url(f"s3://{bucket_name}/{object_name}") as f: + assert f.nbytes() == len(data) + got = bytearray(len(data)) + assert f.read(got) == len(got) + + with kvikio.RemoteFile.open_s3(bucket_name, object_name) as f: + assert f.nbytes() == len(data) + got = bytearray(len(data)) + assert f.read(got) == len(got) + + with kvikio.RemoteFile.open_s3_url( + f"{server_address}/{bucket_name}/{object_name}" + ) as f: + assert f.nbytes() == len(data) + got = bytearray(len(data)) + assert f.read(got) == len(got) + + with pytest.raises(ValueError, match="Unsupported protocol"): + kvikio.RemoteFile.open_s3_url(f"unknown://{bucket_name}/{object_name}") + + with pytest.raises(RuntimeError, match="URL returned error: 404"): + kvikio.RemoteFile.open_s3("unknown-bucket", object_name) + + with pytest.raises(RuntimeError, match="URL returned error: 404"): + kvikio.RemoteFile.open_s3(bucket_name, "unknown-file") + + +@pytest.mark.parametrize("size", [10, 100, 1000]) +@pytest.mark.parametrize("nthreads", [1, 3]) +@pytest.mark.parametrize("tasksize", [99, 999]) +@pytest.mark.parametrize("buffer_size", [101, 1001]) +def test_read(s3_base, xp, size, nthreads, tasksize, buffer_size): + bucket_name = "test_read" + 
object_name = "a1" + a = xp.arange(size) + with s3_context( + s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(a)} + ) as server_address: + with kvikio.defaults.set_num_threads(nthreads): + with kvikio.defaults.set_task_size(tasksize): + with kvikio.defaults.set_bounce_buffer_size(buffer_size): + with kvikio.RemoteFile.open_s3_url( + f"{server_address}/{bucket_name}/{object_name}" + ) as f: + assert f.nbytes() == a.nbytes + b = xp.empty_like(a) + assert f.read(buf=b) == a.nbytes + xp.testing.assert_array_equal(a, b) + + +@pytest.mark.parametrize( + "start,end", + [ + (0, 10 * 4096), + (1, int(1.3 * 4096)), + (int(2.1 * 4096), int(5.6 * 4096)), + (42, int(2**20)), + ], +) +def test_read_with_file_offset(s3_base, xp, start, end): + bucket_name = "test_read_with_file_offset" + object_name = "a1" + a = xp.arange(end, dtype=xp.int64) + with s3_context( + s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(a)} + ) as server_address: + url = f"{server_address}/{bucket_name}/{object_name}" + with kvikio.RemoteFile.open_s3_url(url) as f: + b = xp.zeros(shape=(end - start,), dtype=xp.int64) + assert f.read(b, file_offset=start * a.itemsize) == b.nbytes + xp.testing.assert_array_equal(a[start:end], b) diff --git a/python/libkvikio/CMakeLists.txt b/python/libkvikio/CMakeLists.txt index 278e09f462..270e8ff5be 100644 --- a/python/libkvikio/CMakeLists.txt +++ b/python/libkvikio/CMakeLists.txt @@ -38,6 +38,20 @@ unset(kvikio_FOUND) set(KvikIO_BUILD_EXAMPLES OFF) set(KvikIO_BUILD_TESTS OFF) +if(USE_NVCOMP_RUNTIME_WHEEL) + set(KvikIO_EXPORT_NVCOMP OFF) +endif() set(CUDA_STATIC_RUNTIME ON) add_subdirectory(../../cpp kvikio-cpp) + +if(USE_NVCOMP_RUNTIME_WHEEL) + set(rpaths "$ORIGIN/../../nvidia/nvcomp") + foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + set_property( + TARGET ${tgt} + PROPERTY INSTALL_RPATH ${rpaths} + APPEND + ) + endforeach() +endif() diff --git a/python/libkvikio/libkvikio/__init__.py b/python/libkvikio/libkvikio/__init__.py index 995cd1027d..a221295d4c 100644 --- a/python/libkvikio/libkvikio/__init__.py +++ b/python/libkvikio/libkvikio/__init__.py @@ -13,5 +13,6 @@ # limitations under the License. from libkvikio._version import __git_commit__, __version__ +from libkvikio.load import load_library -__all__ = ["__git_commit__", "__version__"] +__all__ = ["__git_commit__", "__version__", "load_library"] diff --git a/python/libkvikio/libkvikio/load.py b/python/libkvikio/libkvikio/load.py new file mode 100644 index 0000000000..8856923eaf --- /dev/null +++ b/python/libkvikio/libkvikio/load.py @@ -0,0 +1,79 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import ctypes +import os + +# Loading with RTLD_LOCAL adds the library itself to the loader's +# loaded library cache without loading any symbols into the global +# namespace. 
This allows libraries that express a dependency on +# this library to be loaded later and successfully satisfy this dependency +# without polluting the global symbol table with symbols from +# libkvikio that could conflict with symbols from other DSOs. +PREFERRED_LOAD_FLAG = ctypes.RTLD_LOCAL + + +def _load_system_installation(soname: str): + """Try to dlopen() the library indicated by ``soname`` + + Raises ``OSError`` if library cannot be loaded. + """ + return ctypes.CDLL(soname, PREFERRED_LOAD_FLAG) + + +def _load_wheel_installation(soname: str): + """Try to dlopen() the library indicated by ``soname`` + + Returns ``None`` if the library cannot be loaded. + """ + if os.path.isfile(lib := os.path.join(os.path.dirname(__file__), "lib64", soname)): + return ctypes.CDLL(lib, PREFERRED_LOAD_FLAG) + return None + + +def load_library(): + """Dynamically load libkvikio.so and its dependencies""" + prefer_system_installation = ( + os.getenv("RAPIDS_LIBKVIKIO_PREFER_SYSTEM_LIBRARY", "false").lower() != "false" + ) + + soname = "libkvikio.so" + libkvikio_lib = None + if prefer_system_installation: + # Prefer a system library if one is present to + # avoid clobbering symbols that other packages might expect, but if no + # other library is present use the one in the wheel. + try: + libkvikio_lib = _load_system_installation(soname) + except OSError: + libkvikio_lib = _load_wheel_installation(soname) + else: + # Prefer the libraries bundled in this package. If they aren't found + # (which might be the case in builds where the library was prebuilt + # before packaging the wheel), look for a system installation. + try: + libkvikio_lib = _load_wheel_installation(soname) + if libkvikio_lib is None: + libkvikio_lib = _load_system_installation(soname) + except OSError: + # If none of the searches above succeed, just silently return None + # and rely on other mechanisms (like RPATHs on other DSOs) to + # help the loader find the library. + pass + + # The caller almost never needs to do anything with this library, but no + # harm in offering the option since this object at least provides a handle + # to inspect where libkvikio was loaded from. + return libkvikio_lib diff --git a/python/libkvikio/pyproject.toml b/python/libkvikio/pyproject.toml index 3be7cbc0ae..9504cb3755 100644 --- a/python/libkvikio/pyproject.toml +++ b/python/libkvikio/pyproject.toml @@ -59,3 +59,11 @@ requires = [ [project.entry-points."cmake.prefix"] libkvikio = "libkvikio" + +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M'
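# A usage sketch (not part of the patch) for the libkvikio.load_library() hook
# above; kvikio/__init__.py earlier in this diff does effectively the same
# thing. Setting RAPIDS_LIBKVIKIO_PREFER_SYSTEM_LIBRARY makes a wheel install
# defer to a system libkvikio.so when one is present.
import os

os.environ["RAPIDS_LIBKVIKIO_PREFER_SYSTEM_LIBRARY"] = "true"  # optional

try:
    import libkvikio
except ModuleNotFoundError:
    handle = None  # conda/system installs don't ship the libkvikio Python package
else:
    # Returns a ctypes.CDLL handle (or None); callers rarely need it.
    handle = libkvikio.load_library()

print(handle)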